====== Hierarchical clustering of TQs ======
A set of units described by TQs (a kind of symbolic objects) in the format
L = [
[ 'nameA', [ [TQa1, totalA1 ], [TQa2, totalA2 ], ..., [TQak, totalAk ] ] ],
[ 'nameB', [ [TQb1, totalB1 ], [TQb2, totalB2 ], ..., [TQbk, totalBk ] ] ],
...
[ 'nameZ', [ [TQz1, totalZ1 ], [TQz2, totalZ2 ], ..., [TQzk, totalZk ] ] ]
]
For simple example see [[vlado:work:alg:hctq#toy_test_data| toy test data]].
===== Franzosi's Violence network =====
https://raw.githubusercontent.com/bavla/TQ/master/json/violenceTQ.json
wdir = "C:/Users/batagelj/work/Python/graph/Nets/clusTQ"
gdir = 'c:/users/batagelj/work/python/graph/Nets'
import sys, os, re, datetime, json
sys.path = [gdir]+sys.path; os.chdir(wdir)
from TQ import *
from Nets import Network as N
import numpy as np
from copy import copy, deepcopy
# computes dissimilarity between clusters
def distCl(X,Y,nVar,alpha):
d = 0
for i in range(nVar):
nX = X[i][1]; nY = Y[i][1]
s = TQ.sum(TQ.prodConst(X[i][0],1/max(1,nX)),
TQ.prodConst(Y[i][0],-1/max(1,nY)))
d = d + alpha[i]*nX*nY/max(1,nX+nY)*TQ.total(TQ.prod(s,s))
return d
# adapted Ward's hierarchical clustering
#---------------------------------------------
# VB, 16. julij 2010 Clamix in R
# Python version: VB, 23. junij 2020
def hclusTQ(L,nVar,alpha):
global m
def ordNode(S): return [ int(a) for a in S ]
def orDendro(i):
global m
if i<0: return [-i]
else: return orDendro(m[i-1,0])+orDendro(m[i-1,1])
infinity = float('inf')
numL = len(L); numLm = numL-1
# each unit is a cluster; compute inter-cluster dissimilarity matrix
D = np.zeros((numL,numL))
for i in range(numL): D[i][i] = infinity
for i in range(numL-1):
for j in range(i+1,numL):
D[i][j] = distCl(L[i][1],L[j][1],nVar,alpha); D[j][i] = D[i][j]
active = set(range(numL)); m = np.zeros((numLm,2),dtype=int)
node = np.zeros(numL,dtype=int); h = np.zeros(numLm)
LL = []
for k in range(numLm):
# determine the closest pair of clusters (p,q)
numA = len(active); dmin = infinity
for a in active:
for b in active:
if D[a][b] < dmin:
dmin = D[a][b]; p,q = a,b
# join the closest pair of clusters
h[k] = dmin
if node[p]==0: m[k][0] = -(p+1); Lp = L[p]
else: m[k][0] = node[p]; Lp = LL[node[p]-1]
if node[q]==0: m[k][1] = -(q+1); Lq = L[q]
else: m[k][1] = node[q]; Lq = LL[node[q]-1]
tq = []
for t in range(nVar):
tq.append([TQ.sum(Lp[1][t][0],Lq[1][t][0]), Lp[1][t][1]+Lq[1][t][1]])
LL.append(['C'+str(k+1),tq[:]])
active.discard(p)
# determine dissimilarities to the new cluster
for s in active:
if s != q:
if node[s]==0: Ls = L[s]
else: Ls = LL[node[s]-1]
D[q][s] = distCl(LL[k][1],Ls[1],nVar,alpha); D[s][q] = D[q][s]
node[q] = k+1;
return {'proc':"hclusTQ", 'merge':m.tolist(), 'height':h.tolist(),
'order': ordNode(orDendro(len(m))),
'labels': [e[0] for e in L],
'method':"hclusTQ", 'call':None, 'dist.method':"Ward", 'leaders':LL }
G = N.loadNetsJSON("violenceTQ.json")
G.Info()
# G._info['legends']['Tlabs']
I = G.Index()
sum = [ [ u, G.TQnetInSum(u), G.TQnetOutSum(u) ] for u in G.nodes() ]
nVar = 2; alpha = [0.5,0.5]
L = []
for u in G.nodes():
name = G._nodes[u][3]['lab']
ins = G.TQnetInSum(u); tins = TQ.total(ins)
ous = G.TQnetOutSum(u); tous = TQ.total(ous)
print(u, ' tins', tins, ' tous', tous, name)
unit = [ name, [ [ins,tins], [ous,tous] ] ]
L.append(unit)
R = hclusTQ(L,nVar,alpha)
# l = R['leaders']; m = R['merge']
js = open("violenceHC.json",'w'); json.dump(R, js, indent=1); js.close()
==== Drawing a dendrogram in R ====
> wdir <- "C:/Users/batagelj/work/Python/graph/Nets/clusTQ"
> setwd(wdir)
> library(rjson)
> js <- "violenceHC.json"
> R <- fromJSON(js)
> attr(R,"class") <- "hclust"
> plot(R,hang=-1)
{{vlado:work:pics:violenced.png}}
===== September 11th terror news =====
https://github.com/bavla/TQ/blob/master/json/Terror.zip
...
G = N.loadNetsJSON("C:/Users/batagelj/work/Python/graph/JSON/terror/terror.json")
G.Info()
# G._info['legends']['Tlabs']
# I = G.Index()
tot = [ [ u, G.TQnetSum(u) ] for u in G.nodes() ]
Sum = [ [a,TQ.total(b)] for a,b in tot ]
def takeSecond(elem): return elem[1]
Sum.sort(key=takeSecond,reverse=True)
cut = [ a-1 for a,b in Sum if b >= 1000 ]
nVar = 1; alpha = [1]
Ter1000 = [ [G._nodes[i+1][3]['lab'],
[[tot[i][1],TQ.total(tot[i][1])]] ] for i in cut ]
R = hclusTQ(Ter1000,nVar,alpha)
js = open("Ter1000HC.json",'w'); json.dump(R, js, indent=1); js.close()
...
> js <- "Ter1000HC.json"
> R <- fromJSON(js)
> attr(R,"class") <- "hclust"
> plot(R,hang=-1)
{{vlado:work:pics:ter1000d.png}}
and for Ter750HC.json
plot(R,hang=-1,cex=0.7)
{{vlado:work:pics:ter750d.png}}
> names(R)
[1] "proc" "merge" "height" "order" "labels"
[6] "method" "call" "dist.method" "leaders"
> C <- cutree(R,k=4)
> table(C)
C
1 2 3 4
58 18 16 3
> R$labels[C==4]
[1] "anthrax" "letter" "mail"
> R$labels[C==3]
[1] "new_york" "plane" "tuesday" "pentagon"
[5] "world_trade_ctr" "wednesday" "airport" "police"
[9] "flight" "buildng" "national" "aircraft"
[13] "white_house" "center" "service" "airline"
> length(R$leaders)
[1] 94
> unitTQ <- function(unit){
+ total <- unit[[1]][[2]][[1]][[2]]
+ TQ <- unit[[1]][[2]][[1]][[1]]
+ name <- unit[[1]][[1]]
+ TQ[,3] <- TQ[,3]/total
+ return(list(name,TQ))
+ }
> L = R$leaders
> source("https://raw.githubusercontent.com/bavla/Nets/master/source/hist.R")
> siHist(unitTQ(L[63]),1,66,TRUE,ylim=c(0,0.06),cex.names=0.5,cex.lab=1.5,col="royalblue1")
> coHist(unitTQ(L[88]),unitTQ(L[93]),1,66,lab="C88:C93",ylim=c(0,0.12),cex.names=0.5,cex.lab=1.5)
> coHist(unitTQ(L[90]),unitTQ(L[92]),1,66,lab="C90:C92",ylim=c(0,0.06),cex.names=0.5,cex.lab=1.5)
> coHist(unitTQ(L[86]),unitTQ(L[89]),1,66,lab="C86:C89",ylim=c(0,0.10),cex.names=0.5,cex.lab=1.5)
> R$merge[85:94,]
[,1] [,2]
[1,] -62 78
[2,] -46 -60
[3,] 76 85
[4,] -47 84
[5,] 81 83
[6,] 86 89
[7,] -70 87
[8,] 63 91
[9,] 90 92
[10,] 88 93
+------C94----------+
C88 +------C93------+
+--C90--+ +---C92---+
C86 C89 C63 +--C91--+
(70) C87
{{vlado:work:pics:c88-93.png}}
{{vlado:work:pics:c90-92.png}}
{{vlado:work:pics:c86-89.png}}
{{vlado:work:pics:c63.png}}
===== Toy test data =====
>>> nvar = 1; alpha = [1]
>>> a = [ (1,3,1), (5,8,2), (11,12,3) ]; b = [ (2,3,3), (4,7,2), (10,14,1) ]
>>> c = [ (2,8,1), (9,14,1) ]; d = [ (1,8,1) ]; e = [ (1,8,2) ]
>>> ta = TQ.total(a); tb = TQ.total(b)
>>> tc = TQ.total(c); td = TQ.total(d); te = TQ.total(e)
>>> La = [[ a, ta ]]; Lb = [[ b, tb ]]; Lc = [[ c, tc ]]
>>> Ld = [[ d, td ]]; Le = [[ e, te ]]
>>> L = [ ['a', La ], ['b', Lb ], ['c', Lc ], ['d', Ld ], ['e', Le ] ]
>>> R = hclusTQ(L,nVar,alpha)
>>> R['merge']
[[-4, -5], [-2, -3], [2, 1], [-1, 3]]
...
>>> nVar = 2; alpha = [0.5,0.5]
>>> La = [[ a, ta ], [ a, ta ]]; Lb = [[ b, tb ], [ b, tb ]]; Lc = [[ c, tc ], [ c, tc ]]
>>> Ld = [[ d, td ], [ d, td ]]; Le = [[ e, te ], [ e, te ]]; Lf = [[ a, ta ], [ b, tb ]]
>>> L = [ ['a', La ], ['b', Lb ], ['c', Lc ], ['d', Ld ], ['e', Le ], ['f', Lf ] ]
>>> R = hclusTQ(L,nVar,alpha)
>>> R['merge']
[[-4, -5], [-2, -3], [-1, -6], [2, 1], [4, 3]]