====== Hierarchical clustering of TQs ====== A set of units described by TQs (a kind of symbolic objects) in the format L = [ [ 'nameA', [ [TQa1, totalA1 ], [TQa2, totalA2 ], ..., [TQak, totalAk ] ] ], [ 'nameB', [ [TQb1, totalB1 ], [TQb2, totalB2 ], ..., [TQbk, totalBk ] ] ], ... [ 'nameZ', [ [TQz1, totalZ1 ], [TQz2, totalZ2 ], ..., [TQzk, totalZk ] ] ] ] For simple example see [[vlado:work:alg:hctq#toy_test_data| toy test data]]. ===== Franzosi's Violence network ===== https://raw.githubusercontent.com/bavla/TQ/master/json/violenceTQ.json wdir = "C:/Users/batagelj/work/Python/graph/Nets/clusTQ" gdir = 'c:/users/batagelj/work/python/graph/Nets' import sys, os, re, datetime, json sys.path = [gdir]+sys.path; os.chdir(wdir) from TQ import * from Nets import Network as N import numpy as np from copy import copy, deepcopy # computes dissimilarity between clusters def distCl(X,Y,nVar,alpha): d = 0 for i in range(nVar): nX = X[i][1]; nY = Y[i][1] s = TQ.sum(TQ.prodConst(X[i][0],1/max(1,nX)), TQ.prodConst(Y[i][0],-1/max(1,nY))) d = d + alpha[i]*nX*nY/max(1,nX+nY)*TQ.total(TQ.prod(s,s)) return d # adapted Ward's hierarchical clustering #--------------------------------------------- # VB, 16. julij 2010 Clamix in R # Python version: VB, 23. junij 2020 def hclusTQ(L,nVar,alpha): global m def ordNode(S): return [ int(a) for a in S ] def orDendro(i): global m if i<0: return [-i] else: return orDendro(m[i-1,0])+orDendro(m[i-1,1]) infinity = float('inf') numL = len(L); numLm = numL-1 # each unit is a cluster; compute inter-cluster dissimilarity matrix D = np.zeros((numL,numL)) for i in range(numL): D[i][i] = infinity for i in range(numL-1): for j in range(i+1,numL): D[i][j] = distCl(L[i][1],L[j][1],nVar,alpha); D[j][i] = D[i][j] active = set(range(numL)); m = np.zeros((numLm,2),dtype=int) node = np.zeros(numL,dtype=int); h = np.zeros(numLm) LL = [] for k in range(numLm): # determine the closest pair of clusters (p,q) numA = len(active); dmin = infinity for a in active: for b in active: if D[a][b] < dmin: dmin = D[a][b]; p,q = a,b # join the closest pair of clusters h[k] = dmin if node[p]==0: m[k][0] = -(p+1); Lp = L[p] else: m[k][0] = node[p]; Lp = LL[node[p]-1] if node[q]==0: m[k][1] = -(q+1); Lq = L[q] else: m[k][1] = node[q]; Lq = LL[node[q]-1] tq = [] for t in range(nVar): tq.append([TQ.sum(Lp[1][t][0],Lq[1][t][0]), Lp[1][t][1]+Lq[1][t][1]]) LL.append(['C'+str(k+1),tq[:]]) active.discard(p) # determine dissimilarities to the new cluster for s in active: if s != q: if node[s]==0: Ls = L[s] else: Ls = LL[node[s]-1] D[q][s] = distCl(LL[k][1],Ls[1],nVar,alpha); D[s][q] = D[q][s] node[q] = k+1; return {'proc':"hclusTQ", 'merge':m.tolist(), 'height':h.tolist(), 'order': ordNode(orDendro(len(m))), 'labels': [e[0] for e in L], 'method':"hclusTQ", 'call':None, 'dist.method':"Ward", 'leaders':LL } G = N.loadNetsJSON("violenceTQ.json") G.Info() # G._info['legends']['Tlabs'] I = G.Index() sum = [ [ u, G.TQnetInSum(u), G.TQnetOutSum(u) ] for u in G.nodes() ] nVar = 2; alpha = [0.5,0.5] L = [] for u in G.nodes(): name = G._nodes[u][3]['lab'] ins = G.TQnetInSum(u); tins = TQ.total(ins) ous = G.TQnetOutSum(u); tous = TQ.total(ous) print(u, ' tins', tins, ' tous', tous, name) unit = [ name, [ [ins,tins], [ous,tous] ] ] L.append(unit) R = hclusTQ(L,nVar,alpha) # l = R['leaders']; m = R['merge'] js = open("violenceHC.json",'w'); json.dump(R, js, indent=1); js.close() ==== Drawing a dendrogram in R ==== > wdir <- "C:/Users/batagelj/work/Python/graph/Nets/clusTQ" > setwd(wdir) > library(rjson) > js <- "violenceHC.json" > R <- fromJSON(js) > attr(R,"class") <- "hclust" > plot(R,hang=-1) {{vlado:work:pics:violenced.png}} ===== September 11th terror news ===== https://github.com/bavla/TQ/blob/master/json/Terror.zip ... G = N.loadNetsJSON("C:/Users/batagelj/work/Python/graph/JSON/terror/terror.json") G.Info() # G._info['legends']['Tlabs'] # I = G.Index() tot = [ [ u, G.TQnetSum(u) ] for u in G.nodes() ] Sum = [ [a,TQ.total(b)] for a,b in tot ] def takeSecond(elem): return elem[1] Sum.sort(key=takeSecond,reverse=True) cut = [ a-1 for a,b in Sum if b >= 1000 ] nVar = 1; alpha = [1] Ter1000 = [ [G._nodes[i+1][3]['lab'], [[tot[i][1],TQ.total(tot[i][1])]] ] for i in cut ] R = hclusTQ(Ter1000,nVar,alpha) js = open("Ter1000HC.json",'w'); json.dump(R, js, indent=1); js.close() ... > js <- "Ter1000HC.json" > R <- fromJSON(js) > attr(R,"class") <- "hclust" > plot(R,hang=-1) {{vlado:work:pics:ter1000d.png}} and for Ter750HC.json plot(R,hang=-1,cex=0.7) {{vlado:work:pics:ter750d.png}} > names(R) [1] "proc" "merge" "height" "order" "labels" [6] "method" "call" "dist.method" "leaders" > C <- cutree(R,k=4) > table(C) C 1 2 3 4 58 18 16 3 > R$labels[C==4] [1] "anthrax" "letter" "mail" > R$labels[C==3] [1] "new_york" "plane" "tuesday" "pentagon" [5] "world_trade_ctr" "wednesday" "airport" "police" [9] "flight" "buildng" "national" "aircraft" [13] "white_house" "center" "service" "airline" > length(R$leaders) [1] 94 > unitTQ <- function(unit){ + total <- unit[[1]][[2]][[1]][[2]] + TQ <- unit[[1]][[2]][[1]][[1]] + name <- unit[[1]][[1]] + TQ[,3] <- TQ[,3]/total + return(list(name,TQ)) + } > L = R$leaders > source("https://raw.githubusercontent.com/bavla/Nets/master/source/hist.R") > siHist(unitTQ(L[63]),1,66,TRUE,ylim=c(0,0.06),cex.names=0.5,cex.lab=1.5,col="royalblue1") > coHist(unitTQ(L[88]),unitTQ(L[93]),1,66,lab="C88:C93",ylim=c(0,0.12),cex.names=0.5,cex.lab=1.5) > coHist(unitTQ(L[90]),unitTQ(L[92]),1,66,lab="C90:C92",ylim=c(0,0.06),cex.names=0.5,cex.lab=1.5) > coHist(unitTQ(L[86]),unitTQ(L[89]),1,66,lab="C86:C89",ylim=c(0,0.10),cex.names=0.5,cex.lab=1.5) > R$merge[85:94,] [,1] [,2] [1,] -62 78 [2,] -46 -60 [3,] 76 85 [4,] -47 84 [5,] 81 83 [6,] 86 89 [7,] -70 87 [8,] 63 91 [9,] 90 92 [10,] 88 93 +------C94----------+ C88 +------C93------+ +--C90--+ +---C92---+ C86 C89 C63 +--C91--+ (70) C87 {{vlado:work:pics:c88-93.png}} {{vlado:work:pics:c90-92.png}} {{vlado:work:pics:c86-89.png}} {{vlado:work:pics:c63.png}} ===== Toy test data ===== >>> nvar = 1; alpha = [1] >>> a = [ (1,3,1), (5,8,2), (11,12,3) ]; b = [ (2,3,3), (4,7,2), (10,14,1) ] >>> c = [ (2,8,1), (9,14,1) ]; d = [ (1,8,1) ]; e = [ (1,8,2) ] >>> ta = TQ.total(a); tb = TQ.total(b) >>> tc = TQ.total(c); td = TQ.total(d); te = TQ.total(e) >>> La = [[ a, ta ]]; Lb = [[ b, tb ]]; Lc = [[ c, tc ]] >>> Ld = [[ d, td ]]; Le = [[ e, te ]] >>> L = [ ['a', La ], ['b', Lb ], ['c', Lc ], ['d', Ld ], ['e', Le ] ] >>> R = hclusTQ(L,nVar,alpha) >>> R['merge'] [[-4, -5], [-2, -3], [2, 1], [-1, 3]] ... >>> nVar = 2; alpha = [0.5,0.5] >>> La = [[ a, ta ], [ a, ta ]]; Lb = [[ b, tb ], [ b, tb ]]; Lc = [[ c, tc ], [ c, tc ]] >>> Ld = [[ d, td ], [ d, td ]]; Le = [[ e, te ], [ e, te ]]; Lf = [[ a, ta ], [ b, tb ]] >>> L = [ ['a', La ], ['b', Lb ], ['c', Lc ], ['d', Ld ], ['e', Le ], ['f', Lf ] ] >>> R = hclusTQ(L,nVar,alpha) >>> R['merge'] [[-4, -5], [-2, -3], [-1, -6], [2, 1], [4, 3]]