Hierarchical clustering of TQs

A set of units described by TQs (a kind of symbolic objects) in the format

L = [
      [ 'nameA', [ [TQa1, totalA1 ], [TQa2, totalA2 ], ..., [TQak, totalAk ] ] ],
      [ 'nameB', [ [TQb1, totalB1 ], [TQb2, totalB2 ], ..., [TQbk, totalBk ] ] ],
...
      [ 'nameZ', [ [TQz1, totalZ1 ], [TQz2, totalZ2 ], ..., [TQzk, totalZk ] ] ]
    ]      

For simple example see toy test data.

Franzosi's Violence network

https://raw.githubusercontent.com/bavla/TQ/master/json/violenceTQ.json

wdir = "C:/Users/batagelj/work/Python/graph/Nets/clusTQ"
gdir = 'c:/users/batagelj/work/python/graph/Nets'

import sys, os, re, datetime, json
sys.path = [gdir]+sys.path; os.chdir(wdir)
from TQ import *
from Nets import Network as N
import numpy as np
from copy import copy, deepcopy

# computes dissimilarity between clusters
def distCl(X,Y,nVar,alpha):
  d = 0
  for i in range(nVar):
    nX = X[i][1]; nY = Y[i][1]
    s = TQ.sum(TQ.prodConst(X[i][0],1/max(1,nX)),
               TQ.prodConst(Y[i][0],-1/max(1,nY)))
    d = d + alpha[i]*nX*nY/max(1,nX+nY)*TQ.total(TQ.prod(s,s))
  return d

# adapted Ward's hierarchical clustering
#---------------------------------------------
# VB, 16. julij 2010  Clamix in R
# Python version: VB, 23. junij 2020

def hclusTQ(L,nVar,alpha):
  global m
  def ordNode(S): return [ int(a) for a in S ]
  def orDendro(i):
    global m
    if i<0: return [-i]
    else: return orDendro(m[i-1,0])+orDendro(m[i-1,1])
  infinity = float('inf')  
  numL = len(L); numLm = numL-1
# each unit is a cluster; compute inter-cluster dissimilarity matrix
  D = np.zeros((numL,numL))
  for i in range(numL): D[i][i] = infinity
  for i in range(numL-1):
    for j in range(i+1,numL):      
      D[i][j] = distCl(L[i][1],L[j][1],nVar,alpha); D[j][i] = D[i][j]
  active = set(range(numL)); m = np.zeros((numLm,2),dtype=int) 
  node = np.zeros(numL,dtype=int); h = np.zeros(numLm)
  LL = []
  for k in range(numLm):
  # determine the closest pair of clusters (p,q)
    numA = len(active); dmin = infinity
    for a in active:
      for b in active:
        if D[a][b] < dmin:
          dmin = D[a][b]; p,q = a,b          
  # join the closest pair of clusters
    h[k] = dmin
    if node[p]==0: m[k][0] = -(p+1); Lp = L[p]
    else: m[k][0] = node[p]; Lp = LL[node[p]-1]
    if node[q]==0: m[k][1] = -(q+1); Lq = L[q]
    else: m[k][1] = node[q]; Lq = LL[node[q]-1]
    tq = []
    for t in range(nVar):
      tq.append([TQ.sum(Lp[1][t][0],Lq[1][t][0]), Lp[1][t][1]+Lq[1][t][1]])
    LL.append(['C'+str(k+1),tq[:]])  
    active.discard(p) 
  # determine dissimilarities to the new cluster
    for s in active:
      if s != q:
        if node[s]==0: Ls = L[s]
        else: Ls = LL[node[s]-1]
        D[q][s] = distCl(LL[k][1],Ls[1],nVar,alpha); D[s][q] = D[q][s]
    node[q] = k+1; 
  return {'proc':"hclusTQ", 'merge':m.tolist(), 'height':h.tolist(),
        'order': ordNode(orDendro(len(m))),
        'labels': [e[0] for e in L],
        'method':"hclusTQ", 'call':None, 'dist.method':"Ward", 'leaders':LL }

G = N.loadNetsJSON("violenceTQ.json")
G.Info()
# G._info['legends']['Tlabs']
I = G.Index()
sum = [ [ u, G.TQnetInSum(u), G.TQnetOutSum(u) ] for u in G.nodes() ]

nVar = 2; alpha = [0.5,0.5]
L = []
for u in G.nodes():
  name = G._nodes[u][3]['lab']  
  ins = G.TQnetInSum(u); tins = TQ.total(ins)
  ous = G.TQnetOutSum(u); tous = TQ.total(ous)
  print(u, ' tins', tins, ' tous', tous, name)
  unit = [ name, [ [ins,tins], [ous,tous] ] ]
  L.append(unit)

R = hclusTQ(L,nVar,alpha)
# l = R['leaders']; m = R['merge']
js = open("violenceHC.json",'w'); json.dump(R, js, indent=1); js.close()

Drawing a dendrogram in R

> wdir <- "C:/Users/batagelj/work/Python/graph/Nets/clusTQ"
> setwd(wdir)
> library(rjson)
> js <- "violenceHC.json"
> R <- fromJSON(js)
> attr(R,"class") <- "hclust"
> plot(R,hang=-1)

September 11th terror news

https://github.com/bavla/TQ/blob/master/json/Terror.zip

...
G = N.loadNetsJSON("C:/Users/batagelj/work/Python/graph/JSON/terror/terror.json")
G.Info()
# G._info['legends']['Tlabs']
# I = G.Index()
tot = [ [ u, G.TQnetSum(u) ] for u in G.nodes() ]
Sum = [ [a,TQ.total(b)] for a,b in tot ]
def takeSecond(elem): return elem[1]
Sum.sort(key=takeSecond,reverse=True)
cut = [ a-1 for a,b in Sum if b >= 1000 ]
nVar = 1; alpha = [1]
Ter1000 = [ [G._nodes[i+1][3]['lab'],
    [[tot[i][1],TQ.total(tot[i][1])]] ] for i in cut ]
R = hclusTQ(Ter1000,nVar,alpha)
js = open("Ter1000HC.json",'w'); json.dump(R, js, indent=1); js.close()
...
> js <- "Ter1000HC.json"
> R <- fromJSON(js)
> attr(R,"class") <- "hclust"
> plot(R,hang=-1)

and for Ter750HC.json

plot(R,hang=-1,cex=0.7)

> names(R)
[1] "proc"        "merge"       "height"      "order"       "labels"     
[6] "method"      "call"        "dist.method" "leaders" 
> C <- cutree(R,k=4)
> table(C)
C
 1  2  3  4 
58 18 16  3 
> R$labels[C==4]
[1] "anthrax" "letter"  "mail"   
> R$labels[C==3]
 [1] "new_york"        "plane"           "tuesday"         "pentagon"       
 [5] "world_trade_ctr" "wednesday"       "airport"         "police"         
 [9] "flight"          "buildng"         "national"        "aircraft"       
[13] "white_house"     "center"          "service"         "airline"  
> length(R$leaders)
[1] 94
> unitTQ <- function(unit){
+    total <- unit[[1]][[2]][[1]][[2]]
+    TQ <- unit[[1]][[2]][[1]][[1]]
+    name <- unit[[1]][[1]]
+    TQ[,3] <- TQ[,3]/total
+    return(list(name,TQ))
+ }
> L = R$leaders
> source("https://raw.githubusercontent.com/bavla/Nets/master/source/hist.R")
> siHist(unitTQ(L[63]),1,66,TRUE,ylim=c(0,0.06),cex.names=0.5,cex.lab=1.5,col="royalblue1")
> coHist(unitTQ(L[88]),unitTQ(L[93]),1,66,lab="C88:C93",ylim=c(0,0.12),cex.names=0.5,cex.lab=1.5)
> coHist(unitTQ(L[90]),unitTQ(L[92]),1,66,lab="C90:C92",ylim=c(0,0.06),cex.names=0.5,cex.lab=1.5)
> coHist(unitTQ(L[86]),unitTQ(L[89]),1,66,lab="C86:C89",ylim=c(0,0.10),cex.names=0.5,cex.lab=1.5)
> R$merge[85:94,]
      [,1] [,2]
 [1,]  -62   78
 [2,]  -46  -60
 [3,]   76   85
 [4,]  -47   84
 [5,]   81   83
 [6,]   86   89
 [7,]  -70   87
 [8,]   63   91
 [9,]   90   92
[10,]   88   93

         +------C94----------+
        C88          +------C93------+
                 +--C90--+      +---C92---+
                C86     C89    C63    +--C91--+
                                     (70)    C87

Toy test data

>>> nvar = 1; alpha = [1]
>>> a = [ (1,3,1), (5,8,2), (11,12,3) ]; b = [ (2,3,3), (4,7,2), (10,14,1) ]
>>> c = [ (2,8,1), (9,14,1) ]; d = [ (1,8,1) ]; e = [ (1,8,2) ]
>>> ta = TQ.total(a); tb = TQ.total(b)
>>> tc = TQ.total(c); td = TQ.total(d); te = TQ.total(e)
>>> La = [[ a, ta ]];  Lb = [[ b, tb ]];  Lc = [[ c, tc ]]
>>> Ld = [[ d, td ]];  Le = [[ e, te ]]
>>> L = [ ['a', La ], ['b', Lb ], ['c', Lc ], ['d', Ld ], ['e', Le ] ]
>>> R = hclusTQ(L,nVar,alpha)
>>> R['merge']
[[-4, -5], [-2, -3], [2, 1], [-1, 3]]
...
>>> nVar = 2; alpha = [0.5,0.5]
>>> La = [[ a, ta ], [ a, ta ]]; Lb = [[ b, tb ], [ b, tb ]]; Lc = [[ c, tc ], [ c, tc ]]
>>> Ld = [[ d, td ], [ d, td ]]; Le = [[ e, te ], [ e, te ]]; Lf = [[ a, ta ], [ b, tb ]]
>>> L = [ ['a', La ], ['b', Lb ], ['c', Lc ], ['d', Ld ], ['e', Le ], ['f', Lf ] ]
>>> R = hclusTQ(L,nVar,alpha)
>>> R['merge']
[[-4, -5], [-2, -3], [-1, -6], [2, 1], [4, 3]]
vlado/work/alg/hctq.txt · Last modified: 2020/08/17 05:39 by vlado
 
Except where otherwise noted, content on this wiki is licensed under the following license: CC Attribution-Noncommercial-Share Alike 3.0 Unported
Recent changes RSS feed Donate Powered by PHP Valid XHTML 1.0 Valid CSS Driven by DokuWiki