====== Ordinary clustering ====== January 19, 2018 ===== Reading Pajek's clusterings and networks ===== read_Pajek_clu <- function(f,skip=1){ # reads a partition from Pajek's clu file; skip *vertices and comments lines read.table(f,skip=skip,colClasses=c("integer"),header=FALSE)$V1 } read_Pajek_net <- function(f,skip=0){ # reads a network from Pajek's net file; skip comments lines L <- readLines(f) st <- grep("\\*",L) S <- unlist(strsplit(L[st[1]]," ")) n <- as.integer(S[2]); n1 <- st[1]+1; n2 <- st[2]-1 m1 <- st[2]+1; m2 <- length(L); m <- m2-m1+1 Names <- unlist(strsplit(L[n1:n2],'"'))[3*(1:n)-1] R <- vector(mode="list",length=n) S <- unlist(strsplit(L[m1:m2],'[[:space:]]+')) b <- as.integer(S[4*(1:m)-2]); e <- as.integer(S[4*(1:m)-1]) r <- 1; k <- 0; row <- NULL repeat { k <- k+1 if((k>m)|(b[k]!=r)){ R[[r]] <- row if(k>m) break r <- b[k]; row <- NULL } row <- c(row,e[k]) } names(R) <- Names return(R) } wdir <- "C:/Users/batagelj/work/clamix/relC" setwd(wdir) a <- scan("C:/Users/batagelj/work/Delphi/Cluse/Cluse/data/RelCon/SomeTy.dis") s <- length(a); n <- round((-1+sqrt(1+8*s))/2); nm <- n-1 D <- matrix(0, nrow=n, ncol=n); D[lower.tri(D,diag=TRUE)] <- a; D <- D+t(D) netRel <- "C:/Users/batagelj/work/Delphi/Cluse/Cluse/data/RelCon/SomeTyXY.net" R <- read_Pajek_net(netRel,3) rownames(D) <- colnames(D) <- names(R); print(D) DD <- as.dist(D); print(DD) a b c d e f g h i j a 0 5 7 4 6 6 2 4 2 3 b 5 0 8 1 3 4 4 5 3 4 c 7 8 0 4 5 7 9 3 2 5 d 4 1 4 0 3 2 4 8 6 3 e 6 3 5 3 0 6 4 6 5 7 f 6 4 7 2 6 0 6 8 5 7 g 2 4 9 4 4 6 0 4 8 2 h 4 5 3 8 6 8 4 0 3 4 i 2 3 2 6 5 5 8 3 0 5 j 3 4 5 3 7 7 2 4 5 0 a b c d e f g h i b 5 c 7 8 d 4 1 4 e 6 3 5 3 f 6 4 7 2 6 g 2 4 9 4 4 6 h 4 5 3 8 6 8 4 i 2 3 2 6 5 5 8 3 j 3 4 5 3 7 7 2 4 5 ===== Clustering ===== hRelClust <- function(D,method="max"){ orDendro <- function(i){if(i<0) return(-i) return(c(orDendro(m[i,1]),orDendro(m[i,2])))} numL <- nrow(D); numLm <- numL-1 # each unit is a cluster; compute inter-cluster dissimilarity matrix diag(D) <- Inf print(D); flush.console() N <- rownames(D) active <- 1:numL; m <- matrix(nrow=numLm,ncol=2) node <- rep(0,numL); h <- numeric(numLm); w <- rep(1,numL) for(k in 1:numLm){ # determine the closest pair of clusters (p,q) ind <- active[sapply(active,function(i) which.min(D[i,active]))] dd <- sapply(active,function(i) min(D[i,active])) print(active); print(ind); print(dd) pq <- which.min(dd) # join the closest pair of clusters p<-active[pq]; q <- ind[pq]; h[k] <- D[p,q] cat(pq,'join ',p,'/',N[p],' and ',q,'/',N[q],' at level ',D[p,q],'\n') if(node[p]==0) m[k,1] <- -p else m[k,1] <- node[p] if(node[q]==0) m[k,2] <- -q else m[k,2] <- node[q] active <- setdiff(active,p) # determine dissimilarities to the new cluster for(s in setdiff(active,q)){ if(method=="max") D[q,s] <- max(D[q,s],D[p,s]) else if(method=="min") D[q,s] <- min(D[q,s],D[p,s]) else if(method=="ward") { ww <- w[p]+w[q]+w[s] D[q,s] <- ((w[q]+w[s])*D[q,s] + (w[p]+w[s])*D[p,s] - w[s]*h[k])/ww } else {cat('unknown method','\n'); return(NULL)} D[s,q] <- D[q,s] } w[q] <- w[q]+w[p]; node[[q]] <- k } hc <- list(merge=m,height=h,order=orDendro(numLm),labels=rownames(D), method=method,call=NULL,dist.method=NULL,leaders=NULL) class(hc) <- "hclust" return(hc) } ===== Test ===== ==== Maximum ==== r <- hRelClust(D,method="max") plot(r,hang=-1,main="max") t <- hclust(DD,method="complete") plot(t,hang=-1,main="complete") a b c d e f g h i j a Inf 5 7 4 6 6 2 4 2 3 b 5 Inf 8 1 3 4 4 5 3 4 c 7 8 Inf 4 5 7 9 3 2 5 d 4 1 4 Inf 3 2 4 8 6 3 e 6 3 5 3 Inf 6 4 6 5 7 f 6 4 7 2 6 Inf 6 8 5 7 g 2 4 9 4 4 6 Inf 4 8 2 h 4 5 3 8 6 8 4 Inf 3 4 i 2 3 2 6 5 5 8 3 Inf 5 j 3 4 5 3 7 7 2 4 5 Inf [1] 1 2 3 4 5 6 7 8 9 10 [1] 7 4 9 2 2 4 1 3 1 7 [1] 2 1 2 1 3 2 2 3 2 2 2 join 2 / b and 4 / d at level 1 [1] 1 3 4 5 6 7 8 9 10 [1] 7 9 5 4 4 1 3 1 7 [1] 2 2 3 3 4 2 3 2 2 1 join 1 / a and 7 / g at level 2 [1] 3 4 5 6 7 8 9 10 [1] 9 5 4 4 10 3 3 7 [1] 2 3 3 4 3 3 2 3 1 join 3 / c and 9 / i at level 2 [1] 4 5 6 7 8 9 10 [1] 5 4 4 10 9 8 7 [1] 3 3 4 3 3 3 3 1 join 4 / d and 5 / e at level 3 [1] 5 6 7 8 9 10 [1] 6 5 10 9 8 7 [1] 6 6 3 3 3 3 3 join 7 / g and 10 / j at level 3 [1] 5 6 8 9 10 [1] 6 5 9 8 8 [1] 6 6 3 3 4 3 join 8 / h and 9 / i at level 3 [1] 5 6 9 10 [1] 6 5 5 5 [1] 6 6 8 7 1 join 5 / e and 6 / f at level 6 [1] 6 9 10 [1] 10 6 6 [1] 7 8 7 1 join 6 / f and 10 / j at level 7 [1] 9 10 [1] 10 9 [1] 9 9 1 join 9 / i and 10 / j at level 9 {{pro:pics:max.png}} {{pro:pics:complete.png}} ==== Ward ==== r <- hRelClust(D,method="ward") plot(r,hang=-1,main="ward") t <- hclust(DD,method="ward.D") plot(t,hang=-1,main="ward.D") a b c d e f g h i j a Inf 5 7 4 6 6 2 4 2 3 b 5 Inf 8 1 3 4 4 5 3 4 c 7 8 Inf 4 5 7 9 3 2 5 d 4 1 4 Inf 3 2 4 8 6 3 e 6 3 5 3 Inf 6 4 6 5 7 f 6 4 7 2 6 Inf 6 8 5 7 g 2 4 9 4 4 6 Inf 4 8 2 h 4 5 3 8 6 8 4 Inf 3 4 i 2 3 2 6 5 5 8 3 Inf 5 j 3 4 5 3 7 7 2 4 5 Inf [1] 1 2 3 4 5 6 7 8 9 10 [1] 7 4 9 2 2 4 1 3 1 7 [1] 2 1 2 1 3 2 2 3 2 2 2 join 2 / b and 4 / d at level 1 [1] 1 3 4 5 6 7 8 9 10 [1] 7 9 5 4 4 1 3 1 7 [1] 2.000000 2.000000 3.666667 3.666667 3.666667 2.000000 3.000000 2.000000 [9] 2.000000 1 join 1 / a and 7 / g at level 2 [1] 3 4 5 6 7 8 9 10 [1] 9 5 4 4 10 3 3 7 [1] 2.000000 3.666667 3.666667 3.666667 2.666667 3.000000 2.000000 2.666667 1 join 3 / c and 9 / i at level 2 [1] 4 5 6 7 8 9 10 [1] 5 4 4 10 9 8 7 [1] 3.666667 3.666667 3.666667 2.666667 3.333333 3.333333 2.666667 4 join 7 / g and 10 / j at level 2.666667 [1] 4 5 6 8 9 10 [1] 5 4 4 9 8 8 [1] 3.666667 3.666667 3.666667 3.333333 3.333333 4.833333 4 join 8 / h and 9 / i at level 3.333333 [1] 4 5 6 9 10 [1] 5 4 4 5 4 [1] 3.666667 3.666667 3.666667 6.666667 7.133333 1 join 4 / d and 5 / e at level 3.666667 [1] 5 6 9 10 [1] 6 5 6 6 [1] 4.833333 4.833333 8.666667 8.333333 1 join 5 / e and 6 / f at level 4.833333 [1] 6 9 10 [1] 10 10 6 [1] 10.40476 11.00000 10.40476 1 join 6 / f and 10 / j at level 10.40476 [1] 9 10 [1] 10 9 [1] 12.49524 12.49524 1 join 9 / i and 10 / j at level 12.49524 {{pro:pics:ward.png}} {{pro:pics:wardd.png}} ====== ====== \\ [[pro:relc|Back to Relational constraints]]