====== Ordinary clustering ======
January 19, 2018
===== Reading Pajek's clusterings and networks =====
read_Pajek_clu <- function(f,skip=1){
# reads a partition from Pajek's clu file; skip *vertices and comments lines
read.table(f,skip=skip,colClasses=c("integer"),header=FALSE)$V1
}
read_Pajek_net <- function(f,skip=0){
# reads a network from Pajek's net file; skip comments lines
L <- readLines(f)
st <- grep("\\*",L)
S <- unlist(strsplit(L[st[1]]," "))
n <- as.integer(S[2]); n1 <- st[1]+1; n2 <- st[2]-1
m1 <- st[2]+1; m2 <- length(L); m <- m2-m1+1
Names <- unlist(strsplit(L[n1:n2],'"'))[3*(1:n)-1]
R <- vector(mode="list",length=n)
S <- unlist(strsplit(L[m1:m2],'[[:space:]]+'))
b <- as.integer(S[4*(1:m)-2]); e <- as.integer(S[4*(1:m)-1])
r <- 1; k <- 0; row <- NULL
repeat {
k <- k+1
if((k>m)|(b[k]!=r)){
R[[r]] <- row
if(k>m) break
r <- b[k]; row <- NULL
}
row <- c(row,e[k])
}
names(R) <- Names
return(R)
}
wdir <- "C:/Users/batagelj/work/clamix/relC"
setwd(wdir)
a <- scan("C:/Users/batagelj/work/Delphi/Cluse/Cluse/data/RelCon/SomeTy.dis")
s <- length(a); n <- round((-1+sqrt(1+8*s))/2); nm <- n-1
D <- matrix(0, nrow=n, ncol=n); D[lower.tri(D,diag=TRUE)] <- a; D <- D+t(D)
netRel <- "C:/Users/batagelj/work/Delphi/Cluse/Cluse/data/RelCon/SomeTyXY.net"
R <- read_Pajek_net(netRel,3)
rownames(D) <- colnames(D) <- names(R); print(D)
DD <- as.dist(D); print(DD)
a b c d e f g h i j
a 0 5 7 4 6 6 2 4 2 3
b 5 0 8 1 3 4 4 5 3 4
c 7 8 0 4 5 7 9 3 2 5
d 4 1 4 0 3 2 4 8 6 3
e 6 3 5 3 0 6 4 6 5 7
f 6 4 7 2 6 0 6 8 5 7
g 2 4 9 4 4 6 0 4 8 2
h 4 5 3 8 6 8 4 0 3 4
i 2 3 2 6 5 5 8 3 0 5
j 3 4 5 3 7 7 2 4 5 0
a b c d e f g h i
b 5
c 7 8
d 4 1 4
e 6 3 5 3
f 6 4 7 2 6
g 2 4 9 4 4 6
h 4 5 3 8 6 8 4
i 2 3 2 6 5 5 8 3
j 3 4 5 3 7 7 2 4 5
===== Clustering =====
hRelClust <- function(D,method="max"){
orDendro <- function(i){if(i<0) return(-i)
return(c(orDendro(m[i,1]),orDendro(m[i,2])))}
numL <- nrow(D); numLm <- numL-1
# each unit is a cluster; compute inter-cluster dissimilarity matrix
diag(D) <- Inf
print(D); flush.console()
N <- rownames(D)
active <- 1:numL; m <- matrix(nrow=numLm,ncol=2)
node <- rep(0,numL); h <- numeric(numLm); w <- rep(1,numL)
for(k in 1:numLm){
# determine the closest pair of clusters (p,q)
ind <- active[sapply(active,function(i) which.min(D[i,active]))]
dd <- sapply(active,function(i) min(D[i,active]))
print(active); print(ind); print(dd)
pq <- which.min(dd)
# join the closest pair of clusters
p<-active[pq]; q <- ind[pq]; h[k] <- D[p,q]
cat(pq,'join ',p,'/',N[p],' and ',q,'/',N[q],' at level ',D[p,q],'\n')
if(node[p]==0) m[k,1] <- -p else m[k,1] <- node[p]
if(node[q]==0) m[k,2] <- -q else m[k,2] <- node[q]
active <- setdiff(active,p)
# determine dissimilarities to the new cluster
for(s in setdiff(active,q)){
if(method=="max") D[q,s] <- max(D[q,s],D[p,s]) else
if(method=="min") D[q,s] <- min(D[q,s],D[p,s]) else
if(method=="ward") { ww <- w[p]+w[q]+w[s]
D[q,s] <- ((w[q]+w[s])*D[q,s] + (w[p]+w[s])*D[p,s] - w[s]*h[k])/ww
} else {cat('unknown method','\n'); return(NULL)}
D[s,q] <- D[q,s]
}
w[q] <- w[q]+w[p]; node[[q]] <- k
}
hc <- list(merge=m,height=h,order=orDendro(numLm),labels=rownames(D),
method=method,call=NULL,dist.method=NULL,leaders=NULL)
class(hc) <- "hclust"
return(hc)
}
===== Test =====
==== Maximum ====
r <- hRelClust(D,method="max")
plot(r,hang=-1,main="max")
t <- hclust(DD,method="complete")
plot(t,hang=-1,main="complete")
a b c d e f g h i j
a Inf 5 7 4 6 6 2 4 2 3
b 5 Inf 8 1 3 4 4 5 3 4
c 7 8 Inf 4 5 7 9 3 2 5
d 4 1 4 Inf 3 2 4 8 6 3
e 6 3 5 3 Inf 6 4 6 5 7
f 6 4 7 2 6 Inf 6 8 5 7
g 2 4 9 4 4 6 Inf 4 8 2
h 4 5 3 8 6 8 4 Inf 3 4
i 2 3 2 6 5 5 8 3 Inf 5
j 3 4 5 3 7 7 2 4 5 Inf
[1] 1 2 3 4 5 6 7 8 9 10
[1] 7 4 9 2 2 4 1 3 1 7
[1] 2 1 2 1 3 2 2 3 2 2
2 join 2 / b and 4 / d at level 1
[1] 1 3 4 5 6 7 8 9 10
[1] 7 9 5 4 4 1 3 1 7
[1] 2 2 3 3 4 2 3 2 2
1 join 1 / a and 7 / g at level 2
[1] 3 4 5 6 7 8 9 10
[1] 9 5 4 4 10 3 3 7
[1] 2 3 3 4 3 3 2 3
1 join 3 / c and 9 / i at level 2
[1] 4 5 6 7 8 9 10
[1] 5 4 4 10 9 8 7
[1] 3 3 4 3 3 3 3
1 join 4 / d and 5 / e at level 3
[1] 5 6 7 8 9 10
[1] 6 5 10 9 8 7
[1] 6 6 3 3 3 3
3 join 7 / g and 10 / j at level 3
[1] 5 6 8 9 10
[1] 6 5 9 8 8
[1] 6 6 3 3 4
3 join 8 / h and 9 / i at level 3
[1] 5 6 9 10
[1] 6 5 5 5
[1] 6 6 8 7
1 join 5 / e and 6 / f at level 6
[1] 6 9 10
[1] 10 6 6
[1] 7 8 7
1 join 6 / f and 10 / j at level 7
[1] 9 10
[1] 10 9
[1] 9 9
1 join 9 / i and 10 / j at level 9
{{pro:pics:max.png}} {{pro:pics:complete.png}}
==== Ward ====
r <- hRelClust(D,method="ward")
plot(r,hang=-1,main="ward")
t <- hclust(DD,method="ward.D")
plot(t,hang=-1,main="ward.D")
a b c d e f g h i j
a Inf 5 7 4 6 6 2 4 2 3
b 5 Inf 8 1 3 4 4 5 3 4
c 7 8 Inf 4 5 7 9 3 2 5
d 4 1 4 Inf 3 2 4 8 6 3
e 6 3 5 3 Inf 6 4 6 5 7
f 6 4 7 2 6 Inf 6 8 5 7
g 2 4 9 4 4 6 Inf 4 8 2
h 4 5 3 8 6 8 4 Inf 3 4
i 2 3 2 6 5 5 8 3 Inf 5
j 3 4 5 3 7 7 2 4 5 Inf
[1] 1 2 3 4 5 6 7 8 9 10
[1] 7 4 9 2 2 4 1 3 1 7
[1] 2 1 2 1 3 2 2 3 2 2
2 join 2 / b and 4 / d at level 1
[1] 1 3 4 5 6 7 8 9 10
[1] 7 9 5 4 4 1 3 1 7
[1] 2.000000 2.000000 3.666667 3.666667 3.666667 2.000000 3.000000 2.000000
[9] 2.000000
1 join 1 / a and 7 / g at level 2
[1] 3 4 5 6 7 8 9 10
[1] 9 5 4 4 10 3 3 7
[1] 2.000000 3.666667 3.666667 3.666667 2.666667 3.000000 2.000000 2.666667
1 join 3 / c and 9 / i at level 2
[1] 4 5 6 7 8 9 10
[1] 5 4 4 10 9 8 7
[1] 3.666667 3.666667 3.666667 2.666667 3.333333 3.333333 2.666667
4 join 7 / g and 10 / j at level 2.666667
[1] 4 5 6 8 9 10
[1] 5 4 4 9 8 8
[1] 3.666667 3.666667 3.666667 3.333333 3.333333 4.833333
4 join 8 / h and 9 / i at level 3.333333
[1] 4 5 6 9 10
[1] 5 4 4 5 4
[1] 3.666667 3.666667 3.666667 6.666667 7.133333
1 join 4 / d and 5 / e at level 3.666667
[1] 5 6 9 10
[1] 6 5 6 6
[1] 4.833333 4.833333 8.666667 8.333333
1 join 5 / e and 6 / f at level 4.833333
[1] 6 9 10
[1] 10 10 6
[1] 10.40476 11.00000 10.40476
1 join 6 / f and 10 / j at level 10.40476
[1] 9 10
[1] 10 9
[1] 12.49524 12.49524
1 join 9 / i and 10 / j at level 12.49524
{{pro:pics:ward.png}} {{pro:pics:wardd.png}}
====== ======
\\
[[pro:relc|Back to Relational constraints]]