====== Clustering ======
8. May 2011
===== Internal structures of clustering procedures in R =====
{{:notes:da:pics:units.png?260}}
{{:notes:da:pics:dendro.png?260}}
> x <- c(0,0,1,2,3)
> y <- c(2,3,1,3,4)
> U <- cbind(x,y)
> r <- hclust(d<-dist(U))
> names(r)
[1] "merge" "height" "order" "labels" "method" "call" "dist.method"
> plot(r,hang=-1)
> r$merge
[,1] [,2]
[1,] -1 -2
[2,] -4 -5
[3,] -3 1
[4,] 2 3
> plot(x,y,pch=20,cex=1.5); text(x+0.1,y,1:5)
===== Procedure for alternative cutting of clusters =====
varCutree <- function(R,var,vmin,vmax){
mark <- function(t,c){
if(t<0) part[-t] <<- c else {mark(R$merge[t,1],c); mark(R$merge[t,2],c)}
}
n <- length(var); part <- rep(NA,n); nclust <- 0
value <- cbind(var,0)
for(i in 1:(n-1)){
j <- R$merge[i,1]; a <- ifelse(j<0,value[-j,1],value[j,2])
j <- R$merge[i,2]; b <- ifelse(j<0,value[-j,1],value[j,2])
value[i,2] <- a+b
}
value[n,2] <- 0
for(i in 1:(n-1)){
if(value[i,2]<=vmax) next
l <- R$merge[i,1]; a <- ifelse(l<0,value[-l,1],value[l,2])
r <- R$merge[i,2]; b <- ifelse(r<0,value[-r,1],value[r,2])
if(min(a,b)>vmax) next
if(a<=vmax) if(a>=vmin) {nclust <- nclust+1; mark(l,nclust)} else mark(l,0)
if(b<=vmax) if(b>=vmin) {nclust <- nclust+1; mark(r,nclust)} else mark(r,0)
}
return(list(part=part,value=value))
}
{{:notes:da:pics:units12.png?350}}
{{:notes:da:pics:dendro12.png?350}}
> x <- c(0,0,1,2,3,3,0,5,7,9,7,9)
> y <- c(2,3,1,3,4,3,9,8,6,5,3,8)
> U <- cbind(x,y)
> plot(x,y,pch=20,cex=1.5,xlim=c(0,10),main="Units"); text(x+0.3,y,1:12)
> r <- hclust(dist(U))
> plot(r,hang=-1)
> rr <- varCutree(r,rep(1,12),2,5)
> rr$part
[1] 2 2 2 1 1 1 0 3 3 3 3 3
> table(rr$part)
0 1 2 3
1 3 3 5
> (rc <- cutree(r,k=4))
[1] 1 1 1 1 1 1 2 3 4 4 4 3
> table(rc)
1 2 3 4
6 1 2 3
===== Improved version of procedure for incomplete clusterings =====
varCutree <- function(R,var,vmin,vmax){
mark <- function(t,c){
if(t<0) part[-t] <<- c else {mark(R$merge[t,1],c); mark(R$merge[t,2],c)}
}
n <- length(var); part <- rep(999999,n); nclust <- 0
value <- cbind(var,rep(0,n))
for(i in 1:(n-1)){
j <- R$merge[i,1]; if(j==0) next
a <- ifelse(j<0,value[-j,1],value[j,2])
j <- R$merge[i,2]; if(j==0) next
b <- ifelse(j<0,value[-j,1],value[j,2])
value[i,2] <- a+b
}
value[n,2] <- 0
for(i in 1:(n-1)){
if(value[i,2]<=vmax) next
l <- R$merge[i,1]; r <- R$merge[i,2]
if(l==0) a <- 0 else a <- ifelse(l<0,value[-l,1],value[l,2])
if(r==0) b <- 0 else b <- ifelse(r<0,value[-r,1],value[r,2])
if(min(a,b)>vmax) next
if(a<=vmax) if(a>=vmin) {nclust <- nclust+1; mark(l,nclust)} else mark(l,0)
if(b<=vmax) if(b>=vmin) {nclust <- nclust+1; mark(r,nclust)} else mark(r,0)
}
return(list(part=part,value=value))
}
===== Transforming Pajek's hierarchical clusterings into R's =====
tree <- as.integer(read.csv("./cluster/AvgTol.clu",header=FALSE,skip=1)$V1)
N <- length(tree); n <- (N+1) %/% 2
merge <- matrix(0,nrow=(n-1),ncol=2)
for(i in 1:n) if(tree[i]>0){
k <- tree[i]-n
if(merge[k,1]==0) merge[k,1] <- -i else merge[k,2] <- -i
}
for(i in (n+1):N) if(tree[i]>0){
k <- tree[i]-n; j <- i-n
if(merge[k,1]==0) merge[k,1] <- j else merge[k,2] <- j
}
> RC <- list(merge=merge,n=n)
> rC <- varCutree(RC,rep(1,3110),10,800)
> table(rC$part)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
729 794 600 130 97 17 13 10 337 40 20 12 25 19 11 32 116 42 10 18 16 14
> out <- file("CRc.clu","w"); cat("*vertices 3110",rC$part,sep="\n",file=out); close(out)
===== Derived dendrograms =====
rankTree <- function(R){
nm <- length(R$height)
rank <- rep(0,nm)
for (i in 1:nm){
j <- R$merge[i,1]; a <- ifelse(j<0,0,rank[j])
j <- R$merge[i,2]; b <- ifelse(j<0,0,rank[j])
rank[i] <- 1+max(a,b)
}
return(rank)
}
countTree <- function(R){
nm <- length(R$height)
count <- rep(1,nm)
for (i in 1:nm){
j <- R$merge[i,1]; a <- ifelse(j<0,1,count[j])
j <- R$merge[i,2]; b <- ifelse(j<0,1,count[j])
count[i] <- a+b
}
return(count)
}
errorTree <- function(R){
nm <- length(R$height)
error <- rep(0,nm)
for (i in 1:nm){
j <- R$merge[i,1]; a <- ifelse(j<0,0,error[j])
j <- R$merge[i,2]; b <- ifelse(j<0,0,error[j])
error[i] <- a+b+R$height[i]
}
return(error)
}
> T = matrix( c( 1, 3, 2, 2, 3, 3, 3, 4, 4, 1, 4, 12, 6, 15, 8, 12,
10, 9, 11, 13, 12, 4, 14, 2, 14, 11, 15, 4, 16, 2 ), ncol=2, byrow=TRUE)
> rownames(T) <- letters[1:nrow(T)]; colnames(T) <- c('X','Y')
> plot(T,main='Example 1'); text(T[,1]+0.3,T[,2]-0.4,rownames(T))
> D <- dist(T)
> t <- hclust(D,method="ward")
> plot(t,hang=-1,cex=0.4,main="Example 1/Ward")
> H <- t$height; N <- rankTree(t); t$height <- N
> plot(t,hang=-1,cex=0.4,main="Example 1/Ward/rank")
> t$height <- H
> E <- errorTree(t); t$height <- E
> plot(t,hang=-1,cex=0.4,main="Example 1/Ward/P")
> t$height <- log(E)
> plot(t,hang=-1,cex=0.4,main="Example 1/Ward/log P")
> t$height <- sqrt(E)
> plot(t,hang=-1,cex=0.4,main="Example 1/Ward/sqrt P")
===== All three procedures in one =====
derivedTree <- function(R,type='rank'){
if (type == 'total') { c <- 0; ex <- expression(a+b+R$height[i]) }
else if (type == 'count') { c <- 1; ex <- expression(a+b) }
else { c <- 0; ex <- expression(1+max(a,b)) }
nm <- length(R$height)
h <- rep(c,nm)
for (i in 1:nm){
j <- R$merge[i,1]; a <- ifelse(j<0,c,h[j])
j <- R$merge[i,2]; b <- ifelse(j<0,c,h[j])
h[i] <- eval(ex)
}
return(h)
}
===== Example =====
> T = matrix( c( 0, 5, 1, 1, 1, 3, 1, 4, 2, 2, 2, 15,
+ 3, 1, 3, 2, 4, 10, 6, 14, 8, 1, 8, 9, 9, 3,
+ 10, 13, 11, 2, 11, 5, 13, 2, 14, 4, 14, 15, 15, 0 ),
+ ncol=2, byrow=TRUE)
> T <- T+1
> rownames(T) <- letters[1:nrow(T)]; colnames(T) <- c('X','Y')
> plot(T,main='Example',col="red",pch=16,cex=1.5)
> text(T[,1]+0.3,T[,2]-0.4,rownames(T))
> D <- dist(T)
> t <- hclust(D,method="ward")
> plot(t,hang=-1,cex=1.0,main="Example/Ward")
> H <- t$height; t$height <- derivedTree(t,'count')
> plot(t,hang=-1,cex=1.0,main="Example/Ward/count")
> t$height <- H; t$height <- derivedTree(t,'rank')
> plot(t,hang=-1,cex=1.0,main="Example/Ward/rank")
> t$height <- H; t$height <- derivedTree(t,'total')
> plot(t,hang=-1,cex=1.0,main="Example/Ward/total")
{{:notes:da:pics:ex.png}}
{{:notes:da:pics:exward.png?300}} {{:notes:da:pics:extotal.png?300}}
{{:notes:da:pics:exrank.png?300}} {{:notes:da:pics:excount.png?300}}