Clustering at Sunbelt XXXII

Clustering at Sunbelt XXXII

Standard clustering of Counties

> setwd("E:/Data/counties/pajek2")
> load("vars42org.RData")
> load("vars42std.Rdata")
> objects()
[1] "U"        "unitIDs"  "unitNams" "V"       
> r <- hclust(d<-dist(U),method="ward")
> pdf("DendroWard.pdf",width=58.5,height=41.5)
> plot(r,hang=-1,cex=0.08,main="Ward / Free",lwd=0.01)
> dev.off()
> unitIDs[303]
[1] 12086
> unitNams[303]
[1] "Miami-Dade, FL"
> unitIDs[is.na(unitIDs)]
integer(0)
> n <- nrow(U)
> n
[1] 3110
> out <- file("vars42.vec","w")
> cat("*vertices 3110\n",file=out)
> for(i in 1:n) cat(U[i,],"\n",file=out)
> close(out)
>

Clustering with relational constraint of Counties

The file vars42.vec is read into Pajek. It is used to compute dissimilarities used for clustering (Maximum / Tolerant). The hierarchy partition is saved as treeMaxTol.clu and the corresponding heights vector as heightMaxTol.vec.

Reading Pajek's hierarchy partition into R

> Pajek2R <- function(cling){
+   tree <- as.integer(read.csv(cling,header=FALSE,skip=1)$V1)
+   N <- length(tree); n <- (N+1) %/% 2
+   merge <- matrix(0,nrow=(n-1),ncol=2)
+   for(i in 1:n) if(tree[i]>0){
+     k <- tree[i]-n
+     if(merge[k,1]==0) merge[k,1] <- -i else merge[k,2] <- -i
+   }
+   for(i in (n+1):N) if(tree[i]>0){
+     k <- tree[i]-n; j <- i-n
+     if(merge[k,1]==0) merge[k,1] <- j else merge[k,2] <- j
+   }
+   return(list(merge=merge,n=n))
+ }
> 
> orDendro <- function(i){if(i<0) return(-i)
+   return(c(orDendro(RC$merge[i,1]),orDendro(RC$merge[i,2])))}
> orDendro <- function(m,i){if(i<0) return(-i)
+   return(c(orDendro(m,m[i,1]),orDendro(m,m[i,2])))}
> 
> orSize <- function(m,i){if(i<0) return(1)
+   s[i] <<- orSize(m,m[i,1])+orSize(m,m[i,2])
+   return(s[i])}

Analysis of the hierarchy partition in R

First we make the Pajek's hierachy data compatible with class hclust.

The counties network is not connected. To do: add relation closest that makes the network connected. We manually complete the hierarchy.

> RM <- Pajek2R("treeMaxTol.clu")
> HM <- read.csv("heightMaxTol.vec",header=FALSE,skip=3111)[[1]]
> RM$height <- HM
> RM$method <- "Maximum/Tolerant"
> RM$dist.method <- "Euclidean"
> RM$labels <- rownames(U)
> class(RM) <- "hclust"
> RM$call <- "Pajek.data"
> source("E:\\Data\\counties\\pajek\\varCutree.R") 
> n <- 3110; nm <- n-1
> for(i in 3100:3109) cat(i,RM$merge[i,],"\n")
3100 -1232 3099 
3101 -1825 3100 
3102 0 0 
3103 0 0 
3104 0 0 
3105 0 0 
3106 0 0 
3107 0 0 
3108 0 0 
3109 0 0 
> a <- as.vector(RM$merge)
> b <- a[a>0]
> m <- min(which(RM$merge[,1]==0))
> c <- setdiff(1:(m-1),b)
> e <- -a[a<0]
> f <- setdiff(1:n,e)
> RM$merge[3102,1] <- -1186; RM$merge[3102,2] <- 3101
> RM$merge[3103,1] <- -1192; RM$merge[3103,2] <- 3102
> RM$merge[3104,1] <- -1818; RM$merge[3104,2] <- 3103
> RM$merge[3105,1] <- -1824; RM$merge[3105,2] <- 3104
> RM$merge[3106,1] <- -1835; RM$merge[3106,2] <- 3105
> RM$merge[3107,1] <- -1837; RM$merge[3107,2] <- 3106
> RM$merge[3108,1] <- -1846; RM$merge[3108,2] <- 3107
> RM$merge[3109,1] <- -2949; RM$merge[3109,2] <- 3108
> max(HM)
[1] 40.61588
> for (i in 3102:3109) RM$height[i] <- 42
> RM$order <- orDendro(RM$merge,nm)
> pdf("dendMaxTol.pdf",width=58.5,height=41.5)
> plot(RM,hang=-1,cex=0.08,main="Maximum/Tolerant",lwd=0.01)
> dev.off()

Making partitions at selected level

I had some problems with the procedure cutree. So, I wrote my own.

hCutree <- function(R,h){
  label <- function(node,nc){
    left <- R$merge[node,1]; right <- R$merge[node,2]
    if(nc==0){
      if(R$height[node] <= h){
        C <<- C+1; nc <- C
      }
    }
    if(left <0){part[ -left] <<- nc} else label( left,nc) 
    if(right<0){part[-right] <<- nc} else label(right,nc) 
  }  
  C <- 0; nc <- 0; nm <- nrow(R$merge); part <- rep(NA,nm+1)
  label(nm,nc)
  return(part)
}

To account for different space densities we replace the real heights with ranks.

Partition in 15 clusters + unclassified

> DM <- RM
> hh <- RM$height
> DM$height <- hh; DM$height <- derivedTree(DM,'rank')
> DM$order <- orDendro(DM$merge,nm)
> pM <- hCutree(DM,h=113)
> fm <- table(pM)
> 
> mali <- as.integer(names(fm)[fm < 20]) 
> ppM <- pM
> ppM[ppM %in% mali] <- 0 
> pdM <- as.integer(factor(as.character(ppM),levels=names(table(ppM))))
> table(pdM)
pdM
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16 
427  47  61  28 167 151 109 510 648  60  23 146  43  28  53 609  
> part <- pdM
> save(part,file="MaxTolRankRookPart3.Rdata")
> DM <- RM
> DM$order <- orDendro(DM$merge,nm)
> pdf("dendMaxTolRookBasic.pdf",width=58.5,height=41.5)
> plot(DM,hang=-1,cex=0.08,main="Maximum/Tolerant/Rook/Basic",lwd=0.01)
> dev.off()

Partition in 2 clusters

> pM <- hCutree(DM,h=17.94)
> fm <- table(pM)
> sort(fm)
pM
   1    3    2    0    5    4 
   3    4    6   21 1080 1996 
> mali <- as.integer(names(fm)[fm < 20]) 
> ppM <- pM
> ppM[ppM %in% mali] <- 0 
> pdM <- as.integer(factor(as.character(ppM),levels=names(table(ppM))))
> table(pdM)
pdM
   1    2    3 
  34 1996 1080 
> 
> part <- pdM
> save(part,file="MaxTolRankBasicPart.Rdata")

Making maps

15 clusters

 
> load("../Pajek2/MaxTolRankRookPart3.Rdata") 
> col <- c("white","salmon1","blue","lightblue","lightpink","green4","orange","yellow",
+  "green","red","violet","sienna","orangered","darkorange3","deepskyblue","blueviolet")
> clu <- rep(NA,length(UScou$NAME_1))
> for(v in 1:3110) {i <- q[p[v]]; if(!is.na(i)) clu[i] <- part[v]} 
> pdf("../pajek2/UScMaxTolRookRank3.pdf",width=11.7,height=8.3,paper="a4r")
> plot(UScou,xlim=c(-124,-67),ylim=c(23,48),col=col[clu],bg="lightcyan",border="black",lwd=0.01)
> plot(USsta,xlim=c(-124,-67),ylim=c(23,48),lwd=0.2,border="navy",add=TRUE)
> # text(coordinates(UScou),labels=as.character(UScou$NAME_2),cex=0.1)
> title("Central US"); dev.off()

2 clusters

> load("../Pajek2/MaxTolRankbasicPart.Rdata")
> col <- c("white","salmon1","blue","lightblue","lightpink","green4","orange","yellow",
+  "green","red","violet","sienna","orangered","darkorange3","deepskyblue","blueviolet")
> clu <- rep(NA,length(UScou$NAME_1))
> for(v in 1:3110) {i <- q[p[v]]; if(!is.na(i)) clu[i] <- part[v]} 
> pdf("../pajek2/UScMaxTolRookBasic.pdf",width=11.7,height=8.3,paper="a4r")
> plot(UScou,xlim=c(-124,-67),ylim=c(23,48),col=col[clu],bg="lightcyan",border="black",lwd=0.01)
> plot(USsta,xlim=c(-124,-67),ylim=c(23,48),lwd=0.2,border="navy",add=TRUE)
> text(coordinates(UScou),labels=as.character(UScou$NAME_2),cex=0.1)
> title("Central US"); dev.off()