====== Clustering at Sunbelt XXXII ====== ===== Standard clustering of Counties ===== > setwd("E:/Data/counties/pajek2") > load("vars42org.RData") > load("vars42std.Rdata") > objects() [1] "U" "unitIDs" "unitNams" "V" > r <- hclust(d<-dist(U),method="ward") > pdf("DendroWard.pdf",width=58.5,height=41.5) > plot(r,hang=-1,cex=0.08,main="Ward / Free",lwd=0.01) > dev.off() > unitIDs[303] [1] 12086 > unitNams[303] [1] "Miami-Dade, FL" > unitIDs[is.na(unitIDs)] integer(0) > n <- nrow(U) > n [1] 3110 > out <- file("vars42.vec","w") > cat("*vertices 3110\n",file=out) > for(i in 1:n) cat(U[i,],"\n",file=out) > close(out) > ===== Clustering with relational constraint of Counties ===== The file ''vars42.vec'' is read into Pajek. It is used to compute dissimilarities used for clustering (''Maximum'' / ''Tolerant''). The hierarchy partition is saved as ''treeMaxTol.clu'' and the corresponding heights vector as ''heightMaxTol.vec''. ===== Reading Pajek's hierarchy partition into R ===== > Pajek2R <- function(cling){ + tree <- as.integer(read.csv(cling,header=FALSE,skip=1)$V1) + N <- length(tree); n <- (N+1) %/% 2 + merge <- matrix(0,nrow=(n-1),ncol=2) + for(i in 1:n) if(tree[i]>0){ + k <- tree[i]-n + if(merge[k,1]==0) merge[k,1] <- -i else merge[k,2] <- -i + } + for(i in (n+1):N) if(tree[i]>0){ + k <- tree[i]-n; j <- i-n + if(merge[k,1]==0) merge[k,1] <- j else merge[k,2] <- j + } + return(list(merge=merge,n=n)) + } > > orDendro <- function(i){if(i<0) return(-i) + return(c(orDendro(RC$merge[i,1]),orDendro(RC$merge[i,2])))} > orDendro <- function(m,i){if(i<0) return(-i) + return(c(orDendro(m,m[i,1]),orDendro(m,m[i,2])))} > > orSize <- function(m,i){if(i<0) return(1) + s[i] <<- orSize(m,m[i,1])+orSize(m,m[i,2]) + return(s[i])} ===== Analysis of the hierarchy partition in R ===== First we make the Pajek's hierachy data compatible with class ''hclust''. The counties network is not connected. **To do:** add relation ''closest'' that makes the network connected. We manually complete the hierarchy. > RM <- Pajek2R("treeMaxTol.clu") > HM <- read.csv("heightMaxTol.vec",header=FALSE,skip=3111)[[1]] > RM$height <- HM > RM$method <- "Maximum/Tolerant" > RM$dist.method <- "Euclidean" > RM$labels <- rownames(U) > class(RM) <- "hclust" > RM$call <- "Pajek.data" > source("E:\\Data\\counties\\pajek\\varCutree.R") > n <- 3110; nm <- n-1 > for(i in 3100:3109) cat(i,RM$merge[i,],"\n") 3100 -1232 3099 3101 -1825 3100 3102 0 0 3103 0 0 3104 0 0 3105 0 0 3106 0 0 3107 0 0 3108 0 0 3109 0 0 > a <- as.vector(RM$merge) > b <- a[a>0] > m <- min(which(RM$merge[,1]==0)) > c <- setdiff(1:(m-1),b) > e <- -a[a<0] > f <- setdiff(1:n,e) > RM$merge[3102,1] <- -1186; RM$merge[3102,2] <- 3101 > RM$merge[3103,1] <- -1192; RM$merge[3103,2] <- 3102 > RM$merge[3104,1] <- -1818; RM$merge[3104,2] <- 3103 > RM$merge[3105,1] <- -1824; RM$merge[3105,2] <- 3104 > RM$merge[3106,1] <- -1835; RM$merge[3106,2] <- 3105 > RM$merge[3107,1] <- -1837; RM$merge[3107,2] <- 3106 > RM$merge[3108,1] <- -1846; RM$merge[3108,2] <- 3107 > RM$merge[3109,1] <- -2949; RM$merge[3109,2] <- 3108 > max(HM) [1] 40.61588 > for (i in 3102:3109) RM$height[i] <- 42 > RM$order <- orDendro(RM$merge,nm) > pdf("dendMaxTol.pdf",width=58.5,height=41.5) > plot(RM,hang=-1,cex=0.08,main="Maximum/Tolerant",lwd=0.01) > dev.off() ===== Making partitions at selected level ===== I had some problems with the procedure ''cutree''. So, I wrote my own. hCutree <- function(R,h){ label <- function(node,nc){ left <- R$merge[node,1]; right <- R$merge[node,2] if(nc==0){ if(R$height[node] <= h){ C <<- C+1; nc <- C } } if(left <0){part[ -left] <<- nc} else label( left,nc) if(right<0){part[-right] <<- nc} else label(right,nc) } C <- 0; nc <- 0; nm <- nrow(R$merge); part <- rep(NA,nm+1) label(nm,nc) return(part) } To account for different space densities we replace the real heights with ranks. ==== Partition in 15 clusters + unclassified ==== > DM <- RM > hh <- RM$height > DM$height <- hh; DM$height <- derivedTree(DM,'rank') > DM$order <- orDendro(DM$merge,nm) > pM <- hCutree(DM,h=113) > fm <- table(pM) > > mali <- as.integer(names(fm)[fm < 20]) > ppM <- pM > ppM[ppM %in% mali] <- 0 > pdM <- as.integer(factor(as.character(ppM),levels=names(table(ppM)))) > table(pdM) pdM 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 427 47 61 28 167 151 109 510 648 60 23 146 43 28 53 609 > part <- pdM > save(part,file="MaxTolRankRookPart3.Rdata") > DM <- RM > DM$order <- orDendro(DM$merge,nm) > pdf("dendMaxTolRookBasic.pdf",width=58.5,height=41.5) > plot(DM,hang=-1,cex=0.08,main="Maximum/Tolerant/Rook/Basic",lwd=0.01) > dev.off() ==== Partition in 2 clusters ==== > pM <- hCutree(DM,h=17.94) > fm <- table(pM) > sort(fm) pM 1 3 2 0 5 4 3 4 6 21 1080 1996 > mali <- as.integer(names(fm)[fm < 20]) > ppM <- pM > ppM[ppM %in% mali] <- 0 > pdM <- as.integer(factor(as.character(ppM),levels=names(table(ppM)))) > table(pdM) pdM 1 2 3 34 1996 1080 > > part <- pdM > save(part,file="MaxTolRankBasicPart.Rdata") ===== Making maps ===== ==== 15 clusters ==== > load("../Pajek2/MaxTolRankRookPart3.Rdata") > col <- c("white","salmon1","blue","lightblue","lightpink","green4","orange","yellow", + "green","red","violet","sienna","orangered","darkorange3","deepskyblue","blueviolet") > clu <- rep(NA,length(UScou$NAME_1)) > for(v in 1:3110) {i <- q[p[v]]; if(!is.na(i)) clu[i] <- part[v]} > pdf("../pajek2/UScMaxTolRookRank3.pdf",width=11.7,height=8.3,paper="a4r") > plot(UScou,xlim=c(-124,-67),ylim=c(23,48),col=col[clu],bg="lightcyan",border="black",lwd=0.01) > plot(USsta,xlim=c(-124,-67),ylim=c(23,48),lwd=0.2,border="navy",add=TRUE) > # text(coordinates(UScou),labels=as.character(UScou$NAME_2),cex=0.1) > title("Central US"); dev.off() ==== 2 clusters ==== > load("../Pajek2/MaxTolRankbasicPart.Rdata") > col <- c("white","salmon1","blue","lightblue","lightpink","green4","orange","yellow", + "green","red","violet","sienna","orangered","darkorange3","deepskyblue","blueviolet") > clu <- rep(NA,length(UScou$NAME_1)) > for(v in 1:3110) {i <- q[p[v]]; if(!is.na(i)) clu[i] <- part[v]} > pdf("../pajek2/UScMaxTolRookBasic.pdf",width=11.7,height=8.3,paper="a4r") > plot(UScou,xlim=c(-124,-67),ylim=c(23,48),col=col[clu],bg="lightcyan",border="black",lwd=0.01) > plot(USsta,xlim=c(-124,-67),ylim=c(23,48),lwd=0.2,border="navy",add=TRUE) > text(coordinates(UScou),labels=as.character(UScou$NAME_2),cex=0.1) > title("Central US"); dev.off()