> P <- read.csv2("https://raw.githubusercontent.com/bavla/cluRC/master/data/US48.csv") > ZP <- scale(P) > d <- dist(ZP, method="euc") > d2 <- d^2 > single <- hclust(d = d2, method= "single") > plot(single, hang = -1, main = "single",cex=0.5) > ward <- hclust(d = d2, method = "ward.D") > plot(ward, hang = -1, main = "ward", cex=0.5)
> wardK5 <- cutree(ward, k = 5) > table(wardK5) wardK5 1 2 3 4 5 7 17 15 7 2 > for(i in 1:5) cat(i,":",rownames(P)[wardK5==i],"\n") 1 : AL AR LA MS NM TN SC 2 : AZ CA DE FL GA IL IN KS MI MO NC NV NY OH OK PA TX 3 : CO IA ID ME MN MT ND NE OR SD WY RI WI WA VT 4 : CT MA MD NH NJ UT VA 5 : KY WV > mP <- matrix(0,nrow=5,ncol=7) > for(i in 1:5) mP[i,] <- colMeans(P[wardK5==i,]) > rownames(mP) <- paste("C",1:5,sep="") > colnames(mP) <- colnames(P) > mP <- rbind(mP,colMeans(P)) > rownames(mP)[6] <- "all" > mP crime violent smoking drinking diabetes opioid income C1 8.785714 496.4471 0.2251429 0.1447143 0.11728571 10.85714 44631.43 C2 5.911765 427.9606 0.1826471 0.1713529 0.10482353 13.85294 53535.18 C3 2.633333 239.9893 0.1755333 0.2022667 0.08466667 10.76667 55908.33 C4 3.800000 300.9900 0.1521429 0.1698571 0.09028571 23.65714 69947.14 C5 4.900000 273.0150 0.2645000 0.1195000 0.12100000 33.50000 43727.50 all 4.956250 354.2346 0.1855833 0.1747500 0.09889583 14.70000 54963.08 > mZP <- matrix(0,nrow=5,ncol=7) > for(i in 1:5) mZP[i,] <- colMeans(ZP[wardK5==i,]) > rownames(mZP) <- paste("C",1:5,sep="") > colnames(mZP) <- colnames(P) > mZP crime violent smoking drinking diabetes opioid income C1 1.57226755 1.0924135 1.13630101 -0.9926501 1.2825773 -0.42285896 -1.1989851 C2 0.39230677 0.5663303 -0.08434105 -0.1122694 0.4134191 -0.09320836 -0.1657081 C3 -0.95372256 -0.8775811 -0.28867448 0.9093981 -0.9923940 -0.43281475 0.1096960 C4 -0.47472289 -0.4090012 -0.96053853 -0.1617040 -0.6005011 0.98562294 1.7388952 C5 -0.02309463 -0.6238926 2.26678886 -1.8259568 1.5416252 2.06870780 -1.3038858 > n <- 48 > clu <- file("US48ward5.clu","w") > cat(file=clu,c("*vertices ",n,"\n")) > for (i in 1:n) cat(file=clu,wardK5[i],"\n",append=TRUE) > close(clu)
> setwd("C:/Users/batagelj/Documents/books/BM2/chapters/cluster/nets/USA") > source("https://raw.githubusercontent.com/bavla/cluRC/master/readPajekNet.R") > netRel <- "https://raw.githubusercontent.com/bavla/cluRC/master/data/USA48xy.net" > R <- read_Pajek_net(netRel) > Ro <- R > Ri <- vector("list",length(R)); names(Ri)<-names(R) > for(i in 1:length(R)) for(j in R[[i]]) Ri[[j]] <- union(Ri[[j]],i) > # sRi <- Ri; sRo <- Ro > US48 <- "https://raw.githubusercontent.com/bavla/cluRC/master/data/US48.csv" > P <- read.table(US48,sep=";",head=TRUE) > ZP <- scale(P) > d <- dist(ZP, method="euc") > D <- d^2 > source("https://raw.githubusercontent.com/bavla/cluRC/master/relConD.R") > res <- relConD(D,strategy="tolerant") Clustering with relational constraint based on the class dist by Vladimir Batagelj, March 2018 Method: max Strategy: tolerant [1] "Started: 2018-07-16 03:00:46" [1] "Finished: 2018-07-16 03:00:46" > mtK6 <- cutree(res, k = 6) > table(mtK6) mtK6 1 2 3 4 5 6 9 17 13 6 2 1 > for(i in 1:6) cat(i,":",rownames(P)[mtK6==i],"\n") 1 : AL AR FL GA LA MS NC TN SC 2 : AZ CA DE IL IN MD MI MO NJ NM NV NY OH OK PA VA TX 3 : CO IA ID KS MN MT ND NE OR SD WY WI WA 4 : CT MA ME NH RI VT 5 : KY WV 6 : UT > n <- 48 > clu <- file("US48mt6.clu","w") > cat(file=clu,c("*vertices ",n,"\n")) > for (i in 1:n) cat(file=clu,mtK6[i],"\n",append=TRUE) > close(clu) > plot(res,hang=-1,main="US states",sub="dist: tolerant / maximum",cex=0.5)
> clustAna <- function(P,clu,all=FALSE){ + k <- max(clu); nVar <- ncol(P) + mP <- matrix(0,nrow=k,ncol=nVar) + for(i in 1:k) mP[i,] <- colMeans(P[clu==i,,drop=FALSE]) + rownames(mP) <- paste("C",1:k,sep=""); colnames(mP) <- colnames(P) + if(all){mP <- rbind(mP,colMeans(P)); rownames(mP)[k+1] <- "all"} + return(mP) + } > source("https://raw.githubusercontent.com/bavla/cluRC/master/clustAna.R") > clustAna(P,mtK6,all=TRUE) crime violent smoking drinking diabetes opioid income C1 8.166667 461.9956 0.2140000 0.1487778 0.11600000 10.788889 46104.33 C2 5.970588 425.9129 0.1803529 0.1719412 0.10323529 15.794118 57054.47 C3 2.838462 265.2454 0.1764615 0.2005385 0.08515385 7.407692 55913.77 C4 2.383333 234.3067 0.1660000 0.1931667 0.08800000 26.716667 62751.83 C5 4.900000 273.0150 0.2645000 0.1195000 0.12100000 33.500000 43727.50 C6 1.900000 204.7200 0.0970000 0.1210000 0.07100000 16.400000 62518.00 all 4.956250 354.2346 0.1855833 0.1747500 0.09889583 14.700000 54963.08 > clustAna(ZP,mtK6) crime violent smoking drinking diabetes opioid income C1 1.31810446 0.8277718 0.8162355 -0.85835575 1.1929069 -0.4303695 -1.0280550 C2 0.41645801 0.5506012 -0.1502369 -0.09282878 0.3026498 0.1203941 0.2427048 C3 -0.86950284 -0.6835754 -0.2620129 0.85228264 -0.9584163 -0.8024284 0.1103268 C4 -1.05636535 -0.9212328 -0.5625083 0.60865226 -0.7599151 1.3222857 0.9038819 C5 -0.02309463 -0.6238926 2.2667889 -1.82595677 1.5416252 2.0687078 -1.3038858 C6 -1.25480807 -1.1485044 -2.5444525 -1.77638329 -1.9455571 0.1870640 0.8767456
# colDiss() # Color plots of a dissimilarity matrix, without and with ordering # http://ichthyology.usm.edu/courses/multivariate/coldiss.R # # License: GPL-2 # Author: Francois Gillet, August 2009 # addapted by: Vladimir Batagelj, July 16, 2018 # "colDiss" <- function(D, nc = 4, byrank = TRUE, diag = FALSE, order = NULL, ...) { require(gclus) D <- D/max(D) spe.color <- dmat.color(1-D, byrank=byrank, cm.colors(nc)) if(is.null(order)) spe.o <- order.single(1-D) else spe.o <- order speo.color <- spe.color[spe.o,spe.o] if (diag) {plotcolors(speo.color, rlabels=attributes(D)$Labels[spe.o], dlabels=attributes(D)$Labels[spe.o], ...) } else plotcolors(speo.color, rlabels=attributes(D)$Labels[spe.o], ...) } # colDiss(D,label.cex=0.5,order=1:48,main="Unordered Dissimilarity Matrix") # colDiss(D,label.cex=0.5,order=ward$order,main="Ordered Dissimilarity Matrix/Ward")
> source("https://raw.githubusercontent.com/bavla/cluRC/master/coldiss.R") > colDiss(D,label.cex=0.5,order=1:48,main="Unordered Dissimilarity Matrix") > colDiss(D,label.cex=0.5,order=ward$order,main="Ordered Dissimilarity Matrix / Ward")
> names(res) [1] "merge" "height" "order" "labels" "method" [6] "call" "dist.method" "leaders" > res$order [1] 5 11 35 46 14 10 27 24 37 42 21 26 45 39 12 13 20 33 3 31 30 22 34 41 4 7 36 [28] 29 32 18 40 19 6 17 43 28 48 15 47 8 9 25 44 2 38 16 1 23 > colDiss(D,label.cex=0.5,order=res$order,main="Ordered Dissimilarity Matrix / max-tolerant")
> wdir <- "C:/Users/batagelj/Documents/books/BM2/chapters/cluster/nets/USA" > setwd(wdir) > library(maptools) > USsta <- readShapeSpatial("./shape/gadm36_USA_1.shp") > plot(USsta,xlim=c(-124,-67),ylim=c(23,48),lwd=0.2,border="black") > names(USsta) [1] "GID_0" "NAME_0" "GID_1" "NAME_1" "VARNAME_1" "NL_NAME_1" [7] "TYPE_1" "ENGTYPE_1" "CC_1" "HASC_1" > Sn <- as.character(USsta$VARNAME_1) > head(Sn) [1] "AL|Ala." "AK|Alaska" "AZ|Ariz." "AR|Ark." "CA|Calif." "CO|Colo." > Sname <- sapply(strsplit(Sn,"|",fixed=TRUE),function(x) x[1]) > Sn[nchar(Sname)!=2] [1] "Commonwealth of Kentucky|KY" [2] "Commonwealth of Massachusetts|MA|Mass." [3] "Commonwealth of Pennsylvania|PA" [4] "State of Rhode Island and Providence Plantations|RI|R.I." > which(nchar(Sname)!=2) [1] 18 22 39 40 > Sname[c(18,22,39,40)] <- c("KY","MA","PA","RI") > US48 <- "https://raw.githubusercontent.com/bavla/cluRC/master/data/US48.csv" > P <- read.table(US48,sep=";",head=TRUE) > Pname <- rownames(P) > clu <- read.csv("US48mt6.clu",header=FALSE,skip=1)[[1]] > p <- match(Sname,Pname) > cp <- clu[p]; cp[is.na(cp)] <- 7 > col <- c("red","blue","magenta","green","yellow","cyan","black") > plot(USsta,xlim=c(-124,-67),ylim=c(23,48),lwd=0.2,border="black",col=col[cp]) > title("Central US - Maximum/Tolerant") > pdf("US48map.pdf",width=11.7,height=8.3,paper="a4r") > plot(USsta,xlim=c(-124,-67),ylim=c(23,48),lwd=0.2,border="black",col=col[cp]) > title("Central US - Maximum/Tolerant"); dev.off()
> wdir <- "C:/Users/batagelj/Documents/books/BM2/chapters/cluster/nets/USA" > setwd(wdir) > png("US48map.png",width=2000,height=1419) > plot(USsta,xlim=c(-124,-67),ylim=c(23,48),lwd=0.2,border="black",col=col[cp]) > title("Central US - Maximum/Tolerant"); dev.off()
> col <- c("gray48","gray76","gray56","gray66","gray84","gray92","black") > col <- c("red2","steelblue2","orchid3","goldenrod3","khaki1","darkolivegreen1","black") > col <- c("khaki1","lightgreen","lightpink","lightcyan","lightsalmon","lightblue","black")
To adapt the network partition picture to black and white printing I marked each node also with a Unicode symbol representing the corresponding partition.
read network US48xy read partition US48clu select partition as Second partition Draw/Network + First Partition Options/Mark Vertices using/Cluster Symbols of Second Partition [On] Options/Symbols for Partition Clusters/Change [⌘⃝⃞⃟✲✩❖■●◆★✿∆∇✾☺❉☼♀♂⌂] set Export/Options Export/2D/SVG/General
I saved the settings on US48uni.ini
.
I selected as symbols for clusters 1, 2 and 3 Unicode characters Combining Enclosed (Circle, Square, Diamond)
that are on some web browsers displayed overprinted. They function OK in SVG/PDF pictures.