Clustering of US states

Clustering of US states

US 2016 48 states

Ordinary Clustering

> P <- read.csv2("https://raw.githubusercontent.com/bavla/cluRC/master/data/US48.csv")
> ZP <- scale(P)
> d <- dist(ZP, method="euc")
> d2 <- d^2
> single <- hclust(d = d2, method= "single")
> plot(single, hang = -1, main = "single",cex=0.5)
> ward <- hclust(d = d2, method = "ward.D")
> plot(ward, hang = -1, main = "ward", cex=0.5)

> wardK5 <- cutree(ward, k = 5)
> table(wardK5)
wardK5
 1  2  3  4  5
 7 17 15  7  2
> for(i in 1:5) cat(i,":",rownames(P)[wardK5==i],"\n")
1 : AL AR LA MS NM TN SC
2 : AZ CA DE FL GA IL IN KS MI MO NC NV NY OH OK PA TX
3 : CO IA ID ME MN MT ND NE OR SD WY RI WI WA VT
4 : CT MA MD NH NJ UT VA
5 : KY WV
> mP <- matrix(0,nrow=5,ncol=7)
> for(i in 1:5) mP[i,] <- colMeans(P[wardK5==i,])
> rownames(mP) <- paste("C",1:5,sep="")
> colnames(mP) <- colnames(P)
> mP <- rbind(mP,colMeans(P))
> rownames(mP)[6] <- "all"
> mP
       crime  violent   smoking  drinking   diabetes   opioid   income
C1  8.785714 496.4471 0.2251429 0.1447143 0.11728571 10.85714 44631.43
C2  5.911765 427.9606 0.1826471 0.1713529 0.10482353 13.85294 53535.18
C3  2.633333 239.9893 0.1755333 0.2022667 0.08466667 10.76667 55908.33
C4  3.800000 300.9900 0.1521429 0.1698571 0.09028571 23.65714 69947.14
C5  4.900000 273.0150 0.2645000 0.1195000 0.12100000 33.50000 43727.50
all 4.956250 354.2346 0.1855833 0.1747500 0.09889583 14.70000 54963.08
> mZP <- matrix(0,nrow=5,ncol=7)
> for(i in 1:5) mZP[i,] <- colMeans(ZP[wardK5==i,])
> rownames(mZP) <- paste("C",1:5,sep="")
> colnames(mZP) <- colnames(P)
> mZP
         crime    violent     smoking   drinking   diabetes      opioid     income
C1  1.57226755  1.0924135  1.13630101 -0.9926501  1.2825773 -0.42285896 -1.1989851
C2  0.39230677  0.5663303 -0.08434105 -0.1122694  0.4134191 -0.09320836 -0.1657081
C3 -0.95372256 -0.8775811 -0.28867448  0.9093981 -0.9923940 -0.43281475  0.1096960
C4 -0.47472289 -0.4090012 -0.96053853 -0.1617040 -0.6005011  0.98562294  1.7388952
C5 -0.02309463 -0.6238926  2.26678886 -1.8259568  1.5416252  2.06870780 -1.3038858

> n <- 48
> clu <- file("US48ward5.clu","w")
> cat(file=clu,c("*vertices ",n,"\n"))
> for (i in 1:n) cat(file=clu,wardK5[i],"\n",append=TRUE)
> close(clu)

Clustering with RC / tolerant

> setwd("C:/Users/batagelj/Documents/books/BM2/chapters/cluster/nets/USA")
> source("https://raw.githubusercontent.com/bavla/cluRC/master/readPajekNet.R")
> netRel <- "https://raw.githubusercontent.com/bavla/cluRC/master/data/USA48xy.net"
> R <- read_Pajek_net(netRel)
> Ro <- R
> Ri <- vector("list",length(R)); names(Ri)<-names(R)
> for(i in 1:length(R)) for(j in R[[i]]) Ri[[j]] <- union(Ri[[j]],i)
> # sRi <- Ri; sRo <- Ro
> US48 <- "https://raw.githubusercontent.com/bavla/cluRC/master/data/US48.csv"
> P <- read.table(US48,sep=";",head=TRUE)
> ZP <- scale(P)
> d <- dist(ZP, method="euc")
> D <- d^2
> source("https://raw.githubusercontent.com/bavla/cluRC/master/relConD.R")
> res <- relConD(D,strategy="tolerant")
Clustering with relational constraint based on the class dist
by Vladimir Batagelj, March 2018
Method: max   Strategy: tolerant 
[1] "Started: 2018-07-16 03:00:46"
[1] "Finished: 2018-07-16 03:00:46"
> mtK6 <- cutree(res, k = 6)
> table(mtK6)
mtK6
 1  2  3  4  5  6
 9 17 13  6  2  1
> for(i in 1:6) cat(i,":",rownames(P)[mtK6==i],"\n")
1 : AL AR FL GA LA MS NC TN SC
2 : AZ CA DE IL IN MD MI MO NJ NM NV NY OH OK PA VA TX
3 : CO IA ID KS MN MT ND NE OR SD WY WI WA
4 : CT MA ME NH RI VT
5 : KY WV
6 : UT
> n <- 48
> clu <- file("US48mt6.clu","w")
> cat(file=clu,c("*vertices ",n,"\n"))
> for (i in 1:n) cat(file=clu,mtK6[i],"\n",append=TRUE)
> close(clu)
> plot(res,hang=-1,main="US states",sub="dist: tolerant / maximum",cex=0.5)

> clustAna <- function(P,clu,all=FALSE){
+   k <- max(clu); nVar <- ncol(P)
+   mP <- matrix(0,nrow=k,ncol=nVar)
+   for(i in 1:k) mP[i,] <- colMeans(P[clu==i,,drop=FALSE])
+   rownames(mP) <- paste("C",1:k,sep=""); colnames(mP) <- colnames(P)
+   if(all){mP <- rbind(mP,colMeans(P)); rownames(mP)[k+1] <- "all"}
+   return(mP)
+ }

> source("https://raw.githubusercontent.com/bavla/cluRC/master/clustAna.R")
> clustAna(P,mtK6,all=TRUE)
       crime  violent   smoking  drinking   diabetes    opioid   income
C1  8.166667 461.9956 0.2140000 0.1487778 0.11600000 10.788889 46104.33
C2  5.970588 425.9129 0.1803529 0.1719412 0.10323529 15.794118 57054.47
C3  2.838462 265.2454 0.1764615 0.2005385 0.08515385  7.407692 55913.77
C4  2.383333 234.3067 0.1660000 0.1931667 0.08800000 26.716667 62751.83
C5  4.900000 273.0150 0.2645000 0.1195000 0.12100000 33.500000 43727.50
C6  1.900000 204.7200 0.0970000 0.1210000 0.07100000 16.400000 62518.00
all 4.956250 354.2346 0.1855833 0.1747500 0.09889583 14.700000 54963.08
> clustAna(ZP,mtK6)
         crime    violent    smoking    drinking   diabetes     opioid     income
C1  1.31810446  0.8277718  0.8162355 -0.85835575  1.1929069 -0.4303695 -1.0280550
C2  0.41645801  0.5506012 -0.1502369 -0.09282878  0.3026498  0.1203941  0.2427048
C3 -0.86950284 -0.6835754 -0.2620129  0.85228264 -0.9584163 -0.8024284  0.1103268
C4 -1.05636535 -0.9212328 -0.5625083  0.60865226 -0.7599151  1.3222857  0.9038819
C5 -0.02309463 -0.6238926  2.2667889 -1.82595677  1.5416252  2.0687078 -1.3038858
C6 -1.25480807 -1.1485044 -2.5444525 -1.77638329 -1.9455571  0.1870640  0.8767456

Visualization of dissimilarity matrix

# colDiss()
# Color plots of a dissimilarity matrix, without and with ordering
# http://ichthyology.usm.edu/courses/multivariate/coldiss.R
#
# License: GPL-2 
# Author: Francois Gillet, August 2009
# addapted by: Vladimir Batagelj, July 16, 2018
#

"colDiss" <- function(D, nc = 4, byrank = TRUE, diag = FALSE, order = NULL, ...)
{
  require(gclus)
  D <- D/max(D)
  spe.color <- dmat.color(1-D, byrank=byrank, cm.colors(nc))
  if(is.null(order)) spe.o <- order.single(1-D) else spe.o <- order
  speo.color <- spe.color[spe.o,spe.o]	
  if (diag) {plotcolors(speo.color, rlabels=attributes(D)$Labels[spe.o],  
    dlabels=attributes(D)$Labels[spe.o], ...)
  } else plotcolors(speo.color, rlabels=attributes(D)$Labels[spe.o], ...)
}

# colDiss(D,label.cex=0.5,order=1:48,main="Unordered Dissimilarity Matrix")
# colDiss(D,label.cex=0.5,order=ward$order,main="Ordered Dissimilarity Matrix/Ward")

> source("https://raw.githubusercontent.com/bavla/cluRC/master/coldiss.R")
> colDiss(D,label.cex=0.5,order=1:48,main="Unordered Dissimilarity Matrix")
> colDiss(D,label.cex=0.5,order=ward$order,main="Ordered Dissimilarity Matrix / Ward")

> names(res)
[1] "merge"       "height"      "order"       "labels"      "method"
[6] "call"        "dist.method" "leaders"
> res$order
 [1]  5 11 35 46 14 10 27 24 37 42 21 26 45 39 12 13 20 33  3 31 30 22 34 41  4  7 36
[28] 29 32 18 40 19  6 17 43 28 48 15 47  8  9 25 44  2 38 16  1 23
> colDiss(D,label.cex=0.5,order=res$order,main="Ordered Dissimilarity Matrix / max-tolerant")

I selected as symbols for clusters 1, 2 and 3 Unicode characters Combining Enclosed (Circle, Square, Diamond) that are on some web browsers displayed overprinted. They function OK in SVG/PDF pictures.

Clustering of US states

Ordinary Clustering

Clustering with RC / tolerant

Visualization of dissimilarity matrix

Clustering on the map