====== Clustering notes ====== ===== Iris ===== > help(iris) > names(iris) [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species" > D <- iris[,1:4] > head(D) Sepal.Length Sepal.Width Petal.Length Petal.Width 1 5.1 3.5 1.4 0.2 2 4.9 3.0 1.4 0.2 3 4.7 3.2 1.3 0.2 4 4.6 3.1 1.5 0.2 5 5.0 3.6 1.4 0.2 6 5.4 3.9 1.7 0.4 > S <- apply(D,2,z) > head(S) Sepal.Length Sepal.Width Petal.Length Petal.Width [1,] -0.8976739 1.01560199 -1.335752 -1.311052 [2,] -1.1392005 -0.13153881 -1.335752 -1.311052 [3,] -1.3807271 0.32731751 -1.392399 -1.311052 [4,] -1.5014904 0.09788935 -1.279104 -1.311052 [5,] -1.0184372 1.24503015 -1.335752 -1.311052 [6,] -0.5353840 1.93331463 -1.165809 -1.048667 > n <- function(x) (x - min(x))/(max(x) - min(x)) > N <- apply(D,2,n) > head(N) Sepal.Length Sepal.Width Petal.Length Petal.Width [1,] 0.22222222 0.6250000 0.06779661 0.04166667 [2,] 0.16666667 0.4166667 0.06779661 0.04166667 [3,] 0.11111111 0.5000000 0.05084746 0.04166667 [4,] 0.08333333 0.4583333 0.08474576 0.04166667 [5,] 0.19444444 0.6666667 0.06779661 0.04166667 [6,] 0.30555556 0.7916667 0.11864407 0.12500000 > t <- hclust(dist(S)) > plot(t,hang=-1,cex=0.1,lwd=0.2,main="Iris") > help(hclust) > t <- hclust(dist(S),method="ward.D2") > plot(t,hang=-1,cex=0.1,lwd=0.2,main="Iris") > rect.hclust(t,k=5,border="red") > pdf("iris.pdf",width=11.7,height=8.3,paper="a4r") > plot(t,hang=-1,cex=0.1,lwd=0.2,main="Iris") > rect.hclust(t,k=5,border="red") > dev.off() > p <- cutree(t,k=5) > iris$Species[p==1] [1] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa [12] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa [23] setosa setosa setosa setosa setosa setosa setosa Levels: setosa versicolor virginica > C <- iris$Species[p==1] > table(C) C setosa versicolor virginica 29 0 0 > C <- iris$Species[p==2] > table(C) C setosa versicolor virginica 20 0 0 > C <- iris$Species[p==3] > table(C) C setosa versicolor virginica 1 27 2 > C <- iris$Species[p==4] > table(C) C setosa versicolor virginica 0 23 22 > C <- iris$Species[p==5] > table(C) C setosa versicolor virginica 0 0 26 > for(i in 1:5){C <- iris$Species[p==i]; cat("C",i,table(C),"\n")} C 1 29 0 0 C 2 20 0 0 C 3 1 27 2 C 4 0 23 22 C 5 0 0 26 > > library(cluster) > r <- agnes(dist(S),method="ward") > plot(r,hang=-1,which.plots=2,main="iris",cex=0.2) > library(factoextra) > fviz_dend(r, cex = 0.2) > fviz_dend(r, cex = 0.2, horiz = TRUE) > fviz_dend(r, k=4, cex=0.2, color_labels_by_k=TRUE, rect=TRUE, + k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"), + rect_border = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07")) > fviz_dend(r, cex = 0.2, type = "circular") Error in match.arg(type) : 'arg' should be one of “rectangle”, “triangle” ===== Places ===== http://www.stat.nthu.edu.tw/~swcheng/Teaching/stat5191/assignment/assignment2.html > setwd("C:/Users/batagelj/Documents/papers/2017/Moscow/sources") > help(read.table) > T <- read.table("places.txt",header=TRUE,row.names=1) > names(T) [1] "Climate" "HousingCost" "HlthCare" "Crime" "Transp" "Educ" "Arts" [8] "Recreat" "Econ" "CaseNum" "Long" "Lat" "Pop" "StNum" > head(T) > dim(T) [1] 329 14 > D <- T[,1:8] > dim(D) [1] 329 8 > head(D) Climate HousingCost HlthCare Crime Transp Educ Arts Recreat Abilene,TX 521 6200 237 923 4031 2757 996 1405 Akron,OH 575 8138 1656 886 4883 2438 5564 2632 Albany,GA 468 7339 618 970 2531 2560 237 859 Albany-Schenectady-Troy,NY 476 7908 1431 610 6883 3399 4655 1617 Albuquerque,NM 659 8393 1853 1483 6558 3026 4496 2612 Alexandria,LA 520 5819 640 727 2444 2972 334 1018 > pairs(D) > z <- function(x){(x-mean(x))/sd(x)} > Q <- apply(D,2,z) > head(Q) > R <- scale(D) > head(R) > d <- dist(Q) > t <-hclust(d) > plot(t,hang=-1,cex=0.1,lwd=0.2,main="Places") > pdf("places.pdf",width=11.7,height=8.3,paper="a4r") > plot(t,hang=-1,cex=0.1,lwd=0.2,main="Places") > dev.off() > s <- kmeans(Q,centers=10,iter.max=30) > names(s) [1] "cluster" "centers" "totss" "withinss" "tot.withinss" [6] "betweenss" "size" "iter" "ifault" > ps <- s$cluster > table(ps) ps 1 2 3 4 5 6 7 8 9 10 76 24 7 17 30 18 1 68 34 54 > s$centers Climate HousingCost HlthCare Crime Transp Educ Arts Recreat 1 -0.16486235 -0.60032838 -0.56474060 0.14072655 -0.81406786 -0.78311099 -0.42005534 -0.6398548 2 0.10602871 0.59491313 1.61445641 0.34866908 1.05977118 1.56093660 1.11519979 0.6129160 3 1.20955819 1.87916325 3.59931327 1.17500954 1.51299864 2.05062718 3.53335356 1.2242320 4 1.96059205 2.69866676 0.61306800 -0.01036463 0.07517638 -0.29377353 0.57458996 0.9567226 5 0.39815829 0.04658078 -0.09904128 1.38729613 0.06152555 -0.01450844 -0.00266058 1.3003151 6 -2.03085149 -0.24844846 -0.24001806 -1.13333946 0.15583199 -0.41442024 -0.36980028 -0.2672423 7 0.82169446 2.10100166 6.64431493 4.30331030 3.04229678 0.52717013 11.54477376 2.1451515 8 0.18395563 -0.28814826 -0.38975971 -1.00750358 -0.49263770 0.24574433 -0.43532753 -0.5800363 9 -0.50612671 0.21616789 -0.41815974 0.05839790 0.25387486 -0.76108980 -0.23589475 0.7588939 10 -0.06155396 -0.26789659 0.18378375 0.25723124 0.77296739 0.54125560 0.06429721 -0.2522054 > rownames(Q)[ps==4] [1] "Anaheim-Santa-Ana,CA" "Bridgeport-Milford,CT" [3] "Danbury,CT" "Honolulu,HI" [5] "Monmouth-Ocean,NJ" "Norwalk,CT" [7] "Oakland,CA" "Oxnard-Ventura,CA" [9] "Salinas-Seaside-Monterey,CA" "San-Diego,CA" [11] "San-Jose,CA" "Santa-Barbara-Santa-Maria-Lompoc,CA" [13] "Santa-Cruz,CA" "Santa-Rosa-Petaluma,CA" [15] "Seattle,WA" "Stamford,CT" [17] "Vallejo-Fairfield-Napa,CA" > rownames(D) <- paste0("S", 1:nrow(D)) grp <- cutree(r, k = 4) table(grp) rownames(D)[grp == 1] * [[notes:clu:cluster|Clustering and density of "space"]]; {{pub:pdf:cladag12VB.pdf|CLADAG 12}} * {{notes:clu:zip:uscounties.zip|US counties network}}; [[notes:clu:counties:neigh|Neighbors]]; * [[notes:clu:counties:clu|Counties/clustering]] \\ \\ [[ru:7iss#labs|Back to 7ISS Labs]]