====== Clustering notes ======
===== Iris =====
> help(iris)
> names(iris)
[1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
> D <- iris[,1:4]
> head(D)
Sepal.Length Sepal.Width Petal.Length Petal.Width
1 5.1 3.5 1.4 0.2
2 4.9 3.0 1.4 0.2
3 4.7 3.2 1.3 0.2
4 4.6 3.1 1.5 0.2
5 5.0 3.6 1.4 0.2
6 5.4 3.9 1.7 0.4
> z <- function(x){(x-mean(x))/sd(x)}
> S <- apply(D,2,z)
> head(S)
Sepal.Length Sepal.Width Petal.Length Petal.Width
[1,] -0.8976739 1.01560199 -1.335752 -1.311052
[2,] -1.1392005 -0.13153881 -1.335752 -1.311052
[3,] -1.3807271 0.32731751 -1.392399 -1.311052
[4,] -1.5014904 0.09788935 -1.279104 -1.311052
[5,] -1.0184372 1.24503015 -1.335752 -1.311052
[6,] -0.5353840 1.93331463 -1.165809 -1.048667
> n <- function(x) (x - min(x))/(max(x) - min(x))
> N <- apply(D,2,n)
> head(N)
Sepal.Length Sepal.Width Petal.Length Petal.Width
[1,] 0.22222222 0.6250000 0.06779661 0.04166667
[2,] 0.16666667 0.4166667 0.06779661 0.04166667
[3,] 0.11111111 0.5000000 0.05084746 0.04166667
[4,] 0.08333333 0.4583333 0.08474576 0.04166667
[5,] 0.19444444 0.6666667 0.06779661 0.04166667
[6,] 0.30555556 0.7916667 0.11864407 0.12500000
> t <- hclust(dist(S))
> plot(t,hang=-1,cex=0.1,lwd=0.2,main="Iris")
> help(hclust)
> t <- hclust(dist(S),method="ward.D2")
> plot(t,hang=-1,cex=0.1,lwd=0.2,main="Iris")
> rect.hclust(t,k=5,border="red")
> pdf("iris.pdf",width=11.7,height=8.3,paper="a4r")
> plot(t,hang=-1,cex=0.1,lwd=0.2,main="Iris")
> rect.hclust(t,k=5,border="red")
> dev.off()
> p <- cutree(t,k=5)
> iris$Species[p==1]
[1] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
[12] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
[23] setosa setosa setosa setosa setosa setosa setosa
Levels: setosa versicolor virginica
> C <- iris$Species[p==1]
> table(C)
C
setosa versicolor virginica
29 0 0
> C <- iris$Species[p==2]
> table(C)
C
setosa versicolor virginica
20 0 0
> C <- iris$Species[p==3]
> table(C)
C
setosa versicolor virginica
1 27 2
> C <- iris$Species[p==4]
> table(C)
C
setosa versicolor virginica
0 23 22
> C <- iris$Species[p==5]
> table(C)
C
setosa versicolor virginica
0 0 26
> for(i in 1:5){C <- iris$Species[p==i]; cat("C",i,table(C),"\n")}
C 1 29 0 0
C 2 20 0 0
C 3 1 27 2
C 4 0 23 22
C 5 0 0 26
>
> library(cluster)
> r <- agnes(dist(S),method="ward")
> plot(r,hang=-1,which.plots=2,main="iris",cex=0.2)
> library(factoextra)
> fviz_dend(r, cex = 0.2)
> fviz_dend(r, cex = 0.2, horiz = TRUE)
> fviz_dend(r, k=4, cex=0.2, color_labels_by_k=TRUE, rect=TRUE,
+ k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
+ rect_border = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"))
> fviz_dend(r, cex = 0.2, type = "circular")
Error in match.arg(type) : 'arg' should be one of “rectangle”, “triangle”
===== Places =====
http://www.stat.nthu.edu.tw/~swcheng/Teaching/stat5191/assignment/assignment2.html
> setwd("C:/Users/batagelj/Documents/papers/2017/Moscow/sources")
> help(read.table)
> T <- read.table("places.txt",header=TRUE,row.names=1)
> names(T)
[1] "Climate" "HousingCost" "HlthCare" "Crime" "Transp" "Educ" "Arts"
[8] "Recreat" "Econ" "CaseNum" "Long" "Lat" "Pop" "StNum"
> head(T)
> dim(T)
[1] 329 14
> D <- T[,1:8]
> dim(D)
[1] 329 8
> head(D)
Climate HousingCost HlthCare Crime Transp Educ Arts Recreat
Abilene,TX 521 6200 237 923 4031 2757 996 1405
Akron,OH 575 8138 1656 886 4883 2438 5564 2632
Albany,GA 468 7339 618 970 2531 2560 237 859
Albany-Schenectady-Troy,NY 476 7908 1431 610 6883 3399 4655 1617
Albuquerque,NM 659 8393 1853 1483 6558 3026 4496 2612
Alexandria,LA 520 5819 640 727 2444 2972 334 1018
> pairs(D)
> z <- function(x){(x-mean(x))/sd(x)}
> Q <- apply(D,2,z)
> head(Q)
> R <- scale(D)
> head(R)
> d <- dist(Q)
> t <-hclust(d)
> plot(t,hang=-1,cex=0.1,lwd=0.2,main="Places")
> pdf("places.pdf",width=11.7,height=8.3,paper="a4r")
> plot(t,hang=-1,cex=0.1,lwd=0.2,main="Places")
> dev.off()
> s <- kmeans(Q,centers=10,iter.max=30)
> names(s)
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault"
> ps <- s$cluster
> table(ps)
ps
1 2 3 4 5 6 7 8 9 10
76 24 7 17 30 18 1 68 34 54
> s$centers
Climate HousingCost HlthCare Crime Transp Educ Arts Recreat
1 -0.16486235 -0.60032838 -0.56474060 0.14072655 -0.81406786 -0.78311099 -0.42005534 -0.6398548
2 0.10602871 0.59491313 1.61445641 0.34866908 1.05977118 1.56093660 1.11519979 0.6129160
3 1.20955819 1.87916325 3.59931327 1.17500954 1.51299864 2.05062718 3.53335356 1.2242320
4 1.96059205 2.69866676 0.61306800 -0.01036463 0.07517638 -0.29377353 0.57458996 0.9567226
5 0.39815829 0.04658078 -0.09904128 1.38729613 0.06152555 -0.01450844 -0.00266058 1.3003151
6 -2.03085149 -0.24844846 -0.24001806 -1.13333946 0.15583199 -0.41442024 -0.36980028 -0.2672423
7 0.82169446 2.10100166 6.64431493 4.30331030 3.04229678 0.52717013 11.54477376 2.1451515
8 0.18395563 -0.28814826 -0.38975971 -1.00750358 -0.49263770 0.24574433 -0.43532753 -0.5800363
9 -0.50612671 0.21616789 -0.41815974 0.05839790 0.25387486 -0.76108980 -0.23589475 0.7588939
10 -0.06155396 -0.26789659 0.18378375 0.25723124 0.77296739 0.54125560 0.06429721 -0.2522054
> rownames(Q)[ps==4]
[1] "Anaheim-Santa-Ana,CA" "Bridgeport-Milford,CT"
[3] "Danbury,CT" "Honolulu,HI"
[5] "Monmouth-Ocean,NJ" "Norwalk,CT"
[7] "Oakland,CA" "Oxnard-Ventura,CA"
[9] "Salinas-Seaside-Monterey,CA" "San-Diego,CA"
[11] "San-Jose,CA" "Santa-Barbara-Santa-Maria-Lompoc,CA"
[13] "Santa-Cruz,CA" "Santa-Rosa-Petaluma,CA"
[15] "Seattle,WA" "Stamford,CT"
[17] "Vallejo-Fairfield-Napa,CA"
>
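# NOTE: r is still the agnes tree of the 150 iris flowers (Iris section), while D
# now holds the 329 places, so grp does not line up with the rows of D.
# Rebuild the tree from the places data first, e.g. r <- agnes(dist(Q),method="ward").
rownames(D) <- paste0("S", 1:nrow(D))
grp <- cutree(r, k = 4)
table(grp)
rownames(D)[grp == 1]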
* [[notes:clu:cluster|Clustering and density of "space"]]; {{pub:pdf:cladag12VB.pdf|CLADAG 12}}
* {{notes:clu:zip:uscounties.zip|US counties network}}; [[notes:clu:counties:neigh|Neighbors]];
* [[notes:clu:counties:clu|Counties/clustering]]
\\ \\
[[ru:7iss#labs|Back to 7ISS Labs]]