Table of Contents

Clustering notes

Iris

> help(iris)
> names(iris)
[1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"     
> D <- iris[,1:4]
> head(D)
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1          5.1         3.5          1.4         0.2
2          4.9         3.0          1.4         0.2
3          4.7         3.2          1.3         0.2
4          4.6         3.1          1.5         0.2
5          5.0         3.6          1.4         0.2
6          5.4         3.9          1.7         0.4
> S <- apply(D,2,z)
> head(S)
     Sepal.Length Sepal.Width Petal.Length Petal.Width
[1,]   -0.8976739  1.01560199    -1.335752   -1.311052
[2,]   -1.1392005 -0.13153881    -1.335752   -1.311052
[3,]   -1.3807271  0.32731751    -1.392399   -1.311052
[4,]   -1.5014904  0.09788935    -1.279104   -1.311052
[5,]   -1.0184372  1.24503015    -1.335752   -1.311052
[6,]   -0.5353840  1.93331463    -1.165809   -1.048667
> n <- function(x) (x - min(x))/(max(x) - min(x))
> N <- apply(D,2,n)
> head(N)
     Sepal.Length Sepal.Width Petal.Length Petal.Width
[1,]   0.22222222   0.6250000   0.06779661  0.04166667
[2,]   0.16666667   0.4166667   0.06779661  0.04166667
[3,]   0.11111111   0.5000000   0.05084746  0.04166667
[4,]   0.08333333   0.4583333   0.08474576  0.04166667
[5,]   0.19444444   0.6666667   0.06779661  0.04166667
[6,]   0.30555556   0.7916667   0.11864407  0.12500000
> t <- hclust(dist(S))
> plot(t,hang=-1,cex=0.1,lwd=0.2,main="Iris")
> help(hclust)
> t <- hclust(dist(S),method="ward.D2")
> plot(t,hang=-1,cex=0.1,lwd=0.2,main="Iris")
> rect.hclust(t,k=5,border="red")
> pdf("iris.pdf",width=11.7,height=8.3,paper="a4r")
> plot(t,hang=-1,cex=0.1,lwd=0.2,main="Iris")
> rect.hclust(t,k=5,border="red")
> dev.off()
 
> p <- cutree(t,k=5)
> iris$Species[p==1]
 [1] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
[12] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
[23] setosa setosa setosa setosa setosa setosa setosa
Levels: setosa versicolor virginica
> C <- iris$Species[p==1]
> table(C)
C
    setosa versicolor  virginica 
        29          0          0 
> C <- iris$Species[p==2]
> table(C)
C
    setosa versicolor  virginica 
        20          0          0 
> C <- iris$Species[p==3]
> table(C)
C
    setosa versicolor  virginica 
         1         27          2 
> C <- iris$Species[p==4]
> table(C)
C
    setosa versicolor  virginica 
         0         23         22 
> C <- iris$Species[p==5]
> table(C)
C
    setosa versicolor  virginica 
         0          0         26 
> for(i in 1:5){C <- iris$Species[p==i]; cat("C",i,table(C),"\n")}
C 1 29 0 0 
C 2 20 0 0 
C 3 1 27 2 
C 4 0 23 22 
C 5 0 0 26 
> 
 
> library(cluster)
> r <- agnes(dist(S),method="ward")
> plot(r,hang=-1,which.plots=2,main="iris",cex=0.2)
 
> library(factoextra)
> fviz_dend(r, cex = 0.2)
> fviz_dend(r, cex = 0.2, horiz = TRUE)
> fviz_dend(r, k=4, cex=0.2, color_labels_by_k=TRUE, rect=TRUE, 
+ k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
+ rect_border = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"))
> fviz_dend(r, cex = 0.2, type = "circular")
Error in match.arg(type) : 'arg' should be one of “rectangle”, “triangle”

Places

http://www.stat.nthu.edu.tw/~swcheng/Teaching/stat5191/assignment/assignment2.html

> setwd("C:/Users/batagelj/Documents/papers/2017/Moscow/sources")
> help(read.table)
> T <- read.table("places.txt",header=TRUE,row.names=1)
> names(T)
 [1] "Climate"     "HousingCost" "HlthCare"    "Crime"       "Transp"      "Educ"        "Arts"       
 [8] "Recreat"     "Econ"        "CaseNum"     "Long"        "Lat"         "Pop"         "StNum"      
> head(T)
> dim(T)
[1] 329  14
> D <- T[,1:8]
> dim(D)
[1] 329   8
> head(D)
                           Climate HousingCost HlthCare Crime Transp Educ Arts Recreat
Abilene,TX                     521        6200      237   923   4031 2757  996    1405
Akron,OH                       575        8138     1656   886   4883 2438 5564    2632
Albany,GA                      468        7339      618   970   2531 2560  237     859
Albany-Schenectady-Troy,NY     476        7908     1431   610   6883 3399 4655    1617
Albuquerque,NM                 659        8393     1853  1483   6558 3026 4496    2612
Alexandria,LA                  520        5819      640   727   2444 2972  334    1018
> pairs(D)
> z <- function(x){(x-mean(x))/sd(x)}
> Q <- apply(D,2,z)
> head(Q)
> R <- scale(D)
> head(R)
> d <- dist(Q)
> t <-hclust(d)
> plot(t,hang=-1,cex=0.1,lwd=0.2,main="Places")
> pdf("places.pdf",width=11.7,height=8.3,paper="a4r")
> plot(t,hang=-1,cex=0.1,lwd=0.2,main="Places")
> dev.off()
 
> s <- kmeans(Q,centers=10,iter.max=30)
> names(s)
[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      
> ps <- s$cluster
> table(ps)
ps
 1  2  3  4  5  6  7  8  9 10 
76 24  7 17 30 18  1 68 34 54 
> s$centers
       Climate HousingCost    HlthCare       Crime      Transp        Educ        Arts    Recreat
1  -0.16486235 -0.60032838 -0.56474060  0.14072655 -0.81406786 -0.78311099 -0.42005534 -0.6398548
2   0.10602871  0.59491313  1.61445641  0.34866908  1.05977118  1.56093660  1.11519979  0.6129160
3   1.20955819  1.87916325  3.59931327  1.17500954  1.51299864  2.05062718  3.53335356  1.2242320
4   1.96059205  2.69866676  0.61306800 -0.01036463  0.07517638 -0.29377353  0.57458996  0.9567226
5   0.39815829  0.04658078 -0.09904128  1.38729613  0.06152555 -0.01450844 -0.00266058  1.3003151
6  -2.03085149 -0.24844846 -0.24001806 -1.13333946  0.15583199 -0.41442024 -0.36980028 -0.2672423
7   0.82169446  2.10100166  6.64431493  4.30331030  3.04229678  0.52717013 11.54477376  2.1451515
8   0.18395563 -0.28814826 -0.38975971 -1.00750358 -0.49263770  0.24574433 -0.43532753 -0.5800363
9  -0.50612671  0.21616789 -0.41815974  0.05839790  0.25387486 -0.76108980 -0.23589475  0.7588939
10 -0.06155396 -0.26789659  0.18378375  0.25723124  0.77296739  0.54125560  0.06429721 -0.2522054
> rownames(Q)[ps==4]
 [1] "Anaheim-Santa-Ana,CA"                "Bridgeport-Milford,CT"              
 [3] "Danbury,CT"                          "Honolulu,HI"                        
 [5] "Monmouth-Ocean,NJ"                   "Norwalk,CT"                         
 [7] "Oakland,CA"                          "Oxnard-Ventura,CA"                  
 [9] "Salinas-Seaside-Monterey,CA"         "San-Diego,CA"                       
[11] "San-Jose,CA"                         "Santa-Barbara-Santa-Maria-Lompoc,CA"
[13] "Santa-Cruz,CA"                       "Santa-Rosa-Petaluma,CA"             
[15] "Seattle,WA"                          "Stamford,CT"                        
[17] "Vallejo-Fairfield-Napa,CA"          
>
rownames(D) <- paste0("S", 1:nrow(D))
grp <- cutree(r, k = 4)
table(grp)
rownames(D)[grp == 1]



Back to 7ISS Labs

ru/7iss/labs/cl.txt · Last modified: 2017/06/22 09:01 by vlado
 
Except where otherwise noted, content on this wiki is licensed under the following license: CC Attribution-Noncommercial-Share Alike 3.0 Unported
Recent changes RSS feed Donate Powered by PHP Valid XHTML 1.0 Valid CSS Driven by DokuWiki