Explore / Cluster
- Hierarchical clustering
- k-means

Explore / Cluster

We will look into newBooksClean data set.

What is the effect of binding?

> wdir <- "D:/vlado/EDA/data"
> setwd(wdir)
> booksC <- "https://raw.githubusercontent.com/bavla/HSE/master/EDA/newBooksClean.csv"
> D <- read.csv2(url(booksC),stringsAsFactors=FALSE)
> dim(D)
> str(D)
> col <- rep('red',length(D$weig))
> j <- which(D$bind=="Hardcover")
> col[j] <- "blue"
> table(col)
> pairs(D[,c("wid","thi","hei","weig")],col=col)

Hierarchical clustering

> wdir <- "D:/vlado/EDA/data"
> setwd(wdir)
> booksC <- "https://raw.githubusercontent.com/bavla/HSE/master/EDA/newBooksClean.csv"
> D <- read.csv2(url(booksC),stringsAsFactors=FALSE)
> dim(D)
> str(D)
'data.frame':   966 obs. of  12 variables:
 $ bID   : int  1 2 3 4 5 6 7 8 9 10 ...
 $ Amazon: chr  "0521840856" "0521387078" "1446247414" "0195379470" ...
 $ bind  : chr  "Hardcover" "Paperback" "Paperback" "Paperback" ...
 $ npag  : int  402 857 304 264 720 207 344 744 248 272 ...
 $ year  : int  2004 1994 2013 2011 2010 2014 2005 2010 2017 2011 ...
 $ pub   : chr  "Cambridge University Press" "Cambridge University Press" "SAGE Publications Ltd" "Oxford University Press" ...
 $ wid   : num  6 6 7.3 9.2 9.8 6.1 7 8.5 6.7 6.7 ...
 $ thi   : num  1.1 1.5 0.7 0.7 1.7 0.5 0.7 1.2 0.6 0.6 ...
 $ hei   : num  9 9 9.1 6.1 7.6 9.2 10 10 9.5 9.5 ...
 $ weig  : num  1.4 2.6 1.4 0.8 4.1 ...
 $ pric  : num  121.5 52.4 37.4 20.8 61.5 ...
 $ titl  : chr  "Amazon.com: Generalized Blockmodeling (Structural Analysis in the Social Sciences) (9780521840859): Patrick Dor" ...
> ok <- !is.na(D$wid)&!is.na(D$thi)&!is.na(D$hei)
> A <- D[ok,c("wid","thi","hei")]
> dim(A)
[1] 927   3
> z <- function(x){(x-mean(x))/sd(x)}
> S <- cbind(z(A$wid),z(A$thi),z(A$hei))   # S <- scale(A)
> sum(S[,1])
[1] -8.533452e-14
> sd(S[,1])
[1] 1
> t <-hclust(dist(S))
> plot(t,hang=-1,cex=0.4,main="Amazon books / Complete")
> S <- cbind(z(A$wid),z(A$hei))
> t <-hclust(dist(S))
> plot(t,hang=-1,cex=0.4,main="Amazon books / Complete")
> p <- cutree(t,k=12)
> table(p)
p
  1   2   3   4   5   6   7   8   9  10  11  12 
367 237  23 113 123   9  27  11  11   3   2   1 
> p <- cutree(t,k=13)
> table(p)
p
  1   2   3   4   5   6   7   8   9  10  11  12  13 
367 237  23 113 123   6  27  11  11   3   3   2   1 
> summary(A[p==1,])
      wid             thi              hei       
 Min.   :5.000   Min.   :0.2000   Min.   :8.000  
 1st Qu.:5.500   1st Qu.:0.6000   1st Qu.:8.450  
 Median :6.000   Median :0.8000   Median :9.000  
 Mean   :5.835   Mean   :0.8507   Mean   :8.797  
 3rd Qu.:6.000   3rd Qu.:1.0000   3rd Qu.:9.100  
 Max.   :6.300   Max.   :1.8000   Max.   :9.800  
> sd(A$wid[p==1])
> sd(A$thi[p==1])
> sd(A$hei[p==1])
> summary(A[p==2,])
> summary(A[p==4,])
> summary(A[p==5,])

Numbering clusters in the order of their sizes and coloring them.

> (s <- table(p))
p
  1   2   3   4   5   6   7   8   9  10  11  12  13 
367 237  23 113 123   6  27  11  11   3   3   2   1 
> (r <- rev(order(s)))
 [1]  1  2  5  4  7  3  9  8  6 11 10 12 13
> inv <- function(p){q <- p; for(i in 1:length(p)) q[p[i]] <- i; return(q) }
> (rr <- inv(r))
 [1]  1  2  6  4  3  9  5  8  7 11 10 12 13
> q <- rr[p]
> table(q)
q
  1   2   3   4   5   6   7   8   9  10  11  12  13 
367 237 123 113  27  23  11  11   6   3   3   2   1 
> q[q>5]<-5
> col <- c("red","blue","green","magenta","grey")
> plot(A$wid,A$hei,pch=16,col=col[q],main="Amazon books / Complete")

The result is not as expected - we used a wrong method.

> t <-hclust(dist(S),method="single")
> plot(t,hang=-1,cex=0.4,main="Amazon books / Single")
> p <- cutree(t,k=6)
> table(p)
p
  1   2   3   4   5   6 
892  20  11   2   1   1 
> p <- cutree(t,k=8)
> table(p)
p
  1   2   3   4   5   6   7   8 
842  49  20  11   2   1   1   1 
> p <- cutree(t,k=9)
> table(p)
p
  1   2   3   4   5   6   7   8   9 
841  49  20  11   1   2   1   1   1 
> p <- cutree(t,k=8)
> p[p>5] <- 5
> plot(A$wid,A$hei,pch=16,col=col[p],main="Amazon books/Single")
> w1 <- A$wid[p==1]
> h1 <- A$hei[p==1]
> lin <- lm(h1 ~ w1)
> abline(lin,lw=2)
> lin$coef
(Intercept)          w1 
  4.3591487   0.7269049 
> w2 <- A$wid[p==2]
> h2 <- A$hei[p==2]
> lin2 <- lm(h2 ~ w2)
> abline(lin2,lw=2)
> lin2$coef
(Intercept)          w2 
 -1.9302901   0.9205564

Plotting on A4 in PDF.

> rownames(S) <- D$Amazon[ok]
> t <-hclust(dist(S),method="single")
> plot(t,hang=-1,cex=0.1,lwd=0.2,main="Amazon books / Single")
> pdf("amazonSingle.pdf",width=11.7,height=8.3,paper="a4r")
> plot(t,hang=-1,cex=0.1,lwd=0.2,main="Amazon books / Single")
> dev.off()

Golden ratio.

> (fi <- (1+sqrt(5))/2)
[1] 1.618034
> f <- A$hei/A$wid
> boxplot(f)
> mean(f[(f<5)&(f>0.2)])
[1] 1.388592
> median(f)
[1] 1.464286
> hist(f,breaks="Scott",prob=TRUE,ylab="",main="Proportion h:w")
> lines(density(f),col="blue",lwd=2)
> plot(density(f),col="blue",lwd=2)
> plot(density(f),xlim=c(0,5),col="blue",lwd=2)
> lines(c(fi,fi),c(0,4),col="red",lw=2)

Clustering and density of "space"; CLADAG 12

k-means

> A <- D[ok,c("wid","hei")]
> rownames(A) <- D$Amazon[ok]
> S <- scale(A)
> rez <- kmeans(S,centers=6,iter.max=30)
> pC <- rez$cluster
> rez$centers
         wid        hei
1  1.5322506  0.8907959
2  4.2624810  7.7582636
3 -0.3040938  0.1880441
4  1.1637903 -2.6890728
5 -1.0198874 -0.3324663
6  0.4978760  0.3568429
> table(pC)
pC
  1   2   3   4   5   6 
 93   3 290  60 248 233 
> col <- c("red","orange","blue","green","magenta","grey")
> plot(A$wid,A$hei,pch=16,col=col[pC],main="Amazon books/k-means")
 
> summary(A[pC==1,])
      wid             hei       
 Min.   : 5.60   Min.   :0.100  
 1st Qu.: 6.80   1st Qu.:1.075  
 Median : 8.30   Median :4.400  
 Mean   : 8.15   Mean   :3.839  
 3rd Qu.: 9.20   3rd Qu.:6.000  
 Max.   :10.10   Max.   :7.100  
> summary(A[pC==2,])
      wid              hei       
 Min.   :0.5000   Min.   : 8.00  
 1st Qu.:0.8000   1st Qu.: 9.40  
 Median :1.0000   Median :10.20  
 Mean   :0.9182   Mean   :10.03  
 3rd Qu.:1.0000   3rd Qu.:10.65  
 Max.   :1.5000   Max.   :11.50  
> summary(A[pC==3,])
      wid              hei       
 Min.   : 6.200   Min.   : 7.60  
 1st Qu.: 8.400   1st Qu.:10.20  
 Median : 8.500   Median :10.80  
 Mean   : 8.794   Mean   :11.13  
 3rd Qu.: 8.700   3rd Qu.:11.00  
 Max.   :15.500   Max.   :23.40  
>

EDA