We will look into newBooksClean data set.
> wdir <- "C:/Users/batagelj/Documents/papers/2017/Moscow/EDA/test" > setwd(wdir) > D <- read.csv2("newBooksClean.csv",stringsAsFactors=FALSE) > dim(D) [1] 966 12 > str(D) 'data.frame': 966 obs. of 12 variables: $ bID : int 1 2 3 4 5 6 7 8 9 10 ... $ Amazon: chr "0521840856" "0521387078" "1446247414" "0195379470" ... $ bind : chr "Hardcover" "Paperback" "Paperback" "Paperback" ... $ npag : int 402 857 304 264 720 207 344 744 248 272 ... $ year : int 2004 1994 2013 2011 2010 2014 2005 2010 2017 2011 ... $ pub : chr "Cambridge University Press" "Cambridge University Press" "SAGE Publications Ltd" "Oxford University Press" ... $ wid : num 6 6 7.3 9.2 9.8 6.1 7 8.5 6.7 6.7 ... $ thi : num 1.1 1.5 0.7 0.7 1.7 0.5 0.7 1.2 0.6 0.6 ... $ hei : num 9 9 9.1 6.1 7.6 9.2 10 10 9.5 9.5 ... $ weig : num 1.4 2.6 1.4 0.8 4.1 ... $ pric : num 121.5 52.4 37.4 20.8 61.5 ... $ titl : chr "Amazon.com: Generalized Blockmodeling (Structural Analysis in the Social Sciences) (9780521840859): Patrick Dor" ... > z <- function(x){(x-mean(x))/sd(x)} > ok <- !is.na(D$wid)&!is.na(D$thi)&!is.na(D$hei) > A <- D[ok,c("wid","thi","hei")] > dim(A) [1] 927 3 > S <- cbind(z(A$wid),z(A$thi),z(A$hei)) > sum(S[,1]) [1] -8.533452e-14 > sd(S[,1]) [1] 1 > t <-hclust(dist(S)) > plot(t,hang=-1,cex=0.4,main="Amazon books / Complete") > S <- cbind(z(A$wid),z(A$hei)) > t <-hclust(dist(S)) > plot(t,hang=-1,cex=0.4,main="Amazon books / Complete") > p <- cutree(t,k=12) > table(p) p 1 2 3 4 5 6 7 8 9 10 11 12 367 237 23 113 123 9 27 11 11 3 2 1 > p <- cutree(t,k=13) > table(p) p 1 2 3 4 5 6 7 8 9 10 11 12 13 367 237 23 113 123 6 27 11 11 3 3 2 1 > summary(A[p==1,]) wid thi hei Min. :5.000 Min. :0.2000 Min. :8.000 1st Qu.:5.500 1st Qu.:0.6000 1st Qu.:8.450 Median :6.000 Median :0.8000 Median :9.000 Mean :5.835 Mean :0.8507 Mean :8.797 3rd Qu.:6.000 3rd Qu.:1.0000 3rd Qu.:9.100 Max. :6.300 Max. :1.8000 Max. :9.800 > sd(A$wid[p==1]) > sd(A$thi[p==1]) > sd(A$hei[p==1]) > summary(A[p==2,]) > summary(A[p==4,]) > summary(A[p==5,])
Numbering clusters in the order of their sizes and coloring them.
> (s <- table(p)) p 1 2 3 4 5 6 7 8 9 10 11 12 13 367 237 23 113 123 6 27 11 11 3 3 2 1 > (r <- rev(order(s))) [1] 1 2 5 4 7 3 9 8 6 11 10 12 13 > inv <- function(p){q <- p; for(i in 1:length(p)) q[p[i]] <- i; return(q) } > (rr <- inv(r)) [1] 1 2 6 4 3 9 5 8 7 11 10 12 13 > q <- rr[p] > table(q) q 1 2 3 4 5 6 7 8 9 10 11 12 13 367 237 123 113 27 23 11 11 6 3 3 2 1 > q[q>5]<-5 > col <- c("red","blue","green","magenta","grey") > plot(A$wid,A$hei,pch=16,col=col[q],main="Amazon books / Complete")
The result is not as expected - we used a wrong method.
> t <-hclust(dist(S),method="single") > plot(t,hang=-1,cex=0.4,main="Amazon books / Single") > p <- cutree(t,k=6) > table(p) p 1 2 3 4 5 6 892 20 11 2 1 1 > p <- cutree(t,k=8) > table(p) p 1 2 3 4 5 6 7 8 842 49 20 11 2 1 1 1 > p <- cutree(t,k=9) > table(p) p 1 2 3 4 5 6 7 8 9 841 49 20 11 1 2 1 1 1 > p <- cutree(t,k=8) > p[p>5] <- 5 > plot(A$wid,A$hei,pch=16,col=col[p],main="Amazon books/Single") > w1 <- A$wid[p==1] > h1 <- A$hei[p==1] > lin <- lm(h1 ~ w1) > abline(lin,lw=2) > lin$coef (Intercept) w1 4.3591487 0.7269049 > w2 <- A$wid[p==2] > h2 <- A$hei[p==2] > lin2 <- lm(h2 ~ w2) > abline(lin2,lw=2) > lin2$coef (Intercept) w2 -1.9302901 0.9205564
Plotting on A4 in PDF.
> rownames(S) <- D$Amazon[ok] > t <-hclust(dist(S),method="single") > plot(t,hang=-1,cex=0.1,lwd=0.2,main="Amazon books / Single") > pdf("amazonSingle.pdf",width=11.7,height=8.3,paper="a4r") > plot(t,hang=-1,cex=0.1,lwd=0.2,main="Amazon books / Single") > dev.off()
Golden ratio.
> (fi <- (1+sqrt(5))/2) [1] 1.618034 > f <- A$hei/A$wid > boxplot(f) > mean(f[(f<5)&(f>0.2)]) [1] 1.388592 > median(f) [1] 1.464286 > hist(f,breaks="Scott",prob=TRUE,ylab="",main="Proportion h:w") > lines(density(f),col="blue",lwd=2) > plot(density(f),col="blue",lwd=2) > plot(density(f),xlim=c(0,5),col="blue",lwd=2) > lines(c(fi,fi),c(0,4),col="red",lw=2)
> A <- D[ok,c("wid","hei")] > rownames(A) <- D$Amazon[ok] > S <- scale(A) > rez <- kmeans(S,centers=6,iter.max=30) > pC <- rez$cluster > rez$centers wid hei 1 1.5322506 0.8907959 2 4.2624810 7.7582636 3 -0.3040938 0.1880441 4 1.1637903 -2.6890728 5 -1.0198874 -0.3324663 6 0.4978760 0.3568429 > table(pC) pC 1 2 3 4 5 6 93 3 290 60 248 233 > col <- c("red","orange","blue","green","magenta","grey") > plot(A$wid,A$hei,pch=16,col=col[pC],main="Amazon books/k-means")