====== Labs ====== ===== Data ===== [[https://www.gesis.org/issp/home/|International Social Survey Programme]] (ISSP) {{ru:hse:mva:dat:za6770_bq.pdf|Questionnaire}} 2015 ISSP module on work orientation IV {{ru:hse:mva:dat:za6770_cdb.pdf|Variable report}} {{ru:hse:mva:dat:za.zip|ZA6770 data}} in SPSS file ''ZA.sav'' > library(foreign) > setwd("C:/Users/batagelj/Documents/papers/2018/moskva/nusa") > T <- read.spss("ZA.sav", + use.value.labels = FALSE, + to.data.frame = TRUE, + use.missings = TRUE) > dim(T) [1] 51668 442 > names(T) [1] "studyno" "version" "doi" "c_sample" "country" "c_alphan" "v1" "v2" "v3" [10] "v4" "v5" "v6" "v7" "v8" "v9" "v10" "v11" "v12" [19] "v13" "v14" "v15" "V16" "v17" "v18" "v19" "v20" "v21" [28] "v22" "v23" "v24" "v25" "v26" "v27" "v28" "v29" "v30" [37] "v31" "v32" "v33" "v34" "v35" "v36" "v37" "v38" "v39" [46] "v40" "v41" "v42" "v43" "v44" "v45" "v46" "v47" "v48" [55] "v49" "v50" "v51" "v52" "v53" "v54" "v55" "v56" "v57" [64] "v58" "v59" "v60" "v61" "v62" "v63" "v64" "v65" "v66" [73] "v67" "v68" "v69" "v70" "v71" "v72" "v73" "v74" "v75" [82] "v76" "v77" "v78" "v79" "v80" "v81" "v82" "v83" "v84" [91] "v85" "v86" "v87" "v88" "v89" "v90" "v91" "v92" "v93" [100] "v94" "v95" "v96" "v97" "SEX" "BIRTH" "AGE" "DK_AGE" "EDUCYRS" [109] "AT_DEGR" "AU_DEGR" "BE_DEGR" "CH_DEGR" "CL_DEGR" "CN_DEGR" "CZ_DEGR" "DE_DEGR" "DK_DEGR" [118] "EE_DEGR" "ES_DEGR" "FI_DEGR" "FR_DEGR" "GB_DEGR" "GE_DEGR" "HR_DEGR" "HU_DEGR" "IL_DEGR" ... > attributes(T)$variable.labels[29:35] v23 "Q12b Apply to R's job: my income is high" v24 "Q12c Apply to R's job: opportunities for advancement are high" v25 "Q12d Apply to R's job: my job is interesting" v26 "Q12e Apply to R's job: can work independently" v27 "Q12f Apply to R's job: can help other people" v28 "Q12g Apply to R's job: job is useful to society" v29 "Q12h Apply to R's job: personal contact with other people" > T[1:15,29:35] v23 v24 v25 v26 v27 v28 v29 1 4 1 1 1 1 1 1 2 3 2 2 2 2 2 3 3 3 2 2 3 3 3 3 4 4 4 4 5 3 4 2 5 1 1 1 1 1 1 1 6 1 1 3 3 3 3 2 7 1 1 1 1 1 1 1 8 NA NA NA NA NA NA NA 9 2 3 1 1 2 1 2 10 NA NA NA NA NA NA NA 11 2 2 2 2 3 2 2 12 1 1 1 1 1 1 1 13 NA NA NA NA NA NA NA 14 NA NA NA NA NA NA NA 15 NA NA NA NA NA NA NA > D <- T[,c(6,29:35)] > dim(D) [1] 51668 8 > head(D) c_alphan v23 v24 v25 v26 v27 v28 v29 1 AT 4 1 1 1 1 1 1 2 AT 3 2 2 2 2 2 3 3 AT 3 2 2 3 3 3 3 4 AT 4 4 4 5 3 4 2 5 AT 1 1 1 1 1 1 1 6 AT 1 1 3 3 3 3 2 > D$c_alphan <- as.character(D$c_alphan) > head(D$c_alphan) [1] "AT " "AT " [3] "AT " "AT " [5] "AT " "AT " > D$c_alphan <- trimws(as.character(D$c_alphan)) > DC <- D[complete.cases(D),] > Dru <- DC[DC$c_alphan=="RU",] > dim(Dru) [1] 837 8 > varLabs <- attributes(T)$variable.labels[c(6,29:35)] > varLabs c_alphan "Country Prefix ISO 3166" v23 "Q12b Apply to R's job: my income is high" v24 "Q12c Apply to R's job: opportunities for advancement are high" v25 "Q12d Apply to R's job: my job is interesting" v26 "Q12e Apply to R's job: can work independently" v27 "Q12f Apply to R's job: can help other people" v28 "Q12g Apply to R's job: job is useful to society" v29 "Q12h Apply to R's job: personal contact with other people" > > table(D$c_alphan) AT AU BE CH CL CN CZ DE DK EE ES 1001 1211 2141 1235 1433 1795 1435 1687 1138 1207 1834 FI FR GB-GBN GE HR HU IL IN IS JP LT 1203 1224 1793 1487 1000 1003 1248 1336 1126 1573 1127 LV MX NO NZ PH PL RU SE SI SK SR 1002 1141 1550 901 1200 2112 1596 1162 1023 1150 1139 TW US VE ZA 2031 1477 1007 2940 DC - complete data on selected variables\\ Dru - Russian part of DC **The scales in selected variables are "reversed" !!!** - 1 is positive and 5 is negative ===== Hierarchical clustering ===== > library(corrgram) > library(ppcor) > library(blockmodeling) > library(psych) > library(e1071) > Ru <- Dru[,2:8] > ZRu <- scale(Ru) > d <- dist(ZRu, method="euc") > d2 <- d^2 > single <- hclust(d = d2, method= "single") > plot(single, hang = -1, main = "single") > ward <- hclust(d = d2, method = "ward.D") > plot(ward, hang = -1, main = "ward") > wardK4 <- cutree(ward, k = 4) > table(wardK4) wardK4 1 2 3 4 293 181 175 188 > mRu <- matrix(0,nrow=4,ncol=7) > for(i in 1:4) mRu[i,] <- colMeans(Ru[wardK4==i,]) > rownames(mRu) <- paste("C",1:4,sep="") > colnames(mRu) <- colnames(Ru) > mRu <- rbind(mRu,colMeans(Ru)) > rownames(mRu)[5] <- "all" > t(mRu) C1 C2 C3 C4 all v23 2.143345 3.519337 3.742857 1.904255 2.721625 v24 2.634812 3.729282 3.857143 1.925532 2.967742 v25 2.061433 3.132597 2.891429 1.255319 2.285544 v26 2.535836 3.563536 2.805714 1.675532 2.621266 v27 2.556314 3.491713 2.411429 1.345745 2.456392 v28 2.283276 2.872928 2.068571 1.218085 2.126643 v29 2.218430 3.121547 1.634286 1.319149 2.089606 > zRu <- matrix(0,nrow=4,ncol=7) > for(i in 1:4) zRu[i,] <- colMeans(ZRu[wardK4==i,]) > rownames(zRu) <- paste("C",1:4,sep="") > colnames(zRu) <- colnames(ZRu) > zRu <- rbind(zRu,colMeans(ZRu)) > rownames(zRu)[5] <- "all" > t(zRu) C1 C2 C3 C4 all v23 -0.51111827 0.7050653 0.90262564 -0.7224396 1.183177e-16 v24 -0.29022432 0.6638561 0.77531628 -0.9085243 -3.260534e-17 v25 -0.23043248 0.8709491 0.62297744 -1.0592879 2.745713e-17 v26 -0.07540963 0.8317451 0.16281287 -0.8348037 1.524070e-16 v27 0.09414952 0.9755093 -0.04236573 -1.0464840 2.013523e-16 v28 0.17838210 0.8499063 -0.06613449 -1.0347099 2.352094e-16 v29 0.13490989 1.0806896 -0.47682909 -0.8068528 -6.061791e-17 ==== Dendrograms in PDF ==== > pdf("ru-single.pdf",width=11.7,height=8.3,paper="a4r") > plot(single,hang=-1,cex=0.1,lwd=0.2,main="Russians/single linkage") > dev.off() > pdf("ru-ward.pdf",width=11.7,height=8.3,paper="a4r") > plot(ward,hang=-1,cex=0.1,lwd=0.2,main="Russians/Ward") > dev.off() {{ru:hse:mva:pics:ru-single.pdf}}; {{ru:hse:mva:pics:ru-ward.pdf}} ===== k-means ===== The k-means solutions are local optima and can be different in different runs. > ZDC <- scale(DC[,2:8]) > head(ZDC) v23 v24 v25 v26 v27 v28 v29 1 0.7533499 -1.9441852 -1.1642108 -1.0781521 -1.1096588 -1.10943828 -0.8978940 2 -0.1536723 -1.0648646 -0.1619814 -0.1629287 -0.1135945 -0.05791786 1.4511185 3 -0.1536723 -1.0648646 -0.1619814 0.7522947 0.8824698 0.99360256 1.4511185 4 0.7533499 0.6937764 1.8424773 2.5827415 0.8824698 2.04512297 0.2766123 5 -1.9677167 -1.9441852 -1.1642108 -1.0781521 -1.1096588 -1.10943828 -0.8978940 6 -1.9677167 -1.9441852 0.8402480 0.7522947 0.8824698 0.99360256 0.2766123 > s <- kmeans(ZDC,centers=20,iter.max=50) > names(s) [1] "cluster" "centers" "totss" "withinss" "tot.withinss" [6] "betweenss" "size" "iter" "ifault" > table(s$cluster) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 430 593 528 876 1690 2901 965 3202 703 1427 2838 813 521 881 2564 2146 17 18 19 20 481 849 778 1339 > s$iter [1] 8 > da <- dist(s$centers,method="euc") > da2 <- da^2 > wa <- hclust(d=da2,method="ward.D") > plot(wa,hang=-1,main="ward/All") {{ru:hse:mva:pics:wardall.png}} > waK5 <- cutree(wa,k=5) > waK5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 1 1 2 3 4 5 4 5 2 5 5 4 2 3 5 5 2 2 4 4 > ca <- waK5[s$cluster] > head(s$cluster) 1 2 3 4 5 6 8 9 17 2 16 18 > head(ca) 8 9 17 2 16 18 5 2 2 1 5 2 > # names(ca) <- names(s$cluster) > ca <- as.vector(ca) > mA <- matrix(0,nrow=5,ncol=7) > for(i in 1:5) mA[i,] <- colMeans(DC[ca==i,2:8]) > rownames(mA) <- paste("C",1:5,sep="") > colnames(mA) <- colnames(DC[,2:8]) > mA <- rbind(mA,colMeans(DC[,2:8])) > rownames(mA)[6] <- "all" > t(mA) C1 C2 C3 C4 C5 all v23 4.050831 3.144062 3.311326 3.714951 2.896206 3.169425 v24 4.269795 3.264763 3.637450 3.760072 2.875116 3.211008 v25 4.033236 2.633355 2.152533 2.834378 1.690078 2.161621 v26 3.733138 2.456522 1.853728 3.298299 1.638414 2.178021 v27 3.998045 2.794938 3.639158 2.138048 1.660432 2.114043 v28 4.041056 2.585010 2.745589 1.928201 1.778552 2.055080 v29 2.719453 3.288449 1.653956 1.608415 1.458880 1.764486 > zA <- matrix(0,nrow=5,ncol=7) > for(i in 1:5) zA[i,] <- colMeans(ZDC[ca==i,]) > rownames(zA) <- paste("C",1:5,sep="") > colnames(zA) <- colnames(ZDC) > zA <- rbind(zA,colMeans(ZDC)) > rownames(zA)[6] <- "all" > t(zA) C1 C2 C3 C4 C5 all v23 0.7994547 -0.02300460 0.128707409 0.49480392 -0.2478154 1.287388e-16 v24 0.9310125 0.04726758 0.374978962 0.48280250 -0.2953571 -1.418857e-16 v25 1.8757870 0.47278552 -0.009108647 0.67425651 -0.4725941 1.265665e-16 v26 1.4232796 0.25489064 -0.296800350 1.02530490 -0.4938611 -2.086801e-16 v27 1.8765868 0.67821521 1.519111942 0.02391051 -0.4518257 6.349513e-18 v28 2.0882939 0.55723182 0.726084270 -0.13341646 -0.2907754 2.554601e-16 v29 1.1216138 1.78990370 -0.129819027 -0.18330628 -0.3589360 8.157680e-17 ==== Analysis of clusters by countries ==== > table(ca) ca 1 2 3 4 5 1023 3082 1757 5585 15078 > N <- DC[,1] > table(N) N AT AU BE CH CL CN CZ DE DK EE ES FI FR GB-GBN 625 702 1122 772 666 690 806 1016 703 671 835 593 640 904 GE HR HU IL IN IS JP LT LV MX NO NZ PH PL 407 524 542 762 586 824 845 502 575 598 1051 558 660 770 RU SE SI SK SR TW US VE ZA 837 708 476 581 582 1243 930 376 843 > table(N[ca==1]) AT AU BE CH CL CN CZ DE DK EE ES FI FR GB-GBN 14 14 43 9 27 53 28 21 12 21 44 16 20 17 GE HR HU IL IN IS JP LT LV MX NO NZ PH PL 19 32 38 22 31 21 113 24 10 18 17 6 10 108 RU SE SI SK SR TW US VE ZA 47 18 4 25 17 39 16 6 43 > cas <- table(ca) > for(i in 1:5) {cat(i,"\n"); print(100*table(N[ca==i])/cas[i])} 1 AT AU BE CH CL CN CZ DE DK 1.3685239 1.3685239 4.2033236 0.8797654 2.6392962 5.1808407 2.7370479 2.0527859 1.1730205 EE ES FI FR GB-GBN GE HR HU IL 2.0527859 4.3010753 1.5640274 1.9550342 1.6617791 1.8572825 3.1280547 3.7145650 2.1505376 IN IS JP LT LV MX NO NZ PH 3.0303030 2.0527859 11.0459433 2.3460411 0.9775171 1.7595308 1.6617791 0.5865103 0.9775171 PL RU SE SI SK SR TW US VE 10.5571848 4.5943304 1.7595308 0.3910068 2.4437928 1.6617791 3.8123167 1.5640274 0.5865103 ZA 4.2033236 ... > table(ca[N=="RU"]) 1 2 3 4 5 47 214 42 202 332 > table(ca[N=="DE"]) 1 2 3 4 5 21 74 176 70 675 > nt <- table(N) > 100*table(ca[N=="DE"])/nt["DE"] 1 2 3 4 5 2.066929 7.283465 17.322835 6.889764 66.437008 > 100*table(ca[N=="RU"])/nt["RU"] 1 2 3 4 5 5.615293 25.567503 5.017921 24.133811 39.665472 > 100*table(ca[N=="SI"])/nt["SI"] 1 2 3 4 5 0.8403361 9.0336134 5.8823529 13.4453782 70.7983193 > 100*table(ca[N=="US"])/nt["US"] 1 2 3 4 5 1.720430 5.913978 4.731183 17.204301 70.430108 > 100*table(ca[N=="PL"])/nt["PL"] 1 2 3 4 5 14.025974 16.103896 9.220779 40.259740 20.389610 > 100*table(ca)/length(ca) ca 1 2 3 4 5 3.856739 11.619227 6.623940 21.055608 56.844486 > ==== Repeat k-means and use the best solution ==== > sb <- s; pb <- s$tot.withinss > for(i in 1:100){ + s <- kmeans(ZDC,centers=20,iter.max=50) + if(s$tot.withinss < pb) {sb <- s; pb <- s$tot.withinss; cat(i,pb,"\n")} + } 1 63693.44 13 63550.52 15 63139.9 20 63027.43 > > da <- dist(sb$centers,method="euc") > da2 <- da^2 > wa <- hclust(d=da2,method="ward.D") > plot(wa,hang=-1,main="ward/All sb") {{ru:hse:mva:pics:wardallsb.png}} ==== Analysis of countries for sb clustering ==== > clCenters <- function(D,C){ + k <- length(table(C)) + mA <- matrix(0,nrow=k,ncol=ncol(D)) + for(i in 1:k) mA[i,] <- colMeans(D[C==i,]) + rownames(mA) <- paste("C",1:k,sep="") + colnames(mA) <- colnames(D) + mA <- rbind(mA,colMeans(D)) + rownames(mA)[k+1] <- "all" + return(t(mA)) + } > > N <- DC[,1]; nt <- table(N) > Nc <- names(nt); nc <- length(Nc) > waK4 <- cutree(wa,k=4) > ca <- as.vector(waK4[sb$cluster]) > clCenters(DC[,2:8],ca) C1 C2 C3 C4 all v23 3.247956 3.130331 2.785750 3.963289 3.169425 v24 3.193898 3.257872 2.918976 4.117474 3.211008 v25 2.246442 2.575528 1.437036 3.519457 2.161621 v26 2.289522 2.352730 1.404379 3.772394 2.178021 v27 2.183752 2.796333 1.306531 3.544053 2.114043 v28 2.165051 2.579115 1.335725 3.174009 2.055080 v29 1.740157 3.411718 1.131371 2.244126 1.764486 # C1 - average, positive # C2 - average, negative # C3 - positive (happy) # C4 - negative (unhappy) > table(ca) ca 1 2 3 4 13208 2509 8084 2724 > C <- matrix(0,nrow=nc+1,ncol=4) > rownames(C) <- c(Nc,"All"); colnames(C) <- paste("C",1:4,sep="") > for(c in Nc) C[c,] <- 100*table(ca[N==c])/nt[c] > C["All",] <- 100*table(ca)/length(ca) > C C1 C2 C3 C4 AT 33.76000 12.960000 48.160000 5.120000 AU 52.56410 5.555556 36.039886 5.840456 BE 49.46524 6.595365 35.472371 8.467023 CH 42.74611 3.756477 51.554404 1.943005 CL 58.85886 8.108108 19.819820 13.213213 CN 56.23188 14.637681 12.028986 17.101449 CZ 50.24814 9.801489 28.660050 11.290323 DE 46.75197 6.102362 40.354331 6.791339 DK 39.11807 3.982930 52.773826 4.125178 EE 55.29061 9.687034 22.503726 12.518629 ES 52.21557 5.868263 27.664671 14.251497 FI 47.89207 8.431703 37.436762 6.239460 FR 52.18750 8.281250 29.843750 9.687500 GB-GBN 52.21239 6.194690 36.061947 5.530973 GE 35.87224 33.169533 24.078624 6.879607 HR 53.43511 7.251908 19.083969 20.229008 HU 51.29151 15.129151 18.819188 14.760148 IL 42.65092 16.404199 34.645669 6.299213 IN 39.07850 31.058020 18.430034 11.433447 IS 51.09223 4.126214 38.956311 5.825243 JP 46.39053 10.650888 9.230769 33.727811 LT 62.35060 7.768924 17.330677 12.549801 LV 59.47826 6.260870 22.260870 12.000000 MX 55.01672 9.197324 24.749164 11.036789 NO 50.14272 6.279734 39.771646 3.805899 NZ 55.01792 5.555556 36.559140 2.867384 PH 59.24242 10.000000 27.727273 3.030303 PL 43.89610 11.688312 5.064935 39.350649 RU 43.96655 18.996416 20.549582 16.487455 SE 50.70621 8.898305 36.016949 4.378531 SI 43.48739 6.932773 45.588235 3.991597 SK 54.04475 10.154905 22.719449 13.080895 SR 52.74914 11.512027 30.584192 5.154639 TW 63.39501 5.792438 23.008850 7.803701 US 43.33333 5.268817 46.559140 4.838710 VE 32.18085 3.457447 60.106383 4.255319 ZA 50.53381 12.455516 24.792408 12.218268 All 49.79453 9.459001 30.476909 10.269557 > dc <- dist(C[1:nc,],"euc")^2 > wardC <- hclust(d = dc, method = "ward.D") > plot(wardC,hang=-1,main="ward/Countries") > plot(wardC,hang=-1,cex=0.7,main="ward/Countries") {{ru:hse:mva:pics:wardcntr.png}} > dcZ <- dist(scale(C[1:nc,]),"euc")^2 > wardZ <- hclust(d = dcZ, method = "ward.D") > plot(wardZ,hang=-1,cex=0.7,main="ward/Countries") {{ru:hse:mva:pics:wardcntrz.png}}