====== Labs ======
===== Data =====
[[https://www.gesis.org/issp/home/|International Social Survey Programme]] (ISSP)
{{ru:hse:mva:dat:za6770_bq.pdf|Questionnaire}} 2015 ISSP module on work orientation IV
{{ru:hse:mva:dat:za6770_cdb.pdf|Variable report}}
{{ru:hse:mva:dat:za.zip|ZA6770 data}} in SPSS file ''ZA.sav''
> library(foreign)
> setwd("C:/Users/batagelj/Documents/papers/2018/moskva/nusa")
> T <- read.spss("ZA.sav",
+ use.value.labels = FALSE,
+ to.data.frame = TRUE,
+ use.missings = TRUE)
> dim(T)
[1] 51668 442
> names(T)
[1] "studyno" "version" "doi" "c_sample" "country" "c_alphan" "v1" "v2" "v3"
[10] "v4" "v5" "v6" "v7" "v8" "v9" "v10" "v11" "v12"
[19] "v13" "v14" "v15" "V16" "v17" "v18" "v19" "v20" "v21"
[28] "v22" "v23" "v24" "v25" "v26" "v27" "v28" "v29" "v30"
[37] "v31" "v32" "v33" "v34" "v35" "v36" "v37" "v38" "v39"
[46] "v40" "v41" "v42" "v43" "v44" "v45" "v46" "v47" "v48"
[55] "v49" "v50" "v51" "v52" "v53" "v54" "v55" "v56" "v57"
[64] "v58" "v59" "v60" "v61" "v62" "v63" "v64" "v65" "v66"
[73] "v67" "v68" "v69" "v70" "v71" "v72" "v73" "v74" "v75"
[82] "v76" "v77" "v78" "v79" "v80" "v81" "v82" "v83" "v84"
[91] "v85" "v86" "v87" "v88" "v89" "v90" "v91" "v92" "v93"
[100] "v94" "v95" "v96" "v97" "SEX" "BIRTH" "AGE" "DK_AGE" "EDUCYRS"
[109] "AT_DEGR" "AU_DEGR" "BE_DEGR" "CH_DEGR" "CL_DEGR" "CN_DEGR" "CZ_DEGR" "DE_DEGR" "DK_DEGR"
[118] "EE_DEGR" "ES_DEGR" "FI_DEGR" "FR_DEGR" "GB_DEGR" "GE_DEGR" "HR_DEGR" "HU_DEGR" "IL_DEGR"
...
> attributes(T)$variable.labels[29:35]
v23
"Q12b Apply to R's job: my income is high"
v24
"Q12c Apply to R's job: opportunities for advancement are high"
v25
"Q12d Apply to R's job: my job is interesting"
v26
"Q12e Apply to R's job: can work independently"
v27
"Q12f Apply to R's job: can help other people"
v28
"Q12g Apply to R's job: job is useful to society"
v29
"Q12h Apply to R's job: personal contact with other people"
> T[1:15,29:35]
v23 v24 v25 v26 v27 v28 v29
1 4 1 1 1 1 1 1
2 3 2 2 2 2 2 3
3 3 2 2 3 3 3 3
4 4 4 4 5 3 4 2
5 1 1 1 1 1 1 1
6 1 1 3 3 3 3 2
7 1 1 1 1 1 1 1
8 NA NA NA NA NA NA NA
9 2 3 1 1 2 1 2
10 NA NA NA NA NA NA NA
11 2 2 2 2 3 2 2
12 1 1 1 1 1 1 1
13 NA NA NA NA NA NA NA
14 NA NA NA NA NA NA NA
15 NA NA NA NA NA NA NA
> D <- T[,c(6,29:35)]
> dim(D)
[1] 51668 8
> head(D)
c_alphan v23 v24 v25 v26 v27 v28 v29
1 AT 4 1 1 1 1 1 1
2 AT 3 2 2 2 2 2 3
3 AT 3 2 2 3 3 3 3
4 AT 4 4 4 5 3 4 2
5 AT 1 1 1 1 1 1 1
6 AT 1 1 3 3 3 3 2
> D$c_alphan <- as.character(D$c_alphan)
> head(D$c_alphan)
[1] "AT " "AT "
[3] "AT " "AT "
[5] "AT " "AT "
> D$c_alphan <- trimws(as.character(D$c_alphan))
> DC <- D[complete.cases(D),]
> Dru <- DC[DC$c_alphan=="RU",]
> dim(Dru)
[1] 837 8
> varLabs <- attributes(T)$variable.labels[c(6,29:35)]
> varLabs
c_alphan
"Country Prefix ISO 3166"
v23
"Q12b Apply to R's job: my income is high"
v24
"Q12c Apply to R's job: opportunities for advancement are high"
v25
"Q12d Apply to R's job: my job is interesting"
v26
"Q12e Apply to R's job: can work independently"
v27
"Q12f Apply to R's job: can help other people"
v28
"Q12g Apply to R's job: job is useful to society"
v29
"Q12h Apply to R's job: personal contact with other people"
>
> table(D$c_alphan)
AT AU BE CH CL CN CZ DE DK EE ES
1001 1211 2141 1235 1433 1795 1435 1687 1138 1207 1834
FI FR GB-GBN GE HR HU IL IN IS JP LT
1203 1224 1793 1487 1000 1003 1248 1336 1126 1573 1127
LV MX NO NZ PH PL RU SE SI SK SR
1002 1141 1550 901 1200 2112 1596 1162 1023 1150 1139
TW US VE ZA
2031 1477 1007 2940
DC - complete data on selected variables\\
Dru - Russian part of DC
**The scales in selected variables are "reversed" !!!** - 1 is positive and 5 is negative
===== Hierarchical clustering =====
> library(corrgram)
> library(ppcor)
> library(blockmodeling)
> library(psych)
> library(e1071)
> Ru <- Dru[,2:8]
> ZRu <- scale(Ru)
> d <- dist(ZRu, method="euc")
> d2 <- d^2
> single <- hclust(d = d2, method= "single")
> plot(single, hang = -1, main = "single")
> ward <- hclust(d = d2, method = "ward.D")
> plot(ward, hang = -1, main = "ward")
> wardK4 <- cutree(ward, k = 4)
> table(wardK4)
wardK4
1 2 3 4
293 181 175 188
> mRu <- matrix(0,nrow=4,ncol=7)
> for(i in 1:4) mRu[i,] <- colMeans(Ru[wardK4==i,])
> rownames(mRu) <- paste("C",1:4,sep="")
> colnames(mRu) <- colnames(Ru)
> mRu <- rbind(mRu,colMeans(Ru))
> rownames(mRu)[5] <- "all"
> t(mRu)
C1 C2 C3 C4 all
v23 2.143345 3.519337 3.742857 1.904255 2.721625
v24 2.634812 3.729282 3.857143 1.925532 2.967742
v25 2.061433 3.132597 2.891429 1.255319 2.285544
v26 2.535836 3.563536 2.805714 1.675532 2.621266
v27 2.556314 3.491713 2.411429 1.345745 2.456392
v28 2.283276 2.872928 2.068571 1.218085 2.126643
v29 2.218430 3.121547 1.634286 1.319149 2.089606
> zRu <- matrix(0,nrow=4,ncol=7)
> for(i in 1:4) zRu[i,] <- colMeans(ZRu[wardK4==i,])
> rownames(zRu) <- paste("C",1:4,sep="")
> colnames(zRu) <- colnames(ZRu)
> zRu <- rbind(zRu,colMeans(ZRu))
> rownames(zRu)[5] <- "all"
> t(zRu)
C1 C2 C3 C4 all
v23 -0.51111827 0.7050653 0.90262564 -0.7224396 1.183177e-16
v24 -0.29022432 0.6638561 0.77531628 -0.9085243 -3.260534e-17
v25 -0.23043248 0.8709491 0.62297744 -1.0592879 2.745713e-17
v26 -0.07540963 0.8317451 0.16281287 -0.8348037 1.524070e-16
v27 0.09414952 0.9755093 -0.04236573 -1.0464840 2.013523e-16
v28 0.17838210 0.8499063 -0.06613449 -1.0347099 2.352094e-16
v29 0.13490989 1.0806896 -0.47682909 -0.8068528 -6.061791e-17
==== Dendrograms in PDF ====
> pdf("ru-single.pdf",width=11.7,height=8.3,paper="a4r")
> plot(single,hang=-1,cex=0.1,lwd=0.2,main="Russians/single linkage")
> dev.off()
> pdf("ru-ward.pdf",width=11.7,height=8.3,paper="a4r")
> plot(ward,hang=-1,cex=0.1,lwd=0.2,main="Russians/Ward")
> dev.off()
{{ru:hse:mva:pics:ru-single.pdf}}; {{ru:hse:mva:pics:ru-ward.pdf}}
===== k-means =====
The k-means solutions are local optima and can be different in different runs.
> ZDC <- scale(DC[,2:8])
> head(ZDC)
v23 v24 v25 v26 v27 v28 v29
1 0.7533499 -1.9441852 -1.1642108 -1.0781521 -1.1096588 -1.10943828 -0.8978940
2 -0.1536723 -1.0648646 -0.1619814 -0.1629287 -0.1135945 -0.05791786 1.4511185
3 -0.1536723 -1.0648646 -0.1619814 0.7522947 0.8824698 0.99360256 1.4511185
4 0.7533499 0.6937764 1.8424773 2.5827415 0.8824698 2.04512297 0.2766123
5 -1.9677167 -1.9441852 -1.1642108 -1.0781521 -1.1096588 -1.10943828 -0.8978940
6 -1.9677167 -1.9441852 0.8402480 0.7522947 0.8824698 0.99360256 0.2766123
> s <- kmeans(ZDC,centers=20,iter.max=50)
> names(s)
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault"
> table(s$cluster)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
430 593 528 876 1690 2901 965 3202 703 1427 2838 813 521 881 2564 2146
17 18 19 20
481 849 778 1339
> s$iter
[1] 8
> da <- dist(s$centers,method="euc")
> da2 <- da^2
> wa <- hclust(d=da2,method="ward.D")
> plot(wa,hang=-1,main="ward/All")
{{ru:hse:mva:pics:wardall.png}}
> waK5 <- cutree(wa,k=5)
> waK5
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
1 1 2 3 4 5 4 5 2 5 5 4 2 3 5 5 2 2 4 4
> ca <- waK5[s$cluster]
> head(s$cluster)
1 2 3 4 5 6
8 9 17 2 16 18
> head(ca)
8 9 17 2 16 18
5 2 2 1 5 2
> # names(ca) <- names(s$cluster)
> ca <- as.vector(ca)
> mA <- matrix(0,nrow=5,ncol=7)
> for(i in 1:5) mA[i,] <- colMeans(DC[ca==i,2:8])
> rownames(mA) <- paste("C",1:5,sep="")
> colnames(mA) <- colnames(DC[,2:8])
> mA <- rbind(mA,colMeans(DC[,2:8]))
> rownames(mA)[6] <- "all"
> t(mA)
C1 C2 C3 C4 C5 all
v23 4.050831 3.144062 3.311326 3.714951 2.896206 3.169425
v24 4.269795 3.264763 3.637450 3.760072 2.875116 3.211008
v25 4.033236 2.633355 2.152533 2.834378 1.690078 2.161621
v26 3.733138 2.456522 1.853728 3.298299 1.638414 2.178021
v27 3.998045 2.794938 3.639158 2.138048 1.660432 2.114043
v28 4.041056 2.585010 2.745589 1.928201 1.778552 2.055080
v29 2.719453 3.288449 1.653956 1.608415 1.458880 1.764486
> zA <- matrix(0,nrow=5,ncol=7)
> for(i in 1:5) zA[i,] <- colMeans(ZDC[ca==i,])
> rownames(zA) <- paste("C",1:5,sep="")
> colnames(zA) <- colnames(ZDC)
> zA <- rbind(zA,colMeans(ZDC))
> rownames(zA)[6] <- "all"
> t(zA)
C1 C2 C3 C4 C5 all
v23 0.7994547 -0.02300460 0.128707409 0.49480392 -0.2478154 1.287388e-16
v24 0.9310125 0.04726758 0.374978962 0.48280250 -0.2953571 -1.418857e-16
v25 1.8757870 0.47278552 -0.009108647 0.67425651 -0.4725941 1.265665e-16
v26 1.4232796 0.25489064 -0.296800350 1.02530490 -0.4938611 -2.086801e-16
v27 1.8765868 0.67821521 1.519111942 0.02391051 -0.4518257 6.349513e-18
v28 2.0882939 0.55723182 0.726084270 -0.13341646 -0.2907754 2.554601e-16
v29 1.1216138 1.78990370 -0.129819027 -0.18330628 -0.3589360 8.157680e-17
==== Analysis of clusters by countries ====
> table(ca)
ca
1 2 3 4 5
1023 3082 1757 5585 15078
> N <- DC[,1]
> table(N)
N
AT AU BE CH CL CN CZ DE DK EE ES FI FR GB-GBN
625 702 1122 772 666 690 806 1016 703 671 835 593 640 904
GE HR HU IL IN IS JP LT LV MX NO NZ PH PL
407 524 542 762 586 824 845 502 575 598 1051 558 660 770
RU SE SI SK SR TW US VE ZA
837 708 476 581 582 1243 930 376 843
> table(N[ca==1])
AT AU BE CH CL CN CZ DE DK EE ES FI FR GB-GBN
14 14 43 9 27 53 28 21 12 21 44 16 20 17
GE HR HU IL IN IS JP LT LV MX NO NZ PH PL
19 32 38 22 31 21 113 24 10 18 17 6 10 108
RU SE SI SK SR TW US VE ZA
47 18 4 25 17 39 16 6 43
> cas <- table(ca)
> for(i in 1:5) {cat(i,"\n"); print(100*table(N[ca==i])/cas[i])}
1
AT AU BE CH CL CN CZ DE DK
1.3685239 1.3685239 4.2033236 0.8797654 2.6392962 5.1808407 2.7370479 2.0527859 1.1730205
EE ES FI FR GB-GBN GE HR HU IL
2.0527859 4.3010753 1.5640274 1.9550342 1.6617791 1.8572825 3.1280547 3.7145650 2.1505376
IN IS JP LT LV MX NO NZ PH
3.0303030 2.0527859 11.0459433 2.3460411 0.9775171 1.7595308 1.6617791 0.5865103 0.9775171
PL RU SE SI SK SR TW US VE
10.5571848 4.5943304 1.7595308 0.3910068 2.4437928 1.6617791 3.8123167 1.5640274 0.5865103
ZA
4.2033236
...
> table(ca[N=="RU"])
1 2 3 4 5
47 214 42 202 332
> table(ca[N=="DE"])
1 2 3 4 5
21 74 176 70 675
> nt <- table(N)
> 100*table(ca[N=="DE"])/nt["DE"]
1 2 3 4 5
2.066929 7.283465 17.322835 6.889764 66.437008
> 100*table(ca[N=="RU"])/nt["RU"]
1 2 3 4 5
5.615293 25.567503 5.017921 24.133811 39.665472
> 100*table(ca[N=="SI"])/nt["SI"]
1 2 3 4 5
0.8403361 9.0336134 5.8823529 13.4453782 70.7983193
> 100*table(ca[N=="US"])/nt["US"]
1 2 3 4 5
1.720430 5.913978 4.731183 17.204301 70.430108
> 100*table(ca[N=="PL"])/nt["PL"]
1 2 3 4 5
14.025974 16.103896 9.220779 40.259740 20.389610
> 100*table(ca)/length(ca)
ca
1 2 3 4 5
3.856739 11.619227 6.623940 21.055608 56.844486
>
==== Repeat k-means and use the best solution ====
> sb <- s; pb <- s$tot.withinss
> for(i in 1:100){
+ s <- kmeans(ZDC,centers=20,iter.max=50)
+ if(s$tot.withinss < pb) {sb <- s; pb <- s$tot.withinss; cat(i,pb,"\n")}
+ }
1 63693.44
13 63550.52
15 63139.9
20 63027.43
>
> da <- dist(sb$centers,method="euc")
> da2 <- da^2
> wa <- hclust(d=da2,method="ward.D")
> plot(wa,hang=-1,main="ward/All sb")
{{ru:hse:mva:pics:wardallsb.png}}
==== Analysis of countries for sb clustering ====
> clCenters <- function(D,C){
+ k <- length(table(C))
+ mA <- matrix(0,nrow=k,ncol=ncol(D))
+ for(i in 1:k) mA[i,] <- colMeans(D[C==i,])
+ rownames(mA) <- paste("C",1:k,sep="")
+ colnames(mA) <- colnames(D)
+ mA <- rbind(mA,colMeans(D))
+ rownames(mA)[k+1] <- "all"
+ return(t(mA))
+ }
>
> N <- DC[,1]; nt <- table(N)
> Nc <- names(nt); nc <- length(Nc)
> waK4 <- cutree(wa,k=4)
> ca <- as.vector(waK4[sb$cluster])
> clCenters(DC[,2:8],ca)
C1 C2 C3 C4 all
v23 3.247956 3.130331 2.785750 3.963289 3.169425
v24 3.193898 3.257872 2.918976 4.117474 3.211008
v25 2.246442 2.575528 1.437036 3.519457 2.161621
v26 2.289522 2.352730 1.404379 3.772394 2.178021
v27 2.183752 2.796333 1.306531 3.544053 2.114043
v28 2.165051 2.579115 1.335725 3.174009 2.055080
v29 1.740157 3.411718 1.131371 2.244126 1.764486
# C1 - average, positive
# C2 - average, negative
# C3 - positive (happy)
# C4 - negative (unhappy)
> table(ca)
ca
1 2 3 4
13208 2509 8084 2724
> C <- matrix(0,nrow=nc+1,ncol=4)
> rownames(C) <- c(Nc,"All"); colnames(C) <- paste("C",1:4,sep="")
> for(c in Nc) C[c,] <- 100*table(ca[N==c])/nt[c]
> C["All",] <- 100*table(ca)/length(ca)
> C
C1 C2 C3 C4
AT 33.76000 12.960000 48.160000 5.120000
AU 52.56410 5.555556 36.039886 5.840456
BE 49.46524 6.595365 35.472371 8.467023
CH 42.74611 3.756477 51.554404 1.943005
CL 58.85886 8.108108 19.819820 13.213213
CN 56.23188 14.637681 12.028986 17.101449
CZ 50.24814 9.801489 28.660050 11.290323
DE 46.75197 6.102362 40.354331 6.791339
DK 39.11807 3.982930 52.773826 4.125178
EE 55.29061 9.687034 22.503726 12.518629
ES 52.21557 5.868263 27.664671 14.251497
FI 47.89207 8.431703 37.436762 6.239460
FR 52.18750 8.281250 29.843750 9.687500
GB-GBN 52.21239 6.194690 36.061947 5.530973
GE 35.87224 33.169533 24.078624 6.879607
HR 53.43511 7.251908 19.083969 20.229008
HU 51.29151 15.129151 18.819188 14.760148
IL 42.65092 16.404199 34.645669 6.299213
IN 39.07850 31.058020 18.430034 11.433447
IS 51.09223 4.126214 38.956311 5.825243
JP 46.39053 10.650888 9.230769 33.727811
LT 62.35060 7.768924 17.330677 12.549801
LV 59.47826 6.260870 22.260870 12.000000
MX 55.01672 9.197324 24.749164 11.036789
NO 50.14272 6.279734 39.771646 3.805899
NZ 55.01792 5.555556 36.559140 2.867384
PH 59.24242 10.000000 27.727273 3.030303
PL 43.89610 11.688312 5.064935 39.350649
RU 43.96655 18.996416 20.549582 16.487455
SE 50.70621 8.898305 36.016949 4.378531
SI 43.48739 6.932773 45.588235 3.991597
SK 54.04475 10.154905 22.719449 13.080895
SR 52.74914 11.512027 30.584192 5.154639
TW 63.39501 5.792438 23.008850 7.803701
US 43.33333 5.268817 46.559140 4.838710
VE 32.18085 3.457447 60.106383 4.255319
ZA 50.53381 12.455516 24.792408 12.218268
All 49.79453 9.459001 30.476909 10.269557
> dc <- dist(C[1:nc,],"euc")^2
> wardC <- hclust(d = dc, method = "ward.D")
> plot(wardC,hang=-1,main="ward/Countries")
> plot(wardC,hang=-1,cex=0.7,main="ward/Countries")
{{ru:hse:mva:pics:wardcntr.png}}
> dcZ <- dist(scale(C[1:nc,]),"euc")^2
> wardZ <- hclust(d = dcZ, method = "ward.D")
> plot(wardZ,hang=-1,cex=0.7,main="ward/Countries")
{{ru:hse:mva:pics:wardcntrz.png}}