====== Labs ====== ===== PCA ===== > (R <- cor(Dru[,2:8])) v23 v24 v25 v26 v27 v28 v29 v23 1.0000000 0.6723167 0.4864999 0.4495917 0.3191123 0.2197477 0.1205490 v24 0.6723167 1.0000000 0.5647139 0.4535635 0.4060885 0.2914405 0.1609813 v25 0.4864999 0.5647139 1.0000000 0.5401311 0.4553434 0.4226335 0.2866924 v26 0.4495917 0.4535635 0.5401311 1.0000000 0.4493581 0.3200328 0.3377000 v27 0.3191123 0.4060885 0.4553434 0.4493581 1.0000000 0.5681327 0.4990005 v28 0.2197477 0.2914405 0.4226335 0.3200328 0.5681327 1.0000000 0.4857639 v29 0.1205490 0.1609813 0.2866924 0.3377000 0.4990005 0.4857639 1.0000000 > (n <- nrow(Dru)) [1] 837 > plot.mat(R) > cortest.bartlett(R=R,n=n) $chisq [1] 2181.866 $p.value [1] 0 $df [1] 21 > par(mfrow = c(1, 3)) > plot(eigen(x=R)$values,type= "o") > scree(rx = R, factors = FALSE) > fa.parallel(x=R,fa="pc",n.iter=100,use="complete",n.obs=n) Parallel analysis suggests that the number of factors = NA and the number of components = 2 > par(mfrow = c(1,1)) > (PC <- princomp(x = Ru, cor = TRUE, scores = TRUE)) Call: princomp(x = Ru, cor = TRUE, scores = TRUE) Standard deviations: Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 1.8598189 1.1328839 0.7776062 0.7190742 0.6632436 0.6252869 0.5522980 7 variables and 837 observations. > names(PC) [1] "sdev" "loadings" "center" "scale" "n.obs" "scores" "call" > summary(PC) Importance of components: Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Standard deviation 1.8598189 1.1328839 0.77760619 0.71907419 0.66324361 Proportion of Variance 0.4941323 0.1833465 0.08638163 0.07386681 0.06284173 Cumulative Proportion 0.4941323 0.6774789 0.76386051 0.83772732 0.90056904 Comp.6 Comp.7 Standard deviation 0.62528685 0.55229797 Proportion of Variance 0.05585481 0.04357615 Cumulative Proportion 0.95642385 1.00000000 > PC$loadings Loadings: Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 v23 -0.363 0.479 -0.164 0.425 0.327 0.568 v24 -0.397 0.410 -0.271 0.204 -0.192 -0.722 v25 -0.420 0.150 -0.574 -0.467 -0.424 0.263 v26 -0.393 0.756 -0.169 0.276 0.362 -0.172 v27 -0.405 -0.300 -0.158 0.695 -0.437 0.211 v28 -0.356 -0.422 -0.482 -0.326 0.583 -0.107 v29 -0.299 -0.551 0.257 0.557 -0.463 -0.122 Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 Proportion Var 0.143 0.143 0.143 0.143 0.143 0.143 0.143 Cumulative Var 0.143 0.286 0.429 0.571 0.714 0.857 1.000 > > b <- c("red","blue","green","magenta") > plot(PC$scores[,1],PC$scores[,2],col=b[wardK4],pch=16) {{ru:hse:mva:pics:pc4.png}} > s4 <- kmeans(ZDC,centers=4,iter.max=50) > names(s4) [1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size" [8] "iter" "ifault" > t(s4$centers) 1 2 3 4 v23 -0.58350882 0.73111782 0.5810569 -0.6240540 v24 -0.55884567 0.72289606 0.6861428 -0.7331494 v25 -0.14874700 0.08864125 1.1835084 -0.8793363 v26 -0.08318695 0.10937110 0.8743118 -0.7431067 v27 0.20625731 -0.32579388 1.2707661 -0.8996331 v28 0.27571894 -0.30386671 1.0861920 -0.8722506 v29 0.28342478 -0.27036768 0.9061068 -0.7789340 > PC4 <- princomp(x = ZDC, cor = TRUE, scores = TRUE) > PC4$loadings Loadings: Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 v23 -0.289 0.591 0.227 0.106 -0.365 0.608 v24 -0.329 0.530 0.268 -0.720 v25 -0.450 -0.153 0.831 0.250 -0.111 v26 -0.353 -0.857 -0.305 -0.108 0.160 v27 -0.442 -0.322 -0.270 -0.255 -0.740 v28 -0.405 -0.358 0.224 -0.494 0.144 0.621 v29 -0.348 -0.353 0.245 0.811 0.137 Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 Proportion Var 0.143 0.143 0.143 0.143 0.143 0.143 0.143 Cumulative Var 0.143 0.286 0.429 0.571 0.714 0.857 1.000 > summary(PC4) Importance of components: Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Standard deviation 1.6741225 1.1541208 0.8839795 0.80879558 0.72939407 0.70127017 0.63714461 Proportion of Variance 0.4003838 0.1902850 0.1116314 0.09345004 0.07600224 0.07025426 0.05799332 Cumulative Proportion 0.4003838 0.5906687 0.7023001 0.79575017 0.87175241 0.94200668 1.00000000 > names(PC4) [1] "sdev" "loadings" "center" "scale" "n.obs" "scores" "call" > > # centers (x,y) of clusters from s4 in PC plane > x <- numeric(4); y <- numeric(4) > for(i in 1:4){ + x[i] <- mean(PC4$scores[s4$cluster==i,1]) + y[i] <- mean(PC4$scores[s4$cluster==i,2]) + } > cbind(x,y) x y [1,] 0.1473235 -0.92528132 [2,] -0.1664773 1.14049023 [3,] -2.5526368 -0.24401054 [4,] 2.1022616 -0.01174882 > c1 <- PC4$loadings[,1]; c2 <- PC4$loadings[,2] > X <- as.vector(ZDC%*%c1); Y <- as.vector(ZDC%*%c2) > plot(X,Y,col=b[s4$cluster],pch=16) {{ru:hse:mva:pics:pc4all.png}} Check that the centers of s4 in the PC plane are the same as projections of s4 centers on PC plane. > points(x,y,cex=2,pch=16) > t(s4$centers) 1 2 3 4 v23 -0.58350882 0.73111782 0.5810569 -0.6240540 v24 -0.55884567 0.72289606 0.6861428 -0.7331494 v25 -0.14874700 0.08864125 1.1835084 -0.8793363 v26 -0.08318695 0.10937110 0.8743118 -0.7431067 v27 0.20625731 -0.32579388 1.2707661 -0.8996331 v28 0.27571894 -0.30386671 1.0861920 -0.8722506 v29 0.28342478 -0.27036768 0.9061068 -0.7789340 > x4 <- as.vector(s4$centers%*%c1) > y4 <- as.vector(s4$centers%*%c2) > cbind(x,y) x y [1,] 0.1473235 -0.92528132 [2,] -0.1664773 1.14049023 [3,] -2.5526368 -0.24401054 [4,] 2.1022616 -0.01174882 > cbind(x4,y4) x4 y4 [1,] 0.1473207 -0.9252639 [2,] -0.1664742 1.1404687 [3,] -2.5525887 -0.2440059 [4,] 2.1022220 -0.0117486 > Small differences are due to numerical errors. ===== Factor analysis ===== > library(GPArotation) > FA.oblimin <- fa(r=Ru,nfactors=2,n.obs=n,rotate="oblimin", + scores=TRUE,fm="pa",max.iter = 1000) > FA.oblimin Factor Analysis using method = pa Call: fa(r = Ru, nfactors = 2, n.obs = n, rotate = "oblimin", scores = TRUE, max.iter = 1000, fm = "pa") Standardized loadings (pattern matrix) based upon correlation matrix PA1 PA2 h2 u2 com v23 0.82 -0.08 0.62 0.38 1.0 v24 0.83 0.00 0.69 0.31 1.0 v25 0.55 0.30 0.54 0.46 1.5 v26 0.45 0.32 0.43 0.57 1.8 v27 0.15 0.70 0.60 0.40 1.1 v28 0.01 0.71 0.52 0.48 1.0 v29 -0.13 0.73 0.47 0.53 1.1 PA1 PA2 SS loadings 2.02 1.85 Proportion Var 0.29 0.26 Cumulative Var 0.29 0.55 Proportion Explained 0.52 0.48 Cumulative Proportion 0.52 1.00 With factor correlations of PA1 PA2 PA1 1.00 0.45 PA2 0.45 1.00 Mean item complexity = 1.2 Test of the hypothesis that 2 factors are sufficient. The degrees of freedom for the null model are 21 and the objective function was 2.62 with Chi Square of 2181.87 The degrees of freedom for the model are 8 and the objective function was 0.06 The root mean square of the residuals (RMSR) is 0.03 The df corrected root mean square of the residuals is 0.04 The harmonic number of observations is 837 with the empirical chi square 23.38 with prob < 0.0029 The total number of observations was 837 with Likelihood Chi Square = 53.69 with prob < 7.9e-09 Tucker Lewis Index of factoring reliability = 0.944 RMSEA index = 0.083 and the 90 % confidence intervals are 0.063 0.104 BIC = -0.15 Fit based upon off diagonal values = 1 Measures of factor score adequacy PA1 PA2 Correlation of (regression) scores with factors 0.92 0.89 Multiple R square of scores with factors 0.84 0.80 Minimum correlation of possible factor scores 0.69 0.60 > FA.varimax <- fa(r=Ru,nfactors=2,n.obs=n,rotate="varimax", + scores=TRUE,fm = "pa") > FA.varimax Factor Analysis using method = pa Call: fa(r = Ru, nfactors = 2, n.obs = n, rotate = "varimax", scores = TRUE, fm = "pa") Standardized loadings (pattern matrix) based upon correlation matrix PA1 PA2 h2 u2 com v23 0.78 0.08 0.62 0.38 1.0 v24 0.81 0.17 0.69 0.31 1.1 v25 0.62 0.40 0.54 0.46 1.7 v26 0.52 0.39 0.43 0.57 1.9 v27 0.33 0.70 0.60 0.40 1.4 v28 0.20 0.69 0.52 0.48 1.2 v29 0.07 0.68 0.47 0.53 1.0 PA1 PA2 SS loadings 2.09 1.78 Proportion Var 0.30 0.25 Cumulative Var 0.30 0.55 Proportion Explained 0.54 0.46 Cumulative Proportion 0.54 1.00 Mean item complexity = 1.3 Test of the hypothesis that 2 factors are sufficient. The degrees of freedom for the null model are 21 and the objective function was 2.62 with Chi Square of 2181.87 The degrees of freedom for the model are 8 and the objective function was 0.06 The root mean square of the residuals (RMSR) is 0.03 The df corrected root mean square of the residuals is 0.04 The harmonic number of observations is 837 with the empirical chi square 23.38 with prob < 0.0029 The total number of observations was 837 with Likelihood Chi Square = 53.69 with prob < 7.9e-09 Tucker Lewis Index of factoring reliability = 0.944 RMSEA index = 0.083 and the 90 % confidence intervals are 0.063 0.104 BIC = -0.15 Fit based upon off diagonal values = 1 Measures of factor score adequacy PA1 PA2 Correlation of (regression) scores with factors 0.90 0.86 Multiple R square of scores with factors 0.81 0.74 Minimum correlation of possible factor scores 0.61 0.49 > ===== Marjan's R code ===== * {{ru:hse:mva:dat:discrim.zip|Discriminant analysis}} * {{ru:hse:mva:dat:canon.zip|Canonical}}