====== Labs ======
===== PCA =====
> (R <- cor(Dru[,2:8]))
v23 v24 v25 v26 v27 v28 v29
v23 1.0000000 0.6723167 0.4864999 0.4495917 0.3191123 0.2197477 0.1205490
v24 0.6723167 1.0000000 0.5647139 0.4535635 0.4060885 0.2914405 0.1609813
v25 0.4864999 0.5647139 1.0000000 0.5401311 0.4553434 0.4226335 0.2866924
v26 0.4495917 0.4535635 0.5401311 1.0000000 0.4493581 0.3200328 0.3377000
v27 0.3191123 0.4060885 0.4553434 0.4493581 1.0000000 0.5681327 0.4990005
v28 0.2197477 0.2914405 0.4226335 0.3200328 0.5681327 1.0000000 0.4857639
v29 0.1205490 0.1609813 0.2866924 0.3377000 0.4990005 0.4857639 1.0000000
> (n <- nrow(Dru))
[1] 837
> plot.mat(R)
> cortest.bartlett(R=R,n=n)
$chisq
[1] 2181.866
$p.value
[1] 0
$df
[1] 21
> par(mfrow = c(1, 3))
> plot(eigen(x=R)$values,type= "o")
> scree(rx = R, factors = FALSE)
> fa.parallel(x=R,fa="pc",n.iter=100,use="complete",n.obs=n)
Parallel analysis suggests that the number of factors = NA and the number of components = 2
> par(mfrow = c(1,1))
> (PC <- princomp(x = Ru, cor = TRUE, scores = TRUE))
Call:
princomp(x = Ru, cor = TRUE, scores = TRUE)
Standard deviations:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
1.8598189 1.1328839 0.7776062 0.7190742 0.6632436 0.6252869 0.5522980
7 variables and 837 observations.
> names(PC)
[1] "sdev" "loadings" "center" "scale" "n.obs" "scores" "call"
> summary(PC)
Importance of components:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
Standard deviation 1.8598189 1.1328839 0.77760619 0.71907419 0.66324361
Proportion of Variance 0.4941323 0.1833465 0.08638163 0.07386681 0.06284173
Cumulative Proportion 0.4941323 0.6774789 0.76386051 0.83772732 0.90056904
Comp.6 Comp.7
Standard deviation 0.62528685 0.55229797
Proportion of Variance 0.05585481 0.04357615
Cumulative Proportion 0.95642385 1.00000000
> PC$loadings
Loadings:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
v23 -0.363 0.479 -0.164 0.425 0.327 0.568
v24 -0.397 0.410 -0.271 0.204 -0.192 -0.722
v25 -0.420 0.150 -0.574 -0.467 -0.424 0.263
v26 -0.393 0.756 -0.169 0.276 0.362 -0.172
v27 -0.405 -0.300 -0.158 0.695 -0.437 0.211
v28 -0.356 -0.422 -0.482 -0.326 0.583 -0.107
v29 -0.299 -0.551 0.257 0.557 -0.463 -0.122
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000
Proportion Var 0.143 0.143 0.143 0.143 0.143 0.143 0.143
Cumulative Var 0.143 0.286 0.429 0.571 0.714 0.857 1.000
>
> b <- c("red","blue","green","magenta")
> plot(PC$scores[,1],PC$scores[,2],col=b[wardK4],pch=16)
{{ru:hse:mva:pics:pc4.png}}
> s4 <- kmeans(ZDC,centers=4,iter.max=50)
> names(s4)
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size"
[8] "iter" "ifault"
> t(s4$centers)
1 2 3 4
v23 -0.58350882 0.73111782 0.5810569 -0.6240540
v24 -0.55884567 0.72289606 0.6861428 -0.7331494
v25 -0.14874700 0.08864125 1.1835084 -0.8793363
v26 -0.08318695 0.10937110 0.8743118 -0.7431067
v27 0.20625731 -0.32579388 1.2707661 -0.8996331
v28 0.27571894 -0.30386671 1.0861920 -0.8722506
v29 0.28342478 -0.27036768 0.9061068 -0.7789340
> PC4 <- princomp(x = ZDC, cor = TRUE, scores = TRUE)
> PC4$loadings
Loadings:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
v23 -0.289 0.591 0.227 0.106 -0.365 0.608
v24 -0.329 0.530 0.268 -0.720
v25 -0.450 -0.153 0.831 0.250 -0.111
v26 -0.353 -0.857 -0.305 -0.108 0.160
v27 -0.442 -0.322 -0.270 -0.255 -0.740
v28 -0.405 -0.358 0.224 -0.494 0.144 0.621
v29 -0.348 -0.353 0.245 0.811 0.137
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000
Proportion Var 0.143 0.143 0.143 0.143 0.143 0.143 0.143
Cumulative Var 0.143 0.286 0.429 0.571 0.714 0.857 1.000
> summary(PC4)
Importance of components:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
Standard deviation 1.6741225 1.1541208 0.8839795 0.80879558 0.72939407 0.70127017 0.63714461
Proportion of Variance 0.4003838 0.1902850 0.1116314 0.09345004 0.07600224 0.07025426 0.05799332
Cumulative Proportion 0.4003838 0.5906687 0.7023001 0.79575017 0.87175241 0.94200668 1.00000000
> names(PC4)
[1] "sdev" "loadings" "center" "scale" "n.obs" "scores" "call"
>
> # centers (x,y) of clusters from s4 in PC plane
> x <- numeric(4); y <- numeric(4)
> for(i in 1:4){
+ x[i] <- mean(PC4$scores[s4$cluster==i,1])
+ y[i] <- mean(PC4$scores[s4$cluster==i,2])
+ }
> cbind(x,y)
x y
[1,] 0.1473235 -0.92528132
[2,] -0.1664773 1.14049023
[3,] -2.5526368 -0.24401054
[4,] 2.1022616 -0.01174882
> c1 <- PC4$loadings[,1]; c2 <- PC4$loadings[,2]
> X <- as.vector(ZDC%*%c1); Y <- as.vector(ZDC%*%c2)
> plot(X,Y,col=b[s4$cluster],pch=16)
{{ru:hse:mva:pics:pc4all.png}}
Check that the centers of s4 in the PC plane are the same as projections of s4 centers on PC plane.
> points(x,y,cex=2,pch=16)
> t(s4$centers)
1 2 3 4
v23 -0.58350882 0.73111782 0.5810569 -0.6240540
v24 -0.55884567 0.72289606 0.6861428 -0.7331494
v25 -0.14874700 0.08864125 1.1835084 -0.8793363
v26 -0.08318695 0.10937110 0.8743118 -0.7431067
v27 0.20625731 -0.32579388 1.2707661 -0.8996331
v28 0.27571894 -0.30386671 1.0861920 -0.8722506
v29 0.28342478 -0.27036768 0.9061068 -0.7789340
> x4 <- as.vector(s4$centers%*%c1)
> y4 <- as.vector(s4$centers%*%c2)
> cbind(x,y)
x y
[1,] 0.1473235 -0.92528132
[2,] -0.1664773 1.14049023
[3,] -2.5526368 -0.24401054
[4,] 2.1022616 -0.01174882
> cbind(x4,y4)
x4 y4
[1,] 0.1473207 -0.9252639
[2,] -0.1664742 1.1404687
[3,] -2.5525887 -0.2440059
[4,] 2.1022220 -0.0117486
>
Small differences are due to numerical errors.
===== Factor analysis =====
> library(GPArotation)
> FA.oblimin <- fa(r=Ru,nfactors=2,n.obs=n,rotate="oblimin",
+ scores=TRUE,fm="pa",max.iter = 1000)
> FA.oblimin
Factor Analysis using method = pa
Call: fa(r = Ru, nfactors = 2, n.obs = n, rotate = "oblimin", scores = TRUE,
max.iter = 1000, fm = "pa")
Standardized loadings (pattern matrix) based upon correlation matrix
PA1 PA2 h2 u2 com
v23 0.82 -0.08 0.62 0.38 1.0
v24 0.83 0.00 0.69 0.31 1.0
v25 0.55 0.30 0.54 0.46 1.5
v26 0.45 0.32 0.43 0.57 1.8
v27 0.15 0.70 0.60 0.40 1.1
v28 0.01 0.71 0.52 0.48 1.0
v29 -0.13 0.73 0.47 0.53 1.1
PA1 PA2
SS loadings 2.02 1.85
Proportion Var 0.29 0.26
Cumulative Var 0.29 0.55
Proportion Explained 0.52 0.48
Cumulative Proportion 0.52 1.00
With factor correlations of
PA1 PA2
PA1 1.00 0.45
PA2 0.45 1.00
Mean item complexity = 1.2
Test of the hypothesis that 2 factors are sufficient.
The degrees of freedom for the null model are 21 and the objective function
was 2.62 with Chi Square of 2181.87
The degrees of freedom for the model are 8 and the objective function was 0.06
The root mean square of the residuals (RMSR) is 0.03
The df corrected root mean square of the residuals is 0.04
The harmonic number of observations is 837 with the empirical chi square 23.38 with prob < 0.0029
The total number of observations was 837 with Likelihood Chi Square = 53.69 with prob < 7.9e-09
Tucker Lewis Index of factoring reliability = 0.944
RMSEA index = 0.083 and the 90 % confidence intervals are 0.063 0.104
BIC = -0.15
Fit based upon off diagonal values = 1
Measures of factor score adequacy
PA1 PA2
Correlation of (regression) scores with factors 0.92 0.89
Multiple R square of scores with factors 0.84 0.80
Minimum correlation of possible factor scores 0.69 0.60
> FA.varimax <- fa(r=Ru,nfactors=2,n.obs=n,rotate="varimax",
+ scores=TRUE,fm = "pa")
> FA.varimax
Factor Analysis using method = pa
Call: fa(r = Ru, nfactors = 2, n.obs = n, rotate = "varimax", scores = TRUE,
fm = "pa")
Standardized loadings (pattern matrix) based upon correlation matrix
PA1 PA2 h2 u2 com
v23 0.78 0.08 0.62 0.38 1.0
v24 0.81 0.17 0.69 0.31 1.1
v25 0.62 0.40 0.54 0.46 1.7
v26 0.52 0.39 0.43 0.57 1.9
v27 0.33 0.70 0.60 0.40 1.4
v28 0.20 0.69 0.52 0.48 1.2
v29 0.07 0.68 0.47 0.53 1.0
PA1 PA2
SS loadings 2.09 1.78
Proportion Var 0.30 0.25
Cumulative Var 0.30 0.55
Proportion Explained 0.54 0.46
Cumulative Proportion 0.54 1.00
Mean item complexity = 1.3
Test of the hypothesis that 2 factors are sufficient.
The degrees of freedom for the null model are 21 and the objective function was 2.62 with Chi Square of 2181.87
The degrees of freedom for the model are 8 and the objective function was 0.06
The root mean square of the residuals (RMSR) is 0.03
The df corrected root mean square of the residuals is 0.04
The harmonic number of observations is 837 with the empirical chi square 23.38 with prob < 0.0029
The total number of observations was 837 with Likelihood Chi Square = 53.69 with prob < 7.9e-09
Tucker Lewis Index of factoring reliability = 0.944
RMSEA index = 0.083 and the 90 % confidence intervals are 0.063 0.104
BIC = -0.15
Fit based upon off diagonal values = 1
Measures of factor score adequacy
PA1 PA2
Correlation of (regression) scores with factors 0.90 0.86
Multiple R square of scores with factors 0.81 0.74
Minimum correlation of possible factor scores 0.61 0.49
>
===== Marjan's R code =====
* {{ru:hse:mva:dat:discrim.zip|Discriminant analysis}}
* {{ru:hse:mva:dat:canon.zip|Canonical}}