Labs

PCA

> (R <- cor(Dru[,2:8]))
          v23       v24       v25       v26       v27       v28       v29
v23 1.0000000 0.6723167 0.4864999 0.4495917 0.3191123 0.2197477 0.1205490
v24 0.6723167 1.0000000 0.5647139 0.4535635 0.4060885 0.2914405 0.1609813
v25 0.4864999 0.5647139 1.0000000 0.5401311 0.4553434 0.4226335 0.2866924
v26 0.4495917 0.4535635 0.5401311 1.0000000 0.4493581 0.3200328 0.3377000
v27 0.3191123 0.4060885 0.4553434 0.4493581 1.0000000 0.5681327 0.4990005
v28 0.2197477 0.2914405 0.4226335 0.3200328 0.5681327 1.0000000 0.4857639
v29 0.1205490 0.1609813 0.2866924 0.3377000 0.4990005 0.4857639 1.0000000
> (n <- nrow(Dru))
[1] 837
> plot.mat(R)
> cortest.bartlett(R=R,n=n)
$chisq
[1] 2181.866

$p.value
[1] 0

$df
[1] 21
> par(mfrow = c(1, 3))
> plot(eigen(x=R)$values,type= "o")
> scree(rx = R, factors = FALSE)
> fa.parallel(x=R,fa="pc",n.iter=100,use="complete",n.obs=n)
Parallel analysis suggests that the number of factors =  NA  and the number of components =  2 
> par(mfrow = c(1,1))
> (PC <- princomp(x = Ru, cor = TRUE, scores = TRUE))
Call:
princomp(x = Ru, cor = TRUE, scores = TRUE)

Standard deviations:
   Comp.1    Comp.2    Comp.3    Comp.4    Comp.5    Comp.6    Comp.7 
1.8598189 1.1328839 0.7776062 0.7190742 0.6632436 0.6252869 0.5522980 

 7  variables and  837 observations.
> names(PC)
[1] "sdev"     "loadings" "center"   "scale"    "n.obs"    "scores"   "call"    
> summary(PC)
Importance of components:
                          Comp.1    Comp.2     Comp.3     Comp.4     Comp.5
Standard deviation     1.8598189 1.1328839 0.77760619 0.71907419 0.66324361
Proportion of Variance 0.4941323 0.1833465 0.08638163 0.07386681 0.06284173
Cumulative Proportion  0.4941323 0.6774789 0.76386051 0.83772732 0.90056904
                           Comp.6     Comp.7
Standard deviation     0.62528685 0.55229797
Proportion of Variance 0.05585481 0.04357615
Cumulative Proportion  0.95642385 1.00000000
> PC$loadings

Loadings:
    Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
v23 -0.363  0.479 -0.164  0.425         0.327  0.568
v24 -0.397  0.410 -0.271  0.204        -0.192 -0.722
v25 -0.420  0.150        -0.574 -0.467 -0.424  0.263
v26 -0.393         0.756 -0.169  0.276  0.362 -0.172
v27 -0.405 -0.300 -0.158         0.695 -0.437  0.211
v28 -0.356 -0.422 -0.482 -0.326         0.583 -0.107
v29 -0.299 -0.551  0.257  0.557 -0.463 -0.122       

               Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
SS loadings     1.000  1.000  1.000  1.000  1.000  1.000  1.000
Proportion Var  0.143  0.143  0.143  0.143  0.143  0.143  0.143
Cumulative Var  0.143  0.286  0.429  0.571  0.714  0.857  1.000
> 
> b <- c("red","blue","green","magenta")
> plot(PC$scores[,1],PC$scores[,2],col=b[wardK4],pch=16) 

> s4 <- kmeans(ZDC,centers=4,iter.max=50)
> names(s4)
[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"    "size"        
[8] "iter"         "ifault"      
> t(s4$centers)
              1           2         3          4
v23 -0.58350882  0.73111782 0.5810569 -0.6240540
v24 -0.55884567  0.72289606 0.6861428 -0.7331494
v25 -0.14874700  0.08864125 1.1835084 -0.8793363
v26 -0.08318695  0.10937110 0.8743118 -0.7431067
v27  0.20625731 -0.32579388 1.2707661 -0.8996331
v28  0.27571894 -0.30386671 1.0861920 -0.8722506
v29  0.28342478 -0.27036768 0.9061068 -0.7789340
> PC4 <- princomp(x = ZDC, cor = TRUE, scores = TRUE)
> PC4$loadings

Loadings:
    Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
v23 -0.289  0.591  0.227  0.106 -0.365  0.608       
v24 -0.329  0.530  0.268               -0.720       
v25 -0.450        -0.153         0.831  0.250 -0.111
v26 -0.353        -0.857        -0.305 -0.108  0.160
v27 -0.442 -0.322        -0.270 -0.255        -0.740
v28 -0.405 -0.358  0.224 -0.494         0.144  0.621
v29 -0.348 -0.353  0.245  0.811                0.137

               Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
SS loadings     1.000  1.000  1.000  1.000  1.000  1.000  1.000
Proportion Var  0.143  0.143  0.143  0.143  0.143  0.143  0.143
Cumulative Var  0.143  0.286  0.429  0.571  0.714  0.857  1.000
> summary(PC4)
Importance of components:
                          Comp.1    Comp.2    Comp.3     Comp.4     Comp.5     Comp.6     Comp.7
Standard deviation     1.6741225 1.1541208 0.8839795 0.80879558 0.72939407 0.70127017 0.63714461
Proportion of Variance 0.4003838 0.1902850 0.1116314 0.09345004 0.07600224 0.07025426 0.05799332
Cumulative Proportion  0.4003838 0.5906687 0.7023001 0.79575017 0.87175241 0.94200668 1.00000000
> names(PC4)
[1] "sdev"     "loadings" "center"   "scale"    "n.obs"    "scores"   "call" 
>   
> # centers (x,y) of clusters from s4 in PC plane
> x <- numeric(4); y <- numeric(4)
> for(i in 1:4){ 
+   x[i] <- mean(PC4$scores[s4$cluster==i,1])
+   y[i] <- mean(PC4$scores[s4$cluster==i,2])
+ }
> cbind(x,y)
              x           y
[1,]  0.1473235 -0.92528132
[2,] -0.1664773  1.14049023
[3,] -2.5526368 -0.24401054
[4,]  2.1022616 -0.01174882
> c1 <- PC4$loadings[,1];  c2 <- PC4$loadings[,2]
> X <- as.vector(ZDC%*%c1);  Y <- as.vector(ZDC%*%c2)
> plot(X,Y,col=b[s4$cluster],pch=16)

Check that the centers of s4 in the PC plane are the same as projections of s4 centers on PC plane.

> points(x,y,cex=2,pch=16)
> t(s4$centers)
              1           2         3          4
v23 -0.58350882  0.73111782 0.5810569 -0.6240540
v24 -0.55884567  0.72289606 0.6861428 -0.7331494
v25 -0.14874700  0.08864125 1.1835084 -0.8793363
v26 -0.08318695  0.10937110 0.8743118 -0.7431067
v27  0.20625731 -0.32579388 1.2707661 -0.8996331
v28  0.27571894 -0.30386671 1.0861920 -0.8722506
v29  0.28342478 -0.27036768 0.9061068 -0.7789340
> x4 <- as.vector(s4$centers%*%c1)
> y4 <- as.vector(s4$centers%*%c2)
> cbind(x,y)
              x           y
[1,]  0.1473235 -0.92528132
[2,] -0.1664773  1.14049023
[3,] -2.5526368 -0.24401054
[4,]  2.1022616 -0.01174882
> cbind(x4,y4)
             x4         y4
[1,]  0.1473207 -0.9252639
[2,] -0.1664742  1.1404687
[3,] -2.5525887 -0.2440059
[4,]  2.1022220 -0.0117486
> 

Small differences are due to numerical errors.

Factor analysis

> library(GPArotation)
> FA.oblimin <- fa(r=Ru,nfactors=2,n.obs=n,rotate="oblimin",
+   scores=TRUE,fm="pa",max.iter = 1000)
> FA.oblimin
Factor Analysis using method =  pa
Call: fa(r = Ru, nfactors = 2, n.obs = n, rotate = "oblimin", scores = TRUE, 
    max.iter = 1000, fm = "pa")
Standardized loadings (pattern matrix) based upon correlation matrix
      PA1   PA2   h2   u2 com
v23  0.82 -0.08 0.62 0.38 1.0
v24  0.83  0.00 0.69 0.31 1.0
v25  0.55  0.30 0.54 0.46 1.5
v26  0.45  0.32 0.43 0.57 1.8
v27  0.15  0.70 0.60 0.40 1.1
v28  0.01  0.71 0.52 0.48 1.0
v29 -0.13  0.73 0.47 0.53 1.1

                       PA1  PA2
SS loadings           2.02 1.85
Proportion Var        0.29 0.26
Cumulative Var        0.29 0.55
Proportion Explained  0.52 0.48
Cumulative Proportion 0.52 1.00

 With factor correlations of 
     PA1  PA2
PA1 1.00 0.45
PA2 0.45 1.00

Mean item complexity =  1.2
Test of the hypothesis that 2 factors are sufficient.

The degrees of freedom for the null model are  21  and the objective function
was  2.62 with Chi Square of  2181.87
The degrees of freedom for the model are 8  and the objective function was  0.06 

The root mean square of the residuals (RMSR) is  0.03 
The df corrected root mean square of the residuals is  0.04 

The harmonic number of observations is  837 with the empirical chi square  23.38  with prob <  0.0029 
The total number of observations was  837  with Likelihood Chi Square =  53.69  with prob <  7.9e-09 

Tucker Lewis Index of factoring reliability =  0.944
RMSEA index =  0.083  and the 90 % confidence intervals are  0.063 0.104
BIC =  -0.15
Fit based upon off diagonal values = 1
Measures of factor score adequacy             
                                                   PA1  PA2
Correlation of (regression) scores with factors   0.92 0.89
Multiple R square of scores with factors          0.84 0.80
Minimum correlation of possible factor scores     0.69 0.60
> FA.varimax <- fa(r=Ru,nfactors=2,n.obs=n,rotate="varimax", 
+   scores=TRUE,fm = "pa")
> FA.varimax
Factor Analysis using method =  pa
Call: fa(r = Ru, nfactors = 2, n.obs = n, rotate = "varimax", scores = TRUE, 
    fm = "pa")
Standardized loadings (pattern matrix) based upon correlation matrix
     PA1  PA2   h2   u2 com
v23 0.78 0.08 0.62 0.38 1.0
v24 0.81 0.17 0.69 0.31 1.1
v25 0.62 0.40 0.54 0.46 1.7
v26 0.52 0.39 0.43 0.57 1.9
v27 0.33 0.70 0.60 0.40 1.4
v28 0.20 0.69 0.52 0.48 1.2
v29 0.07 0.68 0.47 0.53 1.0

                       PA1  PA2
SS loadings           2.09 1.78
Proportion Var        0.30 0.25
Cumulative Var        0.30 0.55
Proportion Explained  0.54 0.46
Cumulative Proportion 0.54 1.00

Mean item complexity =  1.3
Test of the hypothesis that 2 factors are sufficient.

The degrees of freedom for the null model are  21  and the objective function was  2.62 with Chi Square of  2181.87
The degrees of freedom for the model are 8  and the objective function was  0.06 

The root mean square of the residuals (RMSR) is  0.03 
The df corrected root mean square of the residuals is  0.04 

The harmonic number of observations is  837 with the empirical chi square  23.38  with prob <  0.0029 
The total number of observations was  837  with Likelihood Chi Square =  53.69  with prob <  7.9e-09 

Tucker Lewis Index of factoring reliability =  0.944
RMSEA index =  0.083  and the 90 % confidence intervals are  0.063 0.104
BIC =  -0.15
Fit based upon off diagonal values = 1
Measures of factor score adequacy             
                                                   PA1  PA2
Correlation of (regression) scores with factors   0.90 0.86
Multiple R square of scores with factors          0.81 0.74
Minimum correlation of possible factor scores     0.61 0.49
> 

Marjan's R code

ru/hse/mva/lab3.txt · Last modified: 2019/05/24 10:19 by vlado
 
Except where otherwise noted, content on this wiki is licensed under the following license: CC Attribution-Noncommercial-Share Alike 3.0 Unported
Recent changes RSS feed Donate Powered by PHP Valid XHTML 1.0 Valid CSS Driven by DokuWiki