====== Clustering ====== ===== Storing the standardized variables as Pajek's vectors =====


setwd("D:/Data/counties/pajek2")
load("../pajek/rankDat.RData")
objects()
dim(da)
names(da)
del <- c(1,2,4,6,9,10,11,14,15,20,25,30,33,35,36,37,38,39,40,41,42,45,46,48,
   52,53,56,57,58,60,61,62,63,65,67,69,71,72,73,74,75,76,77,78,79,81,82,89,90,91)
V <- da[,-del]
for(i in 1:nrow(V)) if(length(which(is.na(V[i,])))>0) cat(i,"\n")

There are two counties that have NA values: 1620 and 2888.


> da[1620,"Areaname"]
[1] "Yellowstone National Park, MT"
> V[1620,]
     AGE050200D CLF040200D EDU685200D HSG045200D IPE010200D IPE120200D PIN020200D
1620          0          0          0          0          0          0          0
     POP050200D POP060200D POP165200D POP255200D POP285200D POP325200D POP405200D
1620          0          0          0          0          0          0          0
     VST020200D VST220200D VST420200D WAT130200D P.pop.under18 P.pop.over85 P.land.farms
1620          0          0          0          0            NA           NA           NA
     P.emply.ind.CONSTRUCTION P.emply.ind.MANUFACTORING P.emply.ind.TRANSPORT.WAREHOUSING
1620                       NA                        NA                                NA
     P.emply.ind.FINANC.INSUR P.emply.ind.PROFscientTECH P.emply.ind.EDUC.HEALTH
1620                       NA                         NA                      NA
     P.25overLESS9thGRADE P.employ.FARMING P.employ.GOV.stateLoc
1620                   NA               NA                    NA
     P.OWNERoccupiedHousingUnits P.occupiedHousingUnitsLackingPlumb P.RURALpopul
1620                          NA                                 NA           NA
     P.CHANGEurban90to00 CHANGEperCapitaIncome89to99 GroundWaterUsePerCapita
1620                   0                       -7925                      NA
     P.NET.DOMESTIC.MIGRATIONS P.NativePopulationBornInStateOfRes R.LABOR.FORCEmaleFemale
1620                        NA                                 NA                      NA
     R.VOTING.DEMOCRATESoverREPUBLICANS P.PUBLIC.SCHOOL.ENROLNEMT P.CHANGEpverty95to00
1620                                 NA                        NA                   NA
> da[2888,"Areaname"]
[1] "Clifton Forge, VA"
> V[2888,]
     AGE050200D CLF040200D EDU685200D HSG045200D IPE010200D IPE120200D PIN020200D
2888       42.9          0        9.6       -2.9          0          0          0
     POP050200D POP060200D POP165200D POP255200D POP285200D POP325200D POP405200D
2888       -8.3     1429.7       55.9       14.6        0.1          0        0.9
     VST020200D VST220200D VST420200D WAT130200D P.pop.under18 P.pop.over85 P.land.farms
2888          0          0          0          0            NA           NA            0
     P.emply.ind.CONSTRUCTION P.emply.ind.MANUFACTORING P.emply.ind.TRANSPORT.WAREHOUSING
2888                 8.024017                  17.41266                          7.969432
     P.emply.ind.FINANC.INSUR P.emply.ind.PROFscientTECH P.emply.ind.EDUC.HEALTH
2888                 2.783843                    5.18559                20.08734
     P.25overLESS9thGRADE P.employ.FARMING P.employ.GOV.stateLoc
2888             12.63666                0                     0
     P.OWNERoccupiedHousingUnits P.occupiedHousingUnitsLackingPlumb P.RURALpopul
2888                     55.7274                                  0           NA
     P.CHANGEurban90to00 CHANGEperCapitaIncome89to99 GroundWaterUsePerCapita
2888           -21.04695                        3620                      NA
     P.NET.DOMESTIC.MIGRATIONS P.NativePopulationBornInStateOfRes R.LABOR.FORCEmaleFemale
2888                        NA                           78.82956               0.9656652
     R.VOTING.DEMOCRATESoverREPUBLICANS P.PUBLIC.SCHOOL.ENROLNEMT P.CHANGEpverty95to00
2888                           141.5987                        NA                 -100
>

* http://en.wikipedia.org/wiki/Yellowstone_National_Park_%28part%29,_Montana * http://quickfacts.census.gov/qfd/states/30/30111.html * http://quickfacts.census.gov/qfd/states/30000.html * [[http://factfinder.census.gov/servlet/ACSSAFFFacts?_event=ChangeGeoContext&geo_id=05000US30111&_geoContext=&_street=&_county=yellowstone&_cityTown=yellowstone&_state=&_zip=&_lang=en&_sse=on&ActiveGeoDiv=&_useEV=&pctxt=fph&pgsl=010&_submenuId=factsheet_1&ds_name=ACS_2009_5YR_SAFF&_ci_nbr=null&qr_name=null®=null%3Anull&_keyword=&_industry=|factfinder: Yellowstone]] * http://en.wikipedia.org/wiki/Clifton_Forge,_Virginia * http://quickfacts.census.gov/qfd/states/51000.html * [[http://factfinder.census.gov/servlet/ACSSAFFFacts?_event=Search&geo_id=&_geoContext=&_street=&_county=clifton+forge&_cityTown=clifton+forge&_state=&_zip=&_lang=en&_sse=on&pctxt=fph&pgsl=010|factfinder: Clifton Forge]] Yellowstone National Park (part) was a former county-equivalent in the state of Montana, United States. Its territory was and is the Montana portion of Yellowstone National Park. Clifton Forge was an independent city during the 2000 census. However, in 2001, Clifton Forge gave up its city status and reverted to a town. We decided to replace the data for these two counties with data for the related counties:


> nams <- da[,"Areaname"]
> da[1620,"Areaname"]
[1] "Yellowstone National Park, MT"
> da[2888,"Areaname"]
[1] "Clifton Forge, VA"
> grep("Yellow",nams)
[1] 1366 1619 1620
> nams[grep("Yellow",nams)]
[1] "Yellow Medicine, MN"           "Yellowstone, MT"
[3] "Yellowstone National Park, MT"
> nams[grep("Park",nams)]
[1] "Park, CO"                      "Parke, IN"
[3] "Park, MT"                      "Yellowstone National Park, MT"
[5] "Parker, TX"                    "Manassas Park, VA"
[7] "Park, WY"
> grep("Park",nams)
[1]  262  723 1597 1620 2673 2904 3102
> da[1597,"Areaname"]
[1] "Park, MT"
> grep("Alleghany",nams)
[1] 1859 2789
> nams[grep("Alleghany",nams)]
[1] "Alleghany, NC" "Alleghany, VA"
> da[2789,"Areaname"]
[1] "Alleghany, VA"
> V[1620,] <- V[1597,]  # Yellowstone National Park, MT  =  Park, MT
> V[2888,] <- V[2789,]  # Clifton Forge, VA  =  Alleghany, VA
> for(i in 1:nrow(V)) if(length(which(is.na(V[i,])))>0) cat(i,"\n")

Standardized version of 42 variables:


> z <- function(x) (x-mean(x,na.rm=TRUE))/sd(x,na.rm=TRUE)
> U <- apply(V,2,z)
> summary(U[,1])
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
-4.230000 -0.549000  0.005642  0.000000  0.610700  4.266000 
> rownames(U) <- da[,"Areaname"]
> save(U,file="vars42.Rdata")

Checking the same-ordering of Pajek's vertices and data units:


> d <- readLines("../pajek/usc3110.paj")[14:3123]
> length(d)
[1] 3110
> d[1:10]
 [1] "    1 \"01001\"    0.6472 0.6803" "    2 \"01003\"    0.6305 0.7479"
 [3] "    3 \"01005\"    0.6676 0.7045" "    4 \"01007\"    0.6394 0.6596"
 [5] "    5 \"01009\"    0.6477 0.6217" "    6 \"01011\"    0.6626 0.6964"
 [7] "    7 \"01013\"    0.6471 0.7099" "    8 \"01015\"    0.6605 0.6314"
 [9] "    9 \"01017\"    0.6682 0.6643" "   10 \"01019\"    0.6640 0.6131"
> dd <- unlist(lapply(d,function(x) as.integer(strsplit(x,'\"')[[1]][2])))
> dd[1:10]
 [1] 1001 1003 1005 1007 1009 1011 1013 1015 1017 1019
> da$STCOU[1:10]
 [1] 1001 1003 1005 1007 1009 1011 1013 1015 1017 1019
> typeof(da$STCOU)
[1] "integer"
> cn <- da$STCOU
> which(dd!=cn)
[1] 303
> cn[303]
[1] 12086
> dd[303]
[1] 12025

We save the data as a set of Pajek's vectors:


> out <- file("vars42.vec","w")
> cat("*vertices 3110\n",file=out)
> for(i in 1:n) cat(U[i,],"\n",file=out)
> close(out)

===== Clustering with relational constraints ===== In Pajek:


read Pajek project file   USC3110.paj
read Pajek vectors file   vars42.vec
Cluster/Create Complete Cluster  [45]
edit cluster  [delete 1, 2, 3]
Operations/Dissimilarity/Vector Based/Euclidean [yes]
Net/Hierarchical Decomposition/Clustering With Relational Constraint/Options
  [Maximum, Tolerant]
Net/Hierarchical Decomposition/Clustering With Relational Constraint/Run
select partition Max/Tolerant
save partition to  MaxTol.clu
select vector Max/Tolerant heights
save vector to  MaxTolHeight.vec
Net/Hierarchical Decomposition/Clustering With Relational Constraint/Options
  [Average, Tolerant]
Net/Hierarchical Decomposition/Clustering With Relational Constraint/Run
select partition Avg/Tolerant
save partition to  AvgTol.clu
select vector Avg/Tolerant heights
save vector to  AvgTolHeight.vec

For reading of Pajek's dendrogram description we use the function ''Pajek2R''


Pajek2R <- function(cling){
  tree <- as.integer(read.csv(cling,header=FALSE,skip=1)$V1)
  N <- length(tree); n <- (N+1) %/% 2
  merge <- matrix(0,nrow=(n-1),ncol=2)
  for(i in 1:n) if(tree[i]>0){
    k <- tree[i]-n
    if(merge[k,1]==0) merge[k,1] <- -i else merge[k,2] <- -i
  }
  for(i in (n+1):N) if(tree[i]>0){
    k <- tree[i]-n; j <- i-n
    if(merge[k,1]==0) merge[k,1] <- j else merge[k,2] <- j
  }
  return(list(merge=merge,n=n))
}

orDendro <- function(m,i){if(i<0) return(-i)
  return(c(orDendro(m,m[i,1]),orDendro(m,m[i,2])))}

orSize <- function(m,i){if(i<0) return(1)
  s[i] <<- orSize(m,m[i,1])+orSize(m,m[i,2])
  return(s[i])}

Back to R:


> setwd("D:/Data/counties/pajek2")
> load("./vars42.Rdata")
> source("D:\\Data\\counties\\pajek\\varCutree.R")
> RM <- Pajek2R("MaxTol.clu")
> HM <- read.csv("MaxTolHeight.vec",header=FALSE,skip=3111)[[1]]
> RM$height <- HM
> RM$method <- "Maximum/Tolerant"
> RM$dist.method <- "Euclidean"
> RM$labels <- rownames(U)
> class(RM) <- "hclust"
> RM$call <- "Pajek.data"

The obtained hierarchical clustering can consist of several trees. We join them into a single tree:


> n <- 3110; nm <- n-1
> for(i in 3100:3109) cat(i,RM$merge[i,],"\n")
3100 -1232 3099 
3101 -1825 3100 
3102 0 0 
3103 0 0 
3104 0 0 
3105 0 0 
3106 0 0 
3107 0 0 
3108 0 0 
3109 0 0 
> a <- as.vector(RM$merge)
> b <- a[a>0]
> m <- min(which(RM$merge[,1]==0))
> c <- setdiff(1:(m-1),b)
> c
[1] 3101
> e <- -a[a<0]
> f <- setdiff(1:n,e)
> f
[1] 1186 1192 1818 1824 1835 1837 1846 2949
> 
> RM$merge[3102,1] <- -1186; RM$merge[3102,2] <- 3101
> RM$merge[3103,1] <- -1192; RM$merge[3103,2] <- 3102
> RM$merge[3104,1] <- -1818; RM$merge[3104,2] <- 3103
> RM$merge[3105,1] <- -1824; RM$merge[3105,2] <- 3104
> RM$merge[3106,1] <- -1835; RM$merge[3106,2] <- 3105
> RM$merge[3107,1] <- -1837; RM$merge[3107,2] <- 3106
> RM$merge[3108,1] <- -1846; RM$merge[3108,2] <- 3107
> RM$merge[3109,1] <- -2949; RM$merge[3109,2] <- 3108
> max(HM)
[1] 40.61588
> for (i in 3102:3109) RM$height[i] <- 42

This procedure will be automatized in the future. Now we can finally draw the dendrogram:


> RM$order <- orDendro(RM$merge,nm)
> pdf("DendroMaxTol.pdf",width=58.5,height=41.5)
> plot(RC,hang=-1,cex=0.08,main="Maximum/Tolerant",lwd=0.01)
> dev.off()

{{notes:clu:pdf:dendroMaxTol.pdf|Dendrogram 1}}, {{notes:clu:zip:RM.zip|RM.Rdata}}. To get better picture we rearrange the subtrees in the dendrogram:


> s <- rep(1,n); orSize(RM$merge,nm)
> T <- RM; M <- T$merge
> for (i in nm:1){l <- M[i,1]; r <- M[i,2]; if(l>0) if(s[l]>s[r]){M[i,1]<-r; M[i,2]<-l}}
> T$merge <- M 
> T$order <- orDendro(T$merge,nm)
> pdf("DendroMaxTol3.pdf",width=58.5,height=41.5)
> plot(T,hang=-1,cex=0.08,main="Maximum/Tolerant",lwd=0.01)
> dev.off()

{{notes:clu:pdf:dendroMaxTol3.pdf|Dendrogram 2}}.