====== Clustering ======
===== Storing the standardized variables as Pajek's vectors =====
setwd("D:/Data/counties/pajek2")
load("../pajek/rankDat.RData")
objects()
dim(da)
names(da)
del <- c(1,2,4,6,9,10,11,14,15,20,25,30,33,35,36,37,38,39,40,41,42,45,46,48,
52,53,56,57,58,60,61,62,63,65,67,69,71,72,73,74,75,76,77,78,79,81,82,89,90,91)
V <- da[,-del]
for(i in 1:nrow(V)) if(length(which(is.na(V[i,])))>0) cat(i,"\n")
There are two counties that have NA values: 1620 and 2888.
> da[1620,"Areaname"]
[1] "Yellowstone National Park, MT"
> V[1620,]
AGE050200D CLF040200D EDU685200D HSG045200D IPE010200D IPE120200D PIN020200D
1620 0 0 0 0 0 0 0
POP050200D POP060200D POP165200D POP255200D POP285200D POP325200D POP405200D
1620 0 0 0 0 0 0 0
VST020200D VST220200D VST420200D WAT130200D P.pop.under18 P.pop.over85 P.land.farms
1620 0 0 0 0 NA NA NA
P.emply.ind.CONSTRUCTION P.emply.ind.MANUFACTORING P.emply.ind.TRANSPORT.WAREHOUSING
1620 NA NA NA
P.emply.ind.FINANC.INSUR P.emply.ind.PROFscientTECH P.emply.ind.EDUC.HEALTH
1620 NA NA NA
P.25overLESS9thGRADE P.employ.FARMING P.employ.GOV.stateLoc
1620 NA NA NA
P.OWNERoccupiedHousingUnits P.occupiedHousingUnitsLackingPlumb P.RURALpopul
1620 NA NA NA
P.CHANGEurban90to00 CHANGEperCapitaIncome89to99 GroundWaterUsePerCapita
1620 0 -7925 NA
P.NET.DOMESTIC.MIGRATIONS P.NativePopulationBornInStateOfRes R.LABOR.FORCEmaleFemale
1620 NA NA NA
R.VOTING.DEMOCRATESoverREPUBLICANS P.PUBLIC.SCHOOL.ENROLNEMT P.CHANGEpverty95to00
1620 NA NA NA
> da[2888,"Areaname"]
[1] "Clifton Forge, VA"
> V[2888,]
AGE050200D CLF040200D EDU685200D HSG045200D IPE010200D IPE120200D PIN020200D
2888 42.9 0 9.6 -2.9 0 0 0
POP050200D POP060200D POP165200D POP255200D POP285200D POP325200D POP405200D
2888 -8.3 1429.7 55.9 14.6 0.1 0 0.9
VST020200D VST220200D VST420200D WAT130200D P.pop.under18 P.pop.over85 P.land.farms
2888 0 0 0 0 NA NA 0
P.emply.ind.CONSTRUCTION P.emply.ind.MANUFACTORING P.emply.ind.TRANSPORT.WAREHOUSING
2888 8.024017 17.41266 7.969432
P.emply.ind.FINANC.INSUR P.emply.ind.PROFscientTECH P.emply.ind.EDUC.HEALTH
2888 2.783843 5.18559 20.08734
P.25overLESS9thGRADE P.employ.FARMING P.employ.GOV.stateLoc
2888 12.63666 0 0
P.OWNERoccupiedHousingUnits P.occupiedHousingUnitsLackingPlumb P.RURALpopul
2888 55.7274 0 NA
P.CHANGEurban90to00 CHANGEperCapitaIncome89to99 GroundWaterUsePerCapita
2888 -21.04695 3620 NA
P.NET.DOMESTIC.MIGRATIONS P.NativePopulationBornInStateOfRes R.LABOR.FORCEmaleFemale
2888 NA 78.82956 0.9656652
R.VOTING.DEMOCRATESoverREPUBLICANS P.PUBLIC.SCHOOL.ENROLNEMT P.CHANGEpverty95to00
2888 141.5987 NA -100
>
* http://en.wikipedia.org/wiki/Yellowstone_National_Park_%28part%29,_Montana
* http://quickfacts.census.gov/qfd/states/30/30111.html
* http://quickfacts.census.gov/qfd/states/30000.html
* [[http://factfinder.census.gov/servlet/ACSSAFFFacts?_event=ChangeGeoContext&geo_id=05000US30111&_geoContext=&_street=&_county=yellowstone&_cityTown=yellowstone&_state=&_zip=&_lang=en&_sse=on&ActiveGeoDiv=&_useEV=&pctxt=fph&pgsl=010&_submenuId=factsheet_1&ds_name=ACS_2009_5YR_SAFF&_ci_nbr=null&qr_name=null®=null%3Anull&_keyword=&_industry=|factfinder: Yellowstone]]
* http://en.wikipedia.org/wiki/Clifton_Forge,_Virginia
* http://quickfacts.census.gov/qfd/states/51000.html
* [[http://factfinder.census.gov/servlet/ACSSAFFFacts?_event=Search&geo_id=&_geoContext=&_street=&_county=clifton+forge&_cityTown=clifton+forge&_state=&_zip=&_lang=en&_sse=on&pctxt=fph&pgsl=010|factfinder: Clifton Forge]]
Yellowstone National Park (part) was a former county-equivalent in the state of Montana, United States. Its territory was and is the Montana portion of Yellowstone National Park.
Clifton Forge was an independent city during the 2000 census. However, in 2001, Clifton Forge gave up its city status and reverted to a town.
We decided to replace the data for these two counties with data for the related counties:
> nams <- da[,"Areaname"]
> da[1620,"Areaname"]
[1] "Yellowstone National Park, MT"
> da[2888,"Areaname"]
[1] "Clifton Forge, VA"
> grep("Yellow",nams)
[1] 1366 1619 1620
> nams[grep("Yellow",nams)]
[1] "Yellow Medicine, MN" "Yellowstone, MT"
[3] "Yellowstone National Park, MT"
> nams[grep("Park",nams)]
[1] "Park, CO" "Parke, IN"
[3] "Park, MT" "Yellowstone National Park, MT"
[5] "Parker, TX" "Manassas Park, VA"
[7] "Park, WY"
> grep("Park",nams)
[1] 262 723 1597 1620 2673 2904 3102
> da[1597,"Areaname"]
[1] "Park, MT"
> grep("Alleghany",nams)
[1] 1859 2789
> nams[grep("Alleghany",nams)]
[1] "Alleghany, NC" "Alleghany, VA"
> da[2789,"Areaname"]
[1] "Alleghany, VA"
> V[1620,] <- V[1597,] # Yellowstone National Park, MT = Park, MT
> V[2888,] <- V[2789,] # Clifton Forge, VA = Alleghany, VA
> for(i in 1:nrow(V)) if(length(which(is.na(V[i,])))>0) cat(i,"\n")
Standardized version of 42 variables:
> z <- function(x) (x-mean(x,na.rm=TRUE))/sd(x,na.rm=TRUE)
> U <- apply(V,2,z)
> summary(U[,1])
Min. 1st Qu. Median Mean 3rd Qu. Max.
-4.230000 -0.549000 0.005642 0.000000 0.610700 4.266000
> rownames(U) <- da[,"Areaname"]
> save(U,file="vars42.Rdata")
Checking the same-ordering of Pajek's vertices and data units:
> d <- readLines("../pajek/usc3110.paj")[14:3123]
> length(d)
[1] 3110
> d[1:10]
[1] " 1 \"01001\" 0.6472 0.6803" " 2 \"01003\" 0.6305 0.7479"
[3] " 3 \"01005\" 0.6676 0.7045" " 4 \"01007\" 0.6394 0.6596"
[5] " 5 \"01009\" 0.6477 0.6217" " 6 \"01011\" 0.6626 0.6964"
[7] " 7 \"01013\" 0.6471 0.7099" " 8 \"01015\" 0.6605 0.6314"
[9] " 9 \"01017\" 0.6682 0.6643" " 10 \"01019\" 0.6640 0.6131"
> dd <- unlist(lapply(d,function(x) as.integer(strsplit(x,'\"')[[1]][2])))
> dd[1:10]
[1] 1001 1003 1005 1007 1009 1011 1013 1015 1017 1019
> da$STCOU[1:10]
[1] 1001 1003 1005 1007 1009 1011 1013 1015 1017 1019
> typeof(da$STCOU)
[1] "integer"
> cn <- da$STCOU
> which(dd!=cn)
[1] 303
> cn[303]
[1] 12086
> dd[303]
[1] 12025
We save the data as a set of Pajek's vectors:
> out <- file("vars42.vec","w")
> cat("*vertices 3110\n",file=out)
> for(i in 1:n) cat(U[i,],"\n",file=out)
> close(out)
===== Clustering with relational constraints =====
In Pajek:
read Pajek project file USC3110.paj
read Pajek vectors file vars42.vec
Cluster/Create Complete Cluster [45]
edit cluster [delete 1, 2, 3]
Operations/Dissimilarity/Vector Based/Euclidean [yes]
Net/Hierarchical Decomposition/Clustering With Relational Constraint/Options
[Maximum, Tolerant]
Net/Hierarchical Decomposition/Clustering With Relational Constraint/Run
select partition Max/Tolerant
save partition to MaxTol.clu
select vector Max/Tolerant heights
save vector to MaxTolHeight.vec
Net/Hierarchical Decomposition/Clustering With Relational Constraint/Options
[Average, Tolerant]
Net/Hierarchical Decomposition/Clustering With Relational Constraint/Run
select partition Avg/Tolerant
save partition to AvgTol.clu
select vector Avg/Tolerant heights
save vector to AvgTolHeight.vec
For reading of Pajek's dendrogram description we use the function ''Pajek2R''
Pajek2R <- function(cling){
tree <- as.integer(read.csv(cling,header=FALSE,skip=1)$V1)
N <- length(tree); n <- (N+1) %/% 2
merge <- matrix(0,nrow=(n-1),ncol=2)
for(i in 1:n) if(tree[i]>0){
k <- tree[i]-n
if(merge[k,1]==0) merge[k,1] <- -i else merge[k,2] <- -i
}
for(i in (n+1):N) if(tree[i]>0){
k <- tree[i]-n; j <- i-n
if(merge[k,1]==0) merge[k,1] <- j else merge[k,2] <- j
}
return(list(merge=merge,n=n))
}
orDendro <- function(m,i){if(i<0) return(-i)
return(c(orDendro(m,m[i,1]),orDendro(m,m[i,2])))}
orSize <- function(m,i){if(i<0) return(1)
s[i] <<- orSize(m,m[i,1])+orSize(m,m[i,2])
return(s[i])}
Back to R:
> setwd("D:/Data/counties/pajek2")
> load("./vars42.Rdata")
> source("D:\\Data\\counties\\pajek\\varCutree.R")
> RM <- Pajek2R("MaxTol.clu")
> HM <- read.csv("MaxTolHeight.vec",header=FALSE,skip=3111)[[1]]
> RM$height <- HM
> RM$method <- "Maximum/Tolerant"
> RM$dist.method <- "Euclidean"
> RM$labels <- rownames(U)
> class(RM) <- "hclust"
> RM$call <- "Pajek.data"
The obtained hierarchical clustering can consist of several trees. We join them into a single tree:
> n <- 3110; nm <- n-1
> for(i in 3100:3109) cat(i,RM$merge[i,],"\n")
3100 -1232 3099
3101 -1825 3100
3102 0 0
3103 0 0
3104 0 0
3105 0 0
3106 0 0
3107 0 0
3108 0 0
3109 0 0
> a <- as.vector(RM$merge)
> b <- a[a>0]
> m <- min(which(RM$merge[,1]==0))
> c <- setdiff(1:(m-1),b)
> c
[1] 3101
> e <- -a[a<0]
> f <- setdiff(1:n,e)
> f
[1] 1186 1192 1818 1824 1835 1837 1846 2949
>
> RM$merge[3102,1] <- -1186; RM$merge[3102,2] <- 3101
> RM$merge[3103,1] <- -1192; RM$merge[3103,2] <- 3102
> RM$merge[3104,1] <- -1818; RM$merge[3104,2] <- 3103
> RM$merge[3105,1] <- -1824; RM$merge[3105,2] <- 3104
> RM$merge[3106,1] <- -1835; RM$merge[3106,2] <- 3105
> RM$merge[3107,1] <- -1837; RM$merge[3107,2] <- 3106
> RM$merge[3108,1] <- -1846; RM$merge[3108,2] <- 3107
> RM$merge[3109,1] <- -2949; RM$merge[3109,2] <- 3108
> max(HM)
[1] 40.61588
> for (i in 3102:3109) RM$height[i] <- 42
This procedure will be automatized in the future.
Now we can finally draw the dendrogram:
> RM$order <- orDendro(RM$merge,nm)
> pdf("DendroMaxTol.pdf",width=58.5,height=41.5)
> plot(RC,hang=-1,cex=0.08,main="Maximum/Tolerant",lwd=0.01)
> dev.off()
{{notes:clu:pdf:dendroMaxTol.pdf|Dendrogram 1}}, {{notes:clu:zip:RM.zip|RM.Rdata}}.
To get better picture we rearrange the subtrees in the dendrogram:
> s <- rep(1,n); orSize(RM$merge,nm)
> T <- RM; M <- T$merge
> for (i in nm:1){l <- M[i,1]; r <- M[i,2]; if(l>0) if(s[l]>s[r]){M[i,1]<-r; M[i,2]<-l}}
> T$merge <- M
> T$order <- orDendro(T$merge,nm)
> pdf("DendroMaxTol3.pdf",width=58.5,height=41.5)
> plot(T,hang=-1,cex=0.08,main="Maximum/Tolerant",lwd=0.01)
> dev.off()
{{notes:clu:pdf:dendroMaxTol3.pdf|Dendrogram 2}}.