====== Clustering ====== ===== Storing the standardized variables as Pajek's vectors ===== setwd("D:/Data/counties/pajek2") load("../pajek/rankDat.RData") objects() dim(da) names(da) del <- c(1,2,4,6,9,10,11,14,15,20,25,30,33,35,36,37,38,39,40,41,42,45,46,48, 52,53,56,57,58,60,61,62,63,65,67,69,71,72,73,74,75,76,77,78,79,81,82,89,90,91) V <- da[,-del] for(i in 1:nrow(V)) if(length(which(is.na(V[i,])))>0) cat(i,"\n") There are two counties that have NA values: 1620 and 2888. > da[1620,"Areaname"] [1] "Yellowstone National Park, MT" > V[1620,] AGE050200D CLF040200D EDU685200D HSG045200D IPE010200D IPE120200D PIN020200D 1620 0 0 0 0 0 0 0 POP050200D POP060200D POP165200D POP255200D POP285200D POP325200D POP405200D 1620 0 0 0 0 0 0 0 VST020200D VST220200D VST420200D WAT130200D P.pop.under18 P.pop.over85 P.land.farms 1620 0 0 0 0 NA NA NA P.emply.ind.CONSTRUCTION P.emply.ind.MANUFACTORING P.emply.ind.TRANSPORT.WAREHOUSING 1620 NA NA NA P.emply.ind.FINANC.INSUR P.emply.ind.PROFscientTECH P.emply.ind.EDUC.HEALTH 1620 NA NA NA P.25overLESS9thGRADE P.employ.FARMING P.employ.GOV.stateLoc 1620 NA NA NA P.OWNERoccupiedHousingUnits P.occupiedHousingUnitsLackingPlumb P.RURALpopul 1620 NA NA NA P.CHANGEurban90to00 CHANGEperCapitaIncome89to99 GroundWaterUsePerCapita 1620 0 -7925 NA P.NET.DOMESTIC.MIGRATIONS P.NativePopulationBornInStateOfRes R.LABOR.FORCEmaleFemale 1620 NA NA NA R.VOTING.DEMOCRATESoverREPUBLICANS P.PUBLIC.SCHOOL.ENROLNEMT P.CHANGEpverty95to00 1620 NA NA NA > da[2888,"Areaname"] [1] "Clifton Forge, VA" > V[2888,] AGE050200D CLF040200D EDU685200D HSG045200D IPE010200D IPE120200D PIN020200D 2888 42.9 0 9.6 -2.9 0 0 0 POP050200D POP060200D POP165200D POP255200D POP285200D POP325200D POP405200D 2888 -8.3 1429.7 55.9 14.6 0.1 0 0.9 VST020200D VST220200D VST420200D WAT130200D P.pop.under18 P.pop.over85 P.land.farms 2888 0 0 0 0 NA NA 0 P.emply.ind.CONSTRUCTION P.emply.ind.MANUFACTORING P.emply.ind.TRANSPORT.WAREHOUSING 2888 8.024017 17.41266 7.969432 P.emply.ind.FINANC.INSUR P.emply.ind.PROFscientTECH P.emply.ind.EDUC.HEALTH 2888 2.783843 5.18559 20.08734 P.25overLESS9thGRADE P.employ.FARMING P.employ.GOV.stateLoc 2888 12.63666 0 0 P.OWNERoccupiedHousingUnits P.occupiedHousingUnitsLackingPlumb P.RURALpopul 2888 55.7274 0 NA P.CHANGEurban90to00 CHANGEperCapitaIncome89to99 GroundWaterUsePerCapita 2888 -21.04695 3620 NA P.NET.DOMESTIC.MIGRATIONS P.NativePopulationBornInStateOfRes R.LABOR.FORCEmaleFemale 2888 NA 78.82956 0.9656652 R.VOTING.DEMOCRATESoverREPUBLICANS P.PUBLIC.SCHOOL.ENROLNEMT P.CHANGEpverty95to00 2888 141.5987 NA -100 > * http://en.wikipedia.org/wiki/Yellowstone_National_Park_%28part%29,_Montana * http://quickfacts.census.gov/qfd/states/30/30111.html * http://quickfacts.census.gov/qfd/states/30000.html * [[http://factfinder.census.gov/servlet/ACSSAFFFacts?_event=ChangeGeoContext&geo_id=05000US30111&_geoContext=&_street=&_county=yellowstone&_cityTown=yellowstone&_state=&_zip=&_lang=en&_sse=on&ActiveGeoDiv=&_useEV=&pctxt=fph&pgsl=010&_submenuId=factsheet_1&ds_name=ACS_2009_5YR_SAFF&_ci_nbr=null&qr_name=null®=null%3Anull&_keyword=&_industry=|factfinder: Yellowstone]] * http://en.wikipedia.org/wiki/Clifton_Forge,_Virginia * http://quickfacts.census.gov/qfd/states/51000.html * [[http://factfinder.census.gov/servlet/ACSSAFFFacts?_event=Search&geo_id=&_geoContext=&_street=&_county=clifton+forge&_cityTown=clifton+forge&_state=&_zip=&_lang=en&_sse=on&pctxt=fph&pgsl=010|factfinder: Clifton Forge]] Yellowstone National Park (part) was a former county-equivalent in the state of Montana, United States. Its territory was and is the Montana portion of Yellowstone National Park. Clifton Forge was an independent city during the 2000 census. However, in 2001, Clifton Forge gave up its city status and reverted to a town. We decided to replace the data for these two counties with data for the related counties: > nams <- da[,"Areaname"] > da[1620,"Areaname"] [1] "Yellowstone National Park, MT" > da[2888,"Areaname"] [1] "Clifton Forge, VA" > grep("Yellow",nams) [1] 1366 1619 1620 > nams[grep("Yellow",nams)] [1] "Yellow Medicine, MN" "Yellowstone, MT" [3] "Yellowstone National Park, MT" > nams[grep("Park",nams)] [1] "Park, CO" "Parke, IN" [3] "Park, MT" "Yellowstone National Park, MT" [5] "Parker, TX" "Manassas Park, VA" [7] "Park, WY" > grep("Park",nams) [1] 262 723 1597 1620 2673 2904 3102 > da[1597,"Areaname"] [1] "Park, MT" > grep("Alleghany",nams) [1] 1859 2789 > nams[grep("Alleghany",nams)] [1] "Alleghany, NC" "Alleghany, VA" > da[2789,"Areaname"] [1] "Alleghany, VA" > V[1620,] <- V[1597,] # Yellowstone National Park, MT = Park, MT > V[2888,] <- V[2789,] # Clifton Forge, VA = Alleghany, VA > for(i in 1:nrow(V)) if(length(which(is.na(V[i,])))>0) cat(i,"\n") Standardized version of 42 variables: > z <- function(x) (x-mean(x,na.rm=TRUE))/sd(x,na.rm=TRUE) > U <- apply(V,2,z) > summary(U[,1]) Min. 1st Qu. Median Mean 3rd Qu. Max. -4.230000 -0.549000 0.005642 0.000000 0.610700 4.266000 > rownames(U) <- da[,"Areaname"] > save(U,file="vars42.Rdata") Checking the same-ordering of Pajek's vertices and data units: > d <- readLines("../pajek/usc3110.paj")[14:3123] > length(d) [1] 3110 > d[1:10] [1] " 1 \"01001\" 0.6472 0.6803" " 2 \"01003\" 0.6305 0.7479" [3] " 3 \"01005\" 0.6676 0.7045" " 4 \"01007\" 0.6394 0.6596" [5] " 5 \"01009\" 0.6477 0.6217" " 6 \"01011\" 0.6626 0.6964" [7] " 7 \"01013\" 0.6471 0.7099" " 8 \"01015\" 0.6605 0.6314" [9] " 9 \"01017\" 0.6682 0.6643" " 10 \"01019\" 0.6640 0.6131" > dd <- unlist(lapply(d,function(x) as.integer(strsplit(x,'\"')[[1]][2]))) > dd[1:10] [1] 1001 1003 1005 1007 1009 1011 1013 1015 1017 1019 > da$STCOU[1:10] [1] 1001 1003 1005 1007 1009 1011 1013 1015 1017 1019 > typeof(da$STCOU) [1] "integer" > cn <- da$STCOU > which(dd!=cn) [1] 303 > cn[303] [1] 12086 > dd[303] [1] 12025 We save the data as a set of Pajek's vectors: > out <- file("vars42.vec","w") > cat("*vertices 3110\n",file=out) > for(i in 1:n) cat(U[i,],"\n",file=out) > close(out) ===== Clustering with relational constraints ===== In Pajek: read Pajek project file USC3110.paj read Pajek vectors file vars42.vec Cluster/Create Complete Cluster [45] edit cluster [delete 1, 2, 3] Operations/Dissimilarity/Vector Based/Euclidean [yes] Net/Hierarchical Decomposition/Clustering With Relational Constraint/Options [Maximum, Tolerant] Net/Hierarchical Decomposition/Clustering With Relational Constraint/Run select partition Max/Tolerant save partition to MaxTol.clu select vector Max/Tolerant heights save vector to MaxTolHeight.vec Net/Hierarchical Decomposition/Clustering With Relational Constraint/Options [Average, Tolerant] Net/Hierarchical Decomposition/Clustering With Relational Constraint/Run select partition Avg/Tolerant save partition to AvgTol.clu select vector Avg/Tolerant heights save vector to AvgTolHeight.vec For reading of Pajek's dendrogram description we use the function ''Pajek2R'' Pajek2R <- function(cling){ tree <- as.integer(read.csv(cling,header=FALSE,skip=1)$V1) N <- length(tree); n <- (N+1) %/% 2 merge <- matrix(0,nrow=(n-1),ncol=2) for(i in 1:n) if(tree[i]>0){ k <- tree[i]-n if(merge[k,1]==0) merge[k,1] <- -i else merge[k,2] <- -i } for(i in (n+1):N) if(tree[i]>0){ k <- tree[i]-n; j <- i-n if(merge[k,1]==0) merge[k,1] <- j else merge[k,2] <- j } return(list(merge=merge,n=n)) } orDendro <- function(m,i){if(i<0) return(-i) return(c(orDendro(m,m[i,1]),orDendro(m,m[i,2])))} orSize <- function(m,i){if(i<0) return(1) s[i] <<- orSize(m,m[i,1])+orSize(m,m[i,2]) return(s[i])} Back to R: > setwd("D:/Data/counties/pajek2") > load("./vars42.Rdata") > source("D:\\Data\\counties\\pajek\\varCutree.R") > RM <- Pajek2R("MaxTol.clu") > HM <- read.csv("MaxTolHeight.vec",header=FALSE,skip=3111)[[1]] > RM$height <- HM > RM$method <- "Maximum/Tolerant" > RM$dist.method <- "Euclidean" > RM$labels <- rownames(U) > class(RM) <- "hclust" > RM$call <- "Pajek.data" The obtained hierarchical clustering can consist of several trees. We join them into a single tree: > n <- 3110; nm <- n-1 > for(i in 3100:3109) cat(i,RM$merge[i,],"\n") 3100 -1232 3099 3101 -1825 3100 3102 0 0 3103 0 0 3104 0 0 3105 0 0 3106 0 0 3107 0 0 3108 0 0 3109 0 0 > a <- as.vector(RM$merge) > b <- a[a>0] > m <- min(which(RM$merge[,1]==0)) > c <- setdiff(1:(m-1),b) > c [1] 3101 > e <- -a[a<0] > f <- setdiff(1:n,e) > f [1] 1186 1192 1818 1824 1835 1837 1846 2949 > > RM$merge[3102,1] <- -1186; RM$merge[3102,2] <- 3101 > RM$merge[3103,1] <- -1192; RM$merge[3103,2] <- 3102 > RM$merge[3104,1] <- -1818; RM$merge[3104,2] <- 3103 > RM$merge[3105,1] <- -1824; RM$merge[3105,2] <- 3104 > RM$merge[3106,1] <- -1835; RM$merge[3106,2] <- 3105 > RM$merge[3107,1] <- -1837; RM$merge[3107,2] <- 3106 > RM$merge[3108,1] <- -1846; RM$merge[3108,2] <- 3107 > RM$merge[3109,1] <- -2949; RM$merge[3109,2] <- 3108 > max(HM) [1] 40.61588 > for (i in 3102:3109) RM$height[i] <- 42 This procedure will be automatized in the future. Now we can finally draw the dendrogram: > RM$order <- orDendro(RM$merge,nm) > pdf("DendroMaxTol.pdf",width=58.5,height=41.5) > plot(RC,hang=-1,cex=0.08,main="Maximum/Tolerant",lwd=0.01) > dev.off() {{notes:clu:pdf:dendroMaxTol.pdf|Dendrogram 1}}, {{notes:clu:zip:RM.zip|RM.Rdata}}. To get better picture we rearrange the subtrees in the dendrogram: > s <- rep(1,n); orSize(RM$merge,nm) > T <- RM; M <- T$merge > for (i in nm:1){l <- M[i,1]; r <- M[i,2]; if(l>0) if(s[l]>s[r]){M[i,1]<-r; M[i,2]<-l}} > T$merge <- M > T$order <- orDendro(T$merge,nm) > pdf("DendroMaxTol3.pdf",width=58.5,height=41.5) > plot(T,hang=-1,cex=0.08,main="Maximum/Tolerant",lwd=0.01) > dev.off() {{notes:clu:pdf:dendroMaxTol3.pdf|Dendrogram 2}}.