====== Matching counties data with relational data ====== 1. October 2011 There is different number (and also the ordering) of units in the selected counties data on file ''/USAC/vars.csv'' and corresponding vertices in the neighborhood relation on file ''***''. The file ''uscIds.csv'' contains the Pajek's vertices info (V1=seq. number, V2=id, V3=X, V4=Y, V5=Z) and the file ''/USAC/selection.csv'' contains the selected variables. We have to put together data about the same unit. This can be done using the mappings between the data sets. In the construction of these mappings using R its function ''match'' turns out to be very useful. > a <- c('Tom','Pat','Jim','Bob','Ann') > b <- c('Ann','Tim','Bob','Jim','Pat','Ron') > match(a,b) [1] NA 5 4 3 1 > match(b,a) [1] 5 NA 4 3 2 NA > We first determine the mappings p and q between ids in Pajek's vertices and ids in the data set with variables. ===== Mappings ===== > setwd("D:/Data/counties") > ids <- read.csv("uscIds.csv",header=FALSE) > sel <- read.csv("./USAC/selection.csv",header=TRUE) > ids[1:5,] V1 V2 V3 V4 V5 1 1 1001 0.6472 0.6803 0 2 2 1003 0.6305 0.7479 0 3 3 1005 0.6676 0.7045 0 4 4 1007 0.6394 0.6596 0 5 5 1009 0.6477 0.6217 0 > sel[1:5,] STCOU Areaname AGE010200D AGE050200D AGE110200D AGE270200D AGE880200D 1 0 UNITED STATES 281421906 35.3 19175798 72293812 4239587 2 1000 ALABAMA 4447100 35.8 295992 1123422 67301 3 1001 Autauga, AL 43671 35.1 3023 12494 428 4 1003 Baldwin, AL 140415 39.0 8621 34320 2164 5 1005 Barbour, AL 29038 35.8 1788 7383 512 AGN020202D AGN030202D AGN050202D AGN330202D AGN420202D AGN430202D BNK050200D 1 2128982 1909598 938279056 55311236 95151954 105494401 3966224000 ................................... 5 255.51 6.36 1.12 0.06 > id <- ids$V2 > se <- sel$STCOU > id[1:10] [1] 1001 1003 1005 1007 1009 1011 1013 1015 1017 1019 > se[1:10] [1] 0 1000 1001 1003 1005 1007 1009 1011 1013 1015 > p <- match(id,se) > mi <- which(is.na(p)) > mi [1] 303 > id[303] [1] 12025 > q <- match(se,id) > ms <- which(is.na(q)) > se[ms] [1] 0 1000 2000 2013 2016 2020 2050 2060 2068 2070 2090 2100 [13] 2105 2110 2122 2130 2150 2164 2170 2180 2185 2188 2195 2198 [25] 2201 2220 2230 2232 2240 2261 2270 2275 2280 2282 2290 4000 [37] 5000 6000 8000 8014 9000 10000 11000 12000 12086 13000 15000 15001 [49] 15003 15005 15007 15009 16000 17000 18000 19000 20000 21000 22000 23000 [61] 24000 25000 26000 27000 28000 29000 30000 31000 32000 33000 34000 35000 [73] 36000 37000 38000 39000 40000 41000 42000 44000 45000 46000 47000 48000 [85] 49000 50000 51000 53000 54000 55000 56000 > se[377] [1] 12086 > sel$Areaname[377] [1] Miami-Dade, FL 3196 Levels: Abbeville, SC Acadia, LA Accomack, VA Ada, ID ... Ziebach, SD > id[303] <- 12086 ===== Vars ===== > standard <- function(x) {s <- paste("0000",x,sep=""); substr(s,nchar(s)-4,nchar(s))} > vars <- read.csv(file="./USAC/vars.csv",stringsAsFactors=FALSE) > vars[1:5,] STCOU Areaname AGE050200D BZA115203D CLF040200D EDU635200D EDU685200D HSG045200D HSG495200D 1 0 UNITED STATES 35.3 -0.6 4.0 80.4 24.4 13.4 119600 2 1000 ALABAMA 35.8 -3.4 4.1 75.3 19.0 17.6 85100 3 1001 Autauga, AL 35.1 11.0 3.6 78.7 18.0 38.7 94800 ... P.PUBLIC.SCHOOL.ENROLNEMT TOTAL.DEPOSITSperCapita CROPvaluePerFARM LIFESTOCKvaluePerFARM 1 65.18824 14.093516 44.69364 49.55157 2 65.03585 11.852627 13.08044 59.27140 3 69.04114 7.494127 21.79357 28.07775 P.CHANGEpverty95to00 1 -13.297392 2 -15.287656 3 -14.439773 > net <- vars[p,] > net[1:5,] STCOU Areaname AGE050200D BZA115203D CLF040200D EDU635200D EDU685200D HSG045200D HSG495200D 3 1001 Autauga, AL 35.1 11.0 3.6 78.7 18.0 38.7 94800 4 1003 Baldwin, AL 39.0 7.1 3.3 82.0 23.1 45.8 122500 5 1005 Barbour, AL 35.8 -16.2 4.9 64.7 10.9 16.4 68600 ... P.CHANGEpverty95to00 3 -14.439773 4 -10.578288 5 -8.967772 > write.csv(net,file="varsN.csv",row.names=FALSE) > library(xlsReadWritePro) > write.xls(net,file="varsN.xls",colNames=TRUE,rowNames=FALSE) The file ''varsN'' contains 92 = 2 + 90 variables and 3111 units. To obtain the vectors (variables) for Pajek we delete some variables and reorder the units according to vertex STCOU from the file ''usc3110lab.csv'' (Boston deleted). > del <- c("P.ind.fam.farms","P.irrigatedLand","P.aINDIAN.BELOWpovertyLevel", + "P.BLACK.BELOWpovertyLevel","P.ASIAN.BELOWpovertyLevel","P.HisLat.BELOWpovertyLevel", + "P.CHANGEemployIndustry90to00","P.IrrigationGROUNDwaterUse","CROPvaluePerFARM", + "LIFESTOCKvaluePerFARM") > nam <- names(net) > match(del,nam) [1] 33 35 76 77 78 79 81 82 90 91 > ids <- read.csv("usc3110lab.csv",header=FALSE,stringsAsFactors=FALSE) > ids$V2 <- standard(ids$V2) > id <- ids$V2 > net$STCOU <- standard(net$STCOU) > lab <- net$STCOU > p <- match(id,lab) > which(is.na(p)) [1] 303 > id[303] [1] "12025" > id[303] <- 12086 > id[303] [1] "12086" > The vertex (303; Dade; 12025) is identical to (Miami-Dade, FL; 12086). The vertex (2916; South Boston) is not included in the data set. It is removed from the relation. Therefore the relation contains now 3110 counties. 1621 NA -> 0 2889 4-78 ===== Extract selected ===== > p <- match(id,lab) > which(is.na(p)) integer(0) > Net <- net[p,setdiff(nam,del)] > dim(Net) [1] 3110 82 > n <- nrow(Net); m <- ncol(Net) > out <- file("usc3110.vec","w") > cat("*vertices 3110\n",file=out) > for(i in 1:n) {for(j in 3:m) cat(Net[i,j]," ",file=out); cat("\n",file=out)} > close(out) > S <- scale(Net[,3:m]) > class(S) [1] "matrix" > S[1:3,] AGE050200D BZA115203D CLF040200D EDU635200D EDU685200D HSG045200D HSG495200D INC610199D INC910199D 3 -0.5630560 0.6791548 -0.4390026 0.1573698 0.1921497 1.7504631 0.2252569 0.6538238 0.2624938 4 0.4063825 0.4522135 -0.6210019 0.5309126 0.8456407 2.2402234 0.8095434 0.5085709 0.8476656 5 -0.3890542 -0.9036150 0.3496611 -1.4273569 -0.7176124 0.2122021 -0.3273895 -1.0303989 -1.0564246 ... P.PUBLIC.SCHOOL.ENROLNEMT TOTAL.DEPOSITSperCapita P.CHANGEpverty95to00 3 0.009373521 NaN -0.32404400 4 -0.228558808 NaN -0.05297025 5 -0.444472915 NaN 0.06008696 > del1 <- c("P.violent.crime","R.Hisp.Lat.maleFemale","F.GOV.EXP.perCapita","P.URBANpopul", "TOTAL.DEPOSITSperCapita") > SN <- S[,setdiff(colnames(S),del1)] > dim(SN) [1] 3110 75 > SN[1:3,] AGE050200D BZA115203D CLF040200D EDU635200D EDU685200D HSG045200D HSG495200D INC610199D INC910199D 3 -0.5630560 0.6791548 -0.4390026 0.1573698 0.1921497 1.7504631 0.2252569 0.6538238 0.2624938 R.VOTING.DEMOCRATESoverREPUBLICANS P.PUBLIC.SCHOOL.ENROLNEMT P.CHANGEpverty95to00 3 -0.7041726 0.009373521 -0.32404400 4 -0.8312459 -0.228558808 -0.05297025 5 0.6997518 -0.444472915 0.06008696 > out <- file("usc3110.vec","w") > cat("*vertices 3110\n",file=out) > for(i in 1:n) cat(SN[i,],"\n",file=out) > close(out) > > write.csv(SN,"usc3110S.csv",row.names=FALSE)