Matching counties data with relational data

Matching counties data with relational data

1. October 2011

There is different number (and also the ordering) of units in the selected counties data on file /USAC/vars.csv and corresponding vertices in the neighborhood relation on file ***. The file uscIds.csv contains the Pajek's vertices info (V1=seq. number, V2=id, V3=X, V4=Y, V5=Z) and the file /USAC/selection.csv contains the selected variables.

We have to put together data about the same unit. This can be done using the mappings between the data sets. In the construction of these mappings using R its function match turns out to be very useful.

> a <- c('Tom','Pat','Jim','Bob','Ann')
> b <- c('Ann','Tim','Bob','Jim','Pat','Ron')
> match(a,b)
[1] NA  5  4  3  1
> match(b,a)
[1]  5 NA  4  3  2 NA
>

We first determine the mappings p and q between ids in Pajek's vertices and ids in the data set with variables.

Mappings

> setwd("D:/Data/counties")
> ids <- read.csv("uscIds.csv",header=FALSE)
> sel <- read.csv("./USAC/selection.csv",header=TRUE)
> ids[1:5,]
  V1   V2     V3     V4 V5
1  1 1001 0.6472 0.6803  0
2  2 1003 0.6305 0.7479  0
3  3 1005 0.6676 0.7045  0
4  4 1007 0.6394 0.6596  0
5  5 1009 0.6477 0.6217  0
> sel[1:5,]
  STCOU      Areaname AGE010200D AGE050200D AGE110200D AGE270200D AGE880200D
1     0 UNITED STATES  281421906       35.3   19175798   72293812    4239587
2  1000       ALABAMA    4447100       35.8     295992    1123422      67301
3  1001   Autauga, AL      43671       35.1       3023      12494        428
4  1003   Baldwin, AL     140415       39.0       8621      34320       2164
5  1005   Barbour, AL      29038       35.8       1788       7383        512
  AGN020202D AGN030202D AGN050202D AGN330202D AGN420202D AGN430202D BNK050200D
1    2128982    1909598  938279056   55311236   95151954  105494401 3966224000
  ...................................
5     255.51       6.36       1.12       0.06
> id <- ids$V2
> se <- sel$STCOU
> id[1:10]
 [1] 1001 1003 1005 1007 1009 1011 1013 1015 1017 1019
> se[1:10]
 [1]    0 1000 1001 1003 1005 1007 1009 1011 1013 1015
> p <- match(id,se)
> mi <- which(is.na(p))
> mi
[1] 303
> id[303]
[1] 12025
> q <- match(se,id)
> ms <- which(is.na(q))
> se[ms]
 [1]     0  1000  2000  2013  2016  2020  2050  2060  2068  2070  2090  2100
[13]  2105  2110  2122  2130  2150  2164  2170  2180  2185  2188  2195  2198
[25]  2201  2220  2230  2232  2240  2261  2270  2275  2280  2282  2290  4000
[37]  5000  6000  8000  8014  9000 10000 11000 12000 12086 13000 15000 15001
[49] 15003 15005 15007 15009 16000 17000 18000 19000 20000 21000 22000 23000
[61] 24000 25000 26000 27000 28000 29000 30000 31000 32000 33000 34000 35000
[73] 36000 37000 38000 39000 40000 41000 42000 44000 45000 46000 47000 48000
[85] 49000 50000 51000 53000 54000 55000 56000
> se[377]
[1] 12086
> sel$Areaname[377]
[1] Miami-Dade, FL
3196 Levels: Abbeville, SC Acadia, LA Accomack, VA Ada, ID ... Ziebach, SD
> id[303] <- 12086

Vars

> standard <- function(x) {s <- paste("0000",x,sep=""); substr(s,nchar(s)-4,nchar(s))}
> vars <- read.csv(file="./USAC/vars.csv",stringsAsFactors=FALSE)
> vars[1:5,]
  STCOU      Areaname AGE050200D BZA115203D CLF040200D EDU635200D EDU685200D HSG045200D HSG495200D
1     0 UNITED STATES       35.3       -0.6        4.0       80.4       24.4       13.4     119600
2  1000       ALABAMA       35.8       -3.4        4.1       75.3       19.0       17.6      85100
3  1001   Autauga, AL       35.1       11.0        3.6       78.7       18.0       38.7      94800
...
  P.PUBLIC.SCHOOL.ENROLNEMT TOTAL.DEPOSITSperCapita CROPvaluePerFARM LIFESTOCKvaluePerFARM
1                  65.18824               14.093516         44.69364              49.55157
2                  65.03585               11.852627         13.08044              59.27140
3                  69.04114                7.494127         21.79357              28.07775
  P.CHANGEpverty95to00
1           -13.297392
2           -15.287656
3           -14.439773
> net <- vars[p,]
> net[1:5,]
  STCOU    Areaname AGE050200D BZA115203D CLF040200D EDU635200D EDU685200D HSG045200D HSG495200D
3  1001 Autauga, AL       35.1       11.0        3.6       78.7       18.0       38.7      94800
4  1003 Baldwin, AL       39.0        7.1        3.3       82.0       23.1       45.8     122500
5  1005 Barbour, AL       35.8      -16.2        4.9       64.7       10.9       16.4      68600
...
  P.CHANGEpverty95to00
3           -14.439773
4           -10.578288
5            -8.967772
> write.csv(net,file="varsN.csv",row.names=FALSE)
> library(xlsReadWritePro)
> write.xls(net,file="varsN.xls",colNames=TRUE,rowNames=FALSE)

The file varsN contains 92 = 2 + 90 variables and 3111 units. To obtain the vectors (variables) for Pajek we delete some variables and reorder the units according to vertex STCOU from the file usc3110lab.csv (Boston deleted).

> del <- c("P.ind.fam.farms","P.irrigatedLand","P.aINDIAN.BELOWpovertyLevel",
+    "P.BLACK.BELOWpovertyLevel","P.ASIAN.BELOWpovertyLevel","P.HisLat.BELOWpovertyLevel",
+    "P.CHANGEemployIndustry90to00","P.IrrigationGROUNDwaterUse","CROPvaluePerFARM",
+    "LIFESTOCKvaluePerFARM")
> nam <- names(net)
> match(del,nam)
 [1] 33 35 76 77 78 79 81 82 90 91
> ids <- read.csv("usc3110lab.csv",header=FALSE,stringsAsFactors=FALSE)
> ids$V2 <- standard(ids$V2)
> id <- ids$V2
> net$STCOU <- standard(net$STCOU)
> lab <- net$STCOU
> p <- match(id,lab)
> which(is.na(p))
[1] 303
> id[303]
[1] "12025"
> id[303] <- 12086
> id[303] 
[1] "12086"
>

The vertex (303; Dade; 12025) is identical to (Miami-Dade, FL; 12086).

The vertex (2916; South Boston) is not included in the data set. It is removed from the relation. Therefore the relation contains now 3110 counties.

1621 NA → 0

2889 4-78

Extract selected

> p <- match(id,lab)
> which(is.na(p))
integer(0)
> Net <- net[p,setdiff(nam,del)]
> dim(Net)
[1] 3110   82
> n <- nrow(Net); m <- ncol(Net)
> out <- file("usc3110.vec","w")
> cat("*vertices 3110\n",file=out)
> for(i in 1:n) {for(j in 3:m) cat(Net[i,j]," ",file=out); cat("\n",file=out)}
> close(out)

> S <- scale(Net[,3:m])
> class(S)
[1] "matrix"
> S[1:3,]
  AGE050200D BZA115203D CLF040200D EDU635200D EDU685200D HSG045200D HSG495200D INC610199D INC910199D
3 -0.5630560  0.6791548 -0.4390026  0.1573698  0.1921497  1.7504631  0.2252569  0.6538238  0.2624938
4  0.4063825  0.4522135 -0.6210019  0.5309126  0.8456407  2.2402234  0.8095434  0.5085709  0.8476656
5 -0.3890542 -0.9036150  0.3496611 -1.4273569 -0.7176124  0.2122021 -0.3273895 -1.0303989 -1.0564246
...
P.PUBLIC.SCHOOL.ENROLNEMT TOTAL.DEPOSITSperCapita P.CHANGEpverty95to00
3               0.009373521                     NaN          -0.32404400
4              -0.228558808                     NaN          -0.05297025
5              -0.444472915                     NaN           0.06008696
> del1 <- c("P.violent.crime","R.Hisp.Lat.maleFemale","F.GOV.EXP.perCapita","P.URBANpopul",
   "TOTAL.DEPOSITSperCapita")
> SN <- S[,setdiff(colnames(S),del1)]
> dim(SN)
[1] 3110   75
> SN[1:3,]
  AGE050200D BZA115203D CLF040200D EDU635200D EDU685200D HSG045200D HSG495200D INC610199D INC910199D
3 -0.5630560  0.6791548 -0.4390026  0.1573698  0.1921497  1.7504631  0.2252569  0.6538238  0.2624938
  R.VOTING.DEMOCRATESoverREPUBLICANS P.PUBLIC.SCHOOL.ENROLNEMT P.CHANGEpverty95to00
3                         -0.7041726               0.009373521          -0.32404400
4                         -0.8312459              -0.228558808          -0.05297025
5                          0.6997518              -0.444472915           0.06008696
> out <- file("usc3110.vec","w")
> cat("*vertices 3110\n",file=out)
> for(i in 1:n) cat(SN[i,],"\n",file=out)
> close(out)
> 
> write.csv(SN,"usc3110S.csv",row.names=FALSE)