====== Clustering with relational constraint ====== ===== Foreigners in Germany ===== https://www-genesis.destatis.de/genesis/online Select ''Variables'', then select ''KREISE; Administrative districts'' and click on ''Administrative districts''. It would be interesting to look at the data set (for the year 2015) ''12521-0042; Foreigners: Administrative districts, reference date, sex, selected types of residence permits, citizenship'', but access to it requires being a registered user. For illustration we will therefore use the data set ''12521-0040; Foreigners: Administrative districts, reference date, sex'' for the year 2015. To normalize the data we will also need the population sizes from the data set ''12411-0014; Population: Administrative districts, reference date'' for the year 2015. The shape (map) files for Germany can be downloaded from http://www.gadm.org/country . We unzip them into the subdirectory ''shape''. Let's draw the map of Germany (the function ''readShapeSpatial'' is provided by the R package ''maptools'', which must be loaded first). > D0 <- readShapeSpatial("shape/DEU_adm0.shp") > D1 <- readShapeSpatial("shape/DEU_adm1.shp") > D2 <- readShapeSpatial("shape/DEU_adm2.shp") > lab <- as.character(D2$NAME_2) > Encoding(lab) <- 'UTF-8' > plot(D2,xlim=c(6,15),ylim=c(48.5,53.5),asp=1,col="wheat",bg="skyblue",border="red",lwd=0.05) > plot(D1,xlim=c(6,15),ylim=c(48.5,53.5),asp=1,lwd=0.2,border="blue",add=TRUE) > plot(D0,xlim=c(6,15),ylim=c(48.5,53.5),asp=1,lwd=0.2,add=TRUE) > text(coordinates(D2),labels=lab,cex=0.05) ==== Data ==== > P <- read.csv2("population.csv",row.names=2,skip=5,na.strings="-") > colnames(P) <- c("ID","Y2011","Y2012","Y2013","Y2014","Y2015") > NP <- unlist(lapply(strsplit(row.names(P),','), function(x) x[1])) > F <- read.csv2("foreigners.csv",row.names=3,skip=20,na.strings="-") > colnames(F) <- c("date","ID","Male","Female","Total") > NF <- unlist(lapply(strsplit(row.names(F),','), function(x) x[1])) > all(NP==NF) [1] TRUE > Pn <- row.names(P) # > Pnam <- gsub(", kreisfreie Stadt", "/S",gsub("-Kreis", "",gsub(", Landkreis", "",Pn))) > Pnam <- gsub(", kreisfreie Stadt", 
"/S",gsub(", Landkreis", "",Pn)) > Y <- P$Y2015 > which(F$ID %in% c(10041, 10042, 10043, 10044, 10045, 10046)) [1] 325 326 327 328 329 330 > Pnam[which(F$ID %in% c(10041, 10042, 10043, 10044, 10045, 10046))] [1] "Regionalverband Saarbrücken" "Merzig-Wadern" [3] "Neunkirchen" "Saarlouis" [5] "Saarpfalz" "Sankt Wendel" > Pnam[328] [1] "Saarlouis" > s <- sum(Y[325:330]) > Y[325:330] <- NA > Y[328] <- s > M <- cbind(100*F$Male/Y,100*F$Female/Y) > dim(M) [1] 476 2 > row.names(M) <- Pnam > MM <- na.omit(M) > colnames(MM) <- c("M","F") ==== Clustering ==== > r <- hclust(dist(MM),method="ward.D") > plot(r,hang=-1,cex=0.1,lwd=0.3,main="Foreigners") > p <- cutree(r,k=9) > table(p) p 1 2 3 4 5 6 7 8 9 39 48 34 76 57 47 49 33 12 > for(i in 1:9){C <- MM[p==i,]; cat("C",i,nrow(C),mean(C[,1]),mean(C[,2]),"\n")} C 1 39 6.304454 5.526648 C 2 48 5.254089 4.454723 C 3 34 2.443627 1.883977 C 4 76 3.424849 2.830167 C 5 57 1.646648 1.114532 C 6 47 7.586492 6.730954 C 7 49 4.541055 3.682423 C 8 33 9.421813 8.384791 C 9 12 13.6393 12.1157 > row.names(MM)[p==9] [1] "Düsseldorf/S" "Frankfurt am Main/S" "Offenbach am Main/S" [4] "Kassel/S" "Ludwigshafen am Rhein/S" "Stuttgart/S" [7] "Heilbronn/S" "Mannheim/S" "Pforzheim/S" [10] "München/S" "Nürnberg/S" "Schweinfurt/S" > Nam <- row.names(MM) > length(Nam) [1] 395 > length(lab) [1] 403 > p <- match(Nam,lab) > q <-match(lab,Nam) > cbind(which(is.na(p)),Nam[is.na(p)]) > cbind(which(is.na(q)),lab[is.na(q)]) [[match|Matching file]] > m <- matrix(read.table("f.txt")$V1,ncol=2,byrow=TRUE) > dim(m) [1] 87 2 > colnames(m) <- c("lab","nam") > head(m) lab nam [1,] 49 258 [2,] 3 192 [3,] 142 320 [4,] 244 99 [5,] 245 106 [6,] 246 80 > sum(is.na(p)) [1] 109 > sum(is.na(q)) [1] 95 > p[m[,2]] <- m[,1] > sum(is.na(p)) [1] 22 > q[m[,1]] <- m[,2] > sum(is.na(q)) [1] 8 [[http://vladowiki.fmf.uni-lj.si/doku.php?id=notes:net:shape|Shape to net]] [[notes:clu:counties:pajek|Clustering in Pajek]] \\ \\ [[ru:7iss#labs|Back to 7ISS Labs]]