====== Clustering with relational constraint ======
===== Foreigners in Germany =====
https://www-genesis.destatis.de/genesis/online
Select ''Variables'', then select ''KREISE; Administrative districts'' and click on ''Administrative districts''. It would be interesting to look at the data set (for the year 2015) ''12521-0042; Foreigners: Administrative districts, reference date, sex, selected types of residence permits, citizenship'', but one needs to be a registered user.
Just for an illustration we will use the data set ''12521-0040; Foreigners: Administrative districts, reference date, sex'' for the year 2015. For data normalization we will need also the population size "12411-0014; Population: Administrative districts, reference date" for the year 2015.
The shape (maps) files for Germany can be downloaded from http://www.gadm.org/country . We unzip them into subdirectory ''shape''.
Let's draw the map of Germany.
> D0 <- readShapeSpatial("shape/DEU_adm0.shp")
> D1 <- readShapeSpatial("shape/DEU_adm1.shp")
> D2 <- readShapeSpatial("shape/DEU_adm2.shp")
> lab <- as.character(D2$NAME_2)
> Encoding(lab) <- 'UTF-8'
> plot(D2,xlim=c(6,15),ylim=c(48.5,53.5),asp=1,col="wheat",bg="skyblue",border="red",lwd=0.05)
> plot(D1,xlim=c(6,15),ylim=c(48.5,53.5),asp=1,lwd=0.2,border="blue",add=TRUE)
> plot(D0,xlim=c(6,15),ylim=c(48.5,53.5),asp=1,lwd=0.2,add=TRUE)
> text(coordinates(D2),labels=lab,cex=0.05)
==== Data ====
> P <- read.csv2("population.csv",row.names=2,skip=5,na.strings="-")
> colnames(P) <- c("ID","Y2011","Y2012","Y2013","Y2014","Y2015")
> NP <- unlist(lapply(strsplit(row.names(P),','), function(x) x[1]))
> F <- read.csv2("foreigners.csv",row.names=3,skip=20,na.strings="-")
> colnames(F) <- c("date","ID","Male","Female","Total")
> NF <- unlist(lapply(strsplit(row.names(F),','), function(x) x[1]))
> all(NP==NF)
[1] TRUE
> Pn <- row.names(P)
# > Pnam <- gsub(", kreisfreie Stadt", "/S",gsub("-Kreis", "",gsub(", Landkreis", "",Pn)))
> Pnam <- gsub(", kreisfreie Stadt", "/S",gsub(", Landkreis", "",Pn))
> Y <- P$Y2015
> which(F$ID %in% c(10041, 10042, 10043, 10044, 10045, 10046))
[1] 325 326 327 328 329 330
> Pnam[which(F$ID %in% c(10041, 10042, 10043, 10044, 10045, 10046))]
[1] "Regionalverband Saarbrücken" "Merzig-Wadern"
[3] "Neunkirchen" "Saarlouis"
[5] "Saarpfalz" "Sankt Wendel"
> Pnam[328]
[1] "Saarlouis"
> s <- sum(Y[325:330])
> Y[325:330] <- NA
> Y[328] <- s
> M <- cbind(100*F$Male/Y,100*F$Female/Y)
> dim(M)
[1] 476 2
> row.names(M) <- Pnam
> MM <- na.omit(M)
> colnames(MM) <- c("M","F")
==== Clustering ====
> r <- hclust(dist(MM),method="ward.D")
> plot(r,hang=-1,cex=0.1,lwd=0.3,main="Foreigners")
> p <- cutree(r,k=9)
> table(p)
p
1 2 3 4 5 6 7 8 9
39 48 34 76 57 47 49 33 12
> for(i in 1:9){C <- MM[p==i,]; cat("C",i,nrow(C),mean(C[,1]),mean(C[,2]),"\n")}
C 1 39 6.304454 5.526648
C 2 48 5.254089 4.454723
C 3 34 2.443627 1.883977
C 4 76 3.424849 2.830167
C 5 57 1.646648 1.114532
C 6 47 7.586492 6.730954
C 7 49 4.541055 3.682423
C 8 33 9.421813 8.384791
C 9 12 13.6393 12.1157
> row.names(MM)[p==9]
[1] "Düsseldorf/S" "Frankfurt am Main/S" "Offenbach am Main/S"
[4] "Kassel/S" "Ludwigshafen am Rhein/S" "Stuttgart/S"
[7] "Heilbronn/S" "Mannheim/S" "Pforzheim/S"
[10] "München/S" "Nürnberg/S" "Schweinfurt/S"
> Nam <- row.names(MM)
> length(Nam)
[1] 395
> length(lab)
[1] 403
> p <- match(Nam,lab)
> q <-match(lab,Nam)
> cbind(which(is.na(p)),Nam[is.na(p)])
> cbind(which(is.na(q)),lab[is.na(q)])
[[match|Matching file]]
> m <- matrix(read.table("f.txt")$V1,ncol=2,byrow=TRUE)
> dim(m)
[1] 87 2
> colnames(m) <- c("lab","nam")
> head(m)
lab nam
[1,] 49 258
[2,] 3 192
[3,] 142 320
[4,] 244 99
[5,] 245 106
[6,] 246 80
> sum(is.na(p))
[1] 109
> sum(is.na(q))
[1] 95
> p[m[,2]] <- m[,1]
> sum(is.na(p))
[1] 22
> q[m[,1]] <- m[,2]
> sum(is.na(q))
[1] 8
[[http://vladowiki.fmf.uni-lj.si/doku.php?id=notes:net:shape|Shape to net]]
[[notes:clu:counties:pajek|Clustering in Pajek]]
\\ \\
[[ru:7iss#labs|Back to 7ISS Labs]]