====== Clustering with relational constraint ======

===== Foreigners in Germany =====

https://www-genesis.destatis.de/genesis/online

Select ''Variables'', then select ''KREISE; Administrative districts'' and click on ''Administrative districts''.

It would be interesting to look at the data set (for the year 2015) ''12521-0042; Foreigners: Administrative districts, reference date, sex, selected types of residence permits, citizenship'', but one needs to be a registered user to access it. Just for illustration we will use the data set ''12521-0040; Foreigners: Administrative districts, reference date, sex'' for the year 2015. For data normalization we will also need the population size ''12411-0014; Population: Administrative districts, reference date'' for the year 2015.

The shape (map) files for Germany can be downloaded from http://www.gadm.org/country . We unzip them into the subdirectory ''shape''.

Let's draw the map of Germany.


# Read three shapefiles (DEU_adm0, DEU_adm1, DEU_adm2) from the "shape" subdirectory.
# NOTE(review): readShapeSpatial() and coordinates() are not base R functions;
# presumably they come from the maptools / sp packages, which would have to be
# loaded before this point - confirm.
> D0 <- readShapeSpatial("shape/DEU_adm0.shp")
> D1 <- readShapeSpatial("shape/DEU_adm1.shp")
> D2 <- readShapeSpatial("shape/DEU_adm2.shp") 
# lab: the NAME_2 attribute of D2 as character strings, marked as UTF-8 encoded.
> lab <- as.character(D2$NAME_2)
> Encoding(lab) <- 'UTF-8'
# Draw D2 first (wheat fill, red borders, skyblue background), then overlay
# D1 (blue borders) and D0 with add=TRUE, all on the same xlim/ylim window.
> plot(D2,xlim=c(6,15),ylim=c(48.5,53.5),asp=1,col="wheat",bg="skyblue",border="red",lwd=0.05)
> plot(D1,xlim=c(6,15),ylim=c(48.5,53.5),asp=1,lwd=0.2,border="blue",add=TRUE)
> plot(D0,xlim=c(6,15),ylim=c(48.5,53.5),asp=1,lwd=0.2,add=TRUE)
# Write each D2 name (in a very small font) at the position given by coordinates(D2).
> text(coordinates(D2),labels=lab,cex=0.05)

==== Data ====


# Population table: skip the first 5 lines, take row names from column 2,
# and read "-" as NA.
> P <- read.csv2("population.csv",row.names=2,skip=5,na.strings="-")
> colnames(P) <- c("ID","Y2011","Y2012","Y2013","Y2014","Y2015")
# NP: the part of each row name that precedes the first comma.
> NP <- unlist(lapply(strsplit(row.names(P),','), function(x) x[1]))
# Foreigners table: skip the first 20 lines, take row names from column 3,
# and read "-" as NA.
# NOTE(review): assigning to F masks the built-in shorthand F (FALSE) for the
# rest of the session.
> F <- read.csv2("foreigners.csv",row.names=3,skip=20,na.strings="-")
> colnames(F) <- c("date","ID","Male","Female","Total")
> NF <- unlist(lapply(strsplit(row.names(F),','), function(x) x[1]))
# Check that both tables list the same names in the same order.
> all(NP==NF)
[1] TRUE

# Shorten the row names: drop ", Landkreis" and replace ", kreisfreie Stadt"
# with "/S". The commented-out variant additionally removed "-Kreis".
> Pn <- row.names(P)
# > Pnam <- gsub(", kreisfreie Stadt", "/S",gsub("-Kreis", "",gsub(", Landkreis", "",Pn)))
> Pnam <- gsub(", kreisfreie Stadt", "/S",gsub(", Landkreis", "",Pn))

# Y: the 2015 population, used as the denominator below.
> Y <- P$Y2015
# The rows with IDs 10041..10046 sit at positions 325..330. Their populations
# are summed, all six entries are set to NA, and the sum is stored at row 328.
# NOTE(review): presumably the foreigners table reports these six units only as
# one combined figure in row 328 - confirm against foreigners.csv.
> which(F$ID %in% c(10041, 10042, 10043, 10044, 10045, 10046))
[1] 325 326 327 328 329 330
> Pnam[which(F$ID %in% c(10041, 10042, 10043, 10044, 10045, 10046))]
[1] "Regionalverband Saarbrücken" "Merzig-Wadern"              
[3] "Neunkirchen"                 "Saarlouis"                  
[5] "Saarpfalz"                   "Sankt Wendel" 
> Pnam[328]
[1] "Saarlouis"
> s <- sum(Y[325:330])
> Y[325:330] <- NA
> Y[328] <- s

# M: male and female foreigners as a percentage of Y, one row per unit.
> M <- cbind(100*F$Male/Y,100*F$Female/Y)
> dim(M)
[1] 476   2
> row.names(M) <- Pnam
# MM: M without the rows that contain an NA (this includes the five rows whose
# Y was set to NA above); columns named "M" and "F".
> MM <- na.omit(M)
> colnames(MM) <- c("M","F")

==== Clustering ====

        
# Hierarchical clustering of the rows of MM: dist() with its default settings,
# agglomeration method "ward.D"; then plot the dendrogram.
> r <- hclust(dist(MM),method="ward.D")
> plot(r,hang=-1,cex=0.1,lwd=0.3,main="Foreigners")
# Cut the tree into 9 clusters; table(p) shows the cluster sizes.
> p <- cutree(r,k=9)
> table(p)
p
 1  2  3  4  5  6  7  8  9 
39 48 34 76 57 47 49 33 12 
# For each cluster print: "C", cluster number, number of rows,
# mean of column M, mean of column F.
> for(i in 1:9){C <- MM[p==i,]; cat("C",i,nrow(C),mean(C[,1]),mean(C[,2]),"\n")}
C 1 39 6.304454 5.526648 
C 2 48 5.254089 4.454723 
C 3 34 2.443627 1.883977 
C 4 76 3.424849 2.830167 
C 5 57 1.646648 1.114532 
C 6 47 7.586492 6.730954 
C 7 49 4.541055 3.682423 
C 8 33 9.421813 8.384791 
C 9 12 13.6393 12.1157 
# Members of cluster 9, the cluster with the highest means in the listing above.
> row.names(MM)[p==9]
 [1] "Düsseldorf/S"            "Frankfurt am Main/S"     "Offenbach am Main/S"    
 [4] "Kassel/S"                "Ludwigshafen am Rhein/S" "Stuttgart/S"            
 [7] "Heilbronn/S"             "Mannheim/S"              "Pforzheim/S"            
[10] "München/S"               "Nürnberg/S"              "Schweinfurt/S"

        
# Nam: row names of the data matrix MM; lab: names taken from the shapefile.
# The two vectors have different lengths, so they are matched in both directions.
> Nam <- row.names(MM)
> length(Nam)
[1] 395
> length(lab)
[1] 403
# p[i]: position in lab of Nam[i]; q[j]: position in Nam of lab[j];
# NA where there is no exact match.
# NOTE(review): p is reused here, which overwrites the cluster vector
# previously returned by cutree().
> p <- match(Nam,lab)
> q <-match(lab,Nam)
# List the unmatched names on each side together with their positions.
> cbind(which(is.na(p)),Nam[is.na(p)])
> cbind(which(is.na(q)),lab[is.na(q)])

[[match|Matching file]]


# f.txt: the first column (V1) is read as one sequence of numbers and reshaped
# row-wise into pairs: column "lab" = index into lab, column "nam" = index into Nam.
> m <- matrix(read.table("f.txt")$V1,ncol=2,byrow=TRUE)
> dim(m)
[1] 87  2
> colnames(m) <- c("lab","nam")
> head(m)
     lab nam
[1,]  49 258
[2,]   3 192
[3,] 142 320
[4,] 244  99
[5,] 245 106
[6,] 246  80
# Number of unmatched entries before the manual pairs are applied.
> sum(is.na(p))
[1] 109
> sum(is.na(q))
[1] 95
# Fill in the manual pairs: entry m[,2] of p gets the lab index m[,1] ...
> p[m[,2]] <- m[,1]
> sum(is.na(p))
[1] 22
# ... and entry m[,1] of q gets the Nam index m[,2].
> q[m[,1]] <- m[,2]
> sum(is.na(q))
[1] 8

[[http://vladowiki.fmf.uni-lj.si/doku.php?id=notes:net:shape|Shape to net]] [[notes:clu:counties:pajek|Clustering in Pajek]] \\ \\ [[ru:7iss#labs|Back to 7ISS Labs]]