====== Visualization ====== ===== Preparing the data ===== The {{book:temp:private:ana:foreignplayersfrance.zip|raw data for France}} are lists (country name, number of players) for different periods. > getwd() [1] "D:/Data/football/Pat" > a <- read.csv("france.csv",header=TRUE,sep=";",stringsAsFactors=FALSE) > names(a) [1] "C46.60" "X1946.1960" "C60.70" "X1960.1970" "C70.80" [6] "X1970.1980" "C80.90" "X1980.1990" "C90.00" "X1990.00" [11] "C2000.10" "X2000.10" "X" "X.1" "C90.94" [16] "X1990.94" "C96.00" "X1996.00" > C <- setdiff(union(a$C46.60,c(a$C60.70,a$C70.80,a$C80.90,a$C90.00,a$C2000.10)),"") > C [1] "ALG" "MAR" "ESP" "HUN" "ARG" "ITA" "AUT" "NED" "SWE" "CZE" "DEN" "BRA" [13] "SUI" "URU" "ENG" "CMR" "LVA" "GER" "TUN" "NOR" "CIV" "PAR" "LUX" "SCO" [25] "YUG" "WAL" "SEN" "MLI" "BEL" "TGO" "SCG" "POL" "ISL" "COG" "UKR" "RUS" [37] "ROU" "MKD" "GAB" "FIN" "BIH" "USA" "TUR" "SVN" "SVK" "GIN" "GEO" "CRO" [49] "CHI" "CAN" "ALB" "POR" "PER" "MDG" "HTI" "ISR" "TCD" "BFA" "IRL" "COD" [61] "BUL" "ARM" "GHA" "NGA" "NIR" "LBR" "HON" "COL" "AUS" "KOR" "BEN" "RSA" [73] "PAN" "MWI" "MRT" "MEX" "GRE" "CHN" "JPN" "EGY" "ZWE" "ZAM" "SLE" "NER" [85] "MUS" "KEN" "GNB" "QAT" "LTU" "JAM" "CPV" "AGO" We combine the lists into a matrix W = Country X Period with Number of players as values. 
setwd("D:/Data/football/Pat") library(lattice) a <- read.csv("france.csv",header=TRUE,sep=";",stringsAsFactors=FALSE) C <- setdiff(union(a$C46.60,c(a$C60.70,a$C70.80,a$C80.90,a$C90.00,a$C2000.10)),"") T <- matrix(0,nrow=length(C),ncol=8); rownames(T) <- C V1 <- a$X1946.1960; names(V1) <- a$C46.60 V2 <- a$X1960.1970; names(V2) <- a$C60.70 V3 <- a$X1970.1980; names(V3) <- a$C70.80 V4 <- a$X1980.1990; names(V4) <- a$C80.90 V5 <- a$X1990.00; names(V5) <- a$C90.00 V6 <- a$X2000.10; names(V6) <- a$C2000.10 V7 <- a$X1990.94; names(V7) <- a$C90.94 V8 <- a$X1996.00; names(V8) <- a$C96.00 U1 <- V1[!is.na(V1)]; T[names(U1),1] <- U1 U2 <- V2[!is.na(V2)]; T[names(U2),2] <- U2 U3 <- V3[!is.na(V3)]; T[names(U3),3] <- U3 U4 <- V4[!is.na(V4)]; T[names(U4),4] <- U4 U5 <- V5[!is.na(V5)]; T[names(U5),5] <- U5 U6 <- V6[!is.na(V6)]; T[names(U6),6] <- U6 U7 <- V7[!is.na(V7)]; T[names(U7),7] <- U7 U8 <- V8[!is.na(V8)]; T[names(U8),8] <- U8 colnames(T) <- c("46-60","60-70","70-80","80-90","90-00","00-10","90-94","96-00") S <- apply(T[,1:6],1,sum); q <- order(-S); W <- T[q,] The countries in W are ordered by decreasing total number of players over the six main periods 46-60 to 00-10 (the overlapping sub-periods 90-94 and 96-00 are not included in the total): > W 46-60 60-70 70-80 80-90 90-00 00-10 90-94 96-00 ALG 72 37 21 41 46 72 14 32 ARG 35 43 56 45 38 61 10 28 SEN 4 5 8 24 47 122 14 33 BRA 16 8 8 7 46 117 17 30 SCG 3 29 49 27 39 50 12 26 CMR 9 16 14 28 44 69 22 22 MAR 50 14 8 18 24 52 10 14 CIV 7 11 8 21 35 76 13 22 MLI 4 6 9 2 11 58 0 11 POL 3 3 22 26 23 12 9 13 DEN 19 7 8 21 23 9 12 11 NED 24 2 13 16 22 7 15 7 ESP 38 10 6 2 10 17 3 7 GER 8 8 20 21 13 2 9 4 TUN 7 2 3 1 14 42 3 10 ITA 32 7 7 1 10 10 2 8 GIN 1 2 1 2 22 38 3 17 HUN 35 8 2 8 9 3 5 4 BEL 4 4 2 12 18 25 4 15 CRO 1 5 9 15 20 14 13 7 POR 0 3 4 9 16 31 5 11 SUI 12 4 4 8 13 21 3 9 SWE 23 3 5 2 6 21 4 2 AUT 31 11 7 3 2 1 0 2 URU 11 5 10 12 3 14 2 1 CZE 20 2 1 2 7 20 4 3 BIH 2 2 10 13 12 5 6 5 NGA 0 0 0 2 13 26 5 10 TGO 3 5 3 1 11 15 1 9 COG 3 4 4 3 3 19 0 3 COD 0 0 0 5 9 17 2 7 PAR 6 6 5 4 3 4 2 1 LUX 6 7 7 3 3 1 3 1 GHA 0 0 0 3 8 15 4 4 COL 0 0 0 1 5 
20 1 4 ENG 9 0 1 9 6 0 3 3 ROU 2 2 2 0 6 12 0 4 NOR 7 0 0 1 4 11 0 4 GAB 2 2 0 3 2 11 2 0 SVN 1 0 1 5 3 8 0 3 MDG 0 1 0 2 12 3 6 8 RUS 2 0 0 1 7 7 3 4 SVK 1 2 1 1 8 4 6 3 SCO 5 0 0 4 7 0 2 5 BFA 0 0 2 2 0 11 0 0 LBR 0 0 0 1 12 2 7 6 USA 1 1 0 0 4 8 2 2 HTI 0 1 2 0 5 6 1 3 BUL 0 0 0 5 5 4 4 1 CHI 1 0 4 0 2 5 0 2 TCD 0 0 5 4 3 0 2 2 ISL 3 1 0 3 1 2 1 0 AUS 0 0 0 1 7 2 4 3 UKR 2 0 0 1 4 2 2 2 ISR 0 0 6 1 2 0 2 0 IRL 0 0 0 6 3 0 0 3 ARM 0 0 0 4 4 1 2 2 LVA 8 0 0 0 0 0 0 0 TUR 1 0 0 1 1 5 0 1 GRE 0 0 0 0 1 7 0 1 YUG 4 2 1 0 0 0 0 0 ALB 1 0 0 0 3 3 0 3 KOR 0 0 0 0 2 5 0 2 BEN 0 0 0 0 2 5 2 1 JPN 0 0 0 0 0 6 0 0 MKD 2 0 2 0 0 1 0 0 EGY 0 0 0 0 0 5 0 0 WAL 4 0 0 0 0 0 0 0 FIN 2 0 0 0 1 1 0 1 PER 0 1 0 0 0 3 0 0 MRT 0 0 0 0 1 3 0 1 GEO 1 0 0 0 1 1 1 0 NIR 0 0 0 1 2 0 1 1 RSA 0 0 0 0 1 2 0 1 ZWE 0 0 0 0 0 3 0 0 MEX 0 0 0 0 1 1 0 1 ZAM 0 0 0 0 0 2 0 0 SLE 0 0 0 0 0 2 0 0 NER 0 0 0 0 0 2 0 0 MUS 0 0 0 0 0 2 0 0 KEN 0 0 0 0 0 2 0 0 GNB 0 0 0 0 0 2 0 0 CAN 1 0 0 0 0 0 0 0 HON 0 0 0 1 0 0 0 0 PAN 0 0 0 0 1 0 0 1 MWI 0 0 0 0 1 0 0 1 CHN 0 0 0 0 1 0 0 1 QAT 0 0 0 0 0 1 0 0 LTU 0 0 0 0 0 1 0 0 JAM 0 0 0 0 0 1 0 0 CPV 0 0 0 0 0 1 0 0 AGO 0 0 0 0 0 1 0 0 ===== Display of absolute values ===== pdf("France.pdf",height=11.7,width=8.3,paper="a4") levelplot(t(W[92:1,1:6]),scales=list(x=list(rot=90,cex=0.9),y=list(cex=0.6)), aspect=4, cuts=15, par.settings=list(regions=list(col=gray(15:0 / 15))), xlab="years",ylab="countries",main="France") dev.off() {{:book:temp:private:ana:france.pdf|France}} ===== Display of normalized columns ===== prob <- function(x) x/sum(x,na.rm=TRUE) Z <- apply(W,2,prob) pdf("FranceN.pdf",height=11.7,width=8.3,paper="a4") levelplot(t(Z[92:1,]),scales=list(x=list(rot=90,cex=0.9),y=list(cex=0.6)),aspect=4, cuts=15, par.settings=list(regions=list(col=gray(15:0 / 15))), xlab="years",ylab="countries",main="France") dev.off() {{:book:temp:private:ana:francen.pdf|France normalized columns}} ===== Display in colors with selected breaks ===== mbreaks <- c(0,1,3,10,50,122); 
rbreaks <- mbreaks-0.5; rbreaks[6] <- 123 pdf("France.pdf",height=11.7,width=8.3,paper="a4") levelplot(t(W[92:1,]),at=rbreaks, scales=list(x=list(rot=90,cex=0.9),y=list(cex=0.6)),aspect=4, par.settings=list(regions=list(col=c("white","yellow","cyan","red","blue"))), colorkey=list(at=1:length(mbreaks),labels = list(labels=mbreaks)), xlab="years",ylab="countries",main="France") dev.off() {{:book:temp:private:ana:francec.pdf|France colors with selected breaks}} ===== Flow of players ===== The regional flows to the top 5 leagues: for France, Italy and Spain there are 6 periods: 46-60; 60-70; 70-80; 80-90; 90-00; 00-10; for Germany there are 5 periods: 60-70; 70-80; 80-90; 90-00; 00-10; for England there are 2 periods: 90-00; 00-10. The regions are: * AFC Asian Football Confederation * CSA Confederation of South America * NAM North America, Central America and Caribbean [CONCACAF] * CAF Confederation of African Football * WEU Western Europe * EEU Eastern Europe The {{:book:temp:private:ana:ch4:flow.txt|source data}} contain the square roots of the numbers of players, so the values are squared and rounded after reading. > setwd("D:/Data/football/Pat/flow") > T <- read.table("flow.txt",header=TRUE,skip=1,sep=";") > S <- T[,2:7] > rownames(S) <- T[,1] > colnames(S) <- c("46-60","60-70","70-80","80-90","90-00","00-10") > S <- as.matrix(round(S**2)) > library(lattice) > Q <- as.vector(S) > summary(Q[Q>0]) Min. 1st Qu. Median Mean 3rd Qu. Max. 1.0 11.5 63.0 119.7 139.5 2363.0 > summary(Q[Q>139.5]) Min. 1st Qu. Median Mean 3rd Qu. Max. 
148.0 165.2 200.5 346.1 298.5 2363.0 > mbreaks <- c(0,0.5,11.5,63,139.5,200,2363) > pdf("Flow.pdf",height=11.7,width=8.3,paper="a4") > levelplot(t(S),scales=list(x=list(rot=90,cex=0.9),y=list(cex=0.6)),aspect=2, cuts=6, + par.settings=list(regions=list(col=gray(c(1,0.80,0.65,0.50,0.4,0.3,0)))),at=mbreaks, + xlab="years",ylab="countries",main="Flow") > dev.off() We get the {{:book:temp:private:ana:ch4:flow.pdf|picture}} and the original table: > S 46-60 60-70 70-80 80-90 90-00 00-10 CAF-FRA 162 105 86 163 321 680 CSA-FRA 69 63 83 69 97 224 AFC-FRA 0 0 0 1 10 14 EEU-FRA 92 55 100 109 152 153 WEU-FRA 227 67 90 123 163 166 NAM-FRA 2 2 2 1 11 16 CAF-ITA 0 0 0 1 42 109 CSA-ITA 89 51 13 75 184 373 AFC-ITA 0 0 0 0 9 15 EEU-ITA 46 7 0 25 120 181 WEU-ITA 89 42 11 73 186 237 NAM-ITA 0 0 0 0 7 13 CAF-ESP 4 4 3 9 48 71 CSA-ESP 122 131 217 178 303 419 AFC-ESP 0 0 0 0 3 8 EEU-ESP 19 10 9 57 215 107 WEU-ESP 24 9 29 51 149 237 NAM-ESP 2 1 2 14 15 18 CAF-GER 0 0 1 10 67 42 CSA-GER 0 4 2 12 47 30 AFC-GER 0 0 4 8 21 22 EEU-GER 0 34 49 77 297 148 WEU-GER 0 38 96 121 186 95 NAM-GER 0 0 0 3 23 6 CAF-ENG 0 0 0 0 104 325 CSA-ENG 0 0 0 0 44 116 AFC-ENG 0 0 0 0 91 178 EEU-ENG 0 0 0 0 82 167 WEU-ENG 0 0 0 0 1699 2363 NAM-ENG 0 0 0 0 148 274 Removing the England data: > Q <- as.vector(S[24:1,]) > summary(Q[Q>0]) Min. 1st Qu. Median Mean 3rd Qu. Max. 1.00 10.00 49.00 83.54 120.50 680.00 > summary(Q[Q>120.5]) Min. 1st Qu. Median Mean 3rd Qu. Max. 121.0 153.0 184.0 221.3 237.0 680.0 > mbreaks <- c(0,0.5,10,49,120.5,184,680) > pdf("Flow3.pdf",height=11.7,width=8.3,paper="a4") > levelplot(t(S[24:1,]),scales=list(x=list(rot=90,cex=0.9),y=list(cex=0.9)),aspect=2, cuts=6, + par.settings=list(regions=list(col=c("white","yellow","cyan","red","blue","black"))), + at=mbreaks,xlab="years",ylab="countries",main="Flow") > dev.off() {{:book:temp:private:ana:ch4:flow3.pdf|Picture}} for 4 leagues. 
===== Some references ===== * http://learnr.wordpress.com/2009/07/20/ggplot2-version-of-figures-in-lattice-multivariate-data-visualization-with-r-part-6/ * http://learnr.wordpress.com/2010/01/26/ggplot2-quick-heatmap-plotting/#more-2380 * http://www2.warwick.ac.uk/fac/sci/moac/students/peter_cock/r/matrix_contour/ * http://csg.sph.umich.edu/docs/R/graphics-1.pdf * http://csg.sph.umich.edu/docs/R/ * http://casoilresource.lawr.ucdavis.edu/drupal/blog/2?page=4