====== CIA factbook -> data frame ====== Data about the countries of the world are available in [[https://www.cia.gov/the-world-factbook/|The world factbook]]. Their conversion into JSON format is available at [[https://github.com/iancoleman/cia_world_factbook_api#data|GitHub.com/iancoleman]]. ===== Inspection ===== Using R, we inspected the content of the JSON file and selected the interesting properties: > wdir <- "C:/Users/vlado/DL/data/hyper/CIA" > setwd(wdir) > library(jsonlite) > library(countrycode) > check <- function(v) ifelse(is.null(v),NA,v[1]) > fb <- "https://github.com/iancoleman/cia_world_factbook_api/raw/master/data/factbook.json" > # fb <- "factbook2020nov.json" > J <- fromJSON(readLines(fb)) > C <- names(J$countries) > head(C) > n <- length(C) > str(J,max.level=2) > J$countries[[4]]$data$name [1] "Albania" > J$countries$albania$data$name [1] "Albania" > names(J$countries) > names(J$countries$albania$data) [1] "name" "introduction" "geography" [4] "people" "government" "economy" [7] "energy" "communications" "transportation" [10] "military_and_security" "transnational_issues" > names(J$countries$albania$data$energy) > names(J$countries$albania$data$geography) > J$countries$albania$data$geography$area$total # scalar > J$countries$albania$data$geography$natural_resources # set > J$countries$albania$data$geography$elevation # interval > J$countries$albania$data$geography$land_use # modal > names(J$countries$albania$data$people) > J$countries$albania$data$people$population$total # scalar > J$countries$albania$data$people$population$age_structure # modal > names(J$countries$albania$data$government) > J$countries$albania$data$government$international_organization_participation # set > names(J$countries$albania$data$economy) > J$countries$albania$data$economy$gdp$composition > J$countries$albania$data$economy$gdp$composition$by_end_use > J$countries$albania$data$economy$gdp$composition$by_sector_of_origin > J$countries$albania$data$economy$labor_force > 
J$countries$albania$data$economy$agriculture_products$products # set > J$countries$albania$data$economy$industries # set > J$countries$albania$data$economy$exports$commodities # set > J$countries$albania$data$economy$imports$commodities # set > J$countries$albania$data$economy$exports$partners # modal > J$countries$albania$data$economy$imports$partners > names(J$countries$albania$data$energy) > J$countries$albania$data$energy$electricity$by_source # modal ===== Countries ===== Before extracting the selected variables, we determined for the countries the corresponding ISO2 codes and world regions (as used in the factbook): > Cnames <- countryname(C,"country.name.en") > iso2 <- countryname(C,'iso2c') > Cnames <- countryname(C,"country.name.en") > iso2 <- countryname(C,'iso2c') > add <- c("isle_of_man", "saint_lucia", "saint_martin", "virgin_islands") > i <- which(C %in% add) > Cnames[i] <- c("Isle of man", "Saint Lucia", "Saint Martin", "Virgin Islands") > iso2[i] <- c("IM", "LC", "MF", "VI") > np <- c("AQ", "BV", "IO", "HM", "GS") # unknown population > iso2[which(iso2 %in% np)] <- NA When the country names were recognized, some values were not matched unambiguously: akrotiri, arctic_ocean, ashmore_and_cartier_islands, atlantic_ocean, clipperton_island, coral_sea_islands, dhekelia, european_union, indian_ocean, isle_of_man, jan_mayen, navassa_island, pacific_ocean, paracel_islands, saint_lucia, saint_martin, southern_ocean, spratly_islands, virgin_islands, wake_island, world. The same entries also turned out to be problematic for conversion into ISO2. We checked them in the factbook and most of them had almost empty descriptions, except ''isle_of_man (IM)'', ''saint_lucia (LC)'', ''saint_martin (MF)'', ''virgin_islands (VI)''. We added these four to the interesting countries. 
In the first attempt to get the vector of ''total population'', it turned out that the following countries > CNames[is.na(pt)] [1] "Antarctica" "Bouvet Island" [3] "British Indian Ocean Territory" "Heard & McDonald Islands" [5] "South Georgia & South Sandwich Islands" don't have this information and also have almost empty descriptions. We decided to remove them from the interesting countries ''I''. ===== Extracting variables ===== ==== Scalars ==== Now, we can start to extract the selected variables: > I <- which(!is.na(iso2)) > CNames <- Cnames[I] > ISO2 <- iso2[I] > n <- length(I) > # regions > reg <- rep("",n); j <- 0 > for(i in I) {j <- j+1; reg[j] <- J$countries[[i]]$data$geography$map_references} > # total area > at <- rep(NA,n); j <- 0 > for(i in I) {j <- j+1; at[j] <- J$countries[[i]]$data$geography$area$total$value} > # total population > pt <- rep(NA,n); j <- 0 > for(i in I) {j <- j+1; pt[j] <- J$countries[[i]]$data$people$population$total} > # GDP > gdp <- rep(NA,n); j <- 0 > for(i in I) {j <- j+1; + gdp[j] <- check(J$countries[[i]]$data$economy$gdp$purchasing_power_parity$annual_values$value)} ==== Set variables ==== > # resources > RS <- vector(mode="list",n); j <- 0 > for(i in I) {j <- j+1; RS[[j]] <- J$countries[[i]]$data$geography$natural_resources$resources} > W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,resources=rep(NA,n)) > W$resources <- RS > # international organizations > orgs <- vector(mode="list",n); j <- 0 > for(i in I) {j <- j+1 + tp <- J$countries[[i]]$data$government$international_organization_participation$organization + orgs[[j]] <- gsub(" ","\\?",tp)} > W <- data.frame(Country=CNames,orgs=rep(NA,n)) > W$orgs <- orgs > head(W) > # agriculture products > AP <- vector(mode="list",n); j <- 0 > for(i in I) {j <- j+1; AP[[j]] <- J$countries[[i]]$data$economy$agriculture_products$products} > W <- data.frame(Country=CNames,ISO2=ISO2,agroP=rep(NA,n)) > W$agroP <- AP > # industries > IN <- vector(mode="list",n); j <- 0 > 
for(i in I) {j <- j+1; IN[[j]] <- J$countries[[i]]$data$economy$industries$industries} > W <- data.frame(Country=CNames,ISO2=ISO2,indust=rep(NA,n)) > W$indust <- IN > # exports commodities > EXP <- vector(mode="list",n); j <- 0 > for(i in I) {j <- j+1; EXP[[j]] <- J$countries[[i]]$data$economy$exports$commodities$by_commodity} > W <- data.frame(Country=CNames,ISO2=ISO2,expCom=rep(NA,n)) > W$expCom <- EXP > # imports commodities > IMP <- vector(mode="list",n); j <- 0 > for(i in I) {j <- j+1; IMP[[j]] <- J$countries[[i]]$data$economy$imports$commodities$by_commodity} > W <- data.frame(Country=CNames,ISO2=ISO2,impCom=rep(NA,n)) > W$impCom <- IMP > head(W) Some countries don't have the GDP info. > CNames[which(is.na(gdp))] [1] "Christmas Island" "Cocos (Keeling) Islands" "Palestinian Territories" [4] "Vatican City" "Norfolk Island" "Pitcairn Islands" [7] "St. Barthélemy" "Svalbard & Jan Mayen" International organizations that contain the question mark ''?'' in their name have a note (compliant country, correspondent, NGOs, observer, etc.). 
==== Modal variables ==== Scheme for modal data > # End users > EU <- vector(mode="list",n); j <- 0 > for(i in I) {j <- j+1; + TU <- J$countries[[i]]$data$economy$gdp$composition$by_end_use$end_uses + EUnames <- names(TU); k <- length(EUnames); U <- rep(NA,k); names(U) <- EUnames + for(s in 1:k) U[s] <- check(TU[[EUnames[s]]]$value) + EU[[j]] <- U} > W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,Eusers=rep(NA,n)) > W$Eusers <- EU > head(W) Country ISO2 region area pop Eusers 1 Afghanistan AF Asia 652230 36643815 81.6, 12.0, 17.2, 30.0, 6.7, -47.6 2 Albania AL Europe 28748 3074579 78.1, 11.5, 25.2, 0.2, 31.5, -46.6 3 Algeria DZ Africa 2381740 42972878 42.7, 20.2, 38.1, 11.2, 23.6, -35.8 4 American Samoa AS Oceania 224 49437 66.4, 49.7, 7.3, 5.1, 65.0, -93.5 5 Andorra AD Europe 468 77000 NA, NA, NA, NA, NA, NA 6 Angola AO Africa 1246700 32522339 80.6, 15.6, 10.3, -1.2, 25.4, -30.7 > W$Eusers[3] [[1]] household_consumption government_consumption investment_in_fixed_capital 42.7 20.2 38.1 investment_in_inventories exports_of_goods_and_services imports_of_goods_and_services 11.2 23.6 -35.8 > # age structure > AS <- vector(mode="list",n); j <- 0 > for(i in I) {j <- j+1; + TU <- J$countries[[i]]$data$people$age_structure + Unames <- names(TU); k <- length(Unames)-1 + if(k==5){ + U <- rep(NA,k); names(U) <- Unames[1:k] + for(s in 1:k) U[s] <- check(TU[[Unames[s]]]$percent) + } else { U <- rep(NA,5); cat(">>>",i,k,"\n") } + AS[[j]] <- U} > W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,ageS=rep(NA,n)) > W$ageS <- AS > head(W) > W[1,"ageS"] [[1]] 0_to_14 15_to_24 25_to_54 55_to_64 65_and_over 40.62 21.26 31.44 4.01 2.68 > mis <- c( 54, 81, 105, 170, 171, 185, 223, 233 ) > C[mis] [1] "cocos_keeling_islands" "falkland_islands_islas_malvinas" [3] "holy_see_vatican_city" "niue" [5] "norfolk_island" "pitcairn_islands" [7] "svalbard" "tokelau" > # gdp by end use > GE <- vector(mode="list",n); j <- 0 > for(i in I) {j <- j+1; + TU <- 
J$countries[[i]]$data$economy$gdp$composition$by_end_use$end_uses + Unames <- names(TU); k <- length(Unames) + if(k>0){ + U <- rep(NA,k); names(U) <- Unames + for(s in 1:k) U[s] <- check(TU[[Unames[s]]]$value) + } else { U <- rep(NA,6); cat(">>>",j,k,"\n") } + GE[[j]] <- U} > W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,gdpE=rep(NA,n)) > W$gdpE <- GE > head(W) Country ISO2 region area pop gdpE 1 Afghanistan AF Asia 652230 36643815 81.6, 12.0, 17.2, 30.0, 6.7, -47.6 2 Albania AL Europe 28748 3074579 78.1, 11.5, 25.2, 0.2, 31.5, -46.6 3 Algeria DZ Africa 2381740 42972878 42.7, 20.2, 38.1, 11.2, 23.6, -35.8 > misj <- c( 5,44,45,50,70,82,88,93,103,108,112,124, + 140,155,156,168,176,177,180,181,192,203,213,234) > CNames[misj] [1] "Andorra" "Christmas Island" "Cocos (Keeling) Islands" [4] "Cook Islands" "Falkland Islands" "Gibraltar" [7] "Guernsey" "Vatican City" "Isle of man" [10] "Jersey" "Kiribati" "Liechtenstein" [13] "Monaco" "Niue" "Norfolk Island" [16] "Pitcairn Islands" "St. Barthélemy" "St. Helena" [19] "Saint Martin" "St. 
Pierre & Miquelon" "Sint Maarten" [22] "Svalbard & Jan Mayen" "Tokelau" "Western Sahara" > # gdp by sector of origin > GO <- vector(mode="list",n); j <- 0 > for(i in I) {j <- j+1; + TU <- J$countries[[i]]$data$economy$gdp$composition$by_sector_of_origin$sectors + Unames <- names(TU); k <- length(Unames) + if(k>0){ + U <- rep(NA,k); names(U) <- Unames + for(s in 1:k) U[s] <- check(TU[[Unames[s]]]$value) + } else { U <- rep(NA,3); cat(">>>",j,k,"\n") } + GO[[j]] <- U} > W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,gdpO=rep(NA,n)) > W$gdpO <- GO > head(W) Country ISO2 region area pop gdpO 1 Afghanistan AF Asia 652230 36643815 23.0, 21.1, 55.9 2 Albania AL Europe 28748 3074579 21.7, 24.2, 54.1 3 Algeria DZ Africa 2381740 42972878 13.3, 39.3, 47.4 > misj <- c(44,45,93,156,168,176,177,198,203,213,232) > CNames[misj] [1] "Christmas Island" "Cocos (Keeling) Islands" "Vatican City" [4] "Norfolk Island" "Pitcairn Islands" "St. Barthélemy" [7] "St. Helena" "South Sudan" "Svalbard & Jan Mayen" [10] "Tokelau" "Wallis & Futuna" > # exports partners > EPA <- vector(mode="list",n); j <- 0 > for(i in I) {j <- j+1; + TU <- J$countries[[i]]$data$economy$exports$partners$by_country + U <- TU$percent; names(U) <- TU$name + EPA[[j]] <- U} > W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,expPar=rep(NA,n)) > W$expPar <- EPA > head(W) Country ISO2 region area pop expPar 1 Afghanistan AF Asia 652230 36643815 56.5, 29.6 2 Albania AL Europe 28748 3074579 53.4, 7.7, 5.6, 4.2 3 Algeria DZ Africa 2381740 42972878 17.4, 13.0, 11.9, 9.4, 6.2, 5.5 4 American Samoa AS Oceania 224 49437 25.0, 19.0, 15.6, 10.4, 5.1 5 Andorra AD Europe 468 77000 NULL 6 Angola AO Africa 1246700 32522339 61.2, 13.0, 4.2 > W[3,"expPar"] [[1]] Italy Spain France US Brazil Netherlands 17.4 13.0 11.9 9.4 6.2 5.5 > # imports partners > IPA <- vector(mode="list",n); j <- 0 > for(i in I) {j <- j+1; + TU <- J$countries[[i]]$data$economy$imports$partners$by_country + U <- TU$percent; 
names(U) <- TU$name + IPA[[j]] <- U} > W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,impPar=rep(NA,n)) > W$impPar <- IPA > head(W) Country ISO2 region area pop impPar 1 Afghanistan AF Asia 652230 36643815 21.0, 20.5, 11.8, 11.0, 6.8, 5.3 2 Albania AL Europe 28748 3074579 28.5, 8.1, 8.0, 8.0, 7.9, 4.0 3 Algeria DZ Africa 2381740 42972878 18.2, 9.1, 8.0, 7.0, 6.9, 4.4 > W[3,"impPar"] [[1]] China France Italy Germany Spain Turkey 18.2 9.1 8.0 7.0 6.9 4.4 > # electricity$by_source > ES <- vector(mode="list",n); j <- 0 > for(i in I) {j <- j+1; + TU <- J$countries[[i]]$data$energy$electricity$by_source + Unames <- names(TU); k <- length(Unames) + if(k>0){ + U <- rep(NA,k); names(U) <- Unames + for(s in 1:k) U[s] <- check(TU[[Unames[s]]]$percent) + } else { U <- rep(NA,4); cat(">>>",j,k,"\n") } + ES[[j]] <- U} > W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,eleS=rep(NA,n)) > W$eleS <- ES > head(W) Country ISO2 region area pop eleS 1 Afghanistan AF Asia 652230 36643815 45, 0, 52, 4 2 Albania AL Europe 28748 3074579 5, 0, 95, 0 3 Algeria DZ Africa 2381740 42972878 96, 0, 1, 2 > W[3,"eleS"] [[1]] fossil_fuels nuclear_fuels hydroelectric_plants other_renewable_sources 96 0 1 2 > misj <- c(7,44,45,55,78,88,93,103,108,124,140,156,158,162, + 168,176,180,184,192,203,213,232) > CNames[misj] [1] "Anguilla" "Christmas Island" "Cocos (Keeling) Islands" [4] "Curaçao" "Palestinian Territories" "Guernsey" [7] "Vatican City" "Isle of man" "Jersey" [10] "Liechtenstein" "Monaco" "Norfolk Island" [13] "Northern Mariana Islands" "Palau" "Pitcairn Islands" [16] "St. Barthélemy" "Saint Martin" "San Marino" [19] "Sint Maarten" "Svalbard & Jan Mayen" "Tokelau" [22] "Wallis & Futuna" > # 220 only 3 values ==== Interval variables ==== The elevation data don't follow a uniform format > J$countries[[4]]$data$geography$elevation ... 
$highest_point $highest_point$name [1] "Maja e Korabit (Golem Korab)" $highest_point$elevation $highest_point$elevation$value [1] 2764 ... > J$countries[[73]]$data$geography$elevation ... $highest_point [1] "Chimborazo 6,267" ... Therefore the code is more complicated > # elevation > elev <- vector(mode="list",n); j <- 0 > for(i in I) {j <- j+1; + t <- J$countries[[i]]$data$geography$elevation$highest_point + if(typeof(t)=="character"){ + st <- unlist(strsplit(t," ")) + lM <- as.integer(gsub(",","",st[length(st)])) + } else { + lM <- J$countries[[i]]$data$geography$elevation$highest_point$elevation$value + } + lm <- check(J$countries[[i]]$data$geography$elevation$lowest_point$elevation$value) + elev[[j]] <- c(lm,lM) + } > W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,elev=rep(NA,n)) > W$elev <- elev > head(W) Country ISO2 region area pop elev 1 Afghanistan AF Asia 652230 36643815 258, 7492 2 Albania AL Europe 28748 3074579 0, 2764 3 Algeria DZ Africa 2381740 42972878 -40, 2908 ==== Data frame ==== > NAs <- rep(NA,n) > W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,gdp=gdp, + resources=NAs,orgs=NAs,agroP=NAs,indust=NAs,expCom=NAs,impCom=NAs,Eusers=NAs, + ageS=NAs,gdpE=NAs,gdpO=NAs,expPar=NAs,impPar=NAs,eleS=NAs,elev=NAs) > W$resources <- RS; W$orgs <- orgs; W$agroP <- AP; W$indust <- IN > W$expCom <- EXP; W$impCom <- IMP; W$Eusers <- EU; W$ageS <- AS > W$gdpE <- GE; W$gdpO <- GO; W$expPar <- EPA; W$impPar <- IPA > W$eleS <- ES; W$elev <- elev > write(toJSON(W),"Factbook.json") > wdir <- "C:/Users/vlado/DL/data/hyper/CIA" > setwd(wdir) > library(jsonlite) > FB <- fromJSON("Factbook.json")