====== CIA factbook -> data frame ======
The data about the world countries are available in [[https://www.cia.gov/the-world-factbook/|The world factbook]]. Their conversion into JSON format is available at [[https://github.com/iancoleman/cia_world_factbook_api#data|GitHub.com/iancoleman]].
===== Inspection =====
Using R, we inspected the content of the JSON file and selected the interesting properties:
> wdir <- "C:/Users/vlado/DL/data/hyper/CIA"
> setwd(wdir)
> library(jsonlite)
> library(countrycode)
> # check(v): first element of v, or NA when v is NULL or has no elements, so
> # that a missing JSON property can still be stored in a preallocated vector
> check <- function(v) if (length(v) == 0L) NA else v[1]
> # fb: address of the JSON conversion of the factbook on GitHub
> fb <- "https://github.com/iancoleman/cia_world_factbook_api/raw/master/data/factbook.json"
> # alternative source given as a plain file name (kept disabled)
> # fb <- "factbook2020nov.json"
> # read all lines from fb and parse them into the nested list J
> J <- fromJSON(readLines(fb))
> # C: keys of the country entries in J; n: their number
> C <- names(J$countries)
> head(C)
> n <- length(C)
> # overview of the two top levels of the parsed structure
> str(J,max.level=2)
> # a country entry can be addressed by position or by its key
> J$countries[[4]]$data$name
[1] "Albania"
> J$countries$albania$data$name
[1] "Albania"
> names(J$countries)
> # sections of a single country description
> names(J$countries$albania$data)
[1] "name" "introduction" "geography"
[4] "people" "government" "economy"
[7] "energy" "communications" "transportation"
[10] "military_and_security" "transnational_issues"
> names(J$countries$albania$data$energy)
> names(J$countries$albania$data$geography)
> # the trailing comments record the kind of variable: scalar, set, interval or modal
> J$countries$albania$data$geography$area$total # scalar
> J$countries$albania$data$geography$natural_resources # set
> J$countries$albania$data$geography$elevation # interval
> J$countries$albania$data$geography$land_use # modal
> names(J$countries$albania$data$people)
> J$countries$albania$data$people$population$total # scalar
> # age_structure is read directly from people, as in the extraction of ageS below
> J$countries$albania$data$people$age_structure # modal
> names(J$countries$albania$data$government)
> J$countries$albania$data$government$international_organization_participation # set
> names(J$countries$albania$data$economy)
> J$countries$albania$data$economy$gdp$composition
> J$countries$albania$data$economy$gdp$composition$by_end_use # modal
> J$countries$albania$data$economy$gdp$composition$by_sector_of_origin # modal
> J$countries$albania$data$economy$labor_force
> J$countries$albania$data$economy$agriculture_products$products # set
> J$countries$albania$data$economy$industries # set
> J$countries$albania$data$economy$exports$commodities # set
> J$countries$albania$data$economy$imports$commodities # set
> J$countries$albania$data$economy$exports$partners # modal
> J$countries$albania$data$economy$imports$partners # modal
> names(J$countries$albania$data$energy)
> J$countries$albania$data$energy$electricity$by_source # modal
===== Countries =====
Before extracting the selected variables, we determined for each country the corresponding ISO2 code and world region (as used in the factbook).
> # English country names and ISO2 codes for the factbook country keys
> Cnames <- countryname(C,"country.name.en")
> iso2 <- countryname(C,'iso2c')
> # keys that are filled in by hand
> add <- c("isle_of_man", "saint_lucia", "saint_martin", "virgin_islands")
> # match() returns the positions in the order of add, so the names and codes
> # below stay paired with their keys whatever the order of C is
> i <- match(add,C)
> Cnames[i] <- c("Isle of man", "Saint Lucia", "Saint Martin", "Virgin Islands")
> iso2[i] <- c("IM", "LC", "MF", "VI")
> np <- c("AQ", "BV", "IO", "HM", "GS") # unknown population
> # dropping the code excludes these entries from the index I built later
> iso2[iso2 %in% np] <- NA
When recognizing the countries, the following names could not be matched unambiguously:
akrotiri, arctic_ocean, ashmore_and_cartier_islands, atlantic_ocean,
clipperton_island, coral_sea_islands, dhekelia, european_union,
indian_ocean, isle_of_man, jan_mayen, navassa_island, pacific_ocean,
paracel_islands, saint_lucia, saint_martin, southern_ocean, spratly_islands,
virgin_islands, wake_island, world
The same countries also turned out to be problematic for the conversion into ISO2 codes.
We checked them in the factbook and most of them had almost empty descriptions, except ''isle_of_man (IM)'', ''saint_lucia (LC)'', ''saint_martin (MF)'', ''virgin_islands (VI)''. We added them to the interesting countries.
In the first attempt to get the vector of ''total population'', it turned out that the following countries
> CNames[is.na(pt)]
[1] "Antarctica" "Bouvet Island"
[3] "British Indian Ocean Territory" "Heard & McDonald Islands"
[5] "South Georgia & South Sandwich Islands"
don't have this information and also have almost empty descriptions. We decided to remove them from the interesting countries ''I''.
===== Extracting variables =====
==== Scalars ====
Now, we can start to extract the selected variables:
> # I: positions (in C) of the entries that have an ISO2 code; n: their number
> I <- which(!is.na(iso2))
> n <- length(I)
> CNames <- Cnames[I]
> ISO2 <- iso2[I]
> # regions
> reg <- character(n)
> for(j in seq_along(I)) {i <- I[j]
+   geo <- J$countries[[i]]$data$geography
+   reg[j] <- geo$map_references}
> # total area
> at <- rep(NA,n)
> for(j in seq_along(I)) {i <- I[j]
+   geo <- J$countries[[i]]$data$geography
+   at[j] <- geo$area$total$value}
> # total population
> pt <- rep(NA,n)
> for(j in seq_along(I)) {i <- I[j]
+   ppl <- J$countries[[i]]$data$people
+   pt[j] <- ppl$population$total}
> # GDP: only the first annual value is kept, check() gives NA where there is none
> gdp <- rep(NA,n)
> for(j in seq_along(I)) {i <- I[j]
+   ppp <- J$countries[[i]]$data$economy$gdp$purchasing_power_parity
+   gdp[j] <- check(ppp$annual_values$value)}
==== Set variables ====
> # Set variables: one vector of values per country, collected in a list of length n.
> # The values are stored with X[j] <- list(v) and not with X[[j]] <- v, because
> # assigning NULL through [[<- deletes the element and shortens the list; a country
> # without the property would then lose its slot and W$col <- X could fail.
> # resources
> RS <- vector(mode="list",n); j <- 0
> for(i in I) {j <- j+1; RS[j] <- list(J$countries[[i]]$data$geography$natural_resources$resources)}
> W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,resources=rep(NA,n))
> W$resources <- RS
> # international organizations
> orgs <- vector(mode="list",n); j <- 0
> for(i in I) {j <- j+1
+ tp <- J$countries[[i]]$data$government$international_organization_participation$organization
+ # every space in an organization entry is replaced by a question mark
+ orgs[j] <- list(gsub(" ","\\?",tp))}
> W <- data.frame(Country=CNames,orgs=rep(NA,n))
> W$orgs <- orgs
> head(W)
> # agriculture products
> AP <- vector(mode="list",n); j <- 0
> for(i in I) {j <- j+1; AP[j] <- list(J$countries[[i]]$data$economy$agriculture_products$products)}
> W <- data.frame(Country=CNames,ISO2=ISO2,agroP=rep(NA,n))
> W$agroP <- AP
> # industries
> IN <- vector(mode="list",n); j <- 0
> for(i in I) {j <- j+1; IN[j] <- list(J$countries[[i]]$data$economy$industries$industries)}
> W <- data.frame(Country=CNames,ISO2=ISO2,indust=rep(NA,n))
> W$indust <- IN
> # exports commodities
> EXP <- vector(mode="list",n); j <- 0
> for(i in I) {j <- j+1; EXP[j] <- list(J$countries[[i]]$data$economy$exports$commodities$by_commodity)}
> W <- data.frame(Country=CNames,ISO2=ISO2,expCom=rep(NA,n))
> W$expCom <- EXP
> # imports commodities
> IMP <- vector(mode="list",n); j <- 0
> for(i in I) {j <- j+1; IMP[j] <- list(J$countries[[i]]$data$economy$imports$commodities$by_commodity)}
> W <- data.frame(Country=CNames,ISO2=ISO2,impCom=rep(NA,n))
> W$impCom <- IMP
> head(W)
Some countries don't have the GDP info.
> CNames[which(is.na(gdp))]
[1] "Christmas Island" "Cocos (Keeling) Islands" "Palestinian Territories"
[4] "Vatican City" "Norfolk Island" "Pitcairn Islands"
[7] "St. Barthélemy" "Svalbard & Jan Mayen"
International organizations whose name contains a question mark ''?'' carry a note (compliant country, correspondent, NGOs, observer, etc.).
==== Modal variables ====
Scheme for modal data
> # End users: for each country a named vector with the value of every end use
> EU <- vector(mode="list",n); j <- 0
> for(i in I) {j <- j+1;
+ TU <- J$countries[[i]]$data$economy$gdp$composition$by_end_use$end_uses
+ EUnames <- names(TU); k <- length(EUnames)
+ if(k>0){
+ U <- rep(NA,k); names(U) <- EUnames
+ for(s in 1:k) U[s] <- check(TU[[EUnames[s]]]$value)
+ # without data 1:k would run over c(1,0) and leave a single NA; keep six NAs instead
+ } else { U <- rep(NA,6) }
+ EU[[j]] <- U}
> W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,Eusers=rep(NA,n))
> W$Eusers <- EU
> head(W)
Country ISO2 region area pop Eusers
1 Afghanistan AF Asia 652230 36643815 81.6, 12.0, 17.2, 30.0, 6.7, -47.6
2 Albania AL Europe 28748 3074579 78.1, 11.5, 25.2, 0.2, 31.5, -46.6
3 Algeria DZ Africa 2381740 42972878 42.7, 20.2, 38.1, 11.2, 23.6, -35.8
4 American Samoa AS Oceania 224 49437 66.4, 49.7, 7.3, 5.1, 65.0, -93.5
5 Andorra AD Europe 468 77000 NA, NA, NA, NA, NA, NA
6 Angola AO Africa 1246700 32522339 80.6, 15.6, 10.3, -1.2, 25.4, -30.7
> W$Eusers[3]
[[1]]
household_consumption government_consumption investment_in_fixed_capital
42.7 20.2 38.1
investment_in_inventories exports_of_goods_and_services imports_of_goods_and_services
11.2 23.6 -35.8
> # age structure: percent of the population in each of the five age groups
> AS <- vector(mode="list",n)
> for(j in seq_along(I)) {i <- I[j]
+   ages <- J$countries[[i]]$data$people$age_structure
+   labs <- names(ages); k <- length(labs)-1
+   if(k!=5){
+     # unexpected layout: print the position in C and the count, keep five NAs
+     shares <- rep(NA,5); cat(">>>",i,k,"\n")
+   } else {
+     # the last entry of the structure is left out; the first five are the groups
+     shares <- rep(NA,k); names(shares) <- labs[seq_len(k)]
+     for(s in seq_len(k)) shares[s] <- check(ages[[labs[s]]]$percent)
+   }
+   AS[[j]] <- shares}
> W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt)
> W$ageS <- AS
> head(W)
> W[1,"ageS"]
[[1]]
0_to_14 15_to_24 25_to_54 55_to_64 65_and_over
40.62 21.26 31.44 4.01 2.68
> mis <- c( 54, 81, 105, 170, 171, 185, 223, 233 )
> C[mis]
[1] "cocos_keeling_islands" "falkland_islands_islas_malvinas"
[3] "holy_see_vatican_city" "niue"
[5] "norfolk_island" "pitcairn_islands"
[7] "svalbard" "tokelau"
> # gdp by end use: for each country a named vector with one value per end use
> GE <- vector(mode="list",n)
> for(j in seq_along(I)) {i <- I[j]
+   uses <- J$countries[[i]]$data$economy$gdp$composition$by_end_use$end_uses
+   labs <- names(uses); k <- length(labs)
+   if(k==0){
+     # nothing reported: print the row number and keep six NAs
+     shares <- rep(NA,6); cat(">>>",j,k,"\n")
+   } else {
+     shares <- rep(NA,k); names(shares) <- labs
+     for(s in seq_len(k)) shares[s] <- check(uses[[labs[s]]]$value)
+   }
+   GE[[j]] <- shares}
> W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt)
> W$gdpE <- GE
> head(W)
Country ISO2 region area pop gdpE
1 Afghanistan AF Asia 652230 36643815 81.6, 12.0, 17.2, 30.0, 6.7, -47.6
2 Albania AL Europe 28748 3074579 78.1, 11.5, 25.2, 0.2, 31.5, -46.6
3 Algeria DZ Africa 2381740 42972878 42.7, 20.2, 38.1, 11.2, 23.6, -35.8
> misj <- c( 5,44,45,50,70,82,88,93,103,108,112,124,
+ 140,155,156,168,176,177,180,181,192,203,213,234)
> CNames[misj]
[1] "Andorra" "Christmas Island" "Cocos (Keeling) Islands"
[4] "Cook Islands" "Falkland Islands" "Gibraltar"
[7] "Guernsey" "Vatican City" "Isle of man"
[10] "Jersey" "Kiribati" "Liechtenstein"
[13] "Monaco" "Niue" "Norfolk Island"
[16] "Pitcairn Islands" "St. Barthélemy" "St. Helena"
[19] "Saint Martin" "St. Pierre & Miquelon" "Sint Maarten"
[22] "Svalbard & Jan Mayen" "Tokelau" "Western Sahara"
> # gdp by sector of origin: for each country a named vector with one value per sector
> GO <- vector(mode="list",n)
> for(j in seq_along(I)) {i <- I[j]
+   sect <- J$countries[[i]]$data$economy$gdp$composition$by_sector_of_origin$sectors
+   labs <- names(sect); k <- length(labs)
+   if(k==0){
+     # nothing reported: print the row number and keep three NAs
+     shares <- rep(NA,3); cat(">>>",j,k,"\n")
+   } else {
+     shares <- rep(NA,k); names(shares) <- labs
+     for(s in seq_len(k)) shares[s] <- check(sect[[labs[s]]]$value)
+   }
+   GO[[j]] <- shares}
> W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt)
> W$gdpO <- GO
> head(W)
Country ISO2 region area pop gdpO
1 Afghanistan AF Asia 652230 36643815 23.0, 21.1, 55.9
2 Albania AL Europe 28748 3074579 21.7, 24.2, 54.1
3 Algeria DZ Africa 2381740 42972878 13.3, 39.3, 47.4
> misj <- c(44,45,93,156,168,176,177,198,203,213,232)
> CNames[misj]
[1] "Christmas Island" "Cocos (Keeling) Islands" "Vatican City"
[4] "Norfolk Island" "Pitcairn Islands" "St. Barthélemy"
[7] "St. Helena" "South Sudan" "Svalbard & Jan Mayen"
[10] "Tokelau" "Wallis & Futuna"
> # exports partners: for each country the percents, named by the partner
> EPA <- vector(mode="list",n); j <- 0
> for(i in I) {j <- j+1;
+ TU <- J$countries[[i]]$data$economy$exports$partners$by_country
+ U <- TU$percent; names(U) <- TU$name
+ # U is NULL when no partners are given; EPA[[j]] <- NULL would delete the
+ # element and shorten the list, so the value is stored wrapped in a list
+ EPA[j] <- list(U)}
> W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,expPar=rep(NA,n))
> W$expPar <- EPA
> head(W)
Country ISO2 region area pop expPar
1 Afghanistan AF Asia 652230 36643815 56.5, 29.6
2 Albania AL Europe 28748 3074579 53.4, 7.7, 5.6, 4.2
3 Algeria DZ Africa 2381740 42972878 17.4, 13.0, 11.9, 9.4, 6.2, 5.5
4 American Samoa AS Oceania 224 49437 25.0, 19.0, 15.6, 10.4, 5.1
5 Andorra AD Europe 468 77000 NULL
6 Angola AO Africa 1246700 32522339 61.2, 13.0, 4.2
> W[3,"expPar"]
[[1]]
Italy Spain France US Brazil Netherlands
17.4 13.0 11.9 9.4 6.2 5.5
> # imports partners: for each country the percents, named by the partner
> IPA <- vector(mode="list",n); j <- 0
> for(i in I) {j <- j+1;
+ TU <- J$countries[[i]]$data$economy$imports$partners$by_country
+ U <- TU$percent; names(U) <- TU$name
+ # U is NULL when no partners are given; IPA[[j]] <- NULL would delete the
+ # element and shorten the list, so the value is stored wrapped in a list
+ IPA[j] <- list(U)}
> W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,impPar=rep(NA,n))
> W$impPar <- IPA
> head(W)
Country ISO2 region area pop impPar
1 Afghanistan AF Asia 652230 36643815 21.0, 20.5, 11.8, 11.0, 6.8, 5.3
2 Albania AL Europe 28748 3074579 28.5, 8.1, 8.0, 8.0, 7.9, 4.0
3 Algeria DZ Africa 2381740 42972878 18.2, 9.1, 8.0, 7.0, 6.9, 4.4
> W[3,"impPar"]
[[1]]
China France Italy Germany Spain Turkey
18.2 9.1 8.0 7.0 6.9 4.4
> # electricity$by_source: for each country a named vector with the percent per source
> ES <- vector(mode="list",n)
> for(j in seq_along(I)) {i <- I[j]
+   src <- J$countries[[i]]$data$energy$electricity$by_source
+   labs <- names(src); k <- length(labs)
+   if(k==0){
+     # nothing reported: print the row number and keep four NAs
+     shares <- rep(NA,4); cat(">>>",j,k,"\n")
+   } else {
+     shares <- rep(NA,k); names(shares) <- labs
+     for(s in seq_len(k)) shares[s] <- check(src[[labs[s]]]$percent)
+   }
+   ES[[j]] <- shares}
> W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt)
> W$eleS <- ES
> head(W)
Country ISO2 region area pop eleS
1 Afghanistan AF Asia 652230 36643815 45, 0, 52, 4
2 Albania AL Europe 28748 3074579 5, 0, 95, 0
3 Algeria DZ Africa 2381740 42972878 96, 0, 1, 2
> W[3,"eleS"]
[[1]]
fossil_fuels nuclear_fuels hydroelectric_plants other_renewable_sources
96 0 1 2
> misj <- c(7,44,45,55,78,88,93,103,108,124,140,156,158,162,
+ 168,176,180,184,192,203,213,232)
> CNames[misj]
[1] "Anguilla" "Christmas Island" "Cocos (Keeling) Islands"
[4] "Curaçao" "Palestinian Territories" "Guernsey"
[7] "Vatican City" "Isle of man" "Jersey"
[10] "Liechtenstein" "Monaco" "Norfolk Island"
[13] "Northern Mariana Islands" "Palau" "Pitcairn Islands"
[16] "St. Barthélemy" "Saint Martin" "San Marino"
[19] "Sint Maarten" "Svalbard & Jan Mayen" "Tokelau"
[22] "Wallis & Futuna"
> # 220 only 3 values
==== Interval variables ====
The elevation data don't follow a uniform format
> J$countries[[4]]$data$geography$elevation
...
$highest_point
$highest_point$name
[1] "Maja e Korabit (Golem Korab)"
$highest_point$elevation
$highest_point$elevation$value
[1] 2764
...
> J$countries[[73]]$data$geography$elevation
...
$highest_point
[1] "Chimborazo 6,267"
...
Therefore, the code is more complicated:
> # elevation: for each country the interval c(lowest, highest)
> elev <- vector(mode="list",n); j <- 0
> for(i in I) {j <- j+1;
+ el <- J$countries[[i]]$data$geography$elevation
+ hp <- el$highest_point # named hp so that base::t is not masked
+ if(is.character(hp)){
+ # free-text form such as "Chimborazo 6,267": the last token is the height
+ st <- unlist(strsplit(hp," "))
+ lM <- as.integer(gsub(",","",st[length(st)]))
+ } else {
+ # structured form; check() keeps an NA when the value is absent, so the
+ # interval always has two elements
+ lM <- check(hp$elevation$value)
+ }
+ lm <- check(el$lowest_point$elevation$value)
+ elev[[j]] <- c(lm,lM)
+ }
> W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,elev=rep(NA,n))
> W$elev <- elev
> head(W)
Country ISO2 region area pop elev
1 Afghanistan AF Asia 652230 36643815 258, 7492
2 Albania AL Europe 28748 3074579 0, 2764
3 Algeria DZ Africa 2381740 42972878 -40, 2908
==== Data frame ====
> # final data frame: scalar columns first, then one placeholder per list column
> NAs <- rep(NA,n)
> W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,gdp=gdp,
+ resources=NAs,orgs=NAs,agroP=NAs,indust=NAs,expCom=NAs,impCom=NAs,
+ Eusers=NAs,ageS=NAs,gdpE=NAs,gdpO=NAs,expPar=NAs,impPar=NAs,
+ eleS=NAs,elev=NAs)
> # set variables
> W$resources <- RS
> W$orgs <- orgs
> W$agroP <- AP
> W$indust <- IN
> W$expCom <- EXP
> W$impCom <- IMP
> # modal variables
> W$Eusers <- EU
> W$ageS <- AS
> W$gdpE <- GE
> W$gdpO <- GO
> W$expPar <- EPA
> W$impPar <- IPA
> W$eleS <- ES
> # interval variable
> W$elev <- elev
> # save the data frame as JSON
> write(toJSON(W),file="Factbook.json")
> # reading the saved data frame back
> wdir <- "C:/Users/vlado/DL/data/hyper/CIA"
> setwd(wdir)
> library(jsonlite)
> FB <- fromJSON("Factbook.json")