CIA factbook -> data frame

CIA factbook -> data frame

Data about the world's countries are available in The World Factbook. A conversion of these data into JSON format is available on GitHub (iancoleman/cia_world_factbook_api).

Inspection

Using R, we inspected the content of the JSON file and selected the interesting properties:

> wdir <- "C:/Users/vlado/DL/data/hyper/CIA"   # author's local working directory; adjust to your machine
> setwd(wdir)                                  # relative file names (e.g. "Factbook.json") resolve against wdir

> library(jsonlite)
> library(countrycode)
> # check(v): first element of v (names dropped), or NA when v is NULL,
> #   i.e. when the requested JSON property does not exist for a country
> check <- function(v) if (is.null(v)) NA else unname(v[1L])

> fb <- "https://github.com/iancoleman/cia_world_factbook_api/raw/master/data/factbook.json"
> # fb <- "factbook2020nov.json"   # alternative: a local file in the working directory
> J <- fromJSON(readLines(fb))     # read the JSON text and parse it into a nested list
> C <- names(J$countries)          # keys of the entries under J$countries, e.g. "albania"
> head(C)
> n <- length(C)                   # number of entries in the raw file (n is redefined later for the selected countries)

> str(J,max.level=2)                 # overview of the top two levels of J
> J$countries[[4]]$data$name         # access an entry by position ...
[1] "Albania"
> J$countries$albania$data$name      # ... or by its key
[1] "Albania"
> names(J$countries)                 # all entry keys
> names(J$countries$albania$data)    # sections available for one country
 [1] "name"                  "introduction"          "geography"            
 [4] "people"                "government"            "economy"              
 [7] "energy"                "communications"        "transportation"       
[10] "military_and_security" "transnational_issues" 
> names(J$countries$albania$data$energy)
> names(J$countries$albania$data$geography)
> J$countries$albania$data$geography$area$total    # scalar
> J$countries$albania$data$geography$natural_resources   # set
> J$countries$albania$data$geography$elevation     # interval
> J$countries$albania$data$geography$land_use      # modal
> names(J$countries$albania$data$people)
> J$countries$albania$data$people$population$total # scalar
> J$countries$albania$data$people$age_structure    # modal (directly under people, not under population)
> names(J$countries$albania$data$government)
> J$countries$albania$data$government$international_organization_participation  # set
> names(J$countries$albania$data$economy)
> J$countries$albania$data$economy$gdp$composition
> J$countries$albania$data$economy$gdp$composition$by_end_use           # modal
> J$countries$albania$data$economy$gdp$composition$by_sector_of_origin  # modal
> J$countries$albania$data$economy$labor_force
> J$countries$albania$data$economy$agriculture_products$products  # set
> J$countries$albania$data$economy$industries                     # set
> J$countries$albania$data$economy$exports$commodities            # set
> J$countries$albania$data$economy$imports$commodities            # set
> J$countries$albania$data$economy$exports$partners               # modal
> J$countries$albania$data$economy$imports$partners               # modal
> names(J$countries$albania$data$energy)
> J$countries$albania$data$energy$electricity$by_source           # modal

Countries

Before extracting the selected variables, we determined the corresponding ISO2 codes and world regions (as used in the factbook) for the countries.

> # English country names and ISO2 codes derived from the factbook keys
> Cnames <- countryname(C,"country.name.en")
> iso2 <- countryname(C,'iso2c')
> # keys that were not matched automatically but are kept; names and codes are set by hand
> add <- c("isle_of_man", "saint_lucia", "saint_martin", "virgin_islands")
> i <- match(add,C)     # positions in the order of add, independent of the ordering of C
> Cnames[i] <- c("Isle of man", "Saint Lucia", "Saint Martin", "Virgin Islands")
> iso2[i] <- c("IM", "LC", "MF", "VI")
> np <- c("AQ", "BV", "IO", "HM", "GS")  # unknown population
> iso2[iso2 %in% np] <- NA               # NA in iso2 marks an entry as not selected

While recognizing the countries, some values were not matched unambiguously:

akrotiri, arctic_ocean, ashmore_and_cartier_islands, atlantic_ocean, 
clipperton_island, coral_sea_islands, dhekelia, european_union, 
indian_ocean, isle_of_man, jan_mayen, navassa_island, pacific_ocean, 
paracel_islands, saint_lucia, saint_martin, southern_ocean, spratly_islands,
virgin_islands, wake_island, world

The same countries also turned out to be problematic for the conversion into ISO2 codes.

We checked them in the factbook and most of them had almost empty descriptions, except isle_of_man (IM), saint_lucia (LC), saint_martin (MF), virgin_islands (VI). We added them to the interesting countries.

In the first attempt to get the vector of total population, it turned out that the following countries

> CNames[is.na(pt)]    # names of the countries whose total population is NA
[1] "Antarctica"                             "Bouvet Island"                         
[3] "British Indian Ocean Territory"         "Heard & McDonald Islands"              
[5] "South Georgia & South Sandwich Islands"

don't have this information and also have almost empty descriptions. We decided to remove them from the interesting countries I.

Extracting variables

Scalars

Now, we can start to extract the selected variables:

> I <- which(!is.na(iso2))    # positions in C of the selected countries
> CNames <- Cnames[I]
> ISO2 <- iso2[I]
> n <- length(I)
> # Every value goes through check(): a property that is missing for a country
> # is NULL, and assigning NULL to a vector element would stop the loop.
> # regions
> reg <- rep("",n); j <- 0
> for(i in I) {j <- j+1; reg[j] <- check(J$countries[[i]]$data$geography$map_references)}
> # total area
> at <- rep(NA,n); j <- 0
> for(i in I) {j <- j+1; at[j] <- check(J$countries[[i]]$data$geography$area$total$value)}
> # total population
> pt <- rep(NA,n); j <- 0
> for(i in I) {j <- j+1; pt[j] <- check(J$countries[[i]]$data$people$population$total)}
> # GDP (purchasing power parity): the first entry of annual_values is taken
> gdp <- rep(NA,n); j <- 0
> for(i in I) {j <- j+1
+   gdp[j] <- check(J$countries[[i]]$data$economy$gdp$purchasing_power_parity$annual_values$value)}

Set variables

> # Each set is stored with X[j] <- list(v) rather than X[[j]] <- v: when a
> # property is missing v is NULL, and X[[j]] <- NULL would delete element j
> # and shorten the list instead of leaving an empty slot.
> # resources
> RS <- vector(mode="list",n); j <- 0
> for(i in I) {j <- j+1; RS[j] <- list(J$countries[[i]]$data$geography$natural_resources$resources)}
> W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,resources=rep(NA,n))
> W$resources <- RS
> # international organizations (every space in a name is replaced by "?")
> orgs <- vector(mode="list",n); j <- 0
> for(i in I) {j <- j+1
+   tp <- J$countries[[i]]$data$government$international_organization_participation$organization
+   orgs[j] <- list(gsub(" ","\\?",tp))}
> W <- data.frame(Country=CNames,orgs=rep(NA,n))
> W$orgs <- orgs
> head(W)
> # agriculture products
> AP <- vector(mode="list",n); j <- 0
> for(i in I) {j <- j+1; AP[j] <- list(J$countries[[i]]$data$economy$agriculture_products$products)}
> W <- data.frame(Country=CNames,ISO2=ISO2,agroP=rep(NA,n))
> W$agroP <- AP
> # industries
> IN <- vector(mode="list",n); j <- 0
> for(i in I) {j <- j+1; IN[j] <- list(J$countries[[i]]$data$economy$industries$industries)}
> W <- data.frame(Country=CNames,ISO2=ISO2,indust=rep(NA,n))
> W$indust <- IN
> # exports commodities
> EXP <- vector(mode="list",n); j <- 0
> for(i in I) {j <- j+1; EXP[j] <- list(J$countries[[i]]$data$economy$exports$commodities$by_commodity)}
> W <- data.frame(Country=CNames,ISO2=ISO2,expCom=rep(NA,n))
> W$expCom <- EXP
> # imports commodities
> IMP <- vector(mode="list",n); j <- 0
> for(i in I) {j <- j+1; IMP[j] <- list(J$countries[[i]]$data$economy$imports$commodities$by_commodity)}
> W <- data.frame(Country=CNames,ISO2=ISO2,impCom=rep(NA,n))
> W$impCom <- IMP
> head(W)

Some countries don't have the GDP info.

> CNames[is.na(gdp)]    # countries for which no GDP value was found
[1] "Christmas Island"        "Cocos (Keeling) Islands" "Palestinian Territories"
[4] "Vatican City"            "Norfolk Island"          "Pitcairn Islands"       
[7] "St. Barthélemy"          "Svalbard & Jan Mayen"

International organizations that contain the question mark ? in their name have a note (compliant country, correspondent, NGOs, observer, etc.).

Modal variables

Scheme for modal data

> # End users: for each country a named vector with the value of every end use
> EU <- vector(mode="list",n); j <- 0
> for(i in I) {j <- j+1
+   TU <- J$countries[[i]]$data$economy$gdp$composition$by_end_use$end_uses
+   EUnames <- names(TU); k <- length(EUnames)
+   if(k>0){
+     U <- rep(NA,k); names(U) <- EUnames
+     for(s in seq_len(k)) U[s] <- check(TU[[EUnames[s]]]$value)
+   } else U <- rep(NA,6)   # no data: 1:k would run with s = 1, 0; store six NAs instead
+   EU[[j]] <- U}
> W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,Eusers=rep(NA,n))
> W$Eusers <- EU
> head(W)
         Country ISO2  region    area      pop                              Eusers
1    Afghanistan   AF    Asia  652230 36643815  81.6, 12.0, 17.2, 30.0, 6.7, -47.6
2        Albania   AL  Europe   28748  3074579  78.1, 11.5, 25.2, 0.2, 31.5, -46.6
3        Algeria   DZ  Africa 2381740 42972878 42.7, 20.2, 38.1, 11.2, 23.6, -35.8
4 American Samoa   AS Oceania     224    49437   66.4, 49.7, 7.3, 5.1, 65.0, -93.5
5        Andorra   AD  Europe     468    77000              NA, NA, NA, NA, NA, NA
6         Angola   AO  Africa 1246700 32522339 80.6, 15.6, 10.3, -1.2, 25.4, -30.7
> W$Eusers[3]    # third row (Algeria): the stored vector keeps the end-use names
[[1]]
        household_consumption        government_consumption   investment_in_fixed_capital 
                         42.7                          20.2                          38.1 
    investment_in_inventories exports_of_goods_and_services imports_of_goods_and_services 
                         11.2                          23.6                         -35.8 
> # age structure: for each country a named vector with the percent of every age group
> AS <- vector(mode="list",n)
> j <- 0
> for(i in I) {
+   j <- j+1
+   TU <- J$countries[[i]]$data$people$age_structure
+   Unames <- names(TU)
+   k <- length(Unames)-1    # the last entry is not an age group (presumably the date; verify)
+   if(k!=5){
+     # unexpected number of groups: report position in C and count, store five NAs
+     cat(">>>",i,k,"\n")
+     U <- rep(NA,5)
+   } else {
+     U <- setNames(rep(NA,k),Unames[1:k])
+     for(s in seq_len(k)) U[s] <- check(TU[[Unames[s]]]$percent)
+   }
+   AS[[j]] <- U
+ }
> W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,ageS=rep(NA,n))
> W$ageS <- AS
> head(W)
> W[1,"ageS"]
[[1]]
    0_to_14    15_to_24    25_to_54    55_to_64 65_and_over 
      40.62       21.26       31.44        4.01        2.68 
> mis <- c( 54, 81, 105, 170, 171, 185, 223, 233 )   # positions in C; presumably copied from the ">>>" lines printed by the age-structure loop
> C[mis]                                              # keys of the entries at those positions
[1] "cocos_keeling_islands"           "falkland_islands_islas_malvinas"
[3] "holy_see_vatican_city"           "niue"                           
[5] "norfolk_island"                  "pitcairn_islands"               
[7] "svalbard"                        "tokelau"                        
> # GDP composition by end use: one named vector of values per country
> GE <- vector(mode="list",n)
> j <- 0
> for(i in I) {
+   j <- j+1
+   TU <- J$countries[[i]]$data$economy$gdp$composition$by_end_use$end_uses
+   Unames <- names(TU)
+   k <- length(Unames)
+   if(k==0){
+     # nothing recorded: report the row number and store six NAs
+     cat(">>>",j,k,"\n")
+     U <- rep(NA,6)
+   } else {
+     U <- setNames(rep(NA,k),Unames)
+     for(s in seq_len(k)) U[s] <- check(TU[[Unames[s]]]$value)
+   }
+   GE[[j]] <- U
+ }
> W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,gdpE=rep(NA,n))
> W$gdpE <- GE
> head(W)
         Country ISO2  region    area      pop                                gdpE
1    Afghanistan   AF    Asia  652230 36643815  81.6, 12.0, 17.2, 30.0, 6.7, -47.6
2        Albania   AL  Europe   28748  3074579  78.1, 11.5, 25.2, 0.2, 31.5, -46.6
3        Algeria   DZ  Africa 2381740 42972878 42.7, 20.2, 38.1, 11.2, 23.6, -35.8
> misj <- c( 5,44,45,50,70,82,88,93,103,108,112,124,
+   140,155,156,168,176,177,180,181,192,203,213,234)    # row numbers j; presumably copied from the ">>>" lines of the end-use loop
> CNames[misj]    # names of the countries at those rows
 [1] "Andorra"                 "Christmas Island"        "Cocos (Keeling) Islands"
 [4] "Cook Islands"            "Falkland Islands"        "Gibraltar"              
 [7] "Guernsey"                "Vatican City"            "Isle of man"            
[10] "Jersey"                  "Kiribati"                "Liechtenstein"          
[13] "Monaco"                  "Niue"                    "Norfolk Island"         
[16] "Pitcairn Islands"        "St. Barthélemy"          "St. Helena"             
[19] "Saint Martin"            "St. Pierre & Miquelon"   "Sint Maarten"           
[22] "Svalbard & Jan Mayen"    "Tokelau"                 "Western Sahara"         
> # GDP composition by sector of origin: one named vector of values per country
> GO <- vector(mode="list",n)
> j <- 0
> for(i in I) {
+   j <- j+1
+   TU <- J$countries[[i]]$data$economy$gdp$composition$by_sector_of_origin$sectors
+   Unames <- names(TU)
+   k <- length(Unames)
+   if(k==0){
+     # nothing recorded: report the row number and store three NAs
+     cat(">>>",j,k,"\n")
+     U <- rep(NA,3)
+   } else {
+     U <- setNames(rep(NA,k),Unames)
+     for(s in seq_len(k)) U[s] <- check(TU[[Unames[s]]]$value)
+   }
+   GO[[j]] <- U
+ }
> W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,gdpO=rep(NA,n))
> W$gdpO <- GO
> head(W)
         Country ISO2  region    area      pop             gdpO
1    Afghanistan   AF    Asia  652230 36643815 23.0, 21.1, 55.9
2        Albania   AL  Europe   28748  3074579 21.7, 24.2, 54.1
3        Algeria   DZ  Africa 2381740 42972878 13.3, 39.3, 47.4
> misj <- c(44,45,93,156,168,176,177,198,203,213,232)   # row numbers j; presumably copied from the ">>>" lines of the sector loop
> CNames[misj]    # names of the countries at those rows
 [1] "Christmas Island"        "Cocos (Keeling) Islands" "Vatican City"           
 [4] "Norfolk Island"          "Pitcairn Islands"        "St. Barthélemy"         
 [7] "St. Helena"              "South Sudan"             "Svalbard & Jan Mayen"   
[10] "Tokelau"                 "Wallis & Futuna"        
> # exports partners: for each country the percent values named by partner
> EPA <- vector(mode="list",n); j <- 0
> for(i in I) {j <- j+1
+   TU <- J$countries[[i]]$data$economy$exports$partners$by_country
+   U <- TU$percent; names(U) <- TU$name
+   EPA[j] <- list(U)}    # keeps an empty (NULL) slot; EPA[[j]] <- NULL would delete element j
> W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,expPar=rep(NA,n))
> W$expPar <- EPA
> head(W)
         Country ISO2  region    area      pop                          expPar
1    Afghanistan   AF    Asia  652230 36643815                      56.5, 29.6
2        Albania   AL  Europe   28748  3074579             53.4, 7.7, 5.6, 4.2
3        Algeria   DZ  Africa 2381740 42972878 17.4, 13.0, 11.9, 9.4, 6.2, 5.5
4 American Samoa   AS Oceania     224    49437     25.0, 19.0, 15.6, 10.4, 5.1
5        Andorra   AD  Europe     468    77000                            NULL
6         Angola   AO  Africa 1246700 32522339                 61.2, 13.0, 4.2
> W[3,"expPar"]    # third row (Algeria): percent values named by partner
[[1]]
      Italy       Spain      France          US      Brazil Netherlands 
       17.4        13.0        11.9         9.4         6.2         5.5 
> # imports partners: for each country the percent values named by partner
> IPA <- vector(mode="list",n); j <- 0
> for(i in I) {j <- j+1
+   TU <- J$countries[[i]]$data$economy$imports$partners$by_country
+   U <- TU$percent; names(U) <- TU$name
+   IPA[j] <- list(U)}    # keeps an empty (NULL) slot; IPA[[j]] <- NULL would delete element j
> W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,impPar=rep(NA,n))
> W$impPar <- IPA
> head(W)
         Country ISO2  region    area      pop                               impPar
1    Afghanistan   AF    Asia  652230 36643815     21.0, 20.5, 11.8, 11.0, 6.8, 5.3
2        Albania   AL  Europe   28748  3074579        28.5, 8.1, 8.0, 8.0, 7.9, 4.0
3        Algeria   DZ  Africa 2381740 42972878        18.2, 9.1, 8.0, 7.0, 6.9, 4.4
> W[3,"impPar"]    # third row (Algeria): percent values named by partner
[[1]]
  China  France   Italy Germany   Spain  Turkey 
   18.2     9.1     8.0     7.0     6.9     4.4  
> # electricity by source: one named vector of percent values per country
> ES <- vector(mode="list",n)
> j <- 0
> for(i in I) {
+   j <- j+1
+   TU <- J$countries[[i]]$data$energy$electricity$by_source
+   Unames <- names(TU)
+   k <- length(Unames)
+   if(k==0){
+     # nothing recorded: report the row number and store four NAs
+     cat(">>>",j,k,"\n")
+     U <- rep(NA,4)
+   } else {
+     U <- setNames(rep(NA,k),Unames)
+     for(s in seq_len(k)) U[s] <- check(TU[[Unames[s]]]$percent)
+   }
+   ES[[j]] <- U
+ }
> W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,eleS=rep(NA,n))
> W$eleS <- ES
> head(W)
         Country ISO2  region    area      pop          eleS
1    Afghanistan   AF    Asia  652230 36643815  45, 0, 52, 4
2        Albania   AL  Europe   28748  3074579   5, 0, 95, 0
3        Algeria   DZ  Africa 2381740 42972878   96, 0, 1, 2
> W[3,"eleS"]    # third row (Algeria): percent values named by source
[[1]]
           fossil_fuels           nuclear_fuels    hydroelectric_plants other_renewable_sources 
                     96                       0                       1                       2 
> misj <- c(7,44,45,55,78,88,93,103,108,124,140,156,158,162,
+   168,176,180,184,192,203,213,232)    # row numbers j; presumably copied from the ">>>" lines of the electricity loop
> CNames[misj]    # names of the countries at those rows
 [1] "Anguilla"                 "Christmas Island"         "Cocos (Keeling) Islands" 
 [4] "Curaçao"                  "Palestinian Territories"  "Guernsey"                
 [7] "Vatican City"             "Isle of man"              "Jersey"                  
[10] "Liechtenstein"            "Monaco"                   "Norfolk Island"          
[13] "Northern Mariana Islands" "Palau"                    "Pitcairn Islands"        
[16] "St. Barthélemy"           "Saint Martin"             "San Marino"              
[19] "Sint Maarten"             "Svalbard & Jan Mayen"     "Tokelau"                 
[22] "Wallis & Futuna"         
> # 220 only 3 values

Interval variables

The elevation data don't follow a uniform format

> J$countries[[4]]$data$geography$elevation    # entry 4 (Albania): highest_point is a structured record
...
$highest_point
$highest_point$name
[1] "Maja e Korabit (Golem Korab)"
$highest_point$elevation
$highest_point$elevation$value
[1] 2764
...

> J$countries[[73]]$data$geography$elevation   # entry 73: highest_point is a plain string
...
$highest_point
[1] "Chimborazo 6,267"
...

Therefore the code is more complicated

> # elevation: for each country the pair (lowest, highest) elevation
> # pointElev(p): elevation of one point record p. p is either a list holding
> #   p$elevation$value or a plain string such as "Chimborazo 6,267", whose
> #   last word is the number; the result is NA when nothing is found.
> pointElev <- function(p) {
+   if(is.character(p)) {
+     st <- unlist(strsplit(p[1]," "))
+     check(as.integer(gsub(",","",st[length(st)])))
+   } else check(p$elevation$value)
+ }
> elev <- vector(mode="list",n); j <- 0
> for(i in I) {j <- j+1
+   e <- J$countries[[i]]$data$geography$elevation
+   # both ends go through pointElev, so every entry has exactly two values
+   elev[[j]] <- c(pointElev(e$lowest_point),pointElev(e$highest_point))
+ }
> W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,elev=rep(NA,n))
> W$elev <- elev
> head(W)
         Country ISO2  region    area      pop      elev
1    Afghanistan   AF    Asia  652230 36643815 258, 7492
2        Albania   AL  Europe   28748  3074579   0, 2764
3        Algeria   DZ  Africa 2381740 42972878 -40, 2908

Data frame

> NAs <- rep(NA,n)     # placeholder for the list columns, which are filled in below
> W <- data.frame(Country=CNames,ISO2=ISO2,region=reg,area=at,pop=pt,gdp=gdp,
+ resources=NAs,orgs=NAs,agroP=NAs,indust=NAs,expCom=NAs,impCom=NAs,Eusers=NAs,
+ ageS=NAs,gdpE=NAs,gdpO=NAs,expPar=NAs,impPar=NAs,eleS=NAs,elev=NAs)
> W$resources <- RS; W$orgs <- orgs; W$agroP <- AP; W$indust <- IN     # replace placeholders by the lists built above
> W$expCom <- EXP; W$impCom <- IMP; W$Eusers <- EU; W$ageS <- AS
> W$gdpE <- GE; W$gdpO <- GO; W$expPar <- EPA; W$impPar <- IPA
> W$eleS <- ES; W$elev <- elev
> write(toJSON(W),"Factbook.json")     # written to the current working directory

> wdir <- "C:/Users/vlado/DL/data/hyper/CIA"   # author's local working directory; adjust to your machine
> setwd(wdir)
> library(jsonlite)
> FB <- fromJSON("Factbook.json")              # read back the file written with write(toJSON(W), ...)