Merge pull request #363 from PMassicotte/missing-country-codes

Add some missing country codes (#166).
PMassicotte · Aug 20, 2020 · 5c95f9a · 5c95f9a
2 parents 75d0732 + 3ac69ee
commit 5c95f9a
Show file tree

Hide file tree

Showing 11 changed files with 140 additions and 87 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -15,6 +15,6 @@ URL: https://github.com/PMassicotte/gtrendsR
 Depends: R (>= 3.2.0)
 LazyData: yes
 Imports: ggplot2, jsonlite, anytime, curl
-RoxygenNote: 7.1.0
+RoxygenNote: 7.1.1
 Suggests: knitr, rmarkdown, tinytest
 Encoding: UTF-8
diff --git a/NEWS.md b/NEWS.md
@@ -4,6 +4,8 @@
 
 - Fixes subsetting in extract_related_topics and data processing for Namibia (#353) @joachim-gassen
 
+- Add more country codes. (#166)
+
 # gtrendsR 1.4.6
 
 - Fix an issue when there was no "rising" data returned for the related topics. Some tests were failing due to this issue and causing errors on CRAN (#347).

diff --git a/R/countries.R b/R/countries.R
@@ -10,6 +10,6 @@
 #' @keywords datasets
 #' @name countries
 #' @usage data("countries")
-#' @format A data frame with 122845 rows and 3 variables
+#' @format A data frame with 117293 rows and 3 variables
 #' @references \url{http://www.unece.org/cefact/codesfortrade/codes_index.html}
 NULL
diff --git a/R/sysdata.rda b/R/sysdata.rda
diff --git a/data-raw/categories.R b/data-raw/categories.R
@@ -0,0 +1,17 @@
+# Build the categories lookup table from the `extdata/categories.json` file
+# shipped with the installed gtrendsR package.
+#
+# Returns a data frame with two columns: `name` (transliterated to ASCII) and
+# `id` (character). Rows containing NA are dropped.
+get_categories <- function() {
+  json_path <- system.file("extdata", "categories.json", package = "gtrendsR")
+
+  # Parse as nested lists (no data-frame simplification) and wrap as a tree
+  tree <- as.Node(fromJSON(json_path, simplifyDataFrame = FALSE))
+
+  # Flatten the tree, drop incomplete rows, keep only the two useful columns
+  flat <- na.omit(ToDataFrameTree(tree, "name", "id"))
+  out <- flat[, c("name", "id")]
+
+  out$name <- iconv(out$name, to = "ASCII//TRANSLIT")
+  out$id <- as.character(out$id)
+
+  out
+}
diff --git a/data-raw/country_codes.R b/data-raw/country_codes.R
@@ -0,0 +1,103 @@
+# Build the table of country / sub-region codes used by the package.
+#
+# Three sources are combined:
+#   1. the ISO location codes CSV from the
+#      CharlotteWoolley/Comprehensive_ISO_Location_Codes repository,
+#   2. US metro names taken from the Google AdWords location spreadsheet,
+#   3. the geo picker served by Google Trends.
+#
+# Needs network access and the readxl, curl, jsonlite and data.table packages.
+# Stops with an error if any of the downloads fails.
+#
+# Returns a data frame with duplicated rows removed.
+# NOTE(review): rbind() matches data frame columns by name, so this assumes
+# the CSV header is exactly `country_code`, `sub_code`, `name` -- confirm.
+get_countries <- function() {
+  csv_path <- tempfile(fileext = ".csv")
+
+  status <- download.file(
+    "https://raw.githubusercontent.com/CharlotteWoolley/Comprehensive_ISO_Location_Codes/master/ISO_codes.csv",
+    destfile = csv_path
+  )
+
+  # download.file() returns 0 on success
+  stopifnot(status == 0)
+
+  # Only empty fields are treated as missing, so a literal "NA" (Namibia's
+  # country code) is kept as text.
+  countries <- read.csv(csv_path, na.strings = "", stringsAsFactors = FALSE)
+
+  # Transliterate every column to ASCII. Assigning through `[]` keeps the data
+  # frame shape even when the CSV holds a single row (sapply() would collapse
+  # that case to a vector).
+  countries[] <- lapply(countries, iconv, to = "ASCII//TRANSLIT")
+
+  # *************************************************************************
+  # USA metro codes
+  # *************************************************************************
+
+  xlsx_path <- file.path(tempdir(), "dma.xlsx")
+
+  # mode = "wb": the spreadsheet is a binary file and a text-mode download
+  # corrupts it on Windows.
+  status <- download.file(
+    "https://www.google.com/help/hc/downloads/ds3/Location-Language-Codes-AdWords.xlsx",
+    destfile = xlsx_path,
+    mode = "wb"
+  )
+  stopifnot(status == 0)
+
+  usa <- readxl::read_excel(
+    xlsx_path,
+    skip = 1,
+    .name_repair = "minimal"
+  )[, 11:14]
+
+  # Two-character token following ", " in the metro name; NA when the pattern
+  # is absent or the metro name itself is missing.
+  state <- vapply(
+    regmatches(usa$Metro, regexec(", (\\S{2})", usa$Metro)),
+    "[",
+    character(1),
+    2
+  )
+
+  # Keep a missing state as NA: paste() alone would produce the literal string
+  # "US-NA", which na.omit() below cannot remove.
+  sub_code <- ifelse(
+    is.na(state),
+    NA_character_,
+    paste("US", state, sep = "-")
+  )
+
+  usa <- data.frame(
+    country_code = "US",
+    name = usa$Metro,
+    sub_code = sub_code,
+    stringsAsFactors = FALSE
+  )
+
+  usa <- na.omit(usa)
+
+  # *************************************************************************
+  # More country codes from Google
+  # *************************************************************************
+
+  url <- "https://trends.google.com/trends/api/explore/pickers/geo"
+  resp <- curl::curl_fetch_memory(url)
+  stopifnot(resp$status_code == 200)
+
+  # Mark the payload as UTF-8 so non-ASCII names (e.g. "österreich") survive
+  payload <- rawToChar(resp$content)
+  Encoding(payload) <- "UTF-8"
+
+  # Parsing starts at character 6, i.e. the first five characters are skipped.
+  # NOTE(review): presumably a non-JSON prefix added by the API -- confirm.
+  geo <- jsonlite::fromJSON(substring(payload, first = 6))
+
+  # NOTE(review): the columns of `geo$children` are read by position here;
+  # the code further down uses `$children` and `$name`, so positions 1-3 are
+  # presumably children / name / id -- confirm against the API response.
+  country_names <- data.frame(
+    country_name = geo$children[[2]],
+    country_code = geo$children[[3]],
+    stringsAsFactors = FALSE
+  )
+
+  # Positions of the countries that come with a list of sub-regions
+  has_children <- vapply(
+    geo$children[[1]],
+    function(x) !is.null(x),
+    logical(1)
+  )
+  i <- which(has_children)
+
+  regions <- geo$children$children[i]
+
+  # An element with exactly two names is returned untouched; anything else has
+  # its nested `children` tables stacked with rbind().
+  extract_df <- function(l) {
+    if (length(names(l)) == 2) {
+      return(l)
+    }
+    do.call(rbind, l$children)
+  }
+
+  per_country <- lapply(regions, extract_df)
+  names(per_country) <- geo$children$name[i]
+
+  countries2 <- data.table::rbindlist(per_country, idcol = "country_name")
+  names(countries2) <- c("country_name", "name", "code")
+
+  countries2 <- merge(country_names, countries2, by = "country_name")
+  countries2$sub_code <- paste(
+    countries2$country_code,
+    countries2$code,
+    sep = "-"
+  )
+
+  countries2 <- countries2[, c("country_code", "sub_code", "name")]
+  countries2$name <- toupper(countries2$name)
+
+  # *************************************************************************
+  # Merge together
+  # *************************************************************************
+  countries <- rbind(countries, usa, countries2)
+
+  countries[!duplicated(countries), ]
+}
diff --git a/data-raw/language_codes.R b/data-raw/language_codes.R
@@ -0,0 +1,11 @@
+# Scrape the language code table from lingoes.net.
+#
+# Returns the first <table> found on the page as a data frame, with its
+# column names lower-cased. Needs network access and the rvest functions
+# read_html(), html_nodes() and html_table() to be attached.
+get_language_codes <- function() {
+  page <- read_html("http://www.lingoes.net/en/translator/langcode.htm")
+
+  # Convert every <table> node, then keep only the first one
+  tables <- html_table(html_nodes(page, "table"), header = TRUE)
+  codes <- tables[[1]]
+
+  names(codes) <- tolower(names(codes))
+
+  codes
+}
diff --git a/data-raw/process_data.R b/data-raw/process_data.R
@@ -2,91 +2,11 @@ library(rvest)
 library(jsonlite)
 library(data.tree)
 
-# Process categories data
-get_categories <- function() {
-  file <- system.file("extdata", "categories.json", package = "gtrendsR")
+source("data-raw/categories.R")
+source("data-raw/language_codes.R")
+source("data-raw/country_codes.R")
 
-  res <- fromJSON(file, simplifyDataFrame = FALSE)
-
-  res <- as.Node(res)
-
-  categories <- ToDataFrameTree(res, "name", "id")
-  categories <- na.omit(categories)
-  categories <- categories[, c("name", "id")]
-  categories$name <- iconv(categories$name, to = "ASCII//TRANSLIT")
-  categories$id <- as.character(categories$id)
-
-  return(categories)
-}
-
-# Process countries
-get_countries <- function() {
-  destfile <- tempfile(fileext = ".csv")
-
-  ret <-
-    download.file(
-      "https://raw.githubusercontent.com/CharlotteWoolley/Comprehensive_ISO_Location_Codes/master/ISO_codes.csv",
-      destfile = destfile
-    )
-
-  # Was the file found?
-  stopifnot(ret == 0)
-
-  # , col.names = c("country_code", "sub_code", "country")
-  countries <- read.csv(destfile, na.strings = "")
-
-  # Fix the encoding
-  countries <- data.frame(sapply(countries, iconv, to = "ASCII//TRANSLIT"))
-
-  # *************************************************************************
-  # USA metro codes
-  # *************************************************************************
-
-  dir <- tempdir()
-  destfile <- paste0(dir, "/dma.xlsx")
-
-  file <- download.file(
-    "www.google.com/help/hc/downloads/ds3/Location-Language-Codes-AdWords.xlsx",
-    destfile = destfile
-  )
-
-  usa <- readxl::read_excel(destfile, skip = 1)
-  # usa <- na.omit(usa[, c(8, 10, 11)])
-
-  usa <- data.frame(
-    country_code = "US",
-    description = usa$Metro,
-    sub_code = paste(
-      "US",
-      lapply(regmatches(usa$Metro, regexec(", (\\S{2})", usa$Metro)), "[", 2),
-      usa$`Metro code`,
-      sep = "-"
-    )
-  )
-
-  usa <- na.omit(usa)
-  usa <- usa[, c(1, 3, 2)]
-  names(usa) <- names(countries)
-
-  # *************************************************************************
-  # Merge together
-  # *************************************************************************
-  countries <- rbind(countries, usa)
-
-  return(countries)
-}
-
-get_language_codes <- function() {
-  url <- "http://www.lingoes.net/en/translator/langcode.htm"
-
-  webpage <- read_html(url)
-  language_codes <- html_nodes(webpage, "table")
-  language_codes <- html_table(language_codes, header = TRUE)[[1]]
-
-  names(language_codes) <- tolower(names(language_codes))
-
-  return(language_codes)
-}
+# Extract and save the data -----------------------------------------------
 
 countries <- get_countries()
 categories <- get_categories()

diff --git a/data/categories.rda b/data/categories.rda
diff --git a/data/countries.rda b/data/countries.rda
diff --git a/man/countries.Rd b/man/countries.Rd