Skip to content

Commit

Permalink
Merge pull request #363 from PMassicotte/missing-country-codes
Browse files Browse the repository at this point in the history
Add some missing country codes (#166).
  • Loading branch information
PMassicotte authored Aug 20, 2020
2 parents 75d0732 + 3ac69ee commit 5c95f9a
Show file tree
Hide file tree
Showing 11 changed files with 140 additions and 87 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,6 @@ URL: https://github.com/PMassicotte/gtrendsR
Depends: R (>= 3.2.0)
LazyData: yes
Imports: ggplot2, jsonlite, anytime, curl
RoxygenNote: 7.1.0
RoxygenNote: 7.1.1
Suggests: knitr, rmarkdown, tinytest
Encoding: UTF-8
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

- Fixes subsetting in extract_related_topics and data processing for Namibia (#353) @joachim-gassen

- Add more country codes. (#166)

# gtrendsR 1.4.6

- Fix an issue when there was no "rising" data returned for the related topics. Some tests were failing due to this issue and causing errors on CRAN (#347).
Expand Down
2 changes: 1 addition & 1 deletion R/countries.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@
#' @keywords datasets
#' @name countries
#' @usage data("countries")
#' @format A data frame with 122845 rows and 3 variables
#' @format A data frame with 117293 rows and 3 variables
#' @references \url{http://www.unece.org/cefact/codesfortrade/codes_index.html}
NULL
Binary file modified R/sysdata.rda
Binary file not shown.
17 changes: 17 additions & 0 deletions data-raw/categories.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Process categories data
#
# Reads the "categories.json" file from the "extdata" folder of the gtrendsR
# package, flattens the category tree and returns a data frame with two
# character columns: "name" (transliterated to ASCII) and "id".
get_categories <- function() {
  json_path <- system.file("extdata", "categories.json", package = "gtrendsR")

  # Parse the JSON as nested lists and convert the result with as.Node().
  tree <- as.Node(fromJSON(json_path, simplifyDataFrame = FALSE))

  # One row per node; rows containing NA are dropped.
  flat <- na.omit(ToDataFrameTree(tree, "name", "id"))
  flat <- flat[, c("name", "id")]

  flat$name <- iconv(flat$name, to = "ASCII//TRANSLIT")
  flat$id <- as.character(flat$id)

  flat
}
103 changes: 103 additions & 0 deletions data-raw/country_codes.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Process countries
#
# Builds the table of location codes from three online sources:
#   1. an ISO location-code CSV hosted on GitHub,
#   2. the "Metro" column of Google's AdWords location spreadsheet (US),
#   3. the Google Trends geo picker JSON.
# The three tables are row-bound and exact duplicate rows are removed.
#
# Takes no arguments. Needs network access plus the readxl, curl, jsonlite
# and data.table packages. Stops with an error when a download reports a
# non-zero status or the geo picker does not answer with HTTP 200.
#
# Returns a data frame of location codes.
get_countries <- function() {
  iso_file <- tempfile(fileext = ".csv")
  dma_file <- file.path(tempdir(), "dma.xlsx")
  # Do not leave the downloaded files behind in the session temp directory.
  on.exit(unlink(c(iso_file, dma_file)), add = TRUE)

  # *************************************************************************
  # ISO codes
  # *************************************************************************

  ret <- download.file(
    "https://raw.githubusercontent.com/CharlotteWoolley/Comprehensive_ISO_Location_Codes/master/ISO_codes.csv",
    destfile = iso_file
  )

  # Was the file found?
  stopifnot(ret == 0)

  countries <- read.csv(iso_file, na.strings = "")

  # Fix the encoding. lapply() (rather than sapply()) keeps one column per
  # variable even when the file holds a single row.
  countries <- data.frame(lapply(countries, iconv, to = "ASCII//TRANSLIT"))

  # *************************************************************************
  # USA metro codes
  # *************************************************************************

  # The URL needs an explicit scheme, and the spreadsheet is a binary file,
  # so it must be fetched in "wb" mode to stay intact on Windows.
  ret <- download.file(
    "https://www.google.com/help/hc/downloads/ds3/Location-Language-Codes-AdWords.xlsx",
    destfile = dma_file,
    mode = "wb"
  )
  stopifnot(ret == 0)

  usa <- readxl::read_excel(dma_file, skip = 1, .name_repair = "minimal")[, 11:14]

  # Two non-space characters following ", " in the metro name; NA when the
  # name does not contain that pattern.
  state <- vapply(
    regmatches(usa$Metro, regexec(", (\\S{2})", usa$Metro)),
    function(m) m[2],
    character(1)
  )

  usa <- data.frame(
    country_code = "US",
    name = usa$Metro,
    # paste() turns a missing state into the string "NA", so such rows are
    # not removed by na.omit() below (same as before this cleanup).
    sub_code = paste("US", state, sep = "-")
  )

  usa <- na.omit(usa)

  # *************************************************************************
  # More country codes from Google
  # *************************************************************************

  url <- "https://trends.google.com/trends/api/explore/pickers/geo"
  obj <- curl::curl_fetch_memory(url)
  # Fail early on an HTTP error instead of trying to parse an error page.
  stopifnot(obj$status_code == 200)

  ## Fix encoding issue for keywords like "österreich"
  temp <- rawToChar(obj$content)
  Encoding(temp) <- "UTF-8"

  # Parsing starts at the 6th character; the first five are skipped
  # (presumably a non-JSON prefix sent by the endpoint -- TODO confirm).
  df <- jsonlite::fromJSON(substring(temp, first = 6))

  country_names <- data.frame(
    country_name = df$children[[2]],
    country_code = df$children[[3]],
    stringsAsFactors = FALSE
  )

  # Positions of the top-level entries whose first column is not NULL,
  # i.e. the entries that carry nested data.
  i <- which(!vapply(df$children[[1]], is.null, logical(1)))

  res <- df$children$children[i]

  # An entry with exactly two columns is returned unchanged; otherwise its
  # nested "children" tables are stacked into a single table.
  extract_df <- function(l) {
    if (length(names(l)) == 2) {
      return(l)
    }
    do.call(rbind, l$children)
  }

  rr <- lapply(res, extract_df)
  names(rr) <- df$children$name[i]

  countries2 <- data.table::rbindlist(rr, idcol = "country_name")
  names(countries2) <- c("country_name", "name", "code")

  countries2 <- merge(country_names, countries2, by = "country_name")
  countries2$sub_code <- paste(countries2$country_code, countries2$code, sep = "-")

  countries2 <- countries2[, c("country_code", "sub_code", "name")]
  countries2$name <- toupper(countries2$name)

  # *************************************************************************
  # Merge together
  # *************************************************************************
  countries <- rbind(countries, usa, countries2)
  countries <- countries[!duplicated(countries), ]

  countries
}
11 changes: 11 additions & 0 deletions data-raw/language_codes.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Scrape the language-code table from lingoes.net.
#
# Downloads the page, takes the first "table" node and returns it as a data
# frame whose column names are lower-cased. Needs network access.
get_language_codes <- function() {
  page <- read_html("http://www.lingoes.net/en/translator/langcode.htm")

  # Convert every table on the page (first row as header), keep the first.
  tables <- html_table(html_nodes(page, "table"), header = TRUE)
  codes <- tables[[1]]

  names(codes) <- tolower(names(codes))

  codes
}
88 changes: 4 additions & 84 deletions data-raw/process_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,91 +2,11 @@ library(rvest)
library(jsonlite)
library(data.tree)

# Process categories data
#
# Reads "categories.json" from the "extdata" folder of the gtrendsR package
# and returns a data frame with the character columns "name" and "id".
#
# NOTE(review): in this view the three source() calls sit inside the function
# body; this looks like added diff lines interleaved with the removed
# definition -- confirm against the actual data-raw/process_data.R.
get_categories <- function() {
# Path of categories.json inside the package's extdata folder
file <- system.file("extdata", "categories.json", package = "gtrendsR")
source("data-raw/categories.R")
source("data-raw/language_codes.R")
source("data-raw/country_codes.R")

# Parse the JSON as nested lists (no simplification to data frames)
res <- fromJSON(file, simplifyDataFrame = FALSE)

res <- as.Node(res)

# Flatten the tree into a data frame carrying the "name" and "id" fields
categories <- ToDataFrameTree(res, "name", "id")
# Drop rows that contain NA
categories <- na.omit(categories)
categories <- categories[, c("name", "id")]
# Transliterate the names to ASCII and store the ids as character
categories$name <- iconv(categories$name, to = "ASCII//TRANSLIT")
categories$id <- as.character(categories$id)

return(categories)
}

# Process countries
#
# Builds the table of location codes from two online sources: an ISO
# location-code CSV hosted on GitHub and the "Metro" / "Metro code" columns
# of Google's AdWords location spreadsheet (US metro areas). The US rows are
# reordered and renamed to match the CSV columns, then appended to them.
#
# Takes no arguments. Needs network access and the readxl package. Stops
# with an error when a download reports a non-zero status.
#
# Returns a data frame of location codes.
get_countries <- function() {
  iso_file <- tempfile(fileext = ".csv")
  dma_file <- file.path(tempdir(), "dma.xlsx")
  # Do not leave the downloaded files behind in the session temp directory.
  on.exit(unlink(c(iso_file, dma_file)), add = TRUE)

  ret <- download.file(
    "https://raw.githubusercontent.com/CharlotteWoolley/Comprehensive_ISO_Location_Codes/master/ISO_codes.csv",
    destfile = iso_file
  )

  # Was the file found?
  stopifnot(ret == 0)

  countries <- read.csv(iso_file, na.strings = "")

  # Fix the encoding. lapply() (rather than sapply()) keeps one column per
  # variable even when the file holds a single row.
  countries <- data.frame(lapply(countries, iconv, to = "ASCII//TRANSLIT"))

  # *************************************************************************
  # USA metro codes
  # *************************************************************************

  # The URL needs an explicit scheme, and the spreadsheet is a binary file,
  # so it must be fetched in "wb" mode to stay intact on Windows.
  ret <- download.file(
    "https://www.google.com/help/hc/downloads/ds3/Location-Language-Codes-AdWords.xlsx",
    destfile = dma_file,
    mode = "wb"
  )
  stopifnot(ret == 0)

  usa <- readxl::read_excel(dma_file, skip = 1)

  # Two non-space characters following ", " in the metro name; NA when the
  # name does not contain that pattern.
  state <- vapply(
    regmatches(usa$Metro, regexec(", (\\S{2})", usa$Metro)),
    function(m) m[2],
    character(1)
  )

  usa <- data.frame(
    country_code = "US",
    description = usa$Metro,
    sub_code = paste("US", state, usa$`Metro code`, sep = "-")
  )

  usa <- na.omit(usa)
  # Reorder to (country_code, sub_code, description) and reuse the CSV's
  # column names so that rbind() lines the columns up.
  usa <- usa[, c(1, 3, 2)]
  names(usa) <- names(countries)

  # *************************************************************************
  # Merge together
  # *************************************************************************
  countries <- rbind(countries, usa)

  countries
}

# Scrape the language-code table from lingoes.net.
#
# Downloads the page, takes the first "table" node and returns it as a data
# frame whose column names are lower-cased. Needs network access.
get_language_codes <- function() {
  source_url <- "http://www.lingoes.net/en/translator/langcode.htm"

  table_nodes <- html_nodes(read_html(source_url), "table")

  # Convert every table (first row as header) and keep only the first one.
  lang_table <- html_table(table_nodes, header = TRUE)[[1]]
  names(lang_table) <- tolower(names(lang_table))

  lang_table
}
# Extract and save the data -----------------------------------------------

countries <- get_countries()
categories <- get_categories()
Expand Down
Binary file modified data/categories.rda
Binary file not shown.
Binary file modified data/countries.rda
Binary file not shown.
2 changes: 1 addition & 1 deletion man/countries.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 5c95f9a

Please sign in to comment.