Add some missing country codes (#166). #363

Merged · 1 commit · Aug 20, 2020

2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -15,6 +15,6 @@ URL: https://github.com/PMassicotte/gtrendsR
Depends: R (>= 3.2.0)
LazyData: yes
Imports: ggplot2, jsonlite, anytime, curl
-RoxygenNote: 7.1.0
+RoxygenNote: 7.1.1
Suggests: knitr, rmarkdown, tinytest
Encoding: UTF-8
2 changes: 2 additions & 0 deletions NEWS.md
@@ -4,6 +4,8 @@

- Fixes subsetting in extract_related_topics and data processing for Namibia (#353) @joachim-gassen

+- Add more country codes. (#166)

# gtrendsR 1.4.6

- Fix an issue when there was no "rising" data returned for the related topics. Some tests were failing due to this issue and causing errors on CRAN (#347).
2 changes: 1 addition & 1 deletion R/countries.R
@@ -10,6 +10,6 @@
#' @keywords datasets
#' @name countries
#' @usage data("countries")
-#' @format A data frame with 122845 rows and 3 variables
+#' @format A data frame with 117293 rows and 3 variables
#' @references \url{http://www.unece.org/cefact/codesfortrade/codes_index.html}
NULL
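
As a quick sanity check of the documentation change above, the shipped dataset can be inspected from an R session. This snippet is illustrative only and is not part of the diff; it assumes the updated package is installed.

# Illustrative only: verify the documented dimensions of the bundled dataset.
library(gtrendsR)
data("countries")
dim(countries)   # expected to be 117293 x 3 after this change, per the updated @format
head(countries)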
Binary file modified R/sysdata.rda
17 changes: 17 additions & 0 deletions data-raw/categories.R
@@ -0,0 +1,17 @@
# Process categories data
get_categories <- function() {

  file <- system.file("extdata", "categories.json", package = "gtrendsR")

  res <- fromJSON(file, simplifyDataFrame = FALSE)

  res <- as.Node(res)

  categories <- ToDataFrameTree(res, "name", "id")
  categories <- na.omit(categories)
  categories <- categories[, c("name", "id")]
  categories$name <- iconv(categories$name, to = "ASCII//TRANSLIT")
  categories$id <- as.character(categories$id)

  return(categories)
}
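
For context, a minimal way to exercise this helper outside of data-raw/process_data.R might look like the sketch below. It assumes jsonlite and data.tree are installed, that gtrendsR (which ships extdata/categories.json) is available, and that the package source tree is the working directory.

# Sketch only: run the categories helper standalone.
library(jsonlite)   # provides fromJSON()
library(data.tree)  # provides as.Node() and ToDataFrameTree()
source("data-raw/categories.R")  # assumes the package source tree is the working directory
categories <- get_categories()
head(categories)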
103 changes: 103 additions & 0 deletions data-raw/country_codes.R
@@ -0,0 +1,103 @@
# Process countries
get_countries <- function() {
  destfile <- tempfile(fileext = ".csv")

  ret <-
    download.file(
      "https://raw.githubusercontent.com/CharlotteWoolley/Comprehensive_ISO_Location_Codes/master/ISO_codes.csv",
      destfile = destfile
    )

  # Was the file found?
  stopifnot(ret == 0)

  # , col.names = c("country_code", "sub_code", "country")
  countries <- read.csv(destfile, na.strings = "")

  # Fix the encoding
  countries <-
    data.frame(sapply(countries, iconv, to = "ASCII//TRANSLIT"))

  # *************************************************************************
  # USA metro codes
  # *************************************************************************

  dir <- tempdir()
  destfile <- paste0(dir, "/dma.xlsx")

  file <- download.file(
    "https://www.google.com/help/hc/downloads/ds3/Location-Language-Codes-AdWords.xlsx",
    destfile = destfile
  )

  usa <- readxl::read_excel(destfile, skip = 1, .name_repair = "minimal")[, c(11:14)]

  usa <- data.frame(
    country_code = "US",
    name = usa$Metro,
    sub_code = paste("US",
      lapply(regmatches(
        usa$Metro, regexec(", (\\S{2})", usa$Metro)
      ), "[", 2),
      sep = "-"
    )
  )

  usa <- na.omit(usa)
  # usa <- usa[, c(1, 3, 2)]
  # names(usa) <- names(countries)

  # *************************************************************************
  # More country codes from Google
  # *************************************************************************

  url <- "https://trends.google.com/trends/api/explore/pickers/geo"
  obj <- curl::curl_fetch_memory(url)
  ## Fix encoding issue for keywords like "österreich"
  temp <- rawToChar(obj$content)
  Encoding(temp) <- "UTF-8"

  df <- jsonlite::fromJSON(substring(temp, first = 6))

  country_names <- data.frame(
    country_name = df$children[[2]],
    country_code = df$children[[3]],
    stringsAsFactors = FALSE
  )

  i <- which(unlist(lapply(df$children[[1]], function(x) !is.null(x))))

  res <- df$children$children[i]

  # do.call(rbind, res[183])

  extract_df <- function(l) {
    if (length(names(l)) == 2) {
      return(l)
    } else {
      return(do.call(rbind, l$children))
    }
  }

  rr <- lapply(res, extract_df)
  names(rr) <- df$children$name[i]

  countries2 <- data.table::rbindlist(rr, idcol = "country_name")
  names(countries2) <- c("country_name", "name", "code")

  countries2 <- merge(country_names, countries2, by = "country_name")
  countries2$sub_code <- paste(countries2$country_code, countries2$code, sep = "-")

  countries2 <- countries2[, c("country_code", "sub_code", "name")]
  countries2$name <- toupper(countries2$name)

  # *************************************************************************
  # Merge together
  # *************************************************************************
  countries <- rbind(countries, usa, countries2)
  countries <- countries[!duplicated(countries), ]

  return(countries)
}
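
A possible way to regenerate and spot-check the enlarged lookup table (the motivation for #166) is sketched below; it is not part of the diff. It assumes curl, jsonlite, data.table, and readxl are installed, and the "AT" filter is only an example country.

# Sketch only: rebuild the country table and look at one country's sub-codes.
source("data-raw/country_codes.R")
countries <- get_countries()
nrow(countries)                                    # roughly 117293 rows per the updated docs
head(countries[countries$country_code == "AT", ])  # e.g. sub-region codes from the Google geo picker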
11 changes: 11 additions & 0 deletions data-raw/language_codes.R
@@ -0,0 +1,11 @@
get_language_codes <- function() {
  url <- "http://www.lingoes.net/en/translator/langcode.htm"

  webpage <- read_html(url)
  language_codes <- html_nodes(webpage, "table")
  language_codes <- html_table(language_codes, header = TRUE)[[1]]

  names(language_codes) <- tolower(names(language_codes))

  return(language_codes)
}
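
This helper only needs rvest, which data-raw/process_data.R already attaches; a standalone run could look like the following sketch (illustrative, not part of the diff).

# Sketch only: scrape the language-code table on its own.
library(rvest)  # provides read_html(), html_nodes(), html_table()
source("data-raw/language_codes.R")
language_codes <- get_language_codes()
head(language_codes)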
88 changes: 4 additions & 84 deletions data-raw/process_data.R
@@ -2,91 +2,11 @@ library(rvest)
library(jsonlite)
library(data.tree)

-# Process categories data
-get_categories <- function() {
-  file <- system.file("extdata", "categories.json", package = "gtrendsR")
+source("data-raw/categories.R")
+source("data-raw/language_codes.R")
+source("data-raw/country_codes.R")

-  res <- fromJSON(file, simplifyDataFrame = FALSE)

-  res <- as.Node(res)

-  categories <- ToDataFrameTree(res, "name", "id")
-  categories <- na.omit(categories)
-  categories <- categories[, c("name", "id")]
-  categories$name <- iconv(categories$name, to = "ASCII//TRANSLIT")
-  categories$id <- as.character(categories$id)

-  return(categories)
-}

-# Process countries
-get_countries <- function() {
-  destfile <- tempfile(fileext = ".csv")

-  ret <-
-    download.file(
-      "https://raw.githubusercontent.com/CharlotteWoolley/Comprehensive_ISO_Location_Codes/master/ISO_codes.csv",
-      destfile = destfile
-    )

-  # Was the file found?
-  stopifnot(ret == 0)

-  # , col.names = c("country_code", "sub_code", "country")
-  countries <- read.csv(destfile, na.strings = "")

-  # Fix the encoding
-  countries <- data.frame(sapply(countries, iconv, to = "ASCII//TRANSLIT"))

-  # *************************************************************************
-  # USA metro codes
-  # *************************************************************************

-  dir <- tempdir()
-  destfile <- paste0(dir, "/dma.xlsx")

-  file <- download.file(
-    "www.google.com/help/hc/downloads/ds3/Location-Language-Codes-AdWords.xlsx",
-    destfile = destfile
-  )

-  usa <- readxl::read_excel(destfile, skip = 1)
-  # usa <- na.omit(usa[, c(8, 10, 11)])

-  usa <- data.frame(
-    country_code = "US",
-    description = usa$Metro,
-    sub_code = paste(
-      "US",
-      lapply(regmatches(usa$Metro, regexec(", (\\S{2})", usa$Metro)), "[", 2),
-      usa$`Metro code`,
-      sep = "-"
-    )
-  )

-  usa <- na.omit(usa)
-  usa <- usa[, c(1, 3, 2)]
-  names(usa) <- names(countries)

-  # *************************************************************************
-  # Merge together
-  # *************************************************************************
-  countries <- rbind(countries, usa)

-  return(countries)
-}

-get_language_codes <- function() {
-  url <- "http://www.lingoes.net/en/translator/langcode.htm"

-  webpage <- read_html(url)
-  language_codes <- html_nodes(webpage, "table")
-  language_codes <- html_table(language_codes, header = TRUE)[[1]]

-  names(language_codes) <- tolower(names(language_codes))

-  return(language_codes)
-}
# Extract and save the data -----------------------------------------------

countries <- get_countries()
categories <- get_categories()
Binary file modified data/categories.rda
Binary file modified data/countries.rda
2 changes: 1 addition & 1 deletion man/countries.Rd

Some generated files are not rendered by default.