rOpenGov · antaldaniel · Feb 5, 2020 · Feb 3, 2020 · Feb 5, 2020 · Feb 5, 2020
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -24,21 +24,22 @@ Depends:
 Imports:
     broom,
     classInt,
-    countrycode,    
+    countrycode,
     curl,
     dplyr,
     httr,
     jsonlite,
     lubridate,
     RColorBrewer,
     readr,
-    RefManageR,        
+    RefManageR,
     sf,
     sp,
     stringi,
     stringr,
     tibble,
-    tidyr
+    tidyr,
+    tidyselect
 Suggests:
     Cairo,
     ggplot2,

diff --git a/NAMESPACE b/NAMESPACE
@@ -3,6 +3,7 @@
 export(check_access_to_data)
 export(check_nuts2013)
 export(clean_eurostat_cache)
+export(convert_to_nuts2016)
 export(cut_to_classes)
 export(dic_order)
 export(eurotime2date)
@@ -29,8 +30,13 @@ importFrom(classInt,classIntervals)
 importFrom(countrycode,countrycode)
 importFrom(curl,curl_download)
 importFrom(dplyr,"%>%")
+importFrom(dplyr,add_count)
+importFrom(dplyr,anti_join)
+importFrom(dplyr,arrange)
+importFrom(dplyr,case_when)
 importFrom(dplyr,filter)
 importFrom(dplyr,full_join)
+importFrom(dplyr,inner_join)
 importFrom(dplyr,left_join)
 importFrom(dplyr,mutate)
 importFrom(dplyr,mutate_if)
@@ -61,6 +67,7 @@ importFrom(tibble,data_frame)
 importFrom(tibble,is_tibble)
 importFrom(tidyr,gather_)
 importFrom(tidyr,separate)
+importFrom(tidyselect,all_of)
 importFrom(utils,data)
 importFrom(utils,download.file)
 importFrom(utils,toBibtex)
diff --git a/R/check_nuts2013.R b/R/check_nuts2013.R
@@ -11,7 +11,8 @@
 #' regions. Observations with codes ending on \code{'ZZ'} or \code{'XX'} are
 #' removed from the returned data table, because these are non-territorial
 #' observations or they are outside of the EU.
-#' @importFrom  dplyr left_join mutate filter rename mutate_if
+#' @importFrom dplyr mutate filter rename mutate_if case_when
+#' @importFrom dplyr left_join full_join anti_join
 #' @examples
 #'  \dontrun{
 #'    dat <- eurostat::tgs00026
@@ -29,25 +30,55 @@ check_nuts2013 <- function (dat) {
   changed_regions <- regional_changes_2016 %>% 
     filter ( change != 'unchanged')
 
-  tmp <- dat  %>%
-    mutate_if ( is.factor, as.character ) %>%
-    left_join (  regional_changes_2016 %>% 
-                  select ( code16, change ) %>%
-                  dplyr::rename ( geo = code16 ),
-                by = 'geo')
+  ## Changed regions to be looked up by their NUTS2016 codes -----------
+  regional_changes_by_2016 <- regional_changes_2016 %>%
+    mutate ( geo = code16 ) %>% 
+    filter ( !is.na(code13))
+
+  nrow(regional_changes_by_2016)
+
+  ## adding those that have no equivalent in the previous group
+  ## some regions have to be identified by their old and new codes -----
+  regional_changes_by_2013 <- regional_changes_2016 %>%
+    mutate ( geo = code13 ) %>% 
+    filter ( !is.na(code13)) %>%
+    anti_join ( regional_changes_by_2016, 
+                by = c("code13", "code16", "name", "nuts_level", "change", "geo"))
+
+  ## Region can be found by new or old NUTS code -----------------------
+
+  all_regional_changes <- regional_changes_by_2016 %>%
+    full_join ( regional_changes_by_2013, 
+                by = c("code13", "code16", "name", "nuts_level",
+                       "change", "geo") )
 
-  there_are_changes <- FALSE
 
-  if ( any (is.na(tmp$change)) ) {
-    tmp <- dat  %>%
-      mutate_if ( is.factor, as.character ) %>%
-      left_join (  regional_changes_2016 %>% 
-                     select ( code13, change ) %>%
-                     dplyr::rename ( geo = code13 ),
-                   by = 'geo')
-
-    there_are_changes <- TRUE
+  tmp <- dat %>%
+    mutate_if ( is.factor, as.character ) %>%
+    left_join ( all_regional_changes, by = 'geo' ) %>%
+    mutate ( nuts_level = ifelse (is.na(nuts_level), 
+                                  9, nuts_level)) %>%
+    mutate ( nuts_level = case_when ( 
+      nuts_level < 9                    ~ nuts_level,
+      nuts_level == 9 & nchar(geo) == 2 ~ 0,
+      nuts_level == 9 & nchar(geo) == 3 ~ 1,
+      nuts_level == 9 & nchar(geo) == 4 ~ 2,
+      nuts_level == 9 & nchar(geo) == 5 ~ 3,
+      TRUE ~ NA_real_ ))
+
+  if ( all ( tmp$change %in% unique(regional_changes_2016$code16) )) {
+    message ( "All observations are coded with NUTS2016 codes" )
+    there_are_changes <- FALSE
   }
+
+  non_eu <- select ( tmp, geo, code13, code16, change ) %>%
+    mutate ( change = ifelse (rowSums(is.na(.))==3, 
+                              "not in the EU", change )) %>%
+    filter ( change == 'not in the EU' )
+
+  tmp <- tmp %>% mutate ( change = ifelse ( geo %in% non_eu$geo, 
+                                            'not in the EU', change ))  
+
 
   eu_country_vector <-  eurostat::eu_countries$code
   tmp_country_vector <- unique ( substr(tmp$geo, 1, 2) )
@@ -59,20 +90,6 @@ check_nuts2013 <- function (dat) {
                "In this data frame: ", not_EU_country_vector )
   }
 
-  if ( any( stringr::str_sub(tmp$geo, -2,-1) %in% c('ZZ', 'XX')) ) {
-
-    warning ( "Regional codes ending with ZZ or XX are extra-territorial", 
-              "\n to the EU and they are removed from the data frame.")
-
-  }
-
-  tmp %>%
-    mutate ( change  = ifelse (  geo %in% not_EU_country_vector , 
-                                 'not_EU', change )) %>%
-    filter ( stringr::str_sub(geo, -3,-1) != "ZZZ", 
-             stringr::str_sub(geo, -2,-1) != "ZZ", 
-             stringr::str_sub(geo, -3,-1) != "XXX", 
-             stringr::str_sub(geo, -2,-1) != "XX" ) %>%
-    mutate_if ( is.factor, as.character ) 
+  tmp
 
 }
diff --git a/R/convert_to_nuts2016.R b/R/convert_to_nuts2016.R
@@ -0,0 +1,77 @@
+#' @title Recode geo labels and rename regions from NUTS2013 to NUTS2016 
+#' @description Eurostat mixes NUTS2013 and NUTS2016 geographic label codes
+#' in the \code{'geo'} column, which creates time-wise comparability issues.
+#' This function recodes the observations where only the coding changed, and
+#' marks discontinued regions, and other regions which may or may not be 
+#' somehow compared to current \code{'NUTS2016'} boundaries.
+#' @param dat A Eurostat data frame downloaded with 
+#' \code{\link{get_eurostat}}.
+#' @author Daniel Antal
+#' @return An augmented and potentially relabelled data frame which 
+#' contains all formerly \code{'NUTS2013'} definition geo labels in the 
+#' \code{'NUTS2016'} vocabulary when only the code changed, but the 
+#' boundary did not. It also contains some information on other geo labels
+#' that cannot be brought to the current \code{'NUTS2016'} definition. 
+#' Furthermore, when the official name of the region changed, it will use
+#' the new name (if the region boundary otherwise did not change).
+#' If they were not called before, the function will use the helper functions
+#'  \code{\link{check_nuts2013}} and  \code{\link{harmonize_geo_code}}.
+#' @importFrom dplyr mutate filter rename arrange case_when
+#' @importFrom dplyr left_join inner_join anti_join
+#' @importFrom tidyselect all_of
+#' @examples
+#'  \dontrun{
+#'   eurostat::tgs00026 %>%
+#'      check_nuts2013() %>%
+#'      harmonize_geo_code() %>%
+#'      convert_to_nuts2016() 
+#'      
+#'  #If check_nuts2013() is not called, the function will call it.    
+#'   eurostat::tgs00026 %>%
+#'      convert_to_nuts2016()    
+#'  }
+#' @export
+
+convert_to_nuts2016 <- function (dat) {
+  # Inputs lacking change/code16/code13 go through harmonize_geo_code() first.
+  if ( ! all(c("change", "code16", "code13") %in% names (dat)) ) {
+    tmp <- harmonize_geo_code(dat)
+  } else {
+    tmp <- dat
+  }
+
+  # Keep an existing geo label; fall back to the NUTS2016 code only when
+  # geo is missing and the row is not marked as "not in the EU".
+  tmp <- tmp %>%
+    mutate ( geo = case_when (
+      !is.na(geo)                   ~ geo,
+      change     == "not in the EU" ~ geo,
+      TRUE ~ code16
+    ))
+
+  # Vectorised NA test: '&&' on a longer vector is an error since R 4.3.
+  no_geo <- is.na(tmp$geo)
+  if ( any (no_geo) ) {
+    # geo is NA in these rows, so they are reported by their code13 instead.
+    warning ( "The following regions have no geo labels:", " ",
+              paste ( unique (tmp$code13[no_geo]), collapse = ", ") )
+  }
+
+  names_by_nuts2016 <- regional_changes_2016 %>%
+    filter ( !is.na(code16) )
+
+  # Swap the existing 'name' for the one held in regional_changes_2016.
+  regions_by_nuts2016_names <- tmp %>% 
+    select ( -name )  %>%
+    inner_join ( names_by_nuts2016,
+                 by = c("code13", "code16", "nuts_level", "change") ) 
+
+  # 'by' takes a plain character vector; all_of() is a tidyselect helper.
+  regions_with_other_names <- tmp %>% 
+    anti_join ( regions_by_nuts2016_names, by = names(tmp) )
+
+  rbind ( regions_by_nuts2016_names,
+          regions_with_other_names ) %>%
+    arrange ( time, geo, code16 ) %>%
+    left_join ( nuts_correspondence )
+}