diff --git a/NAMESPACE b/NAMESPACE index 6935f36c..298d2009 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,7 +2,6 @@ export(add_nuts_level) export(check_access_to_data) -export(check_nuts_2013) export(clean_eurostat_cache) export(cut_to_classes) export(dic_order) @@ -36,6 +35,7 @@ importFrom(dplyr,add_count) importFrom(dplyr,anti_join) importFrom(dplyr,arrange) importFrom(dplyr,case_when) +importFrom(dplyr,distinct) importFrom(dplyr,filter) importFrom(dplyr,full_join) importFrom(dplyr,inner_join) @@ -43,7 +43,6 @@ importFrom(dplyr,left_join) importFrom(dplyr,mutate) importFrom(dplyr,mutate_if) importFrom(dplyr,rename) -importFrom(dplyr,right_join) importFrom(dplyr,select) importFrom(dplyr,semi_join) importFrom(httr,GET) @@ -65,7 +64,6 @@ importFrom(sp,spplot) importFrom(stats,lag) importFrom(stringi,stri_match_first_regex) importFrom(stringr,str_replace_all) -importFrom(stringr,str_sub) importFrom(tibble,as_tibble) importFrom(tibble,data_frame) importFrom(tibble,is_tibble) diff --git a/R/check_nuts_2013.R b/R/check_nuts_2013.R deleted file mode 100644 index 8a1b8d7f..00000000 --- a/R/check_nuts_2013.R +++ /dev/null @@ -1,104 +0,0 @@ -#' @title Check NUTS region codes that changed with the \code{NUTS2016} definition -#' @description Eurostat mixes \code{NUTS2013} and \code{NUTS2016} geographic -#' label codes in the \code{'geo'} column, which creates time-wise comparativity issues. -#' This function checks if you data is affected by this problem and gives -#' information on what to do. -#' @param dat A Eurostat data frame downloaded with \code{\link{get_eurostat}} -#' @export -#' @author Daniel Antal -#' @return An augmented data frame or a message about potential coding -#' errors. For filtering, it marks \code{'non_EU'} and \code{'unchanged'} -#' regions. Observations with codes ending on \code{'ZZ'} or \code{'XX'} are -#' removed from the returned data table, because these are non-territorial -#' observations or they are outside of the EU. -#' @importFrom dplyr mutate filter rename mutate_if case_when -#' @importFrom dplyr left_join full_join anti_join -#' @examples -#' \dontrun{ -#' dat <- eurostat::tgs00026 -#' check_nuts_2013(dat) -#' } - -check_nuts_2013 <- function (dat) { - - ## For non-standard evaluation ------------------------------------- - . 
<- change <- geo <- code13 <- code16 <- nuts_level <- NULL - regional_changes_2016 <- country_code <- NULL - - ## The data is not loaded into the global environment -------------- - - regional_changes_2016 <- load_package_data(dataset = "regional_changes_2016") - - unchanged_regions <- regional_changes_2016 %>% - filter ( change == 'unchanged' ) - - changed_regions <- regional_changes_2016 %>% - filter ( change != 'unchanged' ) - - ## Changed regions to be looked up by their NUTS2016 codes ----------- - regional_changes_by_2016 <- regional_changes_2016 %>% - mutate ( geo = code16 ) %>% - filter ( !is.na(code13) ) - - ## adding those that have no equivalent in the previous group - ## some regions have to be identified by their old and new codes ----- - regional_changes_by_2013 <- regional_changes_2016 %>% - mutate ( geo = code13 ) %>% - filter ( !is.na(code13) ) %>% - anti_join ( regional_changes_by_2016, - by = c("code13", "code16", "name", - "nuts_level", "change", "geo") ) - - ## Region can be found by new or old NUTS code ----------------------- - - all_regional_changes <- regional_changes_by_2016 %>% - full_join ( regional_changes_by_2013, - by = c("code13", "code16", "name", - "nuts_level", - "change", "geo") ) - - - tmp <- dat %>% - mutate_if ( is.factor, as.character ) %>% - left_join ( all_regional_changes, by = 'geo' ) %>% - mutate ( nuts_level = ifelse (is.na(nuts_level), - add_nuts_level(geo), - nuts_level)) - - if ( all ( tmp$change %in% unique(regional_changes_2016$code16) )) { - message ( "All observations are coded with NUTS2016 codes" ) - there_are_changes <- FALSE - } - - eu_countries <- load_package_data(dataset = "eu_countries") - - eu_country_vector <- unique ( substr(eu_countries$code, 1, 2) ) - - tmp <- tmp %>% - mutate ( country_code = substr(geo,1,2) ) %>% - mutate ( change = ifelse ( country_code %in% eu_country_vector, - yes = change, - no = "not in the EU")) %>% - select ( -country_code ) - - if ( any(tmp$change == 'not in the EU') ) { - - not_EU_country_vector <- substr(tmp$geo, 1,2) - not_EU_country_vector <- not_EU_country_vector [ !not_EU_country_vector %in% eu_country_vector] - ## The correspondence table only covers EU regions. - message ( "Not checking for regional label consistency in non-EU countries\n", - "In this data frame non-EU country: ", - paste (sort(unique(not_EU_country_vector)), - collapse = ", "), "." ) - } - - nuts_2016_codes <- unique(regional_changes_2016$code16) - nuts_2013_codes <- unique(regional_changes_2016$code13) - - tmp <- tmp %>% - mutate ( nuts_2016 = ifelse ( geo %in% nuts_2016_codes, - TRUE, FALSE), - nuts_2013 = ifelse ( geo %in% nuts_2013_codes, - TRUE, FALSE)) - -} diff --git a/R/harmonize_geo_code.R b/R/harmonize_geo_code.R index c9064864..ab5179fd 100644 --- a/R/harmonize_geo_code.R +++ b/R/harmonize_geo_code.R @@ -1,152 +1,201 @@ -#' @title Recode geo labels from NUTS2013 to NUTS2016 -#' @description Eurostat mixes NUTS2013 and NUTS2016 geographic label codes -#' in the \code{'geo'} column, which creates time-wise comparativity issues. -#' This function recodes the observations where only the coding changed, and -#' marks discontinued regions, and other regions which may or may not be -#' somehow compared to current \code{'NUTS2016'} boundaries. -#' @param dat A Eurostat data frame downloaded with \code{\link{get_eurostat}}. 
+#' @title Harmonize NUTS region codes that changed with the \code{NUTS2016} definition +#' @description Eurostat mixes \code{NUTS2013} and \code{NUTS2016} geographic +#' label codes in the \code{'geo'} column, which creates time-wise comparativity issues. +#' This function checks if you data is affected by this problem and gives +#' information on what to do. +#' @param dat A Eurostat data frame downloaded with \code{\link{get_eurostat}} #' @export #' @author Daniel Antal -#' @return An augmented and potentially relabelled data frame which -#' contains all formerly \code{'NUTS2013'} definition geo labels in the -#' \code{'NUTS2016'} vocabulary when only the code changed, but the -#' boundary did not. It also contains some information on other geo labels -#' that cannot be brought to the current \code{'NUTS2016'} definition. -#' If not called before, the function will use the helper function -#' \code{\link{check_nuts_2013}} -#' @importFrom dplyr mutate filter rename arrange add_count -#' @importFrom dplyr left_join full_join anti_join right_join semi_join -#' @importFrom tidyselect all_of -#' @importFrom stringr str_sub +#' @return An augmented data frame that explains potential problems and +#' possible solutions. +#' @importFrom dplyr mutate filter rename mutate_if case_when distinct +#' @importFrom dplyr left_join full_join anti_join add_count semi_join #' @examples #' \dontrun{ -#' eurostat::tgs00026 %>% -#' check_nuts_2013() %>% -#' harmonize_geo_code() -#' -#' #If check_nuts_2013() is not called, the function will call it. -#' eurostat::tgs00026 -#' harmonize_geo_code(dat) +#' dat <- eurostat::tgs00026 +#' harmonize_geo_code(dat) #' } -harmonize_geo_code <- function ( dat ) { +harmonize_geo_code <- function (dat) { ## For non-standard evaluation ------------------------------------- - change <- tmp <- geo <- nuts_level <- code13 <- code16 <- NULL - . <- n <- remaining_eu_data <- resolution <- time <- values <- NULL - regional_changes_2016 <- NULL - - ## Check if geo information is present ------------------------------ - if ( ! 'geo' %in% names(dat) ) { - stop ("There is no 'geo' column in the inserted data. This is an error.") - } - - ## Load the correspondence tables, but not to the global environment -- + . <- change <- geo <- code13 <- code16 <- nuts_level <- NULL + country_code <- n <- values <- time <- name <- resolution <- NULL + + dat <- mutate_if ( dat, is.factor, as.character) + + ## The data is not loaded into the global environment -------------- regional_changes_2016 <- load_package_data(dataset = "regional_changes_2016") + nuts_correspondence <- load_package_data(dataset = "nuts_correspondence") + ## Creating constants ----------------------------------------------- + regions_in_correspondence <- unique(c(nuts_correspondence$code13, nuts_correspondence$code16)) + regions_in_correspondence <- sort(regions_in_correspondence [!is.na(regions_in_correspondence)]) + unchanged_regions <- regional_changes_2016 %>% - filter ( change == 'unchanged' ) - - changed_regions <- regional_changes_2016 %>% - filter ( change != 'unchanged' ) - - nuts_2016_codes <- unique (regional_changes_2016$code16) - nuts_2013_codes <- unique (regional_changes_2016$code13) - # for easier debugging, this data will be re-assigned in each major - # step as tmp2, tmp3... Debugging is particulary difficult, because - # not only the program code, but the underlying logic may have faults. - - if (! 
all(c("change", "code16", "code13", - "nuts_2016", "nuts_2013") %in% names (dat)) ) { - tmp <- dat %>% - check_nuts_2013() - } else { - tmp <- dat + filter ( change == 'unchanged' ) + + # The Eurostat correspondence table had a duplicate entry. It may + # re-occur later and this code may help finding it. + # nuts_correspondence_duplicates <- nuts_correspondence %>% + # filter ( !is.na(code13 )) %>% + # add_count ( code13 ) %>% filter ( n > 1 ) + + ## Changed regions to be looked up by their NUTS2016 codes ----------- + regional_changes_by_2016 <- nuts_correspondence %>% + mutate ( geo = code16 ) %>% + filter ( !is.na(code16) ) %>% + select ( -geo ) %>% + distinct ( code13, code16, name, nuts_level, change, resolution) + + # Regions may be duplicated in case their NUTS2016 and NUTS2013 are the same + + ## adding those that have no equivalent in the previous group + ## some regions have to be identified by their old and new codes ----- + regional_changes_by_2013 <- nuts_correspondence %>% + mutate ( geo = code13 ) %>% + filter ( !is.na(code13) ) %>% + select ( -geo ) %>% + distinct ( code13, code16, name, nuts_level, change, resolution) + + ## Join the regions by both NUTS definitions ----------------------- + + all_regional_changes <- regional_changes_by_2016 %>% + full_join ( regional_changes_by_2013, + by = c("code13", "code16", "name", "nuts_level", + "change", "resolution")) + + + ## Check for potential duplicates ---------------------------------- + duplicates <- all_regional_changes %>% + add_count ( code13, code16 ) %>% + filter ( n > 1 ) + + if ( nrow(duplicates) > 0 ) { + stop ("There are duplicates in the correspondence table.") } + + all_regions_full_metadata <- unchanged_regions %>% + mutate ( resolution = NA_character_ ) %>% + rbind ( all_regional_changes ) + + nuts_2013_codes <- unique (all_regions_full_metadata$code13)#[!is.na(all_regions_full_metadata$code13)] + nuts_2016_codes <- unique (all_regions_full_metadata$code16)#[!is.na(all_regions_full_metadata$code16)] + nuts_2013_codes <- nuts_2013_codes[!is.na(nuts_2013_codes)] + nuts_2016_codes <- nuts_2016_codes[!is.na(nuts_2016_codes)] + + tmp_by_code16 <- dat %>% + mutate ( geo = as.character(geo)) %>% + filter ( geo %in% all_regions_full_metadata$code16 ) %>% + left_join ( all_regions_full_metadata %>% + dplyr::rename ( geo = code16 ), + by = "geo") %>% + mutate ( code16 = geo ) %>% + mutate ( nuts_2016 = geo %in% nuts_2016_codes ) %>% + mutate ( nuts_2013 = geo %in% nuts_2013_codes ) + + tmp_by_code13 <- dat %>% + mutate ( geo = as.character(geo)) %>% + filter ( geo %in% all_regions_full_metadata$code13 ) %>% + left_join ( all_regions_full_metadata %>% + dplyr::rename ( geo = code13 ), + by = "geo") %>% + mutate ( code13 = geo ) %>% + mutate ( nuts_2016 = geo %in% nuts_2016_codes, + nuts_2013 = geo %in% nuts_2013_codes) + + message ( "In this data frame ", nrow(tmp_by_code16), + " observations are coded with the current NUTS2016\ngeo labels and ", + nrow ( tmp_by_code13), " observations/rows have NUTS2013 historical labels.") + + tmp_s <- tmp_by_code16 %>% + semi_join ( tmp_by_code13, + by = names ( tmp_by_code13)) # found in both (unchanged and relabelled) - # Separating rows that need to be corrected ---------------------------- + if (! 
all(tmp_s$nuts_2013 & tmp_s$nuts_2016)) { stop ("Wrong selection of unchanged regions.") } - labelled_by_nuts_2016 <- tmp %>% - filter ( geo %in% nuts_2016_codes ) # These are following NUTS2016 - labelled_by_nuts_2013 <- tmp %>% - anti_join ( labelled_by_nuts_2016, - by = tidyselect::all_of(names(tmp)) ) %>% - filter ( geo %in% nuts_2013_codes ) # These are following NUTS2013 + tmp_s2 <- tmp_by_code13 %>% + semi_join ( tmp_by_code16, + by = names (tmp_by_code16)) # found in both (unchanged and relabelled) + #must be equal!!! - message ( "There are ", nrow(labelled_by_nuts_2013), " regions that were changed", - " in the transition to NUTS2016 and\nthe data frame uses their NUTS2013 geo codes.") + tmp_a1 <- tmp_by_code16 %>% + anti_join ( tmp_by_code13, + by = names(tmp_by_code13) + ) # not found in code13 (new regions) + if ( any(tmp_a1$nuts_2013) ) { stop ("Wrong selection of NUTS2013-only regions.") } - labelled_by_other <- tmp %>% - filter ( ! geo %in% nuts_2013_codes ) %>% - filter ( ! geo %in% nuts_2016_codes ) # These are not in the correspondence table (non-EU) - - message ( "There are ", nrow(labelled_by_other), " regions that are not covered by the correspondence tables.") - message ( "They are likely to be non-EU regions and their consistency cannot be checked.") - + tmp_a2 <- tmp_by_code13 %>% + anti_join ( tmp_by_code16, + by = names(tmp_by_code13) + ) # not found in code16 (changes) + if ( any(tmp_a2$nuts_2016) ) { stop ("Wrong selection of NUTS2013-only regions.") } - if ( nrow ( labelled_by_other) + nrow ( labelled_by_nuts_2013 ) + nrow(labelled_by_nuts_2016) != nrow (dat)) { - stop ( "Joining error Type I") - } - - ## NUTS regions that are NUTS2013 coded but have NUTS2016 equivalents ----- - can_be_found <- labelled_by_nuts_2013 %>% - filter ( !is.na(code16) ) - - recoded_regions <- can_be_found %>% - filter ( grepl("recoded", change )) - - message ( "There are ", nrow(recoded_regions), -" regions that only changed their geo labels. -Their boundaries are consistent in NUTS2013 and NUTS2016.") - message ( "[", recoded_regions %>% - filter ( grepl ("relabelled", change)) %>% - nrow(), " of these changed their names, too.]") - - other_cases <- can_be_found %>% - anti_join ( recoded_regions, - by = names ( can_be_found ) ) # I think these are 'small changes' - - if ( nrow(other_cases) + nrow(recoded_regions) != nrow(can_be_found) ) { - stop ( "Joining error in NUTS2013 regions that can be found in NUTS2016") - } + tmp <- rbind ( tmp_s, tmp_a1, tmp_a2 ) - ## Discontinued regions ----------------------------------------------- + not_found_geo <- unique(dat$geo[! dat$geo %in% tmp$geo ]) + not_eu_regions <- not_found_geo[! 
substr(not_found_geo,1,2) %in% eu_countries$code] - cannot_be_found <- labelled_by_nuts_2013 %>% - filter ( is.na(code16) ) + ## Checking if there are unmatched EU regions------------------------- - if ( nrow ( can_be_found ) + nrow(cannot_be_found ) != nrow ( labelled_by_nuts_2013 )) { - stop ("Joining error in NUTS2013 regions that can or cannot be found.") + not_found_eu_regions <- not_found_geo[ substr(not_found_geo,1,2) %in% eu_countries$code] + + if ( length(not_found_eu_regions)>0) { + stop ( "Some EU regions were not found in the correspondence table.") } - ## First join all EU regions ---------------------------------------- + ## Adding columns for non-EU regions ---------------------------------- + tmp_not_eu <- dat %>% + filter ( geo %in% not_eu_regions ) %>% + mutate ( nuts_level = nchar(geo)-2, + change = "not in EU - not controlled", + resolution = "check with national authorities", + name = NA_character_, + code13 = NA_character_, + code16 = NA_character_, + nuts_2016 = FALSE, + nuts_2013 = FALSE) - eu_joined <- labelled_by_nuts_2016 %>% - full_join ( recoded_regions, by = tidyselect::all_of(names ( recoded_regions )) ) %>% - full_join ( other_cases, by = tidyselect::all_of(names ( other_cases )) ) %>% - full_join ( cannot_be_found, by = tidyselect::all_of(names ( cannot_be_found )) ) + tmp2 <- rbind ( tmp, tmp_not_eu) - if ( nrow ( eu_joined %>% - dplyr::semi_join ( labelled_by_other, - by = tidyselect::all_of(names (eu_joined))) ) > 0 ) { - stop ( "Joining error between EU and non-EU regions") + + ## Check if all original rows are handled correctly ------------------ + if (length(dat$geo [! dat$geo %in% tmp2$geo ])>0) { + message (tmp2 %>% anti_join (dat)) + message (dat %>% anti_join (tmp2)) + stop ("Not all original rows were checked.") } + + eu_countries <- load_package_data(dataset = "eu_countries") + + eu_country_vector <- unique ( substr(eu_countries$code, 1, 2) ) - ## Add non-EU regions ---------------------------------------------- + + if ( any(tmp2$change == 'not in EU - not controlled') ) { + + not_EU_country_vector <- tmp2 %>% + filter ( tmp2$change == 'not in EU - not controlled' ) %>% + select ( geo ) + + not_eu_observations <- nrow (not_EU_country_vector) + + not_EU_country_vector <- not_EU_country_vector %>% + unlist() %>% substr(., 1,2) %>% sort () %>% + unique () + ## The correspondence table only covers EU regions. + message ( "Not checking for regional label consistency in non-EU countries.\n", + "In this data frame not controlled countries: ", + paste (not_EU_country_vector, + collapse = ", "), " \n", + "with alltogether ", not_eu_observations, " observations/rows.") + } - all_regions <- labelled_by_other %>% - full_join ( eu_joined, - by = tidyselect::all_of(names(eu_joined))) - - if ( anyDuplicated(all_regions) > 0 ) { - stop("Joining error - there are duplicates in the data frame.") - } - - all_regions %>% - dplyr::arrange(., time, geo, code16 ) + ## Reorder columns for readability ------------------------------- -} - + tmp_left <- tmp2 %>% select ( geo, time, values, code13, code16, name ) + tmp_right <- tmp2 %>% select ( -geo, -code13, -code16, -time, -values, -name ) + cbind ( tmp_left, tmp_right) +} diff --git a/R/recode_to_nuts_2013.R b/R/recode_to_nuts_2013.R index 0daa26c4..becb4400 100644 --- a/R/recode_to_nuts_2013.R +++ b/R/recode_to_nuts_2013.R @@ -15,30 +15,32 @@ #' Furthermore, when the official name of the region changed, it will use #' the new name (if the otherwise the region boundary did not change.) 
#' If not called before, the function will use the helper function -#' \code{\link{check_nuts_2013}} and \code{\link{harmonize_geo_code}} +#' \code{\link{harmonize_geo_code}} #' @importFrom dplyr mutate filter rename arrange case_when #' @importFrom dplyr left_join inner_join anti_join -#' @importFrom tidyselect all_of #' @examples -#' \dontrun{ -#' eurostat::tgs00026 %>% -#' check_nuts2013() %>% -#' harmonize_geo_code() %>% -#' recode_to_nuts_2013() -#' -#' #If check_nuts2013() is not called, the function will call it. -#' eurostat::tgs00026 %>% -#' recode_to_nuts_2013() -#' } +#' test_regional_codes <- data.frame ( +#' geo = c("FRB", "FRE", "UKN02", "IE022", "FR243", "FRB03"), +#' time = c(rep(as.Date ("2014-01-01"), 5), as.Date("2015-01-01")), +#' values = c(1:6), +#' control = c("Changed from NUTS2 to NUTS1", +#' "New region NUTS2016 only", +#' "Discontinued region NUTS2013", +#' "Boundary shift NUTS2013", +#' "Recoded in NUTS2013", +#' "Recoded in NUTS2016" +#' )) +#' +#' recode_to_nuts_2013(test_regional_codes) #' @export recode_to_nuts_2013 <- function (dat) { . <- nuts_level <- geo <- code13 <- code16 <- time <- name <- NULL - type <- nuts_correspondence <- regional_changes_2016 <- NULL + type <- NULL regional_changes_2016 <- load_package_data(dataset = "regional_changes_2016") - nuts_correspondence <- load_package_data(dataset = "nuts_correspondence") + nuts_correspondence <- load_package_data(dataset = "nuts_correspondence") if ( ! all(c("change", "code16", "code13") %in% names (dat)) ) { tmp <- harmonize_geo_code(dat) @@ -50,17 +52,19 @@ recode_to_nuts_2013 <- function (dat) { tmp <- tmp %>% mutate ( geo = case_when ( - !is.na(geo) ~ geo, - change == "not in the EU" ~ geo, - TRUE ~ code13 + geo == code13 ~ geo, + change == "not in EU - not controlled" ~ geo, + TRUE ~ code13 )) if ( any (is.na(tmp$geo)) ) { - warning ( "The following regions have no geo labels:", + warning ( "The following regions have no NUTS2013 geo labels: ", tmp %>% - filter ( is.na(geo) && (nuts2016 = TRUE) ) %>% - select (geo) %>% - as.character() ) + filter ( is.na(geo) & (nuts2013 = TRUE) ) %>% + select (code16) %>% + unlist() %>% + unique() %>% + paste(., collapse = ", "), "." ) } @@ -74,12 +78,13 @@ recode_to_nuts_2013 <- function (dat) { regions_with_other_names <- tmp %>% anti_join ( regions_by_nuts2013_names, - by = tidyselect::all_of(names(tmp)) ) + by = names(tmp) ) rbind ( regions_by_nuts2013_names, regions_with_other_names ) %>% arrange ( time, geo, code16 ) %>% left_join ( nuts_correspondence, - by = c("code13", "code16", "nuts_level", "change", "name")) + by = c("code13", "code16", "nuts_level", + "change", "name", "resolution")) } diff --git a/R/recode_to_nuts_2016.R b/R/recode_to_nuts_2016.R index 044ec168..8b504742 100644 --- a/R/recode_to_nuts_2016.R +++ b/R/recode_to_nuts_2016.R @@ -15,30 +15,32 @@ #' Furthermore, when the official name of the region changed, it will use #' the new name (if the otherwise the region boundary did not change.) 
#' If not called before, the function will use the helper function -#' \code{\link{check_nuts_2013}} and \code{\link{harmonize_geo_code}} +#' \code{\link{harmonize_geo_code}} #' @importFrom dplyr mutate filter rename arrange case_when -#' @importFrom dplyr left_join inner_join anti_join semi_join -#' @importFrom tidyselect all_of +#' @importFrom dplyr left_join inner_join anti_join #' @examples -#' \dontrun{ -#' eurostat::tgs00026 %>% -#' check_nuts2013() %>% -#' harmonize_geo_code() %>% -#' recode_to_nuts_2016() -#' -#' #If check_nuts2013() is not called, the function will call it. -#' eurostat::tgs00026 %>% -#' recode_to_nuts_2016() -#' } +#' test_regional_codes <- data.frame ( +#' geo = c("FRB", "FRE", "UKN02", "IE022", "FR243", "FRB03"), +#' time = c(rep(as.Date ("2014-01-01"), 5), as.Date("2015-01-01")), +#' values = c(1:6), +#' control = c("Changed from NUTS2 to NUTS1", +#' "New region NUTS2016 only", +#' "Discontinued region NUTS2013", +#' "Boundary shift NUTS2013", +#' "Recoded in NUTS2013", +#' "Recoded in NUTS2016" +#' )) +#' +#' recode_to_nuts_2016(test_regional_codes) #' @export recode_to_nuts_2016 <- function (dat) { . <- nuts_level <- geo <- code13 <- code16 <- time <- name <- NULL - type <- nuts_correspondence <- regional_changes_2016 <- NULL + type <- NULL regional_changes_2016 <- load_package_data(dataset = "regional_changes_2016") - nuts_correspondence <- load_package_data(dataset = "nuts_correspondence") + nuts_correspondence <- load_package_data(dataset = "nuts_correspondence") if ( ! all(c("change", "code16", "code13") %in% names (dat)) ) { tmp <- harmonize_geo_code(dat) @@ -50,16 +52,19 @@ recode_to_nuts_2016 <- function (dat) { tmp <- tmp %>% mutate ( geo = case_when ( - !is.na(geo) ~ geo, - change == "not in the EU" ~ geo, + geo == code16 ~ geo, + change == "not in EU - not controlled" ~ geo, TRUE ~ code16 )) - if ( any (is.na(tmp$geo) && (nuts2016 = TRUE)) ) { - warning ( "The following regions have no geo labels:", + if ( any (is.na(tmp$geo)) ) { + warning ( "The following regions have no NUTS2016 labels: ", tmp %>% - filter ( is.na(geo) && (nuts2016 = TRUE) ) %>% - as.character(geo) ) + filter ( is.na(geo) & (nuts2013 = TRUE) ) %>% + select (code13) %>% + unlist() %>% + unique() %>% + paste(., collapse = ", "), ".") } @@ -73,12 +78,13 @@ recode_to_nuts_2016 <- function (dat) { regions_with_other_names <- tmp %>% anti_join ( regions_by_nuts2016_names, - by = tidyselect::all_of(names(tmp)) ) + by = names(tmp) ) rbind ( regions_by_nuts2016_names, regions_with_other_names ) %>% arrange ( time, geo, code16 ) %>% left_join ( nuts_correspondence, - by = c("code13", "code16", "nuts_level", "change", "name")) + by = c("code13", "code16", "nuts_level", + "change", "name", "resolution")) } diff --git a/data-raw/nuts_coding.R b/data-raw/nuts_coding.R index c5b7d843..7465ea89 100644 --- a/data-raw/nuts_coding.R +++ b/data-raw/nuts_coding.R @@ -9,7 +9,7 @@ tf <- tempfile(fileext = ".xlsx") download.file(url = 'https://ec.europa.eu/eurostat/documents/345175/629341/NUTS2013-NUTS2016.xlsx', destfile = tf, mode = 'wb' ) -regions <- readxl::read_excel( tf, +regional_changes_2016 <- readxl::read_excel( tf, sheet = 'NUTS2013-NUTS2016', skip = 1, col_names = T) %>% select (1:12) %>% @@ -28,20 +28,18 @@ regions <- readxl::read_excel( tf, nuts1_correspondence <- readxl::read_excel( tf, sheet = 'Correspondence NUTS-1', - #file.path('data-raw', 'NUTS2013-NUTS2016.xlsx'), - #file.path('.', 'NUTS2013-NUTS2016.xlsx'), - file.path(tf), - sheet = 'Correspondence NUTS-1', skip = 0 , col_names = 
T) %>% purrr::set_names ( ., c("code13", "code16", "name", "change", "resolution")) %>% mutate_if ( is.factor, as.character ) %>% - mutate ( nuts_level = 1 ) + mutate ( nuts_level = 1 ) %>% + filter ( name != 'Centre-Est') # appears to be a duplicate and incorrect row, given that FR7 is also marked as recoded to FRK + +warning ( "FR7 - Centre-Est appears to be an erroneous line and it is removed from the correspondence table.") nuts2_correspondence <- readxl::read_excel( tf, sheet = 'Correspondence NUTS-2', - sheet = 'Correspondence NUTS-2', skip = 0 , col_names = T) %>% select ( 1:5 ) %>% purrr::set_names ( ., c("code13", "code16", @@ -50,12 +48,24 @@ nuts2_correspondence <- readxl::read_excel( filter ( is.na(code13) + is.na(code16) < 2) %>% mutate ( nuts_level = 2 ) +nuts3_correspondence <- readxl::read_excel( + tf, sheet = 'Correspondence NUTS-3', + skip = 0 , col_names = T) %>% + select ( 1:5 ) %>% + purrr::set_names ( ., c("code13", "code16", + "name", + "change", "resolution")) %>% + filter ( is.na(code13) + is.na(code16) < 2) %>% + mutate ( nuts_level = 3 ) + + nuts_correspondence <- rbind ( nuts1_correspondence, nuts2_correspondence ) %>% + rbind ( nuts3_correspondence ) %>% select ( code13, code16, name, nuts_level, change, resolution ) -nuts_2016_codes <- unique (regions$code16) ##In these cases, the code13 == code16 ------------------------------ unchanged_regions <- regions %>% @@ -72,6 +82,79 @@ changed_regions <- regions %>% fill ( nuts2_name ) %>% select ( code13, code16, name, nuts_level, change ) +nuts_2016_codes <- unique (regional_changes_2016$code16)[!is.na(regional_changes_2016$code16)] +nuts_2013_codes <- unique (regional_changes_2016$code13)[!is.na(regional_changes_2016$code13)] +all_region_codes <- unique(c( nuts_2016_codes, nuts_2013_codes)) + +changed_region_codes <- all_region_codes [! all_region_codes %in% unchanged_regions$code16 ] +changed_region_codes <- sort(changed_region_codes [ !is.na(changed_region_codes)]) + +regions_in_correspondence <- unique(c(nuts_correspondence$code13, nuts_correspondence$code16)) +regions_in_correspondence <- sort(regions_in_correspondence [!is.na(regions_in_correspondence)]) + +if ( length( + changed_region_codes[ ! changed_region_codes %in% regions_in_correspondence] +) > 0 ) { + message ("Problem with the following regional geo labels:") + message ( changed_region_codes[ ! changed_region_codes %in% regions_in_correspondence] ) + stop ("They cannot be found in the correspondence table") +} + + +## Consistency check ---------------------------------------- +## The name field is inconsistent in two sheets, at least FR7 is not consistent +regions_in_correspondence <- regions_in_correspondence[ !is.na(regions_in_correspondence)] + +nuts2013_in_changed <- unique(changed_regions$code13) +nuts2013_in_changed <- nuts2013_in_changed[!is.na(nuts2013_in_changed)] + +nuts2016_in_changed <- unique(changed_regions$code16) +nuts2016_in_changed <- nuts2016_in_changed[!is.na(nuts2016_in_changed)] + +all ( nuts2013_in_changed %in% regions_in_correspondence) +all ( nuts2016_in_changed %in% regions_in_correspondence) + +nuts2013_in_changed [! nuts2013_in_changed %in% regions_in_correspondence ] +nuts2016_in_changed [!
nuts2016_in_changed %in% regions_in_correspondence ] + +## Consistency II ---------------------------------------------------- + +all_nuts_codes <- unique(c(nuts_2013_codes, nuts_2016_codes)) + +only_in_correspondence <- regions_in_correspondence [regions_in_correspondence %in% all_nuts_codes] + +only_13 <- nuts_correspondence %>% + filter ( code13 %in% only_in_correspondence ) + +only_16 <- nuts_correspondence %>% + filter ( code16 %in% only_in_correspondence ) + +only <- full_join ( only_13, only_16) # they are unique + + +## Changed regions to be looked up by their NUTS2016 codes ----------- +regional_changes_by_2016 <- nuts_correspondence %>% + mutate ( geo = code16 ) %>% + filter ( !is.na(code16) ) + +## adding those that have no equivalent in the previous group +## some regions have to be identified by their old and new codes ----- +regional_changes_by_2013 <- nuts_correspondence %>% + mutate ( geo = code13 ) %>% + filter ( !is.na(code13) ) + +## Region can be found by new or old NUTS code ----------------------- + +all_regional_changes <- regional_changes_by_2016 %>% + full_join ( regional_changes_by_2013, + by = c("code13", "code16", "name", "nuts_level", + "change", "resolution", "geo")) + + +all_regional_changes %>% + add_count ( code13, code16, name, nuts_level, change, resolution, geo ) %>% + filter ( n > 1 ) + ## Regional changes ------------------------------------------------ regional_changes_2016 <- rbind ( changed_regions, unchanged_regions ) diff --git a/data/nuts_correspondence.rda b/data/nuts_correspondence.rda index 1dbe8bb4..fe2c146f 100644 Binary files a/data/nuts_correspondence.rda and b/data/nuts_correspondence.rda differ diff --git a/man/check_nuts_2013.Rd b/man/check_nuts_2013.Rd deleted file mode 100644 index 382692df..00000000 --- a/man/check_nuts_2013.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/check_nuts_2013.R -\name{check_nuts_2013} -\alias{check_nuts_2013} -\title{Check NUTS region codes that changed with the \code{NUTS2016} definition} -\usage{ -check_nuts_2013(dat) -} -\arguments{ -\item{dat}{A Eurostat data frame downloaded with \code{\link{get_eurostat}}} -} -\value{ -An augmented data frame or a message about potential coding -errors. For filtering, it marks \code{'non_EU'} and \code{'unchanged'} -regions. Observations with codes ending on \code{'ZZ'} or \code{'XX'} are -removed from the returned data table, because these are non-territorial -observations or they are outside of the EU. -} -\description{ -Eurostat mixes \code{NUTS2013} and \code{NUTS2016} geographic -label codes in the \code{'geo'} column, which creates time-wise comparativity issues. -This function checks if you data is affected by this problem and gives -information on what to do. 
-} -\examples{ - \dontrun{ - dat <- eurostat::tgs00026 - check_nuts_2013(dat) - } -} -\author{ -Daniel Antal -} diff --git a/man/harmonize_geo_code.Rd b/man/harmonize_geo_code.Rd index 6443db7e..f0666c31 100644 --- a/man/harmonize_geo_code.Rd +++ b/man/harmonize_geo_code.Rd @@ -2,38 +2,29 @@ % Please edit documentation in R/harmonize_geo_code.R \name{harmonize_geo_code} \alias{harmonize_geo_code} -\title{Recode geo labels from NUTS2013 to NUTS2016} +\title{Harmonize NUTS region codes that changed with the \code{NUTS2016} definition} \usage{ harmonize_geo_code(dat) } \arguments{ -\item{dat}{A Eurostat data frame downloaded with \code{\link{get_eurostat}}.} +\item{dat}{A Eurostat data frame downloaded with \code{\link{get_eurostat}}} } \value{ -An augmented and potentially relabelled data frame which -contains all formerly \code{'NUTS2013'} definition geo labels in the -\code{'NUTS2016'} vocabulary when only the code changed, but the -boundary did not. It also contains some information on other geo labels -that cannot be brought to the current \code{'NUTS2016'} definition. -If not called before, the function will use the helper function - \code{\link{check_nuts_2013}} + +An augmented data frame that explains potential problems and +possible solutions. + } \description{ -Eurostat mixes NUTS2013 and NUTS2016 geographic label codes -in the \code{'geo'} column, which creates time-wise comparativity issues. -This function recodes the observations where only the coding changed, and -marks discontinued regions, and other regions which may or may not be -somehow compared to current \code{'NUTS2016'} boundaries. +Eurostat mixes \code{NUTS2013} and \code{NUTS2016} geographic +label codes in the \code{'geo'} column, which creates time-wise comparativity issues. +This function checks if you data is affected by this problem and gives +information on what to do. } \examples{ \dontrun{ - eurostat::tgs00026 \%>\% - check_nuts_2013() \%>\% - harmonize_geo_code() - - #If check_nuts_2013() is not called, the function will call it. - eurostat::tgs00026 - harmonize_geo_code(dat) + dat <- eurostat::tgs00026 + harmonize_geo_code(dat) } } \author{ diff --git a/man/recode_to_nuts_2013.Rd b/man/recode_to_nuts_2013.Rd index e556d80a..9561ecef 100644 --- a/man/recode_to_nuts_2013.Rd +++ b/man/recode_to_nuts_2013.Rd @@ -19,7 +19,7 @@ that cannot be brought to the current \code{'NUTS2013'} definition. Furthermore, when the official name of the region changed, it will use the new name (if the otherwise the region boundary did not change.) If not called before, the function will use the helper function - \code{\link{check_nuts_2013}} and \code{\link{harmonize_geo_code}} +\code{\link{harmonize_geo_code}} } \description{ Eurostat mixes NUTS2013 and NUTS2016 geographic label codes @@ -29,16 +29,19 @@ marks discontinued regions, and other regions which may or may not be somehow compared to the historic \code{'NUTS2013'} boundaries. } \examples{ - \dontrun{ - eurostat::tgs00026 \%>\% - check_nuts2013() \%>\% - harmonize_geo_code() \%>\% - recode_to_nuts_2013() - - #If check_nuts2013() is not called, the function will call it. 
- eurostat::tgs00026 \%>\% - recode_to_nuts_2013() - } +test_regional_codes <- data.frame ( + geo = c("FRB", "FRE", "UKN02", "IE022", "FR243", "FRB03"), + time = c(rep(as.Date ("2014-01-01"), 5), as.Date("2015-01-01")), + values = c(1:6), + control = c("Changed from NUTS2 to NUTS1", + "New region NUTS2016 only", + "Discontinued region NUTS2013", + "Boundary shift NUTS2013", + "Recoded in NUTS2013", + "Recoded in NUTS2016" + )) + +recode_to_nuts_2013(test_regional_codes) } \author{ Daniel Antal diff --git a/man/recode_to_nuts_2016.Rd b/man/recode_to_nuts_2016.Rd index 41ef02aa..d4fd0364 100644 --- a/man/recode_to_nuts_2016.Rd +++ b/man/recode_to_nuts_2016.Rd @@ -19,7 +19,7 @@ that cannot be brought to the current \code{'NUTS2016'} definition. Furthermore, when the official name of the region changed, it will use the new name (if the otherwise the region boundary did not change.) If not called before, the function will use the helper function - \code{\link{check_nuts_2013}} and \code{\link{harmonize_geo_code}} +\code{\link{harmonize_geo_code}} } \description{ Eurostat mixes NUTS2013 and NUTS2016 geographic label codes @@ -29,16 +29,19 @@ marks discontinued regions, and other regions which may or may not be somehow compared to current \code{'NUTS2016'} boundaries. } \examples{ - \dontrun{ - eurostat::tgs00026 \%>\% - check_nuts2013() \%>\% - harmonize_geo_code() \%>\% - recode_to_nuts_2016() - - #If check_nuts2013() is not called, the function will call it. - eurostat::tgs00026 \%>\% - recode_to_nuts_2016() - } +test_regional_codes <- data.frame ( + geo = c("FRB", "FRE", "UKN02", "IE022", "FR243", "FRB03"), + time = c(rep(as.Date ("2014-01-01"), 5), as.Date("2015-01-01")), + values = c(1:6), + control = c("Changed from NUTS2 to NUTS1", + "New region NUTS2016 only", + "Discontinued region NUTS2013", + "Boundary shift NUTS2013", + "Recoded in NUTS2013", + "Recoded in NUTS2016" + )) + +recode_to_nuts_2016(test_regional_codes) } \author{ Daniel Antal diff --git a/tests/testthat/test_regional.R b/tests/testthat/test_regional.R new file mode 100644 index 00000000..8ae5d5b4 --- /dev/null +++ b/tests/testthat/test_regional.R @@ -0,0 +1,59 @@ +context ("Regional code harmonization") + + +test_regional_codes <- data.frame ( + geo = c("FRB", "FRE", "UKN02", "IE022", "FR243", "FRB03"), + time = c(rep(as.Date ("2014-01-01"), 5), as.Date("2015-01-01")), + values = c(1:6), + control = c("Changed from NUTS2 to NUTS1", + "New region NUTS2016 only", + "Discontinued region NUTS2013", + "Boundary shift NUTS2013", + "Recoded in NUTS2013", + "Recoded in NUTS2016" + )) + +test_harmonized <- harmonize_geo_code(test_regional_codes) + +try_recode_2013 <- recode_to_nuts_2013(test_harmonized) + +try_recode_2016 <- recode_to_nuts_2016(test_harmonized) + +lookup_code16 <- test_harmonized %>% + filter ( geo == "FR243") %>% + select ( code16 ) %>% unlist() %>% as.character() + +lookup_code13 <- test_harmonized %>% + filter ( geo == "FRB03") %>% + select ( code13 ) %>% unlist() %>% as.character() + +recode_frb <- try_recode_2013 %>% + filter ( code16 == "FRB") %>% + select ( geo ) %>% unlist() %>% as.character() + +recode_ukn02 <- try_recode_2016 %>% + filter ( code13 == "UKN02") %>% + select ( geo ) %>% unlist() %>% as.character() + + + +test_that("Recoding gives correct results",{ + skip_on_cran() + skip_on_travis() + expect_equal( lookup_code16, + "FRB03" + ) + expect_equal( lookup_code13, + "FR243" + ) + expect_equal( lookup_code13, + "FR243" + ) + expect_equal( recode_frb, + NA_character_ + ) + expect_equal( 
recode_ukn02, + NA_character_ + ) + +}) diff --git a/vignettes/website/regional_data.Rmd b/vignettes/website/regional_data.Rmd new file mode 100644 index 00000000..4284bc30 --- /dev/null +++ b/vignettes/website/regional_data.Rmd @@ -0,0 +1,284 @@ +--- +title: "Regional data examples for the eurostat R package" +date: "`r Sys.Date()`" +output: + rmarkdown::html_vignette: + toc: true +--- + +# R Tools for Eurostat Open Data + +This [rOpenGov](http://ropengov.github.io) R package provides tools to access the [Eurostat database](http://ec.europa.eu/eurostat/data/database), which you can also browse on-line for the data sets and documentation. For contact information and source code, see the [package website](http://ropengov.github.io/eurostat/). + + +See the eurostat package vignette for installation and basic use. + +```{r, echo=FALSE, message=FALSE} +library(eurostat) +library(dplyr) +library(tibble) +``` + +## Motivation + +Working with regional data has many advantages and many challenges. I had three aims when creating this article: + +- to highlight how you can use existing eurostat functions to work with Eurostat's regional products; +- to create simple helper functions and guidance on more complex data manipulations to improve the quality of the raw regional data; +- to start a dialogue on improving the data products of Eurostat. + +The advantage over national data lies in the homogeneity of the units and in their larger number, which enables us to better understand social and economic differences. National boundaries, i.e. NUTS0 regions, are historical and political constructions. They vary greatly in size and complexity. Within the EU, Germany and Malta are equally NUTS0 regions or countries, although Malta’s size would make it a small NUTS3 region in Germany. Comparing Germany with Malta hides a huge diversity within Germany. + +Statistical regions are largely homogeneous in size and in urban complexity. The smallest units, NUTS3 regions, are cities or towns with their rural hinterland; most people can be expected to go to school or work within such a region. Malta itself is the size of a NUTS3 region, so it is most meaningfully compared with the NUTS3 regions of Germany. NUTS1 units are usually provinces of larger countries, such as Bavaria in Germany. NUTS2 units usually comprise several NUTS3 units within a larger NUTS1 region. + +The smallest member states are the size of NUTS2 and NUTS3 regions and can best be compared with the similarly sized regions of Europe. Slightly larger member states like Slovakia are NUTS1 regions, and they can best be compared with the other NUTS1 regions of Europe: Bavaria and Slovakia make a more meaningful comparison in many cases than Germany and Slovakia. +There are several difficulties in working with sub-national data. These are related to data availability, changes in boundaries, and data and metadata quality. + +### Boundary changes + +Unlike national boundaries, regional boundaries change very often. Since the EU standardized the NUTS regions in 2003, boundary changes have been made on average every three years. Boundary changes make organizing data panels (several time instances of the cross-sectional regional data) very tedious. + +### Data availability and quality + +Data availability means that many statistical products are only available at the NUTS0 (country) level. The creation of NUTS1-NUTS3 statistics is usually slow and the data product range is narrower at these levels.
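+
+The NUTS level of an observation can be read off the length of its geo code: two characters identify the country (NUTS0), and every further character adds one level. A small illustration (the chunk name and the `nchar()` shortcut are mine, not part of the package API) of how to see which levels a downloaded table actually mixes:
+
+```{r nutslevels}
+# Count how many observations fall on each NUTS level; for a NUTS2
+# product such as tgs00026 this should be (mostly) level 2.
+eurostat::tgs00026 %>%
+  mutate ( nuts_level = nchar(as.character(geo)) - 2 ) %>%
+  count ( nuts_level )
+```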
+ +NUTS-level data is often disaggregated with the use of various estimations from higher levels. While some original data sources, such as population or mortality data, are available at NUTS3 level (or at an even higher geographical resolution, i.e. a lower level of aggregation), many economic activities are difficult to attach to a single place, so their geographical disaggregation can only be estimated. For example, since GDP is mainly produced in companies, and many companies work in several locations across municipal and regional borders, locating their contribution to GDP is the result of a more or less precise estimation. + +Pan-European surveys are very important data sources for many social data products, but they are often created with the use of nationally representative samples. Even if they contain regional coding, and they can be re-arranged into regional statistics, the results are of lower quality, as the original survey sample is not representative for each and every NUTS2 or NUTS3 region of Germany, for example. (Of course, since Malta is a NUTS2 region, survey data from Malta is representative on NUTS2 = NUTS1 = NUTS0 level.) In practice this means that many statistical products of Eurostat are mixed products, i.e. they contain NUTS1 level data for larger member states, such as Germany, France or Italy, and NUTS2 level data for other member states. + +One problem with Eurostat's data products is that Eurostat has no legal mandate to force national statistical offices to create consistent datasets. Sometimes data 'goes missing' because the national statistical office, which is responsible for the quality and validity of the data, does not recode the historical data with the new geographic label definitions. + +### Metadata quality + +Finally, the metadata quality of Eurostat’s regional products is not as good as at the NUTS0 national level. A particularly problematic issue is that Eurostat’s tables do not differentiate between the current NUTS2016 regional boundaries and the NUTS2013 or NUTS2010 boundaries. Some data tables contain rows that cannot and must not be compared. For example, France went through a very thorough change of its regional boundaries, meaning that NUTS2013 regional data from 2013 can only be compared with NUTS2016 data from 2016 or 2018 for a very small fraction of the country. + +You can download the correspondence table in Excel. + +```{r download, eval=FALSE} +# download to a temporary file +tf <- tempfile(fileext = ".xlsx") +download.file(url = 'https://ec.europa.eu/eurostat/documents/345175/629341/NUTS2013-NUTS2016.xlsx', destfile = tf, mode = 'wb' ) +``` + +The correspondence table(s) are not tidy, and they are in several sheets which are not fully consistent. The French region `FR7`, or Centre-Est, is marked as `discontinued` in the sheet `Correspondence NUTS-1` and at the same time as `relabelled and recoded` to `FRK`, or Auvergne-Rhône-Alpes. We believe that the latter case is correct and use only this row in the correspondence table to avoid duplications in joining. + +Furthermore, Eurostat has a very problematic practice of simply removing statistical products when metadata definitions change. So, you may have downloaded industry-level data with the NACE Rev 2 definition or French regional data with the NUTS 2013 definition, but under the same title, you will be downloading a differently defined dataset in 2020.
Or, you will not be able to reproduce your code, because the data following your earlier definition will have been removed. While it is clear that Eurostat cannot take care of boundary changes if the responsible national statistical offices fail to do this, removing the history of data products makes it impossible, in some cases, to validate professional and academic work made with such data. + +The logical workflow is the following: + +- understand how different parts of your data are affected by the problem, particularly if you want to join different data sets, such as GDP with population; +- correct metadata (labelling) errors; +- impute additive data based on the correspondence table; +- impute non-additive data from larger territorial units; +- optionally estimate non-additive boundary change effects. + +Instead of creating only the correction functions, I added a few further steps down the road, because if you work with different datasets, not to mention different data sources, the problem may be different in each dataset that you try to join. + +# Taking care of boundaries + +Most regional statistical products are made on the NUTS2 level, or they are mixed NUTS1-NUTS2 level statistics. This means that usually you have 150-300 units to compare, which gives an unprecedented richness in cross-sectional analysis. Most US or Australian datasets are not so detailed in cross-section, and data availability in the rest of the world is just lower. + +The power of statistical analysis can be increased when you order such data into panels, because the changes over a time interval in this huge cross-section usually contain a lot more information about the underlying social or economic process. However, organizing panels – or just simple time series of an individual region – is often hindered by changes in regional boundaries. + +A simple strategy is to create a _panel of only those data that do not change boundaries_. However, if you have many variables, this leads very quickly to a huge loss of data, because missing data is often independent of boundary changes. With the addition of each new variable you are likely to lose more and more rows of observations. + +Keeping track of the changes is a much better strategy, and up to a point it is costless in terms of work, because often _only the metadata is changing_. When member states change the boundary of only two regions, they will nevertheless create new regional codes for all their regions, to make sure that regional labels do not mix. However, Eurostat does not follow this practice well, and it mixes up different labels. + +With the new helper function `harmonize_geo_code()` you can see if your geo label codes are affected by these changes, and you get a first view on how you can continue your work. + +```{r checknuts2013} +eurostat::tgs00026 %>% + filter ( time == 2012 ) %>% + harmonize_geo_code() +``` + +Zooming in on the `UKM` regions you can see that `UKM5` and `UKM6` are unchanged, `UKM3` gave birth to two new regional units `UKM8` and `UKM9` (this is an additive change) and `UKM2` lost a NUTS3 unit, `UKM24`. This latter one is also an additive change, but it may be far more difficult to handle in practice, because data about `UKM24` may not be available in most cases, as data published at NUTS1 and NUTS2 level is only available on NUTS3 level for a very few basic indicators. You can, however, easily maintain backward compatibility among `UKM3`, `UKM8` and `UKM9`, because the new data is simply available at a higher resolution, in other words, for the two halves of the earlier `UKM3` region.
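+
+A minimal sketch of such an aggregation, assuming an additive indicator (`tgs00026` is used here only to show the mechanics, which is why the chunk is not evaluated):
+
+```{r backfillUKM3, eval=FALSE}
+# Rebuild a comparable UKM3 value from its NUTS2016 successors UKM8 and
+# UKM9 for each year, so that it can be compared with earlier
+# observations that were coded as UKM3.
+eurostat::tgs00026 %>%
+  harmonize_geo_code() %>%
+  filter ( code16 %in% c("UKM8", "UKM9") ) %>%
+  group_by ( time ) %>%
+  summarise ( geo = "UKM3", values = sum(values) )
+```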
 + +```{r checknutsUK} +# for readability the previous example is filtered and reduced +eurostat::tgs00026 %>% + filter ( time == 2012 ) %>% + harmonize_geo_code() %>% + filter ( grepl("UKM", geo) ) %>% + select ( geo, values, change ) +``` + +For easier filtering in further use, two logical variables are added to the data frame, `nuts_2013` and `nuts_2016`. Many datasets contain non-EU regions that are not covered in the Eurostat correspondence tables; they can be filtered with `nuts_2013 == FALSE & nuts_2016 == FALSE`. + +The following example will filter out all rows that use a geo code which is defined in NUTS2013 and cannot be found in NUTS2016. These are the main sources of incompatibility in your data panel. + +```{r filterdifference} +eurostat::tgs00026 %>% + filter ( time == 2012 ) %>% + harmonize_geo_code() %>% + filter ( nuts_2013, ! nuts_2016 ) +``` + +## Recoding needed: only the metadata changed + +The first, logical step is to find those data points which are in fact identical; only their regional codes have changed. For example, `FRC1` is in fact identical to the region with the NUTS2013 label `FR26` (the Bourgogne region in France). In this case, you can simply re-label the regions that appear to be different just because of the different codes applied. + +The helper function `harmonize_geo_code()` will assist you with these cases. + +To make the example clearer, let's zoom in on the changes in France. You can see that many regions changed, but some of them only changed labels. For forward compatibility, `harmonize_geo_code()` changed all geo labels to the current `NUTS2016` definition. In fact, this is needed to use maps, for example. + +```{r harmonizeFR} +# for readability the previous example is filtered and reduced +eurostat::tgs00026 %>% + filter ( time == 2012 ) %>% + harmonize_geo_code() %>% + filter ( grepl("FR", geo) ) %>% + select ( geo, code13, code16, change, values ) +``` + +In the change log, `recoded` means that the geo code was changed in the transition to NUTS2016; `recoded and relabelled` means that not only the code but also the official name of the region changed. + +You can decide which coding you prefer to use. Be careful to use consistent map definitions if you visualize your work: you can add the NUTS2013-labelled data to a map that contains the NUTS2013 boundary definitions. + +For comparing with additional data sources, it may be useful to make sure that you use the current name of the region. The function `recode_to_nuts_2016()` changes the name column to the NUTS2016 definition, when applicable, and `recode_to_nuts_2013()` will use the earlier definition. + +```{r convertFR} +# for readability the previous example is filtered and reduced +eurostat::tgs00026 %>% + filter ( time == 2012 ) %>% + recode_to_nuts_2016() %>% + filter ( grepl("FR", geo) ) %>% + select ( geo, name, code16, change, resolution, values ) +``` + +Another useful filter is `change == "not in EU - not controlled"`. The non-EU member state region definitions (and their possible changes) are not covered in the Eurostat correspondence table. + +```{r convertfilter} +# for readability the previous example is filtered and reduced +eurostat::tgs00026 %>% + filter ( time == 2012 ) %>% + recode_to_nuts_2016() %>% + filter ( ! nuts_2013, !
nuts_2016 ) +``` + +You may need to review these manually, and if you have a problem with the boundaries, refer to the national statistical authorities of these non-EU countries. + +## Imputing to new boundaries with historical data + +Eurostat released an untidy Excel document that contains all boundary changes from the `NUTS2013` to the `NUTS2016` boundary definition. You can load tidied versions of these tables into your global environment with `data("nuts_correspondence")` and `data("regional_changes_2016")`, or simply reference them as `eurostat::nuts_correspondence` and `eurostat::regional_changes_2016`. (The `eurostat::` part can be omitted if you have called `library(eurostat)` earlier in your code.) + +Because NUTS3 level data is very scarce, we did not create a programmatic solution for filling in the new boundaries of NUTS2 regions. + +However, using this correspondence information, many NUTS1 regions can be filled in with historical data using simple equivalence or addition, provided that the NUTS2 data is present. + +```{r correspondence} +nuts_correspondence %>% + filter ( nuts_level == 1 ) %>% + select ( code16, resolution ) +``` + +For example, the new NUTS1 region `FRB` is simply the continuation of the earlier NUTS2 region `FR24`, and the new NUTS1 region `FRC` can be filled with historical data by simply adding the `FR26` and `FR43` NUTS2 observations. + +### Backfill to historical boundaries + +When applying the latest boundaries (and visualizing according to the current boundaries) is not important, it may be easier, or leave you with a larger panel of data, to use the correspondence information to backfill new NUTS2016 data into the NUTS2013 boundaries, simply because you have more data following the earlier definition. + +## Imputation strategies + +There are many imputation methodologies implemented in various R libraries (see the [CRAN Task View: Missing Data](https://cran.r-project.org/web/views/MissingData.html)). Be aware that most of these methods are not satisfactory for regional datasets. Whenever missingness is caused by boundary changes, it will certainly violate many imputation methods' assumptions; for example, many imputation strategies assume that missingness is random. Therefore, it is very important that you first align the boundaries and then apply imputation. + +Consider the following very simple, hypothetical example: + +```{r example1, echo=FALSE} +tibble ( regions =c("A02 - from 2015 in D1 greater region", + "B01 - from 2015 in D1 greater region", + "C1", + "D1 - from 2015 A02+B01"), + Y2014 = c(1,2,10,NA_real_), + Y2015 = c(rep(NA_real_, 4)), + Y2016 = c(rep(NA_real_,2), 10, 5)) + +``` + +How would you interpolate the missing 2015 data? In the case of region `C1`, there are no boundary changes, and the data seems constant. You would interpolate the value to be 10. + +However, in the case of the new `D1` region, we first reconstruct the sum of its smaller constituent regions, `A02` + `B01`, where we have historical data. If `D1` had been defined as a region in 2014, its value would have been 3, so the correct interpolation for 2015 is 4.
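+
+The same reasoning in code, using the hypothetical values from the table above (a trivial base R illustration, not a package function):
+
+```{r example1calc}
+# Reconstruct the 2014 value of the merged D1 region from its
+# constituents A02 + B01, then interpolate the missing 2015 value.
+d1_2014 <- 1 + 2                      # A02 + B01 under the old boundaries
+d1_2016 <- 5                          # observed under the new boundaries
+d1_2015 <- mean(c(d1_2014, d1_2016))  # linear interpolation gives 4
+d1_2015
+```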
+ +```{r example2, echo=FALSE} +tibble ( regions =c("A02 - from 2015 in D1 greater region", + "B01 - from 2015 in D1 greater region", + "C1 - 2015: interpolated", + "D1 - 2014: A02+B01"), + Y2014 = c(1,2,10,3), + Y2015 = c(rep(NA_real_, 2), 10,4), + Y2016 = c(rep(NA_real_,2), 10, 5)) + +``` + +You may still wonder if you should use the old boundary definitions, because they offer a higher resolution of data: the statistics are detailed for the constituent subregions `A02` and `B01` rather than for `D1` as a whole. + +```{r example3, echo=FALSE} +data.frame ( regions =c("A02 - extrapolated with D1 data", + "B01 - extrapolated with D1 data", + "C1 - 2015: interpolated", + "D1 - 2014: A02+B01"), + Y2014 = c(1,2,10,3), + Y2015 = c(1.5, 2.5, 10,4), + Y2016 = c(2,3, 10, 5)) + +``` + +There are a few things to keep in mind when you actually start to analyse the data. + +If you fill up your dataset to both the old and the new boundary definitions, your dataset _appears to be bigger_, but it _does not contain more information_. Keeping both `A02` and `B01` and `D1` in your panel duplicates the new `D1` region, which was formerly known as `A02` and `B01`. If you measure growth, you will overestimate average growth, because the high-growth region is duplicated in the dataset. You must remove either `A02` and `B01` or `D1` from your panel, otherwise you will skew the effect that you analyse towards `D1`. + +The use of the old boundaries makes sense if you have more data in the old definition prior to 2014. In this case, your dataset will contain fewer estimated values if you stick to the historical boundaries, extrapolate the discontinued `A02` and `B01` regions, and leave `D1` out of your models. + +The use of the new boundaries is preferable when you have more data after 2016. In this case, the switch to a lower geographical resolution (merging `A02` and `B01` into `D1`) is balanced by the fact that you have more recent and more factual data about the less detailed `D1` observation, and backfilling the `D1` data via reverse extrapolation is the better strategy. You should leave `A02` and `B01` out of your further analysis.
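+
+A hypothetical check for such overlaps (the `panel` object is a placeholder for your own harmonized data frame with a `geo` column, so the chunk is not evaluated): it flags NUTS2 rows whose parent NUTS1 code is also present in the same panel.
+
+```{r overlapcheck, eval=FALSE}
+# Rows returned here cover the same territory twice, once as a NUTS1
+# parent and once as its NUTS2 constituents; drop one of the two
+# representations before modelling.
+panel %>%
+  mutate ( nuts_level = nchar(as.character(geo)) - 2 ) %>%
+  filter ( nuts_level == 2,
+           substr(geo, 1, 3) %in% geo[nuts_level == 1] )
+```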
 + +# Suggestions for Eurostat + + +There are problems with Eurostat’s data products on two levels: with the data and with the metadata. + +The data problems affect the work of the national statistical authorities, because they are responsible for the creation, validation and, when necessary, the later correction of the data. Eurostat cannot change the data they submit; however, it can change the harmonization methodology and guidelines and, when necessary, initiate changes in statistical regulation. +I think that updating guidelines, and possibly even regulation, would not be controversial if member states were asked to provide the history of their statistics in cases where the content of the data did not change, only its metadata, i.e. the labelling. If a member state changed the boundaries of a region, it may or may not be possible to re-calculate the data for this region. However, when only the name and short code changed, the data points are there, and they should be included in the data products. + +Regarding metadata, Eurostat could improve its products without the involvement of member states. The current problem with the metadata of the regional statistics is that it is not tidy and not fully consistent. The variable column ‘geo’ in the statistical products in fact contains at least three different pieces of information: the level of aggregation, the label of the observation in the NUTS2013 definition, and the label of the observation in the NUTS2016 definition. Depending on what view you take on the contents of the table, this means that a seemingly single data table is in fact an unlabelled join of three tables: a national data table and two regional data tables following different regional boundaries. + +The addition of the NUTS (or, for non-EU countries, NUTS-equivalent) level would already remove a lot of confusion and several metadata errors. The source of the confusion is that many products claim to contain NUTS2 information, but they contain a mixture of NUTS0, NUTS1 and NUTS3 information. While the geo column can be easily filtered (by the number of characters of the geo code), this information is not known to all users. Adding the `nuts_level` variable, as we do, makes joining various data sources far easier and less confusing. + +Several ways could be found to add the information currently contained in the (otherwise not tidy) Correspondence Table to each regional product. This would require adding information about which NUTS definition the row (observation) in the dataset complies with. It could be done in several ways from a data presentation and organization point of view. At a minimum, the NUTS definition (vocabulary) in which the NUTS unit can be found should be added, and potentially, as our helper functions do, further information about conversion. + +A solution to the metadata presentation of the regional statistical products does not require the modification of statistical regulations (which must be adopted by the member states of the EU), and it is very urgent, because the next NUTS changes have already been announced. + +Finally, a non-controversial change, which may require updating guidelines or regulations, would be to add non-EU countries to the Correspondence tables, at least on a non-mandatory basis. It is very unlikely that EEA countries like Norway or potential candidate countries like North Macedonia would object to reporting their regional boundary changes in the Correspondence tables. This is a self-evident change, which is also necessary after Brexit, given that the United Kingdom’s boundary data will have to remain in the Correspondence tables. + +# Citations and related work + +### Citing the data sources + +Eurostat data: cite [Eurostat](http://ec.europa.eu/eurostat/). + +Administrative boundaries: cite EuroGeographics. + + +### Citing the eurostat R package + +For main developers and contributors, see the [package homepage](http://ropengov.github.io/eurostat). + +This work can be freely used, modified and distributed under the +BSD-2-clause (modified FreeBSD) license: + +```{r citation, message=FALSE, eval=TRUE, echo=TRUE} +citation("eurostat") +``` + + +### Contact + +For contact information, see the [package homepage](http://ropengov.github.io/eurostat). + + +# Version info + +This tutorial was created with: + +```{r sessioninfo, message=FALSE, warning=FALSE} +sessionInfo() +```