From 6e5d65fec61b8705f0f63b0bb4c00104a2f78909 Mon Sep 17 00:00:00 2001
From: Daniel Antal <antaldaniel@users.noreply.github.com>
Date: Sat, 8 Feb 2020 22:25:58 +0100
Subject: [PATCH 01/11] After testing the functions on almost all Eurostat
 regional statistics, I found a weird error caused by a mistake in the
 Eurostat correspondence table. After this I also started creating a testthat
 infrastructure and many internal consistency checks.

---
 data-raw/nuts_coding.R       |  99 ++++++++++++++++++++++++++++++++---
 data/nuts_correspondence.rda | Bin 2331 -> 5254 bytes
 2 files changed, 91 insertions(+), 8 deletions(-)

diff --git a/data-raw/nuts_coding.R b/data-raw/nuts_coding.R
index c5b7d843..7465ea89 100644
--- a/data-raw/nuts_coding.R
+++ b/data-raw/nuts_coding.R
@@ -9,7 +9,7 @@ tf <- tempfile(fileext = ".xlsx")
 download.file(url = 'https://ec.europa.eu/eurostat/documents/345175/629341/NUTS2013-NUTS2016.xlsx', 
               destfile = tf,  mode = 'wb'  )
 
-regions <- readxl::read_excel( tf,
+regional_changes_2016 <- readxl::read_excel( tf,
                    sheet = 'NUTS2013-NUTS2016', 
                    skip = 1, col_names = T) %>%
   select (1:12) %>%
@@ -28,20 +28,18 @@ regions <- readxl::read_excel( tf,
 
 nuts1_correspondence <- readxl::read_excel( 
   tf, sheet = 'Correspondence NUTS-1', 
-  #file.path('data-raw', 'NUTS2013-NUTS2016.xlsx'),
-  #file.path('.', 'NUTS2013-NUTS2016.xlsx'),
-  file.path(tf),    
-  sheet = 'Correspondence NUTS-1', 
   skip = 0 , col_names = T) %>%
   purrr::set_names ( ., c("code13", "code16", 
                            "name", 
                            "change", "resolution")) %>%
   mutate_if ( is.factor, as.character ) %>%
-  mutate ( nuts_level = 1 )
+  mutate ( nuts_level = 1 ) %>%
+  filter ( name != 'Centre-Est') # appears to be a duplicate and incorrect row, given that FR7 is also marked as recoded to FRK
+
+warning ( "FR7 - Centre-Est appears to be an errorneous line and it is removed from the correspondence table.")
 
 nuts2_correspondence <- readxl::read_excel(
   tf, sheet = 'Correspondence NUTS-2',   
-  sheet = 'Correspondence NUTS-2', 
   skip = 0 , col_names = T) %>%
   select ( 1:5 ) %>%
   purrr::set_names ( ., c("code13", "code16", 
@@ -50,12 +48,24 @@ nuts2_correspondence <- readxl::read_excel(
   filter ( is.na(code13) + is.na(code16) < 2) %>%
   mutate ( nuts_level = 2 )
 
+nuts3_correspondence <- readxl::read_excel(
+  tf, sheet = 'Correspondence NUTS-3',   
+  skip = 0 , col_names = T) %>%
+  select ( 1:5 ) %>%
+  purrr::set_names ( ., c("code13", "code16", 
+                          "name",
+                          "change", "resolution")) %>%
+  filter ( is.na(code13) + is.na(code16) < 2) %>%
+  mutate ( nuts_level = 2 )
+
+
 nuts_correspondence <- rbind ( 
   nuts1_correspondence, 
   nuts2_correspondence ) %>%
+  rbind ( nuts3_correspondence ) %>%
   select ( code13, code16, name, nuts_level, change, resolution )
 
-nuts_2016_codes <- unique (regions$code16)
+nuts_2016_codes <- unique (regional_changes_2016$code16)
 
 ##In these cases, the code13 == code16 ------------------------------
 unchanged_regions <- regions %>%
@@ -72,6 +82,79 @@ changed_regions <- regions %>%
   fill ( nuts2_name ) %>%
   select ( code13, code16, name, nuts_level, change )
 
+nuts_2016_codes <- unique (regional_changes_2016$code16)[!is.na(regional_changes_2016$code16)]
+nuts_2013_codes <- unique (regional_changes_2016$code13)[!is.na(regional_changes_2016$code13)]
+all_region_codes <- unique(c( nuts_2016_codes, nuts_2013_codes))
+
+changed_region_codes <- all_region_codes [! all_region_codes %in% unchanged_regions$code16 ]
+changed_region_codes <- sort(changed_region_codes [ !is.na(changed_region_codes)])
+
+regions_in_correspondence <- unique(c(nuts_correspondence$code13, nuts_correspondence$code16))
+regions_in_correspondence <- sort(regions_in_correspondence [!is.na(regions_in_correspondence)])
+
+if ( length(
+  changed_region_codes[ ! changed_region_codes %in% regions_in_correspondence]
+) > 0 ) {
+  message ("Problem with the following regional geo labels:")
+  message ( changed_region_codes[ ! changed_region_codes %in% regions_in_correspondence] )
+  stop ("They cannot be found in the correspondence table")
+}
+
+
+## Consistency check ----------------------------------------
+## The name field is inconsistent in two sheets, at least FR7 is not consistent
+regions_in_correspondence <- regions_in_correspondence[ !is.na(regions_in_correspondence)]
+
+nuts2013_in_changed <- unique(changed_regions$code13)
+nuts2013_in_changed <- nuts2013_in_changed[!is.na(nuts2013_in_changed)]
+
+nuts2016_in_changed <- unique(changed_regions$code16)
+nuts2016_in_changed <- nuts2016_in_changed[!is.na(nuts2016_in_changed)]
+
+all ( nuts2013_in_changed  %in% regions_in_correspondence)
+all ( nuts2016_in_changed  %in% regions_in_correspondence)
+
+nuts2013_in_changed [! nuts2013_in_changed  %in% regions_in_correspondence ]  
+nuts2016_in_changed [! nuts2016_in_changed  %in% regions_in_correspondence ]  
+
+## Consistency II ----------------------------------------------------
+
+all_nuts_codes <- unique(c(nuts_2013_codes, nuts_2016_codes))
+
+only_in_correspondence <- regions_in_correspondence [regions_in_correspondence %in% all_nuts_codes]
+
+only_13 <- nuts_correspondence %>%
+  filter ( code13 %in% only_in_correspondence )
+
+only_16 <- nuts_correspondence %>%
+  filter ( code16 %in% only_in_correspondence )
+
+only <- full_join ( only_13, only_16) # they are unique
+
+
+## Changed regions to be looked up by their NUTS2016 codes -----------
+regional_changes_by_2016 <- nuts_correspondence %>%
+  mutate ( geo = code16 ) %>% 
+  filter ( !is.na(code16) )
+
+## adding those that have no equivalent in the previous group
+## some regions have to be identified by their old and new codes -----
+regional_changes_by_2013 <- nuts_correspondence %>%
+  mutate ( geo = code13 ) %>% 
+  filter ( !is.na(code13) )
+
+## Region can be found by new or old NUTS code -----------------------
+
+all_regional_changes <- regional_changes_by_2016 %>%
+  full_join ( regional_changes_by_2013, 
+              by = c("code13", "code16", "name", "nuts_level",
+                     "change", "resolution", "geo"))
+
+
+all_regional_changes %>% 
+  add_count ( code13, code16, name, nuts_level, change, resolution, geo ) %>%
+  filter ( n > 1 )
+
 ## Regional changes ------------------------------------------------
 
 regional_changes_2016 <- rbind ( changed_regions, unchanged_regions )
diff --git a/data/nuts_correspondence.rda b/data/nuts_correspondence.rda
index 1dbe8bb4ff876acc230ebf99d58e3478d992ea46..fe2c146f6ced5a385eb9f73b2a054307326f97c2 100644
GIT binary patch
literal 5254
zcmYLMcQo8h)c);StBXZ+vRE}b5rkl^vO@GOdRb*dbW&bfC3*{@2T@liR`l-G6E%9G
z6J4|<l6?Dq-yh#SGxy$eX3jHr?mhFIJ4j26yrilHzmdgxBGWp+6Zp^XzeM@n`S*}$
zjys9dFQ58@ogED)EkFmTz`indDzPu$Fnj6~;yu~Ux!9W3rBrLWP#G%v7y^wucj2ta
zIqWTS@G>vOE?4)Z&j<@8r6Oe*bXnjUx-xyRT~@)1@^+ocU=1T@lIai)_mI4-ym#~Z
zQyO_VHZFZ76#zA84+y#$?7lWamvBdy=$2uI!dWFCGIr!px+z^6Nl?@jgX&G?G1RoG
zhSoVv85=;Nf}sKaU#2GMF}Rz`+FnL_M(A&fbUGP#pqc+ILqh=!(SWHbN*Hu>bcvu{
z46317c0i^C0G80;=LND@B6V31T8JL7vmU1Wr=2ePXQI$+qjNo@Dfc^UTqW5?DkPI6
z)6htFw|uVTd=g&npQYE3Rvg_MZ8}D3KS+Zo62g~;xY=TeRCyDxMHkN5{O<ke=%c^N
zjd0pX{*UsPu2kc<GXC9KD~Js$4CliBSaQpXA$aI`@j1m)SC#{BB5(|BNRpGYKP{1|
z{*G4kw!xurIGx|HN)ni@b?!nLaa>ydr`-81AtT0639T}fbC5zXt_<rM4ms+$dc0a=
zxaS=8ieZ#*wJF9T7><?=g)^wXc9d1DNp}8abHitQ@mJn~&E>1ZEBx-q$!fx^so7-A
z-|pM8Ox3g@jeAy-bzO4$i`eFDJQ`6J%B~f;b^6|vFZb0Q+LH$;kY0U*Oaf>7<@VUo
z&9i}Cr|nY9xh!na-W}=4U3r~II3Vm|-vnonmcBzciSdH0GY1BMq^0lrTg}Ch3`m+7
z10-&UMRX`|Q(6inEwz|Lk(rq_a=W#bs+A}$m4yl*8OBP6v=i9No6Rd%Y}%>$g2H%w
zRZffu{*yDTM)L`mh2?w$>vmIPO;DrT>`K;WlhmcL(ISu(36_A|anYo=C@)~+<nfNl
z%FJp%dd5wzDit;7;^G#vmS#9?JYMBX`skjN_lIZWJsacN!_nIT2pFA-3YLQP8l&|Z
z^xk1?=EL!DeaTawy|{Bu&-ABms$b~FpQJCmbR}<tgrcINwpKQD`U*6yVtYyuI|Fj<
z3Osk=D|S&vrFSZvkKFO0K;$Hv@ERZ{!U^o+{C~OV__Dkl`e=E%kViuv_-6aS4Cu)j
zoGvt+&6$9~GBhNmc%QCZY;A6RL67^S@%zQ0VQg)FDlirwRR|{t_&Ox{Vp^W{TXcn%
z#KI1D#h<)e9Qd8Z`}_yLI`4Z-diD!GA}hG2(96b<4AAm1?sp_8Xoi!g9mppqoovRc
zDzFck`6SlK9h>zZZ|t1I!e6E5_y}>iW&iBeOk>lLz%uh5ZUb^0>uVH<Bx*YFzJwOW
zfu0d6$0*6ipF}}{r6Kspfe^`CGIO#dcCwM(@o+6qXx{<@MrBSi&J97r5-uxQGUdoZ
z!p08m#u)%OFg*|&P6<J(5*-}7BQ`qDEzC_ao<O2YQ3uzo?yI&7$HgPkS9W_4KJPC!
zzJvt^SiOx(Urt{8=J6(e<X`vtyn$C_7nVomNmGT!Rkq`NUe9}xh3Pp`^%-1WIzK)(
zGOLPZ4@4lhAL>R1Wn-qDoT~o#(s`GIH7G0Sr)zlirl?}X23ky)XeSbVk#%QxvOR-i
z0_``9^Mt*e_I<Xx@#nJS<U!T(kLCQS?skOmStY9?v)RX>2orkBk(rEwV(7=aq$cUf
zDdx<4Z(pglL=fWK>1QMP#X6*L)f7-?U9B0UhSx1F8+c&esyCS!Hgr2!vH7Rs_3!&{
zb(5n?g=`bmgzeuO&Nf!KF)g+jCR}UF|8Qt|!0@(1`*88m=!=ZJrr=zJ@r{6cA9$nZ
z9j)_5W0!XLAk|X|Q6If}%UuL|6J05G5^sjr&Q8>th`sZlWSI`B*0jZBep_-ORLy#L
zDvQ_cd!MR3|FJOJ*y=<HRwt*^QhFuFIWAr-q;1erZxHl890E2>Ey9G)kNf8O&spRK
zAR%Bt<t1JEB9`zuFtDV5dBq)H1Lb8IkPP&XK${rP8@b!-jFB6d)T%)Ybw#G+CwXqq
zlFME;d5o1LU=^2^E1EFQ6;BTeDFq7^9cc)Qc3VyWzws!6fo7eh^}U^)lgfpQ+Ebp_
zs(vZXnq}RgOFYN)Ao?Jx6<zW|1oTmM4z(*&=!I`(@GSrcr*d*1z9fc195kotOVT;p
zf>p&J5E<0J%0uCio=Lu@K+b0R)qvzP=WK-z$v7Uu`&Dni)pE9>ICWziaAX;PiH1xz
zEVVZIo3$VLd)QnwFQgXpqS3{=J8z1Gw6dy%6xdAF1-Xay74RKU&n-;0Z2=SpgVE^+
zMJ#751<?e-GLWCb>dhEvnmV-%nn?>vlf2>_<Y8stvPAH$B2m#&ZGR5&-iS-DFTMV5
zLj=dJ*noq{&^WEAsBo>76Cx6kAS2=&oCOECizB~ImEP-VZpsTbgN68xzfL(f4^5S!
zMW^pnx_X>9e=gy^Tff?w2pw)SolZOY@79av;zHO~`Eeh7be9VFYP_M!L#Fa1XG=hP
z;iCPJZB>3dprOjmgbomo+f$gWQ_y2><5gHlt5UP{*k&1!h>4;H0gWfm?8W7^XWzS8
zKD3W5_OP5SiF6QwK!9(ijS7;86<4~-SuB%-Tpz$%SbNh*+68Lr62}9Fk&#GCBk6D`
zEnb%Xqns6qxw<$JW2L3GQb{uwxf$UqkBqtEi0NnFjc{wfmAwl%c(wtjKw+_iy(Vf~
zW3==pJr9^0#{88&8V;Rg@A_(vZMxZ5<9<TwsJQLyR&vCqV#r3_<SY<3oLQ~3p`;_b
zv$@<~r&d-kQy0CNmX-+{1gQHeD+l97i7G*bOq!{AY(=6j&P1iSY=F(Y-&8oqGPc=(
zjeN$(nwFU5bux0~>r@ABsc|eknfJ*1#ceJ|VRKc80q@d6rr{6;NXF{n?SN~W{m0`i
zYa)(lt5=3+7__6I<<>d$$4(TC8$1Dwun0~7h?EDYEr>(}DNegl^jZAM9k*f1ob#A)
zL$?yJk0ezf|F^M99SKXBjZaDTxnUj?C43~FQEHz0IS%?enbZe_9rKEj$#SJ$1d=-^
zd!?t_g*|qn1mZ;KD~Knc5VCz~I{7MRvp)_;V^yW)tAsZKLMt&-|Fs?<l|`%sK+l29
z@OofoLXcrn&#&KdYn1blPk#-70YH~9X-C|KkcE+u<Lq}8=+6ikmoyQ>UPeDe<dhQ<
zg=(5VUiFHY`B$UnAKvAM3G`Qfqbs?b%mJ4Sa;Y@6IC@&Bzlntz3CjS}{tyD>%J~rA
z`1tWEsXTh^+{8o$r5Uzzwr(UX{a7%1P%o&gtgH_zoD3H*O0TkN#xPkpr{7jrH(}P_
zGRK=|``*bMmlT&9kao#}n&xu#elCV>BlQavmedTyTx950e?*-H*g5`|<$1Pn{1$d~
zLSV^wM%TgmESCACbmdZ9D^I!W3vWeZ=Z#jf1oe|^>8`**cs#s8R>dxQn)EyByePi#
zhx_%z>Zlam9uN1tk(Xp5k8G5SCVz+J5-P02*@o30N1E49fO~AOWtC{-Jip(U&$!~p
zeBleQf%5QzY?2j!JUj5s^ha{Au-f^&S-cisE<wlWjG|{#bhP!JFgHvavfNx8ZXG4V
zLBI$e%KTx=mmOP5?CZND?);r~^2%@TFZzjD{<692+xazBQ`;i8qbDC=MTOF%;aVl;
z#A;lK?SqF90JTwTGgXsc7ub1Oq7YvMh%Mml0D7VH|D87<R&M~{7zD9w6vB8Fk$M%P
z=LmpZwhlr8{r_NyShq|6<Jah>p}@ce0JuR_6Bqz6OT$6Rric3=*-1PeuN{i!!Q)f%
ziDyGnwB}1$1^_@ZyR{Y|wnPBXoC#!R0u+;n`v928wg4b-H_#5`D@F7e)*dZ~0;F4q
z`(lTOht5^R$KOa)bLh>TZNeLft{~fGd=u-tJcO0$J(Kcyo)oYPDj|EreL%7jtLK}u
zEnBi;IDOIUe;_ORwTL$!cNnH6^zoz&^U0HzhoM7ag+~C*y2cnXnVM?*xvB5NM+`D0
z48HE4!8g=<`YF>1;A!cGNexFuiO^TGr~i5M^f&WDUlWQQjDgH$v-C-L7YGK|NZ;4N
zKRVFyz5h3#7%_mY)K%C$FfsYh4-tE3+>-OV`Rw`f@Ah#~F0qEv(qlfYgOR~3BC}fS
zFMi+dS6gk4rR{xe;8t$9D)it@qgtfqw>E{17+KTCi#{=P5dLM0B3@dWvyE>8_ESj!
ziq5B)pep3Yi6oS?U6Du8ma;R5Dzb0xF|JXaaSFQigl@j@Q}<~3efVBLL_O7H$g$C@
z$rbmy;xXALot4Tq8VeJm7(@A&?zXSWF4Y2{xV~z%U85T!#Ss9F48&~QB!I#(Q&^d0
zvp9gl=PLqhVuDC92BQ%`ipH{OP*1`QO*2yfkm&%OJ$qqc6SiQvK7Dq#@@Ql8<MTHi
zm1%3<hfXs4BVJ=@1oRR_;s)Q7NxL!Tl*`SRRk(Lg;k}lo%@t{8C`T_n%vHf?J$YAO
zYjI9yt~GG(iRuMoj<AY0$4b_Gb6L;I*!oBBoL)JHa{>)}ZlLQz%#Y|V>V1=4!=F?|
zDrP(SIU4f?Xb*y}zRj&J>=lx+_J7a%U>$-M9azvUN>TH^JLS~{fI_=`hO%F10RWlI
zj6M)d<O$$be9>#a&rihR{QImkbFBTZtA?5_DhJO+Fcmd%&)q*eRYhm()r5>`OG4zt
zS2G-EA_QV{UtE^ft%U_0{~-Hy%%geYC(EeVFRgLZQUBl;gxAT@{y>g5lIyPm=X9X>
zr>w8JOv)n-Zo9V11&-R@T+7L`JlZ%gUX@~TM~U4u%H^qRtC0JyM8tKvT_oz=ZF5c^
zGKYz8r%-%WTvy`#Cg%MHd)46}jYN}(eN<DX%~-JYuh7)D?*xvFhi$Y|!xL?~1598#
z%&zPUM{uRL#gAu#i#9`x?0(AqXm3upa_-n4p=y^C4tRnYFd)}zhX2V*7Iq|8L0Hfw
z3&M5z2Wa5pj^0^L=yCEC9#l7%lZNwaHr~Pg@JXb)prZPqyF{`O$8+!_vJ;!zbpl!N
zSZqh)uW{F)t4i^q0`tJpU>n<X%$boWM}X4=m&7RZ+^K>~!`I6Zcgvs<J?vilG_Ui6
zS>blhIUhIKOM1uTbr<|*CogV?>2r~&kXqTxwB+HfUXVdsIJ01hWlDz{H@VhU7Ki~u
zFC~a2KtQw{w5M7XX{ZQS2<D(GR$*zcJ!4+`<Shx+DKoaiv1&lI*iNJjOTMAdhm)MT
z!8UKjR8JDLbk4{KK@CbSU7RL9RK1Vf_4Ru%N>e2n2WVAWJ8V);kTijr%AN?{=PV+P
zT4U8^SO#<hPZh0!=tLU*`L-xa#Ggax%wVDFp-#dy(}pV4+PZJ<I+<J1DXJLFMl&$f
zM`AbM{VJtEUh>=ll@b+UjB|uNK|c5p;Y&B?v@uQi=+n^7{_EWOhNVmo-(UlSu(c_5
zq4d)RW2cs5bVT=p&|OaywS|4RyR%y`VzcRX?eYxn)kAhQ#Dy%*4H_YLSwHY6xq!;2
zo*%o=dk;$d(7F!y+vjoP-RHFK>zoG7os$`k^%Q9=WAvIV16mbV$7;fAJpNIB$pD7X
zQT5PCvD>+1cxaY|dvfw)x~xS}h0l_TcL(=92aDWDs73Pn8N*8%`zQExGK0h1$694s
z&lU_PG!7@7%Y_SZUhrF#Mq|{;yA)CFUB8#kz5TqA+cnqrTy<MtL1YW|`lcVP4fBr3
zx=axkID26p>#4nIOTRUKPPn082BlB2z{=jW)sFsvdXu1$qg{jqisV6o(WCxS$*w(^
z%mrB-+TL)a!}-eI|1ZinLAH(hp%cTYK<BrkUp6(%PkX=rtXdVh&1e}&GitjIKmO-s
z0K(Y*0grzS#V{o*2@S{D^Q=J<{(SiO>b}^HQSB8s-NdlBo`Yf*<}?MV>3O9|ugTY-
zxL!b7x%l>tvgYviV4V2D7uIVwLKc%iCXPD1#<grqq$D-0&K)df=2<?POJH_vfnDZ<
zjVuq|%|GYsv>EmNU``~T!?^0KW)OL#p6lZU(q4PpoTT(tsjbKiN7?k16E_+%x+jJR
zUR%KopYyu{{_u*2+#?+CUU6JKr|@#^Sp3S%N~zoZvUev*G}sxz$||&Y-p9>me&e_f
zp4<Ngn!StV_h%wh)@FjAm?>S_J}^$Im5j2i$6)1`KV=uv58d!Cv1nm7yTi52^=4YU
zxVPe2AQ&<;jG*TZw+DgJa7tv&FLGm+IG$eBMzVpC*fk&Rl_@01Xp5A?jOGR9a53R>
zPG7j);A!2R{Q94+cCk<5dc+ZjC8w%6{m$=RF%l*(R_0p&S-x*GRWA^C)X9yUy_=I4
z^p?NvUm^4}`y}{F5Gnt{mlizfY!kGh6=!}=;rm1i^GnKFy17W%WIHJVG>wQ)H0mhV
zhxN~PX|-27E?Moe4T1LXsfndy%zg<WUuUL#C!E?tb(`#s!OdEQ;>c8UI<s#}8cT;|
zR<{S0CUzuwvi5Yp|EoOL5J0PtZSl~MATmUz=*4-?5}VfbYWzX7bi>cm0bj<ssLyXi
z!=E(wra^h7xgk4E@0H}*Y{CPZ_f|$c7Mf5$)8rUYaD3(_+wfm8e$+8{86AG+-O$}`
z7)=q)gzLZRM;6J>6o>os7e^zfawIyRv&#b|Zl}~$B>s5O4fqvLg8qg-jm{Mebt{|E
zhZ(3an%v7XX|bEyv>UfJ^=ExJ;%C0H{yqO|3_9WTp4s%B8CTt@EH2$;*<ZOU9!g}!
z+gH!qt?w<ddU{EBN<TfD_x>_6e^y_$AeQu?e@=Oag5rI+D4N{%`q2>MDpGA$(JJnL
E0308N+5i9m

literal 2331
zcmV+$3FP)dT4*^jL0KkKS#^FO*Z>Pa|NZ~}|K9yS|L*_)|M0)>|L}0246q;|9RLmh
z5CA{{;0Ygk&BL1Atw3`c!tSnTYq?!C>asL4Bw|$k2xcWcrkYZIrbnse4LwADq<)mz
zpQwWX(;(0^O{5tbH>y2Bw9rP3G|~Yy(rEM|Dd9}?)i#<zqd@gE8U}y>0B8Vc13+kK
z0u3=SF)$MWFaQ7m34j1500A%n000v})6pbVJdaU7N;Ew~F+)s-n1e=X0009Kk)fkN
z13(%80E0|SOiTp8OaK4?0$>0MfB;MY000EgB8Z4g5NcsPO_b7ksAvdiGz@?Z000dJ
zjT!&}p`d!7>MwkN@BkJD8xeH%qyff=X-t5oy|j{5=HIK~HC5l8?-}vQ#$#%M2Yt84
z<XbZ`y;4|V06u6I2vrPe#p`f=*Hda0da#I+0roR21cFH(b`*N}c55?Q-;R!}oY1JO
zi~!s7vbP0EB@hw_b*br|+iu)i{XVBwzgt_cHonJWvu3sVWn`jJEU(*s2ES8K)yq|>
z8WU}{?(@a~0})9Z2%DJ6G=TtAv8)aRlGWC%2FZILOZmfqeB7IPv{!J@X1`(t8OQ@(
z^nN{Ww^F!LG+?DMx*B9Ags>F^s;a&}lU(Ah#}cz~Tx)9-10qbAfL&{zZvHuP<#9Wy
zsna6dE@MS;v4F}t3v*Q39O^3;<#VY5=4gWHbJ)DhB@)=itl&XIq96$ffO6qTNCa68
zxfz98LJF?j3Ltaf0wIV1f&w8yP%03C0zyS000|T-K&q)%zzygXYXAn&rsOe`SmEv1
zZ&||2>-A*IS%yihFfc%my^b&&j&pk4>R<1=$RYxy3sr=r;=($%Ghq(_$c{|s2bK>z
zsk3^NRNy2xlwQAvHleXyCa@2o8JLb}%UB{a$!c1Z3u*CHO>RbOTf|%(_)Mm{UR0R~
zLJ%KBLJ(Vmv4*)EjVhII5^G0PP{m@=Jr?q0$tX!ze#<U%1?`;YJ0~TU`XUV0rRaWg
zy+gBYTlkQ7&dtRH!YR;-0xLUFqU3UZ^2BhA{Tw<oNozuVU0z4e32^-hu#o~3s45VU
zB`Hc#K)Ltz@-}O(t?b<m-iJfI%RZI1?peP^h0xdNjI42lV)mOlNP=&Bq|?F^5J)5v
zNJ2smz-5j&;f^^|MpebFH!0>EE0nR8n$oqdiq|UA-B{yR)oWO~u$<so7ufDO&uI`U
z*eRtPKobm9B|?IrkR$;XgsvFnj#*_H&rTX#5N6Emu)`^7!u;7$>nTaHb%eRB3}fe#
zEKAi@v;d2N1Q3J}LJO|DJtZkfGUz$2X$ZnxEhq!w4dy~GaAcas0U+QJ?{pCsWqnJ6
z%nHI)GI|JO0N?;CwIo)gC<1{ZpeUVEecuAJrH$FHYwmV5b>`^HV<oYPTG|qXV6OV<
zBoM_ts00d%EzqnJs)mNp=Q_>(LJ7zK2Ig`h(5e^+!<cDmK&{=FoYxvZA>^fvdXK%X
zY<>iPt)uIkS(`A_kRkhE?~S*xLea}oF>xux!Q-qTg5oBeLLTxwN7v~$0`e(vAtHwB
zmSA^cjQyawh=fq~`pBD$cZsDya0U_*qL42=Tv@SpI*((yRlA+pFf;)PU5vpIh{iED
zeJL%Nj_&U5j^mQ<)NMA?l1q{dlGLypxpAi2WySyq%I<9J$phMx-7fAxl0Xl8QttDT
zNgxI!fs~LsGEAhBNB{wT=7hJ@ms!7CHn+W}ZxY_sZgllXRcfl=ozVc7P?lllKq4FK
zQO*(p_a^7Xi6j|88RI-kDt6P#sLT$%Rcv0~vj+(QBON}CUQG&^>qk;@66Ip;D(zE)
z<m=t6gUP)&Q4tMY&FSvrI6po(YGxPHJ!CjIX?RQXyMU(Oy9DN%@rK;}lcARdfr(6l
z@k_hq4V!p&3-b}p13bpZ+f)Ij8;FIC?K(V99QcmoMWcoc?QIZe6S(0)Ho({6f&dYV
zJ5v)N2Jw=2gz{;&n{#{|Ss(|=CVVFA(LJ8)L|;2%)1EbV7oJ;PLen`qf`he`k|9kH
zmK}SPizP4GJjl74#CaFwFu)oJ1i8>!=b`1;4ofwB*uENQIIB(Q)!fi))4Z@F1B;H0
zUOhDkHDkq`s31y&1BV5!IU^9${s#+<KtxquA6fu}As4?N(BDYxcYCVSNmj-tv)h|Q
zqmZ3z?c=E*1p<;U)sQ1owX~b7Pi0AizqUryEWExa2$!F<Mp5G51ahauu|cU20#bwE
zb?^4(++X@_XoDyoC^_SCAK3L}0(>j2uyp=Tu;R8o2>8l%Dp3H=FYDn2E)b$a2>_78
zgN9L>aq}0_H?b%bvhE~{^E5hYelOD(gGK4v$k$>yFUpNV(Oy(Orlan$o1(%I;Vgs8
zcpErWx>W%E_|*l(OhF4I#Ke&yQ4^C3*=o$)mQwAjSyR%h8;t@40T#BjdW&i3Tm-K|
z$Etv0LB+5?r4ruNw?}A1i<@NDapc*EklE*NnCh7^riYnnYy=*OdzxjLgAsp7AY80!
zMy7EP7gRu65h17zwmRj06yjARKTiA64Xi|9j$jCm1>MXS7m0Su)#nu$n{W+@s9H$D
z4cVwxDQy}dNf`m?=jBkexQmjK>zB3Wf2qKB6Lai!BE2L$VY+=g+|0Y%-5;|zE@8i#
z)u2!m1gHv7fIONU4jcJR;SwM;Xiz!k(C6ipw||bCnpmL_<UrpS!#Yr3qn`#iwHgLm
zx^Tyf8^DZN51EakA-|9dDQp>VIaw5c&uN!Sv3}`r>Vn{cJy=mVp(V3>g@thzUnnLd
zrf)%9;uL~7_SLm2PIV`o5^Z|o5Q%l5`~&0>FP&Jn9y?_lBjymT5i8>oQsycUuCz=6
zXcc@J?(}ctrQx?)fqqgTTGI$&N{3?Qv~4ZGW}i>AJNvtIye}`$DR+7_M>gTaiW^WA
zp|PSd>g(%Wp&$grAJ&x+ZyPr1S-TkaH2GCD?^*r*{cs)x;2-!;<%5Dqr2zjGK*!xr
zkXYH+P`M_%uWU;MULAM-4&v?(Ao;Ys)p%#YqyLai(LyMZc?JK)+>uTcBpqLfHUPY0
BD<S{@


From f49bbae88e782fc620fa01fcd6f6e229e9c3aa27 Mon Sep 17 00:00:00 2001
From: Daniel Antal <antaldaniel@users.noreply.github.com>
Date: Sat, 8 Feb 2020 22:26:55 +0100
Subject: [PATCH 02/11] rewrite

---
 R/harmonize_geo_code.R | 297 ++++++++++++++++++++++++-----------------
 1 file changed, 176 insertions(+), 121 deletions(-)

diff --git a/R/harmonize_geo_code.R b/R/harmonize_geo_code.R
index c9064864..d2aeb452 100644
--- a/R/harmonize_geo_code.R
+++ b/R/harmonize_geo_code.R
@@ -1,152 +1,207 @@
-#' @title Recode geo labels from NUTS2013 to NUTS2016 
-#' @description Eurostat mixes NUTS2013 and NUTS2016 geographic label codes
-#' in the \code{'geo'} column, which creates time-wise comparativity issues.
-#' This function recodes the observations where only the coding changed, and
-#' marks discontinued regions, and other regions which may or may not be 
-#' somehow compared to current \code{'NUTS2016'} boundaries.
-#' @param dat A Eurostat data frame downloaded with \code{\link{get_eurostat}}.
+#' @title Harmonize NUTS region codes that changed with the \code{NUTS2016} definition
+#' @description Eurostat mixes \code{NUTS2013} and \code{NUTS2016} geographic
+#' label codes in the \code{'geo'} column, which creates time-wise comparability issues.
+#' This function checks if your data is affected by this problem and gives
+#' information on what to do.
+#' @param dat A Eurostat data frame downloaded with \code{\link{get_eurostat}}
 #' @export
 #' @author Daniel Antal
-#' @return An augmented and potentially relabelled data frame which 
-#' contains all formerly \code{'NUTS2013'} definition geo labels in the 
-#' \code{'NUTS2016'} vocabulary when only the code changed, but the 
-#' boundary did not. It also contains some information on other geo labels
-#' that cannot be brought to the current \code{'NUTS2016'} definition.
-#' If not called before, the function will use the helper function
-#'  \code{\link{check_nuts_2013}}
-#' @importFrom dplyr mutate filter rename arrange add_count
-#' @importFrom dplyr left_join full_join anti_join right_join semi_join
-#' @importFrom tidyselect all_of
-#' @importFrom stringr str_sub
+#' @return An augmented data frame or a message about potential coding
+#' errors.
+#' @importFrom dplyr mutate filter rename mutate_if case_when
+#' @importFrom dplyr left_join full_join anti_join
 #' @examples
 #'  \dontrun{
-#'   eurostat::tgs00026 %>%
-#'      check_nuts_2013() %>%
-#'      harmonize_geo_code()
-#'      
-#'  #If check_nuts_2013() is not called, the function will call it.    
-#'   eurostat::tgs00026
-#'      harmonize_geo_code(dat)    
+#'    dat <- eurostat::tgs00026
+#'    harmonize_geo_code(dat)
 #'  }
 
-harmonize_geo_code <- function ( dat ) {
+harmonize_geo_code <- function (dat) {
   
   ## For non-standard evaluation -------------------------------------
-  change <- tmp <- geo <- nuts_level <- code13 <- code16 <- NULL
-  . <- n  <- remaining_eu_data <- resolution <- time <- values <- NULL
-  regional_changes_2016 <- NULL
-  
-  ## Check if geo information is present ------------------------------
-  if ( ! 'geo' %in% names(dat) ) {
-    stop ("There is no 'geo' column in the inserted data. This is an error.")
-    } 
-
-  ## Load the correspondence tables, but not to the global environment --
+  . <- change  <- geo <- code13 <- code16 <- nuts_level <- NULL
+  country_code <- NULL
+  
+  dat <- mutate_if ( dat, is.factor, as.character)
+  
+  ## The data is not loaded into the global environment --------------
   
   regional_changes_2016 <- load_package_data(dataset = "regional_changes_2016")
+  nuts_correspondence   <- load_package_data(dataset = "nuts_correspondence")
   
+  ## Creating constants -----------------------------------------------
+  regions_in_correspondence <- unique(c(nuts_correspondence$code13, nuts_correspondence$code16))
+  regions_in_correspondence <- sort(regions_in_correspondence [!is.na(regions_in_correspondence)])
+
   unchanged_regions <- regional_changes_2016 %>% 
-    filter ( change == 'unchanged' ) 
-  
-  changed_regions <- regional_changes_2016 %>% 
-    filter ( change != 'unchanged' )
-  
-  nuts_2016_codes <- unique (regional_changes_2016$code16)
-  nuts_2013_codes <- unique (regional_changes_2016$code13)
-  # for easier debugging, this data will be re-assigned in each major
-  # step as tmp2, tmp3...  Debugging is particulary difficult, because
-  # not only the program code, but the underlying logic may have faults.
-  
-  if (! all(c("change", "code16", "code13", 
-              "nuts_2016", "nuts_2013") %in% names (dat)) ) {
-    tmp <- dat %>% 
-      check_nuts_2013() 
-  } else {
-    tmp <- dat
+    filter ( change == 'unchanged' )
+  
+  # The Eurostat correspondence table had a duplicate entry.  It may
+  # re-occur later and this code may help finding it.
+  # nuts_correspondence_duplicates <- nuts_correspondence %>%
+  #  filter ( !is.na(code13 )) %>%
+  #  add_count ( code13 ) %>% filter ( n > 1 )
+  
+  ## Changed regions to be looked up by their NUTS2016 codes -----------
+  regional_changes_by_2016 <- nuts_correspondence %>%
+    mutate ( geo = code16 ) %>% 
+    filter ( !is.na(code16) ) %>%
+    select ( -geo ) %>%
+    distinct ( code13, code16, name, nuts_level, change, resolution) 
+  
+  # Regions may be duplicated in case their NUTS2016 and NUTS2013 are the same
+  
+  ## adding those that have no equivalent in the previous group
+  ## some regions have to be identified by their old and new codes -----
+  regional_changes_by_2013 <- nuts_correspondence %>%
+    mutate ( geo = code13 ) %>% 
+    filter ( !is.na(code13) ) %>%
+    select ( -geo ) %>%
+    distinct ( code13, code16, name, nuts_level, change, resolution)
+  
+  ## Join the regions by both NUTS definitions -----------------------
+  
+  all_regional_changes <- regional_changes_by_2016 %>%
+    full_join ( regional_changes_by_2013, 
+                by = c("code13", "code16", "name", "nuts_level",
+                       "change", "resolution"))
+  
+  
+  ## Check for potential duplicates ----------------------------------
+  duplicates <- all_regional_changes %>% 
+    add_count ( code13, code16  ) %>%
+    filter ( n > 1 )
+  
+  if ( nrow(duplicates) > 0 ) {
+    stop ("There are duplicates in the correspondence table.")
   }
+
+  all_regions_full_metadata <- unchanged_regions %>%
+    mutate ( resolution = NA_character_ ) %>% 
+    rbind ( all_regional_changes ) 
+  
+  nuts_2013_codes <- unique (all_regions_full_metadata$code13)#[!is.na(all_regions_full_metadata$code13)]
+  nuts_2016_codes <- unique (all_regions_full_metadata$code16)#[!is.na(all_regions_full_metadata$code16)]
+  nuts_2013_codes <- nuts_2013_codes[!is.na(nuts_2013_codes)]
+  nuts_2016_codes <- nuts_2016_codes[!is.na(nuts_2016_codes)]
+  
+  "PL2" %in% all_regions_full_metadata$code13
+  "PL2" %in% unique ( all_regions_full_metadata$code13)
+  "UKN01" %in% nuts_2013_codes
+  "UKN01" %in% nuts_2016_codes
+  
+  any ( is.na(nuts_2013_codes))
+  
+  tmp_by_code16 <- dat %>%
+    mutate ( geo = as.character(geo)) %>%
+    filter ( geo %in% all_regions_full_metadata$code16 ) %>%
+    left_join (  all_regions_full_metadata %>%
+                  dplyr::rename ( geo = code16 ), 
+                by = "geo") %>%
+    mutate ( code16 = geo ) %>%
+    mutate ( nuts_2016 = geo %in% nuts_2016_codes ) %>%
+    mutate ( nuts_2013 = geo %in% nuts_2013_codes )
+  
+  tmp_by_code13 <- dat %>%
+    mutate ( geo = as.character(geo)) %>%
+    filter ( geo %in% all_regions_full_metadata$code13 ) %>%
+    left_join (  all_regions_full_metadata %>%
+                  dplyr::rename ( geo = code13 ), 
+                by = "geo") %>%
+    mutate ( code13 = geo ) %>%
+    mutate ( nuts_2016 = geo %in% nuts_2016_codes, 
+             nuts_2013 = geo %in% nuts_2013_codes)
+  
+  message ( "In this data frame ", nrow(tmp_by_code16), 
+            " observations are coded with the current NUTS2016\ngeo labels and ", 
+            nrow ( tmp_by_code13), " observations/rows have NUTS2013 historical labels.")
+  
+  tmp_s <- tmp_by_code16 %>%
+    semi_join (  tmp_by_code13, 
+                 by = names ( tmp_by_code13)) # found in both (unchanged and relabelled)
  
-  # Separating rows that need to be corrected ----------------------------
+  if (! all(tmp_s$nuts_2013 && tmp_s$nuts_2016)) { stop ("Wrong selection of unchanged regions.") }
   
-  labelled_by_nuts_2016 <- tmp %>%
-    filter ( geo %in% nuts_2016_codes )  # These are following NUTS2016
   
-  labelled_by_nuts_2013 <- tmp %>%
-    anti_join ( labelled_by_nuts_2016, 
-                by = tidyselect::all_of(names(tmp)) ) %>%
-    filter ( geo %in% nuts_2013_codes )  # These are following NUTS2013
+  tmp_s2 <- tmp_by_code13 %>%
+    semi_join (  tmp_by_code16, 
+                 by = names (tmp_by_code16)) # found in both (unchanged and relabelled)
+  #must be equal!!!
   
-  message ( "There are ", nrow(labelled_by_nuts_2013), " regions that were changed",
-            " in the transition to NUTS2016 and\nthe data frame uses their NUTS2013 geo codes.")
- 
-  labelled_by_other <- tmp %>%
-    filter ( ! geo %in% nuts_2013_codes ) %>%
-    filter ( ! geo %in% nuts_2016_codes ) # These are not in the correspondence table (non-EU)
-
-  message ( "There are ", nrow(labelled_by_other), " regions that are not covered by the correspondence tables.")
-  message ( "They are likely to be non-EU regions and their consistency cannot be checked.")
+  tmp_a1 <- tmp_by_code16 %>%
+    anti_join (  tmp_by_code13, 
+                 by = names(tmp_by_code13)) # not found in code13 (new regions)
+  if ( ! all(tmp_a2$nuts_2013)) { stop ("Wrong selection of NUTS2013-only regions.") }
   
   
-  if (  nrow ( labelled_by_other) + nrow ( labelled_by_nuts_2013 ) + nrow(labelled_by_nuts_2016) != nrow (dat)) {
-    stop ( "Joining error Type I")
-  }
- 
-  ## NUTS regions that are NUTS2013 coded but have NUTS2016 equivalents -----
-  can_be_found  <- labelled_by_nuts_2013 %>%
-    filter ( !is.na(code16) )
-  
-  recoded_regions <- can_be_found %>%
-    filter ( grepl("recoded", change )) 
-  
-  message ( "There are ", nrow(recoded_regions), 
-" regions that only changed their geo labels.
-Their boundaries are consistent in NUTS2013 and NUTS2016.")
-  message ( "[", recoded_regions %>%
-              filter ( grepl ("relabelled", change)) %>%
-              nrow(), " of these changed their names, too.]")
-  
-  other_cases <- can_be_found %>%
-    anti_join ( recoded_regions,
-                by = names ( can_be_found ) )  # I think these are 'small changes' 
-
-  if ( nrow(other_cases) + nrow(recoded_regions) != nrow(can_be_found) ) {
-    stop ( "Joining error in NUTS2013 regions that can be found in NUTS2016")
-  }
+  tmp_a2 <- tmp_by_code13 %>%
+    anti_join (  tmp_by_code16, 
+                 by = names(tmp_by_code13)) # not found in code16 (changes)
+  if ( ! all(tmp_a2$nuts_2013)) { stop ("Wrong selection of NUTS2013-only regions.") }
+  
+  tmp <- rbind ( tmp_s, tmp_a1, tmp_a2 )
   
-  ## Discontinued regions -----------------------------------------------
+  not_found_geo <- unique(dat$geo[! dat$geo %in% tmp$geo ])
+  not_eu_regions <- not_found_geo[! substr(not_found_geo,1,2) %in% eu_countries$code]
   
-  cannot_be_found <- labelled_by_nuts_2013 %>%
-    filter ( is.na(code16) ) 
+  ## Checking if there are unmatched EU regions-------------------------
   
-  if ( nrow ( can_be_found ) + nrow(cannot_be_found ) != nrow ( labelled_by_nuts_2013 )) {
-    stop ("Joining error in NUTS2013 regions that can or cannot be found.")
+  not_found_eu_regions <-  not_found_geo[ substr(not_found_geo,1,2) %in% eu_countries$code]
+ 
+  if ( length(not_found_eu_regions)>0) {
+    stop ( "Some EU regions were not found in the correspondence table.")
   }
   
-  ## First join all EU regions ----------------------------------------
+  ## Adding columns for non-EU regions ----------------------------------
+  tmp_not_eu <- dat %>%
+    filter ( geo %in% not_eu_regions ) %>%
+    mutate ( nuts_level = nchar(geo)-2, 
+             change = "not in EU - not controlled", 
+             resolution = "check with national authorities", 
+             name = NA_character_,
+             code13 = NA_character_, 
+             code16 = NA_character_,
+             nuts_2016 = FALSE, 
+             nuts_2013 = FALSE)
   
-  eu_joined <- labelled_by_nuts_2016 %>%
-    full_join ( recoded_regions, by = tidyselect::all_of(names ( recoded_regions )) ) %>%
-    full_join ( other_cases,     by = tidyselect::all_of(names ( other_cases )) ) %>%
-    full_join ( cannot_be_found, by = tidyselect::all_of(names ( cannot_be_found )) ) 
+  tmp2 <- rbind ( tmp, tmp_not_eu)
   
-  if ( nrow ( eu_joined %>%
-                dplyr::semi_join ( labelled_by_other, 
-                          by = tidyselect::all_of(names (eu_joined))) ) > 0 ) {
-    stop ( "Joining error between EU and non-EU regions")
+
+  ## Check if all original rows are handled correctly ------------------
+  if (length(dat$geo [! dat$geo %in% tmp2$geo ])>0) {
+    message (tmp2 %>% anti_join (dat))
+    message (dat %>% anti_join (tmp2))
+    stop ("Not all original rows were checked.")
   }
+
+  eu_countries <- load_package_data(dataset = "eu_countries")
+
+  eu_country_vector <-  unique ( substr(eu_countries$code, 1, 2) )
   
-  ## Add non-EU regions  ----------------------------------------------
+
+  if ( any(tmp2$change == 'not in EU - not controlled') ) {
+    
+    not_EU_country_vector <- tmp2 %>%
+      filter ( tmp2$change == 'not in EU - not controlled' ) %>%
+      select ( geo ) 
+    
+    not_eu_observations <- nrow (not_EU_country_vector)
+    
+    not_EU_country_vector <- not_EU_country_vector %>%
+      unlist() %>% substr(., 1,2) %>% sort () %>%
+      unique ()
+     ## The correspondence table only covers EU regions.
+    message ( "Not checking for regional label consistency in non-EU countries.\n",
+              "In this data frame not controlled countries: ", 
+              paste (not_EU_country_vector,
+                     collapse = ", "), " \n", 
+              "with alltogether ", not_eu_observations, " observations/rows.")
+  }
   
-  all_regions <- labelled_by_other %>%
-    full_join ( eu_joined, 
-                by = tidyselect::all_of(names(eu_joined)))
- 
-  if ( anyDuplicated(all_regions) > 0 ) {
-    stop("Joining error - there are duplicates in the data frame.")
-  } 
- 
- all_regions %>%
-   dplyr::arrange(., time, geo, code16 )
+  ## Reorder columns for readability -------------------------------
   
-}
-
+  tmp_left <- tmp2 %>% select ( geo,  time, values, code13, code16, name )
+  tmp_right <- tmp2 %>% select ( -geo, -code13, -code16, -time, -values, -name )
 
+  cbind ( tmp_left, tmp_right)
+}

From ec376b1e48af6c9af173d0258aa4f16acb016fcc Mon Sep 17 00:00:00 2001
From: Daniel Antal <antaldaniel@users.noreply.github.com>
Date: Sat, 8 Feb 2020 22:27:31 +0100
Subject: [PATCH 03/11] documentation changes

---
 man/harmonize_geo_code.Rd | 31 ++++++++++---------------------
 1 file changed, 10 insertions(+), 21 deletions(-)

diff --git a/man/harmonize_geo_code.Rd b/man/harmonize_geo_code.Rd
index 6443db7e..a7a349d7 100644
--- a/man/harmonize_geo_code.Rd
+++ b/man/harmonize_geo_code.Rd
@@ -2,38 +2,27 @@
 % Please edit documentation in R/harmonize_geo_code.R
 \name{harmonize_geo_code}
 \alias{harmonize_geo_code}
-\title{Recode geo labels from NUTS2013 to NUTS2016}
+\title{Harmonize NUTS region codes that changed with the \code{NUTS2016} definition}
 \usage{
 harmonize_geo_code(dat)
 }
 \arguments{
-\item{dat}{A Eurostat data frame downloaded with \code{\link{get_eurostat}}.}
+\item{dat}{A Eurostat data frame downloaded with \code{\link{get_eurostat}}}
 }
 \value{
-An augmented and potentially relabelled data frame which 
-contains all formerly \code{'NUTS2013'} definition geo labels in the 
-\code{'NUTS2016'} vocabulary when only the code changed, but the 
-boundary did not. It also contains some information on other geo labels
-that cannot be brought to the current \code{'NUTS2016'} definition.
-If not called before, the function will use the helper function
- \code{\link{check_nuts_2013}}
+An augmented data frame or a message about potential coding
+errors.
 }
 \description{
-Eurostat mixes NUTS2013 and NUTS2016 geographic label codes
-in the \code{'geo'} column, which creates time-wise comparativity issues.
-This function recodes the observations where only the coding changed, and
-marks discontinued regions, and other regions which may or may not be 
-somehow compared to current \code{'NUTS2016'} boundaries.
+Eurostat mixes \code{NUTS2013} and \code{NUTS2016} geographic
+label codes in the \code{'geo'} column, which creates time-wise comparativity issues.
+This function checks if you data is affected by this problem and gives
+information on what to do.
 }
 \examples{
  \dontrun{
-  eurostat::tgs00026 \%>\%
-     check_nuts_2013() \%>\%
-     harmonize_geo_code()
-     
- #If check_nuts_2013() is not called, the function will call it.    
-  eurostat::tgs00026
-     harmonize_geo_code(dat)    
+   dat <- eurostat::tgs00026
+   harmonize_geo_code(dat)
  }
 }
 \author{

From 0644a9827605b65c8f38dc899160c7372741771e Mon Sep 17 00:00:00 2001
From: Daniel Antal <antaldaniel@users.noreply.github.com>
Date: Sat, 8 Feb 2020 22:27:57 +0100
Subject: [PATCH 04/11] update vignette

---
 vignettes/website/regional_data.Rmd | 119 ++++++++++++++++++----------
 1 file changed, 79 insertions(+), 40 deletions(-)

diff --git a/vignettes/website/regional_data.Rmd b/vignettes/website/regional_data.Rmd
index d795adfc..4284bc30 100644
--- a/vignettes/website/regional_data.Rmd
+++ b/vignettes/website/regional_data.Rmd
@@ -16,6 +16,7 @@ See eurostat vignette for installation and basic use.
 ```{r, echo=FALSE, message=FALSE}
 library(eurostat)
 library(dplyr)
+library(tibble)
 ```
 
 ## Motivation
@@ -39,25 +40,40 @@ Changes in boundaries meant that unlike national boundaries, regional boundaries
 
 ### Data availability and quality
 
-Data availability means that many statistical produces are only available on NUTS0 country level.  The creation of NUTS1-NUTS3 statistics is usually slow, the data product range is smaller.  
+Data availability means that many statistical products are only available on NUTS0 country level.  The creation of NUTS1-NUTS3 statistics is usually slow and the data product range is narrower at these levels.  
 
-NUTS-level data is often disaggregated with the use of various estimations from higher levels. While some original data sources are available from NUTS3 levels (or even lower level), such as population or mortality data, many economic activities are theoretically difficult to connect to one place and geographical disaggregation is only estimated. For example, since the GDP is mainly produced in companies, and many companies work in several locations across municipal and regional borders, locating their contribution to the GDP is the result of a more or less precise estimation.
+NUTS-level data is often disaggregated with the use of various estimations from higher levels. While some original data sources, such as population or mortality data, are available at the NUTS3 level (or at an even higher geographical resolution, i.e. a lower level of aggregation), many economic activities are theoretically difficult to connect to one place, and their geographical disaggregation is only estimated. For example, since the GDP is mainly produced in companies, and many companies work in several locations across municipal and regional borders, locating their contribution to the GDP is the result of a more or less precise estimation.
 
-Pan European surveys are very important data sources for many social data products, but they are often created with the use of nationally representative samples.  Even if they contain regional coding, and they can be re-arranged into regional statistics, the results are of lower quality, as the original survey sample is not representative to each and every NUTS2 or NUTS3 region of Germany, for example.  (Of course, since Malta is a NUTS2 region, survey data from Malta is representative on NUTS2 = NUTS1 = NUTS0 level.)  Practically this means that many statistical products of Eurostat are mixed products, i.e. they contain NUTS1 level data for larger member states, such as Germany, France or Italy, and they contain NUTS2 level data for other member states.
+Pan-European surveys are very important data sources for many social data products, but they are often created with the use of nationally representative samples.  Even if they contain regional coding, and they can be re-arranged into regional statistics, the results are of lower quality, as the original survey sample is not representative to each and every NUTS2 or NUTS3 region of Germany, for example.  (Of course, since Malta is a NUTS2 region, survey data from Malta is representative on NUTS2 = NUTS1 = NUTS0 level.)  Practically this means that many statistical products of Eurostat are mixed products, i.e. they contain NUTS1 level data for larger member states, such as Germany, France or Italy, and they contain NUTS2 level data for other member states.
+
+One problem with Eurostat's data products is that Eurostat has no legal mandate to force national statistical offices to create consistent datasets. Sometimes data 'goes missing' because the national statistical office, which is responsible for the quality and validity of the data, does not recode the historical data with the new geographic label definitions.  
 
 ### Metadata quality
 
 And at last, the metadata quality of Eurostat’s products is not as good as on NUTS0 national level. A particularly problematic issue is that Eurostat’s tables do not differentiate between the current NUTS2016 regional boundaries and the NUTS2013 or NUTS2010 boundaries.  Some data tables contain rows that cannot and must not be compared.  For example, France went under a very thorough change in its regional boundaries, meaning that NUTS2013 regional data from 2013 can only be compared in the case of a very small fraction of the country with NUTS2016 data from 2016 or 2018. 
 
-Furthermore, Eurostat has a very problematic practice with simply removing statistical products when metadata definitions change.  So, you may have downloaded industry-level data with the NACE Rev 2 definition or French regional data with the NUTS 2013 definition, but under the same title, you will be downloading a differently defined dataset in 2020.  Or, you will not be able to reproduce your code, because they will remove the data with your earlier definition.  
+You can download the correspondence table in Excel.
+
+```{r download, eval=FALSE}
+# download to a temporary file 
+tf <- tempfile(fileext = ".xlsx")
+download.file(url = 'https://ec.europa.eu/eurostat/documents/345175/629341/NUTS2013-NUTS2016.xlsx', destfile = tf,  mode = 'wb'  )
+```
+
+The correspondence table(s) are not tidy, and they are in several sheets which are not fully consistent.  The name of the French region `FR7` or Centre-Est is marked as `discontinued` in the sheet `Correspondence NUTS-1` and at the same time as `relabelled and recoded` to `FRK`, or Auvergne-Rhône-Alpes. We believe that the latter case is correct and use only this row in the correspondence table to avoid duplications in joining.
+
+Furthermore, Eurostat has a very problematic practice with simply removing statistical products when metadata definitions change.  So, you may have downloaded industry-level data with the NACE Rev 2 definition or French regional data with the NUTS 2013 definition, but under the same title, you will be downloading a differently defined dataset in 2020.  Or, you will not be able to reproduce your code, because they will remove the data with your earlier definition.  While it is clear that Eurostat cannot take care of boundary changes if the responsible national statistical offices fail to do this, removing the history of data products makes the validation of professional and academic work made with such data impossible in some cases. 
 
 The logical workflow is the following:
 
+- understand how different parts of your data are affected by the problem, particularly if you want to join different data sets, such as GDP with population;
 - correct metadata (labelling) errors;
-- impute additve data based on the correspondence table;
+- impute additive data based on the correspondence table;
 - impute non-additive data from larger territorial units;
 - optionally estimate non-additive boundary change effects.
 
+Instead of creating only the correction functions, I added a few further steps down the road, because if you work with different datasets, not to mention different data sources, the problem may be different in each dataset that you try to join.  
+
 # Taking care of boundaries
 
 Most regional statistical products are made on the NUTS2 level, or they are mixed NUTS1-NUTS2 level statistics.  This means that usually you have 150-300 units to compare, which is gives an unprecedented richness in cross-sectional analysis.  Most US or Australian datasets are not so detailed in cross-section, and data availability in the rest of the world is just lower.
@@ -68,12 +84,12 @@ A simple strategy is to create a _panel of only those data that do not change bo
 
 Keeping track of the changes is a much better strategy, and up to a point, it is a costless in the amount of work, because often _only the metadata is changing_. Member states, when they change tow regions’ boundary only, will nevertheless create new regional codes for all their regions, to make sure that regional labels do not mix. However, Eurostat is not following this practice well, and it does mixes up different labels.
 
-With the new helper function check_nuts2013() you can see which geo labels have been changing.
+With the new helper function `check_nuts_2013()` you can see if your geo label codes are affected by these changes, and you get a first view on how you can continue your work.
 
 ```{r checknuts2013}
 eurostat::tgs00026 %>%
   filter ( time == 2012 ) %>%
-  check_nuts2013() 
+  harmonize_geo_code() 
 ```
 
 Zooming on regions `UKM` you can see that `UKM5` and `UKM6` are unchanged, `UKM3` gave birth to two new regional units `UKM8` and `UKM9` (this is an additive change) and `UKM2` lost a NUTS3 unit `UKM24`. This latter one is also an additive change, but maybe far more difficult to handle in practice, because data about `UKM24` may not be available in most cases, as NUTS1 and NUTS2 level data is only available for a very few basic indicators on NUTS3 level. You can, however, easily maintain backward compatibility among `UKM3`, `UKM8`, `UKM9`, because the new data is just available in higher resolution, or, in other words, for two halves of the earlier `UKM3` region.
@@ -82,23 +98,28 @@ Zooming on regions `UKM` you can see that `UKM5` and `UKM6` are unchanged, `UKM3
 # for readability the previous example is filtered and reduced
 eurostat::tgs00026 %>%
   filter ( time == 2012 ) %>%
-  check_nuts2013() %>%
+  harmonize_geo_code() %>%
   filter ( grepl("UKM", geo) ) %>%
   select ( geo, values, change )
 ```
 
-## Only metadata changed
+For easier filtering in further use, there are two logical variables added to the data frame, i.e. `nuts_2013` and `nuts_2016`. Many datasets contain non-EU regions not covered in the Eurostat correspondence tables, their filter is `nuts_2013 == FALSE & nuts_2016 == FALSE`.
+
+The following example will filter out all rows that use a geo code which is defined in NUTS2013 and cannot be found in NUTS2016.  These are the main sources of incompatibility in your data panel.
+
+```{r filterdifference}
+eurostat::tgs00026 %>%
+  filter ( time == 2012 ) %>%
+  check_nuts_2013() %>%
+  filter ( nuts_2013, ! nuts_2016 ) 
+```
+
+## Recoding needed: only the metadata changed
 
 The first, logical step is to find those data points which are in fact identical, only their regional codes have changed.  For example, `FRC1` is in fact identical to region with the NUTS2013 label `FR26` (Bourgogne region in France.)  In this case, you can simply re-label the regions that appear to be different just because of the different codes applied.
 		
 The helper function `harmonize_geo_code()` will assist you with these cases. 
 
-```{r harmonize}
-#If check_nuts2013() is not called, the function will call it.    
-eurostat::tgs00026  %>% 
-  harmonize_geo_code()   
-```
-
 To make the example more clear, let's zoom on changes in France.  You can see that many regions changes, but some of them only changed labels.  For forward compatibility,  `harmonize_geo_code()` changed all geo labels to the current, `NUTS2016` definition. In fact, this is needed to use maps, for example. 
 
 ```{r harmonizeFR}
@@ -107,36 +128,37 @@ eurostat::tgs00026 %>%
   filter ( time == 2012 ) %>%
   harmonize_geo_code() %>%
   filter ( grepl("FR", geo) ) %>%
-  select ( geo, code13,  code16, change, resolution, values )
+  select ( geo, code13,  code16, change, values )
 ```
 
 In the change log, `recoded` means that the geo code was changed in the transition to NUTS2016, `recoded and relabelled` means that not only the code, but also the official name of the region changed.
 
-For comparing with additional data sources, it may be useful to make sure that you use the current name of the region. Function `convert_to_nuts2016()` changes the name column to the NUTS2016 definition, when applicable.
+You can decide which coding you prefer to use. Beware to use consistent map definitions if you will visualize your work - you can add the NUTS2013 labelled data to a map that contains the NUTS2013 boundary definitions.
+
+For comparing with additional data sources, it may be useful to make sure that you use the current name of the region. Function `recode_to_nuts_2016()` changes the name column to the NUTS2016 definition, when applicable, and `recode_to_nuts_2013()` will use the earlier definition.
 
 ```{r convertFR}
 # for readability the previous example is filtered and reduced
 eurostat::tgs00026 %>%
   filter ( time == 2012 ) %>%
-  convert_to_nuts2016() %>%
+  recode_to_nuts_2016() %>%
   filter ( grepl("FR", geo) ) %>%
   select ( geo, name,  code16, change, resolution, values )
 ```
 
-You can create a NUTS2016-only dataframe with filtering for `nuts2016 == TRUE` or review the observation which are not part of the NUTS2016 definition with `nuts2016 == FALSE`. 
-
-Another useful filter is `change == "not in the EU"`.  The non-EU member state region definitions (and their possible changes) are not covered in the Eurostat correspondence table.  You may need to review these manually, and if you have a problem with the boundaries, refer to the national statistical authorities of these non-EU countries.
+Another useful filter is `change == "not in the EU"`.  The non-EU member state region definitions (and their possible changes) are not covered in the Eurostat correspondence table.  
 
 ```{r convertfilter}
 # for readability the previous example is filtered and reduced
 eurostat::tgs00026 %>%
   filter ( time == 2012 ) %>%
-  convert_to_nuts2016() %>%
-  filter ( ! nuts2016 ) %>%
-  filter ( ! change == "not in the EU")
+   recode_to_nuts_2016() %>%
+   filter ( ! nuts_2013, ! nuts_2016 )
 ```
 
-## Filling in new boundaries with historical data
+You may need to review these manually, and if you have a problem with the boundaries, refer to the national statistical authorities of these non-EU countries.
+
+## Imputing to new boundaries with historical data
 
 Eurostat released an untidy Excel document that contains all boundary changes from the `NUTS2013` to the `NUTS2016` boundary definition. You can load these tidy tables into your global environment with  `data("nuts_correspondence")` and `data ("regional_changes_2016")` or simply reference them as `eurostat::nuts_correspondence` and `eurostat::regional_changes_2016`. (The `eurostat::` part can be omitted if you have called earlier`library(eurostat)` in your code.)
 
@@ -152,18 +174,17 @@ nuts_correspondence %>%
 
 For example, the new NUTS1 regions `FRB` is simply the continuation of the earlier NUTS2 region `FR24`. Or, the new NUTS1 region `FRC` can be filled with historical data with simply adding `FR26` and `FR43` NUTS2 data observations.
 
-## Backfill to historical boundaries
+### Backfill to historical boundaries
 
 When applying the latest boundaries (and visualizing according to current boundaries) is not important, it may be easier, or leave you with a larger panel of data if you use the correspondence information to backfill new, NUTS2016 data into the NUTS2013 boundaries, simply because you have more data following the earlier definition.
 
 ## Imputation strategies
 
-There are many imputation methodologies implemented in various R libraries (see [CRAN Task View: Missing Data](https://cran.r-project.org/web/views/MissingData.html)) You have to beware that most of these methods are not satisfactory in regional datasets. Whenever misssingness is caused by boundary changes, it will certainly violate many imputation method's conditions.  For example, many imputation strategies work when missingness is random. Therefore, it is very important that you first align the boundaries, and then apply imputation.
+There are many imputation methodologies implemented in various R libraries (see [CRAN Task View: Missing Data](https://cran.r-project.org/web/views/MissingData.html)). You have to beware that most of these methods are not satisfactory in regional datasets. Whenever missingness is caused by boundary changes, it will certainly violate many imputation methods' conditions.  For example, many imputation strategies work when missingness is random. Therefore, it is very important that you first align the boundaries, and then apply imputation.
 
-Consider the following very simple example:
-
-```{r example1}
+Consider the following very simple, hypothetical example:
 
+```{r example1, echo=FALSE}
 tibble ( regions =c("A02 - from 2015 in D1 greater region", 
                     "B01 - from 2015 in D1 greater region", 
                     "C1", 
@@ -174,11 +195,11 @@ tibble ( regions =c("A02 - from 2015 in D1 greater region",
 
 ```
 
-How would you interpolate the missing 2015 data?  In the case of region `C`, there are no boundary changes, and the data seems constant. You would interpolate the value 10.  
+How would you interpolate the missing 2015 data?  In the case of region `C`, there are no boundary changes, and the data seems constant. You would interpolate the value to be 10.  
 
 However, in the case of the new `D1` region, we first reconstruct the sum of its smaller regions, `A02` + `B01` where we have historical data.  If `D1` region would have been defined as a region in 2014, its value would have been 3.  So the correct intrapolation is 4.
 
-```{r example2}
+```{r example2, echo=FALSE}
 tibble ( regions =c("A02 - from 2015 in D1 greater region", 
                     "B01 - from 2015 in D1 greater region", 
                     "C1 - 2015: intrapolated", 
@@ -189,11 +210,11 @@ tibble ( regions =c("A02 - from 2015 in D1 greater region",
 
 ```
 
-However, you dataset is richer in the old boundary set, because `D1` had a higher resolution with data given for its constituent subregions, `A02` and `B01`. 
+You may still wonder if you should use the old boundary definitions, because `D1` had a higher data resolution there: the statistics were broken down to its constituent subregions, `A02` and `B01`. 
 
-```{r example3}
-tibble ( regions =c("A02 - extrapolated with D1 data", 
-                    "B01 - extrapoloated with D1 data", 
+```{r example3, echo=FALSE}
+data.frame ( regions =c("A02 - extrapolated with D1 data", 
+                    "B01 - extrapolated with D1 data", 
                     "C1  - 2015: intrapolated", 
                     "D1 -  2014: A02+B02"), 
          Y2014 = c(1,2,10,3), 
@@ -202,13 +223,31 @@ tibble ( regions =c("A02 - extrapolated with D1 data",
 
 ```
 
-There are a few things to keep in mind when you start actually analyze the data. 
+There are a few things to keep in mind when you actually start to analyse the data. 
+
+If you fill up your data set to both old and new boundary definitions, your dataset _appears to be bigger_, but it _does not contain more information_. Keeping both `A02 and B01` and `D1` in your panel duplicates the new D1 region in your panel which is formerly known as `A02` and `B01`. If you measure growth, you will overestimate average growth, because the high-growth region is duplicated in the dataset. You must remove either `A02 and B01` or `D1` from your panel, otherwise you will skew the effect that you analyse towards `D1`.
+
+The use of the old boundaries makes sense if you have more data in the old definition prior to 2014.  In this case, your dataset will contain fewer estimated values if you stick to the historical boundaries, extrapolate the discontinued `A02` and `B01` regions, and leave `D1` out of your models.
+
+The use of new boundaries is useful when you have more data after Y2016. In this case, the switch to a lower geographical resolution (merging `A02` and `B01` into `D1`) is balanced by the fact that you have more recent and more factual data about the less detailed `D1` observation.  In this case, backfilling the `D1` data via reverse extrapolation is the better strategy. You should leave `A02` and `B01` out of your further analysis.
+
+# Suggestions for Eurostat
+
+
+There are problems with Eurostat’s data products on two levels: with the data and with the metadata.
+
+The data problems are affecting the work of national statistical authorities, because they are responsible for the creation, validation, and when necessary, the later correction of data.  Eurostat cannot change the data they submit; however, it can change harmonization methodology, guidelines, and when necessary, initiate change in statistical regulation.
+I think that updating guidelines, and possibly even regulation, would not be controversial if member states were asked to provide the history of their statistics in the cases when the content of the data did not change, only its metadata, i.e. the labelling. If a member state changed the boundaries of a region, it may or may not be possible to re-calculate the data for this region. However, when only the name and short code changed, the data points are there, and they should be included in the data products.
+
+Regarding metadata, Eurostat could improve its products without the involvement of member states.  The current problem with the metadata of the regional statistics is that they are not tidy and not fully consistent.  The variable column ‘geo’ in the statistical products in fact contains at least three different pieces of information: the level of aggregation, the label of the information in the NUTS2013 definition and the label of the information in the NUTS2016 definition.  Depending on what view you take on the contents of the table, this means that a seemingly single data table in fact is an unlabelled join of three tables: a national data table, and two regional data tables following different regional boundaries.
+
+The addition of the NUTS (or NUTS equivalent non-EU) level would already remove a lot of confusion and several metadata errors. The source of the confusion is that many products claim to contain NUTS2 information, but they contain a mixture of NUTS0, NUTS1 and NUTS3 information.  While the geo column can be easily filtered (by the number of characters of the geo code) this information is not known to all users. Adding the nuts_level variable in our case makes joining various data sources far easier and less confusing.
 
-Your dataset appears to be bigger, but it does not contain more information. Keeping both `A02 and B01`  and `D1` in your panel duplicates the new D1 region in your panel which is formerly known as `A02` and `B01`. If you measure growth, you will overestimate average growth, because the high-growth region is duplicated in the dataset. You must remove either `A02 and B01` or `D1` from your panel, otherwise you will skew the effect that you analyze towards `D1`.
+Several ways could be found to add the information currently contained in the (otherwise not tidy) Correspondence Table to each regional product.  This would require adding the information about which NUTS definition each row (observation) in the dataset complies with.  It could be done in several ways from a data presentation and organization point of view. What should be minimally added is the NUTS definition (vocabulary) where the NUTS unit can be found, and potentially, as our helper functions do, further information about conversion.
 
-The use of the old boundaries makes sense if you have more data in the old definiton prior to 2014.  In this case, your dataset will contain less estimated values if you stick to the historical boundaries, and extrapoloate the discontinued `A02` and `B01` regions.
+A solution to the metadata presentation of the regional statistical products does not require the modification of statistical regulations (which must be adopted by the member states of the EU) and it is very urgent, because the next NUTS changes are already announced.
 
-The use of new boundaries is useful when you have more data after Y2016. In this case, the switch to a lower geographical resolution (merging A02 and `B01` to `D1`) is balanced by the fact that you have more recent and more factual data about the less detailed `D1` observation.  In this case, backfilling via reverse extrapolation the `D1` data is the better strategy.
+And at last, a non-controversial change, which may require updating guidelines or regulations, would be to add, at least on a non-mandatory basis, non-EU countries to the Correspondence tables.  It is very unlikely that EEA countries like Norway or potential candidate countries like North Macedonia would have objections to reporting their regional boundary changes to the Correspondence tables.  This is a self-evident change, which is also necessary after Brexit, given that the United Kingdom’s boundary data will have to remain in the Correspondence tables. 
 
 # Citations and related work
 

From afe711edc828a36c9265004430346a81a075bed5 Mon Sep 17 00:00:00 2001
From: Daniel Antal <antaldaniel@users.noreply.github.com>
Date: Sat, 8 Feb 2020 22:28:23 +0100
Subject: [PATCH 05/11] check and harmonize are now in one function

---
 R/check_nuts_2013.R    | 104 -----------------------------------------
 man/check_nuts_2013.Rd |  33 -------------
 2 files changed, 137 deletions(-)
 delete mode 100644 R/check_nuts_2013.R
 delete mode 100644 man/check_nuts_2013.Rd

diff --git a/R/check_nuts_2013.R b/R/check_nuts_2013.R
deleted file mode 100644
index 8a1b8d7f..00000000
--- a/R/check_nuts_2013.R
+++ /dev/null
@@ -1,104 +0,0 @@
-#' @title Check NUTS region codes that changed with the \code{NUTS2016} definition
-#' @description Eurostat mixes \code{NUTS2013} and \code{NUTS2016} geographic
-#' label codes in the \code{'geo'} column, which creates time-wise comparativity issues.
-#' This function checks if you data is affected by this problem and gives
-#' information on what to do.
-#' @param dat A Eurostat data frame downloaded with \code{\link{get_eurostat}}
-#' @export
-#' @author Daniel Antal
-#' @return An augmented data frame or a message about potential coding
-#' errors. For filtering, it marks \code{'non_EU'} and \code{'unchanged'}
-#' regions. Observations with codes ending on \code{'ZZ'} or \code{'XX'} are
-#' removed from the returned data table, because these are non-territorial
-#' observations or they are outside of the EU.
-#' @importFrom dplyr mutate filter rename mutate_if case_when
-#' @importFrom dplyr left_join full_join anti_join
-#' @examples
-#'  \dontrun{
-#'    dat <- eurostat::tgs00026
-#'    check_nuts_2013(dat)
-#'  }
-
-check_nuts_2013 <- function (dat) {
-  
-  ## For non-standard evaluation -------------------------------------
-  . <- change  <- geo <- code13 <- code16 <- nuts_level <- NULL
-  regional_changes_2016 <- country_code <- NULL
-  
-  ## The data is not loaded into the global environment --------------
-  
-  regional_changes_2016 <- load_package_data(dataset = "regional_changes_2016")
-  
-  unchanged_regions <- regional_changes_2016 %>% 
-    filter ( change == 'unchanged' )
-  
-  changed_regions <- regional_changes_2016 %>% 
-    filter ( change != 'unchanged' )
-  
-  ## Changed regions to be looked up by their NUTS2016 codes -----------
-  regional_changes_by_2016 <- regional_changes_2016 %>%
-    mutate ( geo = code16 ) %>% 
-    filter ( !is.na(code13) )
-  
-  ## adding those that have no equivalent in the previous group
-  ## some regions have to be identified by their old and new codes -----
-  regional_changes_by_2013 <- regional_changes_2016 %>%
-    mutate ( geo = code13 ) %>% 
-    filter ( !is.na(code13) ) %>%
-    anti_join ( regional_changes_by_2016, 
-                by = c("code13", "code16", "name",
-                       "nuts_level", "change", "geo") )
-  
-  ## Region can be found by new or old NUTS code -----------------------
-  
-  all_regional_changes <- regional_changes_by_2016 %>%
-    full_join ( regional_changes_by_2013, 
-                by = c("code13", "code16", "name",
-                       "nuts_level",
-                       "change", "geo") )
-  
-  
-  tmp <- dat %>%
-    mutate_if ( is.factor, as.character ) %>%
-    left_join ( all_regional_changes, by = 'geo' ) %>%
-    mutate ( nuts_level = ifelse (is.na(nuts_level), 
-                                  add_nuts_level(geo),
-                                  nuts_level))
-  
-  if ( all ( tmp$change %in% unique(regional_changes_2016$code16) )) {
-    message ( "All observations are coded with NUTS2016 codes" )
-    there_are_changes <- FALSE
-  }
- 
-  eu_countries <- load_package_data(dataset = "eu_countries")
-
-  eu_country_vector <-  unique ( substr(eu_countries$code, 1, 2) )
-  
-  tmp <- tmp %>%
-    mutate ( country_code = substr(geo,1,2) ) %>%
-    mutate ( change = ifelse ( country_code %in% eu_country_vector, 
-                               yes  = change,
-                               no = "not in the EU")) %>%
-    select ( -country_code )
-
-  if ( any(tmp$change == 'not in the EU') ) {
-    
-    not_EU_country_vector <- substr(tmp$geo, 1,2)
-    not_EU_country_vector <- not_EU_country_vector [ !not_EU_country_vector %in% eu_country_vector]
-    ## The correspondence table only covers EU regions.
-    message ( "Not checking for regional label consistency in non-EU countries\n",
-              "In this data frame non-EU country: ", 
-              paste (sort(unique(not_EU_country_vector)),
-                     collapse = ", "), "." )
-  }
-  
-  nuts_2016_codes <- unique(regional_changes_2016$code16)
-  nuts_2013_codes <- unique(regional_changes_2016$code13)
-  
-  tmp <- tmp %>%
-    mutate ( nuts_2016 = ifelse ( geo %in% nuts_2016_codes, 
-                                  TRUE, FALSE),
-             nuts_2013 = ifelse ( geo %in% nuts_2013_codes, 
-                                  TRUE, FALSE))
-  
-}
diff --git a/man/check_nuts_2013.Rd b/man/check_nuts_2013.Rd
deleted file mode 100644
index 382692df..00000000
--- a/man/check_nuts_2013.Rd
+++ /dev/null
@@ -1,33 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/check_nuts_2013.R
-\name{check_nuts_2013}
-\alias{check_nuts_2013}
-\title{Check NUTS region codes that changed with the \code{NUTS2016} definition}
-\usage{
-check_nuts_2013(dat)
-}
-\arguments{
-\item{dat}{A Eurostat data frame downloaded with \code{\link{get_eurostat}}}
-}
-\value{
-An augmented data frame or a message about potential coding
-errors. For filtering, it marks \code{'non_EU'} and \code{'unchanged'}
-regions. Observations with codes ending on \code{'ZZ'} or \code{'XX'} are
-removed from the returned data table, because these are non-territorial
-observations or they are outside of the EU.
-}
-\description{
-Eurostat mixes \code{NUTS2013} and \code{NUTS2016} geographic
-label codes in the \code{'geo'} column, which creates time-wise comparativity issues.
-This function checks if you data is affected by this problem and gives
-information on what to do.
-}
-\examples{
- \dontrun{
-   dat <- eurostat::tgs00026
-   check_nuts_2013(dat)
- }
-}
-\author{
-Daniel Antal
-}

From 885f6747ff1596a8739c31cbb6810e42059f6062 Mon Sep 17 00:00:00 2001
From: Daniel Antal <antaldaniel@users.noreply.github.com>
Date: Sat, 8 Feb 2020 22:29:00 +0100
Subject: [PATCH 06/11] recoding is renamed and tested.

---
 NAMESPACE                      |  6 ----
 R/recode_to_nuts_2013.R        | 51 ++++++++++++++++-------------
 R/recode_to_nuts_2016.R        | 52 +++++++++++++++++-------------
 man/recode_to_nuts_2013.Rd     | 27 +++++++++-------
 man/recode_to_nuts_2016.Rd     | 27 +++++++++-------
 tests/testthat/test_regional.R | 59 ++++++++++++++++++++++++++++++++++
 6 files changed, 150 insertions(+), 72 deletions(-)
 create mode 100644 tests/testthat/test_regional.R

diff --git a/NAMESPACE b/NAMESPACE
index 6935f36c..f158a22d 100755
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -2,7 +2,6 @@
 
 export(add_nuts_level)
 export(check_access_to_data)
-export(check_nuts_2013)
 export(clean_eurostat_cache)
 export(cut_to_classes)
 export(dic_order)
@@ -32,7 +31,6 @@ importFrom(classInt,classIntervals)
 importFrom(countrycode,countrycode)
 importFrom(curl,curl_download)
 importFrom(dplyr,"%>%")
-importFrom(dplyr,add_count)
 importFrom(dplyr,anti_join)
 importFrom(dplyr,arrange)
 importFrom(dplyr,case_when)
@@ -43,9 +41,7 @@ importFrom(dplyr,left_join)
 importFrom(dplyr,mutate)
 importFrom(dplyr,mutate_if)
 importFrom(dplyr,rename)
-importFrom(dplyr,right_join)
 importFrom(dplyr,select)
-importFrom(dplyr,semi_join)
 importFrom(httr,GET)
 importFrom(httr,build_url)
 importFrom(httr,content)
@@ -65,13 +61,11 @@ importFrom(sp,spplot)
 importFrom(stats,lag)
 importFrom(stringi,stri_match_first_regex)
 importFrom(stringr,str_replace_all)
-importFrom(stringr,str_sub)
 importFrom(tibble,as_tibble)
 importFrom(tibble,data_frame)
 importFrom(tibble,is_tibble)
 importFrom(tidyr,gather_)
 importFrom(tidyr,separate)
-importFrom(tidyselect,all_of)
 importFrom(utils,data)
 importFrom(utils,download.file)
 importFrom(utils,toBibtex)
diff --git a/R/recode_to_nuts_2013.R b/R/recode_to_nuts_2013.R
index 0daa26c4..a443f70c 100644
--- a/R/recode_to_nuts_2013.R
+++ b/R/recode_to_nuts_2013.R
@@ -15,27 +15,31 @@
 #' Furthermore, when the official name of the region changed, it will use
 #' the new name (if the otherwise the region boundary did not change.)
 #' If not called before, the function will use the helper function
-#'  \code{\link{check_nuts_2013}} and  \code{\link{harmonize_geo_code}}
+#' \code{\link{harmonize_geo_code}}
 #' @importFrom dplyr mutate filter rename arrange case_when
 #' @importFrom dplyr left_join inner_join anti_join
-#' @importFrom tidyselect all_of
 #' @examples
-#'  \dontrun{
-#'   eurostat::tgs00026 %>%
-#'      check_nuts2013() %>%
-#'      harmonize_geo_code() %>%
-#'      recode_to_nuts_2013() 
-#'      
-#'  #If check_nuts2013() is not called, the function will call it.    
-#'   eurostat::tgs00026 %>%
-#'      recode_to_nuts_2013()    
-#'  }
+#' test_regional_codes <- data.frame ( 
+#'   geo = c("FRB", "FRE", "UKN02", "IE022", "FR243", "FRB03"),
+#'   time = c(rep(as.Date ("2014-01-01"), 5), as.Date("2015-01-01")), 
+#'   values = c(1:6), 
+#'   control = c("Changed from NUTS2 to NUTS1", 
+#'               "New region NUTS2016 only", 
+#'               "Discontinued region NUTS2013", 
+#'               "Boundary shift NUTS2013", 
+#'               "Recoded in NUTS2013", 
+#'               "Recoded in NUTS2016"
+#'   )) 
+#'  
+#'  test_regional_codes %>%
+#'   harmonize_geo_code () %>%
+#'    recode_to_nuts_2013()
 #' @export
  
 recode_to_nuts_2013 <- function (dat) {
   
   . <- nuts_level <- geo <- code13 <- code16 <- time <- name <- NULL
-  type <- nuts_correspondence <- regional_changes_2016 <- NULL
+  type  <- NULL
 
   regional_changes_2016 <- load_package_data(dataset = "regional_changes_2016")
   nuts_correspondence <- load_package_data(dataset = "nuts_correspondence")
@@ -50,17 +54,19 @@ recode_to_nuts_2013 <- function (dat) {
   
   tmp <- tmp %>%
     mutate ( geo = case_when (
-      !is.na(geo)                   ~ geo,
-      change     == "not in the EU" ~ geo,
-      TRUE ~ code13
+      geo    == code13                       ~ geo,
+      change == "not in EU - not controlled" ~ geo,
+      TRUE   ~ code13
     ))
   
   if ( any (is.na(tmp$geo)) ) {
-    warning ( "The following regions have no geo labels:", 
+    warning ( "The following regions have no NUTS2013 geo labels: ", 
               tmp %>%
-                filter ( is.na(geo) && (nuts2016 = TRUE) ) %>%
-                select (geo) %>%
-                as.character() )
+                filter ( is.na(geo) & (nuts2013 = TRUE) ) %>%
+                select (code16) %>%
+                unlist() %>%
+                unique() %>% 
+                paste(., collapse = ", "), "." )
     
   }
   
@@ -74,12 +80,13 @@ recode_to_nuts_2013 <- function (dat) {
   
   regions_with_other_names <- tmp %>% 
     anti_join ( regions_by_nuts2013_names, 
-                    by = tidyselect::all_of(names(tmp)) )
+                    by = names(tmp) )
   
   rbind ( regions_by_nuts2013_names,
           regions_with_other_names ) %>%
     arrange ( time, geo, code16 ) %>%
     left_join ( nuts_correspondence, 
-                by = c("code13", "code16", "nuts_level", "change", "name"))
+                by = c("code13", "code16", "nuts_level",
+                       "change", "name", "resolution"))
   
 }
diff --git a/R/recode_to_nuts_2016.R b/R/recode_to_nuts_2016.R
index 044ec168..db35cc19 100644
--- a/R/recode_to_nuts_2016.R
+++ b/R/recode_to_nuts_2016.R
@@ -15,27 +15,31 @@
 #' Furthermore, when the official name of the region changed, it will use
 #' the new name (if the otherwise the region boundary did not change.)
 #' If not called before, the function will use the helper function
-#'  \code{\link{check_nuts_2013}} and  \code{\link{harmonize_geo_code}}
+#' \code{\link{harmonize_geo_code}}
 #' @importFrom dplyr mutate filter rename arrange case_when
-#' @importFrom dplyr left_join inner_join anti_join semi_join
-#' @importFrom tidyselect all_of
+#' @importFrom dplyr left_join inner_join anti_join
 #' @examples
-#'  \dontrun{
-#'   eurostat::tgs00026 %>%
-#'      check_nuts2013() %>%
-#'      harmonize_geo_code() %>%
-#'      recode_to_nuts_2016() 
-#'      
-#'  #If check_nuts2013() is not called, the function will call it.    
-#'   eurostat::tgs00026 %>%
-#'      recode_to_nuts_2016()    
-#'  }
+#' test_regional_codes <- data.frame ( 
+#'   geo = c("FRB", "FRE", "UKN02", "IE022", "FR243", "FRB03"),
+#'   time = c(rep(as.Date ("2014-01-01"), 5), as.Date("2015-01-01")), 
+#'   values = c(1:6), 
+#'   control = c("Changed from NUTS2 to NUTS1", 
+#'               "New region NUTS2016 only", 
+#'               "Discontinued region NUTS2013", 
+#'               "Boundary shift NUTS2013", 
+#'               "Recoded in NUTS2013", 
+#'               "Recoded in NUTS2016"
+#'   )) 
+#'  
+#'  test_regional_codes %>%
+#'   harmonize_geo_code () %>%
+#'    recode_to_nuts_2016()
 #' @export
  
 recode_to_nuts_2016 <- function (dat) {
   
   . <- nuts_level <- geo <- code13 <- code16 <- time <- name <- NULL
-  type <- nuts_correspondence <- regional_changes_2016 <- NULL
+  type <- NULL
 
   regional_changes_2016 <- load_package_data(dataset = "regional_changes_2016")
   nuts_correspondence <- load_package_data(dataset = "nuts_correspondence")
@@ -50,16 +54,19 @@ recode_to_nuts_2016 <- function (dat) {
   
   tmp <- tmp %>%
     mutate ( geo = case_when (
-      !is.na(geo)                   ~ geo,
-      change     == "not in the EU" ~ geo,
+      geo    == code16                       ~ geo,
+      change == "not in EU - not controlled" ~ geo,
       TRUE ~ code16
     ))
   
-  if ( any (is.na(tmp$geo) && (nuts2016 = TRUE)) ) {
-    warning ( "The following regions have no geo labels:", 
+  if ( any (is.na(tmp$geo)) ) {
+    warning ( "The following regions have no NUTS2016 labels: ", 
               tmp %>%
-                filter ( is.na(geo) && (nuts2016 = TRUE) ) %>%
-                as.character(geo) )
+                filter ( is.na(geo) & (nuts2013 = TRUE) ) %>%
+                select (code13) %>%
+                unlist() %>%
+                unique() %>% 
+                paste(., collapse = ", "), ".")
     
   }
   
@@ -73,12 +80,13 @@ recode_to_nuts_2016 <- function (dat) {
   
   regions_with_other_names <- tmp %>% 
     anti_join ( regions_by_nuts2016_names, 
-                    by = tidyselect::all_of(names(tmp)) )
+                    by = names(tmp) )
   
   rbind ( regions_by_nuts2016_names,
           regions_with_other_names ) %>%
     arrange ( time, geo, code16 ) %>%
     left_join ( nuts_correspondence, 
-                by = c("code13", "code16", "nuts_level", "change", "name"))
+                by = c("code13", "code16", "nuts_level",
+                       "change", "name", "resolution"))
   
 }
diff --git a/man/recode_to_nuts_2013.Rd b/man/recode_to_nuts_2013.Rd
index e556d80a..42e43e86 100644
--- a/man/recode_to_nuts_2013.Rd
+++ b/man/recode_to_nuts_2013.Rd
@@ -19,7 +19,7 @@ that cannot be brought to the current \code{'NUTS2013'} definition.
 Furthermore, when the official name of the region changed, it will use
 the new name (if the otherwise the region boundary did not change.)
 If not called before, the function will use the helper function
- \code{\link{check_nuts_2013}} and  \code{\link{harmonize_geo_code}}
+\code{\link{harmonize_geo_code}}
 }
 \description{
 Eurostat mixes NUTS2013 and NUTS2016 geographic label codes
@@ -29,16 +29,21 @@ marks discontinued regions, and other regions which may or may not be
 somehow compared to the historic \code{'NUTS2013'} boundaries.
 }
 \examples{
- \dontrun{
-  eurostat::tgs00026 \%>\%
-     check_nuts2013() \%>\%
-     harmonize_geo_code() \%>\%
-     recode_to_nuts_2013() 
-     
- #If check_nuts2013() is not called, the function will call it.    
-  eurostat::tgs00026 \%>\%
-     recode_to_nuts_2013()    
- }
+test_regional_codes <- data.frame ( 
+  geo = c("FRB", "FRE", "UKN02", "IE022", "FR243", "FRB03"),
+  time = c(rep(as.Date ("2014-01-01"), 5), as.Date("2015-01-01")), 
+  values = c(1:6), 
+  control = c("Changed from NUTS2 to NUTS1", 
+              "New region NUTS2016 only", 
+              "Discontinued region NUTS2013", 
+              "Boundary shift NUTS2013", 
+              "Recoded in NUTS2013", 
+              "Recoded in NUTS2016"
+  )) 
+ 
+ test_regional_codes \%>\%
+  harmonize_geo_code () \%>\%
+   recode_to_nuts_2013()
 }
 \author{
 Daniel Antal
diff --git a/man/recode_to_nuts_2016.Rd b/man/recode_to_nuts_2016.Rd
index 41ef02aa..879c89f9 100644
--- a/man/recode_to_nuts_2016.Rd
+++ b/man/recode_to_nuts_2016.Rd
@@ -19,7 +19,7 @@ that cannot be brought to the current \code{'NUTS2016'} definition.
 Furthermore, when the official name of the region changed, it will use
 the new name (if the otherwise the region boundary did not change.)
 If not called before, the function will use the helper function
- \code{\link{check_nuts_2013}} and  \code{\link{harmonize_geo_code}}
+\code{\link{harmonize_geo_code}}
 }
 \description{
 Eurostat mixes NUTS2013 and NUTS2016 geographic label codes
@@ -29,16 +29,21 @@ marks discontinued regions, and other regions which may or may not be
 somehow compared to current \code{'NUTS2016'} boundaries.
 }
 \examples{
- \dontrun{
-  eurostat::tgs00026 \%>\%
-     check_nuts2013() \%>\%
-     harmonize_geo_code() \%>\%
-     recode_to_nuts_2016() 
-     
- #If check_nuts2013() is not called, the function will call it.    
-  eurostat::tgs00026 \%>\%
-     recode_to_nuts_2016()    
- }
+test_regional_codes <- data.frame ( 
+  geo = c("FRB", "FRE", "UKN02", "IE022", "FR243", "FRB03"),
+  time = c(rep(as.Date ("2014-01-01"), 5), as.Date("2015-01-01")), 
+  values = c(1:6), 
+  control = c("Changed from NUTS2 to NUTS1", 
+              "New region NUTS2016 only", 
+              "Discontinued region NUTS2013", 
+              "Boundary shift NUTS2013", 
+              "Recoded in NUTS2013", 
+              "Recoded in NUTS2016"
+  )) 
+ 
+ test_regional_codes \%>\%
+  harmonize_geo_code () \%>\%
+   recode_to_nuts_2016()
 }
 \author{
 Daniel Antal
diff --git a/tests/testthat/test_regional.R b/tests/testthat/test_regional.R
new file mode 100644
index 00000000..8ae5d5b4
--- /dev/null
+++ b/tests/testthat/test_regional.R
@@ -0,0 +1,59 @@
+context ("Regional code harmonization")
+
+
+test_regional_codes <- data.frame ( 
+  geo = c("FRB", "FRE", "UKN02", "IE022", "FR243", "FRB03"),
+  time = c(rep(as.Date ("2014-01-01"), 5), as.Date("2015-01-01")), 
+  values = c(1:6), 
+  control = c("Changed from NUTS2 to NUTS1", 
+              "New region NUTS2016 only", 
+              "Discontinued region NUTS2013", 
+              "Boundary shift NUTS2013", 
+              "Recoded in NUTS2013", 
+              "Recoded in NUTS2016"
+  )) 
+
+test_harmonized <- harmonize_geo_code(test_regional_codes)
+
+try_recode_2013 <- recode_to_nuts_2013(test_harmonized)
+
+try_recode_2016 <- recode_to_nuts_2016(test_harmonized)
+
+lookup_code16 <- test_harmonized %>%
+  filter ( geo  == "FR243") %>%
+  select ( code16 ) %>% unlist() %>% as.character()
+
+lookup_code13 <- test_harmonized %>%
+  filter ( geo  == "FRB03") %>%
+  select ( code13 ) %>% unlist() %>% as.character()
+
+recode_frb <- try_recode_2013 %>%
+  filter ( code16  == "FRB") %>%
+  select ( geo ) %>% unlist() %>% as.character()
+
+recode_ukn02 <- try_recode_2016 %>%
+  filter ( code13  == "UKN02") %>%
+  select ( geo ) %>% unlist() %>% as.character()
+
+
+
+test_that("Recoding gives correct results",{
+  skip_on_cran()
+  skip_on_travis()  
+  expect_equal( lookup_code16, 
+    "FRB03"
+  )
+  expect_equal( lookup_code13, 
+                "FR243"
+  )
+  expect_equal( lookup_code13, 
+                "FR243"
+  )
+  expect_equal( recode_frb, 
+                NA_character_
+  )
+  expect_equal( recode_ukn02, 
+                NA_character_
+  )
+ 
+})

From 0811ce7928f8cdc8c59aaca43f74b4fe5f9cb847 Mon Sep 17 00:00:00 2001
From: Daniel Antal <antaldaniel@users.noreply.github.com>
Date: Sat, 8 Feb 2020 22:33:39 +0100
Subject: [PATCH 07/11] merged documents had name conflicts.

---
 R/harmonize_geo_code.R                        | 207 ++++++++++++++++++
 man/harmonize_geo_code.Rd                     |   2 +-
 ...ew_regional_data.Rmd => regional_data.Rmd} |   0
 3 files changed, 208 insertions(+), 1 deletion(-)
 create mode 100644 R/harmonize_geo_code.R
 rename vignettes/website/{new_regional_data.Rmd => regional_data.Rmd} (100%)

diff --git a/R/harmonize_geo_code.R b/R/harmonize_geo_code.R
new file mode 100644
index 00000000..d2aeb452
--- /dev/null
+++ b/R/harmonize_geo_code.R
@@ -0,0 +1,207 @@
+#' @title Harmonize NUTS region codes that changed with the \code{NUTS2016} definition
+#' @description Eurostat mixes \code{NUTS2013} and \code{NUTS2016} geographic
+#' label codes in the \code{'geo'} column, which creates time-wise comparability issues.
+#' This function checks if your data is affected by this problem and gives
+#' information on what to do.
+#' @param dat A Eurostat data frame downloaded with \code{\link{get_eurostat}}
+#' @export
+#' @author Daniel Antal
+#' @return An augmented data frame or a message about potential coding
+#' errors.
+#' @importFrom dplyr mutate filter rename mutate_if case_when
+#' @importFrom dplyr left_join full_join anti_join
+#' @examples
+#'  \dontrun{
+#'    dat <- eurostat::tgs00026
+#'    harmonize_geo_code(dat)
+#'  }
+
+harmonize_geo_code <- function (dat) {
+  
+  ## For non-standard evaluation -------------------------------------
+  . <- change  <- geo <- code13 <- code16 <- nuts_level <- NULL
+  country_code <- NULL
+  
+  dat <- mutate_if ( dat, is.factor, as.character)
+  
+  ## The data is not loaded into the global environment --------------
+  
+  regional_changes_2016 <- load_package_data(dataset = "regional_changes_2016")
+  nuts_correspondence   <- load_package_data(dataset = "nuts_correspondence")
+  
+  ## Creating constants -----------------------------------------------
+  regions_in_correspondence <- unique(c(nuts_correspondence$code13, nuts_correspondence$code16))
+  regions_in_correspondence <- sort(regions_in_correspondence [!is.na(regions_in_correspondence)])
+
+  unchanged_regions <- regional_changes_2016 %>% 
+    filter ( change == 'unchanged' )
+  
+  # The Eurostat correspondence table had a duplicate entry.  It may
+  # re-occur later and this code may help finding it.
+  # nuts_correspondence_duplicates <- nuts_correspondence %>%
+  #  filter ( !is.na(code13 )) %>%
+  #  add_count ( code13 ) %>% filter ( n > 1 )
+  
+  ## Changed regions to be looked up by their NUTS2016 codes -----------
+  regional_changes_by_2016 <- nuts_correspondence %>%
+    mutate ( geo = code16 ) %>% 
+    filter ( !is.na(code16) ) %>%
+    select ( -geo ) %>%
+    distinct ( code13, code16, name, nuts_level, change, resolution) 
+  
+  # Regions may be duplicated in case their NUTS2016 and NUTS2013 are the same
+  
+  ## adding those that have no equivalent in the previous group
+  ## some regions have to be identified by their old and new codes -----
+  regional_changes_by_2013 <- nuts_correspondence %>%
+    mutate ( geo = code13 ) %>% 
+    filter ( !is.na(code13) ) %>%
+    select ( -geo ) %>%
+    distinct ( code13, code16, name, nuts_level, change, resolution)
+  
+  ## Join the regions by both NUTS definitions -----------------------
+  
+  all_regional_changes <- regional_changes_by_2016 %>%
+    full_join ( regional_changes_by_2013, 
+                by = c("code13", "code16", "name", "nuts_level",
+                       "change", "resolution"))
+  
+  
+  ## Check for potential duplicates ----------------------------------
+  duplicates <- all_regional_changes %>% 
+    add_count ( code13, code16  ) %>%
+    filter ( n > 1 )
+  
+  if ( nrow(duplicates) > 0 ) {
+    stop ("There are duplicates in the correspondence table.")
+  }
+
+  all_regions_full_metadata <- unchanged_regions %>%
+    mutate ( resolution = NA_character_ ) %>% 
+    rbind ( all_regional_changes ) 
+  
+  nuts_2013_codes <- unique (all_regions_full_metadata$code13)#[!is.na(all_regions_full_metadata$code13)]
+  nuts_2016_codes <- unique (all_regions_full_metadata$code16)#[!is.na(all_regions_full_metadata$code16)]
+  nuts_2013_codes <- nuts_2013_codes[!is.na(nuts_2013_codes)]
+  nuts_2016_codes <- nuts_2016_codes[!is.na(nuts_2016_codes)]
+  
+  "PL2" %in% all_regions_full_metadata$code13
+  "PL2" %in% unique ( all_regions_full_metadata$code13)
+  "UKN01" %in% nuts_2013_codes
+  "UKN01" %in% nuts_2016_codes
+  
+  any ( is.na(nuts_2013_codes))
+  
+  tmp_by_code16 <- dat %>%
+    mutate ( geo = as.character(geo)) %>%
+    filter ( geo %in% all_regions_full_metadata$code16 ) %>%
+    left_join (  all_regions_full_metadata %>%
+                  dplyr::rename ( geo = code16 ), 
+                by = "geo") %>%
+    mutate ( code16 = geo ) %>%
+    mutate ( nuts_2016 = geo %in% nuts_2016_codes ) %>%
+    mutate ( nuts_2013 = geo %in% nuts_2013_codes )
+  
+  tmp_by_code13 <- dat %>%
+    mutate ( geo = as.character(geo)) %>%
+    filter ( geo %in% all_regions_full_metadata$code13 ) %>%
+    left_join (  all_regions_full_metadata %>%
+                  dplyr::rename ( geo = code13 ), 
+                by = "geo") %>%
+    mutate ( code13 = geo ) %>%
+    mutate ( nuts_2016 = geo %in% nuts_2016_codes, 
+             nuts_2013 = geo %in% nuts_2013_codes)
+  
+  message ( "In this data frame ", nrow(tmp_by_code16), 
+            " observations are coded with the current NUTS2016\ngeo labels and ", 
+            nrow ( tmp_by_code13), " observations/rows have NUTS2013 historical labels.")
+  
+  tmp_s <- tmp_by_code16 %>%
+    semi_join (  tmp_by_code13, 
+                 by = names ( tmp_by_code13)) # found in both (unchanged and relabelled)
+ 
+  if (! all(tmp_s$nuts_2013 && tmp_s$nuts_2016)) { stop ("Wrong selection of unchanged regions.") }
+  
+  
+  tmp_s2 <- tmp_by_code13 %>%
+    semi_join (  tmp_by_code16, 
+                 by = names (tmp_by_code16)) # found in both (unchanged and relabelled)
+  # NOTE: tmp_s2 is expected to hold the same observations as tmp_s; this is not verified here.
+  
+  tmp_a1 <- tmp_by_code16 %>%
+    anti_join (  tmp_by_code13, 
+                 by = names(tmp_by_code13)) # not found in code13 (new regions)
+  if ( ! all(tmp_a2$nuts_2013)) { stop ("Wrong selection of NUTS2013-only regions.") }
+  
+  
+  tmp_a2 <- tmp_by_code13 %>%
+    anti_join (  tmp_by_code16, 
+                 by = names(tmp_by_code13)) # not found in code16 (changes)
+  if ( ! all(tmp_a2$nuts_2013)) { stop ("Wrong selection of NUTS2013-only regions.") }
+  
+  tmp <- rbind ( tmp_s, tmp_a1, tmp_a2 )
+  
+  not_found_geo <- unique(dat$geo[! dat$geo %in% tmp$geo ])
+  not_eu_regions <- not_found_geo[! substr(not_found_geo,1,2) %in% eu_countries$code]
+  
+  ## Checking if there are unmatched EU regions-------------------------
+  
+  not_found_eu_regions <-  not_found_geo[ substr(not_found_geo,1,2) %in% eu_countries$code]
+ 
+  if ( length(not_found_eu_regions)>0) {
+    stop ( "Some EU regions were not found in the correspondence table.")
+  }
+  
+  ## Adding columns for non-EU regions ----------------------------------
+  tmp_not_eu <- dat %>%
+    filter ( geo %in% not_eu_regions ) %>%
+    mutate ( nuts_level = nchar(geo)-2, 
+             change = "not in EU - not controlled", 
+             resolution = "check with national authorities", 
+             name = NA_character_,
+             code13 = NA_character_, 
+             code16 = NA_character_,
+             nuts_2016 = FALSE, 
+             nuts_2013 = FALSE)
+  
+  tmp2 <- rbind ( tmp, tmp_not_eu)
+  
+
+  ## Check if all original rows are handled correctly ------------------
+  if (length(dat$geo [! dat$geo %in% tmp2$geo ])>0) {
+    message (tmp2 %>% anti_join (dat))
+    message (dat %>% anti_join (tmp2))
+    stop ("Not all original rows were checked.")
+  }
+
+  eu_countries <- load_package_data(dataset = "eu_countries")
+
+  eu_country_vector <-  unique ( substr(eu_countries$code, 1, 2) )
+  
+
+  if ( any(tmp2$change == 'not in EU - not controlled') ) {
+    
+    not_EU_country_vector <- tmp2 %>%
+      filter ( tmp2$change == 'not in EU - not controlled' ) %>%
+      select ( geo ) 
+    
+    not_eu_observations <- nrow (not_EU_country_vector)
+    
+    not_EU_country_vector <- not_EU_country_vector %>%
+      unlist() %>% substr(., 1,2) %>% sort () %>%
+      unique ()
+     ## The correspondence table only covers EU regions.
+    message ( "Not checking for regional label consistency in non-EU countries.\n",
+              "In this data frame not controlled countries: ", 
+              paste (not_EU_country_vector,
+                     collapse = ", "), " \n", 
+              "with alltogether ", not_eu_observations, " observations/rows.")
+  }
+  
+  ## Reorder columns for readability -------------------------------
+  
+  tmp_left <- tmp2 %>% select ( geo,  time, values, code13, code16, name )
+  tmp_right <- tmp2 %>% select ( -geo, -code13, -code16, -time, -values, -name )
+
+  cbind ( tmp_left, tmp_right)
+}
diff --git a/man/harmonize_geo_code.Rd b/man/harmonize_geo_code.Rd
index 75c3aed7..a7a349d7 100644
--- a/man/harmonize_geo_code.Rd
+++ b/man/harmonize_geo_code.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/new_harmonize_geo_code.R
+% Please edit documentation in R/harmonize_geo_code.R
 \name{harmonize_geo_code}
 \alias{harmonize_geo_code}
 \title{Harmonize NUTS region codes that changed with the \code{NUTS2016} definition}
diff --git a/vignettes/website/new_regional_data.Rmd b/vignettes/website/regional_data.Rmd
similarity index 100%
rename from vignettes/website/new_regional_data.Rmd
rename to vignettes/website/regional_data.Rmd

From 9a51ff627df19f5d350d89b5c937175acaf514fc Mon Sep 17 00:00:00 2001
From: Daniel Antal <antaldaniel@users.noreply.github.com>
Date: Sat, 8 Feb 2020 22:42:16 +0100
Subject: [PATCH 08/11] Change @importFrom and non-standard evaluation bindings

---
 R/harmonize_geo_code.R     | 10 +++++-----
 R/recode_to_nuts_2013.R    |  6 ++----
 R/recode_to_nuts_2016.R    |  6 ++----
 man/harmonize_geo_code.Rd  |  4 ++--
 man/recode_to_nuts_2013.Rd |  4 +---
 man/recode_to_nuts_2016.Rd |  4 +---
 6 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/R/harmonize_geo_code.R b/R/harmonize_geo_code.R
index d2aeb452..327b44a3 100644
--- a/R/harmonize_geo_code.R
+++ b/R/harmonize_geo_code.R
@@ -6,10 +6,10 @@
 #' @param dat A Eurostat data frame downloaded with \code{\link{get_eurostat}}
 #' @export
 #' @author Daniel Antal
-#' @return An augmented data frame or a message about potential coding
-#' errors.
-#' @importFrom dplyr mutate filter rename mutate_if case_when
-#' @importFrom dplyr left_join full_join anti_join
+#' @return An augmented data frame that explains potential problems and 
+#' possible solutions.
+#' @importFrom dplyr mutate filter rename mutate_if case_when distinct
+#' @importFrom dplyr left_join full_join anti_join add_count semi_join
 #' @examples
 #'  \dontrun{
 #'    dat <- eurostat::tgs00026
@@ -20,7 +20,7 @@ harmonize_geo_code <- function (dat) {
   
   ## For non-standard evaluation -------------------------------------
   . <- change  <- geo <- code13 <- code16 <- nuts_level <- NULL
-  country_code <- NULL
+  country_code <- n <- values <- time <- name <- resolution <- NULL
   
   dat <- mutate_if ( dat, is.factor, as.character)
   
diff --git a/R/recode_to_nuts_2013.R b/R/recode_to_nuts_2013.R
index a443f70c..becb4400 100644
--- a/R/recode_to_nuts_2013.R
+++ b/R/recode_to_nuts_2013.R
@@ -31,9 +31,7 @@
 #'               "Recoded in NUTS2016"
 #'   )) 
 #'  
-#'  test_regional_codes %>%
-#'   harmonize_geo_code () %>%
-#'    recode_to_nuts_2013()
+#' recode_to_nuts_2013(test_regional_codes)
 #' @export
  
 recode_to_nuts_2013 <- function (dat) {
@@ -42,7 +40,7 @@ recode_to_nuts_2013 <- function (dat) {
   type  <- NULL
 
   regional_changes_2016 <- load_package_data(dataset = "regional_changes_2016")
-  nuts_correspondence <- load_package_data(dataset = "nuts_correspondence")
+  nuts_correspondence   <- load_package_data(dataset = "nuts_correspondence")
 
   if ( ! all(c("change", "code16", "code13") %in% names (dat)) ) {
     tmp <- harmonize_geo_code(dat)
diff --git a/R/recode_to_nuts_2016.R b/R/recode_to_nuts_2016.R
index db35cc19..8b504742 100644
--- a/R/recode_to_nuts_2016.R
+++ b/R/recode_to_nuts_2016.R
@@ -31,9 +31,7 @@
 #'               "Recoded in NUTS2016"
 #'   )) 
 #'  
-#'  test_regional_codes %>%
-#'   harmonize_geo_code () %>%
-#'    recode_to_nuts_2016()
+#' recode_to_nuts_2016(test_regional_codes)
 #' @export
  
 recode_to_nuts_2016 <- function (dat) {
@@ -42,7 +40,7 @@ recode_to_nuts_2016 <- function (dat) {
   type <- NULL
 
   regional_changes_2016 <- load_package_data(dataset = "regional_changes_2016")
-  nuts_correspondence <- load_package_data(dataset = "nuts_correspondence")
+  nuts_correspondence   <- load_package_data(dataset = "nuts_correspondence")
 
   if ( ! all(c("change", "code16", "code13") %in% names (dat)) ) {
     tmp <- harmonize_geo_code(dat)
diff --git a/man/harmonize_geo_code.Rd b/man/harmonize_geo_code.Rd
index a7a349d7..c97756f1 100644
--- a/man/harmonize_geo_code.Rd
+++ b/man/harmonize_geo_code.Rd
@@ -10,8 +10,8 @@ harmonize_geo_code(dat)
 \item{dat}{A Eurostat data frame downloaded with \code{\link{get_eurostat}}}
 }
 \value{
-An augmented data frame or a message about potential coding
-errors.
+An augmented data frame that explains potential problems and 
+possible solutions.
 }
 \description{
 Eurostat mixes \code{NUTS2013} and \code{NUTS2016} geographic
diff --git a/man/recode_to_nuts_2013.Rd b/man/recode_to_nuts_2013.Rd
index 42e43e86..9561ecef 100644
--- a/man/recode_to_nuts_2013.Rd
+++ b/man/recode_to_nuts_2013.Rd
@@ -41,9 +41,7 @@ test_regional_codes <- data.frame (
               "Recoded in NUTS2016"
   )) 
  
- test_regional_codes \%>\%
-  harmonize_geo_code () \%>\%
-   recode_to_nuts_2013()
+recode_to_nuts_2013(test_regional_codes)
 }
 \author{
 Daniel Antal
diff --git a/man/recode_to_nuts_2016.Rd b/man/recode_to_nuts_2016.Rd
index 879c89f9..d4fd0364 100644
--- a/man/recode_to_nuts_2016.Rd
+++ b/man/recode_to_nuts_2016.Rd
@@ -41,9 +41,7 @@ test_regional_codes <- data.frame (
               "Recoded in NUTS2016"
   )) 
  
- test_regional_codes \%>\%
-  harmonize_geo_code () \%>\%
-   recode_to_nuts_2016()
+recode_to_nuts_2016(test_regional_codes)
 }
 \author{
 Daniel Antal

From 817ac4782765dfe8b8a9b31c5d7fb5760089bf96 Mon Sep 17 00:00:00 2001
From: Daniel Antal <antaldaniel@users.noreply.github.com>
Date: Sat, 8 Feb 2020 22:53:23 +0100
Subject: [PATCH 09/11] internal test amended

---
 R/harmonize_geo_code.R | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/R/harmonize_geo_code.R b/R/harmonize_geo_code.R
index 327b44a3..cb350324 100644
--- a/R/harmonize_geo_code.R
+++ b/R/harmonize_geo_code.R
@@ -84,14 +84,7 @@ harmonize_geo_code <- function (dat) {
   nuts_2016_codes <- unique (all_regions_full_metadata$code16)#[!is.na(all_regions_full_metadata$code16)]
   nuts_2013_codes <- nuts_2013_codes[!is.na(nuts_2013_codes)]
   nuts_2016_codes <- nuts_2016_codes[!is.na(nuts_2016_codes)]
-  
-  "PL2" %in% all_regions_full_metadata$code13
-  "PL2" %in% unique ( all_regions_full_metadata$code13)
-  "UKN01" %in% nuts_2013_codes
-  "UKN01" %in% nuts_2016_codes
-  
-  any ( is.na(nuts_2013_codes))
-  
+
   tmp_by_code16 <- dat %>%
     mutate ( geo = as.character(geo)) %>%
     filter ( geo %in% all_regions_full_metadata$code16 ) %>%
@@ -120,7 +113,7 @@ harmonize_geo_code <- function (dat) {
     semi_join (  tmp_by_code13, 
                  by = names ( tmp_by_code13)) # found in both (unchanged and relabelled)
  
-  if (! all(tmp_s$nuts_2013 && tmp_s$nuts_2016)) { stop ("Wrong selection of unchanged regions.") }
+  if (! all(tmp_s$nuts_2013 & tmp_s$nuts_2016)) { stop ("Wrong selection of unchanged regions.") }
   
   
   tmp_s2 <- tmp_by_code13 %>%
@@ -130,14 +123,16 @@ harmonize_geo_code <- function (dat) {
   
   tmp_a1 <- tmp_by_code16 %>%
     anti_join (  tmp_by_code13, 
-                 by = names(tmp_by_code13)) # not found in code13 (new regions)
-  if ( ! all(tmp_a2$nuts_2013)) { stop ("Wrong selection of NUTS2013-only regions.") }
+                 by = names(tmp_by_code13)
+                 ) # not found in code13 (new regions)
+  if ( any(tmp_a1$nuts_2013) ) { stop ("Wrong selection of NUTS2013-only regions.") }
   
   
   tmp_a2 <- tmp_by_code13 %>%
     anti_join (  tmp_by_code16, 
-                 by = names(tmp_by_code13)) # not found in code16 (changes)
-  if ( ! all(tmp_a2$nuts_2013)) { stop ("Wrong selection of NUTS2013-only regions.") }
+                 by = names(tmp_by_code13)
+                 ) # not found in code16 (changes)
+  if ( any(tmp_a2$nuts_2016) ) { stop ("Wrong selection of NUTS2013-only regions.") }
   
   tmp <- rbind ( tmp_s, tmp_a1, tmp_a2 )
   

From a665200a5ea444f0310d74ff98e79d15187ba48f Mon Sep 17 00:00:00 2001
From: Daniel Antal <antaldaniel@users.noreply.github.com>
Date: Sat, 8 Feb 2020 22:54:02 +0100
Subject: [PATCH 10/11] remove space

---
 R/harmonize_geo_code.R | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/R/harmonize_geo_code.R b/R/harmonize_geo_code.R
index cb350324..ab5179fd 100644
--- a/R/harmonize_geo_code.R
+++ b/R/harmonize_geo_code.R
@@ -126,8 +126,7 @@ harmonize_geo_code <- function (dat) {
                  by = names(tmp_by_code13)
                  ) # not found in code13 (new regions)
   if ( any(tmp_a1$nuts_2013) ) { stop ("Wrong selection of NUTS2013-only regions.") }
-  
-  
+ 
   tmp_a2 <- tmp_by_code13 %>%
     anti_join (  tmp_by_code16, 
                  by = names(tmp_by_code13)

From 1a81f5f813ede73ef02a54d3d383add3e0a3bf22 Mon Sep 17 00:00:00 2001
From: Daniel Antal <antaldaniel@users.noreply.github.com>
Date: Sat, 8 Feb 2020 22:55:57 +0100
Subject: [PATCH 11/11] importFrom

---
 NAMESPACE | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/NAMESPACE b/NAMESPACE
index 7c2af4d8..298d2009 100755
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -31,9 +31,11 @@ importFrom(classInt,classIntervals)
 importFrom(countrycode,countrycode)
 importFrom(curl,curl_download)
 importFrom(dplyr,"%>%")
+importFrom(dplyr,add_count)
 importFrom(dplyr,anti_join)
 importFrom(dplyr,arrange)
 importFrom(dplyr,case_when)
+importFrom(dplyr,distinct)
 importFrom(dplyr,filter)
 importFrom(dplyr,full_join)
 importFrom(dplyr,inner_join)
@@ -42,6 +44,7 @@ importFrom(dplyr,mutate)
 importFrom(dplyr,mutate_if)
 importFrom(dplyr,rename)
 importFrom(dplyr,select)
+importFrom(dplyr,semi_join)
 importFrom(httr,GET)
 importFrom(httr,build_url)
 importFrom(httr,content)