bulk_data.R · 135 lines (96 loc) · 4.84 KB
library(rvest)     # HTML scraping
library(vroom)     # fast csv reading
library(stringr)   # string handling
library(tidyverse) # dplyr, tibble, readr, etc.
library(magrittr)  # %<>% assignment pipe
library(arrow)     # parquet output
# Read the HTML page
html_page <- read_html("https://www.nomisweb.co.uk/sources/census_2021_bulk")
# Get the census table zip file links, excluding the "extra" zip files
zip_urls <- html_page %>%
  html_nodes("a[href$='.zip']") %>%
  html_attr("href") %>%
  str_subset(pattern = "^(?!.*extra\\.zip)") # negative lookahead drops the *extra.zip files
# Turn the relative zip paths into full URLs
zip_urls <- paste0("https://www.nomisweb.co.uk", zip_urls)
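# Each entry is now a full download link of the form
# "https://www.nomisweb.co.uk/output/census/2021/census2021-tsXXX.zip"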
# Create an empty tibble with the following column names
meta_data_table <- tibble(
  Table_Name    = character(),
  Variable_Name = character(),
  Type          = character(),
  new_names     = character(),
  Table_ID      = character()
)
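# The loop below appends one row per original census column: the raw nomis
# heading (old_names), its new short code (new_names) and the table it came
# from (Table_ID); Table_Name, Variable_Name and Type are derived from
# old_names at the end of the script.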
# These tables don't have Output Area (OA) data
no_oa_tables <- c(
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts007.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts009.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts010.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts012.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts013.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts071.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts072.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts073.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts074.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts022.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts024.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts028.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts031.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts076.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts060.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts064.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts047.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts048.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts079.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts070.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts077.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts078.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts037asp.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts038asp.zip",
  "https://www.nomisweb.co.uk/output/census/2021/census2021-ts039asp.zip"
)
# Create output directories for the census tables
dir.create("./output_data/csv",recursive = TRUE)
dir.create("./output_data/parquet",recursive = TRUE)
zip_urls <- result <- setdiff(zip_urls, no_oa_tables) # Remove the tables without OA
for (url in zip_urls){
dir.create("./tmp",recursive = TRUE)#create a temporary directory for unzipping
f <- curl::curl_download(url, tempfile(fileext = ".zip")) # Download the specified zip file
unzip(f, exdir="./tmp") # Unzip
t_tab_loc <- list.files("./tmp", pattern=".*-oa.csv") # Extract the OA csv location
t_name <- unlist(str_split(t_tab_loc,"-"))[2] # Extract the table name
assign(t_name,vroom(paste0("./tmp/",t_tab_loc),show_col_types = FALSE) %>%
select(-date,-geography) %>%
column_to_rownames("geography code")) #Move OA code to row names
old_names <- colnames(get(t_name)) # Get the column names
  new_names <- paste0(t_name, sprintf("%04d", seq_along(old_names))) # New column names with zero padding
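  # For example, the first column of a table with ID "ts008" would become
  # "ts0080001", the second "ts0080002", and so on (table ID for illustration only).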
  # Create a list of the old and new names, plus the table ID
  N_list <- list(
    old_names = old_names,
    new_names = new_names,
    Table_ID  = t_name
  )
  # Convert to a tibble (one row per column of this table)
  N_list <- as_tibble(N_list)
  # Append to the running metadata table
  meta_data_table %<>%
    bind_rows(N_list)
  # Rename the data frame's columns to the new short codes
  env <- environment() # Have to be explicit about the env, as %>% uses a temp environment
  get(t_name) %>%
    rename_at(vars(all_of(old_names)), ~new_names) %>%
    rownames_to_column("OA") %>%
    assign(t_name, ., envir = env) # Assign back into this environment
  # Write csv and parquet copies to the output folders
  write_parquet(get(t_name), paste0("./output_data/parquet/", t_name, ".parquet"))
  write_csv(get(t_name), paste0("./output_data/csv/", t_name, ".csv"))
  # Remove tmp objects
  rm(N_list, old_names, new_names, t_name, t_tab_loc)
  # Remove all downloaded files for this table
  unlink("./tmp", recursive = TRUE)
}
# Format the lookup table
meta_data_table2 <- meta_data_table %>%
  mutate(Table_Name = str_split_fixed(old_names, ":", 2)[, 1]) %>% # Table name: text before the first colon
  mutate(Type = str_replace_all(str_extract(old_names, "; measures: \\w+"), "; measures: ", "")) %>% # Measure type
  mutate(Variable_Name = str_replace_all(old_names, ";.*", "")) %>% # Drop everything from the first semicolon
  mutate(Variable_Name = str_replace_all(Variable_Name, paste0(Table_Name, ": "), "")) # Strip the table-name prefix
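# Worked example (illustrative, the exact heading text is assumed): a raw nomis
# heading such as "Age: Aged 4 years and under; measures: Value" would become
# Table_Name = "Age", Variable_Name = "Aged 4 years and under", Type = "Value".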
write_csv(meta_data_table2, "Table_Metadata.csv")
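
# --- Optional usage sketch (not run) -----------------------------------------
# A minimal illustration of how the outputs written above might be read back
# downstream. The table ID "ts008" and the column choices are assumptions for
# the example only; any table produced by the loop works the same way.
if (FALSE) {
  ts008 <- read_parquet("./output_data/parquet/ts008.parquet")
  lookup <- read_csv("Table_Metadata.csv", show_col_types = FALSE)
  # Translate the zero-padded column codes back to descriptive names
  lookup %>%
    filter(Table_ID == "ts008") %>%
    select(new_names, Variable_Name, Type)
}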