-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
229 lines (179 loc) · 7.69 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# Author: Sam Lehmann
# Network with him at: https://www.linkedin.com/in/samuellehmann/
# Date: 2023-01-19
import os
import pickle
import shutil
import time
import urllib.request
import zipfile
import pandas as pd
from anytree import Node
import census
import interface
TREE_SEPARATOR = "⤚"
_NODE_FILENAME = "nodes_list.pickle"
_ZIP_FILENAME = "download.zip"
_TEMP_LOC = '\\temp'
def download_csv(url, keep_file, filename, remove_first_line=False):
    """
    Downloads a zipped CSV file from Statistics Canada and extracts the file of interest.

    :param url: The URL of the zip archive to download
    :param keep_file: The file that should be kept from the zip file
    :param filename: The final filename that the csv should be saved as
    :param remove_first_line: Should the first line of the CSV be removed? Some CSVs have an additional header text
    :return: None
    """
    # Create a temporary directory (paths are Windows-style throughout this file)
    loc = os.getcwd() + _TEMP_LOC + "\\"
    os.mkdir(loc)
    try:
        # Download the file as a zip file and extract it
        print(f"Start File Download At This URL: {url}")
        urllib.request.urlretrieve(url, loc + _ZIP_FILENAME)
        print("Download Complete")
        with zipfile.ZipFile(loc + _ZIP_FILENAME, 'r') as zip_ref:
            zip_ref.extractall(loc)
        # Rename/move the file of interest out of the temporary directory
        os.rename(loc + keep_file, os.getcwd() + "\\" + filename)
    finally:
        # Bug fix: always remove the temporary directory, even when the download or
        # extraction fails — otherwise the next run's os.mkdir() raises FileExistsError
        shutil.rmtree(loc)
    # Sometimes the first line has to be removed due to additional header text
    if remove_first_line:
        with open(filename, 'r') as fin:
            data = fin.read().splitlines(True)
        with open(filename, 'w') as fout:
            fout.writelines(data[1:])
def save_csv_parquet(cen):
    """
    Reads a census CSV and caches it as a parquet file, which is much quicker to load
    on subsequent runs.

    :param cen: A census object providing the filename_csv and filename_par paths
    :return: The CSV contents as a dataframe
    """
    # latin-1 matches the Statistics Canada file encoding; everything is read as
    # strings to avoid lossy type inference on mixed columns
    frame = pd.read_csv(cen.filename_csv, encoding="latin-1", dtype="str")
    frame.to_parquet(cen.filename_par, compression=None)
    return frame
def build_geographical_tree(geo_df):
    """
    Builds a tree of all the geographic regions, nested into provinces, censuses, and census sub-divisions

    :param geo_df: A dataframe of the geo_data.csv file provided by statistics canada
    :return: The root "Canada" node of the geographical tree
    :raises ValueError: If a standard geographical code has an unexpected length
    """
    canada = Node("Canada")
    # Standard geographical code
    # Ref https://www12.statcan.gc.ca/census-recensement/2021/ref/dict/az/definition-eng.cfm?ID=geo044
    geo_df["SGC"] = geo_df["Geo Code"].str[9:]
    prior_province = None
    prior_census = None
    for _, geo in geo_df.iterrows():
        sgc_length = len(geo["SGC"])
        if sgc_length == 2:
            # This is a province
            prior_province = Node(geo["Geo Name"], canada)
        elif sgc_length == 4:
            # This is a census, nested under the most recently seen province
            prior_census = Node(geo["Geo Name"], prior_province)
        elif sgc_length == 7:
            # This is a census subdivison, nested under the most recently seen census
            Node(geo["Geo Name"], prior_census)
        else:
            # ValueError is more specific than a bare Exception and still caught by
            # any caller handling Exception
            raise ValueError("Unexpected geographical code length")
    # Bug fix: the tree was previously built and then discarded; return the root so
    # callers can actually use it
    return canada
def build_characteristic_tree(characteristic_list, leading_spaces=2):
    """
    Builds a tree of all the options in the characteristic tree. Items are indented in the tree based on their
    leading whitespace, with a specified number of spaces per indentation. Each node name is prefixed with a
    separator-delimited numbering path (e.g. "2⤚1⤚3⤚") recording its position among its siblings.
    :param characteristic_list: A numpy array of strings, where leading spaces indicate their levels of indentation.
    :param leading_spaces: The number of spaces per indentation.
    For example:
    Vehicles
      Cars
        Mercedes
        Rolls-Royce
      Planes
        Airbus
    :return:A node tree with names including numbering
    """
    prior_whitespace = 0
    start_node = Node("Characteristic Types")
    parent = start_node
    prior = start_node
    "All prefixes have a trailing separator"
    prior_prefix = "0" + TREE_SEPARATOR
    total = len(characteristic_list)
    # Loop through all characteristics and build a tree
    for i, characteristic in enumerate(characteristic_list):
        print(f"Progress Building Characteristic Tree: {i} of {total}")
        # Replace non-breaking spaces so the indentation count below sees plain spaces
        characteristic = characteristic.replace(u'\xa0', u' ')
        # Indentation level = number of leading spaces divided by spaces-per-level
        white_space_chars = int((len(characteristic) - len(characteristic.lstrip())) / leading_spaces)
        characteristic = characteristic.lstrip()
        if white_space_chars == 0:
            # Stems from the start node: increment the top-level counter and reset the prefix
            prior_prefix = str(int(prior_prefix.split(TREE_SEPARATOR)[0]) + 1) + TREE_SEPARATOR
            characteristic = prior_prefix + characteristic
            node = Node(characteristic, start_node)
        elif white_space_chars == prior_whitespace:
            # Stems from the same parent as the prior node: drop the last counter from the
            # prefix and re-append it incremented by one
            prior_prefix = TREE_SEPARATOR.join(prior_prefix.split(TREE_SEPARATOR)[0:-2]) + TREE_SEPARATOR + str(
                int(prior_prefix.split(TREE_SEPARATOR)[-2]) + 1) + TREE_SEPARATOR
            characteristic = prior_prefix + characteristic
            node = Node(characteristic, parent)
        elif white_space_chars > prior_whitespace:
            # Is a child of the prior node: descend one level by appending a "1" counter
            prior_prefix += "1" + TREE_SEPARATOR
            characteristic = prior_prefix + characteristic
            node = Node(characteristic, prior)
            parent = prior
        else:
            # Has less indentation than the prior node, decrement the tree
            # Loop through the number of decrements, stripping one counter from the prefix
            # and moving `prior` up one level per decrement
            for _ in range(int(prior_whitespace - white_space_chars)):
                prior_prefix = TREE_SEPARATOR.join(prior_prefix.split(TREE_SEPARATOR)[0:-2]) + TREE_SEPARATOR
                prior = prior.parent
            # Then increment the sibling counter at the level we landed on
            prior_prefix = TREE_SEPARATOR.join(prior_prefix.split(TREE_SEPARATOR)[0:-2]) + TREE_SEPARATOR + str(
                int(prior_prefix.split(TREE_SEPARATOR)[-2]) + 1) + TREE_SEPARATOR
            characteristic = prior_prefix + characteristic
            node = Node(characteristic, prior)
            parent = prior
        prior = node
        prior_whitespace = white_space_chars
    return start_node
def process_data():
    """
    Loads data from CSV files and builds a characteristic tree. This information is saved as a parquet and pickle respectively.
    :return: None
    """
    nodes = []
    for cen in census.censuses:
        print(cen.year)
        data_df = save_csv_parquet(cen)
        # Filter to a single region so each characteristic appears exactly once
        characteristic_list = data_df.where(data_df[cen.geo_col] == "Alberta")[
            cen.characteristic_col].dropna().to_numpy()
        nodes.append(build_characteristic_tree(characteristic_list, cen.leading_spaces))
    # Save the nodelist to a pickle. Bug fix: open in "wb" rather than "ab" — append
    # mode stacked a new pickle onto the file every run, while load_data() only ever
    # reads the first (stale) one. The context manager also guarantees the file closes.
    with open(_NODE_FILENAME, "wb") as file:
        pickle.dump(nodes, file)
def load_data():
    """
    Loads characteristic trees and dataframes from pickles and parquets respectively
    :return: None
    """
    # Fix a resource leak: the original opened the pickle file and never closed it;
    # a context manager guarantees the handle is released even if unpickling fails
    with open(_NODE_FILENAME, "rb") as file:
        nodes = pickle.load(file)
    # Attach the cached dataframe and characteristic tree to each census object
    for i, cen in enumerate(census.censuses):
        cen.set_data_df(pd.read_parquet(cen.filename_par))
        cen.set_char_tree(nodes[i])
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    print("Program Start")
    # Only download and process the raw CSVs when the parquet cache is absent
    if os.path.isfile(census.censuses[0].filename_par):
        print("Files already processed. No need to download files")
    else:
        # Download CSVs
        for cen in census.censuses:
            download_csv(cen.url, cen.filename_keep, cen.filename_csv, cen.delete_first_line)
            print(f"Finished download of {cen.year} census data")
        process_data()
    # Time the two loading stages so slow startups are easy to diagnose
    current_time = time.time()
    geo_df = pd.read_csv("GeoData.CSV", encoding="latin-1")
    print(f"Done loading geo data in {time.time() - current_time} seconds")
    current_time = time.time()
    load_data()
    print(f"Done loading data in {time.time() - current_time} seconds")
    interface.generate_interface()