-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetStats.py
89 lines (72 loc) · 3.57 KB
/
getStats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import pandas as pd
import shapefile
import configparser
import psycopg2
import os
class GetStats(object):
    """Compare per-boundary business counts from the database with the CSV counts.

    The constructor loads two boundary shapefiles (CMAs and DAs), opens a
    PostgreSQL connection using the ``credentials`` section of an INI file and
    reads a CSV table of counts whose rows are indexed by the first column and
    whose columns are looked up by DA identifier.

    Attributes:
        cmas: dict mapping each CMAUID found in the CMA shapefile to the set
            of DAUIDs whose DA record carries that CMAUID.
        das: list of every DAUID found in the DA shapefile, in file order.
        conn: open psycopg2 connection; release it with :meth:`close`.
        bcStatcan: DataFrame read from the CSV (integer counts).
    """

    def __init__(
        self,
        cmasPath="resources/boundaries/cmas4326/cmas4326.shp",
        dasPath="resources/boundaries/das4326/das4326.shp",
        configPath="config/config.ini",
        statcanPath="data/statcan/bc.csv",
    ):
        """Load boundaries, connect to the database and read the CSV counts.

        Args:
            cmasPath: path of the CMA shapefile (records expose ``CMAUID``).
            dasPath: path of the DA shapefile (records expose ``DAUID`` and
                ``CMAUID``).
            configPath: INI file with a ``credentials`` section providing
                ``host``, ``database``, ``user`` and ``password``.
            statcanPath: CSV file of integer counts, first column as index.

        Raises:
            FileNotFoundError: if ``configPath`` cannot be read.
        """
        # Readers are used as context managers so the underlying shapefile
        # handles are released as soon as the records have been consumed.
        with shapefile.Reader(cmasPath) as cmasSf:
            self.cmas = {
                shapeRecord.record.CMAUID: set()
                for shapeRecord in cmasSf.shapeRecords()
            }

        # Single pass over the DA shapefile: collect every DAUID and attach it
        # to its CMA when that CMA is known.
        self.das = []
        with shapefile.Reader(dasPath) as dasSf:
            for shapeRecord in dasSf.shapeRecords():
                record = shapeRecord.record
                self.das.append(record.DAUID)
                if record.CMAUID in self.cmas:
                    self.cmas[record.CMAUID].add(record.DAUID)

        config = configparser.ConfigParser()
        # ConfigParser.read() silently skips unreadable files; fail loudly
        # here instead of with an opaque KeyError on "credentials" below.
        if not config.read(configPath):
            raise FileNotFoundError(
                "Could not read config file: {0}".format(configPath)
            )
        credentials = config["credentials"]
        self.conn = psycopg2.connect(
            host=credentials["host"],
            database=credentials["database"],
            user=credentials["user"],
            password=credentials["password"],
        )
        self.bcStatcan = pd.read_csv(statcanPath, index_col=0, dtype=int)

    def close(self):
        """Close the database connection opened by the constructor."""
        self.conn.close()

    @staticmethod
    def _resolveBoundary(cma, da):
        """Return ``(boundaryType, boundary)`` for the selector that was given.

        ``cma`` takes precedence over ``da`` when both are truthy.

        Raises:
            ValueError: if neither ``cma`` nor ``da`` is provided.
        """
        if cma:
            return "cma", cma
        if da:
            return "da", da
        raise ValueError("Either cma or da must be provided")

    def getBcOsm(self, cma=None, da=None):
        """Count businesses per ``naicsCan`` code inside one boundary.

        Args:
            cma: CMA identifier to filter on (takes precedence over ``da``).
            da: DA identifier to filter on.

        Returns:
            DataFrame indexed by ``naics`` with a single ``countOsm`` column;
            rows whose ``naicsCan`` is -1 are excluded by the query.

        Raises:
            ValueError: if neither ``cma`` nor ``da`` is provided.
        """
        column, value = self._resolveBoundary(cma, da)
        # NOTE(review): the identifier is interpolated unquoted into the SQL
        # text, so it must be a trusted numeric id (here it comes from the
        # boundary shapefiles). Switch to driver-side parameters before
        # passing anything user-supplied.
        sql = (
            'select "naicsCan" as naics, count(*) as "countOsm" from businesses '
            'where {0} = {1} and "naicsCan" != -1 '
            'group by "naicsCan"'
        ).format(column, value)
        return pd.read_sql(sql, con=self.conn, index_col="naics")

    def getBcStatcan(self, cma=None, da=None):
        """Return the CSV counts for one boundary.

        Args:
            cma: CMA identifier; the counts of all of its DAs that appear as
                CSV columns are summed row-wise.
            da: DA identifier; its CSV column is returned as-is.

        Returns:
            A Series indexed like ``bcStatcan``, or an empty DataFrame when
            ``da`` is not a column of the CSV.

        Raises:
            ValueError: if neither ``cma`` nor ``da`` is provided.
        """
        boundaryType, boundary = self._resolveBoundary(cma, da)
        if boundaryType == "cma":
            daColumns = self.bcStatcan.columns.intersection(self.cmas[boundary])
            return self.bcStatcan[daColumns].sum(axis=1)
        if boundary in self.bcStatcan:
            return self.bcStatcan[boundary]
        return pd.DataFrame()

    def getStatsLocal(self, cma=None, da=None):
        """Build the comparison table for a single boundary.

        For every code returned by :meth:`getBcOsm`, the matching CSV count is
        the sum of all rows of :meth:`getBcStatcan` whose index, written as a
        string, is exactly six digits long and starts with that code.

        Args:
            cma: CMA identifier (takes precedence over ``da``).
            da: DA identifier.

        Returns:
            DataFrame indexed by ``(boundary, naics)`` with the columns
            ``countOsm``, ``countStatcan``, ``osmCompleteness``
            (``countOsm / countStatcan``) and ``error`` (absolute difference
            divided by ``countStatcan``); a zero ``countStatcan`` therefore
            yields inf/NaN ratios. Returns ``None`` when either source has no
            data for the boundary.

        Raises:
            ValueError: if neither ``cma`` nor ``da`` is provided.
        """
        boundaryType, boundary = self._resolveBoundary(cma, da)
        selector = {boundaryType: boundary}
        bcOsmLocal = self.getBcOsm(**selector)
        bcStatcanLocal = self.getBcStatcan(**selector)
        if bcOsmLocal.empty or bcStatcanLocal.empty:
            return None

        # Stringify the CSV codes once instead of once per row, and use plain
        # prefix matching rather than a per-row regular expression (the old
        # pattern had a capture group, which makes pandas warn on every call).
        statcanCodes = bcStatcanLocal.index.astype(str)
        isFullCode = (statcanCodes.str.len() == 6) & statcanCodes.str.isdigit()

        def countStatcan(naics):
            matches = isFullCode & statcanCodes.str.startswith(str(naics))
            return bcStatcanLocal[matches].sum()

        statsLocal = bcOsmLocal
        statsLocal["countStatcan"] = [
            countStatcan(naics) for naics in statsLocal.index
        ]
        statsLocal["osmCompleteness"] = (
            statsLocal["countOsm"] / statsLocal["countStatcan"]
        )
        statsLocal["error"] = (
            (statsLocal["countStatcan"] - statsLocal["countOsm"]).abs()
            / statsLocal["countStatcan"]
        )
        statsLocal.index = pd.MultiIndex.from_arrays(
            [[boundary] * len(statsLocal), statsLocal.index],
            names=[boundaryType, "naics"],
        )
        return statsLocal

    def getStats(self, type):
        """Concatenate the per-boundary tables for every CMA or every DA.

        Args:
            type: ``"cma"`` to iterate over ``self.cmas`` or ``"da"`` to
                iterate over ``self.das``.

        Returns:
            The concatenation of every non-empty :meth:`getStatsLocal` result,
            or an empty DataFrame when no boundary produced data.

        Raises:
            ValueError: if ``type`` is neither ``"cma"`` nor ``"da"``.
        """
        if type == "cma":
            stats = [self.getStatsLocal(cma=cma) for cma in self.cmas]
        elif type == "da":
            stats = [self.getStatsLocal(da=da) for da in self.das]
        else:
            raise ValueError(
                'type must be "cma" or "da", got {0!r}'.format(type)
            )
        stats = [table for table in stats if table is not None]
        # pd.concat() raises on an empty list; an empty frame is a clearer
        # "no data" result for callers.
        if not stats:
            return pd.DataFrame()
        return pd.concat(stats)
def main():
    """Compute the CMA- and DA-level statistics and write them to CSV files.

    Writes ``stats/statsCma.csv`` and ``stats/statsDa.csv``, creating the
    ``stats`` directory when it does not exist yet. The database connection
    opened by ``GetStats`` is closed even if a step fails.
    """
    statsBuilder = GetStats()
    try:
        # to_csv() does not create missing parent directories.
        os.makedirs("stats", exist_ok=True)
        statsCma = statsBuilder.getStats("cma")
        statsCma.to_csv("stats/statsCma.csv")
        statsDa = statsBuilder.getStats("da")
        statsDa.to_csv("stats/statsDa.csv")
    finally:
        statsBuilder.conn.close()


if __name__ == "__main__":
    main()