main.py
#!/usr/bin/env python
# coding: utf-8
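
# Converts the open data dumps of the Czech Chamber of Deputies (psp.cz) from
# pipe-delimited UNL exports into per-table CSV files, following the column
# definitions in the accompanying mapping.json.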
import json
import csv
import os
import zipfile
from contextlib import contextmanager
from functools import lru_cache
from io import BytesIO, TextIOWrapper
from datetime import datetime

import requests


@lru_cache(maxsize=None)
def dl(url):
    # fetch a URL once and keep the raw bytes in memory; repeated calls for the
    # same archive are served from the cache
    r = requests.get(url)
    assert r.ok, r.status_code
    return BytesIO(r.content)


@contextmanager
def read_compressed(zipname, filename):
    # open a single file from one of the remote zip archives as cp1250 text
    burl = 'http://www.psp.cz/eknih/cdrom/opendata/{}'
    with zipfile.ZipFile(dl(burl.format(zipname))) as zf:
        # tisky.unl has encoding errors, hence errors='ignore'
        yield TextIOWrapper(zf.open(filename), 'cp1250', errors='ignore')


def read_compressed_csv(zf, fn, mp):
    # stream one pipe-delimited UNL file as dicts, converting date/datetime
    # columns according to the column types given in the mapping
    datetypes = {'date', 'datetime(year to hour)', 'datum', 'datetime(year to minute)',
                 'datetime(year to second, fraction)', 'datetime year to hour',
                 'datetime year to minute', 'datetime year to day', 'datetime(year to second)'}
    cols = [j['sloupec'] for j in mp]
    types = {j['sloupec']: j['typ'] for j in mp}
    with read_compressed(zf, fn) as f:
        cr = csv.reader(f, delimiter='|')
        for el in cr:
            # UNL files have one extra column
            # TODO: enable this once the schema of the dataset gets fixed
            # assert len(el) == len(cols) + 1, (el, cols)
            dt = {}
            for k, v in zip(cols, el):
                if v.strip() == '':
                    dt[k] = None
                elif types[k] in datetypes:
                    # several date/datetime layouts occur; dispatch on the value length
                    lv = len(v)
                    if lv == 10:
                        if '-' in v:
                            dt[k] = datetime.strptime(v, '%Y-%m-%d')
                        elif '.' in v:
                            dt[k] = datetime.strptime(v, '%d.%m.%Y')
                        else:
                            raise ValueError(v)
                    elif lv == 13:
                        dt[k] = datetime.strptime(v, '%Y-%m-%d %H')
                    elif lv == 16:
                        dt[k] = datetime.strptime(v, '%Y-%m-%d %H:%M')
                    elif lv == 19:
                        # 2013-11-27 14:06:11
                        dt[k] = datetime.strptime(v, '%Y-%m-%d %H:%M:%S')
                    elif lv == 25:
                        # 1999-01-12 14:14:41.35000
                        dt[k] = datetime.strptime(v, '%Y-%m-%d %H:%M:%S.%f')
                    else:
                        raise ValueError(v)
                else:
                    dt[k] = v
            yield dt
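

# A sketch of the structure mapping.json is assumed to have, inferred from how
# it is read below; the placeholder values are illustrative only, not taken
# from the real file:
# [
#   {
#     "tema": "...",                            # topic, used as the CSV name prefix
#     "tabulka": "...",                         # table name
#     "soubory": ["<archive.zip>/<file.unl>"],  # files to download and convert
#     "sloupce": [{"sloupec": "...", "typ": "..."}]  # column names and types
#   },
#   ...
# ]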

# it's short, so `partial` isn't implemented
def main(outdir: str, partial: bool = False):
    cdir = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(cdir, 'mapping.json')) as f:
        mapping = json.load(f)
    for mp in mapping:
        tbl = f'{mp["tema"]}_{mp["tabulka"]}'
        tfn = os.path.join(outdir, f'{tbl}.csv')
        print(tbl)
        cols = [j['sloupec'] for j in mp['sloupce']]
        with open(tfn, 'w') as fw:
            cw = csv.DictWriter(fw, fieldnames=cols)
            cw.writeheader()
            for ffn in mp['soubory']:
                # TODO: we can't do partial for now, because foreign keys would break
                # if partial and ffn not in mp['soubory'][-2:]:
                #     continue
                print('\t', ffn)
                zf, fn = ffn.split('/')
                for el in read_compressed_csv(zf, fn, mp['sloupce']):
                    cw.writerow(el)


if __name__ == "__main__":
    main(".")
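
# Usage sketch (not part of the original script; the output path is illustrative):
#   python main.py             # writes one <tema>_<tabulka>.csv per table into
#                              # the current directory
# or, from another script:
#   from main import main
#   main('/tmp/psp-data')      # any existing directory can serve as the target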