data_to_tpl.py
""" ALP course CSV --> TPL converter
    github.com/asahala """

import os
import csv

# Content-word parts of speech to keep; all other tokens are masked
POS_FILTER = {'V', 'AJ', 'N', 'AV'}

# Corpus folders to batch process
DATA_PATHS = ('data/rinap01', 'data/rinap05')


def read_csv(filename):
    """ Yield one TPL token (cf[sense]pos) per CSV data row """
    with open(filename, 'r', encoding='utf-8') as f:
        for e, line in enumerate(csv.reader(f, delimiter=',')):
            if e == 0:
                # Header row: map column names to their indices
                keys = {name: index for index, name in enumerate(line)}
                continue
            pos = line[keys['pos']]
            sense = line[keys['sense']].replace(' ', '_')
            cf = line[keys['cf']]
            # Keep content words in POS_FILTER and two-letter name tags
            # ending in N (e.g. PN, DN, GN); mask everything else
            if (len(pos) == 2 and pos.endswith('N')) or pos in POS_FILTER:
                yield f'{cf}[{sense}]{pos}'
            else:
                yield '_'
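
# Illustration only (not part of the original script): assuming a CSV export
# with at least the columns 'cf', 'pos' and 'sense' (column order may differ),
# a hypothetical row like
#
#     cf,pos,sense
#     šarru,N,king
#
# is yielded by read_csv() as the TPL token 'šarru[king]N', whereas a row whose
# pos is neither in POS_FILTER nor a two-letter tag ending in N comes out as
# the placeholder '_'.
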
""" Batch process all CSV files in DATA_PATHS and write to dataset.txt """
lemmalist = set()
with open('dataset_nostops.txt', 'w', encoding='utf-8') as o:
for path in DATA_PATHS:
for file in (x for x in os.listdir(path) if x.endswith('.csv')):
print(f'processing {file}...')
for word in read_csv(os.path.join(path, file)):
o.write(word + ' ')
lemmalist.add(word)
o.write('\n')
with open('lemmalist.txt', 'w', encoding='utf-8') as o:
for lemma in sorted(lemmalist):
o.write(lemma + '\n')
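
# Usage note (sketch, not part of the original script): running this file as
#
#     python data_to_tpl.py
#
# from the repository root assumes the folders data/rinap01 and data/rinap05
# exist and contain the ALP course CSV exports. The script then writes two
# files next to itself: dataset_nostops.txt (one space-separated TPL text per
# line) and lemmalist.txt (the sorted set of unique tokens, one per line).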