-
Notifications
You must be signed in to change notification settings - Fork 11
/
convert_zaliznyak_to_exception_dictionary.py
166 lines (127 loc) · 6.12 KB
/
convert_zaliznyak_to_exception_dictionary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# OS: GNU/Linux, Author: Klim V. O.
'''
Unpacks the "Grammatical Dictionary" by A. A. Zaliznyak from a .zip archive, converts it to the format of the exception
dictionary and combines it with the current exception dictionary.
Zaliznyak's dictionary is taken from http://odict.ru/.
'''
import os
import sys
import time
import zipfile
import curses
import argparse
# Latin letters paired position-by-position with the Cyrillic letters they are replaced by
# (first string: Latin keys, second string: Cyrillic values).
SAME_LETTERS_EN_RU = dict(zip(
    'ABCEHKMOPTXaceopx',
    'АВСЕНКМОРТХасеорх',
))
def count_number_of_vowels(word: str) -> int:
    ''' Return how many characters of the word are Russian vowels (lower or upper case). '''
    russian_vowels = 'аеиоуэюяыёАЕИОУЭЮЯЫЁ'
    return sum(1 for letter in word if letter in russian_vowels)
def main():
    ''' Convert Zaliznyak's dictionary to the exception dictionary format and merge it with the current exception dictionary.

    Steps:
    1. Parse command line arguments (archive name, plain text dictionary name, current and resulting exception dictionaries).
    2. If an archive is used, unpack it next to itself and take its first member as the dictionary file.
    3. Convert every line "<word> <stress index> ..." to "<word with '+' inserted at the stress index>". Lines without
       vowels in the word, with several stress positions (',' or '.' in the index) or with a missing/non-integer
       stress index are skipped.
    4. For every entry of the current exception dictionary that differs from Zaliznyak's entry only by the stress position,
       replace Zaliznyak's entry with the current one.
    5. Save the sorted union of both dictionaries (without duplicates) and remove the unpacked file, if there was one.
    '''
    # Escape sequence that moves the cursor one line up (used to redraw the progress line in place). When the terminal
    # description is unavailable or has no such capability, fall back to plain line-by-line progress output.
    try:
        curses.setupterm()
        cursor_up = curses.tigetstr('cuu1') or b''
    except curses.error:
        cursor_up = b''

    f_name_zip_odict = 'odict.zip'
    f_name_odict = 'zaliznyak.txt'
    f_name_current_exception_dict = 'stressrnn/dicts/source_exception_dictionary.txt'
    f_name_new_exception_dict = 'stressrnn/dicts/exception_dictionary.txt'

    parser = argparse.ArgumentParser(description="Converting Zaliznyak's dictionary from http://odict.ru/ to the format of "
                                                 "exception dictionary and combining it with the current exception dictionary.")
    parser.add_argument('-iz', '--f_name_zip_odict', type=str, default=None,
                        help="Name of .zip archive with the Zaliznyak's dictionary")
    parser.add_argument('-i', '--f_name_odict', type=str, default=None,
                        help="Name of .txt file with the Zaliznyak's dictionary")
    parser.add_argument('-ic', '--f_name_current_exception_dict', type=str, default=None,
                        help="Name of .txt file with the current exception dictionary (it will be combined with the Zaliznyak's dictionary)")
    parser.add_argument('-o', '--f_name_new_exception_dict', type=str, default=None,
                        help="Name of .txt file to save the combined dictionary")
    args = parser.parse_args()

    # An explicitly given .txt file always wins over the archive; without any of the two arguments the default archive is used
    if args.f_name_odict:
        if args.f_name_zip_odict:
            print("[W] 'f_name_zip_odict' and 'f_name_odict' are set simultaneously — the value from 'f_name_odict' will be used.")
        f_name_zip_odict = None
        f_name_odict = args.f_name_odict
    elif args.f_name_zip_odict:
        f_name_zip_odict = args.f_name_zip_odict

    if args.f_name_current_exception_dict:
        f_name_current_exception_dict = args.f_name_current_exception_dict
    if args.f_name_new_exception_dict:
        f_name_new_exception_dict = args.f_name_new_exception_dict

    # Unpacking archive with the dictionary to the same folder, where the archive is located
    start_time = time.time()
    if f_name_zip_odict:
        print("[i] Unpacking '{}'...".format(f_name_zip_odict))
        extract_dir = os.path.dirname(f_name_zip_odict)
        with zipfile.ZipFile(f_name_zip_odict, 'r') as zip_odict:
            # An empty dirname means the archive is in the current working directory (path=None)
            zip_odict.extractall(extract_dir or None)
            # Member names are relative to the extraction folder, so the folder must be prepended to get a usable path
            f_name_odict = os.path.join(extract_dir, zip_odict.namelist()[0])

    print("[i] Loading Zaliznyak's dictionary from '{}'...".format(f_name_odict))
    # The file may start with a BOM ('\ufeff'), which is only visible as such when decoded as UTF-8,
    # so the encoding is set explicitly instead of relying on the locale
    with open(f_name_odict, 'r', encoding='utf-8') as f_odict:
        zaliznyak_dict = f_odict.readlines()
    if zaliznyak_dict:
        zaliznyak_dict[0] = zaliznyak_dict[0].replace('\ufeff', '')
    print('[i] Loaded {} values'.format(len(zaliznyak_dict)))

    print("[i] Converting Zaliznyak's dictionary to the format of exception dictionary...")
    latin_to_cyrillic = str.maketrans(SAME_LETTERS_EN_RU)
    converted_dict = []
    for line in zaliznyak_dict:
        fields = line.replace('\n', '').lower().split(' ')
        if not fields[0] or count_number_of_vowels(fields[0]) == 0:
            continue

        fields = [field for field in fields if field]
        if len(fields) < 2:
            # No stress index on the line — nothing to convert
            continue
        word, stress_index = fields[:2]

        # Several stress positions in one word are not supported by the exception dictionary format
        if ',' in stress_index or '.' in stress_index:
            continue
        try:
            stress_index = int(stress_index)
        except ValueError:
            continue

        if word.startswith('-'):
            word = word[1:]
        word = word.translate(latin_to_cyrillic)

        converted_dict.append(word[:stress_index] + '+' + word[stress_index:] + '\n')
    zaliznyak_dict = converted_dict
    print('[i] After converting, there are {} values left'.format(len(zaliznyak_dict)))

    print("[i] Loading current exception dictionary from '{}'...".format(f_name_current_exception_dict))
    with open(f_name_current_exception_dict, 'r', encoding='utf-8') as f_exception_dict:
        current_exception_dict = f_exception_dict.readlines()
    # Every entry must end with '\n' to be comparable with the converted entries; add it only when it is really missing,
    # otherwise the last entry would turn into a different string with an extra empty line
    if current_exception_dict and not current_exception_dict[-1].endswith('\n'):
        current_exception_dict[-1] += '\n'
    print('[i] Loaded {} values'.format(len(current_exception_dict)))

    number_of_current_values = len(current_exception_dict)
    print('[i] Combining dictionaries... 0 of {}'.format(number_of_current_values))

    # Lookup tables that replace linear searches in the lists:
    #   first_index_by_plain_word - index of the first Zaliznyak's entry for a word without the stress mark
    #   entry_counts - how many times each stressed entry is currently present in zaliznyak_dict
    first_index_by_plain_word = {}
    entry_counts = {}
    for index, entry in enumerate(zaliznyak_dict):
        first_index_by_plain_word.setdefault(entry.replace('+', ''), index)
        entry_counts[entry] = entry_counts.get(entry, 0) + 1

    for i, word in enumerate(current_exception_dict):
        if i % 1000 == 0 or i == number_of_current_values - 1:
            os.write(sys.stdout.fileno(), cursor_up)
            print('[i] Combining dictionaries... {} of {}'.format(i, number_of_current_values))

        if entry_counts.get(word, 0) > 0:
            continue
        index = first_index_by_plain_word.get(word.replace('+', ''))
        if index is None:
            continue

        # Same word, but different stress: the current exception dictionary takes precedence
        entry_counts[zaliznyak_dict[index]] -= 1
        entry_counts[word] = entry_counts.get(word, 0) + 1
        zaliznyak_dict[index] = word

    zaliznyak_dict = sorted(set(current_exception_dict + zaliznyak_dict))

    print("[i] Saving {} values in '{}'...".format(len(zaliznyak_dict), f_name_new_exception_dict))
    with open(f_name_new_exception_dict, 'w', encoding='utf-8') as f_new_exception_dict:
        f_new_exception_dict.writelines(zaliznyak_dict)

    # The unpacked dictionary is a temporary file, the archive itself is kept
    if f_name_zip_odict:
        os.remove(f_name_odict)

    print('[i] Done in {:.2f} second(-s)'.format(time.time()-start_time))
# Run the conversion only when this file is executed as a script, not when it is imported as a module
if __name__ == '__main__':
    main()