-
Notifications
You must be signed in to change notification settings - Fork 11
/
exception_dictionary_wrapper.py
111 lines (83 loc) · 4.92 KB
/
exception_dictionary_wrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# OS: GNU/Linux, Author: Klim V. O.
'''
Exception dictionary for correcting stress placement by a neural network.
Contains the 'ExceptionDictWrapper' class. Learn more in https://github.com/Desklop/StressRNN.
Dependencies: pymorphy2[fast]<=0.9.2
'''
import pymorphy2
try:
from .constants import DEF_STRESS_SYMBOL, ADD_STRESS_SYMBOL, F_NAME_EXCEPTION_DICT
except ImportError:
from constants import DEF_STRESS_SYMBOL, ADD_STRESS_SYMBOL, F_NAME_EXCEPTION_DICT
class ExceptionDictWrapper:
    ''' Exception dictionary for correcting stress placement. Contains methods:
    - is_in_dict(): checking if a word is in the dictionary
    - put_stress(): placing stress in a word in accordance with the dictionary

    The exception dictionary must contain a list of words with stresses placed in them using the "'" or '+' symbol after the vowel
    (1 line = 1 word).

    The dictionary looks like this:
        дре+гер
        ивано+в
        ...

    The main dictionary comes with the package and is located in 'stressrnn/exception_dictionary.txt'. You can also add your own
    dictionary, the values from which will complement the main dictionary (and overwrite the same words, but with different stresses).

    Internally the dictionary is stored in 'self.exception_dict' as pairs 'unstressed_word': [stress_index, ...], where stress_index
    is the position in the unstressed word at which the stress symbol must be inserted.

    1. f_name_add_exception_dict - name of additional .txt dictionary with exceptions '''

    def __init__(self, f_name_add_exception_dict: str = None) -> None:
        self.morph_analyzer = pymorphy2.MorphAnalyzer()
        self.exception_dict = {}

        # The main dictionary is always loaded first, the additional one (if given) replaces matching entries of the main one.
        self.__load_exception_dict(F_NAME_EXCEPTION_DICT)
        if f_name_add_exception_dict:
            self.__load_exception_dict(f_name_add_exception_dict, overwrite=True)

    def __load_exception_dict(self, f_name_exception_dict: str, overwrite: bool = False) -> None:
        ''' Loading a dictionary from a .txt file and creating pairs of the form 'word': [stress_position]. The file can contain
        several identical words with different stresses. When overwrite=False, they will all be added to the dictionary, and the
        stress positions will be specified in the order of reading the words. When overwrite=True, every read word replaces the
        entry that is already stored for it, so only the stress position that was read last is kept.

        Lines without a stress symbol (including empty lines) are skipped. Whitespace around a word is ignored.

        1. f_name_exception_dict - name of .txt dictionary with exceptions
        2. overwrite - True: replace already stored entries instead of appending new stress positions to them '''

        # 'utf-8-sig' reads plain UTF-8 files as before, but also drops a leading BOM, which would otherwise become a part of
        # the first word and shift its stress position.
        with open(f_name_exception_dict, 'r', encoding='utf-8-sig') as f_exception_dict:
            for line in f_exception_dict:
                word = line.strip()

                # DEF_STRESS_SYMBOL has priority when both symbols are present in one line.
                if DEF_STRESS_SYMBOL in word:
                    stress_symbol = DEF_STRESS_SYMBOL
                elif ADD_STRESS_SYMBOL in word:
                    stress_symbol = ADD_STRESS_SYMBOL
                else:
                    continue

                # The index of the first stress symbol in the stressed word is exactly the place where the symbol has to be
                # inserted into the unstressed word.
                stress_index = word.find(stress_symbol)
                unstressed_word = word.replace(stress_symbol, '')

                if overwrite or unstressed_word not in self.exception_dict:
                    self.exception_dict[unstressed_word] = [stress_index]
                else:
                    self.exception_dict[unstressed_word].append(stress_index)

    def __find_stress_positions(self, word: str, lemmatize_word: bool):
        ''' Searching for a word in the dictionary. The following forms of the word are tried in turn: lowercase word, lowercase
        word with 'ё' replaced by 'е' and (only when lemmatize_word=True) the normal form of the word.

        1. word - string with the word of interest
        2. lemmatize_word - True: lemmatize (normalize) word if it was not found in the dictionary as is
        3. returns list of stress positions of the first found form or None if nothing was found '''

        lowered_word = word.lower()
        for candidate in (lowered_word, lowered_word.replace('ё', 'е')):
            stress_positions = self.exception_dict.get(candidate)
            if stress_positions is not None:
                return stress_positions

        # Morphological analysis is performed only on request and only when the cheap lookups above have failed.
        if lemmatize_word:
            normal_form = self.morph_analyzer.parse(word)[0].normal_form
            return self.exception_dict.get(normal_form)
        return None

    def is_in_dict(self, word: str, lemmatize_word: bool = False) -> bool:
        ''' Checking if the word is in the dictionary.
        1. word - string with the word of interest
        2. lemmatize_word - True: lemmatize (normalize) word before searching in dictionary
        3. returns True/False '''

        return self.__find_stress_positions(word, lemmatize_word) is not None

    def put_stress(self, word: str, stress_symbol: str, lemmatize_word: bool = False) -> str:
        ''' Put stress in a word in accordance with the dictionary. Stress is indicated by stress_symbol after the stressed vowel.
        If the dictionary contains several stress positions for the word, the first of them is used. If the word is not in the
        dictionary, it is returned unchanged.

        1. word - string with the word of interest
        2. stress_symbol - stress symbol
        3. lemmatize_word - True: lemmatize (normalize) word before searching in dictionary
        4. returns word with placed stress '''

        stress_positions = self.__find_stress_positions(word, lemmatize_word)
        if stress_positions is None:
            return word

        # NOTE: when the word was found by its normal form, the stress position of the normal form is applied to the source word.
        stress_index = stress_positions[0]
        return word[:stress_index] + stress_symbol + word[stress_index:]