Skip to content

Commit 8ebb419

Browse files
Jeevesh Juneja authored
Update cleaners.py
1 parent fb1c17d commit 8ebb419

File tree

1 file changed: +21 -16 lines changed

std_indic/cleaners.py

+21 -16
Original file line number | Diff line number | Diff line change
@@ -36,14 +36,15 @@ def __init__(self, lang):
3636
super().__init__()
3737
self.lang = lang
3838
self.contractions = []
39-
with open('contractions/'+lang+'.txt', 'r') as f:
40-
text = f.readline().rstrip()
41-
text = re.sub(r'\s+', r'\s', text)
42-
contractions.append(text.split(' ', 1))
43-
39+
with open('./std_indic/contractions/'+lang+'.txt', 'r') as f:
40+
for line in f.readlines():
41+
text = line.rstrip()
42+
text = re.sub(r'\s+', r' ', text)
43+
self.contractions.append(text.split(' ', 1))
44+
4445
def resolve_contractions(self, text):
4546
for contraction in self.contractions:
46-
text = re.sub(re.compile(self.contraction[0]), re.compile(self.contraction[1]), text)
47+
text = text.replace(contraction[0], contraction[1])
4748
return text
4849

4950

@@ -52,52 +53,56 @@ def __init__(self, lang):
5253
super().__init__()
5354
self.lang = lang
5455
self.normalizations = []
55-
with open('unicode_normalization/'+lang+'.txt', 'r') as f :
56+
with open('./std_indic/unicode_normalization/'+lang+'.txt', 'r') as f :
5657
content = f.read()
5758
replacements = re.findall(r'\[.+\]', content)
5859
for elem in replacements :
5960
elem = elem.strip('[]')
6061
elems = elem.split(' , ')
61-
self.normalizations.append( [str_to_unicode(elem[0]), str_to_unicode(elem[1])] )
62+
self.normalizations.append( [str_to_unicode(elems[0]), str_to_unicode(elems[1])] )
6263

6364
def normalize(self, text) :
6465
for norm in self.normalizations:
65-
text = re.sub( re.compile(norm[0]), re.compile(norm[1]), text)
66+
text = text.replace( norm[0], norm[1])
6667
return text
6768

6869
class resolve_group_chars(object):
6970
def __init__(self, lang):
7071
super().__init__()
7172
self.lang = lang
7273
self.replaces = []
73-
with open('unicode_normalization/groups'+lang+'.txt', 'r') as f :
74+
with open('./std_indic/unicode_normalization/groups/'+lang+'.txt', 'r') as f :
7475
for line in f.readlines():
75-
line = re.sub(r'\s+',r'\s',line.rstrip())
76-
line = line.split('#')[0]
76+
line = re.sub(r'\s+',r' ',line.rstrip())
77+
line = line.split('#')[0].rstrip()
78+
#print(line)
7779
original, replacement = line.split(' ')
7880
match_str = self.make_pattern(original)
7981
repl_str = self.make_pattern(replacement, True)
80-
self.replaces.append((match_str, repl_str))
82+
print(match_str, repl_str)
83+
self.replaces.append((match_str, repl_str))
8184

8285
def make_pattern(self, original, is_replacement=False):
8386
original_parts = original.split(',')
8487
match_str = ''
88+
#print(original_parts)
8589
for part in original_parts:
8690
if is_replacement:
8791
if not part.startswith('0x'):
88-
match_str += '\\'+part
92+
match_str += '\\g<'+part+'>'
8993
else:
9094
match_str += str_to_unicode(part)
9195
else :
9296
if '-' not in part:
9397
match_str += '('+str_to_unicode(part)+')'
9498
else:
9599
match_str += '([' + str_to_unicode(part.split('-')[0]) + '-' + str_to_unicode(part.split('-')[1]) + '])'
96-
return match_str
100+
return match_str
97101

98102
def normalize(self, text) :
99103
for norm in self.replaces:
100-
text = re.sub( re.compile(norm[0]), re.compile(norm[1]), text)
104+
print(norm[0], norm[1])
105+
text = re.sub( norm[0], norm[1], text)
101106
return text
102107

103108

0 commit comments

Comments
 (0)