@@ -36,14 +36,15 @@ def __init__(self, lang):
36
36
super ().__init__ ()
37
37
self .lang = lang
38
38
self .contractions = []
39
- with open ('contractions/' + lang + '.txt' , 'r' ) as f :
40
- text = f .readline ().rstrip ()
41
- text = re .sub (r'\s+' , r'\s' , text )
42
- contractions .append (text .split (' ' , 1 ))
43
-
39
+ with open ('./std_indic/contractions/' + lang + '.txt' , 'r' ) as f :
40
+ for line in f .readlines ():
41
+ text = line .rstrip ()
42
+ text = re .sub (r'\s+' , r' ' , text )
43
+ self .contractions .append (text .split (' ' , 1 ))
44
+
44
45
def resolve_contractions(self, text):
    """Expand every known contraction that occurs in ``text``.

    Entries of ``self.contractions`` are applied in list order using plain
    substring replacement (``str.replace``), not regular expressions, so
    characters such as ``.`` or ``?`` inside a contraction are matched
    literally.

    Args:
        text: The string to process.

    Returns:
        ``text`` with each contraction replaced by its expansion.
    """
    for entry in self.contractions:
        # The loader stores ``line.split(' ', 1)``, which yields a
        # one-element list for a line without a space; such an entry has
        # no expansion to apply.
        if len(entry) < 2:
            continue
        source, expansion = entry[0], entry[1]
        # ``str.replace('', x)`` would insert ``x`` between every
        # character, so an empty contraction is never applied.
        if not source:
            continue
        text = text.replace(source, expansion)
    return text
48
49
49
50
@@ -52,52 +53,56 @@ def __init__(self, lang):
52
53
super ().__init__ ()
53
54
self .lang = lang
54
55
self .normalizations = []
55
- with open ('unicode_normalization/' + lang + '.txt' , 'r' ) as f :
56
+ with open ('./std_indic/ unicode_normalization/' + lang + '.txt' , 'r' ) as f :
56
57
content = f .read ()
57
58
replacements = re .findall (r'\[.+\]' , content )
58
59
for elem in replacements :
59
60
elem = elem .strip ('[]' )
60
61
elems = elem .split (' , ' )
61
- self .normalizations .append ( [str_to_unicode (elem [0 ]), str_to_unicode (elem [1 ])] )
62
+ self .normalizations .append ( [str_to_unicode (elems [0 ]), str_to_unicode (elems [1 ])] )
62
63
63
64
def normalize(self, text):
    """Apply every loaded normalization pair to ``text``.

    Each entry of ``self.normalizations`` is a ``[source, target]`` pair.
    Pairs are applied in list order with plain substring replacement
    (``str.replace``), so no character in ``source`` is treated as regex
    syntax.

    Args:
        text: The string to normalize.

    Returns:
        ``text`` with every occurrence of each ``source`` replaced by the
        matching ``target``.
    """
    for pair in self.normalizations:
        source, target = pair[0], pair[1]
        # An empty source would make ``str.replace`` insert ``target``
        # between every character of the text; skip such a pair.
        if not source:
            continue
        text = text.replace(source, target)
    return text
67
68
68
69
class resolve_group_chars(object):
    """Rewrite character sequences using per-language regex rules.

    Rules are read once, at construction time, from
    ``./std_indic/unicode_normalization/groups/<lang>.txt`` (relative to the
    current working directory). Every rule line holds two
    whitespace-separated fields, ``<original> <replacement>``; anything after
    a ``#`` is ignored. Each field is a comma-separated list of parts that
    ``make_pattern`` turns into a regex pattern or a replacement template.
    """

    def __init__(self, lang):
        """Load the substitution rules for ``lang``.

        Args:
            lang: Base name (without ``.txt``) of the rules file.

        Raises:
            OSError: If the rules file cannot be opened.
            ValueError: If a non-empty rule line does not consist of exactly
                two whitespace-separated fields.
        """
        super().__init__()
        self.lang = lang
        # (pattern, replacement template) string pairs, in file order.
        self.replaces = []
        rules_path = './std_indic/unicode_normalization/groups/' + lang + '.txt'
        # Explicit encoding so that reading does not depend on the
        # platform's default locale.
        with open(rules_path, 'r', encoding='utf-8') as f:
            for line in f:
                # Drop the trailing comment, then split on any run of
                # whitespace.
                fields = line.split('#')[0].split()
                if not fields:
                    # Blank or comment-only line: nothing to load.
                    continue
                original, replacement = fields
                match_str = self.make_pattern(original)
                repl_str = self.make_pattern(replacement, True)
                self.replaces.append((match_str, repl_str))

    def make_pattern(self, original, is_replacement=False):
        """Translate one comma-separated rule field into regex syntax.

        Args:
            original: Comma-separated parts of a rule field.
            is_replacement: When false, build a search pattern in which every
                part becomes its own capture group; a part containing ``-``
                becomes a character range between its first two ``-``
                separated pieces. When true, build a replacement template in
                which a part starting with ``0x`` is converted with
                ``str_to_unicode`` and any other part is taken as the name or
                number of a capture group to copy.

        Returns:
            The pattern or replacement template as a string.
        """
        pieces = []
        for part in original.split(','):
            if is_replacement:
                if part.startswith('0x'):
                    # NOTE(review): str_to_unicode is defined elsewhere in
                    # this file; presumably it maps a 0x-prefixed code point
                    # to its character - confirm.
                    pieces.append(str_to_unicode(part))
                else:
                    # ``\g<N>`` is the unambiguous group reference; a bare
                    # ``\N`` followed by a digit would name another group.
                    pieces.append('\\g<' + part + '>')
            elif '-' in part:
                bounds = part.split('-')
                pieces.append(
                    '([' + str_to_unicode(bounds[0]) + '-'
                    + str_to_unicode(bounds[1]) + '])'
                )
            else:
                pieces.append('(' + str_to_unicode(part) + ')')
        return ''.join(pieces)

    def normalize(self, text):
        """Apply every loaded rule to ``text`` in file order.

        Args:
            text: The string to rewrite.

        Returns:
            ``text`` after each ``(pattern, template)`` pair in
            ``self.replaces`` has been applied with ``re.sub``.
        """
        for pattern, template in self.replaces:
            text = re.sub(pattern, template, text)
        return text
102
107
103
108
0 commit comments