-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtranslate.py
141 lines (107 loc) · 3.91 KB
/
translate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# -*- coding: utf-8 -*-
import sys, getopt, os, math, collections, copy, codecs, re, nltk, string
from datetime import datetime
from bisect import bisect_left
from nltk.tag import pos_tag
from IBMModel1 import M1
# Replacement table applied to every input line by get_lines_of_file().
# Keys are UTF-8 byte sequences (written as escapes) of accented Spanish
# letters and inverted punctuation; values are the text substituted for them.
# Note that both the uppercase and the lowercase accented letters map to an
# UPPERCASE ASCII letter.
SPECIAL_CHARS = {
    # Uppercase accented letters.
    '\xc3\x81' : 'A',  # A with acute accent
    '\xc3\x89' : 'E',  # E with acute accent
    '\xc3\x8d' : 'I',  # I with acute accent
    '\xc3\x91' : 'N',  # N with tilde
    '\xc3\x93' : 'O',  # O with acute accent
    '\xc3\x9a' : 'U',  # U with acute accent
    '\xc3\x9c' : 'U',  # U with diaeresis
    # Lowercase accented letters.
    '\xc3\xa1' : 'A',  # a with acute accent
    '\xc3\xa9' : 'E',  # e with acute accent
    '\xc3\xad' : 'I',  # i with acute accent
    '\xc3\xb1' : 'N',  # n with tilde
    '\xc3\xb3' : 'O',  # o with acute accent
    '\xc3\xba' : 'U',  # u with acute accent
    '\xc3\xbc' : 'U',  # u with diaeresis
    # Characters that are dropped entirely.
    '\xc2\xbf' : '', # upside down question mark
    '\xc2\xa1' : '', # upside down exclamation mark
    '\n' : ''  # line terminator
}

# Switch for the optional translation heuristics.  When True, the script
# entry point reorders noun/adjective pairs with flip_nouns_and_adjs() before
# translating, and translate_sentence() rewrites an "X of the Y" pattern in
# its output.  When False, input lines are only split on whitespace.
USE_EXTENSIONS = True
def translate_sentences(sp_sentences, m1):
    """Translate every sentence and write the results to a file.

    The output file is named '<FILENAME>_translations', where FILENAME is the
    module-level global assigned by the script entry point.  Each sentence is
    handed to translate_sentence(), which writes one line to that file.  A
    progress message is printed after every 200 sentences.

    Args:
        sp_sentences: sized sequence of sentences, each in the form
            translate_sentence() accepts.
        m1: model object passed straight through to translate_sentence().
    """
    # The with-block closes (and therefore flushes) the output file even if
    # translating one of the sentences raises part-way through.
    with open('%s_translations' % FILENAME, 'w') as translns_file:
        print('\n== Translating to English...')
        total = len(sp_sentences)
        for i, sp_sentence in enumerate(sp_sentences):
            translate_sentence(sp_sentence, m1, translns_file)
            if (i + 1) % 200 == 0:
                print(' %d of %d sentences translated' % (i + 1, total))
        print('\n== ... Done translating!\n')
def get_POS(s):
    """Return (word, part-of-speech) tuples for a sentence string.

    The sentence is piped to the 'tree-tagger/cmd/tree-tagger-spanish'
    script (path relative to the current working directory).  The first two
    whitespace-separated fields of every output line are taken as the word
    and its tag; any further fields are ignored.

    Args:
        s: the sentence to tag, as a single string.

    Returns:
        A list of (word, tag) tuples in the order the tagger emitted them.

    Raises:
        OSError: if the tagger script cannot be executed.
    """
    # Imported here so this function is self-contained.
    import subprocess

    # The sentence is fed through stdin instead of being spliced into a shell
    # command line: quotes, '$' or backticks in the text can then neither
    # break the command nor be interpreted by a shell.
    tagger = subprocess.Popen(
        ['tree-tagger/cmd/tree-tagger-spanish'],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )
    # The trailing newline mirrors what `echo` used to append to the input.
    output, _ = tagger.communicate(s + '\n')

    tagged = []
    for line in output.split('\n'):
        parts = line.split()
        # Skip blank lines and lines without a tag column rather than failing
        # on parts[1].
        if len(parts) >= 2:
            tagged.append((parts[0], parts[1]))
    return tagged
def translate_sentence(sp_sentence, m1, translns_file):
    """Translate one sentence word by word and write it as a single line.

    For each source word the model's max_prob_alignment() is queried.  A word
    that is found inside string.punctuation (a substring test) is copied
    through unchanged; otherwise the model's answer is used unless it is
    None, in which case the word is dropped.  Every emitted token is followed
    by one space.  When USE_EXTENSIONS is set, an "X of the Y" pattern in the
    assembled sentence is rewritten as "Y X" before writing.

    Args:
        sp_sentence: iterable of source-language word tokens.
        m1: object providing max_prob_alignment(word).
        translns_file: writable file-like object receiving the line.
    """
    tokens = []
    for sp_word in sp_sentence:
        en_word = m1.max_prob_alignment(sp_word)
        if sp_word in string.punctuation:
            tokens.append('%s ' % (sp_word))
        elif en_word is not None:
            tokens.append('%s ' % (en_word))
    en_transln = ''.join(tokens)

    if USE_EXTENSIONS:
        # Swap the words on either side of "of the".
        of_the_pattern = r'(.*) (.*?) of the (.*?) (.*)'
        swapped_order = r'\g<1> \g<3> \g<2> \g<4>'
        en_transln = re.sub(of_the_pattern, swapped_order, en_transln)

    translns_file.write(en_transln + '\n')
def get_lines_of_file(filepath, SPLIT=False):
    """Read a UTF-8 text file and return its normalised lines.

    Each line is re-encoded to UTF-8, lowercased with str.lower(), and then
    every key of SPECIAL_CHARS found in it is replaced by the corresponding
    value (this is also what removes the line terminator).

    Args:
        filepath: path of the file to read.
        SPLIT: when true, each line is returned as a list of
            whitespace-separated tokens; otherwise as one string.

    Returns:
        A list with one entry per line of the file.
    """
    lines = []
    # The with-block closes the file handle once reading is finished, and
    # also if decoding a line raises.
    with codecs.open(filepath, encoding='utf-8') as f:
        for line in f:
            line = line.encode('utf-8').lower()
            for ch, replacement in SPECIAL_CHARS.items():
                line = line.replace(ch, replacement)
            lines.append(line.split() if SPLIT else line)
    return lines
def is_noun(POS):
    """Return True when the tag is one this script treats as a noun (NN, NC, NP)."""
    return POS in ('NN', 'NC', 'NP')
def is_adj(POS):
    """Return True when the tag is one this script treats as an adjective (ADJ, JJ)."""
    return POS in ('ADJ', 'JJ')
def flip_nouns_and_adjs(sp_sentences):
    """Move adjectives in front of the nouns they directly follow.

    Every sentence is tagged with get_POS().  The tagged tokens are then
    scanned left to right, and whenever a noun is immediately followed by an
    adjective the two are swapped in place.  Because the scan continues after
    a swap, a noun followed by several adjectives ends up behind all of them.

    Args:
        sp_sentences: iterable of sentence strings.

    Returns:
        A list with one list of words (tags removed) per input sentence.
    """
    tagged_sentences = [get_POS(sentence) for sentence in sp_sentences]

    reordered = []
    for tokens in tagged_sentences:
        for left in range(len(tokens) - 1):
            right = left + 1
            if is_noun(tokens[left][1]) and is_adj(tokens[right][1]):
                tokens[left], tokens[right] = tokens[right], tokens[left]
        reordered.append([word_and_tag[0] for word_and_tag in tokens])
    return reordered
# Script entry point: train the model, translate the .es file and score the
# result against the .en file.  This stays at module level (not inside a
# function) because translate_sentences() reads FILENAME as a global.
if __name__ == "__main__":
    startTime = datetime.now()

    # Two positional arguments are read below (sys.argv[1] and sys.argv[2]),
    # so anything shorter than three entries must take the usage branch;
    # otherwise a single argument would fail with an IndexError.
    if len(sys.argv) < 3:
        print('\nRequires the path to and name of file (without .en/.es extension) to translate:')
        print('Usage: $ python translate.py ./PATH/TO/FILE/ FILENAME')
        print('Aborting...')
    else:
        filepath_to_train = './es-en/train/' + raw_input('\n== Filename to train on? ')
        PATH, FILENAME = sys.argv[1], sys.argv[2]

        # Get sp_sentences to translate out of file.  Lines are only
        # tokenised here when the extensions are off; with extensions on,
        # the whole-line strings are tokenised by flip_nouns_and_adjs().
        sp_sentences = get_lines_of_file('%s%s.es' % (PATH, FILENAME), not USE_EXTENSIONS)
        if USE_EXTENSIONS:
            sp_sentences = flip_nouns_and_adjs(sp_sentences)

        # Get goal_sentences to compare translations to out of file
        # (tokenised).  The value is not referenced again in this script;
        # the scoring step below re-reads the .en file itself.
        goal_translns = get_lines_of_file('%s%s.en' % (PATH, FILENAME), True)

        # Initialize IBM Model 1 class.
        n_iterations = int(raw_input('\n== # of iterations? '))
        m1 = M1(filepath_to_train, n_iterations)

        translate_sentences(sp_sentences, m1)

        # Score the translations file written by translate_sentences().
        os.system('python bleu_score.py %s%s.en %s_translations' % (PATH, FILENAME, FILENAME))

    print('\n[ Time elapsed: ] %s\n' % (str(datetime.now() - startTime)))