-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpre_process.py
54 lines (48 loc) · 1.63 KB
/
pre_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import string
import re
import pickle as pkl
import numpy as np
from unicodedata import normalize
# Load the file to preprocess
def load_file(filename):
    """Read a whole UTF-8 text file and return its contents.

    Args:
        filename: Path of the text file to read.

    Returns:
        The complete file contents as a single string.
    """
    # The context manager closes the handle even when read() raises
    # (e.g. on a decoding error); a manual close() would be skipped.
    with open(filename, mode='rt', encoding='utf-8') as handle:
        return handle.read()
# Split the text into lines, and each line into its tab-separated fields
def to_pair(text):
    """Break raw text into one list of tab-separated fields per line.

    Surrounding whitespace is stripped from the whole text and from each
    line before splitting.

    Args:
        text: Newline-separated text whose lines hold tab-separated fields.

    Returns:
        A list with one entry per line; each entry is the list of that
        line's fields.
    """
    rows = []
    for raw_line in text.strip().split('\n'):
        fields = raw_line.strip().split('\t')
        rows.append(fields)
    return rows
# Clean every sentence in a list of sentence pairs
def clean_pairs(lines):
    """Normalise every sentence in a list of sentence groups.

    Each sentence is NFD-normalised and reduced to ASCII (non-ASCII
    characters are dropped), split on whitespace, lower-cased, stripped of
    punctuation and non-printable characters, and finally filtered so that
    only purely alphabetic tokens remain. The surviving tokens are joined
    with single spaces.

    Args:
        lines: Iterable of groups (e.g. pairs) of sentence strings.

    Returns:
        A NumPy array built from the nested list of cleaned sentences.
    """
    # Matches any character that is not in string.printable.
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # Translation table that deletes every punctuation character.
    drop_punctuation = str.maketrans('', '', string.punctuation)

    def scrub(sentence):
        # Decompose accented characters, then discard everything non-ASCII.
        ascii_bytes = normalize('NFD', sentence).encode('ascii', 'ignore')
        ascii_text = ascii_bytes.decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = token.lower().translate(drop_punctuation)
            token = non_printable.sub('', token)
            # Tokens containing digits (or left empty) are discarded.
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    result = [[scrub(sentence) for sentence in group] for group in lines]
    return np.array(result)
# Save the cleaned data to the given filename
def save_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: The object to serialise (the cleaned sentence data).
        filename: Path of the pickle file to write; it is overwritten.
    """
    # Use a context manager so the file is flushed and closed
    # deterministically instead of relying on garbage collection.
    with open(filename, 'wb') as out:
        pkl.dump(sentences, out)
    print('Saved: %s' % filename)
# Script body: load the raw corpus, clean it, save it, and preview the result.
filename = './data/deu.txt'
raw_text = load_file(filename)
pairs = to_pair(raw_text)
# Bind the result to its own name; assigning it to ``clean_pairs`` would
# replace the function of the same name with the array.
cleaned_pairs = clean_pairs(pairs)
save_data(cleaned_pairs, 'english-german.pkl')
# Checking the cleaned data: preview at most the first 100 rows. Slicing stops
# at the end of the array, so inputs shorter than 100 rows no longer raise
# IndexError.
for row in cleaned_pairs[:100]:
    print('[%s] => [%s]' % (row[0], row[1]))