# corpora_sanitise.py
# Sanitises the party corpora found under corpora/resplit/unsanitised and
# writes the cleaned sentences to corpora/resplit/sanitised.
from string import punctuation
def custom_sanitiser(sentence_list):
    """Return cleaned copies of the sentences in *sentence_list*.

    Each sentence is processed as follows:

    1. Non-ASCII characters (which covers most emojis) are dropped.
    2. Escaped and literal markup characters are decoded or removed, and the
       heart emoticon is replaced by the token ``heartemoji``.
    3. The sentence is split on whitespace; if the first word is exactly
       ``democratic`` or ``republican`` that word is dropped.
    4. All ``string.punctuation`` characters are stripped from every word and
       words that end up empty are discarded.

    Sentences that are empty after cleaning are omitted from the result, so
    the returned list may be shorter than the input.

    Args:
        sentence_list: iterable of ``str`` sentences.

    Returns:
        A new list of cleaned, single-space-joined sentences.
    """
    # (old, new) pairs applied in order.
    # NOTE(review): the "&...;" entity spellings are reconstructed -- the
    # source as received showed only the already-decoded characters (e.g. a
    # no-op replace of "&" by "&"). Both spellings are handled here; confirm
    # the entity forms against the corpus format.
    replacements = (
        ("&amp;", "&"),              # Decoding ampersands.
        ("&lt; 3", "heartemoji"),    # Decoding heart emojis.
        ("< 3", "heartemoji"),
        (" &apos;", ""),             # Decoding apostrophes (joins "don 't").
        (" '", ""),
        ("&quot;", ""),              # Decoding quotes.
        ('"', ""),
        ("&gt;", ""),                # Decoding >.
        (">", ""),
        ("&lt;", ""),                # Decoding <.
        ("<", ""),
    )
    # Built once instead of once per word.
    strip_punctuation = str.maketrans("", "", punctuation)
    party_labels = ("democratic", "republican")

    sanitised = []
    for sentence in sentence_list:
        # Removing most emojis (and any other non-ASCII character).
        text = sentence.encode("ascii", "ignore").decode("ascii")
        for old, new in replacements:
            text = text.replace(old, new)

        words = text.split()
        # The label is matched before punctuation stripping, and only in the
        # first position.
        if words and words[0] in party_labels:
            words = words[1:]

        stripped = (word.translate(strip_punctuation) for word in words)
        cleaned = " ".join(filter(None, stripped))
        # Drop sentences that are empty after cleaning; the original built
        # this filtered list but discarded the result.
        if cleaned:
            sanitised.append(cleaned)
    return sanitised
# Sanitise every party/split corpus: read each raw file, clean its lines with
# custom_sanitiser, then write the result under the same file name in the
# sanitised directory.
UNSANITISED_DIR = "corpora/resplit/unsanitised"
SANITISED_DIR = "corpora/resplit/sanitised"
CORPUS_NAMES = (
    "dem_train", "dem_val", "dem_test",
    "rep_train", "rep_val", "rep_test",
)

# Read and sanitise all corpora first, so a missing input file aborts the run
# before any output file is touched.
sanitised_corpora = {}
for corpus_name in CORPUS_NAMES:
    source_path = "{}/{}.txt".format(UNSANITISED_DIR, corpus_name)
    with open(source_path, encoding="utf-8") as f:
        sanitised_corpora[corpus_name] = custom_sanitiser(f.read().split("\n"))

for corpus_name in CORPUS_NAMES:
    target_path = "{}/{}.txt".format(SANITISED_DIR, corpus_name)
    # Mode "w" is required: open() defaults to read-only, in which case
    # f.write raises io.UnsupportedOperation (and open() itself fails if the
    # output file does not exist yet).
    with open(target_path, "w", encoding="utf-8") as f:
        f.writelines(
            s.strip() + "\n" for s in sanitised_corpora[corpus_name]
        )