-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanonymize_ency.py
64 lines (48 loc) · 2.28 KB
/
anonymize_ency.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import argparse
import csv
import random
import spacy
from spacy.tokens import Doc
from redaction import *
import datetime
# Wall-clock start of the run. It is taken before the spaCy models are loaded,
# so the "Time taken" figure printed at the end of the script includes model
# loading as well as the per-row processing.
begin_time = datetime.datetime.now()
# Register a custom "cy_text" extension attribute on spaCy Doc objects, with an
# empty set as its default; it is meant for attaching the Welsh (CY) text to
# the English (EN) Doc.
# NOTE(review): nothing in this script reads or writes `cy_text` -- presumably
# the helpers star-imported from `redaction` use it; confirm.
Doc.set_extension("cy_text", default=set())
# English pipeline, loaded with its "parser" component disabled.
nlp_en = spacy.load("en_core_web_lg", disable=["parser"])
# Welsh pipeline, loaded from a hard-coded absolute filesystem path.
# NOTE(review): the script can only start where this exact path exists;
# consider making the location configurable.
nlp_cy = spacy.load("/home/gruffudd/spacy23/model_tagio_lemateiddio_spacy")
def detect_delimiter(header_line):
    """Return the delimiter under which *header_line* has the needed columns.

    The comma (the ``csv`` module default) is tried first, so every file that
    was parsed correctly before is still parsed in exactly the same way.  The
    tab is tried next, because the command-line help asks for tab-delimited
    input, which the comma default cannot read.

    Args:
        header_line: The first physical line of the input file.

    Returns:
        ``","`` or ``"\\t"``.  When neither delimiter yields both a "source"
        and a "target" column, ``","`` is returned and the missing column
        surfaces as a ``KeyError`` when the first row is read.
    """
    for delimiter in (",", "\t"):
        columns = next(csv.reader([header_line], delimiter=delimiter), [])
        if "source" in columns and "target" in columns:
            return delimiter
    return ","


def format_bitext(redacted_en, redacted_cy):
    """Return the output line for a redacted sentence pair, or ``None``.

    Args:
        redacted_en: Redacted English text (may be empty or ``None``).
        redacted_cy: Redacted Welsh text (may be empty or ``None``).

    Returns:
        ``"<en>\\t<cy>\\n"`` when both sides are longer than one character and
        the combined line contains no capital "X"; otherwise ``None``.
    """
    if not redacted_en or not redacted_cy:
        return None
    if len(redacted_en) <= 1 or len(redacted_cy) <= 1:
        return None
    line = redacted_en + "\t" + redacted_cy + "\n"
    # Any line containing a capital "X" is discarded.
    # NOTE(review): presumably "X" is a marker left behind by the redaction
    # helpers; this also drops ordinary text that contains "X" -- confirm.
    if "X" in line:
        return None
    return line


def anonymize_row(row):
    """Redact one input row and return its output line, or ``None`` to drop it.

    Args:
        row: Mapping with a "source" (English) and a "target" (Welsh) entry.

    Returns:
        The line produced by ``format_bitext`` for the redacted pair.

    Raises:
        KeyError: If the row has no "source" or "target" column.
    """
    en_text = row["source"].strip()
    cy_text = row["target"].strip()
    en_doc = nlp_en(en_text)
    en_sensitive_ents = get_sensitive_ents(en_doc)
    redacted_en = redact_en(en_doc, en_text, en_sensitive_ents)
    cy_doc = nlp_cy(cy_text)
    # The Welsh side is redacted with the entities found in the English text
    # (using EN ents for the time being).
    redacted_cy = redact_cy(cy_doc, cy_text, en_sensitive_ents)
    return format_bitext(redacted_en, redacted_cy)


def main():
    """Anonymize a bilingual file and write the shuffled result.

    Reads the "source"/"target" columns of the input file, redacts both sides
    of every row, keeps the pairs accepted by ``format_bitext``, shuffles them
    and writes them as tab-separated lines to the output file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input_file", help='Input file. Must be a tab-delimited .csv '
                                           'file featuring a column named "source" '
                                           'featuring English text and a column named '
                                           '"target" featuring the corresponding Welsh '
                                           'text.')
    parser.add_argument("output_file", help='The required name of the anonymized output '
                                            'two-column .csv file that will be produced')
    args = parser.parse_args()

    # An explicit encoding keeps Welsh characters intact regardless of the
    # platform locale ("utf-8-sig" also accepts a leading byte-order mark);
    # newline="" is what the csv module requires of the files it reads.
    with open(args.input_file, mode="r", encoding="utf-8-sig", newline="") as csv_file:
        header_line = csv_file.readline()
        csv_file.seek(0)  # let DictReader consume the header row itself
        csv_reader = csv.DictReader(csv_file, delimiter=detect_delimiter(header_line))
        anonymized_bitexts = [
            line for line in map(anonymize_row, csv_reader) if line is not None
        ]

    random.shuffle(anonymized_bitexts)

    with open(args.output_file, mode="w", encoding="utf-8") as outfile:
        outfile.writelines(anonymized_bitexts)

    print(f'File "{args.output_file}" written!')
    print("Time taken:", datetime.datetime.now() - begin_time)


if __name__ == '__main__':
    main()