-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocessxml.py
executable file
·115 lines (95 loc) · 4.15 KB
/
processxml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python
#%%
import pandas as pd
import regex as re
import argparse, os, csv
import xml.etree.ElementTree as ET
#%%
def validate_file(f):
if not os.path.exists(f):
raise argparse.ArgumentTypeError("{0} does not exist".format(f))
return f
#%%
tags=list('.?!,;:-—…')
parentheses=r'\([^)(]+[^)( ] *\)'
parenthesestokeep=r'\([^)(]+[^)(.!?—\-, ] *\)'
speakertag=r'((?<=[^\w\d \",])|^) *(?![?\.,!:\-\—\[\]\(\)])(?:[A-Z\d][^\s.?!\[\]\(\)]*\s?)*:(?=[^\w]*[A-Z])'#lookahead keeps semicolon in false cases.
parenthesestoremove=r'\(([^\w]*[^\(\)]+[\w ]+)\):?'
parenthesesaroundsentence=r'\(([^\w]*[^\(\)]+\W*)\):?'
squarebracketsaroundsentence=r'\[([^\[\]]+)\]' #generic since it seems like the square brackets just denote unclear speech.
def displayinstances(col,exp):
for i in range(len(col)):
# temp={x.group() for x in re.finditer( , tedtalks[i])}
temp={x.group() for x in re.finditer(exp, col[i])}
if len(temp)!=0:print(i,temp)
print('--fin--')
''' Identifies term to remove if the words from the previous
punctuation (except ") through : until the next word all
begins with a caps. Drawback:This doesnt properly capture
places where the following term is caps due to it being a
proper noun, where the prefix will be removed regardless
but will not break the syntax.
'''
def removespeakertags(text):
return re.sub(speakertag,' ',text)
def removeparentheses(text):
return re.sub(parenthesestoremove, ' ',text)
def removeparenthesesaroundsentence(text):
return re.sub(parenthesesaroundsentence,r'\g<1>',text)
def removesquarebrackets(text):
return re.sub(squarebracketsaroundsentence, r'\g<1>',text)
def removemusic(text):
text = re.sub(r'♫( *[^♫ ])+ *♫', ' ',text)
return re.sub(r'♪( *[^♫ ])+ *♪', ' ',text)
def reducewhitespaces(text):
text=re.sub(r'(?<=[.?!,;:\—\-]) *(?=[.?!,;:\—\-])','',text)
return re.sub(r'\s+', ' ',text)
def removeemptyquotes(text):
text= re.sub(r"'[^\w\d]*'",' ',text)
text= re.sub(r"\([^\w\d]*\)",' ',text)
text= re.sub(r"\[[^\w\d]*\]",' ',text)
return re.sub(r'"[^\w\d]*"',' ',text)
def ellipsistounicode(text):
text = re.sub(r'\.{3,}(?= )','…',text) #ellipsis without trailing punctuation
return re.sub(r'\.{3,}([^\w\s])','…\g<1>',text) #ellipsis with trailing punctuation
def removenonsentencepunct(text):
return re.sub(r'[^\w\d\s,.!?;:$#%&^+•=€²£¥…@\-\–\—\/](?!\w)|(?<!\w)[^\w\d\s,.!?;:$#%&^+•=€²£¥…@\-\–\—\/]',' ',text)
def combinerepeatedpunct(text):
newtext=[text,re.sub(r'([^\w\d]+) *\1+','\g<1> ',text)]
i=1
while (newtext[0]!=newtext[1]):
i+=1
newtext[i%2]=re.sub(r'([^\w\d]+) *\1+','\g<1> ',newtext[(1+i)%2])
return newtext[i%2]
def endashtohyphen(text):
return re.sub('–','-',text)
def pronouncesymbol(text):
return re.sub('(?<=\d)\.(?=\d)',' point ',text)
def preprocess(tedtalks):
tedtalks=removespeakertags(tedtalks)
tedtalks=removeparentheses(tedtalks)
tedtalks=removeparenthesesaroundsentence(tedtalks)
tedtalks=removesquarebrackets(tedtalks)
tedtalks=removemusic(tedtalks)
tedtalks=removeemptyquotes(tedtalks)
tedtalks=ellipsistounicode(tedtalks)
tedtalks=removenonsentencepunct(tedtalks)
tedtalks=endashtohyphen(tedtalks)
tedtalks=combinerepeatedpunct(tedtalks)
tedtalks=pronouncesymbol(tedtalks)
tedtalks=reducewhitespaces(tedtalks)
return tedtalks
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Enter xml file location.')
parser.add_argument("-i", "--input", dest="filename", required=True, type=validate_file,
help="input file", metavar="FILE")
parser.add_argument("-o", "--output", dest="output", required=True,
help="output file", metavar="FILE")
args = parser.parse_args()
tree=ET.parse(args.filename)
rows=[]
for child in tree.getroot():
rows.append(''.join([x.strip() for x in list(child.itertext())]))
with open (args.output, 'a') as f:
writer=csv.writer(f)
writer.writerow((args.filename,preprocess(' '.join([i for i in rows if i.strip()[-1] in tags]))))