# parser.py
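# Builds the inputs for a small inverted-index pipeline: reads every HTML document in the
# directory given on the command line, strips markup with BeautifulSoup, tokenizes,
# Porter-stems, removes stop words (stoplist.txt), and writes termids.txt, docids.txt and
# doc_index.txt to the working directory.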
import sys
import re
from os import listdir
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
directory = None
documents = None
stop_words = None
try:
    directory = sys.argv[1]
    documents = listdir(directory)
except IndexError:
    print('Directory not provided. Provide it like "python parser.py <Directory>". Program exiting.')
    sys.exit()
except FileNotFoundError:
    print('Directory not found. Program exiting.')
    sys.exit()
directory = directory + '/'
try:
    with open('stoplist.txt') as f:
        print('Reading Stop List.')
        stop_words = f.read().split('\n')
        print('Stop List read.')
except FileNotFoundError:
    print('stoplist.txt not found. Program exiting.')
    sys.exit()
# reading html data ###################################################################################################
print('\nReading Data.')
data = []
not_read = []
not_html = []
for document in documents:
    try:
        with open(directory + document) as fileHandle:
            html = fileHandle.read().lower()
    except UnicodeDecodeError:
        # retry with a Latin-1 fallback before giving up on the file
        try:
            with open(directory + document, encoding='iso8859-1') as fileHandle:
                html = fileHandle.read().lower()
        except UnicodeDecodeError:
            not_read.append(document)
            continue
    # skip everything before the first line that starts with a tag
    position = html.find('\n<')
    if position == -1:
        # no tag found: flag as non-html and drop the first 20 lines as a presumed header
        not_html.append(document)
        html = '\n'.join(html.split('\n')[20:])
    else:
        html = html[position + 1:]
    data.append(html)
print('Data Read.')
if len(not_read) != 0:
    print('Files failed to read:', not_read, '\nRemoving them from Corpus.')
    documents = [document for document in documents if document not in not_read]
if len(not_html) != 0:
    print('Files not html:', not_html)
not_html = None
# parsing and pre-processing html #####################################################################################
print('\nParsing and Pre-processing Data.')
not_read.clear()
terms = set()
stemmer = PorterStemmer()
# stop list plus stemmed stop words, the single letters b-z, and the empty string
stop_words = set(stop_words + [stemmer.stem(stop_word) for stop_word in stop_words]) | set('bcdefghijklmnopqrstuvwxyz')
stop_words.add('')
for i, html in enumerate(data):
    # get text from html
    try:
        parsed_html = BeautifulSoup(html, features='html5lib')
    except UserWarning:
        # only triggers if warnings have been configured to raise as errors
        not_read.append(i)
        continue
    for script in parsed_html(['script', 'style']):
        script.decompose()  # rip it out
    try:
        text = ' '.join(parsed_html.strings).replace('\n', ' ')
    except AttributeError:
        not_read.append(i)
        continue
    # tokenize, stem, remove stop-words
    filtered_tokens = [token for token in
                       [stemmer.stem(token) for token in re.split(r'\W+', text)]
                       if token not in stop_words]
    # adding newly found words to unique terms set
    terms |= set(filtered_tokens)
    data[i] = filtered_tokens
print('Data parsed and Preprocessed.')
if len(not_read) != 0:
    print('Files failed to Parse & Pre-process:', [documents[x] for x in not_read], '\nRemoving them from Corpus.')
    # delete from the highest index down so earlier indices stay valid
    for i in sorted(not_read, reverse=True):
        del documents[i]
        del data[i]
not_read = None
# converting terms set to list
terms = sorted(terms)
# create termids.txt file #############################################################################################
print('\nCreating Files.')
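# one line per unique term: <termID>\t<term>, IDs assigned in sorted term order (1-based)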
to_write = '\n'.join([str(i + 1) + '\t' + term for i, term in enumerate(terms)])
with open('termids.txt', 'w', encoding='utf8') as termID:
    termID.write(to_write)
print('termids.txt created.')
# create docids.txt file
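# one line per document: <docID>\t<filename>, IDs following the document order after failed files were removed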
to_write = '\n'.join([str(i + 1) + '\t' + document for i, document in enumerate(documents)])
with open('docids.txt', 'w', encoding='cp1252') as docID:
    docID.write(to_write)
print('docids.txt created.')
# create doc_index.txt
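# each row: <docID>\t<termID>\t<tab-separated 1-based positions of that term in the document>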
terms = {v: k + 1 for k, v in enumerate(terms)}
to_write = []
for i, d in enumerate(data):
    term_positions = {}
    doc_terms = []  # terms in order of first occurrence within this document
    for j, token in enumerate(d):
        try:
            term_positions[token].append(str(j + 1))
        except KeyError:
            # first occurrence of this term in the document
            term_positions[token] = [str(j + 1)]
            doc_terms.append(token)
    for u in doc_terms:
        to_write.append(str(i + 1) + '\t' + str(terms[u]) + '\t' + '\t'.join(term_positions[u]))
to_write = '\n'.join(to_write)
with open('doc_index.txt', 'w', encoding='cp1252') as doc_index:
    doc_index.write(to_write)
print('doc_index.txt created.')