-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinvert.py
162 lines (115 loc) · 4.48 KB
/
invert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
from collections import Counter

from PorterStemmer import PorterStemmer
def updateDict():
    """Split the loaded collection into documents and index each one.

    Walks the module-level ``docs`` string line by line. A line whose first
    space-separated token is a known field marker switches the current field;
    a ``.I <id>`` marker additionally starts a new document. Text lines are
    kept only while the current field is not one of the excluded fields.

    Every completed document is printed, appended to the module-level
    ``wDocs`` file as ``<id>`` followed by its text, and passed to
    ``addTerms(docId, doc)``.

    Each document is flushed under the ID from its own ``.I`` line, and the
    final document is flushed when the input ends (no following ``.I`` line
    is needed). Nothing is emitted before the first ``.I`` marker; any text
    preceding it is discarded.

    Raises:
        IndexError, ValueError: if a ``.I`` line carries no integer ID.
    """
    # Fields to record: .I (doc ID), .T (title), .W (abstract),
    # .B (publication date), .A (author list).
    includedFields = {'.I', '.T', '.W', '.B', '.A'}
    excludedFields = {'.N', '.X', '.K', '.C'}
    Fields = includedFields | excludedFields

    def flush(docId, lines):
        # Emit one finished document: console, documents file, then index.
        doc = "".join(lines)
        print(f"\n{docId} \n")
        print(doc)
        wDocs.write(f"{docId}")
        wDocs.write(f"\n{doc}\n")
        addTerms(docId, doc)

    currField = ""
    docId = None  # ID of the document being collected; None before the first .I
    lines = []    # text lines of the current document (joined once on flush)

    for line in docs.splitlines():
        marker = line.split(' ', 1)[0]
        if marker in Fields:
            currField = marker
            if currField == '.I':
                # A new document begins: hand off the one collected so far.
                if docId is not None:
                    flush(docId, lines)
                docId = int(line.split()[1])
                lines = []
            # Marker lines themselves are never part of the document text.
            continue
        if currField not in excludedFields:
            lines.append(line + "\n")

    # The last document has no following .I line to trigger its flush.
    if docId is not None:
        flush(docId, lines)
# Characters treated as token separators; each one is replaced by a space.
# The backslash is written escaped so the literal has no invalid escape.
_PUNCTUATION = '''!()-[]{};:'"\\, <>./?@#$%^&*_~'''
_PUNCT_TO_SPACE = str.maketrans(_PUNCTUATION, " " * len(_PUNCTUATION))


def addTerms(docId, doc):
    """Add the terms of one document to the global ``dict`` index.

    The text is lower-cased, the separator characters above are turned into
    spaces, and the result is split on whitespace. Depending on the
    module-level switches, tokens are filtered against ``stopwords``
    (``SW_ENABLED``) and stemmed with ``p.stem`` (``STEMMER_ENABLED``).

    For every remaining term, ``dict[term]`` holds a postings string of the
    form ``"(docId,count), (docId,count), ..."``. This call appends one
    ``(docId,count)`` entry per distinct term, where ``count`` is the number
    of occurrences of the term in this document. A term that already has an
    entry for ``docId`` is left unchanged.

    Args:
        docId: identifier written into the postings entries.
        doc: raw text of the document.
    """
    words = doc.lower().translate(_PUNCT_TO_SPACE).split()

    stemCache = {}  # stem each distinct word only once per document
    terms = []
    for word in words:
        # Test the raw token first: stemming can change a stop word into a
        # form that no longer matches the stop list.
        if SW_ENABLED and word in stopwords:
            continue
        if STEMMER_ENABLED:
            term = stemCache.get(word)
            if term is None:
                # Stem the token itself rather than search-and-replacing it
                # in the whole text, which would also rewrite other words
                # that merely contain it.
                term = stemCache[word] = p.stem(word, 0, len(word) - 1)
        else:
            term = word
        # Also drop terms whose stemmed form is a stop word.
        if SW_ENABLED and term in stopwords:
            continue
        terms.append(term)

    # "(<docId>," can only match a whole document ID inside a postings
    # string, unlike a bare substring search for the number.
    idTag = f"({docId},"
    for term, freq in Counter(terms).items():
        posting = f"({docId},{freq})"
        existing = dict.get(term)
        if existing is None:
            dict[term] = posting
        elif idTag not in existing:
            dict[term] = f"{existing}, {posting}"
def writeFiles():
    """Write the index held in the global ``dict`` to three text files.

    Terms are written in sorted order, one per line:

    * ``dictionary.txt``      -- ``<term> [<n>] ``
    * ``postings.txt``        -- ``<postings> ``
    * ``inverted_index.txt``  -- ``<term> [<n>] >> <postings> ``

    ``<postings>`` is the string stored in ``dict[term]`` and ``<n>`` is the
    number of ``(`` characters in it, i.e. the number of postings entries.
    Existing files are overwritten. All three files are closed on exit, even
    if a write fails.
    """
    with open("dictionary.txt", 'w') as dictionary, \
            open("postings.txt", 'w') as postings, \
            open("inverted_index.txt", 'w') as invertedindex:
        for term in sorted(dict):
            entry = dict[term]
            docFreq = entry.count('(')  # one "(" per (docId,count) entry
            dictionary.write(f"{term} [{docFreq}] \n")
            postings.write(f"{entry} \n")
            invertedindex.write(f"{term} [{docFreq}] >> {entry} \n")
def _askYesNo(prompt, yesMessage="Input accepted"):
    """Prompt until the user answers y or n (case-insensitive).

    Prints ``yesMessage`` for "y", "Input accepted" for "n", and
    "Invalid entry" for anything else before asking again.

    Returns:
        True for "y", False for "n".
    """
    while True:
        answer = input(prompt).lower()
        if answer == "y":
            print(yesMessage)
            return True
        if answer == "n":
            print("Input accepted")
            return False
        print("Invalid entry")


if __name__=="__main__":
    # The names assigned below are module globals read by updateDict(),
    # addTerms() and writeFiles(); their spelling must not change.
    p = PorterStemmer()

    # Load the collection and the stop-word list into memory, closing both
    # input files as soon as they have been read.
    with open("cacm/cacm.all", 'r') as d, open("cacm/common_words", 'r') as sw:
        docs = d.read()
        stopwords = set(sw.read().split())  # create the set of stopwords

    STEMMER_ENABLED = _askYesNo("Do you want to enable stemming? (y/n): ",
                                yesMessage="Input accepted.")
    SW_ENABLED = _askYesNo("Do you want to enable stop word removal? (y/n): ")

    # Maps each term to its postings string. The name shadows the builtin
    # but is kept because the other functions look it up as a global.
    dict = {}

    # documents.txt is closed (and flushed) once every document is written.
    with open("documents.txt", 'w') as wDocs:
        updateDict()
    writeFiles()

    print("\nFINISHED BUILDING DICTIONARY AND POSTINGS \n")