generated from LgoLgo/.github
-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsearch.py
188 lines (149 loc) · 5.72 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import re
import pylru
class SearchEngineBase(object):
def __init__(self):
pass
def add_corpus(self, file_path):
with open(file_path, 'r') as fin:
text = fin.read()
self.process_corpus(file_path, text)
def process_corpus(self, id, text):
raise Exception('process_corpus not implemented.')
def search(self, query):
raise Exception('search not implemented.')
class SimpleEngine(SearchEngineBase):
def __init__(self):
super(SimpleEngine, self).__init__()
self.__id_to_texts = {}
def process_corpus(self, id, text):
self.__id_to_texts[id] = text
def search(self, query):
results = []
for id, text in self.__id_to_texts.items():
if query in text:
results.append(id)
return results
class BOWEngine(SearchEngineBase):
def __init__(self):
super(BOWEngine, self).__init__()
self.__id_to_word = {}
def process_corpus(self, id, text):
self.__id_to_word[id] = self.parse_text_to_word(text)
def search(self, query):
query_words = self.parse_text_to_word(query)
results = []
for id, words in self.__id_to_word.items():
if self.query_match(query_words, words):
results.append(id)
return results
@staticmethod
def parse_text_to_word(text):
# Use regex to remove punctuation and newlines
text = re.sub(r'[^\w ]', ' ', text)
# To lower
text = text.lower()
# Generate a list of all words
word_list = text.split(' ')
# Remove blank words
word_list = filter(None, word_list)
# Return a set of words
return set(word_list)
@staticmethod
def query_match(query_words, words):
for query_word in query_words:
if query_word not in words:
return False
return True
# 减少查询的量
class BOWInvertedIndexEngine(SearchEngineBase):
def __init__(self):
super(BOWInvertedIndexEngine, self).__init__()
self.inverted_index = {}
def process_corpus(self, id, text):
words = self.parse_text_to_word(text)
for word in words:
if word not in self.inverted_index:
self.inverted_index[word] = []
self.inverted_index[word].append(id)
def search(self, query):
query_words = list(self.parse_text_to_word(query))
query_words_index = list()
for _ in query_words:
query_words_index.append(0)
# If a word is indexed in reverse order, return immediately.
for query_word in query_words:
if query_word not in self.inverted_index:
return []
result = []
while True:
# First get the index of all inverted indexes in the current state.
current_ids = []
for idx, query_word in enumerate(query_words):
current_index = query_words_index[idx]
current_inverted_list = self.inverted_index[query_word]
# If the current index exceeds the length of the inverted list,
# it means that the query is not in the inverted list.
if current_index >= len(current_inverted_list):
return result
current_ids.append(current_inverted_list[current_index])
# If all the elements in the current_ids are the same, it means that the query is in the inverted list.
if all(x == current_ids[0] for x in current_ids):
result.append(current_ids[0])
query_words_index = [x + 1 for x in query_words_index]
continue
# If not all elements are the same, increase the index of the smallest element by 1.
min_val = min(current_ids)
min_val_pos = current_ids.index(min_val)
query_words_index[min_val_pos] += 1
@staticmethod
def parse_text_to_word(text):
# Use regex to remove punctuation and newlines
text = re.sub(r'[^\w ]', ' ', text)
# To lower
text = text.lower()
# Generate a list of all words
word_list = text.split(' ')
# Remove blank words
word_list = filter(None, word_list)
# Return a set of words
return set(word_list)
class LRUCache(object):
def __init__(self, size=2):
self.cache = pylru.lrucache(size)
def has(self, key):
return key in self.cache
def get(self, key):
return self.cache[key]
def set(self, key, value):
self.cache[key] = value
class BOWInvertedIndexEngineWithCache(BOWInvertedIndexEngine, LRUCache):
def __init__(self):
super(BOWInvertedIndexEngineWithCache, self).__init__()
LRUCache.__init__(self)
def search(self, query):
if self.has(query):
print('cache hit!')
return self.get(query)
result = super(BOWInvertedIndexEngineWithCache, self).search(query)
self.set(query, result)
return result
def main(search_engine):
for file_path in ["./text/1.txt", "./text/2.txt", "./text/3.txt", "./text/4.txt"]:
search_engine.add_corpus(file_path)
while True:
query = input("Please input query:")
if query == "q":
break
results = search_engine.search(query)
print("found {} result(s):".format(len(results)))
for result in results:
print(result)
# if __name__ == "__main__":
# search_engine = SimpleEngine()
# main(search_engine)
# search_engine = BOWEngine()
# main(search_engine)
# search_engine = BOWInvertedIndexEngine()
# main(search_engine)
# search_engine = BOWInvertedIndexEngineWithCache()
# main(search_engine)