-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmoogle.py
271 lines (240 loc) · 9.24 KB
/
moogle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
###################################################################
# FILE: moogle.py
# WRITER: Daniel Sinai
# DESCRIPTION: This program implements the search engine "Moogle"
###################################################################
import requests
import bs4
import urllib.parse
import sys
import pickle
def create_relatives_list(path):
    """
    Read the relative urls listed in an index file, one per line.
    :param path: Path for the file with the relative urls
    :return: List of the relative urls, in file order
    """
    with open(path) as index_file:
        contents = index_file.read()
    return contents.splitlines()
def create_full_url(base_url, relative_url):
    """
    Build an absolute url from a base url and a relative url.
    :param base_url: Base url
    :param relative_url: Relative url
    :return: Combined absolute url
    """
    absolute_url = urllib.parse.urljoin(base_url, relative_url)
    return absolute_url
def crawl(base_url, relative_path):
    """
    Count the links between the webpages listed in an index file.
    :param base_url: Base url of a webpage
    :param relative_path: Path to the index file that contains the
    relative urls
    :return: Dictionary mapping each page to a dictionary of
    {linked page: number of links to it}
    """
    relatives = create_relatives_list(relative_path)
    traffic_dict = dict()
    for source in relatives:
        link_counts = dict()
        page_html = requests.get(create_full_url(base_url, source)).text
        soup = bs4.BeautifulSoup(page_html, "html.parser")
        # Only anchors that appear inside <p> paragraphs are counted.
        for paragraph in soup.find_all("p"):
            for anchor in paragraph.find_all("a"):
                destination = anchor.get("href")
                # Ignore links that point outside the crawled index.
                if destination in relatives:
                    link_counts[destination] = \
                        link_counts.get(destination, 0) + 1
        traffic_dict[source] = link_counts
    return traffic_dict
def sum_dict(diction):
    """
    Sum all the values of a dictionary.
    :param diction: Dictionary whose values are summed
    :return: Sum of the dictionary values (0 for an empty dict)
    """
    # Built-in sum() over the values view replaces the manual loop.
    return sum(diction.values())
def create_new_zero_dict(traffic_dict):
    """
    Create a dictionary with the keys of another dictionary and values of 0.
    :param traffic_dict: The dictionary from which we take the keys
    :return: New dictionary with values of 0 and keys from traffic_dict
    """
    # dict.fromkeys builds the zeroed dict in one C-level call.
    return dict.fromkeys(traffic_dict, 0)
def page_rank(iterations_num, traffic_dict):
    """
    Rank webpages by iteratively redistributing each page's score along
    its outgoing links.
    :param iterations_num: Number of ranking iterations to run
    :param traffic_dict: Dictionary mapping each page to a dictionary of
    {target page: number of links to it}
    :return: Dictionary with the rank of each page
    """
    # Every page starts with an equal rank of 1.
    r = dict.fromkeys(traffic_dict, 1)
    for _ in range(iterations_num):
        new_r = dict.fromkeys(traffic_dict, 0)
        for page, links in traffic_dict.items():
            # The total number of outgoing links is invariant per page;
            # compute it once here instead of once per target (the
            # original recomputed it inside the inner loop).
            total_links = sum(links.values())
            for target, count in links.items():
                # Each target receives a share of this page's rank
                # proportional to how many links point at it.
                new_r[target] += r[page] * (count / total_links)
        r = new_r
    return r
def words_dict(base_url, index_path):
    """
    Count, per webpage, how many times each word appears.
    :param base_url: Base url to work with
    :param index_path: Path to the relative urls file
    :return: Dictionary of {word: {page: number of appearances}}
    """
    word_dict = dict()
    for relative in create_relatives_list(index_path):
        page_html = requests.get(create_full_url(base_url, relative)).text
        soup = bs4.BeautifulSoup(page_html, "html.parser")
        # Only text inside <p> paragraphs is indexed.
        for paragraph in soup.find_all("p"):
            for word in paragraph.text.split():
                page_counts = word_dict.setdefault(word, dict())
                page_counts[relative] = page_counts.get(relative, 0) + 1
    return word_dict
def sort_dict(dictionary):
    """
    Sort a dictionary by its values, largest first.
    :param dictionary: Dictionary to be sorted
    :return: New dictionary whose keys iterate in descending value order
    """
    # Python dicts preserve insertion order, so inserting the keys in
    # descending-value order yields a value-sorted dictionary.
    return {key: dictionary[key]
            for key in sorted(dictionary, key=dictionary.get, reverse=True)}
def sort_list_by_values(sorted_dict):
    """
    Return the keys of a dictionary as a list, preserving their order.
    :param sorted_dict: Dictionary already sorted by the caller
    :return: List of the dictionary keys in iteration order
    """
    # Iterating a dict yields its keys in insertion order, so list()
    # replaces the manual append loop.
    return list(sorted_dict)
def filter_max_dict(query_list, sorted_rank_dict, word_dict, max_results):
    """
    Keep at most max_results pages (in rank order) that contain every
    known query word.
    :param query_list: List of query word/s
    :param sorted_rank_dict: Ranking dictionary sorted by rank, descending
    :param word_dict: Words dictionary of {word: {page: count}}
    :param max_results: Maximum number of pages in the final dictionary
    :return: Filtered dictionary with at most max_results pages
    """
    max_results_dict = dict()
    # Iterating the dict yields its keys in (already sorted) insertion
    # order, so the original's membership re-check against its own keys
    # and the separate key-list helper were both redundant.
    for relative in sorted_rank_dict:
        if len(max_results_dict) >= max_results:
            # Rank order guarantees the best pages are already taken.
            break
        # Query words absent from every page are ignored (matching the
        # original's `continue`); a page qualifies only when it contains
        # all of the remaining, known words.
        if all(relative in word_dict[q] for q in query_list
               if q in word_dict):
            max_results_dict[relative] = sorted_rank_dict[relative]
    return max_results_dict
def search(query, ranking_dict, word_dict, max_results):
    """
    Run a query search on the webpages and return the results in a
    dictionary.
    :param query: A query
    :param ranking_dict: The ranking dictionary
    :param word_dict: The words dictionary
    :param max_results: Num of webpages to be displayed
    :return: Dictionary with the search results, sorted by score
    """
    terms = query.split()
    ranked = sort_dict(ranking_dict)
    candidates = filter_max_dict(terms, ranked, word_dict, max_results)
    results = dict()
    for page in candidates:
        if len(terms) == 1:
            # NOTE: the raw query string (not the split term) is what
            # gets looked up here, mirroring the original behavior.
            if query in word_dict:
                results[page] = candidates[page] * word_dict[query][page]
        else:
            # A page's word score is the appearance count of its rarest
            # known query word on that page.
            appearance_counts = [word_dict[term][page]
                                 for term in terms if term in word_dict]
            if appearance_counts:
                results[page] = candidates[page] * min(appearance_counts)
    return sort_dict(results)
def run_search(query, ranking_dict, word_dict, max_results):
    """
    Run a single search in the Moogle engine and print the results.
    :param query: A query
    :param ranking_dict: The ranking dictionary
    :param word_dict: The words dictionary
    :param max_results: Num of webpages to be displayed
    :return: None (results are printed, one "page score" line each)
    """
    results = search(query, ranking_dict, word_dict, max_results)
    for page in results:
        print(page, results[page])
if __name__ == "__main__":
command_type = sys.argv[1]
if command_type == "crawl":
base_link, index_file, out_file = sys.argv[2], sys.argv[3], sys.argv[4]
final_traffic_dict = crawl(base_link, index_file)
with open(out_file, "wb") as f:
pickle.dump(final_traffic_dict, f)
elif command_type == "page_rank":
iterations, dict_file, out_file = int(sys.argv[2]),\
sys.argv[3], sys.argv[4]
with open(dict_file, "rb") as f:
d = pickle.load(f)
final_pagerank_dict = page_rank(iterations, d)
with open(out_file, "wb") as f:
pickle.dump(final_pagerank_dict, f)
elif command_type == "words_dict":
base_link, index_file, out_file = sys.argv[2], sys.argv[3], sys.argv[4]
final_word_dict = words_dict(base_link, index_file)
with open(out_file, "wb") as f:
pickle.dump(final_word_dict, f)
elif command_type == "search":
user_query, ranking_dict_path, words_dict_path, user_max_results = \
sys.argv[2], sys.argv[3], sys.argv[4], int(sys.argv[5])
with open(ranking_dict_path, "rb") as f:
ranking_dict_file = pickle.load(f)
with open(words_dict_path, "rb") as f:
words_dict_file = pickle.load(f)
run_search(user_query, ranking_dict_file, words_dict_file,
user_max_results)
else:
print("Illegal command!")
sys.exit()