-
Notifications
You must be signed in to change notification settings - Fork 1
/
web_graph.py
82 lines (55 loc) · 1.77 KB
/
web_graph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""
@author: Sriram Veturi
@title: SmartSearch - An Intelligent Search Engine.
@date: 05/06/2019
"""
import os
import json
from page_rank import get_page_ranks
# Not referenced by any code in the visible part of this module.
GRAPH_DIR = "./web_graph"
# Directory listed by build_web_graph(); every entry is opened and parsed as JSON.
# The path is relative, so it resolves against the current working directory.
DATA_DIR = "./documents"
# Directory into which get_web_page_ranks() writes "web_ranks.json".
PAGE_RANKS_DIR = "./web_page_ranks"
def build_web_graph(data_dir=None):
    """
    Function to build the web graph from the crawled documents.

    Every regular file found directly inside the data directory is parsed as
    a JSON object and must provide the keys "URL" and "OUTGOING_LINKS".
    Sub-directories are ignored.

    :param data_dir: Directory that holds the crawled JSON documents.
                     Defaults to the module-level DATA_DIR.
    :return web_graph: Dictionary mapping every URL to its outgoing links.
    :raises KeyError: If a document lacks "URL" or "OUTGOING_LINKS".
    :raises ValueError: If a file does not contain valid JSON.
    :raises OSError: If the directory or one of its files cannot be read.
    """
    # Resolve the default lazily so that callers may point at any directory.
    if data_dir is None:
        data_dir = DATA_DIR

    # Build a dictionary with URLs and their outgoing links.
    web_graph = dict()

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # insertion order of the graph reproducible between runs and machines.
    for json_file_name in sorted(os.listdir(data_dir)):
        json_file_path = os.path.join(data_dir, json_file_name)

        # Directories cannot be opened as documents, so skip them.
        if not os.path.isfile(json_file_path):
            continue

        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(json_file_path, "r", encoding="utf-8") as jf:
            # Load JSON as dictionary.
            url_data_dict = dict(json.load(jf))

        # Add node to web_graph: URL -> outgoing links.
        web_graph[url_data_dict["URL"]] = url_data_dict["OUTGOING_LINKS"]

    return web_graph
def get_web_page_ranks():
    """
    Function to compute the page ranks of the crawled web and store them.

    Builds the web graph with build_web_graph(), hands it to get_page_ranks()
    and dumps whatever that call returns as JSON into
    PAGE_RANKS_DIR/web_ranks.json.

    :return None: The ranks are written to disk; nothing is returned.
    :raises OSError: If the ranks directory cannot be created or the ranks
                     file cannot be written.
    """
    # Make sure the directory that stores the ranks exists.
    if os.path.isdir(PAGE_RANKS_DIR):
        print("Directory to store the ranks already exists. Moving on.")
    else:
        # exist_ok tolerates the directory appearing between the check above
        # and this call. Any other failure is raised here, before the ranks
        # are computed, rather than later when the output file is opened.
        os.makedirs(PAGE_RANKS_DIR, exist_ok=True)
        print("Directory created to store the ranks.")

    # First, build the web graph.
    web_graph = build_web_graph()
    print("Web Graph Built.")

    # Now that we have the web graph, get page ranks of all of its nodes.
    web_page_ranks = get_page_ranks(web_graph)
    print("Ranks calculated.")

    # Write the computed ranks to a single JSON file.
    document_name = "web_ranks.json"
    with open(os.path.join(PAGE_RANKS_DIR, document_name), "w") as ranks_file:
        json.dump(web_page_ranks, ranks_file)
# Runs the whole pipeline (build graph, rank, write JSON) every time this
# module is executed -- there is no __main__ guard, so importing it triggers
# the run as well.
get_web_page_ranks()