"""
Web Crawler Security Tool v2.0.0
A simple Python web crawler that can follow links and
obtain all the structure of a website, including files.
"""
# pylint: disable=line-too-long
import os
import re
import pickle
import logging
from collections import deque
from urllib.parse import urlparse
import requests
from requests.exceptions import ConnectionError as RequestsConnectionError
from lib.fetch_website import fetch_website
from lib.parse_website import find_all_links
from lib.utils import store_set_to_file
from lib.utils import load_set_from_file
from lib.utils import load_queue_from_file
from lib.utils import add_url_to_set
from lib.utils import add_url_to_queue
from lib.utils import create_parser
def setup_logging(verbose, debug, url):
"""
Set up the logging configuration. Verbose and debug flags adjust the logging level.
    The URL (the crawled host) is used to create a separate log file for each target.
"""
log_directory = "logs"
    # DEBUG when --debug is set, INFO when --verbose is set, WARNING otherwise
    level = logging.DEBUG if debug else (logging.INFO if verbose else logging.WARNING)
    # Keep word characters, hyphens, dots and spaces in the URL; replace everything
    # else with an underscore so the value is safe to use as a file name
    # (e.g. 'example.com:8080' becomes 'example.com_8080')
    sanitized_url = re.sub(r'[^\w\-_\. ]', '_', url)
# Create the logs directory if it doesn't exist
os.makedirs(log_directory, exist_ok=True)
log_filename = f"{log_directory}/{sanitized_url}.log"
# Create a logger
logger = logging.getLogger()
logger.setLevel(level)
    # One formatter shared by both handlers; %(msecs)03d appends zero-padded milliseconds
    log_formatter = logging.Formatter('%(asctime)s.%(msecs)03d - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
    # File handler for logging to a file
    file_handler = logging.FileHandler(log_filename, mode='a')
    file_handler.setFormatter(log_formatter)
    logger.addHandler(file_handler)
    # Console handler for logging to the terminal (stderr by default)
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(log_formatter)
    logger.addHandler(console_handler)
def main():
"""
Main function for the crawler program. Parses command line arguments and starts the crawling process.
"""
parser = create_parser()
args = parser.parse_args()
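    # Bookkeeping collections for the crawl:
    #   urls_parsed - URLs fetched successfully (counted against the crawl limit)
    #   urls_queued - BFS frontier of URLs still to be fetched
    #   urls_failed - URLs that returned a non-OK response
    #   urls_extern - URLs pointing outside the base domain (not crawled)
    #   urls_errors - URLs that raised an exception while being processed
    #   urls_files  - non-HTML resources, stored but not parsed for links
    #   urls_seen   - every URL encountered, used to avoid queueing duplicates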
urls_parsed = set()
urls_queued = deque()
urls_failed = set()
urls_extern = set()
urls_errors = set()
urls_files = set()
urls_seen = set()
total_content_size = 0
    # Parse the URL once to get the host (netloc) and scheme, which are used to
    # name the log/session files and to reconstruct relative URLs found in the
    # HTML content.
    parsed_url = urlparse(args.url)
    base_url = parsed_url.netloc
    base_scheme = parsed_url.scheme
setup_logging(args.verbose, args.debug, base_url)
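    # The per-site session files under logs/ appear to be pickled by the lib.utils
    # helpers: the top-level handler at the bottom of this module reports
    # pickle.UnpicklingError when a saved session cannot be restored.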
# Check if the session needs to be resumed or else start from scratch
if args.resume:
urls_parsed = load_set_from_file(f"logs/{base_url}_urls_parsed.log", urls_seen)
urls_failed = load_set_from_file(f"logs/{base_url}_urls_failed.log", urls_seen)
urls_extern = load_set_from_file(f"logs/{base_url}_urls_extern.log", urls_seen)
urls_errors = load_set_from_file(f"logs/{base_url}_urls_errors.log", urls_seen)
urls_files = load_set_from_file(f"logs/{base_url}_urls_files.log", urls_seen)
urls_queued = load_queue_from_file(f"logs/{base_url}_urls_queued.log", urls_seen)
logging.info('Resuming web crawling session: Crawled: %i, Queued: %i, Failed: %i, Files: %i, External: %i, Errors: %i',
len(urls_parsed),
len(urls_queued),
len(urls_failed),
len(urls_files),
len(urls_extern),
len(urls_errors)
)
else:
# Process the root URL
add_url_to_queue(args.url, urls_queued, urls_seen)
logging.info('Web crawling starting on base URL %s (%s)', args.url, base_url)
# Limit the URLs processed according to the input limit
    # Create one session and reuse it for all the HTTP requests
    with requests.Session() as session:
        # Limit the number of URLs processed according to the input limit
        while len(urls_parsed) < args.crawl_limit and len(urls_queued) > 0:
            try:
# Breadth-first search
current_url = urls_queued.popleft()
add_url_to_set(current_url, urls_seen)
# Default size if there's no content
content_size_kb = 0
try:
# Crawl URL
response = fetch_website(session, current_url, args.username, args.password)
                except RequestsConnectionError:
add_url_to_queue(current_url, urls_queued, urls_seen)
logging.error('Error fetching the website. Connectivity issues. Stopping. Resume with --resume')
break
if not response or not response.ok:
# If response is not ok, mark URL as failed
add_url_to_set(current_url, urls_failed)
continue
# Depending on the response status, store the URL in the correct set.
# We are here if response is ok
add_url_to_set(current_url, urls_parsed)
try:
total_content_size += len(response.content)
content_size_kb = len(response.content) / 1024
                except Exception:
                    # If we cannot calculate the size, keep the default of 0
                    pass
                logging.info('CRAWLED - %s - %s - %.2f KB', current_url, response.status_code, content_size_kb)
                # Follow redirects: queue the Location target (if unseen) instead of parsing the body
                redirection_url = response.headers.get('Location')
                if redirection_url is not None:
                    if redirection_url not in urls_seen:
                        add_url_to_queue(redirection_url, urls_queued, urls_seen)
                    continue
# Only parse the HTML responses, ignore the rest.
content_type = response.headers.get('Content-Type', '').lower()
if 'text/html' not in content_type:
add_url_to_set(current_url, urls_files)
logging.debug('FILES - %s', current_url)
continue
                # Parse the response content to find all outlinks from the HTML response
try:
found_urls = find_all_links(response.content, base_scheme, base_url)
logging.debug('Found %i new URLs', len(found_urls))
except Exception as err:
logging.error('Exception found in find_all_links(): %s', err)
continue
for new_url in found_urls:
# Only process those URLs that have not been parsed
if new_url not in urls_seen:
found_base_url = urlparse(new_url).netloc
                        # Internal if the host is the base domain or one of its subdomains
                        if found_base_url == base_url or found_base_url.endswith('.' + base_url):
add_url_to_queue(new_url, urls_queued, urls_seen)
logging.debug('FETCHED - %s', new_url)
continue
# Other links are external
add_url_to_set(new_url, urls_extern)
logging.debug('EXTERNAL - %s', new_url)
except KeyboardInterrupt:
add_url_to_queue(current_url, urls_queued, urls_seen)
break
except Exception as err:
logging.error('Error processing URL: %s (%s)', current_url, err)
urls_errors.add(current_url)
continue
# Log summary of the results
    logging.info('SUMMARY - Crawled: %i, Queued: %i, Failed: %i, Files: %i, External: %i, Errors: %i, Total downloaded: %.2f KB',
len(urls_parsed),
len(urls_queued),
len(urls_failed),
len(urls_files),
len(urls_extern),
len(urls_errors),
total_content_size/1024
)
# Store sets to disk
store_set_to_file(urls_queued, 'logs', f'{base_url}_urls_queued')
store_set_to_file(urls_parsed, 'logs', f'{base_url}_urls_parsed')
store_set_to_file(urls_failed, 'logs', f'{base_url}_urls_failed')
store_set_to_file(urls_errors, 'logs', f'{base_url}_urls_errors')
store_set_to_file(urls_extern, 'logs', f'{base_url}_urls_extern')
store_set_to_file(urls_files, 'logs', f'{base_url}_urls_files')
if __name__ == "__main__":
try:
main()
except FileNotFoundError:
logging.error('No session to resume from. Exiting.')
except pickle.UnpicklingError:
logging.error('Session cannot be restored. Pickle file seems corrupt.')