magnet_crawler.py
import json
import os
import re
import sys

import requests

# Configuration
cookie = ''               # paste a session cookie here if the site requires login
max_depth = 40            # maximum crawl recursion depth
viewed_urls = []          # URLs already visited in this run
found_magnets = []        # magnet links collected so far, including past runs
ignore_url_param = True   # strip '?...' query strings when normalizing links
ignore_html_label = True  # strip HTML tags before scanning for info-hashes

session = requests.Session()
session.headers.update({'Cookie': cookie})

# Resume support: reload previously saved resources so reruns skip magnets
# that were already recorded.
resource_list = []
if os.path.exists('resource_list.json'):
    with open('resource_list.json', 'r', encoding='utf-8') as json_file:
        resource_list = json.loads(json_file.read())
    for resource in resource_list:
        found_magnets.extend(resource['magnets'])
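
# resource_list.json holds a list of {'title', 'magnets'} records written by
# save_json_to_file() below, e.g. (illustrative values):
# [
#     {
#         "magnets": ["magnet:?xt=urn:btih:<40-hex-hash>"],
#         "title": "Example Page"
#     }
# ]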

def scan_page(url, depth=0, retries=0):
    """Recursively crawl `url`, collecting magnet links and same-domain sub-links."""
    if url in viewed_urls:
        return
    if depth > max_depth:
        return
    print('Entering: ' + url)
    sys.stdout.flush()
    try:
        result = session.get(url, timeout=60)
        # 4xx pages are kept (they may still contain links); 5xx raises so the
        # request can be retried.
        if not (400 <= result.status_code < 500):
            result.raise_for_status()
        viewed_urls.append(url)
    except Exception:
        # Give up after a few attempts so a dead URL cannot recurse forever.
        if retries < 3:
            scan_page(url, depth, retries + 1)
        return
    result_text = result.text
    magnet_list = get_magnet_links(result_text)
    sub_urls = get_sub_urls(result_text, url)
    page_title = get_page_title(result_text)
    new_resource = {'title': page_title, 'magnets': magnet_list}
    if new_resource in resource_list:
        # Page already recorded; still follow its links.
        for sub_url in sub_urls:
            scan_page(sub_url, depth + 1)
        return
    if len(magnet_list) > 0:
        append_title_to_file(page_title, 'magnet_output')
    for magnet in magnet_list:
        print('Found magnet: ' + magnet)
        sys.stdout.flush()
        append_magnet_to_file(magnet, 'magnet_output')
    resource_list.append(new_resource)
    remove_duplicated_resources()
    save_json_to_file('resource_list.json')
    for sub_url in sub_urls:
        scan_page(sub_url, depth + 1)
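
# scan_page() walks the site depth-first: a page's magnets are appended to the
# output file before its links are followed, so results appear as soon as they
# are found rather than at the end of the crawl.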

def get_sub_urls(result_text, url):
    """Extract, normalize, and filter <a href> targets to same-domain absolute URLs."""
    urls = set(re.findall(r'<a.*?href=[\'"](.*?)[\'"].*?>', result_text))
    sub_urls = []
    for sub_url in urls:
        sub_url = sub_url.strip()
        if sub_url == '':
            continue
        if 'javascript:' in sub_url or 'mailto:' in sub_url:
            continue
        if sub_url[0:4] == 'http':
            # Absolute URL (http/https): keep it only if it stays on the
            # starting domain.
            try:
                if get_url_prefix(sub_url)[1] != get_url_prefix(url)[1]:
                    continue
            except Exception:
                continue
        elif sub_url[0:1] == '/':
            # Root-relative URL: prepend the scheme and domain.
            sub_url = get_url_prefix(url)[0] + '://' + get_url_prefix(url)[1] + sub_url
        else:
            # Page-relative URL: resolve against the current page.
            sub_url = url + '/' + sub_url
        sub_url = re.sub(r'#.*$', '', sub_url)       # drop fragments
        sub_url = re.sub(r'//$', '/', sub_url)       # collapse a trailing double slash
        if ignore_url_param:
            sub_url = re.sub(r'\?.*$', '', sub_url)  # drop query strings
        if sub_url not in viewed_urls:
            sub_urls.append(sub_url)
    return sub_urls
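
# Illustrative example of the normalization above (hypothetical page):
#   get_sub_urls('<a href="/foo?page=2#top">x</a>', 'http://example.com/bar')
#   returns ['http://example.com/foo'] -- the fragment is always dropped, and
#   the query string too because ignore_url_param is True.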

def get_url_prefix(url):
    """Split a URL into (scheme, domain), e.g. ('http', 'example.com')."""
    domain_match = re.search(r'(.*?)://(.*?)/', url)
    if domain_match:
        return (domain_match.group(1), domain_match.group(2))
    # No path component after the domain, e.g. 'http://example.com'.
    domain_match = re.search(r'(.*?)://(.*)$', url)
    return (domain_match.group(1), domain_match.group(2))

def get_magnet_links(result_text):
    """Find BitTorrent info-hashes (40-char hex or 32-char base32) and turn
    them into magnet links that have not been seen before."""
    if ignore_html_label:
        # Strip HTML tags, then re-join hash halves that markup or whitespace
        # split apart (two alphanumeric runs separated by 5-30 other characters).
        result_text = re.sub(r'<[\s\S]*?>', '', result_text)
        result_text = re.sub(
            r'([^0-9a-zA-Z])([0-9a-zA-Z]{5,30})[^0-9a-zA-Z]{5,30}([0-9a-zA-Z]{5,30})([^0-9a-zA-Z])',
            r'\1\2\3\4', result_text)
    hashes = list(set(re.findall(r'[^0-9a-fA-F]([0-9a-fA-F]{40})[^0-9a-fA-F]', result_text)))
    hashes.extend(set(re.findall(r'[^0-9a-zA-Z]([0-9a-zA-Z]{32})[^0-9a-zA-Z]', result_text)))
    magnets = list(set(
        ('magnet:?xt=urn:btih:' + hash_value).lower()
        for hash_value in hashes
        if ('magnet:?xt=urn:btih:' + hash_value).lower() not in found_magnets
    ))
    found_magnets.extend(magnets)
    return magnets
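
# A minimal sketch of the conversion (the hash below is a made-up placeholder,
# not a real torrent):
#   get_magnet_links('hash: 0123456789abcdef0123456789abcdef01234567 ')
#   returns ['magnet:?xt=urn:btih:0123456789abcdef0123456789abcdef01234567']
#   on the first call; hashes already seen are filtered out via found_magnets.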

def get_page_title(result_text):
    match = re.search(r'<title>(.+?)</title>', result_text)
    if match:
        return match.group(1).strip()
    else:
        return ''

def append_magnet_to_file(magnet, filename):
    with open(filename, 'a+', encoding='utf-8') as output_file:
        output_file.write(magnet + '\n')

def append_title_to_file(title, filename):
    with open(filename, 'a+', encoding='utf-8') as output_file:
        output_file.write(title + '\n')

def remove_duplicated_resources():
    global resource_list
    new_resource_list = []
    for resource in resource_list:
        add_flag = True
        for added_resource in new_resource_list:
            if added_resource['title'] == resource['title']:
                add_flag = False
                added_resource['magnets'].extend(resource['magnets'])
                added_resource['magnets'] = list(set(added_resource['magnets']))
                break
        if add_flag:
            new_resource_list.append(resource)
    resource_list = new_resource_list
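
# For example (illustrative titles and links): two entries titled "Example"
# with magnets [a] and [a, b] collapse into one entry whose magnet list is the
# deduplicated union of both; note that set() does not preserve order.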

def save_json_to_file(filename):
    with open(filename, 'w+', encoding='utf-8') as output_file:
        output_file.write(json.dumps(resource_list, indent=4, sort_keys=True, ensure_ascii=False))

def main():
    print('Enter a website url to start.')
    root_url = input()
    if '://' not in root_url:
        root_url = 'http://' + root_url
    # Uncomment to truncate the output file at the start of a run:
    # with open('magnet_output', 'w+') as output_file:
    #     output_file.write('')
    scan_page(root_url)

if __name__ == '__main__':
    main()
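
# Typical session (hypothetical site name):
#   $ python3 magnet_crawler.py
#   Enter a website url to start.
#   example-tracker.com
# Magnets and page titles are appended to 'magnet_output', and the crawl state
# is saved to 'resource_list.json' so an interrupted run can resume later.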