-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
82 lines (61 loc) · 2.02 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import re
class Scraper:
    """Base class for scrapers.

    A scraper knows how to download an item and exposes ``save`` so that it
    can be used as one of its processing callbacks.
    """

    async def download(self, session, item):
        """Download ``item`` using ``session``.

        Meant to be overridden: the base implementation always raises
        ``NotImplementedError``.
        """
        raise NotImplementedError

    async def save(self, crawl_manager, item, response):
        """Store ``response`` for ``item`` by awaiting ``crawl_manager.save``.

        N.B. Saving is the scraper's responsibility (and is a scraper
        processing callback) because some crawls may not want to actually
        save a response (e.g. ID discovery).
        """
        pending = crawl_manager.save(item, response)
        await pending
class FakeResponse:
    """Fake aiohttp response object for NullScraper/tests.

    It always reports status 200 and an empty text body.
    """

    @property
    def status(self):
        """Return the fixed status code ``200``."""
        return 200

    async def text(self):
        """Return an empty string as the response body."""
        body = ""
        return body

    async def release(self):
        """Do nothing; there is no underlying connection to release."""
        return None
class NullScraper(Scraper):
    """Scraper that downloads nothing and has no processing callbacks."""

    @property
    def processing_callbacks(self):
        """Return an empty list: no callbacks run on the fake response."""
        return list()

    async def download(self, session, item):
        """Ignore ``session`` and ``item`` and return a new ``FakeResponse``."""
        fake = FakeResponse()
        return fake
class SimpleScraper(Scraper):
    """Scraper that fetches each item as-is and queues regex matches from it."""

    def __init__(self, link_regex: str):
        """Compile ``link_regex``, the pattern applied to each response body."""
        pattern = re.compile(link_regex)
        self._regex = pattern

    @property
    def processing_callbacks(self):
        """Return the callbacks in run order: link discovery, then saving."""
        return [self.add_new_links, self.save]

    async def download(self, session, item):
        """Return the awaited result of ``session.get(item)``."""
        response = await session.get(item)
        return response

    async def add_new_links(self, crawl_manager, item, response):
        """Pass every regex match in the response text to the crawl manager.

        ``item`` is accepted to match the callback signature but is not used.
        """
        body = await response.text()
        matches = self._regex.findall(body)
        await crawl_manager.add_new_items(matches)
class IDScraper(Scraper):
    """Scraper whose items are formatted into a URL template before fetching."""

    def __init__(self, download_url_fmt: str, id_regex: str):
        """Store the URL format string and compile ``id_regex``.

        ``download_url_fmt`` is later used via ``str.format(item)``, so it
        should contain a single positional placeholder.
        """
        self._regex = re.compile(id_regex)
        self.download_url_fmt = download_url_fmt

    @property
    def processing_callbacks(self):
        """Return the callbacks in run order: ID discovery, then saving."""
        return [self.add_new_ids, self.save]

    async def download(self, session, item):
        """Format ``item`` into the URL template and return the awaited GET."""
        url = self.download_url_fmt.format(item)
        response = await session.get(url)
        return response

    async def add_new_ids(self, crawl_manager, item, response):
        """Pass every regex match in the response text to the crawl manager.

        ``item`` is accepted to match the callback signature but is not used.
        """
        body = await response.text()
        matches = self._regex.findall(body)
        await crawl_manager.add_new_items(matches)