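"""Scrape https://quotes.toscrape.com and write the collected quotes and
author details to quotes.json and authors.json.

A minimal seed script; requires requests, beautifulsoup4 and lxml
(pip install requests beautifulsoup4 lxml).
"""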
from collections import defaultdict
from json import dump
from logging import basicConfig, INFO, info
from re import search

from bs4 import BeautifulSoup
from requests import get


def scrape(path: str) -> BeautifulSoup | None:
    """Fetch a page relative to the site root; return parsed HTML or None."""
    url = 'https://quotes.toscrape.com' + path
    info(f'Parsing {url}')
    response = get(url, timeout=10)
    return BeautifulSoup(response.text, 'lxml') \
        if response.status_code == 200 else None


def main() -> None:
    basicConfig(level=INFO, format='%(message)s')
    # Matches e.g. <span class="text" itemprop="text"> and
    # <small class="author" itemprop="author">.
    ATTRIBUTES = {
        field: {key: field for key in ('class', 'itemprop')}
        for field in ('author', 'text')
    }
    results = defaultdict(list)
    path = '/'
    # Walk the paginated listing, collecting quotes and author page links.
    while (page := scrape(path)):
        nodes = page.find_all(
            'div',
            {
                'class': 'quote',
                'itemtype': 'http://schema.org/CreativeWork'
            },
        )
        for quote in nodes:
            phrase = quote.find('span', ATTRIBUTES['text'])
            wrapper = phrase.find_next_sibling('span')
            author_name = wrapper.find('small', ATTRIBUTES['author']).text
            author_link = wrapper.select('a[href^="/author/"]')[0]
            # Sanity check: stripped of newlines, the byline span should
            # read 'by <name>(about)'; skip quotes with unexpected markup.
            author_info = f'by {author_name}{author_link.text}'
            if wrapper.text.replace('\n', '') != author_info:
                continue
            results['authors'].append(author_link['href'])
            tags = quote.find('div', class_='tags') \
                .select('a.tag[href^="/tag/"]')
            results['quotes'].append({
                'tags': [tag.text for tag in tags],
                'author': author_name,
                'quote': phrase.text,
            })
        # Follow the pager's 'Next' link until it disappears or stops
        # matching the expected /page/<n>/ pattern.
        if not (
            (nodes := page.select('nav > ul.pager > li.next > a')) and
            search(r'^/page/\d+/$', path := nodes[0]['href'])
        ):
            break
    # Visit each unique author page and extract the profile fields.
    paths = set(results['authors'])
    results['authors'] = []
    for path in paths:
        if not (page := scrape(path)):
            continue
        wrapper = page.find('div', class_='author-details')
        author = {}
        for name in ('title', 'born-date', 'born-location', 'description'):
            field = 'fullname' if name == 'title' else name.replace('-', '_')
            author[field] = wrapper.select(f'.author-{name}')[0].text.strip()
            if name == 'born-location':
                # Drop the leading 'in ' from e.g. 'in Ulm, Germany'.
                author[field] = author[field][3:]
        results['authors'].append(author)
    # Write quotes.json and authors.json.
    for kind, result in results.items():
        with open(f'{kind}.json', 'w', encoding='utf-8') as file:
            dump(result, file, indent=2, ensure_ascii=False)
        info(f'Found {len(result)} {kind}.')


if __name__ == '__main__':
    main()
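
# Usage: run directly, e.g. `python seed.py`. The script logs each page
# it parses and reports how many quotes and authors it found.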