#! python3
#
# xkcdplay.py - downloads all, or a limited number of, comic images from https://xkcd.com/
#
# A small example of web scraping: parsing pages and downloading files from a site,
# based on an idea described in Al Sweigart's book "Automate the Boring Stuff with Python".
#
# Unlike the original, this script downloads only the files that are missing locally.
# Saved images follow the naming pattern numPage_fileBaseName.ext.
# Scraping starts from the base url (.com) or from a specific page (.com/777).
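#
# For example, an image named woodpecker.png downloaded from page 614 would be
# saved as 614_woodpecker.png (illustrative file name, not checked against the site).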
import logging
import os
import re
from pathlib import Path

import bs4
import requests

logging.basicConfig(filename='log.txt', level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# FUNCTIONS

# Returns a dictionary describing the image files already in the local dir
def getRegistry(dirStorage):
    # Keys are page numbers, values are image file names
    dctRegistry = {}
    dirStorage = Path(dirStorage)
    # Match the numPage_fileBaseName.ext pattern (three-character extensions)
    lstFiles = dirStorage.glob('*[0-9]_*.???')
    for pathFile in lstFiles:
        dctRegistry[pathFile.name.split('_')[0]] = pathFile.name
    return dctRegistry
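# Illustrative sketch: a folder containing a file named 614_woodpecker.png
# would yield {'614': '614_woodpecker.png'} (hypothetical file name).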

# Returns the page number extracted from a url string
def extractNumPage(pathUrl):
    pat = r'^https?://.*/(\d{1,4})/?$'
    if matched := re.search(pat, pathUrl, re.IGNORECASE):
        return matched.group(1)
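# e.g. extractNumPage('https://xkcd.com/614/') returns '614', while a url
# without a trailing page number returns None (illustrative values).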

# Returns the previous page number, extracted from the page's navigation links
def findPrevNumPage(soupObj):
    # Select the <a rel="prev"> element, whose href looks like /pageNum/
    prevLink = soupObj.select('a[rel="prev"]')[0]
    # Strip the slashes to keep the number only
    strNum = prevLink.get('href').replace('/', '')
    return strNum
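# e.g. a navigation link <a rel="prev" href="/613/"> yields '613'; on the first
# comic the href is just '#', which ends the main loop below (illustrative values).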

# Returns the page number, extracted from the page's permanent link
def findNumPage(soupObj):
    pat = r'^https?://.*/(\d{1,4})/?$'
    hrefRegex = re.compile(pat, re.IGNORECASE)
    # Return the number from the first element whose href matches
    for element in soupObj.find_all(href=hrefRegex):
        return hrefRegex.search(element.get('href')).group(1)
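# e.g. a permanent link href of https://xkcd.com/614/ yields '614'
# (illustrative value).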

# Returns the comic image url, or None when the page has no comic image
def findImgUrl(soupObj):
    elemImg = soupObj.select('#comic img')
    if elemImg:
        return f"https:{elemImg[0].get('src')}"
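# e.g. a src of //imgs.xkcd.com/comics/woodpecker.png becomes
# https://imgs.xkcd.com/comics/woodpecker.png (illustrative file name).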

# START
# Starting url
basicUrl = 'https://xkcd.com'
# Folder for saving downloaded images
dirStorage = 'xkcd'
os.makedirs(dirStorage, exist_ok=True)
# Inventory of the files already in local storage
dctLocalStorage = getRegistry(dirStorage)
# Downloaded files limit
numLimit = 5000
i = 1
# Start from the base url or from a specific page
pageUrl = basicUrl  # + '/777'
numPage = None
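# Note: appending '/777' to pageUrl above would start from comic 777 and walk
# backwards from there (illustrative page number).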

# Iterate over the pages of the web site, from the newest comic backwards
while not pageUrl.endswith('#') and i <= numLimit:
    if numPage and int(numPage) <= 0:
        break
    # pageUrl format is like basicUrl/numPage/
    if numPage := extractNumPage(pageUrl):
        # Check whether the current page number is already in local storage
        if numPage in dctLocalStorage:
            # Fast way to get the previous url
            pageUrl = f'{basicUrl}/{int(numPage) - 1}'
            # Skip the current page
            continue
    # Request the current page
    logging.debug(f'Request page data {pageUrl}')
    res = requests.get(pageUrl)
    try:
        res.raise_for_status()
    except Exception as exc:
        # Could not request the page
        logging.debug(f'num: {numPage} url: {pageUrl} error occurred: {exc}')
        if not numPage:
            # No page number to step back from (e.g. the base url failed)
            break
        # Fast way to get the previous url
        pageUrl = f'{basicUrl}/{int(numPage) - 1}'
        # Skip the current page
        continue
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    # If the current page number was not extracted from the url
    if not numPage:
        # Try to find the page number in the page structure
        numPage = findNumPage(soup)
        # Check whether the page number is already present in local storage
        if numPage in dctLocalStorage:
            # Get the previous url from the page navigation
            pageUrl = f'{basicUrl}/{findPrevNumPage(soup)}'
            # Skip the current page
            continue
    # Find the image file url
    if not (imgUrl := findImgUrl(soup)):
        # Nothing to download on the current page
        pageUrl = f'{basicUrl}/{int(numPage) - 1}'
        # Skip the current page
        continue
    # Download the image
    logging.debug(f'downloading image {imgUrl}...')
    res = requests.get(imgUrl)
    try:
        res.raise_for_status()
    except Exception as exc:
        logging.debug(f'num: {numPage} img url: {imgUrl} error occurred: {exc}')
        # Skip the broken image and move on to the previous page
        pageUrl = f'{basicUrl}/{findPrevNumPage(soup)}'
        continue
    # Build the file name with the rule numPage_fileBaseName.ext
    pathFile = os.path.join(dirStorage, f'{numPage}_{os.path.basename(imgUrl)}')
    # Save the image to ./dirStorage
    with open(pathFile, 'wb') as imageFile:
        for chunk in res.iter_content(100000):
            imageFile.write(chunk)
    logging.debug(f'done: {i} num: {numPage} img: {imgUrl} page: {pageUrl} file: {pathFile}')
    print(f'done:\t{i}\nnum:\t{numPage}\nimg:\t{imgUrl}\npage:\t{pageUrl}\nfile:\t{pathFile}')
    # Get the previous page url
    pageUrl = f'{basicUrl}/{findPrevNumPage(soup)}'
    i += 1
# Show result
logging.debug(f'all ({i - 1}) done!')
print(f'\nall ({i - 1}) done!')