-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathscraper.py
76 lines (64 loc) · 1.85 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/python3
import re
import sys
import ssl
import time
import json
import ntpath
import importlib
from random import randrange
from math import floor
from urllib.request import urlopen
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from PIL import Image, ImageStat
from multiprocessing.dummy import Pool as ThreadPool
from luminosity import get_brightness
def get_filename(path):
    """Return the final component of ``path``.

    The path is split with ``ntpath``, which accepts both ``/`` and ``\\``
    as separators. If the path ends in a separator, so that the final
    component is empty, the basename of the remaining head is returned
    instead.
    """
    directory, leaf = ntpath.split(path)
    if leaf:
        return leaf
    return ntpath.basename(directory)
# Number of images whose processing has started; drives the progress output.
i = 0


def process_image(image_tag):
    """Download one image and return its metadata as a dict.

    Reads the module-level ``siteurl`` (base for resolving relative URLs),
    ``sslctx`` (SSL context passed to ``urlopen``) and ``image_nr`` (total
    number of images, used for the progress percentage).

    Side effects: increments the global counter ``i``, prints a ``[STEP]`` /
    ``[PROGRESS]`` message, and rewrites ``image_tag['src']`` to an absolute
    URL.

    Args:
        image_tag: An ``<img>`` tag object supporting ``tag['src']`` item
            access and ``tag.get('alt')``.

    Returns:
        A dict with the keys ``name``, ``src``, ``alt``, ``width``,
        ``height``, ``format``, ``size`` and ``brightness``. ``alt`` is
        ``None`` when the tag has no ``alt`` attribute. ``size`` is the raw
        ``Content-Length`` header value, or ``None`` if the server did not
        send one.

    Raises:
        Whatever ``urlopen`` or ``Image.open`` raise for an unreachable URL
        or unreadable image data; errors are not handled here.
    """
    global i
    raw_src = image_tag['src']
    name = get_filename(raw_src)

    # NOTE(review): the counter is updated without a lock, so when this
    # function runs on several threads the reported percentage is only
    # approximate. It affects nothing but the progress display.
    i = i + 1
    complete = floor(100.0 * i / image_nr)
    print(f'[STEP] Getting data for image {name:s}\n[PROGRESS] {complete:d}')

    # Force absolute URLs.
    image_tag['src'] = urljoin(siteurl, raw_src)

    # Get image size. The response is closed as soon as the metadata has
    # been extracted; the timeout keeps one stalled server from blocking a
    # worker forever.
    with urlopen(image_tag['src'], timeout=10, context=sslctx) as image:
        image_res = Image.open(image)
        return {
            'name': name,
            'src': image_tag['src'],
            # <img> tags without an alt attribute must not abort the run.
            'alt': image_tag.get('alt'),
            'width': image_res.size[0],
            'height': image_res.size[1],
            'format': image_res.format,
            'size': image.headers['content-length'],
            'brightness': get_brightness(image_res)
        }
# The page URL is required as the first command-line argument.
# NOTE(review): error paths exit with status 0; presumably the consumer of
# this script keys off the "[ERROR]" line rather than the exit code —
# confirm before changing it.
if len(sys.argv) < 2:
    print('[ERROR] No URL provided.')
    raise SystemExit(0)

# Scrape images from URL.
siteurl = sys.argv[1]
# NOTE(security): certificate verification is disabled for every request
# made with this context, and the factory is a private ``ssl`` API.
sslctx = ssl._create_unverified_context()

try:
    html = urlopen(siteurl, timeout=10, context=sslctx)
except (IOError, ValueError) as err:
    # ValueError covers malformed URLs (e.g. a missing scheme), which
    # urlopen rejects before any connection attempt.
    print('[ERROR] Could not connect to URL:')
    print(err)
    raise SystemExit(0)

# Parse the page, then release the connection.
with html:
    bs = BeautifulSoup(html, 'html.parser')

# Keep only <img> tags whose src contains a JPEG/PNG/GIF file extension.
# The dot is escaped and the alternatives are grouped so that a path that
# merely contains "png" or "gif" (e.g. "/gifs/logo.svg") is not selected;
# matching ignores case so ".JPG" and ".PNG" are included.
image_tags = bs.find_all('img', {
    'src': re.compile(r'\.(?:jpe?g|png|gif)', re.IGNORECASE)
})
image_nr = len(image_tags)

# Start processing.
print(f'[START] Fetching data for {image_nr:d} images...')
# The context manager releases the worker threads once map() has returned.
with ThreadPool(4) as pool:
    images = pool.map(process_image, image_tags)

# Output JSON.
print(f'[JSON] {json.dumps(images):s}')