fetchdists.py
# This program tries to parse distrowatch and create an SVG graph similar to: <https://en.wikipedia.org/wiki/Linux_distribution#/media/File:Linux_Distribution_Timeline_with_Android.svg>
# Copyright (C) 2016 Jappe Klooster
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""
This file does the data collection from distrowatch (or any other site with
a similar html structure)
"""
import strings
from requests import Session
from bs4 import BeautifulSoup
from re import match
from shutil import copyfileobj
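
# The local ``strings`` module is assumed to provide the JSON key constants
# used throughout this file. A hypothetical sketch of its contents (the real
# values live in strings.py and may differ):
#
#     name = "Name"
#     based = "Based on"
#     independend = "Independent"
#     dates = "Dates"
#     status = "Status"
#     active = "Active"
#     image = "Image"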


def jsondumps(item):
    import json
    return json.dumps(item, indent=4)


def fetch_details(arguments):
    """
    Fetch the details of a distribution; can be executed as a separate
    process. IO bound and blocking.
    """
    (baseurl, distribution) = arguments
    # since this is subprocess space we want to reconstruct these based on
    # the passed primitives
    distribution = BeautifulSoup(distribution, "html.parser")
    session = Session()
    session.headers.update({
        'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Gecko/20100101 Firefox/72.0',
    })
    print("downloading and parsing %s" % distribution.a.text)
    aname = distribution.a.get("href")
    hname = distribution.a.text
    aname = hname.split(' ')[0].lower() if not aname else aname
    link = "%s/%s" % (baseurl, aname)
    distrosoup = BeautifulSoup(session.get(link).text, "html.parser")
    structure = {
        strings.name: aname,
        "Human Name": hname,
        "Link": link
    }
    anchor = distrosoup.find('ul')
    for attribute in anchor.find_all('li'):
        if attribute.b is None:
            # no name, probably not a distro
            continue
        name = attribute.b.extract().text[:-1]
        structure[name] = attribute.text[1:].replace("\n", "")

    def sanitize_date(element):
        """Normalize a date cell, padding partial dates if necessary."""
        date = element.text
        if "-" not in date:
            date += "-XX-XX"  # note: this form already exists in the distrowatch input
        return date.replace("XX", "01")

    structure[strings.dates] = list(map(
        sanitize_date,
        distrosoup.find_all("td", class_="Date")
    ))
    url = "%s/%s" % (baseurl, anchor.parent.find_all('img')[-1]['src'])
    print("using image: %s" % url)
    image = session.get(url, stream=True)
    image_name = "%s.png" % aname
    with open(image_name, 'wb') as ifile:
        image.raw.decode_content = True
        copyfileobj(image.raw, ifile)
    structure[strings.image] = image_name
    return jsondumps(structure)
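
# Example call (hypothetical markup, shown only to illustrate the expected
# input shape):
#
#     snippet = '<b>1.</b> <a href="ubuntu">Ubuntu</a>'
#     details_json = fetch_details(("https://distrowatch.com", snippet))
#
# ``fetch_details`` takes a (baseurl, html-fragment) tuple because it is
# normally invoked through ``multiprocessing.Pool.map`` below.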


def fetch_dist_list_from(baseurl, search_options):
    # for debugging...
    def tohtml(lines, outFile="output.html"):
        with open("out/%s" % outFile, "w", encoding='utf8') as f:
            f.writelines(lines)

    session = Session()
    website = session.get('%s/search.php?%s' % (baseurl, search_options)).text
    searchSoup = BeautifulSoup(website, "html.parser")

    def tagfilter(tag):
        return tag.name == "b" and match(r"[0-9]+\.", tag.text)

    # TODO: why build this up as a JSON string here?
    # Why not just return a Python list?
    result = "["
    # some missing root elements
    godfathers = [
        ["android", "2008-10-23"]
    ]
    for godfather in godfathers:
        result += jsondumps({
            strings.name: godfather[0],
            strings.based: strings.independend,
            strings.dates: [godfather[1]],
            strings.status: strings.active,
            strings.image: ""
        }) + ","
    from multiprocessing import Pool
    foundDistributions = searchSoup.find_all(tagfilter)
    with Pool(8) as pool:  # worker processes to use
        result += ",".join(pool.map(
            fetch_details,
            [(baseurl, str(x)) for x in foundDistributions]
        ))
    return result + "]"
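

# A minimal driver, assuming the module is run directly; the query string
# below is a placeholder, the real search options are supplied by the caller
# elsewhere in the repository.
if __name__ == "__main__":
    EXAMPLE_OPTIONS = "ostype=Linux&status=Active"  # hypothetical options
    json_array = fetch_dist_list_from("https://distrowatch.com", EXAMPLE_OPTIONS)
    with open("distributions.json", "w", encoding="utf8") as output:
        output.write(json_array)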