-
Notifications
You must be signed in to change notification settings - Fork 1
/
scraper.js
154 lines (121 loc) · 4.05 KB
/
scraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
const fs = require('fs-extra');
const path = require('path');
const axios = require('axios');
const cheerio = require('cheerio');
// Origin of the site being scraped; get() fetches every request path from it.
const BASE = 'https://pankow.lebensmittel-kontrollergebnisse.de';
// Output root: main() creates one sub-directory per scraped id underneath it.
const resultsDir = path.join(__dirname, 'results', 'pankow');
/**
 * Crawls every page of the search listing and, for each establishment linked
 * from it, stores `<id>.json` plus the report images under
 * `<resultsDir>/<id>/`.
 *
 * Links and images whose URL carries no recognisable id are skipped with a
 * warning, and a missing smiley icon is recorded as `null`, so a single
 * malformed entry does not abort the whole crawl.
 *
 * @returns {Promise<void>} Resolves after the last listing page is processed.
 */
async function main() {
  const visitedPages = new Set();
  let pageUrl = '/Search?filter=';

  // Stop when there is no "Next" link, or when it points at a page that was
  // already crawled (which would otherwise loop forever).
  while (pageUrl && !visitedPages.has(pageUrl)) {
    visitedPages.add(pageUrl);
    const $page = await load(pageUrl);
    const detailUrls = links($page, 'div.card a.btn.btn-light.round-button');

    for (const detailUrl of detailUrls) {
      const idMatch = /\/Ergebnisse\/Detail\/([\w\d]+)/.exec(detailUrl);
      if (!idMatch) {
        console.warn('! Skipping link without a detail id:', detailUrl);
        continue;
      }
      const id = idMatch[1];
      const $ = await load(detailUrl);

      // base metadata
      const name = text($, 'div.card-body div.col-md-8 h3.startpage-h3');
      const kind = text($, 'div.card-body div.col-md-8 div div div.text-small');
      const address = text($, 'div.card-body div.col-md-8 > div');
      console.log('>', name);

      const images = links($, 'div.image-grid a');
      const dir = path.join(resultsDir, id);
      await fs.mkdirp(dir);

      // report images
      const imageFiles = [];
      for (const image of images) {
        const imageMatch = /\?imageid=([\w\d]+)/.exec(image);
        if (!imageMatch) {
          console.warn('! Skipping image without an image id:', image);
          continue;
        }
        const imageId = imageMatch[1];
        // The viewer page lives under /Image/, the raw bytes under /ImageData/
        const imageUrl = image.replace('/Image/', '/ImageData/');
        const { data } = await get(imageUrl, {
          responseType: 'arraybuffer',
          headers: {
            Accept: 'image/avif,image/webp,image/apng,image/*,*/*;q=0.8',
          },
        });
        console.log('>> Downloaded image', imageId);
        const filename = `${imageId}.jpg`;
        await fs.writeFile(path.join(dir, filename), data);
        imageFiles.push(filename);
      }

      // current report results
      const panelCell = (n) =>
        `.bewertung-panel > div > div:nth-child(1) > div:nth-child(${n})`;
      // The date must be read before reportText() runs, because reportText()
      // removes <span> elements from the document.
      const date = formatDate($, panelCell(1));
      const points = reportText($, panelCell(2));
      const conclusion = reportText($, panelCell(3));
      const smileyClasses = $('.bewertung-panel i.far').attr('class');
      const smileyMatch = /color-smiley-(\w+)/.exec(smileyClasses || '');
      const smiley = smileyMatch ? smileyMatch[1] : null;
      const results = { date, points, conclusion, smiley };

      // follow up reports
      const followUps = [];
      const followUpEls = $('div.mt-2.p-3:not(.bewertung-panel)').toArray();
      for (const followUp of followUpEls) {
        const followUpDate = formatDate($, '.text-datum', followUp);
        // Boxes without a parsable date are not follow-up reports
        if (!followUpDate) continue;
        const result = text($, 'div', followUp);
        followUps.push({ date: followUpDate, result });
      }

      // details
      const details = [];
      const rows = $('table.w-100 tr').toArray();
      for (const row of rows) {
        const propertyRaw = text($, 'th', row);
        // Only rows whose header is numbered ("1. ...") are kept
        if (!/\d+\./.test(propertyRaw)) continue;
        const property = propertyRaw.replace(/\d+\. /, '');
        const possiblePoints = text($, 'td:nth-of-type(1)', row);
        const achievedPoints = text($, 'td:nth-of-type(2)', row);
        details.push({ property, achievedPoints, possiblePoints });
      }

      const report = {
        id,
        name,
        kind,
        address,
        imageFiles,
        results,
        followUps,
        details,
      };
      await fs.writeJSON(path.join(dir, `${id}.json`), report);
    }

    pageUrl = $page('a.page-link[aria-label=Next]').attr('href');
  }
}
/**
 * Issues a GET request against the scraped site.
 *
 * @param {string} url - Path or absolute URL to fetch; resolved against BASE.
 * @param {...*} attrs - Extra arguments forwarded unchanged to axios.get.
 * @returns {Promise} Whatever axios.get returns for the resolved URL.
 */
function get(url, ...attrs) {
  // Resolve rather than concatenate, so an absolute href is not prefixed with
  // BASE a second time and a path without a leading slash still gets one.
  const target = new URL(url, BASE).href;
  return axios.get(target, ...attrs);
}
/**
 * Fetches a page through get() and hands its body to cheerio.
 *
 * @param {string} url - Request path passed straight to get().
 * @returns {Promise<*>} The value returned by cheerio.load for the response data.
 */
async function load(url) {
  const response = await get(url);
  const markup = response.data;
  return cheerio.load(markup);
}
/**
 * Collects the `href` attribute of every element matching a selector.
 *
 * @param {Function} $ - Query function of a loaded document.
 * @param {string} selector - Selector for the elements to read.
 * @returns {Array} One entry per matched element, in match order; the entry is
 *   whatever `.attr('href')` yields for that element.
 */
function links($, selector) {
  const hrefs = [];
  for (const node of $(selector).toArray()) {
    hrefs.push($(node).attr('href'));
  }
  return hrefs;
}
/**
 * Returns the cleaned-up text of the first element matched by a query.
 *
 * @param {Function} $ - Query function of a loaded document.
 * @param {...*} selector - Arguments forwarded to `$` (selector and optional
 *   context).
 * @returns {string} The first match's text after it has been run through trim().
 */
function text($, ...selector) {
  const firstMatch = $(...selector).first();
  const rawText = firstMatch.text();
  return trim(rawText);
}
/**
 * Normalises the whitespace of scraped text.
 *
 * @param {string} text - Raw text to clean.
 * @returns {string} The cleaned text.
 */
function trim(text) {
  // A line indented by two or more spaces is pulled up behind the preceding
  // text with a single space; the newline that ended it is consumed as well.
  const joined = text.replace(/\n {2,}(.*)\n/gm, ' $1');
  // Any remaining run of two or more whitespace characters becomes one newline
  const collapsed = joined.replace(/\s{2,}/gm, '\n');
  return collapsed.trim();
}
/**
 * Returns the cleaned-up text of the matched elements with their <span>
 * descendants stripped out.
 *
 * Note: the spans are removed from the document itself, so later queries on
 * the same document no longer see them.
 *
 * @param {Function} $ - Query function of a loaded document.
 * @param {...*} selector - Arguments forwarded to `$`.
 * @returns {string} The remaining text after it has been run through trim().
 */
function reportText($, ...selector) {
  const target = $(...selector);
  const spans = $('span', target);
  spans.remove();
  const remaining = target.text();
  return trim(remaining);
}
/**
 * Reads a German-style `dd.mm.yyyy` date out of the text of the first element
 * matched by a query.
 *
 * The components are parsed explicitly instead of handing the whole text to
 * the Date constructor, whose handling of non-ISO input is implementation
 * defined. Text surrounding the date is ignored.
 *
 * @param {Function} $ - Query function of a loaded document.
 * @param {...*} selector - Arguments forwarded to text().
 * @returns {Date|false} The date at midnight UTC, or `false` when the text
 *   holds no `dd.mm.yyyy` date or the date does not exist in the calendar.
 */
function formatDate($, ...selector) {
  const raw = text($, ...selector);
  const match = /(\d{2})\.(\d{2})\.(\d{4})/.exec(raw);
  if (!match) return false;

  const day = Number(match[1]);
  const month = Number(match[2]);
  const year = Number(match[3]);
  const date = new Date(Date.UTC(year, month - 1, day));

  // Date.UTC rolls impossible dates over (31.02. -> 02.03.); reject those
  const isRealDate =
    date.getUTCFullYear() === year &&
    date.getUTCMonth() === month - 1 &&
    date.getUTCDate() === day;
  return isRealDate ? date : false;
}
// Run the scraper. A failure is reported and turned into a non-zero exit code
// instead of being left as an unhandled rejection.
main()
  .then(() => console.log('Done.'))
  .catch((err) => {
    console.error('Scrape failed:', err);
    process.exitCode = 1;
  });