crawlerConductor.js (forked from duckduckgo/tracker-radar-collector)
const os = require('os');
const cores = os.cpus().length;
const chalk = require('chalk').default;
const async = require('async');
const crawl = require('./crawler');
const URL = require('url').URL;
const {createTimer} = require('./helpers/timer');
const createDeferred = require('./helpers/deferred');
// eslint-disable-next-line no-unused-vars
const BaseCollector = require('./collectors/BaseCollector');
const notABot = require('./helpers/notABot');

const MAX_NUMBER_OF_CRAWLERS = 38; // by trial and error, there seem to be network bandwidth issues with more than 38 browsers
const MAX_NUMBER_OF_RETRIES = 2;

/**
 * Crawls a single URL with fresh collector instances and passes the result to dataCallback.
 *
 * @param {string} urlString
 * @param {BaseCollector[]} dataCollectors
 * @param {function} log
 * @param {boolean} filterOutFirstParty
 * @param {function(URL, import('./crawler').CollectResult): void} dataCallback
 * @param {boolean} emulateMobile
 * @param {string} proxyHost
 * @param {boolean} antiBotDetection
 */
async function crawlAndSaveData(urlString, dataCollectors, log, filterOutFirstParty, dataCallback, emulateMobile, proxyHost, antiBotDetection) {
    const url = new URL(urlString);
    /**
     * @type {function(...any):void}
     */
    const prefixedLog = (...msg) => log(chalk.gray(`${url.hostname}:`), ...msg);

    const data = await crawl(url, {
        log: prefixedLog,
        // @ts-ignore
        collectors: dataCollectors.map(collector => new collector.constructor()),
        filterOutFirstParty,
        emulateMobile,
        proxyHost,
        runInEveryFrame: antiBotDetection ? notABot : undefined
    });

    dataCallback(url, data);
}

/**
 * @param {{urls: string[], dataCallback: function(URL, import('./crawler').CollectResult): void, dataCollectors?: BaseCollector[], failureCallback?: function(string, Error): void, numberOfCrawlers?: number, logFunction?: function, filterOutFirstParty: boolean, emulateMobile: boolean, proxyHost: string, antiBotDetection?: boolean}} options
 */
module.exports = options => {
    const deferred = createDeferred();
    const log = options.logFunction || (() => {});
    const failureCallback = options.failureCallback || (() => {});

    // Default to 80% of available cores, capped by the hard limit and the number of URLs.
    let numberOfCrawlers = options.numberOfCrawlers || Math.floor(cores * 0.8);
    numberOfCrawlers = Math.min(MAX_NUMBER_OF_CRAWLERS, numberOfCrawlers, options.urls.length);

    // Increase number of listeners so we have at least one listener for each async process
    if (numberOfCrawlers > process.getMaxListeners()) {
        process.setMaxListeners(numberOfCrawlers + 1);
    }

    log(chalk.cyan(`Number of crawlers: ${numberOfCrawlers}\n`));

    // Process URLs with at most numberOfCrawlers crawls running concurrently.
    async.eachOfLimit(options.urls, numberOfCrawlers, (urlString, idx, callback) => {
        log(chalk.cyan(`Processing entry #${Number(idx) + 1} (${urlString}).`));
        const timer = createTimer();

        const task = crawlAndSaveData.bind(null, urlString, options.dataCollectors, log, options.filterOutFirstParty, options.dataCallback, options.emulateMobile, options.proxyHost, (options.antiBotDetection !== false));

        // Each URL gets up to MAX_NUMBER_OF_RETRIES attempts before it is reported as a failure.
        async.retry(MAX_NUMBER_OF_RETRIES, task, err => {
            if (err) {
                log(chalk.red(`Max number of retries (${MAX_NUMBER_OF_RETRIES}) exceeded for "${urlString}".`));
                failureCallback(urlString, err);
            } else {
                log(chalk.cyan(`Processing "${urlString}" took ${timer.getElapsedTime()}s.`));
            }

            callback();
        });
    }, err => {
        if (err) {
            deferred.reject(err);
        } else {
            deferred.resolve();
        }
    });

    return deferred.promise;
};
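
// Usage sketch (not part of the original module): how a caller might drive this
// conductor. The RequestCollector require path and the callback shapes below are
// assumptions based on the upstream tracker-radar-collector project; adjust them
// to whatever collectors exist in this fork.
//
// const runCrawlers = require('./crawlerConductor');
// const RequestCollector = require('./collectors/RequestCollector');
//
// runCrawlers({
//     urls: ['https://example.com/', 'https://example.org/'],
//     dataCollectors: [new RequestCollector()],
//     dataCallback: (url, data) => console.log(url.href, data),
//     failureCallback: (urlString, err) => console.error(`Failed ${urlString}:`, err),
//     numberOfCrawlers: 4,
//     logFunction: console.log,
//     filterOutFirstParty: false,
//     emulateMobile: false,
//     proxyHost: '',
//     antiBotDetection: true
// }).then(() => console.log('All crawls finished.'));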