-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.cjs
74 lines (65 loc) · 2.11 KB
/
index.cjs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
// Run dotenv's config() first: the S3 client below reads its credentials
// from process.env at module load time.
require("dotenv").config();
const AWS = require("aws-sdk");
// S3 client used by the handler to upload the scraped jobs; credentials come
// from the UNISTART_AWS_* environment variables.
const s3 = new AWS.S3({
accessKeyId: process.env.UNISTART_AWS_ACCESS_KEY_ID,
secretAccessKey: process.env.UNISTART_AWS_SECRET_ACCESS_KEY,
});
// Supplies the Chromium launch settings (args, viewport, executable path)
// used by the handler when it is not running locally.
const chromium = require("@sparticuz/chromium");
// chromium-min doesn't bundle the executable with the package
// const chromium = require("@sparticuz/chromium-min");
const { Cluster } = require("puppeteer-cluster");
// Object whose keys are queued as cluster jobs and whose values are invoked
// by the handler as `ScrapingMap[url](page, url)`.
const ScrapingMap = require("./utils/map.cjs");
const { prepareJobData, writeJSONToOutputFile } = require("./utils/utils.cjs");
// Provides the RunningLocally flag that switches between local and deployed behavior.
const Constants = require("./utils/constants.cjs");
exports.handler = async (event, context) => {
let cluster;
if (Constants.RunningLocally) {
cluster = await Cluster.launch({
concurrency: Cluster.CONCURRENCY_CONTEXT,
maxConcurrency: 4,
timeout: 900000,
});
} else {
// use old headless mode
chromium.headless = true;
cluster = await Cluster.launch({
puppeteerOptions: {
args: chromium.args,
defaultViewport: chromium.defaultViewport,
// for using chromium-min and downloading executable from github
// executablePath: await chromium.executablePath(
// "https://github.com/Sparticuz/chromium/releases/download/v116.0.0/chromium-v116.0.0-pack.tar"
// ),
executablePath: await chromium.executablePath(),
headless: chromium.headless,
ignoreHTTPSErrors: true,
},
concurrency: Cluster.CONCURRENCY_CONTEXT,
maxConcurrency: 4,
timeout: 900000,
});
}
const results = [];
await cluster.task(async ({ page, data: url }) => {
const scrapedData = await ScrapingMap[url](page, url);
results.push(...scrapedData);
});
for (const url in ScrapingMap) {
cluster.queue(url);
}
await cluster.idle();
await cluster.close();
const formattedJobItems = prepareJobData(results);
if (Constants.RunningLocally) {
writeJSONToOutputFile("jobs.json", formattedJobItems);
} else {
// log the first few job objects
console.log(formattedJobItems.slice(0, 5));
const params = {
Bucket: "scraped-job-objects",
Key: "jobs.json",
Body: JSON.stringify(formattedJobItems),
};
await s3.putObject(params).promise();
}
};