-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathamazon.js
93 lines (68 loc) · 2.65 KB
/
amazon.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
const puppeteer = require('puppeteer');
const { amazon: CREDS } = require('./credentials.json');
// Amazon notebook page; loginToAmazon navigates here before signing in.
const AMAZON_NOTEBOOKS_URL = 'https://read.amazon.com/notebook';
// User-Agent string (Firefox 61 on macOS) that the scraper sets on its page.
const USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0';
// Selectors used for extracting highlights.
// BOOK_SELECTOR matches the entries that extractHighlights clicks one by one;
// the other three are read inside the page by extractHighlightsOfBook.
const BOOK_SELECTOR = '.kp-notebook-library-each-book';
const TITLE_SELECTOR = 'h3.a-spacing-top-small';
const AUTHOR_SELECTOR = 'p.a-spacing-none:nth-child(3)';
const HIGHLIGHT_SELECTOR = '.kp-notebook-highlight';
// Selectors used for logging in to Amazon (e-mail field, password field, submit button).
const USERNAME_SELECTOR = '#ap_email';
const PASSWORD_SELECTOR = '#ap_password';
const BUTTON_SELECTOR = '#signInSubmit';
// Fixed pause, in milliseconds, after clicking a book before its highlights are read.
const FIVE_SECONDS = 5 * 1000;
/**
 * Pauses for a random duration between 0 and `seconds` seconds.
 *
 * Uses a plain timer instead of `page.waitFor`, which Puppeteer deprecated and
 * later removed; for a numeric argument that method was only a timer anyway.
 *
 * @param {object} page - Puppeteer page. Unused; kept so existing callers keep working.
 * @param {number} seconds - Upper bound of the pause, in seconds.
 * @returns {Promise<void>} Resolves once the pause has elapsed.
 */
const randomDelay = async (page, seconds) => new Promise((resolve) => {
  setTimeout(resolve, Math.random() * seconds * 1000);
});
/**
 * Types `text` into the element matched by `selector`.
 *
 * Waits a random amount of time (at most two seconds), clicks the element,
 * then sends the text through the page's keyboard.
 *
 * @param {object} page - Puppeteer page to act on.
 * @param {string} selector - CSS selector of the element to click before typing.
 * @param {string} text - Text to type.
 * @returns {Promise<void>}
 */
async function inputText(page, selector, text) {
  const maxPauseSeconds = 2;
  await randomDelay(page, maxPauseSeconds);

  // Click first, then type through the page's keyboard.
  await page.click(selector);
  const { keyboard } = page;
  await keyboard.type(text);
}
/**
 * Opens the notebook URL and signs in with the credentials from CREDS.
 *
 * @param {object} page - Puppeteer page to drive through the sign-in form.
 * @returns {Promise<void>} Resolves once the navigation triggered by the
 *   sign-in button has finished.
 */
async function loginToAmazon(page) {
  await page.goto(AMAZON_NOTEBOOKS_URL, {
    waitUntil: 'networkidle2',
  });
  await inputText(page, USERNAME_SELECTOR, CREDS.username);
  await inputText(page, PASSWORD_SELECTOR, CREDS.password);
  await randomDelay(page, 2);
  // Start waiting for the navigation BEFORE clicking. If the click were awaited
  // first, a navigation that completes in the meantime would be missed and
  // waitForNavigation would run until its timeout.
  await Promise.all([
    page.waitForNavigation(),
    page.click(BUTTON_SELECTOR),
  ]);
}
/**
 * Reads the title, author and highlight texts of the currently shown book.
 *
 * The lookup runs inside the page via `page.evaluate`, so the selectors are
 * passed in as arguments rather than captured from this module's scope.
 *
 * @param {object} page - Puppeteer page showing a book's notebook.
 * @returns {Promise<{title: string, author: string, highlights: string[]}>}
 *   Rejects when the title or author element is not found.
 */
async function extractHighlightsOfBook(page) {
  // Executed in the page context; only its arguments are available there.
  const readOpenBook = (highlightSelector, titleSelector, authorSelector) => {
    const titleNode = document.querySelector(titleSelector);
    const authorNode = document.querySelector(authorSelector);
    const highlightNodes = document.querySelectorAll(highlightSelector);
    return {
      title: titleNode.innerText,
      author: authorNode.innerText,
      highlights: Array.from(highlightNodes, (node) => node.innerText),
    };
  };

  return page.evaluate(readOpenBook, HIGHLIGHT_SELECTOR, TITLE_SELECTOR, AUTHOR_SELECTOR);
}
/**
 * Clicks every book entry on the page in turn and collects its highlights.
 *
 * A book whose highlights cannot be read is skipped: the failure is logged
 * together with the underlying error and the loop continues with the next book.
 *
 * @param {object} page - Puppeteer page showing the notebook library.
 * @returns {Promise<Array<object>>} One entry per successfully read book, in
 *   page order; empty when no book entries are found.
 */
async function extractHighlights(page) {
  const books = await page.$$(BOOK_SELECTOR);
  const result = [];
  for (let index = 0; index < books.length; index += 1) {
    await books[index].click();
    // TODO: Fix this to instead wait till something appears
    // A plain timer is used rather than `page.waitFor`, which Puppeteer has
    // deprecated and later removed.
    await new Promise((resolve) => {
      setTimeout(resolve, FIVE_SECONDS);
    });
    try {
      const highlights = await extractHighlightsOfBook(page);
      result.push(highlights);
    } catch (error) {
      // Include the error itself so the failure can actually be diagnosed.
      console.log('Something went wrong with book number:', index + 1, error);
    }
  }
  return result;
}
module.exports = async function scrapeAmazonNotebooks() {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.setUserAgent(USER_AGENT);
await page.setExtraHTTPHeaders({
'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
});
await loginToAmazon(page);
await randomDelay(page, 5);
const highlightsGroupedByBooks = await extractHighlights(page);
await page.close();
await browser.close();
return highlightsGroupedByBooks;
};