-
Notifications
You must be signed in to change notification settings - Fork 2
/
scraper.js
140 lines (113 loc) · 4.41 KB
/
scraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
const fs = require('fs/promises');
const path = require('path');
// Kateglo dictionary search URL restricted to nouns (`lex=n`).
// prettier-ignore
const NOUNS_BASE_URL = 'http://kateglo.com?op=1&phrase=&lex=n&type=r&src=&mod=dictionary&srch=Cari';

// Kateglo dictionary search URL restricted to adjectives (`lex=adj`).
// prettier-ignore
const ADJECTIVES_BASE_URL = 'http://kateglo.com?&op=1&phrase=&lex=adj&type=r&src=&mod=dictionary&srch=Cari';

// Short words are not that "strong", so only words with at least this many
// characters are kept.
const MINIMUM_WORD_LENGTH = 5;
/**
 * Scrape the words listed on a random sample of result pages for one word class.
 *
 * @param {object} page Puppeteer page used for all navigation
 * @param {string} baseUrl listing URL without the `p` (page number) parameter
 * @param {string} label word class name; only used in the log output
 * @returns {Promise<Set<string>>} unique words that are at least
 *   `MINIMUM_WORD_LENGTH` characters long
 */
async function scrapeWords(page, baseUrl, label) {
  // `Set` is used because it guarantees uniqueness (whereas a normal Array doesn't).
  const words = new Set();

  // The first visit is only needed to read the last page number from the pagination.
  await page.goto(baseUrl);
  const lastPageNumber = await getNumberOfLastPage(page);
  const pageNumbers = getFetchedPageNumbers(lastPageNumber);

  for (const pageNumber of pageNumbers) {
    console.info(`[info] scraping ${label} in page ${pageNumber}...`);
    await page.goto(`${baseUrl}&p=${pageNumber}`);

    const pageWords = await getWordsFromPage(page);
    for (const word of pageWords) {
      if (word.length >= MINIMUM_WORD_LENGTH) {
        words.add(word);
      }
    }
  }

  return words;
}

// Entry point: scrape nouns and adjectives, then store them as sorted arrays
// in `../src/words.json` (relative to this script).
(async () => {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();

    // The same page is reused, so the two word classes are scraped one after the other.
    const nouns = await scrapeWords(page, NOUNS_BASE_URL, 'nouns');
    const adjectives = await scrapeWords(page, ADJECTIVES_BASE_URL, 'adjectives');

    console.info(
      `Total scraped words: ${nouns.size} nouns, ${adjectives.size} adjectives.`
    );

    // Save to local JSON file.
    await fs.writeFile(
      path.join(__dirname, '../src/words.json'),
      JSON.stringify(
        {
          nouns: Array.from(nouns).sort(),
          adjectives: Array.from(adjectives).sort()
        },
        null,
        2
      )
    );
  } finally {
    // Always shut the browser down, even when scraping or writing fails;
    // otherwise the browser process is left running.
    await browser.close();
  }
})().catch((error) => {
  // Report the failure explicitly and signal it through the exit code instead
  // of leaving the promise rejection unhandled.
  console.error('[error] scraping failed:', error);
  process.exitCode = 1;
});
// Helper functions.
/**
 * Collect the words listed on the page that is currently loaded.
 *
 * Every `dl > dt > a` anchor is treated as one entry; the text of the anchor's
 * first child node is lowercased and stripped of parentheses.
 *
 * @param {object} page Puppeteer page that has already navigated to a listing
 * @returns {Promise<string[]>} the words in document order (may contain duplicates)
 */
async function getWordsFromPage(page) {
  const html = await page.evaluate(() => document.querySelector('*').outerHTML);
  const $ = cheerio.load(html);

  return $('dl > dt > a')
    .toArray()
    .map((anchor) => {
      const rawWord = cheerio.load(anchor.children[0]).text();
      // Clear parentheses (if any).
      // TODO(imballinst): check if we want to remove symbols such as hyphen as well.
      return rawWord.toLowerCase().replace(/[()]+/g, '');
    });
}
/**
 * Read the number of the last result page from the pagination of the page
 * that is currently loaded.
 *
 * @param {object} page Puppeteer page that has already navigated to a listing
 * @returns {Promise<number>} the last page number
 * @throws {Error} when the first `ul.pagination` has no link carrying a `p`
 *   query parameter
 */
async function getNumberOfLastPage(page) {
  const data = await page.evaluate(() => document.querySelector('*').outerHTML);
  const selector = cheerio.load(data);

  // The last link of the first pagination list points to the last page.
  const href = selector('ul.pagination')
    .first()
    .find('li > a')
    .last()
    .attr('href');

  // The last page URL is something like this: ./?&op=1&phrase=&lex=n&type=r&src=&mod=dictionary&srch=Cari&p=403.
  // We want to take the `p` query parameter only. Matching `[?&]p=` keeps
  // `phrase=` from being picked up and works wherever `p` sits in the query.
  const match = /[?&]p=([\d,]+)/.exec(typeof href === 'string' ? href : '');
  if (match === null) {
    throw new Error(
      `Unable to find the last page number in the pagination (href: ${href}).`
    );
  }

  // Strip every thousands separator, not just the first one.
  return Number(match[1].replace(/,/g, ''));
}
// In kateglo site, each page contains ~50 words.
// Reference: https://github.com/ans-4175/password-ga/issues/9#issuecomment-964749676.
const NUMBER_OF_FETCHED_PAGES = 50;

/**
 * Get the page numbers that will be fetched: a random sample, without
 * repetition, of the pages `1..lastPageNumber`.
 *
 * At most `NUMBER_OF_FETCHED_PAGES` pages are returned; when fewer pages
 * exist, every page is returned exactly once (in random order).
 *
 * @param {number} lastPageNumber last page number (a non-negative integer)
 * @returns {number[]} array of page numbers
 * @throws {RangeError} when `lastPageNumber` is not a non-negative integer
 */
function getFetchedPageNumbers(lastPageNumber) {
  if (!Number.isInteger(lastPageNumber) || lastPageNumber < 0) {
    throw new RangeError(`Invalid last page number: ${lastPageNumber}`);
  }

  // Create an array with size `lastPageNumber`, with the
  // first array element being 1 and the last array element being `lastPageNumber`.
  const pages = Array.from({ length: lastPageNumber }, (_el, idx) => idx + 1);

  // Never ask for more pages than exist; otherwise the loop below would keep
  // drawing from an empty array and fill the result with `undefined`.
  const numberOfPagesToFetch = Math.min(NUMBER_OF_FETCHED_PAGES, pages.length);

  const fetchedPageNumbers = [];
  while (fetchedPageNumbers.length < numberOfPagesToFetch) {
    // Pick a random remaining page and remove it from the pool so that it
    // cannot be picked twice.
    const pageArrayIdx = Math.floor(Math.random() * pages.length);
    const [pageNumber] = pages.splice(pageArrayIdx, 1);
    fetchedPageNumbers.push(pageNumber);
  }

  return fetchedPageNumbers;
}