-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.js
115 lines (97 loc) · 3.18 KB
/
scrape.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs');
const path = require('path');
// Generator that yields every id from "ql4ixp" (inclusive) up to "qlgg3x"
// (exclusive) in ascending base-36 order.
const orderedGenerator = generateOrderedId("ql4ixp", "qlgg3x");
// Directory the images are written to; it is passed to path.resolve(), so a
// relative value is resolved against the current working directory.
// NOTE(review): nothing visible in this file creates the directory - confirm
// it exists before running.
const DIRECTORY = "data/";
// Strategy run() uses to obtain the next id to try. Here it pulls the next
// value from the ordered generator.
const GENERATION_METHOD = () => orderedGenerator.next().value;
// Upper bound on the number of loop iterations in run(). With Infinity the
// loop condition never becomes false on its own.
const AMOUNT_TO_SCRAPE = Infinity;
// Start scraping as soon as the module is loaded. This works before the
// function definitions appear because function declarations are hoisted.
run();
/**
 * Main scraping loop.
 *
 * Asks GENERATION_METHOD for an id, downloads the matching image and logs the
 * id, repeating up to AMOUNT_TO_SCRAPE times. Downloads are awaited one at a
 * time, so at most one is in flight.
 *
 * @returns {Promise<void>} Resolves once the loop ends.
 */
async function run() {
  // The counter is declared with `let` so it stays local to the loop instead
  // of becoming an implicit global (which is a ReferenceError in strict mode).
  for (let i = 0; i < AMOUNT_TO_SCRAPE; i++) {
    //await sleep(5000); // easy way to lighten up load on internet connection
    const id = GENERATION_METHOD();
    try {
      await downloadImageById(id);
      console.log(id);
    } catch (e) {
      // Keep going with the next id; one failure must not stop the whole run.
      console.log(e);
    }
  }
}
/**
 * Pause helper for use with `await`.
 *
 * @param {number} ms - Delay handed to setTimeout, in milliseconds.
 * @returns {Promise<void>} Promise that resolves when the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Yields consecutive ids in ascending order.
 *
 * Both bounds are converted to numbers with convertFromIdFormat, every integer
 * in [start, end) is converted back with convertToIdFormat and yielded.
 *
 * Once the range is exhausted the generator logs "Finished" and terminates the
 * process, so the consumer never observes a `done` result.
 *
 * @param {string} startId - First id to yield (inclusive).
 * @param {string} endId - Id at which to stop (exclusive).
 * @yields {string} The next id in the range.
 */
function* generateOrderedId(startId, endId) {
  // These are declared locally; assigning them undeclared would create
  // implicit globals and throws a ReferenceError in strict mode.
  const start = convertFromIdFormat(startId);
  const end = convertFromIdFormat(endId);
  for (let i = start; i < end; i++) {
    yield convertToIdFormat(i);
  }
  console.log("Finished");
  process.exit();
}
/**
 * Renders a number as an id string using base 36 (digits 0-9, letters a-z).
 * No zero padding is applied.
 *
 * @param {number} i - Numeric form of the id.
 * @returns {string} Base-36 representation of `i`.
 */
function convertToIdFormat(i) {
  const radix = 36;
  return i.toString(radix);
}
/**
 * Parses an id string as a base-36 integer.
 *
 * @param {string} s - Id made of the characters 0-9 and a-z.
 * @returns {number} Numeric value of the id (NaN if nothing could be parsed).
 */
function convertFromIdFormat(s) {
  const radix = 36;
  return Number.parseInt(s, radix);
}
/**
 * Builds a random six-character id.
 *
 * Ids are handed out in upload order; as of 1/8/2020 the newest ones started
 * with "ql". The first character is therefore drawn from "1".."p", which keeps
 * every result below "ql####" and above "0#####". The other five characters
 * may be any base-36 digit. Narrower ranges would target specific time
 * periods; this picks uniformly from the whole allowed range.
 *
 * @returns {string} Random id, six characters long.
 */
function generateRandomId() {
  const leading = generateRandomString("123456789abcdefghijklmnop", 1);
  const trailing = generateRandomString("0123456789abcdefghijklmnopqrstuvwxyz", 5);
  return [leading, trailing].join("");
}
/**
 * Produces a string of `length` characters, each picked independently at
 * random (Math.random) from `characters`.
 *
 * Example: generateRandomString("0123456789abcdefghijklmnopqrstuvwxyz", 6)
 * might return "asd42e".
 *
 * @param {string} characters - Alphabet to draw from.
 * @param {number} length - Number of characters to generate.
 * @returns {string} The generated string.
 */
function generateRandomString(characters, length) {
  const alphabetSize = characters.length;
  let output = '';
  let produced = 0;
  while (produced < length) {
    const pick = Math.floor(Math.random() * alphabetSize);
    output += characters.charAt(pick);
    produced++;
  }
  return output;
}
/**
 * Downloads the image for one id into DIRECTORY as `<id>.png`.
 *
 * Loads the page for the id, extracts the image URL and streams the image to
 * disk. Any error raised along the way is caught here and logged as
 * "Error: <message>", so the returned promise never rejects.
 *
 * @param {string} id - Id of the screenshot page.
 * @returns {Promise<void>}
 */
async function downloadImageById(id) {
  try {
    const target = path.resolve(DIRECTORY, `${id}.png`);
    const $ = await loadInitialPage(id);
    await downloadImage(await getImageUrl($), target);
  } catch (e) {
    console.log(`Error: ${e.message}`);
  }
}
/**
 * Fetches https://prnt.sc/<id> and parses the response body with cheerio.
 *
 * @param {string} id - Id of the screenshot page.
 * @returns {Promise<Function>} The value returned by cheerio.load for the page.
 */
async function loadInitialPage(id) {
  const { data: html } = await axios.get(`https://prnt.sc/${id}`);
  return cheerio.load(html);
}
// `src` value that getImageUrl treats as "this id has no screenshot".
// NOTE(review): presumably the placeholder image the site serves for
// unknown or removed ids - confirm it is still current.
const ERROR_IMAGE_URL = "//st.prntscr.com/2019/11/26/0154/img/0_173a7b_211be8ff.png";

/**
 * Extracts the screenshot URL from a parsed page.
 *
 * @param {Function} $ - Page as returned by cheerio.load.
 * @returns {Promise<string>} The `src` of the `#screenshot-image` element.
 * @throws {Error} If the element or its `src` is missing/empty, or if `src`
 *   equals ERROR_IMAGE_URL ("Invalid id").
 */
async function getImageUrl($) {
  const url = $("#screenshot-image").attr("src");
  // A missing element yields undefined. Fail here instead of handing a
  // non-URL to the downloader.
  if (!url) {
    throw new Error("No screenshot image found on page");
  }
  if (url === ERROR_IMAGE_URL) {
    throw new Error("Invalid id");
  }
  return url;
}
/**
 * Streams the resource at `url` into the file at `path`.
 *
 * @param {string} url - Address of the image to fetch.
 * @param {string} path - Destination file path. This parameter shadows the
 *   `path` module inside the function.
 * @returns {Promise<void>} Resolves once the file has been fully written.
 *   Rejects if the request fails, the response stream errors, or the file
 *   cannot be written.
 */
async function downloadImage(url, path) {
  // Request first: if it fails, no file is created, so failed ids do not
  // leave empty files behind.
  const response = await axios({
    url,
    method: 'GET',
    responseType: 'stream'
  });
  return new Promise((resolve, reject) => {
    const writer = fs.createWriteStream(path);
    writer.on('finish', resolve);
    writer.on('error', reject);
    // pipe() does not forward errors from the source stream. Without this
    // handler a broken download would leave the promise pending forever.
    // A partially written file may remain on disk in that case.
    response.data.on('error', (err) => {
      writer.destroy();
      reject(err);
    });
    response.data.pipe(writer);
  });
}