This repository has been archived by the owner on Aug 22, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrapper.js
208 lines (187 loc) · 6.23 KB
/
scrapper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
'use strict';
require('dotenv').config();
const https = require('https');
const fs = require('fs');
//Path of the file where the data is backed up, read from the DATA_FILE_PATH environment variable
//(it is undefined when that variable is not set)
const dataFile = process.env.DATA_FILE_PATH;
//Regular expressions used to scrape the HTML of the GitHub status pages
const regexList = {
//Global: one match per day, from a `message_group" id="...">` opening up to (but not including)
//the next "message_group" or "pagination" text
getDayGroups: /message_group\" id=\".*?\"\>[\s\S]*?(?=message_group|pagination)/g,
//Captures the value of the first id="..." attribute; applied to a day chunk to get its date
getDayDate: /id=\"(.*?)\"/,
//Global: one match per event. Capture 1 is the text following "message " (any leading
//"auto-message" is skipped) up to the next quote, capture 2 is the datetime attribute value,
//capture 3 is the text between `title">` and `</span`
getEventTypes: /message (?:auto-message)*(.*?)\"[\s\S]*?datetime=\"(.*?)\"[\s\S]*?title\"\>(.*?)\<\/span/g,
//Matches when the page contains the text "next disabled"; used to stop fetching further weeks
isWeekTheMostRecent: /next disabled/,
//Captures what follows `"/messages/` up to the closing quote, when "Next Week" appears later on
//the same line (the dot does not match line breaks)
getNextWeekDate: /\"\/messages\/(.*?)\".*?Next Week/
};
//JSON text of the single entry used to seed a data file that does not exist yet
//(checkDataFileExists writes it wrapped in an array). updateStatusData resumes from the last
//saved date, so with a freshly seeded file it starts fetching from this date.
const firstStatusDate =
'{"date": "2010-02-01", "eventsCount": {"good": 1, "minor": 0, "major": 0}, "events": []}';
//Make sure the save file exists: when it is missing, create it with the seed entry as only content
const checkDataFileExists = () => {
  const alreadyThere = fs.existsSync(dataFile);
  if (alreadyThere) {
    return;
  }
  const seedContent = '[' + firstStatusDate + ']';
  fs.writeFileSync(dataFile, seedContent);
};
//Read the save file and give back its parsed JSON content
//Resolves to false (after logging the error) when the file can't be read or parsed
const getSavedData = async () => {
  let parsedContent = false;
  try {
    const fileText = fs.readFileSync(dataFile, 'utf8');
    parsedContent = JSON.parse(fileText);
  } catch (err) {
    console.log(err);
  }
  return parsedContent;
};
//Fetch the HTML content of a page, if no date specified, it takes the most recent status report
//Resolves with the whole body as a utf8 string when the server answers 200
//Rejects with the response object itself for any other status code (callers read its statusCode,
//e.g. to detect a 302 redirect), or with the error raised by the request or the response stream
const fetchStatusPage = date =>
  new Promise((resolve, reject) => {
    const url = date ? `https://status.github.com/messages/${date}` : "https://status.github.com/messages";
    https.get(url, res => {
      if (res.statusCode !== 200) {
        //Drain the response so the socket is released, then stop here: nothing below applies
        //to a response we have already rejected
        res.resume();
        reject(res);
        return;
      }
      res.setEncoding('utf8');
      let rawData = '';
      //A failure while streaming the body must settle the promise too
      res.on('error', reject);
      res.on('data', chunk => {
        rawData += chunk;
      });
      res.on('end', () => resolve(rawData));
    }).on('error', reject);
  });
//Parse the HTML of a status page and return one object per day found in it, holding the date of
//the day, the count of each type of events (good, minor, major) and the list of the events
//Returns undefined when the page contains no day at all
const parseStatusPage = html => {
  //Run a global regex over a text until it is exhausted and gather every match
  const collectMatches = (pattern, text) => {
    const found = [];
    let match = pattern.exec(text);
    while (match) {
      found.push(match);
      match = pattern.exec(text);
    }
    return found;
  };

  //Split the page into one chunk of HTML per day
  const dayChunks = collectMatches(regexList.getDayGroups, html).map(match => match[0]);
  if (dayChunks.length === 0) {
    return undefined;
  }

  return dayChunks.map(chunk => {
    //The date of the day is carried by the id attribute of the chunk
    let dateOfDay;
    try {
      dateOfDay = chunk.match(regexList.getDayDate)[1];
    } catch (err) {
      console.error(err);
      dateOfDay = null;
    }

    const summary = {
      date: dateOfDay,
      eventsCount: {
        good: 0,
        minor: 0,
        major: 0
      },
      events: []
    };

    //Every event is kept, but only the three known types are counted
    for (const match of collectMatches(regexList.getEventTypes, chunk)) {
      const event = {
        type: match[1].trim(),
        timestamp: match[2].trim(),
        msg: match[3].trim()
      };
      if (Object.prototype.hasOwnProperty.call(summary.eventsCount, event.type)) {
        summary.eventsCount[event.type] += 1;
      }
      summary.events.push(event);
    }
    return summary;
  });
};
//Download and parse a status page: without argument the most recent one, with a date the one of
//the week of that date
//Resolves to what parseStatusPage returns, or to false (after logging) if fetching or parsing fails
const getStatusOnline = async date => {
  let outcome = false;
  try {
    const pageHtml = await fetchStatusPage(date);
    outcome = await parseStatusPage(pageHtml);
  } catch (err) {
    console.error(err);
  }
  return outcome;
};
//Get or update the status data from github
//Starting at the most recent date found in the save file, fetch the status pages week after
//week, saving each of them, until the most recent week is reached or an error occurs
//Returns a promise resolved with undefined once the update stops. If the saved data can't be
//loaded (or is empty) the problem is logged and the promise still resolves, so callers are
//never left waiting on a promise that doesn't settle.
const updateStatusData = async () => {
  checkDataFileExists(dataFile);
  //We get the last status date saved in the file
  const savedData = await getSavedData();
  if (!Array.isArray(savedData) || savedData.length === 0) {
    console.error('Unable to update the GitHub status events data: no saved data could be loaded.');
    return;
  }
  savedData.sortByKey("date");
  let date = savedData[savedData.length - 1].date;
  console.log(`Updating the GitHub status events data ...\n`);

  //Save a parsed week and wait for the write to finish, so that saves never overlap
  //parseStatusPage gives nothing back for a page without any day: there is nothing to save then
  const saveWeek = async weekStatus => {
    if (Array.isArray(weekStatus))
      await saveData(weekStatus);
  };

  //We loop from the last saved date to the most recent online, stop if last status or error
  let continueFetching = true;
  let count = 0;
  while (continueFetching) {
    try {
      console.log(`Fetching GitHub status for the week of this date : ${date}.`);
      const html = await fetchStatusPage(date);
      await saveWeek(await parseStatusPage(html));
      if (regexList.isWeekTheMostRecent.test(html)) {
        console.log(`The date : ${date} is the most recent GitHub status's update week.`);
        continueFetching = false;
      } else {
        date = html.match(regexList.getNextWeekDate)[1];
      }
      count++;
    } catch (e) {
      continueFetching = false;
      //If the page redirects (http 302 code), we are at the last week of GitHub status
      if (e && e.statusCode === 302) {
        try {
          //Save the last week status
          await saveWeek(await parseStatusPage(await fetchStatusPage()));
          console.log(`The date : ${date} is the most recent GitHub status's update week.`);
          count++;
        } catch (lastWeekError) {
          console.error(lastWeekError);
        }
      } else {
        console.error(e);
      }
    }
  }
  console.log(`\nSuccessfully updated GitHub status events data. ${count} week(s) of events were fetched.`);
};
//Sort any array of objects by one of its keys
//The array is sorted in place, in ascending order; string values are compared without regard to
//case. Nothing is returned.
//The method is declared as a non-enumerable property, so that it does not show up as an extra
//key in every for...in loop over an array
Object.defineProperty(Array.prototype, 'sortByKey', {
  configurable: true,
  writable: true,
  enumerable: false,
  value: function sortByKey(key) {
    const normalize = value => (typeof value === "string" ? value.toLowerCase() : value);
    this.sort((a, b) => {
      const x = normalize(a[key]);
      const y = normalize(b[key]);
      if (x < y) return -1;
      if (x > y) return 1;
      return 0;
    });
  }
});
//Pass an array of objects containing the date and the count of each type of events in parameters
//It searches if the date of the event in the save file exists, if not, it adds it to the save file
//Resolves to true once the save file has been rewritten (sorted by date), and to false when
//nothing could be saved: the parameter is not an array (parseStatusPage gives nothing back for a
//page without any day) or the content of the save file could not be loaded as an array
const saveData = async daysEventsArray => {
  if (!Array.isArray(daysEventsArray)) return false;
  checkDataFileExists(dataFile);
  const savedData = await getSavedData();
  if (!Array.isArray(savedData)) return false;
  //Dates already present, kept up to date while adding so that a date repeated in the
  //parameter is only saved once
  const knownDates = new Set(savedData.map(aSavedDay => aSavedDay.date));
  for (const aDayEvents of daysEventsArray) {
    //Check if the date is already saved, if not, save it
    if (!knownDates.has(aDayEvents.date)) {
      knownDates.add(aDayEvents.date);
      savedData.push(aDayEvents);
    }
  }
  //Sort the array by date and save it
  savedData.sortByKey("date");
  fs.writeFileSync(dataFile, JSON.stringify(savedData));
  return true;
};
//Exported objects to be used out of this script
module.exports = {
dataFile: dataFile,
getStatusOnline: getStatusOnline,
updateStatusData: updateStatusData,
getSavedData: getSavedData,
saveData: saveData
};