-
Notifications
You must be signed in to change notification settings - Fork 26
/
Copy pathstatement-parser-lib.js
353 lines (320 loc) · 13 KB
/
statement-parser-lib.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
// Requires single file built version of PDF.js -- please run
// `node make singlefile` before running the example.
//
// TO-DO: doesn't get the year right when the statement covers two years e.g. the statement is dated January 2014 but contains transactions from December 2013
(function() {
// True when a global `window` already exists at load time (i.e. we are in a browser).
// NB: this has to be evaluated before the node branch below aliases `window` to `global`.
var isBrowser = typeof window !== 'undefined';
// if not in a browser
// few hacks to let PDF.js be loaded not as a module in global space
if (!isBrowser) {
// `fs` is declared with var, so it is visible to parsePDFPath() further down, which reads the PDF with it
var fs = require('fs');
// provide the globals (`window`, `navigator`, `PDFJS`) before the PDF.js files are required
global.window = global;
global.navigator = {
userAgent: 'node'
};
global.PDFJS = {};
require('./domstubs.js');
// NOTE(review): presumably stops PDF.js from trying to locate a separate worker script - confirm against the PDF.js build in use
PDFJS.workerSrc = true;
require('./pdf.combined.js');
// verbose tracing is only kept when the argument at process.argv[3] is exactly '--debug'
var debugMode = process.argv[3] === '--debug';
if (!debugMode) {
// silence console.info, which this file uses for its verbose tracing; console.log/warn/error are untouched
console.info = function() {
// do nothing
};
}
}
// Shared parser state. parsePDFStatement() resets every one of these at the start of a run
// and processStatement() updates them page by page.
// NB: because this state is shared, two statements cannot be parsed in parallel; these
// variables would have to move inside function scope to allow that.

// transactions collected so far, as { date, description, amount } objects
var transactions;
// date carried over to transactions that are not preceded by a date of their own
var currentTransactionDate;
// year taken from the statement's file name and appended to each transaction date
var statementYear;

// totals as printed in the statement's own summary
var totalPaymentsFromStatement;
var totalReceiptsFromStatement;

// totals accumulated from the individual transactions that were parsed
var totalPaymentsFromTransactions;
var totalReceiptsFromTransactions;
/*
 * Work out the statement year from a file name such as 'Statement_20140807.pdf'.
 * Returns the four digits that follow the last 'Statement_' in the name, or ''
 * (after logging a warning) when the name does not follow that pattern.
 */
function extractStatementYear(fileName) {
  var delimiter = 'Statement_';
  var name = String(fileName);
  var delimiterIndex = name.lastIndexOf(delimiter);
  var year = '';
  if (delimiterIndex !== -1) {
    var yearStart = delimiterIndex + delimiter.length;
    year = name.substring(yearStart, yearStart + 4);
  }
  // without this check a name lacking the delimiter would yield four arbitrary characters
  if (!/^\d{4}$/.test(year)) {
    console.warn('could not find a statement year in file name ' + fileName);
    return '';
  }
  return year;
}

/*
 * Render page 1 of the document into the element with id 'the-canvas'.
 * Browser only and best effort: does nothing if the canvas is missing, and
 * only logs a warning if rendering fails.
 */
function renderStatementPreview(doc) {
  var canvas = document.getElementById('the-canvas');
  if (!canvas) {
    return;
  }
  doc.getPage(1).then(function(page) {
    var scale = 1.5;
    var viewport = page.getViewport(scale);
    // Prepare canvas using PDF page dimensions
    var context = canvas.getContext('2d');
    canvas.height = viewport.height;
    canvas.width = viewport.width;
    // Render PDF page into canvas context
    page.render({
      canvasContext: context,
      viewport: viewport
    });
  }).then(null, function(err) {
    console.warn('could not render statement preview: ' + err);
  });
}

/*
 * Join the text items of one page into a single string.
 * After each item (except the last) a separator is added:
 *   - a literal backslash + 'n' (two characters) if the next item has a different y coordinate
 *   - a tab if the next item starts more than 5 units to the right of this item's end
 *     (this takes precedence over the line separator)
 * item.transform[4] is the x coordinate, item.transform[5] is the y coordinate.
 */
function joinTextItems(items) {
  return items.map(function(item, i) {
    var nextItem = items[i + 1];
    var padding = '';
    if (nextItem) {
      var isOnSameLine = nextItem.transform[5] === item.transform[5];
      var isFarAway = nextItem.transform[4] -
        (item.transform[4] + item.width) > 5;
      if (!isOnSameLine) {
        // deliberately an escaped '\n' rather than a real newline;
        // processStatement() turns it back into a space in descriptions
        padding = '\\n';
      }
      if (isFarAway) {
        padding = '\t';
      }
    }
    return item.str + padding;
  }).join('');
}

/*
 * Parse a PDF data stream for statement data
 * Safe to use in a browser
 *
 * data     - PDF data, passed straight to PDFJS.getDocument()
 * fileName - name of the statement file; the year is read from the
 *            'Statement_YYYY...' part, e.g. Statement_20140807.pdf
 * callback - called as callback(null, transactions) once every page has been
 *            processed, or as callback(err) if loading or parsing fails
 */
function parsePDFStatement(data, fileName, callback) {
  // reset global variable each time
  totalPaymentsFromStatement = 0;
  totalPaymentsFromTransactions = 0;
  totalReceiptsFromStatement = 0;
  totalReceiptsFromTransactions = 0;
  transactions = [];
  currentTransactionDate = '';
  // get the statement year from the filename
  statementYear = extractStatementYear(fileName);
  // Will be using promises to load document, pages and misc data instead of
  // callback.
  PDFJS.getDocument(data).then(function(doc) {
    // load preview of the PDF in the canvas if present
    if (isBrowser) {
      renderStatementPreview(doc);
    }
    var numPages = doc.numPages;
    console.info('# Document Loaded');
    console.info('Number of Pages: ' + numPages);
    console.info();
    // will be used to chain promises
    var lastPromise = doc.getMetadata().then(function(meta) {
      console.info('# Metadata Is Loaded');
      console.info('## Info');
      console.info(JSON.stringify(meta.info, null, 2));
      console.info();
      if (meta.metadata) {
        console.info('## Metadata');
        console.info(JSON.stringify(meta.metadata.metadata, null, 2));
        console.info();
      }
    });
    var loadPage = function(pageNum) {
      return doc.getPage(pageNum).then(function(page) {
        console.info('# Page ' + pageNum);
        var viewport = page.getViewport(1.0 /* scale */);
        console.info('Size: ' + viewport.width + 'x' + viewport.height);
        console.info();
        return page.getTextContent().then(function(content) {
          // Content contains lots of information about the text layout and
          // styles, but we need only strings at the moment
          console.log('## Text Content');
          var text = joinTextItems(content.items);
          console.log(text);
          processStatement(text, pageNum);
          console.info('# Transactions analysed');
        }).then(function() {
          console.info();
        });
      });
    };
    // Loading of the first page will wait on metadata and subsequent loadings
    // will wait on the previous pages.
    for (var i = 1; i <= numPages; i++) {
      lastPromise = lastPromise.then(loadPage.bind(null, i));
    }
    return lastPromise;
  }).then(function() {
    console.info('# End of Document');
    var transactionsList = transactions.map(function(transaction) {
      return transaction.date + '\t' + transaction.description + '\t' +
        transaction.amount;
    }).join('\n');
    console.info(transactionsList);
    console.log('Totals from statement: payments ' +
      totalPaymentsFromStatement + ', receipts ' + totalReceiptsFromStatement);
    console.log('Totals from transactions: payments ' +
      totalPaymentsFromTransactions.toFixed(2) + ', receipts ' +
      totalReceiptsFromTransactions.toFixed(2));
    // difference between what was parsed and what the statement itself reports
    var errorsInPayments = (totalPaymentsFromTransactions -
      totalPaymentsFromStatement).toFixed(2);
    var errorsInReceipts = (totalReceiptsFromTransactions -
      totalReceiptsFromStatement).toFixed(2);
    console.warn('Errors: payments ' + errorsInPayments +
      ', receipts ' + errorsInReceipts);
    callback(null, transactions);
  }, function(err) {
    console.error('Error: ' + err);
    callback(err);
  });
} // end of parsePDFStatement()
// Description prefixes that mark a transaction as a payment (outgoing)
var payments = [
  'AFTS payment re',
  'Cash machine withdrawal',
  'Card Payment',
  'CHAPS transfer to',
  'Cheque issued',
  'Commission charges',
  'Credit Payment',
  'Debit card payment to',
  'Direct debit to',
  'Internet Banking transfer to',
  'On-line Banking bill payment to',
  'Standing order to'
];
// Description prefixes that mark a transaction as a receipt (incoming)
var receipts = [
  'Direct credit from',
  'Debit card refund from',
  'Internet Banking transfer from',
  'Deposit', // NB: not sure this is a generic reference
  'Refund from',
  'Business Banking Loyalty Reward'
];
// Strings that precede the transaction list on a page. The '\\n' here is the
// two-character escaped newline that parsePDFStatement() inserts between lines,
// not a real newline character.
var transactionsStart = [
  'Transactions in date order\\nDate\tDescription\tPayments\tReceipts\tBalance', //Old statement format
  'Your Business Current Account',
  'Continued\\n'
];
/*
transaction ending patterns are:
* Interim balance carried forward3,856.88 - used on first page when a day's transactions spill over to the second page
* 3,056.16 - used on first page when a day's transactions finished neatly at the end of the page
* Continued 6 FebInterim balance brought forward4,515.18Interim balance carried forward4,421.10 - used on intermediate pages
* 4,016.1017 FebBalance carried forward - used on last page
*/
var paymentsMarkers = payments.join('|');
var receiptsMarkers = receipts.join('|');
// some transactions are preceded by dates such as '7 Feb' or '21 Jul'
// (the class is [a-zA-Z]; the previous [a-zA-z] range also accepted '[', '\', ']', '^', '_' and '`')
var optionalDateMarker = '(?:(\\d{1,2} [a-zA-Z]{3})\t)?';
// a tab, an optional pound sign, then an amount with thousands separators and two decimals
var amountMarker = '(\t£?[\\d,]+\\.\\d\\d)';
var trailingBalanceMarker = '(?:[\\d,]+\\.\\d\\d)?'; // some transactions are followed by balances that can interfere a subsequent date e.g. 'Direct credit from G Kirschner Ref:-KirschnerBooking306.004,109.18' followed by '7 FebDebit card payment...'
// Capture groups: 1 = optional date, 2 = full description, 3 = payment/receipt marker, 4 = amount.
// NB: this regex is global ('g'), so exec() advances its lastIndex between calls.
var transactionSeparator = new RegExp(optionalDateMarker + '((' +
  paymentsMarkers + '|' + receiptsMarkers + ').+?)' + amountMarker +
  trailingBalanceMarker, 'gi');
// Capture groups for both formats: 1 = total payments, 2 = total receipts
var totalsMarkers = [
  // old format
  new RegExp('Total payments - incl\\.\\\\ncommission & interest' +
    amountMarker + '.+?Total receipts' + amountMarker),
  // new format
  new RegExp('Money out\\s*?' + amountMarker +
    '.+?Money in\\s*?' + amountMarker)
];
// console.info('transaction separator',transactionSeparator);
/*
 * Convert a captured amount such as '\t£1,234,567.89' into a number rounded
 * to two decimals (1234567.89). Every comma, pound sign and whitespace
 * character is removed first. Returns NaN if no number is left.
 */
function parseStatementAmount(rawAmount) {
  var cleaned = rawAmount.replace(/[£,\s]/g, '');
  return Number(parseFloat(cleaned).toFixed(2));
}

/*
 * Extract the statement totals and the transactions from the text of one page.
 *
 * text    - page text as built by parsePDFStatement() (tab separated, with
 *           escaped '\n' markers between lines)
 * pageNum - 1-based page number; old-format totals are looked for on page 1,
 *           new-format totals on page 2
 *
 * Appends to `transactions` and updates the running totals and
 * `currentTransactionDate`. A page without a known start marker is skipped
 * with a warning. Throws an Error if a start marker is found but no
 * transaction follows it.
 */
function processStatement(text, pageNum) {
  // totals: old format is on page 1, new format on page 2
  var totals = null;
  if (pageNum === 1) {
    totals = text.match(totalsMarkers[0]);
  } else if (pageNum === 2) {
    totals = text.match(totalsMarkers[1]);
  }
  if (totals) {
    // convert strings such as '£10,271.17' to 10271.17; payments are stored as negative numbers
    totalPaymentsFromStatement = -parseStatementAmount(totals[1]);
    totalReceiptsFromStatement = parseStatementAmount(totals[2]);
  }

  // find where the transactions start: the earliest entry of transactionsStart
  // that occurs in the text wins
  var startIndex = -1;
  var startMarkerLength = 0;
  for (var i = 0; i < transactionsStart.length; i++) {
    var markerIndex = text.indexOf(transactionsStart[i]);
    if (markerIndex > -1) {
      startIndex = markerIndex;
      startMarkerLength = transactionsStart[i].length;
      break;
    }
  }
  if (startIndex === -1) {
    console.warn('could not find start of transactions in text, skipping page ' + pageNum);
    console.info(text);
    return;
  }
  var statementLines = text.substring(startIndex + startMarkerLength);

  // transactionSeparator is a shared global regex: make sure a previous,
  // interrupted scan cannot make this one start part-way through the text
  transactionSeparator.lastIndex = 0;
  var matches = transactionSeparator.exec(statementLines);
  if (!matches) {
    throw new Error('no initial matches for ' + statementLines);
  }
  // the regex matches case-insensitively, so the receipt lookup has to as well
  var receiptMarkers = receipts.map(function(marker) {
    return marker.toLowerCase();
  });
  while (matches) {
    var transactionDate = matches[1];
    var isReceipt = receiptMarkers.indexOf(matches[3].toLowerCase()) !== -1;
    // payments are negative, receipts positive
    var transactionAmount = parseStatementAmount(matches[4]) * (isReceipt ? 1 : -1);
    // if transaction has a date, add the statement year and update the currently set transaction date
    // if the transaction has no date, use currently set transaction date
    if (transactionDate) {
      transactionDate += ' ' + statementYear;
      currentTransactionDate = transactionDate;
    } else {
      transactionDate = currentTransactionDate;
    }
    transactions.push({
      date: transactionDate,
      // replace any escaped '\n' characters that were put there as padding
      description: matches[2].replace(/\\n/g, ' '),
      amount: transactionAmount
    });
    if (isReceipt) {
      totalReceiptsFromTransactions += transactionAmount;
    } else {
      totalPaymentsFromTransactions += transactionAmount;
    }
    matches = transactionSeparator.exec(statementLines);
  }
}
/*
 * Load a PDF statement from disk and parse it with parsePDFStatement()
 * Node only: the file is read synchronously through `fs`
 *
 * pdfPath  - path of the PDF; also passed on as the file name the statement year is read from
 * callback - handed to parsePDFStatement() unchanged
 *
 * Throws an Error straight away if pdfPath is missing.
 */
function parsePDFPath(pdfPath, callback) {
  var pathIsMissing = !pdfPath;
  if (pathIsMissing) {
    throw new Error('parsePDFStatement requires a pdfPath argument');
  }
  console.log('# Starting ' + pdfPath);
  // PDF.js is given a typed array rather than the raw result of readFileSync
  var fileContents = fs.readFileSync(pdfPath);
  var pdfBytes = new Uint8Array(fileContents);
  parsePDFStatement(pdfBytes, pdfPath, callback);
}
// Publish the entry point that fits the host environment:
// in a browser, parsePDFStatement is attached to window;
// under node, parsePDFPath becomes the module's export
if (isBrowser) {
  window.parsePDFStatement = parsePDFStatement;
} else {
  module.exports = parsePDFPath;
}
}());