diff --git a/.gitignore b/.gitignore index 1ff85f9..5794dd9 100644 --- a/.gitignore +++ b/.gitignore @@ -68,3 +68,5 @@ typings/ .sonarlint/ .idea/ + +swh.cache/ diff --git a/package.json b/package.json index 97d58f4..2fbb540 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { - "name": "template-typescript-package", + "name": "swh-swmath-save-now-batch", "version": "0.0.0-development", - "description": "A boilerplate repo for publishing typescript packages to npm", + "description": "A simple script to batch query the software heritage save now api with swMATH data", "main": "dist/index.js", "types": "dist/index.d.ts", "scripts": { @@ -41,8 +41,10 @@ "devDependencies": { "@commitlint/cli": "^8.2.0", "@commitlint/config-conventional": "^8.2.0", + "@types/cacache": "^12.0.1", "@types/jest": "^24.0.23", "@types/node": "^12.12.12", + "@types/node-fetch": "^2.5.4", "@typescript-eslint/eslint-plugin": "^2.9.0", "@typescript-eslint/parser": "^2.9.0", "eslint": "^6.7.1", @@ -61,8 +63,10 @@ "dist/**/*" ], "dependencies": { - "@types/node-fetch": "^2.5.4", + "cacache": "^13.0.1", "fast-csv": "^3.4.0", - "node-fetch": "^2.6.0" + "loglevel": "^1.6.6", + "node-fetch": "^2.6.0", + "p-queue": "^6.2.1" } } diff --git a/src/__tests__/greet.test.ts b/src/__tests__/greet.test.ts deleted file mode 100644 index 5fdb6bc..0000000 --- a/src/__tests__/greet.test.ts +++ /dev/null @@ -1,7 +0,0 @@ -import { greet, swh } from '..' - -test('My Greeter', () => { - expect(greet('Carl')).toBe('Hello Carl') -}) - -test('Get swh list', async () => swh()) diff --git a/src/__tests__/index.test.ts b/src/__tests__/index.test.ts new file mode 100644 index 0000000..43cede5 --- /dev/null +++ b/src/__tests__/index.test.ts @@ -0,0 +1,11 @@ +import { swh, deleteCacheEntry, processLink } from '..' +import * as log from 'loglevel' + +log.setDefaultLevel('info') +jest.setTimeout(300000) // Allow 5 min to process all links + +test('Process pseudo-link', async () => processLink(0, 'https://github.com/fairmath/SwhSaveNowBatch.git')) +test('Process pseudo-link again', async () => processLink(0, 'https://github.com/fairmath/SwhSaveNowBatch.git')) +test('Delete pseudo-link', async () => deleteCacheEntry(0)) + +test('Get swh list again', async () => swh()) diff --git a/src/index.ts b/src/index.ts index d73a90a..2833857 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,25 +1,49 @@ import { Response } from 'node-fetch' import csv = require('fast-csv') -import CsvParserStream from 'fast-csv/build/src/parser/CsvParserStream' - +import cacach = require('cacache') import fetch from 'node-fetch' +import log = require('loglevel') +import PQueue from 'p-queue' + +const cachePath = './swh.cache' + +export function processLink(id: number, url: string): Promise { + log.trace('processing', { id: id, url: url }) + const cacheKey = `${id}` + return cacach + .get(cachePath, cacheKey) + .then((x: any) => { + return x.data + }) + .catch(() => + fetch(`https://archive.softwareheritage.org/api/1/git/url/${url}/`).then((res: Response) => + res.text().then((text) => cacach.put(cachePath, cacheKey, text)), + ), + ) +} -function parseCSV(res: Response): Promise { +function parseCSV(res: Response): Promise { return new Promise((resolve) => { - const stream = res.body.pipe(csv.parse()) + const stream = res.body.pipe(csv.parse({ delimiter: ';', ignoreEmpty: true })) + const queue = new PQueue({ concurrency: 10 }) + let count = 0 + queue.on('active', () => { + log.debug(`Working on item #${++count}. Size: ${queue.size} Pending: ${queue.pending}`) + }) stream .on('error', /* istanbul ignore next */ (error: Error) => console.error(error)) - .on('data', (row: string) => console.log(`ROW=${JSON.stringify(row)}`)) + .on('data', (row: [number, string]) => queue.add(() => processLink(row[0], row[1]))) .on('end', (rowCount: number) => { - console.log(`Parsed ${rowCount} rows`) - resolve(stream) + stream.end() + log.info(`Received ${rowCount} rows from swMATH`) + resolve(queue) }) }) } -export const greet = (name: string) => `Hello ${name}` - export const swh = () => fetch('http://swmath.org/SWH/') .then(parseCSV) - .then((stream: CsvParserStream) => stream.end()) + .then((q) => q.onIdle().then(() => log.info('Processing finished'))) + +export const deleteCacheEntry = (id: number) => cacach.rm.entry(cachePath, `${id}`)