diff --git a/src/Downloader.ts b/src/Downloader.ts
index a36794180..e856f36b1 100644
--- a/src/Downloader.ts
+++ b/src/Downloader.ts
@@ -36,10 +36,7 @@
 imageminOptions.set('default', new Map())
 imageminOptions.set('webp', new Map())
 imageminOptions.get('default').set('image/png', {
-  plugins: [
-    (imageminPngquant as any)({ speed: 3, strip: true, dithering: 0 }),
-    imageminAdvPng({ optimizationLevel: 4, iterations: 5 }),
-  ],
+  plugins: [(imageminPngquant as any)({ speed: 3, strip: true, dithering: 0 }), imageminAdvPng({ optimizationLevel: 4, iterations: 5 })],
 })
 imageminOptions.get('default').set('image/jpeg', {
   plugins: [imageminJpegoptim({ max: 60, stripAll: true })],
@@ -102,16 +99,7 @@ class Downloader {
   public jsonRequestOptions: AxiosRequestConfig
   public streamRequestOptions: AxiosRequestConfig
 
-  constructor({
-    mw,
-    uaString,
-    speed,
-    reqTimeout,
-    optimisationCacheUrl,
-    s3,
-    webp,
-    backoffOptions,
-  }: DownloaderOpts) {
+  constructor({ mw, uaString, speed, reqTimeout, optimisationCacheUrl, s3, webp, backoffOptions }: DownloaderOpts) {
     this.mw = mw
     this.uaString = uaString
     this.speed = speed
@@ -132,9 +120,7 @@ class Downloader {
     this.backoffOptions = {
       strategy: new backoff.ExponentialStrategy(),
       failAfter: 7,
-      retryIf: (err: any) =>
-        err.code === 'ECONNABORTED' ||
-        ![400, 403, 404].includes(err.response?.status),
+      retryIf: (err: any) => err.code === 'ECONNABORTED' || ![400, 403, 404].includes(err.response?.status),
       backoffHandler: (number: number, delay: number) => {
         logger.info(`[backoff] #${number} after ${delay} ms`)
       },
@@ -197,9 +183,7 @@ class Downloader {
   public serializeUrl(url: string): string {
     const { path } = urlParser.parse(url)
     const cacheablePart = url.replace(path, '')
-    const cacheEntry = Object.entries(this.urlPartCache).find(
-      ([value]) => value === cacheablePart,
-    )
+    const cacheEntry = Object.entries(this.urlPartCache).find(([value]) => value === cacheablePart)
     let cacheKey
     if (!cacheEntry) {
       const cacheId = String(Object.keys(this.urlPartCache).length + 1)
@@ -228,19 +212,12 @@ class Downloader {
       ? this.mw.veApiUrl.href
       : undefined
 
-    this.baseUrlForMainPage = this.mwCapabilities.desktopRestApiAvailable
-      ? this.mw.desktopRestApiUrl.href
-      : this.mwCapabilities.veApiAvailable
-      ? this.mw.veApiUrl.href
-      : undefined
+    this.baseUrlForMainPage = this.mwCapabilities.desktopRestApiAvailable ? this.mw.desktopRestApiUrl.href : this.mwCapabilities.veApiAvailable ? this.mw.veApiUrl.href : undefined
 
     logger.log('Base Url: ', this.baseUrl)
     logger.log('Base Url for Main Page: ', this.baseUrlForMainPage)
 
-    if (!this.baseUrl || !this.baseUrlForMainPage)
-      throw new Error(
-        'Unable to find appropriate API end-point to retrieve article HTML',
-      )
+    if (!this.baseUrl || !this.baseUrlForMainPage) throw new Error('Unable to find appropriate API end-point to retrieve article HTML')
   }
 
   public async checkApiAvailabilty(url: string): Promise<boolean> {
     try {
       const resp = await axios.get(url, {
         maxRedirects: 0,
         headers: { cookie: this.loginCookie },
       })
       // Check for hostname is for domain name in cases of redirects.
-      return (
-        resp.status === 200 &&
-        !resp.headers['mediawiki-api-error'] &&
-        path.dirname(url) === path.dirname(resp.request.res.responseUrl)
-      )
+      return resp.status === 200 && !resp.headers['mediawiki-api-error'] && path.dirname(url) === path.dirname(resp.request.res.responseUrl)
     } catch (err) {
       return false
     }
   }
 
-  public async checkCapabilities(
-    testArticleId = 'MediaWiki:Sidebar',
-  ): Promise<void> {
+  public async checkCapabilities(testArticleId = 'MediaWiki:Sidebar'): Promise<void> {
     // By default check all API's responses and set the capabilities
     // accordingly. We need to set a default page (always there because
     // installed per default) to request the REST API, otherwise it would
     // fail the check.
-    this.mwCapabilities.mobileRestApiAvailable = await this.checkApiAvailabilty(
-      this.mw.getMobileRestApiArticleUrl(testArticleId),
-    )
-    this.mwCapabilities.desktopRestApiAvailable =
-      await this.checkApiAvailabilty(
-        this.mw.getDesktopRestApiArticleUrl(testArticleId),
-      )
-    this.mwCapabilities.veApiAvailable = await this.checkApiAvailabilty(
-      this.mw.getVeApiArticleUrl(testArticleId),
-    )
-    this.mwCapabilities.apiAvailable = await this.checkApiAvailabilty(
-      this.mw.apiUrl.href,
-    )
+    this.mwCapabilities.mobileRestApiAvailable = await this.checkApiAvailabilty(this.mw.getMobileRestApiArticleUrl(testArticleId))
+    this.mwCapabilities.desktopRestApiAvailable = await this.checkApiAvailabilty(this.mw.getDesktopRestApiArticleUrl(testArticleId))
+    this.mwCapabilities.veApiAvailable = await this.checkApiAvailabilty(this.mw.getVeApiArticleUrl(testArticleId))
+    this.mwCapabilities.apiAvailable = await this.checkApiAvailabilty(this.mw.apiUrl.href)
 
     // Coordinate fetching
     const reqOpts = objToQueryString({
       ...this.getArticleQueryOpts(),
     })
-    const resp = await this.getJSON<MwApiResponse>(
-      `${this.mw.apiUrl.href}${reqOpts}`,
-    )
-    const isCoordinateWarning =
-      resp.warnings &&
-      resp.warnings.query &&
-      (resp.warnings.query['*'] || '').includes('coordinates')
+    const resp = await this.getJSON<MwApiResponse>(`${this.mw.apiUrl.href}${reqOpts}`)
+    const isCoordinateWarning = resp.warnings && resp.warnings.query && (resp.warnings.query['*'] || '').includes('coordinates')
     if (isCoordinateWarning) {
       logger.info('Coordinates not available on this wiki')
       this.mwCapabilities.coordinatesAvailable = false
@@ -305,10 +262,7 @@ class Downloader {
     return this.getJSON(this.mw.getApiQueryUrl(query))
   }
 
-  public async getArticleDetailsIds(
-    articleIds: string[],
-    shouldGetThumbnail = false,
-  ): Promise<QueryMwRet> {
+  public async getArticleDetailsIds(articleIds: string[], shouldGetThumbnail = false): Promise<QueryMwRet> {
     let continuation: ContinueOpts
     let finalProcessedResp: QueryMwRet
 
@@ -335,20 +289,12 @@ class Downloader {
       continuation = resp.continue
       const relevantDetails = this.stripNonContinuedProps(processedResponse)
-      finalProcessedResp =
-        finalProcessedResp === undefined
-          ? relevantDetails
-          : deepmerge(finalProcessedResp, relevantDetails)
+      finalProcessedResp = finalProcessedResp === undefined ? relevantDetails : deepmerge(finalProcessedResp, relevantDetails)
     } else {
       if (this.mw.getCategories) {
-        processedResponse = await this.setArticleSubCategories(
-          processedResponse,
-        )
+        processedResponse = await this.setArticleSubCategories(processedResponse)
       }
-      finalProcessedResp =
-        finalProcessedResp === undefined
-          ? processedResponse
-          : deepmerge(finalProcessedResp, processedResponse)
+      finalProcessedResp = finalProcessedResp === undefined ? processedResponse : deepmerge(finalProcessedResp, processedResponse)
       break
     }
   }
@@ -356,10 +302,7 @@ class Downloader {
     return finalProcessedResp
   }
 
-  public async getArticleDetailsNS(
-    ns: number,
-    gapcontinue = '',
-  ): Promise<{ gapContinue: string; articleDetails: QueryMwRet }> {
+  public async getArticleDetailsNS(ns: number, gapcontinue = ''): Promise<{ gapContinue: string; articleDetails: QueryMwRet }> {
     let queryContinuation: QueryContinueOpts
     let finalProcessedResp: QueryMwRet
     let gCont: string = null
@@ -382,14 +325,10 @@ class Downloader {
     }
 
     if (queryContinuation) {
-      queryOpts.cocontinue =
-        queryContinuation?.coordinates?.cocontinue ?? queryOpts.cocontinue
-      queryOpts.clcontinue =
-        queryContinuation?.categories?.clcontinue ?? queryOpts.clcontinue
-      queryOpts.picontinue =
-        queryContinuation?.pageimages?.picontinue ?? queryOpts.picontinue
-      queryOpts.rdcontinue =
-        queryContinuation?.redirects?.rdcontinue ?? queryOpts.rdcontinue
+      queryOpts.cocontinue = queryContinuation?.coordinates?.cocontinue ?? queryOpts.cocontinue
+      queryOpts.clcontinue = queryContinuation?.categories?.clcontinue ?? queryOpts.clcontinue
+      queryOpts.picontinue = queryContinuation?.pageimages?.picontinue ?? queryOpts.picontinue
+      queryOpts.rdcontinue = queryContinuation?.redirects?.rdcontinue ?? queryOpts.rdcontinue
     }
 
     const queryString = objToQueryString(queryOpts)
@@ -402,31 +341,20 @@ class Downloader {
     gCont = resp['query-continue']?.allpages?.gapcontinue ?? gCont
 
-    const queryComplete =
-      Object.keys(resp['query-continue'] || {}).filter(
-        (key) => key !== 'allpages',
-      ).length === 0
+    const queryComplete = Object.keys(resp['query-continue'] || {}).filter((key) => key !== 'allpages').length === 0
     if (!queryComplete) {
       queryContinuation = resp['query-continue']
       const relevantDetails = this.stripNonContinuedProps(processedResponse)
-      finalProcessedResp =
-        finalProcessedResp === undefined
-          ? relevantDetails
-          : deepmerge(finalProcessedResp, relevantDetails)
+      finalProcessedResp = finalProcessedResp === undefined ? relevantDetails : deepmerge(finalProcessedResp, relevantDetails)
     } else {
       if (this.mw.getCategories) {
-        processedResponse = await this.setArticleSubCategories(
-          processedResponse,
-        )
+        processedResponse = await this.setArticleSubCategories(processedResponse)
       }
-      finalProcessedResp =
-        finalProcessedResp === undefined
-          ? processedResponse
-          : deepmerge(finalProcessedResp, processedResponse)
+      finalProcessedResp = finalProcessedResp === undefined ? processedResponse : deepmerge(finalProcessedResp, processedResponse)
       break
     }
   }
@@ -437,10 +365,7 @@ class Downloader {
     }
   }
 
-  public async getArticle(
-    articleId: string,
-    dump: Dump,
-  ): Promise<any> {
+  public async getArticle(articleId: string, dump: Dump): Promise<any> {
     const isMainPage = dump.isMainPage(articleId)
     const articleApiUrl = this.getArticleUrl(articleId, isMainPage)
@@ -470,9 +395,7 @@ class Downloader {
     })
   }
 
-  public async downloadContent(
-    _url: string,
-  ): Promise<{ content: Buffer | string; responseHeaders: any }> {
+  public async downloadContent(_url: string): Promise<{ content: Buffer | string; responseHeaders: any }> {
     if (!_url) {
       throw new Error(`Parameter [${_url}] is not a valid url`)
     }
@@ -503,30 +426,16 @@ class Downloader {
   }
 
   private getArticleUrl(articleId: string, isMainPage: boolean): string {
-    return `${
-      isMainPage ? this.baseUrlForMainPage : this.baseUrl
-    }${encodeURIComponent(articleId)}`
+    return `${isMainPage ? this.baseUrlForMainPage : this.baseUrl}${encodeURIComponent(articleId)}`
   }
 
-  private stripNonContinuedProps(
-    articleDetails: QueryMwRet,
-    cont: QueryContinueOpts | ContinueOpts = {},
-  ): QueryMwRet {
+  private stripNonContinuedProps(articleDetails: QueryMwRet, cont: QueryContinueOpts | ContinueOpts = {}): QueryMwRet {
     const propsMap: KVS<string[]> = {
       pageimages: ['thumbnail', 'pageimage'],
       coordinates: ['coordinates'],
       categories: ['categories'],
     }
-    const keysToKeep: string[] = [
-      'subCategories',
-      'revisions',
-      'redirects',
-    ].concat(
-      Object.keys(cont).reduce(
-        (acc, key) => acc.concat(propsMap[key] || []),
-        [],
-      ),
-    )
+    const keysToKeep: string[] = ['subCategories', 'revisions', 'redirects'].concat(Object.keys(cont).reduce((acc, key) => acc.concat(propsMap[key] || []), []))
     const items = Object.entries(articleDetails).map(([aId, detail]) => {
       const newDetail = keysToKeep.reduce((acc, key) => {
         const val = (detail as any)[key]
@@ -547,34 +456,19 @@ class Downloader {
   }
 
   private static handleMWWarningsAndErrors(resp: MwApiResponse): void {
-    if (resp.warnings)
-      logger.warn(
-        `Got warning from MW Query ${JSON.stringify(
-          resp.warnings,
-          null,
-          '\t',
-        )}`,
-      )
-    if (resp.error?.code === DB_ERROR)
-      throw new Error(
-        `Got error from MW Query ${JSON.stringify(resp.error, null, '\t')}`,
-      )
-    if (resp.error)
-      logger.log(
-        `Got error from MW Query ${JSON.stringify(resp.warnings, null, '\t')}`,
-      )
+    if (resp.warnings) logger.warn(`Got warning from MW Query ${JSON.stringify(resp.warnings, null, '\t')}`)
+    if (resp.error?.code === DB_ERROR) throw new Error(`Got error from MW Query ${JSON.stringify(resp.error, null, '\t')}`)
+    if (resp.error) logger.log(`Got error from MW Query ${JSON.stringify(resp.error, null, '\t')}`)
   }
 
   private getArticleQueryOpts(includePageimages = false, redirects = false) {
-    const validNamespaceIds = this.mw.namespacesToMirror.map(
-      (ns) => this.mw.namespaces[ns].num,
-    )
+    const validNamespaceIds = this.mw.namespacesToMirror.map((ns) => this.mw.namespaces[ns].num)
     return {
       action: 'query',
       format: 'json',
-      prop: `redirects|revisions${includePageimages ? '|pageimages' : ''}${
-        this.mwCapabilities.coordinatesAvailable ? '|coordinates' : ''
-      }${this.mw.getCategories ? '|categories' : ''}`,
+      prop: `redirects|revisions${includePageimages ? '|pageimages' : ''}${this.mwCapabilities.coordinatesAvailable ? '|coordinates' : ''}${
+        this.mw.getCategories ? '|categories' : ''
+      }`,
       rdlimit: 'max',
       rdnamespace: validNamespaceIds.join('|'),
       redirects: redirects ? true : undefined,
@@ -587,8 +481,7 @@ class Downloader {
       const isCategoryArticle = articleDetail.ns === 14
       if (isCategoryArticle) {
         const categoryMembers = await this.getSubCategories(articleId)
-        ;(articleDetails[articleId] as any).subCategories =
-          categoryMembers.slice()
+        ;(articleDetails[articleId] as any).subCategories = categoryMembers.slice()
       }
     }
     return articleDetails
@@ -611,10 +504,7 @@ class Downloader {
     return null
   }
 
-  private getJSONCb = (
-    url: string,
-    handler: (...args: any[]) => any,
-  ): void => {
+  private getJSONCb = (url: string, handler: (...args: any[]) => any): void => {
     logger.info(`Getting JSON from [${url}]`)
     axios
       .get(url, this.jsonRequestOptions)
       .then((a) => handler(null, a.data))
       .catch((err) => {
         try {
           if (err.response && err.response.status === 429) {
             logger.log('Received a [status=429], slowing down')
-            const newMaxActiveRequests: number = Math.max(
-              this.maxActiveRequests - 1,
-              1,
-            )
-            logger.log(
-              `Setting maxActiveRequests from [${this.maxActiveRequests}] to [${newMaxActiveRequests}]`,
-            )
+            const newMaxActiveRequests: number = Math.max(this.maxActiveRequests - 1, 1)
+            logger.log(`Setting maxActiveRequests from [${this.maxActiveRequests}] to [${newMaxActiveRequests}]`)
             this.maxActiveRequests = newMaxActiveRequests
             return this.getJSONCb(url, handler)
           } else if (err.response && err.response.status === 404) {
@@ -644,22 +529,13 @@ class Downloader {
   private async getCompressedBody(resp: any): Promise<boolean> {
     if (isBitmapImageMimeType(resp.headers['content-type'])) {
-      if (
-        isWebpCandidateImageMimeType(this.webp, resp.headers['content-type']) &&
-        !this.cssDependenceUrls.hasOwnProperty(resp.config.url)
-      ) {
+      if (isWebpCandidateImageMimeType(this.webp, resp.headers['content-type']) && !this.cssDependenceUrls.hasOwnProperty(resp.config.url)) {
         resp.data = await (imagemin as any)
-          .buffer(
-            resp.data,
-            imageminOptions.get('webp').get(resp.headers['content-type']),
-          )
+          .buffer(resp.data, imageminOptions.get('webp').get(resp.headers['content-type']))
           .catch(async (err) => {
             if (/Unsupported color conversion request/.test(err.stderr)) {
               return await (imagemin as any)
-                .buffer(
-                  await sharp(resp.data).toColorspace('srgb').toBuffer(),
-                  imageminOptions.get('webp').get(resp.headers['content-type']),
-                )
+                .buffer(await sharp(resp.data).toColorspace('srgb').toBuffer(), imageminOptions.get('webp').get(resp.headers['content-type']))
                .catch(() => {
                  return resp.data
                })
@@ -668,16 +544,9 @@ class Downloader {
              return data
            })
          } else {
-            return await (imagemin as any)
-              .buffer(
-                resp.data,
-                imageminOptions
-                  .get('default')
-                  .get(resp.headers['content-type']),
-              )
-              .catch(() => {
-                return resp.data
-              })
+            return await (imagemin as any).buffer(resp.data, imageminOptions.get('default').get(resp.headers['content-type'])).catch(() => {
+              return resp.data
+            })
          }
        })
        .then((data) => {
@@ -686,14 +555,9 @@ class Downloader {
        })
       resp.headers.path_postfix = '.webp'
     } else {
-      resp.data = await (imagemin as any)
-        .buffer(
-          resp.data,
-          imageminOptions.get('default').get(resp.headers['content-type']),
-        )
-        .catch(() => {
-          return resp.data
-        })
+      resp.data = await (imagemin as any).buffer(resp.data, imageminOptions.get('default').get(resp.headers['content-type'])).catch(() => {
+        return resp.data
+      })
     }
     return true
   }
@@ -728,8 +592,7 @@ class Downloader {
       .downloadBlob(stripHttpFromUrl(url), this.webp ? 'webp' : '1')
       .then(async (s3Resp) => {
        if (s3Resp?.Metadata?.etag) {
-          this.arrayBufferRequestOptions.headers['If-None-Match'] =
-            this.removeEtagWeakPrefix(s3Resp.Metadata.etag)
+          this.arrayBufferRequestOptions.headers['If-None-Match'] = this.removeEtagWeakPrefix(s3Resp.Metadata.etag)
        }
        const mwResp = await axios(url, this.arrayBufferRequestOptions)
@@ -742,18 +605,10 @@ class Downloader {
        const headers = (({ Body, ...o }) => o)(s3Resp)
        if (
          mwResp.headers['content-type']
-            ? isWebpCandidateImageMimeType(
-                this.webp,
-                mwResp.headers['content-type'],
-              ) ||
+            ? isWebpCandidateImageMimeType(this.webp, mwResp.headers['content-type']) ||
              // Hack because of https://phabricator.wikimedia.org/T298011
-              (this.webp &&
-                mwResp.headers['content-type'] ===
-                  'application/octet-stream' &&
-                isWebpCandidateImageUrl(mwResp.config.url))
-            : this.webp &&
-              isWebpCandidateImageUrl(mwResp.config.url) &&
-              !this.cssDependenceUrls.hasOwnProperty(mwResp.config.url)
+              (this.webp && mwResp.headers['content-type'] === 'application/octet-stream' && isWebpCandidateImageUrl(mwResp.config.url))
+            : this.webp && isWebpCandidateImageUrl(mwResp.config.url) && !this.cssDependenceUrls.hasOwnProperty(mwResp.config.url)
        ) {
          headers.path_postfix = '.webp'
          headers['content-type'] = 'image/webp'
@@ -771,12 +626,7 @@ class Downloader {
        // Check for the etag and upload
        const etag = this.removeEtagWeakPrefix(mwResp.headers.etag)
        if (etag) {
-          this.s3.uploadBlob(
-            stripHttpFromUrl(url),
-            mwResp.data,
-            etag,
-            this.webp ? 'webp' : '1',
-          )
+          this.s3.uploadBlob(stripHttpFromUrl(url), mwResp.data, etag, this.webp ? 'webp' : '1')
        }
 
        handler(null, {
@@ -795,26 +645,16 @@ class Downloader {
   private errHandler(err: any, url: string, handler: any): void {
     if (err.response && err.response.status === 429) {
       logger.log('Received a [status=429], slowing down')
-      const newMaxActiveRequests: number = Math.max(
-        this.maxActiveRequests - 1,
-        1,
-      )
-      logger.log(
-        `Setting maxActiveRequests from [${this.maxActiveRequests}] to [${newMaxActiveRequests}]`,
-      )
+      const newMaxActiveRequests: number = Math.max(this.maxActiveRequests - 1, 1)
+      logger.log(`Setting maxActiveRequests from [${this.maxActiveRequests}] to [${newMaxActiveRequests}]`)
       this.maxActiveRequests = newMaxActiveRequests
     }
     logger.log(`Not able to download content for ${url} due to ${err}`)
     handler(err)
   }
 
-  private async getSubCategories(
-    articleId: string,
-    continueStr = '',
-  ): Promise<Array<any>> {
-    const { query, continue: cont } = await this.getJSON(
-      this.mw.subCategoriesApiUrl(articleId, continueStr),
-    )
+  private async getSubCategories(articleId: string, continueStr = ''): Promise<Array<any>> {
+    const { query, continue: cont } = await this.getJSON(this.mw.subCategoriesApiUrl(articleId, continueStr))
     const items = query.categorymembers.filter((a: any) => a && a.title)
     if (cont && cont.cmcontinue) {
       const nextItems = await this.getSubCategories(articleId, cont.cmcontinue)
@@ -824,11 +664,7 @@ class Downloader {
     }
   }
 
-  private backoffCall(
-    handler: (...args: any[]) => void,
-    url: string,
-    callback: (...args: any[]) => void | Promise<any>,
-  ): void {
+  private backoffCall(handler: (...args: any[]) => void, url: string, callback: (...args: any[]) => void | Promise<any>): void {
     const call = backoff.call(handler, url, callback)
     call.setStrategy(this.backoffOptions.strategy)
     call.retryIf(this.backoffOptions.retryIf)
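Review note on the `retryIf` one-liner above: once collapsed onto a single line, the fallback behaviour is easy to misread, so here is a standalone sketch of how the predicate classifies errors. The error objects are hypothetical axios-style shapes, not fixtures from this repo:

```ts
// Copy of the retryIf predicate from the diff above, exercised against
// hypothetical axios-style error objects.
const retryIf = (err: any) => err.code === 'ECONNABORTED' || ![400, 403, 404].includes(err.response?.status)

console.log(retryIf({ code: 'ECONNABORTED' })) // true: timeout, retry with backoff
console.log(retryIf({ response: { status: 503 } })) // true: transient server error
console.log(retryIf({ response: { status: 404 } })) // false: permanent failure, give up
console.log(retryIf({ message: 'socket hang up' })) // true: no HTTP status at all, assumed transient
```

Note the last case: any error without an `err.response` yields `undefined` for the status, which is never in the blocklist, so network-level failures are always retried.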
diff --git a/test/unit/downloader.test.ts b/test/unit/downloader.test.ts
index 2bbfd6a04..05c58c672 100644
--- a/test/unit/downloader.test.ts
+++ b/test/unit/downloader.test.ts
@@ -1,274 +1,280 @@
-import {startRedis, stopRedis} from './bootstrap.js';
-import Downloader from '../../src/Downloader.js';
-import MediaWiki from '../../src/MediaWiki.js';
-import Axios from 'axios';
-import { mkdirPromise, mwRetToArticleDetail, stripHttpFromUrl, isImageUrl } from '../../src/util/index.js';
-import S3 from '../../src/S3.js';
-import rimraf from 'rimraf';
-import { Dump } from '../../src/Dump';
-import { articleDetailXId } from '../../src/stores.js';
-import { config } from '../../src/config.js';
-import 'dotenv/config.js';
-import * as FileType from 'file-type';
-import {jest} from '@jest/globals';
-import urlParser from 'url';
-import {setTimeout} from 'timers/promises';
-
-jest.setTimeout(100000);
+import { startRedis, stopRedis } from './bootstrap.js'
+import Downloader from '../../src/Downloader.js'
+import MediaWiki from '../../src/MediaWiki.js'
+import Axios from 'axios'
+import { mwRetToArticleDetail, stripHttpFromUrl, isImageUrl } from '../../src/util/index.js'
+import S3 from '../../src/S3.js'
+import { Dump } from '../../src/Dump'
+import { articleDetailXId } from '../../src/stores.js'
+import { config } from '../../src/config.js'
+import 'dotenv/config.js'
+import * as FileType from 'file-type'
+import { jest } from '@jest/globals'
+import urlParser from 'url'
+import { setTimeout } from 'timers/promises'
+
+jest.setTimeout(100000)
 
 describe('Downloader class', () => {
-  let mw: MediaWiki;
-  let downloader: Downloader;
+  let mw: MediaWiki
+  let downloader: Downloader
 
-  beforeAll(startRedis);
-  afterAll(stopRedis);
+  beforeAll(startRedis)
+  afterAll(stopRedis)
 
   beforeAll(async () => {
     mw = new MediaWiki({
       base: 'https://en.wikipedia.org',
       getCategories: true,
-    } as any);
-
-    downloader = new Downloader({ mw, uaString: `${config.userAgent} (contact@kiwix.org)`, speed: 1, reqTimeout: 1000 * 60, webp: true, optimisationCacheUrl: '' });
-
-    await mw.getMwMetaData(downloader);
-    await downloader.checkCapabilities();
-    await downloader.setBaseUrls();
-  });
-
-  test('downloader.query returns valid JSON', async() => {
-    const queryRet = await downloader.query(`?action=query&meta=siteinfo&siprop=statistics&format=json`);
-    expect(queryRet).toBeDefined();
-  });
-
-  test('downloader.getJSON returns valid JSON', async() => {
-    const JSONRes = await downloader.getJSON(`https://en.wikipedia.org/w/api.php?action=query&meta=siteinfo&format=json`);
-    expect(JSONRes).toBeDefined();
-  });
-
-  test('downloader.canGetUrl returns valid answer (positive)', async() => {
-    const urlExists = await downloader.canGetUrl(`https://en.wikipedia.org/w/api.php?action=query&meta=siteinfo&format=json`);
-    expect(urlExists).toBeDefined();
-  });
-
-  test('downloader.canGetUrl returns valid answer (negative)', async() => {
-    const urlNotExists = await downloader.canGetUrl(`https://en.wikipedia.org/w/thisisa404`);
-    expect(urlNotExists).toBeDefined();
-  });
-
-  test('getJSON response status for non-existant url is 404', async() => {
-    await expect(downloader.getJSON(`https://en.wikipedia.org/w/thisisa404`)).rejects.toThrowError(new Error('Request failed with status code 404'));
-  });
-
-  test('downloader.downloadContent returns', async() => {
-    const contentRes = await downloader.downloadContent(`https://upload.wikimedia.org/wikipedia/commons/thumb/c/cd/London_Montage_L.jpg/275px-London_Montage_L.jpg`);
-    expect(contentRes.responseHeaders).toBeDefined();
-  });
-
-  test('Webp compression working for cmyk color-space images', async() => {
-    const {content} = await downloader.downloadContent(`https://upload.wikimedia.org/wikipedia/commons/thumb/5/5a/LOGO_HAEMMERLIN.jpg/550px-LOGO_HAEMMERLIN.jpg`);
+    } as any)
+
+    downloader = new Downloader({ mw, uaString: `${config.userAgent} (contact@kiwix.org)`, speed: 1, reqTimeout: 1000 * 60, webp: true, optimisationCacheUrl: '' })
+
+    await mw.getMwMetaData(downloader)
+    await downloader.checkCapabilities()
+    await downloader.setBaseUrls()
+  })
+
+  test('downloader.query returns valid JSON', async () => {
+    const queryRet = await downloader.query('?action=query&meta=siteinfo&siprop=statistics&format=json')
+    expect(queryRet).toBeDefined()
+  })
+
+  test('downloader.getJSON returns valid JSON', async () => {
+    const JSONRes = await downloader.getJSON('https://en.wikipedia.org/w/api.php?action=query&meta=siteinfo&format=json')
+    expect(JSONRes).toBeDefined()
+  })
+
+  test('downloader.canGetUrl returns valid answer (positive)', async () => {
+    const urlExists = await downloader.canGetUrl('https://en.wikipedia.org/w/api.php?action=query&meta=siteinfo&format=json')
+    expect(urlExists).toBeDefined()
+  })
+
+  test('downloader.canGetUrl returns valid answer (negative)', async () => {
+    const urlNotExists = await downloader.canGetUrl('https://en.wikipedia.org/w/thisisa404')
+    expect(urlNotExists).toBeDefined()
+  })
+
+  test('getJSON response status for non-existent url is 404', async () => {
+    await expect(downloader.getJSON('https://en.wikipedia.org/w/thisisa404')).rejects.toThrowError(new Error('Request failed with status code 404'))
+  })
+
+  test('downloader.downloadContent returns', async () => {
+    const contentRes = await downloader.downloadContent('https://upload.wikimedia.org/wikipedia/commons/thumb/c/cd/London_Montage_L.jpg/275px-London_Montage_L.jpg')
+    expect(contentRes.responseHeaders).toBeDefined()
+  })
+
+  test('Webp compression working for cmyk color-space images', async () => {
+    const { content } = await downloader.downloadContent('https://upload.wikimedia.org/wikipedia/commons/thumb/5/5a/LOGO_HAEMMERLIN.jpg/550px-LOGO_HAEMMERLIN.jpg')
     const fileType = await FileType.fileTypeFromBuffer(Buffer.from(content))
-    expect(fileType?.mime).toEqual('image/webp');
-  });
-
-  test('downloader.downloadContent throws on non-existant url', async() => {
-    await expect(downloader.downloadContent(`https://upload.wikimedia.org/wikipedia/commons/thumb/c/cd/thisdoesnotexist.jpg`))
-      .rejects
-      .toThrowError(new Error('Request failed with status code 404'));
-  });
-
-  test('getArticleDetailsIds Scraped \'London\', \'United_Kingdom\', \'Paris\', \'Zürich\', \'THISARTICLEDOESNTEXIST\' successfully', async() => {
-    const _articleDetailsRet = await downloader.getArticleDetailsIds(['London', 'United_Kingdom', 'Paris', 'Zürich', 'THISARTICLEDOESNTEXIST', 'Category:Container_categories']);
-    const articleDetailsRet = mwRetToArticleDetail(_articleDetailsRet);
-    articleDetailXId.setMany(articleDetailsRet);
-    const { London, Paris, Zürich, United_Kingdom, THISARTICLEDOESNTEXIST } = articleDetailsRet;
-    expect(London).toBeDefined();
-    expect(United_Kingdom).toBeDefined();
-    expect(Paris).toBeDefined();
-    expect(Zürich).toBeDefined();
-
-    expect(THISARTICLEDOESNTEXIST.missing).toBe('');
-  });
-
-  test('getArticleDetailsNS query returns \'gapContinue\' or \'multiple articles\', ', async() => {
-    const { gapContinue, articleDetails } = await downloader.getArticleDetailsNS(0);
-    expect(gapContinue).toBeDefined();
-    expect(Object.keys(articleDetails).length).toBeGreaterThan(10);
-
-    const secondNsRet = await downloader.getArticleDetailsNS(0, gapContinue);
-    expect(secondNsRet.gapContinue).toBeDefined();
-  });
-
-  test('downloadContent throws when empty string is passed', async() => {
-    await expect(downloader.downloadContent('')).rejects.toThrowError();
-  });
-
-  test('downloadContent successfully downloaded an image', async() => {
-    const { data: LondonDetail } = await Axios.get(`https://en.wikipedia.org/api/rest_v1/page/mobile-sections/London`);
-    const [imgToGet] = Object.values(LondonDetail.lead.image.urls);
-
-    const LondonImage = await downloader.downloadContent(imgToGet as string);
-    expect(LondonImage.responseHeaders['content-type']).toMatch(/image\//i);
-  });
+    expect(fileType?.mime).toEqual('image/webp')
+  })
+
+  test('downloader.downloadContent throws on non-existent url', async () => {
+    await expect(downloader.downloadContent('https://upload.wikimedia.org/wikipedia/commons/thumb/c/cd/thisdoesnotexist.jpg')).rejects.toThrowError(
+      new Error('Request failed with status code 404'),
+    )
+  })
+
+  test("getArticleDetailsIds Scraped 'London', 'United_Kingdom', 'Paris', 'Zürich', 'THISARTICLEDOESNTEXIST' successfully", async () => {
+    const _articleDetailsRet = await downloader.getArticleDetailsIds(['London', 'United_Kingdom', 'Paris', 'Zürich', 'THISARTICLEDOESNTEXIST', 'Category:Container_categories'])
+    const articleDetailsRet = mwRetToArticleDetail(_articleDetailsRet)
+    articleDetailXId.setMany(articleDetailsRet)
+    const { London, Paris, Zürich, United_Kingdom, THISARTICLEDOESNTEXIST } = articleDetailsRet
+    expect(London).toBeDefined()
+    expect(United_Kingdom).toBeDefined()
+    expect(Paris).toBeDefined()
+    expect(Zürich).toBeDefined()
+
+    expect(THISARTICLEDOESNTEXIST.missing).toBe('')
+  })
+
+  test("getArticleDetailsNS query returns 'gapContinue' or 'multiple articles'", async () => {
+    const { gapContinue, articleDetails } = await downloader.getArticleDetailsNS(0)
+    expect(gapContinue).toBeDefined()
+    expect(Object.keys(articleDetails).length).toBeGreaterThan(10)
+
+    const secondNsRet = await downloader.getArticleDetailsNS(0, gapContinue)
+    expect(secondNsRet.gapContinue).toBeDefined()
+  })
+
+  test('downloadContent throws when empty string is passed', async () => {
+    await expect(downloader.downloadContent('')).rejects.toThrowError()
+  })
+
+  test('downloadContent successfully downloaded an image', async () => {
+    const { data: LondonDetail } = await Axios.get('https://en.wikipedia.org/api/rest_v1/page/mobile-sections/London')
+    const [imgToGet] = Object.values(LondonDetail.lead.image.urls)
+
+    const LondonImage = await downloader.downloadContent(imgToGet as string)
+    expect(LondonImage.responseHeaders['content-type']).toMatch(/image\//i)
+  })
 
   describe('getArticle method', () => {
-    let dump: Dump;
+    let dump: Dump
 
     beforeAll(async () => {
-      const mwMetadata = await mw.getMwMetaData(downloader);
-      dump = new Dump('', {} as any, mwMetadata);
-    });
+      const mwMetadata = await mw.getMwMetaData(downloader)
+      dump = new Dump('', {} as any, mwMetadata)
+    })
 
-    test('getArticle of "London" returns one article', async() => {
-      const LondonArticle = await downloader.getArticle('London', dump);
+    test('getArticle of "London" returns one article', async () => {
+      const LondonArticle = await downloader.getArticle('London', dump)
       expect(LondonArticle).toHaveLength(1)
-    });
+    })
 
-    test('Categories with many subCategories are paginated', async() => {
-      const PaginatedArticle = await downloader.getArticle('Category:Container_categories', dump);
+    test('Categories with many subCategories are paginated', async () => {
+      const PaginatedArticle = await downloader.getArticle('Category:Container_categories', dump)
       expect(PaginatedArticle.length).toBeGreaterThan(100)
-    });
+    })
 
-    test('getArticle response status for non-existent article id is 404', async() => {
-      await expect(downloader.getArticle('NeverExistingArticle', dump))
-        .rejects
-        .toThrowError(new Error('Request failed with status code 404'));
-    });
-  });
+    test('getArticle response status for non-existent article id is 404', async () => {
+      await expect(downloader.getArticle('NeverExistingArticle', dump)).rejects.toThrowError(new Error('Request failed with status code 404'))
+    })
+  })
 
   describe('isImageUrl method', () => {
-    test('Checked Image type: png', async() => {
-      const isPngFile = isImageUrl('https://bm.wikipedia.org/static/images/project-logos/bmwiki-2x.svg.png');
-      expect(isPngFile).toBeTruthy();
-    });
-
-    test('Checked Image type: jpg', async() => {
-      const isJpgFile = isImageUrl('https://bm.wikipedia.org/static/images/project-logos/bmwiki-2x.JPG');
-      expect(isJpgFile).toBeTruthy();
-    });
-
-    test('Checked Image type: svg', async() => {
-      const isSvgFile = isImageUrl('https://bm.wikipedia.org/static/images/project-logos/bmwiki-2x.svg');
-      expect(isSvgFile).toBeTruthy();
-    });
-
-    test('Checked Image type: jpeg', async() => {
-      const isJpegFile = isImageUrl('https://bm.wikipedia.org/static/images/project-logos/bmwiki-2x.JPEG');
-      expect(isJpegFile).toBeTruthy();
-    });
-
-    test('Checked Image type: gif', async() => {
-      const isgifFile = isImageUrl('https://bm.wikipedia.org/static/images/project-logos/bmwiki-2x.gif');
-      expect(isgifFile).toBeTruthy();
-    });
-
-    test('Checked Image URL with arguments', async() => {
-      const isgifFileWithArgs = isImageUrl('https://bm.wikipedia.org/static/images/project-logos/bmwiki-2x.gif?foo=bar');
-      expect(isgifFileWithArgs).toBeTruthy();
-    });
-
-    test('Url is not image type', async() => {
-      const isnotImage = isImageUrl('https://en.wikipedia.org/w/api.php?action=query&meta=siteinfo&format=json');
-      expect(isnotImage).not.toBeTruthy();
-    });
-
-    test('Url is empty string', async() => {
-      const isEmptyString = isImageUrl('');
-      expect(isEmptyString).not.toBeTruthy();
-    });
-
-    test('Image Url has no extension', async() => {
-      const imageHasNoExtension = isImageUrl('https://bm.wikipedia.org/static/images/project-logos/bmwiki-2x');
-      expect(imageHasNoExtension).not.toBeTruthy();
-    });
-
-    test('Image Url extension is undefined', async() => {
-      const extensionIsUndefined = isImageUrl('https://bm.wikipedia.org/static/images/project-logos/undefined');
-      expect(extensionIsUndefined).not.toBeTruthy();
-    });
-  });
-
-  const describeIf = process.env.S3_URL ? describe : describe.skip;
+    test('Checked Image type: png', async () => {
+      const isPngFile = isImageUrl('https://bm.wikipedia.org/static/images/project-logos/bmwiki-2x.svg.png')
+      expect(isPngFile).toBeTruthy()
+    })
+
+    test('Checked Image type: jpg', async () => {
+      const isJpgFile = isImageUrl('https://bm.wikipedia.org/static/images/project-logos/bmwiki-2x.JPG')
+      expect(isJpgFile).toBeTruthy()
+    })
+
+    test('Checked Image type: svg', async () => {
+      const isSvgFile = isImageUrl('https://bm.wikipedia.org/static/images/project-logos/bmwiki-2x.svg')
+      expect(isSvgFile).toBeTruthy()
+    })
+
+    test('Checked Image type: jpeg', async () => {
+      const isJpegFile = isImageUrl('https://bm.wikipedia.org/static/images/project-logos/bmwiki-2x.JPEG')
+      expect(isJpegFile).toBeTruthy()
+    })
+
+    test('Checked Image type: gif', async () => {
+      const isgifFile = isImageUrl('https://bm.wikipedia.org/static/images/project-logos/bmwiki-2x.gif')
+      expect(isgifFile).toBeTruthy()
+    })
+
+    test('Checked Image URL with arguments', async () => {
+      const isgifFileWithArgs = isImageUrl('https://bm.wikipedia.org/static/images/project-logos/bmwiki-2x.gif?foo=bar')
+      expect(isgifFileWithArgs).toBeTruthy()
+    })
+
+    test('Url is not image type', async () => {
+      const isnotImage = isImageUrl('https://en.wikipedia.org/w/api.php?action=query&meta=siteinfo&format=json')
+      expect(isnotImage).not.toBeTruthy()
+    })
+
+    test('Url is empty string', async () => {
+      const isEmptyString = isImageUrl('')
+      expect(isEmptyString).not.toBeTruthy()
+    })
+
+    test('Image Url has no extension', async () => {
+      const imageHasNoExtension = isImageUrl('https://bm.wikipedia.org/static/images/project-logos/bmwiki-2x')
+      expect(imageHasNoExtension).not.toBeTruthy()
+    })
+
+    test('Image Url extension is undefined', async () => {
+      const extensionIsUndefined = isImageUrl('https://bm.wikipedia.org/static/images/project-logos/undefined')
+      expect(extensionIsUndefined).not.toBeTruthy()
+    })
+  })
+
+  const describeIf = process.env.S3_URL ? describe : describe.skip
 
   describeIf('Downloader class with optimisation', () => {
-    let downloader: Downloader;
-    let s3: S3;
-    const s3UrlObj = urlParser.parse(`${process.env.S3_URL}`, true);
+    let downloader: Downloader
+    let s3: S3
+    const s3UrlObj = urlParser.parse(`${process.env.S3_URL}`, true)
 
     beforeAll(async () => {
       const mw = new MediaWiki({
-          base: 'https://en.wikipedia.org',
-          getCategories: true,
-      } as any);
+        base: 'https://en.wikipedia.org',
+        getCategories: true,
+      } as any)
 
       s3 = new S3(`${s3UrlObj.protocol}//${s3UrlObj.host}/`, {
-          bucketName: s3UrlObj.query.bucketName,
-          keyId: s3UrlObj.query.keyId,
-          secretAccessKey: s3UrlObj.query.secretAccessKey,
-      });
-      downloader = new Downloader({ mw, uaString: `${config.userAgent} (contact@kiwix.org)`, speed: 1, reqTimeout: 1000 * 60, webp: false, optimisationCacheUrl: 'random-string' , s3});
-
-      await s3.initialise();
-    });
-
-    test('Etag Not Present', async() => {
-      const etagNotPresent = await downloader.downloadContent(`https://en.wikipedia.org/w/extensions/WikimediaBadges/resources/images/badge-silver-star.png?70a8c`);
-      expect(etagNotPresent.responseHeaders.etag).toBeUndefined();
-    });
-
-    test('Delete image from S3', async() => {
-      const testImage = 'https://bm.wikipedia.org/static/images/project-logos/bmwiki-2x.png';
+        bucketName: s3UrlObj.query.bucketName,
+        keyId: s3UrlObj.query.keyId,
+        secretAccessKey: s3UrlObj.query.secretAccessKey,
+      })
+      downloader = new Downloader({
+        mw,
+        uaString: `${config.userAgent} (contact@kiwix.org)`,
+        speed: 1,
+        reqTimeout: 1000 * 60,
+        webp: false,
+        optimisationCacheUrl: 'random-string',
+        s3,
+      })
+
+      await s3.initialise()
+    })
+
+    test('Etag Not Present', async () => {
+      const etagNotPresent = await downloader.downloadContent('https://en.wikipedia.org/w/extensions/WikimediaBadges/resources/images/badge-silver-star.png?70a8c')
+      expect(etagNotPresent.responseHeaders.etag).toBeUndefined()
+    })
+
+    test('Delete image from S3', async () => {
+      const testImage = 'https://bm.wikipedia.org/static/images/project-logos/bmwiki-2x.png'
 
       // Strip http(s) from url
-      const httpOrHttpsRemoved = stripHttpFromUrl(testImage);
-      expect(httpOrHttpsRemoved).toBeDefined();
+      const httpOrHttpsRemoved = stripHttpFromUrl(testImage)
+      expect(httpOrHttpsRemoved).toBeDefined()
 
       // Delete the image already present in S3
-      await s3.deleteBlob({ Bucket: s3UrlObj.query.bucketName, Key: httpOrHttpsRemoved });
+      await s3.deleteBlob({ Bucket: s3UrlObj.query.bucketName, Key: httpOrHttpsRemoved })
 
       // Check if image exists after deleting from S3
-      const imageNotExists = await s3.downloadBlob(httpOrHttpsRemoved);
-      expect(imageNotExists).toBeNull();
-    });
+      const imageNotExists = await s3.downloadBlob(httpOrHttpsRemoved)
+      expect(imageNotExists).toBeNull()
+    })
 
-    test('Delete image from S3', async() => {
+    test('Check Etag flow', async () => {
       const randomImageUrl = async () => {
-        const url = await getRandomImageUrl();
-        return isImageUrl(url) ? url : randomImageUrl(); // recursion to get URL with image in needed format
+        const url = await getRandomImageUrl()
+        return isImageUrl(url) ? url : randomImageUrl() // recurse until we get a URL in a supported image format
      }
 
      // Check Etag Flow
-      const randomImage = await randomImageUrl();
-      const imagePath = stripHttpFromUrl(randomImage);
-      await s3.deleteBlob({ Bucket: s3UrlObj.query.bucketName, Key: imagePath });
+      const randomImage = await randomImageUrl()
+      const imagePath = stripHttpFromUrl(randomImage)
+      await s3.deleteBlob({ Bucket: s3UrlObj.query.bucketName, Key: imagePath })
 
      // Upload the image in S3
-      await downloader.downloadContent(randomImage);
+      await downloader.downloadContent(randomImage)
 
      // downloadContent() is async, so there is no way to know when the download completes; that's why setTimeout() is used
-      await setTimeout(5000);
+      await setTimeout(5000)
 
      // Get the online data of the image from MediaWiki
-      const resp = await Axios(randomImage);
+      const resp = await Axios(randomImage)
 
      // Download the uploaded image from S3 and check the Etags
-      const imageContent = await s3.downloadBlob(imagePath);
-      expect(downloader.removeEtagWeakPrefix(`${resp.headers.etag}`)).toEqual(imageContent.Metadata.etag);
+      const imageContent = await s3.downloadBlob(imagePath)
+      expect(downloader.removeEtagWeakPrefix(`${resp.headers.etag}`)).toEqual(imageContent.Metadata.etag)
 
      // Upload Image with wrong Etag
-      await s3.uploadBlob(imagePath, resp.data, 'random-string', '1');
+      await s3.uploadBlob(imagePath, resp.data, 'random-string', '1')
 
      // Download again to check the Etag has been refreshed properly
-      const updatedImage = await s3.downloadBlob(imagePath);
-      expect(updatedImage.Metadata.etag).toEqual(downloader.removeEtagWeakPrefix(`${resp.headers.etag}`));
+      const updatedImage = await s3.downloadBlob(imagePath)
+      expect(updatedImage.Metadata.etag).toEqual(downloader.removeEtagWeakPrefix(`${resp.headers.etag}`))
 
      // Remove Image after test
-      await s3.deleteBlob({ Bucket: s3UrlObj.query.bucketName, Key: imagePath });
-    });
-  });
+      await s3.deleteBlob({ Bucket: s3UrlObj.query.bucketName, Key: imagePath })
+    })
+  })
 
   async function getRandomImageUrl(): Promise<string> {
-    const resp = await Axios('https://commons.wikimedia.org/w/api.php?action=query&generator=random&grnnamespace=6&prop=imageinfo&iiprop=url&formatversion=2&iiurlwidth=100&format=json');
-    return resp.data.query.pages[0].imageinfo[0].url;
+    const resp = await Axios(
+      'https://commons.wikimedia.org/w/api.php?action=query&generator=random&grnnamespace=6&prop=imageinfo&iiprop=url&formatversion=2&iiurlwidth=100&format=json',
+    )
+    return resp.data.query.pages[0].imageinfo[0].url
   }
-
-});
\ No newline at end of file
+})
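The Etag tests above hinge on `removeEtagWeakPrefix()` normalising weak validators before comparing them against the value kept in S3 metadata. A minimal sketch of that normalisation, assuming a simple `W/` prefix strip (the actual implementation in Downloader.ts may differ):

```ts
// Assumed behaviour of Downloader.removeEtagWeakPrefix: drop the weak-ETag
// marker so a weak validator compares equal to the stored strong value.
const removeEtagWeakPrefix = (etag: string): string => etag.replace(/^W\//, '')

console.log(removeEtagWeakPrefix('W/"0e8ecd0d"')) // '"0e8ecd0d"'
console.log(removeEtagWeakPrefix('"0e8ecd0d"')) // unchanged, already a strong ETag
```

Upstream servers are free to answer with a weak ETag (`W/"..."`) for byte-identical content, so both sides of the `toEqual()` assertion are reduced to the strong form first.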
diff --git a/test/unit/s3.test.ts b/test/unit/s3.test.ts
index a37e16c3e..135d694bd 100644
--- a/test/unit/s3.test.ts
+++ b/test/unit/s3.test.ts
@@ -1,46 +1,45 @@
-import S3 from '../../src/S3.js';
-import 'dotenv/config.js';
-import {jest} from '@jest/globals';
-import urlParser from 'url';
+import S3 from '../../src/S3.js'
+import 'dotenv/config.js'
+import { jest } from '@jest/globals'
+import urlParser from 'url'
 
-jest.setTimeout(60000);
+jest.setTimeout(60000)
 
-const describeIf = process.env.S3_URL ? describe : describe.skip;
+const describeIf = process.env.S3_URL ? describe : describe.skip
 
 describeIf('S3', () => {
-
   test('S3 checks', async () => {
-    const s3UrlObj = urlParser.parse(`${process.env.S3_URL}`, true);
+    const s3UrlObj = urlParser.parse(`${process.env.S3_URL}`, true)
     const s3 = new S3(`${s3UrlObj.protocol}//${s3UrlObj.host}/`, {
       bucketName: s3UrlObj.query.bucketName,
       keyId: s3UrlObj.query.keyId,
       secretAccessKey: s3UrlObj.query.secretAccessKey,
-    });
+    })
 
-    const credentialExists = await s3.initialise();
+    const credentialExists = await s3.initialise()
     // Credentials on S3 exist
     expect(credentialExists).toBeTruthy()
 
-    const bucketExists = await s3.bucketExists(s3UrlObj.query.bucketName as string);
+    const bucketExists = await s3.bucketExists(s3UrlObj.query.bucketName as string)
     // Given bucket exists in S3
     expect(bucketExists).toBeDefined()
 
     // Given bucket does not exist in S3
-    await expect(s3.bucketExists('random-string')).rejects.toThrowError();
+    await expect(s3.bucketExists('random-string')).rejects.toThrowError()
 
-    const s3TestKey = `bm.wikipedia.org/static/images/project-logos/${Math.random().toString(36).slice(2, 7)}.png`;
+    const s3TestKey = `bm.wikipedia.org/static/images/project-logos/${Math.random().toString(36).slice(2, 7)}.png`
     // Image uploaded to S3
-    await s3.uploadBlob(s3TestKey, '42', '42', '1');
+    await s3.uploadBlob(s3TestKey, '42', '42', '1')
 
-    const imageExist = await s3.downloadBlob(s3TestKey);
+    const imageExist = await s3.downloadBlob(s3TestKey)
     // Image exists in S3
     expect(imageExist).toBeDefined()
 
     // Remove Image after test
-    await s3.deleteBlob({ Bucket: s3UrlObj.query.bucketName, Key: s3TestKey });
+    await s3.deleteBlob({ Bucket: s3UrlObj.query.bucketName, Key: s3TestKey })
 
-    const imageNotExist = await s3.downloadBlob('bm.wikipedia.org/static/images/project-logos/polsjsshsgd.png');
+    const imageNotExist = await s3.downloadBlob('bm.wikipedia.org/static/images/project-logos/polsjsshsgd.png')
     // Image doesn't exist in S3
     expect(imageNotExist).toBeNull()
-  });
-});
+  })
+})
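Both test files now share the same `describeIf` gating: suites that need a live S3 bucket only run when `S3_URL` is set, and are skipped (not failed) otherwise. A generalised sketch of the pattern; the helper below is illustrative, not part of this changeset:

```ts
// Illustrative helper generalising the `process.env.S3_URL ? describe : describe.skip` pattern.
const describeIf = (condition: boolean) => (condition ? describe : describe.skip)

describeIf(Boolean(process.env.S3_URL))('S3-backed optimisation cache', () => {
  test('runs only when S3_URL is configured', () => {
    expect(process.env.S3_URL).toBeDefined()
  })
})
```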