Skip to content

Commit

Permalink
Use an additional Redis store for downloading media files.
Browse files Browse the repository at this point in the history
Clearing redis media storage after each dump
  • Loading branch information
pavel-karatsiuba authored and kelson42 committed Apr 10, 2023
1 parent 6b9260a commit e41062e
Show file tree
Hide file tree
Showing 6 changed files with 155 additions and 8 deletions.
22 changes: 20 additions & 2 deletions src/RedisStore.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ class RedisStore implements RS {
private storesReady: boolean

private _filesToDownloadXPath: RKVS<FileDetail>
private _mediaToDownloadXPath: RKVS<FileDetail>
private _filesToRetryXPath: RKVS<FileDetail>
private _articleDetailXId: RKVS<ArticleDetail>
private _redirectsXId: RKVS<ArticleRedirect>
Expand Down Expand Up @@ -58,8 +59,15 @@ class RedisStore implements RS {
}
}

/**
 * Empties the media-download store.
 *
 * Does nothing unless the Redis client reports ready and the stores are flagged as ready.
 */
public async flushMediaToDownloadXPath() {
  const canFlush = this._client.isReady && this.storesReady
  if (!canFlush) {
    return
  }

  logger.log('Flushing Redis DB for storing media')
  await this._mediaToDownloadXPath.flush()
}

public async checkForExistingStores() {
const patterns = ['*-media', '*-media-retry', '*-detail', '*-redirect']
const patterns = ['*-media', '*-files', '*-media-retry', '*-detail', '*-redirect']
let keys: string[] = []
for (const pattern of patterns) {
keys = keys.concat(await this._client.keys(pattern))
Expand All @@ -77,7 +85,13 @@ class RedisStore implements RS {
}

private async populateStores() {
this._filesToDownloadXPath = new RedisKvs(this._client, `${Date.now()}-media`, {
this._mediaToDownloadXPath = new RedisKvs(this._client, `${Date.now()}-media`, {
u: 'url',
n: 'namespace',
m: 'mult',
w: 'width',
})
this._filesToDownloadXPath = new RedisKvs(this._client, `${Date.now()}-files`, {
u: 'url',
n: 'namespace',
m: 'mult',
Expand Down Expand Up @@ -119,6 +133,10 @@ class RedisStore implements RS {
return this._filesToDownloadXPath
}

/** Read-only accessor for the private `_mediaToDownloadXPath` key-value store. */
public get mediaToDownloadXPath(): RKVS<FileDetail> {
return this._mediaToDownloadXPath
}

/** Read-only accessor for the private `_filesToRetryXPath` key-value store. */
public get filesToRetryXPath(): RKVS<FileDetail> {
return this._filesToRetryXPath
}
Expand Down
6 changes: 4 additions & 2 deletions src/mwoffliner.lib.ts
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ async function execute(argv: any) {

const redisStore = new RedisStore(argv.redis || config.defaults.redisPath)
await redisStore.connect()
const { articleDetailXId, filesToDownloadXPath, filesToRetryXPath, redirectsXId } = redisStore
const { articleDetailXId, filesToDownloadXPath, mediaToDownloadXPath, filesToRetryXPath, redirectsXId } = redisStore

// Output directory
const outputDirectory = path.isAbsolute(_outputDirectory || '') ? _outputDirectory : path.join(process.cwd(), _outputDirectory || 'out')
Expand Down Expand Up @@ -344,6 +344,7 @@ async function execute(argv: any) {
} else {
try {
await doDump(dump)
await mediaToDownloadXPath.flush()
} catch (err) {
debugger
throw err
Expand Down Expand Up @@ -438,6 +439,7 @@ async function execute(argv: any) {
)

await downloadFiles(filesToDownloadXPath, filesToRetryXPath, zimCreator, dump, downloader)
await downloadFiles(mediaToDownloadXPath, filesToRetryXPath, zimCreator, dump, downloader)

logger.log('Writing Article Redirects')
await writeArticleRedirects(downloader, dump, zimCreator)
Expand Down Expand Up @@ -614,7 +616,7 @@ async function execute(argv: any) {
articleDetail.internalThumbnailUrl = getRelativeFilePath('Main_Page', getMediaBase(suitableResUrl, true), 'I')

await Promise.all([
filesToDownloadXPath.set(path, { url: downloader.serializeUrl(suitableResUrl), mult, width } as FileDetail),
mediaToDownloadXPath.set(path, { url: downloader.serializeUrl(suitableResUrl), mult, width } as FileDetail),
articleDetailXId.set(articleId, articleDetail),
])
articlesWithImages++
Expand Down
1 change: 1 addition & 0 deletions src/types.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ interface RKVS<T> {
// RedisStore Interface
interface RS {
readonly client: any // RedisClientType
readonly mediaToDownloadXPath: RKVS<FileDetail>
readonly filesToDownloadXPath: RKVS<FileDetail>
readonly filesToRetryXPath: RKVS<FileDetail>
readonly articleDetailXId: RKVS<ArticleDetail>
Expand Down
6 changes: 4 additions & 2 deletions src/util/saveArticles.ts
Original file line number Diff line number Diff line change
Expand Up @@ -194,20 +194,21 @@ async function saveArticle(
const { finalHTML, mediaDependencies, subtitles } = await processArticleHtml(articleHtml, redisStore, mw, dump, articleId, articleDetail, _moduleDependencies, downloader.webp)

const filesToDownload: KVS<FileDetail> = {}
const mediaToDownload: KVS<FileDetail> = {}

subtitles.forEach((s) => {
filesToDownload[s.path] = { url: s.url, namespace: '-' }
})

if (mediaDependencies.length) {
const existingVals = await redisStore.filesToDownloadXPath.getMany(mediaDependencies.map((dep) => dep.path))
const existingVals = await redisStore.mediaToDownloadXPath.getMany(mediaDependencies.map((dep) => dep.path))

for (const dep of mediaDependencies) {
const { mult, width } = getSizeFromUrl(dep.url)
const existingVal = existingVals[dep.path]
const currentDepIsHigherRes = !existingVal || existingVal.width < (width || 10e6) || existingVal.mult < (mult || 1)
if (currentDepIsHigherRes) {
filesToDownload[dep.path] = {
mediaToDownload[dep.path] = {
url: downloader.serializeUrl(dep.url),
mult,
width,
Expand All @@ -216,6 +217,7 @@ async function saveArticle(
}
}

await redisStore.mediaToDownloadXPath.setMany(mediaToDownload)
await redisStore.filesToDownloadXPath.setMany(filesToDownload)

const zimArticle = new ZimArticle({
Expand Down
124 changes: 124 additions & 0 deletions test/e2e/multimediaContent.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import * as mwoffliner from '../../src/mwoffliner.lib.js'
import { execa } from 'execa'
import rimraf from 'rimraf'
import { zimcheckAvailable, zimcheck, zimdumpAvailable, zimdump } from '../util.js'
import 'dotenv/config'
import { jest } from '@jest/globals'

// Each test runs a complete mwoffliner dump against en.m.wikipedia.org, so the default Jest timeout is too short.
jest.setTimeout(60000)

/**
 * End-to-end checks of the media entries (ZIM namespace `I`) produced for the CI reference article,
 * and of the Redis database being empty once mwoffliner has finished.
 */
describe('Multimedia', () => {
  const now = new Date()
  // Timestamped id, used as the output directory so it can be removed after each test.
  const testId = `mwo-test-${+now}`

  const parameters = {
    mwUrl: 'https://en.m.wikipedia.org',
    adminEmail: 'test@kiwix.org',
    articleList: 'User:Kelson/MWoffliner_CI_reference',
    outputDirectory: testId,
    redis: process.env.REDIS,
    customZimDescription: 'Example of the description',
  }

  test('check multimedia content from wikipedia test page', async () => {
    // Start from an empty Redis so the final scan only reflects this run.
    await execa('redis-cli flushall', { shell: true })

    try {
      const [dump] = await mwoffliner.execute(parameters)

      expect(dump.status.articles.success).toEqual(1)
      expect(dump.status.articles.fail).toEqual(0)

      if (await zimcheckAvailable()) {
        await expect(zimcheck(dump.outFile)).resolves.not.toThrowError()
      } else {
        console.log('Zimcheck not installed, skipping test')
      }

      if (await zimdumpAvailable()) {
        const mediaFiles = await zimdump(`list --ns I ${dump.outFile}`)

        expect(mediaFiles.split('\n').sort()).toEqual(
          [
            'I/Kiwix_-_WikiArabia_Cairo_2017.pdf',
            'I/Kiwix_Hackathon_2017_Florence_WikiFundi.webm.120p.vp9.webm',
            'I/Kiwix_Hackathon_2017_Florence_WikiFundi.webm.jpg',
            'I/Kiwix_icon.svg.png',
            'I/Local_Forecast_-_Elevator_(ISRC_USUAN1300012).mp3.ogg',
            'I/page1-120px-Kiwix_-_WikiArabia_Cairo_2017.pdf.jpg',
            'I/page1-1500px-Kiwix_-_WikiArabia_Cairo_2017.pdf.jpg',
          ].sort(),
        )
      } else {
        // This branch is about zimdump, not zimcheck; name the tool that is actually missing.
        console.log('Zimdump not installed, skipping test')
      }
    } finally {
      // Remove the output directory even when an assertion above fails.
      rimraf.sync(`./${testId}`)
    }

    const redisScan = await execa('redis-cli --scan', { shell: true })
    // Redis has been cleared
    expect(redisScan.stdout).toEqual('')
  })

  test('check multimedia content from wikipedia test page with different formates', async () => {
    // Start from an empty Redis so the final scan only reflects this run.
    await execa('redis-cli flushall', { shell: true })

    try {
      const dumps = await mwoffliner.execute({ ...parameters, format: ['nopic', 'novid', 'nopdf', 'nodet'] })

      // One dump per requested format.
      expect(dumps).toHaveLength(4)
      for (const dump of dumps) {
        expect(dump.status.articles.success).toEqual(1)
        expect(dump.status.articles.fail).toEqual(0)

        if (await zimcheckAvailable()) {
          await expect(zimcheck(dump.outFile)).resolves.not.toThrowError()
        } else {
          console.log('Zimcheck not installed, skipping test')
        }

        if (await zimdumpAvailable()) {
          const mediaFiles = await zimdump(`list --ns I ${dump.outFile}`)
          if (dump.nopic) {
            expect(mediaFiles.split('\n').sort()).toEqual(
              [
                'I/Kiwix_-_WikiArabia_Cairo_2017.pdf',
                // 'I/Kiwix_Hackathon_2017_Florence_WikiFundi.webm.120p.vp9.webm', // these files were omitted by nopic parameter
                // 'I/Kiwix_Hackathon_2017_Florence_WikiFundi.webm.jpg',
                // 'I/Kiwix_icon.svg.png',
                // 'I/Local_Forecast_-_Elevator_(ISRC_USUAN1300012).mp3.ogg',
                // 'I/page1-120px-Kiwix_-_WikiArabia_Cairo_2017.pdf.jpg',
                // 'I/page1-1500px-Kiwix_-_WikiArabia_Cairo_2017.pdf.jpg',
              ].sort(),
            )
          } else if (dump.novid) {
            expect(mediaFiles.split('\n').sort()).toEqual(
              [
                'I/Kiwix_-_WikiArabia_Cairo_2017.pdf',
                // 'I/Kiwix_Hackathon_2017_Florence_WikiFundi.webm.120p.vp9.webm', // these files were omitted by novid parameter
                // 'I/Kiwix_Hackathon_2017_Florence_WikiFundi.webm.jpg',
                'I/Kiwix_icon.svg.png',
                // 'I/Local_Forecast_-_Elevator_(ISRC_USUAN1300012).mp3.ogg',
                'I/page1-120px-Kiwix_-_WikiArabia_Cairo_2017.pdf.jpg',
                'I/page1-1500px-Kiwix_-_WikiArabia_Cairo_2017.pdf.jpg',
              ].sort(),
            )
          } else if (dump.nopdf) {
            expect(mediaFiles.split('\n').sort()).toEqual(
              [
                // 'I/Kiwix_-_WikiArabia_Cairo_2017.pdf', // this file was omitted by nopdf parameter
                'I/Kiwix_Hackathon_2017_Florence_WikiFundi.webm.120p.vp9.webm',
                'I/Kiwix_Hackathon_2017_Florence_WikiFundi.webm.jpg',
                'I/Kiwix_icon.svg.png',
                'I/Local_Forecast_-_Elevator_(ISRC_USUAN1300012).mp3.ogg',
                'I/page1-120px-Kiwix_-_WikiArabia_Cairo_2017.pdf.jpg',
                'I/page1-1500px-Kiwix_-_WikiArabia_Cairo_2017.pdf.jpg',
              ].sort(),
            )
          }
          // NOTE(review): the nodet dump has no media-list expectation here — confirm whether that is intentional.
        } else {
          // This branch is about zimdump, not zimcheck; name the tool that is actually missing.
          console.log('Zimdump not installed, skipping test')
        }
      }
    } finally {
      // Remove the output directory even when an assertion above fails.
      rimraf.sync(`./${testId}`)
    }

    const redisScan = await execa('redis-cli --scan', { shell: true })
    // Redis has been cleared
    expect(redisScan.stdout).toEqual('')
  })
})
4 changes: 2 additions & 2 deletions test/unit/bootstrap.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ export const redisStore = new RedisStore(process.env.REDIS || config.defaults.re

export const startRedis = async () => {
await redisStore.connect()
const { articleDetailXId, redirectsXId, filesToDownloadXPath, filesToRetryXPath } = redisStore
await Promise.all([articleDetailXId.flush(), redirectsXId.flush(), filesToDownloadXPath.flush(), filesToRetryXPath.flush()])
const { articleDetailXId, redirectsXId, filesToDownloadXPath, mediaToDownloadXPath, filesToRetryXPath } = redisStore
await Promise.all([articleDetailXId.flush(), redirectsXId.flush(), mediaToDownloadXPath.flush(), filesToDownloadXPath.flush(), filesToRetryXPath.flush()])
}

export const stopRedis = async () => {
Expand Down

0 comments on commit e41062e

Please sign in to comment.