Skip to content

Commit

Permalink
alternative retry strategy for downloading files
Browse files Browse the repository at this point in the history
  • Loading branch information
uriesk committed Jan 23, 2023
1 parent 64febcc commit 78901b5
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 30 deletions.
31 changes: 21 additions & 10 deletions src/Downloader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -393,25 +393,36 @@ class Downloader {
})
}

public async downloadContent(_url: string): Promise<{ content: Buffer | string; responseHeaders: any }> {
public async downloadContent(_url: string, retry = true): Promise<{ content: Buffer | string; responseHeaders: any }> {
if (!_url) {
throw new Error(`Parameter [${_url}] is not a valid url`)
}
const url = this.deserializeUrl(_url)

await this.claimRequest()
return new Promise((resolve, reject) => {
this.backoffCall(this.getContentCb, url, async (err: any, val: any) => {
this.releaseRequest()
if (err) {
const httpStatus = err.response && err.response.status
logger.warn(`Failed to get [${url}] [status=${httpStatus}]`)
reject(err)

try {
return await new Promise((resolve, reject) => {
const cb = (err: any, val: any) => {
if (err) {
reject(err)
} else {
resolve(val)
}
}
if (retry) {
this.backoffCall(this.getContentCb, url, cb)
} else {
resolve(val)
this.getContentCb(url, cb)
}
})
})
} catch (err) {
const httpStatus = err.response && err.response.status
logger.warn(`Failed to get [${url}] [status=${httpStatus}]`)
throw err
} finally {
this.releaseRequest()
}
}

public async canGetUrl(url: string): Promise<boolean> {
Expand Down
6 changes: 1 addition & 5 deletions src/mwoffliner.lib.ts
Original file line number Diff line number Diff line change
Expand Up @@ -478,11 +478,7 @@ async function execute(argv: any) {
}),
)

await downloadFiles(filesToDownloadXPath, zimCreator, dump, downloader)

logger.log('Flushing Redis file store')
await filesToDownloadXPath.flush()
await filesToRetryXPath.flush()
await downloadFiles(filesToDownloadXPath, filesToRetryXPath, zimCreator, dump, downloader)

logger.log('Writing Article Redirects')
await writeArticleRedirects(downloader, dump, zimCreator)
Expand Down
1 change: 1 addition & 0 deletions src/util/const.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ export const DEFAULT_WIKI_PATH = 'wiki/'
export const ALL_READY_FUNCTION = /function allReady\( modules \) {/
export const DO_PROPAGATION = /mw\.requestIdleCallback\( doPropagation, \{ timeout: 1 \} \);/
export const WEBP_HANDLER_URL = 'https://gist.githubusercontent.com/rgaudin/60bb9cc6f187add506584258028b8ee1/raw/9d575b8e25d67eed2a9c9a91d3e053a0062d2fc7/web-handler.js'
export const MAX_FILE_DOWNLOAD_RETRIES = 5
35 changes: 20 additions & 15 deletions src/util/saveArticles.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ import { Dump } from '../Dump.js'
import { contains, genCanonicalLink, genHeaderCSSLink, genHeaderScript, getFullUrl, getMediaBase, jsPath } from './index.js'
import { config } from '../config.js'
import { footerTemplate, htmlTemplateCode } from '../Templates.js'
import { articleDetailXId, filesToDownloadXPath, filesToRetryXPath } from '../stores.js'
import { articleDetailXId, filesToDownloadXPath } from '../stores.js'
import { getRelativeFilePath, getSizeFromUrl, encodeArticleIdForZimHtmlUrl, interpolateTranslationString, isWebpCandidateImageMimeType, getMimeType } from './misc.js'
import { RedisKvs } from './RedisKvs.js'
import { rewriteUrlsOfDoc } from './rewriteUrls.js'
import { CONCURRENCY_LIMIT, DELETED_ARTICLE_ERROR } from './const.js'
import { CONCURRENCY_LIMIT, DELETED_ARTICLE_ERROR, MAX_FILE_DOWNLOAD_RETRIES } from './const.js'

const genericJsModules = config.output.mw.js
const genericCssModules = config.output.mw.css
Expand All @@ -28,11 +28,13 @@ type FileStore = RedisKvs<{
width?: number
}>

export async function downloadFiles(fileStore: FileStore, zimCreator: ZimCreator, dump: Dump, downloader: Downloader, retryLater = true) {
export async function downloadFiles(fileStore: FileStore, retryStore: FileStore, zimCreator: ZimCreator, dump: Dump, downloader: Downloader, retryCounter = 0) {
await retryStore.flush()
const filesForAttempt = await fileStore.len()
const filesTotal = filesForAttempt + dump.status.files.success + dump.status.files.fail
const doRetry = retryCounter < MAX_FILE_DOWNLOAD_RETRIES

logger.log(`${retryLater ? '' : 'RE-'}Downloading a total of [${retryLater ? filesTotal : filesForAttempt}] files...`)
logger.log(`${retryCounter ? 'RE-' : ''}Downloading a total of [${retryCounter ? filesForAttempt : filesTotal}] files...`)
let prevPercentProgress: string

await fileStore.iterateItems(downloader.speed, async (fileDownloadPairs, workerId) => {
Expand Down Expand Up @@ -64,9 +66,8 @@ export async function downloadFiles(fileStore: FileStore, zimCreator: ZimCreator
isFailed = true
} finally {
if (isFailed) {
// todo don't queue 404 for retry
if (retryLater) {
await filesToRetryXPath.set(resp.path, { url: resp.url, namespace: resp.namespace, mult: resp.mult, width: resp.width })
if (doRetry && resp.status !== 404) {
await retryStore.set(resp.path, { url: resp.url, namespace: resp.namespace, mult: resp.mult, width: resp.width })
} else {
logger.warn(`Error downloading file [${downloader.deserializeUrl(resp.url)}], skipping`)
dump.status.files.fail += 1
Expand All @@ -83,16 +84,19 @@ export async function downloadFiles(fileStore: FileStore, zimCreator: ZimCreator
}
})

if (retryLater) {
const isThereAnythingToRetry = (await filesToRetryXPath.len()) > 0
if (isThereAnythingToRetry) {
await downloadFiles(filesToRetryXPath, zimCreator, dump, downloader, false)
logger.log(`Done with ${retryCounter ? 'RE-' : ''}Downloading [${retryCounter ? filesForAttempt : filesTotal}] files`)

if (doRetry) {
const amountToRetry = await retryStore.len()
if (amountToRetry > 0) {
const ts = (retryCounter + 1) * 10
logger.log(`Waiting ${ts} seconds before retrying ${amountToRetry} files`)
await new Promise((res) => setTimeout(res, ts * 1000))
await downloadFiles(retryStore, fileStore, zimCreator, dump, downloader, retryCounter + 1)
} else {
logger.log('No files to retry')
}
}

logger.log(`Done with ${retryLater ? '' : 'RE-'}Downloading a total of [${retryLater ? filesTotal : filesForAttempt}] files`)
}

async function downloadBulk(listOfArguments: any[], downloader: Downloader): Promise<any> {
Expand All @@ -116,13 +120,14 @@ async function downloadBulk(listOfArguments: any[], downloader: Downloader): Pro
resp.width = arg.val.width

return downloader
.downloadContent(arg.val.url)
.downloadContent(arg.val.url, false)
.then((r) => {
resp.result = r
resp.path += resp.result.responseHeaders.path_postfix || ''
return resp
})
.catch(() => {
.catch((err) => {
resp.status = err.response?.status
return resp
})
},
Expand Down

0 comments on commit 78901b5

Please sign in to comment.