Skip to content

Commit

Permalink
if both articleList and articleListIgnore are given, articleListIgnore
Browse files Browse the repository at this point in the history
  gets subtracted from articleList
limit length of articleList part of filename and replace whitespace and
commas with underscores
add e2e test for articleList and articleListIgnore with comma-separated
lists on wikipedia-en
  • Loading branch information
uriesk committed Jan 4, 2023
1 parent cf6b0b4 commit 8d678f5
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 12 deletions.
9 changes: 8 additions & 1 deletion src/Dump.ts
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,14 @@ export class Dump {
}
if (!withoutSelection && !this.opts.filenamePrefix) {
if (this.opts.articleList) {
radical += `_${pathParser.basename(this.opts.articleList).toLowerCase().replace(/\.\w{3}$/, '')}`;
let filenamePostfix = pathParser.basename(this.opts.articleList)
.toLowerCase()
.replace(/\.\w{3}$/, '')
.replace(/[\,\s]/g, '_');
if (filenamePostfix.length > 50) {
filenamePostfix = filenamePostfix.slice(0, 50);
}
radical += `_${filenamePostfix}`;
} else {
radical += '_all';
}
Expand Down
27 changes: 16 additions & 11 deletions src/mwoffliner.lib.ts
Original file line number Diff line number Diff line change
Expand Up @@ -295,24 +295,27 @@ async function execute(argv: any) {
/* GET CONTENT ********************* */
/* ********************************* */

let articleListLines: string[];
if (articleList) {
let articleListToIgnoreLines: string[];
if (articleListToIgnore) {
try {
articleListLines = await readFileOrUrlByLine(articleList);
logger.info(`ArticleList has [${articleListLines.length}] items`);
articleListToIgnoreLines = await readFileOrUrlByLine(articleListToIgnore);
logger.info(`ArticleListToIgnore has [${articleListToIgnoreLines.length}] items`);
} catch (err) {
logger.error(`Failed to read articleList from [${articleList}]`, err);
logger.error(`Failed to read articleListToIgnore from [${articleListToIgnore}]`, err);
throw err;
}
}

let articleListToIgnoreLines: string[];
if (articleListToIgnore) {
let articleListLines: string[];
if (articleList) {
try {
articleListToIgnoreLines = await readFileOrUrlByLine(articleListToIgnore);
logger.info(`ArticleListToIgnore has [${articleListToIgnoreLines.length}] items`);
articleListLines = await readFileOrUrlByLine(articleList);
if (articleListToIgnore) {
// Compare against the parsed ignore entries, not the raw option string:
// calling `.includes` on the string would do substring matching on the
// path/URL/comma list itself instead of exact title matching.
const ignoredTitles = new Set(articleListToIgnoreLines);
articleListLines = articleListLines.filter((title: string) => !ignoredTitles.has(title));
}
logger.info(`ArticleList has [${articleListLines.length}] items`);
} catch (err) {
logger.error(`Failed to read articleListToIgnore from [${articleListToIgnore}]`, err);
logger.error(`Failed to read articleList from [${articleList}]`, err);
throw err;
}
}
Expand Down Expand Up @@ -554,7 +557,9 @@ async function execute(argv: any) {
}

if (!fs.existsSync(resourcePath)) {
return resourcePath.split(',').filter((part) => part !== '');
// Treat the value as an inline comma-separated list. Trim before filtering
// so whitespace-only entries (e.g. "a, ,b" or a trailing ", ") are dropped
// instead of surviving as empty titles.
return resourcePath.split(',')
.map((part) => part.trim())
.filter((part) => part !== '');
}

let fileLines: string[];
Expand Down
54 changes: 54 additions & 0 deletions test/e2e/articleLists.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import test from 'blue-tape';
import { execute } from '../../src/mwoffliner.lib';
import execa from 'execa';
import rimraf from 'rimraf';
import { zimcheckAvailable, zimcheck } from 'test/util';
import 'dotenv/config';

// Timestamp-based identifier; it doubles as the output directory name so each
// run writes to its own folder.
const now = new Date();
const testId = 'mwo-test-' + now.getTime();

// Comma-separated article titles to request, and the titles to subtract from
// them (note the space after the comma in the ignore list).
const articleList = 'Kiwix,Wikipedia,Internet,Real-time computer graphics';
const articleListToIgnore = 'Wikipedia, Internet';

// Number of articles expected to remain once the ignored titles are removed.
const listMinusIgnore = 2;

// Options handed to `execute` by the test below.
const parameters = {
  mwUrl: 'https://en.wikipedia.org',
  adminEmail: 'test@kiwix.org',
  articleList: articleList,
  articleListToIgnore: articleListToIgnore,
  outputDirectory: testId,
  redis: process.env.REDIS,
  format: ['nopic'],
};

/**
 * End-to-end check for combining `articleList` with `articleListToIgnore`.
 *
 * Flushes Redis, runs `execute` with the parameters above, and expects a single
 * output whose successful-article count equals `listMinusIgnore` with no
 * failures. If zimcheck is available it is run against the produced file.
 * Finally the output directory is removed and Redis is expected to be empty.
 */
test('articleList and articleListIgnore check', async (t) => {
  await execa.command(`redis-cli flushall`);

  const outFiles = await execute(parameters);

  t.equal(outFiles.length, 1, `Created 1 output`);

  for (const dump of outFiles) {
    if (dump.nopic) {
      // `t.equal` reports actual vs. expected on failure, unlike `t.ok(a === b)`.
      t.equal(dump.status.articles.success, listMinusIgnore, 'Output has right amount of articles');
      t.equal(dump.status.articles.fail, 0, 'Output has no failed article');
    }
  }

  t.ok(true, 'Scraped selected articles from wikipedia en');

  if (await zimcheckAvailable()) {
    try {
      await zimcheck(outFiles[0].outFile);
      t.ok(true, `Zimcheck passes`);
    } catch (err) {
      // Surface the underlying error instead of discarding it.
      t.fail(`Zimcheck passes (${err instanceof Error ? err.message : String(err)})`);
    }
  } else {
    console.log(`Zimcheck not installed, skipping test`);
  }

  rimraf.sync(`./${testId}`);
  const redisScan = await execa.command(`redis-cli --scan`);
  t.equal(redisScan.stdout, '', 'Redis has been cleared');
});

0 comments on commit 8d678f5

Please sign in to comment.