Skip to content

Commit

Permalink
Add articleListToIgnore argument and support comma separated lists fo…
Browse files Browse the repository at this point in the history
…r it and articleList
  • Loading branch information
uriesk authored and kelson42 committed Jan 5, 2023
1 parent 8fd75b7 commit 5c2eb92
Show file tree
Hide file tree
Showing 7 changed files with 130 additions and 30 deletions.
9 changes: 8 additions & 1 deletion src/Dump.ts
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,14 @@ export class Dump {
}
if (!withoutSelection && !this.opts.filenamePrefix) {
if (this.opts.articleList) {
radical += `_${pathParser.basename(this.opts.articleList).toLowerCase().replace(/\.\w{3}$/, '')}`;
let filenamePostfix = pathParser.basename(this.opts.articleList)
.toLowerCase()
.replace(/\.\w{3}$/, '')
.replace(/[\,\s]/g, '_');
if (filenamePostfix.length > 50) {
filenamePostfix = filenamePostfix.slice(0, 50);
}
radical += `_${filenamePostfix}`;
} else {
radical += '_all';
}
Expand Down
76 changes: 52 additions & 24 deletions src/mwoffliner.lib.ts
Original file line number Diff line number Diff line change
Expand Up @@ -108,19 +108,24 @@ async function execute(argv: any) {
publisher: _publisher,
outputDirectory: _outputDirectory,
addNamespaces: _addNamespaces,
articleList: _articleList,
customZimFavicon: _customZimFavicon,
optimisationCacheUrl,
noLocalParserFallback,
forceLocalParser,
customFlavour
} = argv;

let {
articleList,
articleListToIgnore,
} = argv;

(process as any).verbose = !!verbose;

logger.log(`Starting mwoffliner v${packageJSON.version}...`);

let articleList = _articleList ? String(_articleList) : _articleList;
if (articleList) articleList = String(articleList);
if (articleListToIgnore) articleListToIgnore = String(articleListToIgnore);
const publisher = _publisher || config.defaults.publisher;
let customZimFavicon = _customZimFavicon;

Expand Down Expand Up @@ -290,39 +295,35 @@ async function execute(argv: any) {
/* GET CONTENT ********************* */
/* ********************************* */

if (articleList && articleList.includes('http')) {
let articleListToIgnoreLines: string[];
if (articleListToIgnore) {
try {
const fileName = articleList.split('/').slice(-1)[0];
const tmpArticleListPath = path.join(tmpDirectory, fileName);
logger.log(`Downloading article list from [${articleList}] to [${tmpArticleListPath}]`);
const { data: articleListContentStream } = await axios.get(articleList, downloader.streamRequestOptions);
const articleListWriteStream = fs.createWriteStream(tmpArticleListPath);
await new Promise((resolve, reject) => {
articleListContentStream
.pipe(articleListWriteStream)
.on('error', (err: any) => reject(err))
.on('close', resolve);
});
articleList = tmpArticleListPath;
articleListToIgnoreLines = await readFileOrUrlByLine(articleListToIgnore);
logger.info(`ArticleListToIgnore has [${articleListToIgnoreLines.length}] items`);
} catch (err) {
throw new Error(`Failed to download article list from [${articleList}]`);
logger.error(`Failed to read articleListToIgnore from [${articleListToIgnore}]`, err);
throw err;
}
}

let articleListLines: string[];
try {
articleListLines = articleList ? fs.readFileSync(articleList).toString().split('\n').
map(a => a.replace(/\r/gm, '')).filter((a) => a) : [];
logger.info(`ArticleList has [${articleListLines.length}] items`);
} catch (err) {
logger.error(`Failed to read articleList from [${articleList}]`, err);
throw err;
if (articleList) {
try {
articleListLines = await readFileOrUrlByLine(articleList);
if (articleListToIgnore) {
articleListLines = articleListLines.filter((title: string) => !articleListToIgnoreLines.includes(title));
}
logger.info(`ArticleList has [${articleListLines.length}] items`);
} catch (err) {
logger.error(`Failed to read articleList from [${articleList}]`, err);
throw err;
}
}

await mw.getNamespaces(addNamespaces, downloader);

logger.info(`Getting article ids`);
await getArticleIds(downloader, mw, mainPage, articleList ? articleListLines : null);
await getArticleIds(downloader, mw, mainPage, articleList ? articleListLines : null, articleListToIgnore ? articleListToIgnoreLines : null);
if (mw.getCategories) {
await getCategoriesForArticles(articleDetailXId, downloader, redis);

Expand Down Expand Up @@ -541,6 +542,33 @@ async function execute(argv: any) {
return await saveFavicon(zimCreator, faviconPath);
}

/**
 * Resolve an article-list style argument into an array of titles.
 *
 * `resourcePath` is interpreted as, in order:
 *  1. an http(s) URL: the body is streamed into `tmpDirectory` and then read as a file;
 *  2. a path to an existing local file: read as one title per line;
 *  3. anything else: an inline, comma separated list of titles.
 *
 * Empty entries are dropped in every case. Inline entries are trimmed; file
 * lines only have carriage returns removed.
 *
 * @param resourcePath URL, local file path or comma separated list of titles.
 * @returns The titles, in their original order.
 * @throws Any error raised by the download or by reading the file.
 */
async function readFileOrUrlByLine(resourcePath: string): Promise<string[]> {
  let localPath = resourcePath;

  // Only a real http(s) scheme counts as a URL. A plain substring check would
  // send a title list such as "The http protocol,Internet" to the downloader.
  if (/^https?:\/\//i.test(resourcePath)) {
    const fileName = resourcePath.split('/').slice(-1)[0];
    const { data: contentStream } = await axios.get(resourcePath, downloader.streamRequestOptions);
    localPath = path.join(tmpDirectory, fileName);
    const writeStream = fs.createWriteStream(localPath);
    await new Promise<void>((resolve, reject) => {
      // pipe() does not forward errors from the source to the destination,
      // so both ends need a listener or a broken download would never settle.
      contentStream.on('error', (err: unknown) => reject(err));
      contentStream
        .pipe(writeStream)
        .on('error', (err: unknown) => reject(err))
        .on('close', () => resolve());
    });
  }

  if (!fs.existsSync(localPath)) {
    // Not a file: treat the argument itself as a comma separated list.
    // Trim before filtering so whitespace-only entries ("a, ,b") are dropped too.
    return localPath
      .split(',')
      .map((part) => part.trim())
      .filter((part) => part !== '');
  }

  return fs
    .readFileSync(localPath)
    .toString()
    .split('\n')
    .map((line) => line.replace(/\r/gm, ''))
    .filter((line) => line);
}

function getMainPage(dump: Dump, zimCreator: ZimCreator, downloader: Downloader) {
async function createMainPage() {
logger.log('Creating main page...');
Expand Down
3 changes: 2 additions & 1 deletion src/parameterList.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ export const requiredParams = ['mwUrl', 'adminEmail'];
export const parameterDescriptions = {
mwUrl: 'Mediawiki base URL.',
adminEmail: 'Email of the mwoffliner user which will be put in the HTTP user-agent string',
articleList: 'File with one title (in UTF8) per line. This can be a local path or an HTTP(S) url',
articleList: 'List of articles to include. Can be a comma seperated list of titles or a local path or http(s) URL to a file with one title (in UTF8) per line',
articleListToIgnore: 'List of articles to ignore. Can be a comma seperated list of titles or a local path or http(s) URL to a file with one title (in UTF8) per line',
customZimFavicon: 'Use this option to give a path to a PNG favicon, it will be used in place of the Mediawiki logo. This can be a local path or an HTTP(S) url',
customZimTitle: 'Allow to configure a custom ZIM file title.',
customZimDescription: 'Allow to configure a custom ZIM file description.',
Expand Down
12 changes: 11 additions & 1 deletion src/util/mw-api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -69,13 +69,23 @@ export async function getArticlesByIds(articleIds: string[], downloader: Downloa
);
}

export async function getArticlesByNS(ns: number, downloader: Downloader, continueLimit?: number): Promise<void> {
export async function getArticlesByNS(ns: number, downloader: Downloader, articleIdsToIgnore?: string[], continueLimit?: number): Promise<void> {
let totalArticles = 0;
let chunk: { articleDetails: QueryMwRet, gapContinue: string };

do {
chunk = await downloader.getArticleDetailsNS(ns, chunk && chunk.gapContinue);

if (articleIdsToIgnore) {
Object.keys(chunk.articleDetails).forEach((articleId) => {
const articleTitle = chunk.articleDetails[articleId].title;
if (articleIdsToIgnore.includes(articleTitle)) {
delete chunk.articleDetails[articleId];
logger.info(`Excluded article ${articleTitle}`);
}
})
}

await articleDetailXId.setMany(mwRetToArticleDetail(chunk.articleDetails));

for (const [articleId, articleDetail] of Object.entries(chunk.articleDetails)) {
Expand Down
4 changes: 2 additions & 2 deletions src/util/redirects.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import Downloader from '../Downloader';
import { redirectsXId, articleDetailXId } from '../stores';
import { getArticlesByIds, getArticlesByNS } from './mw-api';

export async function getArticleIds(downloader: Downloader, mw: MediaWiki, mainPage?: string, articleIds?: string[]) {
export async function getArticleIds(downloader: Downloader, mw: MediaWiki, mainPage?: string, articleIds?: string[], articleIdsToIgnore?: string[]) {
if (mainPage) {
await getArticlesByIds([mainPage], downloader);
}
Expand All @@ -15,7 +15,7 @@ export async function getArticleIds(downloader: Downloader, mw: MediaWiki, mainP
await pmap(
mw.namespacesToMirror,
(namespace: string) => {
return getArticlesByNS(mw.namespaces[namespace].num, downloader);
return getArticlesByNS(mw.namespaces[namespace].num, downloader, articleIdsToIgnore);
},
{concurrency: downloader.speed}
);
Expand Down
54 changes: 54 additions & 0 deletions test/e2e/articleLists.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import test from 'blue-tape';
import { execute } from '../../src/mwoffliner.lib';
import execa from 'execa';
import rimraf from 'rimraf';
import { zimcheckAvailable, zimcheck } from 'test/util';
import 'dotenv/config';

// Timestamp-based id so every run writes to its own output directory.
const now = new Date();
const testId = `mwo-test-${now.getTime()}`;

// Inline, comma separated title lists (no file or URL involved).
const articleList = 'Kiwix,Wikipedia,Internet,Real-time computer graphics';
const articleListToIgnore = 'Wikipedia, Internet';

// Four requested titles minus the two ignored ones.
const listMinusIgnore = 2;

// Arguments handed to execute() by the test below.
const parameters = {
  mwUrl: 'https://en.wikipedia.org',
  adminEmail: 'test@kiwix.org',
  articleList: articleList,
  articleListToIgnore: articleListToIgnore,
  outputDirectory: testId,
  redis: process.env.REDIS,
  format: ['nopic'],
};

// End-to-end check: scrape with an inline articleList and articleListToIgnore,
// then verify the article counts, the ZIM file (when zimcheck is available)
// and that redis is empty afterwards.
test('articleList and articleListIgnore check', async (t) => {
  // Start from an empty redis.
  await execa.command('redis-cli flushall');

  const outFiles = await execute(parameters);

  t.equal(outFiles.length, 1, 'Created 1 output');

  for (const dump of outFiles) {
    if (!dump.nopic) {
      continue;
    }
    const { success, fail } = dump.status.articles;
    t.ok(success === listMinusIgnore, 'Output has right amount of articles');
    t.ok(fail === 0, 'Output has no failed article');
  }

  t.ok(true, 'Scraped selected articles from wikipedia en');

  const canRunZimcheck = await zimcheckAvailable();
  if (!canRunZimcheck) {
    console.log('Zimcheck not installed, skipping test');
  } else {
    let zimcheckPassed = true;
    try {
      await zimcheck(outFiles[0].outFile);
    } catch (err) {
      zimcheckPassed = false;
    }
    t.ok(zimcheckPassed, 'Zimcheck passes');
  }

  // Remove the output directory, then confirm redis holds no keys.
  rimraf.sync(`./${testId}`);
  const { stdout: remainingKeys } = await execa.command('redis-cli --scan');
  t.equal(remainingKeys, '', 'Redis has been cleared');
});
2 changes: 1 addition & 1 deletion test/unit/mwApi.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ test('MWApi NS', async (t) => {

await mw.getNamespaces([], downloader);

await getArticlesByNS(0, downloader, 5); // Get 5 continues/pages of NSes
await getArticlesByNS(0, downloader, null, 5); // Get 5 continues/pages of NSes
const interestingAIds = ['"...And_Ladies_of_the_Club"', '"M"_Circle'];
const articles = await articleDetailXId.getMany(interestingAIds);
const Ladies = articles['"...And_Ladies_of_the_Club"'];
Expand Down

0 comments on commit 5c2eb92

Please sign in to comment.