From 2acffd65b16ece93949cafaacbfd7706809c12a0 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 13 Apr 2021 18:02:45 +0200 Subject: [PATCH] Integrate Mozilla's Readability.js - see https://github.com/mozilla/readability - if enabled (command-line flag --readerView): - remove boilerplate from text and HTML - (if available) extract article metadata (author, etc.) - add readable 'article' object to page records in pages.jsonl --- crawler.js | 65 +++++++++++++++++++++++++++++++++++++++++----------- package.json | 3 ++- 2 files changed, 54 insertions(+), 14 deletions(-) diff --git a/crawler.js b/crawler.js index 9d8de9688..368b4d0ab 100644 --- a/crawler.js +++ b/crawler.js @@ -14,6 +14,10 @@ const warcio = require("warcio"); const Redis = require("ioredis"); const TextExtract = require("./textextract"); + +const readabilityJs = fs.readFileSync("/app/node_modules/@mozilla/readability/Readability-readerable.js", "utf-8") + + fs.readFileSync("/app/node_modules/@mozilla/readability/Readability.js", "utf-8"); + const behaviors = fs.readFileSync("/app/node_modules/browsertrix-behaviors/dist/behaviors.js", "utf-8"); const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"]; @@ -281,6 +285,12 @@ class Crawler { default: false, }, + "readerView": { + describe: "If set, apply Mozilla's reader view and add the 'article' object to the pages.jsonl file, see https://github.com/mozilla/readability", + type: "boolean", + default: false, + }, + "cwd": { describe: "Crawl working directory for captures (pywb root). 
If not set, defaults to process.cwd()", type: "string", @@ -571,14 +581,33 @@ class Crawler { const title = await page.title(); - let text = ""; + let text = null; + let article = null; + if (this.params.text) { const client = await page.target().createCDPSession(); const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true}); text = await new TextExtract(result).parseTextFromDom(); } - - await this.writePage(data.url, title, this.params.text, text); + + if (this.params.readerView) { + article = {}; + try { + // Note: DOM tree is cloned to avoid side effects + // because it is modified by @mozilla/readability + await page.exposeFunction("readabilityLog", (msg) => console.log(msg)); + article = await page.evaluate(`${readabilityJs};\n(async () => { + if (isProbablyReaderable(document)) { + return await new Readability(document.cloneNode(true)).parse(); + } else { + readabilityLog("Not readerable: " + document.URL); + }})();`); + } catch(e) { + console.log("Error applying reader view:", e); + } + } + + await this.writePage(data.url, title, text, article); if (this.behaviorOpts) { await Promise.allSettled(page.frames().map(frame => frame.evaluate("self.__bx_behaviors.run();"))); @@ -792,14 +821,20 @@ class Crawler { if (createNew) { const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}; + header["hasText"] = this.params.text; + header["hasReaderView"] = this.params.readerView; + let msg = "creating pages "; if (this.params.text) { - console.log("creating pages with full text"); - header["hasText"] = true; - } - else{ - console.log("creating pages without full text"); - header["hasText"] = false; + msg += "with full text"; + if (this.params.readerView) { + msg += " and reader view"; + } + } else if (this.params.readerView) { + msg += "with reader view"; + } else { + msg += "without full text or reader view"; } + console.log(msg); const header_formatted = JSON.stringify(header).concat("\n"); await 
this.pagesFH.writeFile(header_formatted); } @@ -809,14 +844,18 @@ class Crawler { } } - async writePage(url, title, text, text_content){ + async writePage(url, title, text, article){ const id = uuidv4(); const row = {"id": id, "url": url, "title": title}; - if (text == true){ - row["text"] = text_content; + if (text) { + row["text"] = text; } - + + if (article) { + row["article"] = article; + } + const processedRow = JSON.stringify(row).concat("\n"); try { this.pagesFH.writeFile(processedRow); diff --git a/package.json b/package.json index 124e1b899..86c0ed432 100644 --- a/package.json +++ b/package.json @@ -18,7 +18,8 @@ "sitemapper": "^3.1.2", "uuid": "8.3.2", "ws": "^7.4.4", - "yargs": "^16.0.3" + "yargs": "^16.0.3", + "@mozilla/readability": "^0.4.1" }, "devDependencies": { "eslint": "^7.20.0",