Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrate Mozilla's Readability.js #42

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 52 additions & 13 deletions crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ const warcio = require("warcio");
const Redis = require("ioredis");

const TextExtract = require("./textextract");

// Source text of two @mozilla/readability scripts, read synchronously when this
// module loads and joined into one string: Readability-readerable.js first,
// then Readability.js.
// NOTE(review): the absolute /app/node_modules path presumably matches the
// container image layout — confirm before running outside of it.
const readabilityJs = fs.readFileSync("/app/node_modules/@mozilla/readability/Readability-readerable.js", "utf-8")
+ fs.readFileSync("/app/node_modules/@mozilla/readability/Readability.js", "utf-8");

const behaviors = fs.readFileSync("/app/node_modules/browsertrix-behaviors/dist/behaviors.js", "utf-8");

const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
Expand Down Expand Up @@ -281,6 +285,12 @@ class Crawler {
default: false,
},

"readerView": {
describe: "If set, apply Mozilla's reader view and add the 'article' object to the pages.jsonl file, see https://github.com/mozilla/readability",
type: "boolean",
default: false,
},

"cwd": {
describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
type: "string",
Expand Down Expand Up @@ -571,14 +581,33 @@ class Crawler {


const title = await page.title();
let text = "";
let text = null;
let article = null;

if (this.params.text) {
const client = await page.target().createCDPSession();
const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true});
text = await new TextExtract(result).parseTextFromDom();
}

await this.writePage(data.url, title, this.params.text, text);

if (this.params.readerView) {
article = {};
try {
// Note: DOM tree is cloned to avoid side effects
// because it is modified by @mozilla/readability
await page.exposeFunction("readabilityLog", (msg) => console.log(msg));
article = await page.evaluate(`${readabilityJs};\n(async () => {
if (isProbablyReaderable(document)) {
return await new Readability(document.cloneNode(true)).parse();
} else {
readabilityLog("Not readerable: " + document.URL);
}})();`);
} catch(e) {
console.log("Error applying reader view:", e);
}
}

await this.writePage(data.url, title, text, article);

if (this.behaviorOpts) {
await Promise.allSettled(page.frames().map(frame => frame.evaluate("self.__bx_behaviors.run();")));
Expand Down Expand Up @@ -792,14 +821,20 @@ class Crawler {

if (createNew) {
const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"};
header["hasText"] = this.params.text;
header["textSource"] = (this.params.readerView ? "readability" : "browser-dom");
let msg = "creating pages ";
if (this.params.text) {
console.log("creating pages with full text");
header["hasText"] = true;
}
else{
console.log("creating pages without full text");
header["hasText"] = false;
msg += "with full text";
if (this.params.readerView) {
msg += " and reader view";
}
} else if (this.params.readerView) {
msg += "with reader view";
} else {
msg += "without full text or reader view";
}
console.log(msg);
const header_formatted = JSON.stringify(header).concat("\n");
await this.pagesFH.writeFile(header_formatted);
}
Expand All @@ -809,14 +844,18 @@ class Crawler {
}
}

async writePage(url, title, text, text_content){
async writePage(url, title, text, article){
const id = uuidv4();
const row = {"id": id, "url": url, "title": title};

if (text == true){
row["text"] = text_content;
if (text) {
row["text"] = text;
}


if (article) {
row["article"] = article;
}

const processedRow = JSON.stringify(row).concat("\n");
try {
this.pagesFH.writeFile(processedRow);
Expand Down
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
"sitemapper": "^3.1.2",
"uuid": "8.3.2",
"ws": "^7.4.4",
"yargs": "^16.0.3"
"yargs": "^16.0.3",
"@mozilla/readability": "^0.4.1"
},
"devDependencies": {
"eslint": "^7.20.0",
Expand Down
23 changes: 23 additions & 0 deletions tests/mozilla_readability_test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
const util = require("util");
const exec = util.promisify(require("child_process").exec);
const fs = require("fs");

// Per-test timeout in milliseconds. It is handed to test() as its third
// argument: calling jest.setTimeout() from inside the test body is too late to
// change the timeout of the test that is already running.
const TEST_TIMEOUT_MS = 30000;

// Location of the page index written by the crawl for this test's collection.
const PAGES_JSONL = "crawls/collections/readibilitytest/pages/pages.jsonl";

/**
 * End-to-end check of the --readerView option: runs a one-page crawl with both
 * --text and --readerView, then inspects the first page record in pages.jsonl.
 */
test("verify that Mozilla's Readibility.js extracts a boilerplate-free text", async () => {
  try {
    await exec("docker-compose run crawler crawl --collection readibilitytest --url https://www.iana.org/about --timeout 10000 --text --readerView --limit 1");
  } catch (error) {
    // A failing crawl command is logged rather than rethrown; the assertions
    // below then decide the outcome from whatever pages.jsonl contains.
    console.log(error);
  }

  // Line 0 of pages.jsonl is the format header; line 1 is the first page record.
  const lines = fs.readFileSync(PAGES_JSONL, "utf8").split("\n");
  const page = JSON.parse(lines[1]);
  console.log("title:", page.article.title, "\nexcerpt:", page.article.excerpt);

  // test whether excerpt is present
  expect(page.article.excerpt.length).toBeGreaterThan(0);
  // test whether boilerplate-free text is shorter than DOM-constructed text
  expect(page.article.textContent.length).toBeLessThan(page.text.length);
}, TEST_TIMEOUT_MS);