From 2acffd65b16ece93949cafaacbfd7706809c12a0 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 13 Apr 2021 18:02:45 +0200 Subject: [PATCH 1/3] Integrate Mozilla's Readability.js - see https://github.com/mozilla/readability - if enabled (command-line flag --readerView): - remove boilerplate from text and HTML - (if available) extract article metadata (author, etc.) - add readable 'article' object to page records in pages.jsonl --- crawler.js | 65 +++++++++++++++++++++++++++++++++++++++++----------- package.json | 3 ++- 2 files changed, 54 insertions(+), 14 deletions(-) diff --git a/crawler.js b/crawler.js index 9d8de9688..368b4d0ab 100644 --- a/crawler.js +++ b/crawler.js @@ -14,6 +14,10 @@ const warcio = require("warcio"); const Redis = require("ioredis"); const TextExtract = require("./textextract"); + +const readabilityJs = fs.readFileSync("/app/node_modules/@mozilla/readability/Readability-readerable.js", "utf-8") + + fs.readFileSync("/app/node_modules/@mozilla/readability/Readability.js", "utf-8"); + const behaviors = fs.readFileSync("/app/node_modules/browsertrix-behaviors/dist/behaviors.js", "utf-8"); const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"]; @@ -281,6 +285,12 @@ class Crawler { default: false, }, + "readerView": { + describe: "If set, apply Mozilla's reader view and add the 'article' object to the pages.jsonl file, see https://github.com/mozilla/readability", + type: "boolean", + default: false, + }, + "cwd": { describe: "Crawl working directory for captures (pywb root). 
If not set, defaults to process.cwd()", type: "string", @@ -571,14 +581,33 @@ class Crawler { const title = await page.title(); - let text = ""; + let text = null; + let article = null; + if (this.params.text) { const client = await page.target().createCDPSession(); const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true}); text = await new TextExtract(result).parseTextFromDom(); } - - await this.writePage(data.url, title, this.params.text, text); + + if (this.params.readerView) { + article = {}; + try { + // Note: DOM tree is cloned to avoid side effects + // because it is modified by @mozilla/readability + await page.exposeFunction("readabilityLog", (msg) => console.log(msg)); + article = await page.evaluate(`${readabilityJs};\n(async () => { + if (isProbablyReaderable(document)) { + return await new Readability(document.cloneNode(true)).parse(); + } else { + readabilityLog("Not readerable: " + document.URL); + }})();`); + } catch(e) { + console.log("Error applying reader view:", e); + } + } + + await this.writePage(data.url, title, text, article); if (this.behaviorOpts) { await Promise.allSettled(page.frames().map(frame => frame.evaluate("self.__bx_behaviors.run();"))); @@ -792,14 +821,20 @@ class Crawler { if (createNew) { const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}; + header["hasText"] = this.params.text; + header["hasReaderView"] = this.params.readerView; + let msg = "creating pages "; if (this.params.text) { - console.log("creating pages with full text"); - header["hasText"] = true; - } - else{ - console.log("creating pages without full text"); - header["hasText"] = false; + msg += "with full text"; + if (this.params.readerView) { + msg += " and reader view"; + } + } else if (this.params.readerView) { + msg += "with reader view"; + } else { + msg += "without full text or reader view"; } + console.log(msg); const header_formatted = JSON.stringify(header).concat("\n"); await 
this.pagesFH.writeFile(header_formatted); } @@ -809,14 +844,18 @@ class Crawler { } } - async writePage(url, title, text, text_content){ + async writePage(url, title, text, article){ const id = uuidv4(); const row = {"id": id, "url": url, "title": title}; - if (text == true){ - row["text"] = text_content; + if (text) { + row["text"] = text; } - + + if (article) { + row["article"] = article; + } + const processedRow = JSON.stringify(row).concat("\n"); try { this.pagesFH.writeFile(processedRow); diff --git a/package.json b/package.json index 124e1b899..86c0ed432 100644 --- a/package.json +++ b/package.json @@ -18,7 +18,8 @@ "sitemapper": "^3.1.2", "uuid": "8.3.2", "ws": "^7.4.4", - "yargs": "^16.0.3" + "yargs": "^16.0.3", + "@mozilla/readability": "^0.4.1" }, "devDependencies": { "eslint": "^7.20.0", From b0050c293d5682146e2e62f3f8a51c3bf29d4154 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 22 Jun 2021 16:12:33 +0200 Subject: [PATCH 2/3] Integrate Mozilla's Readability.js - indicate in header "textSource" from where the text extract could be taken (via readability or via DOM dump) --- crawler.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawler.js b/crawler.js index 368b4d0ab..252418c7a 100644 --- a/crawler.js +++ b/crawler.js @@ -822,7 +822,7 @@ class Crawler { if (createNew) { const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}; header["hasText"] = this.params.text; - header["hasReaderView"] = this.params.readerView; + header["textSource"] = (this.params.readerView ? 
"readability" : "browser-dom"); let msg = "creating pages "; if (this.params.text) { msg += "with full text"; From 7ab6709107c1fb2d1e8c827ef7d66bfea4f792c6 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 22 Jun 2021 17:55:13 +0200 Subject: [PATCH 3/3] Integrate Mozilla's Readability.js - add unit test reading https://www.iana.org/about --- tests/mozilla_readability_test.js | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 tests/mozilla_readability_test.js diff --git a/tests/mozilla_readability_test.js b/tests/mozilla_readability_test.js new file mode 100644 index 000000000..06f3dca66 --- /dev/null +++ b/tests/mozilla_readability_test.js @@ -0,0 +1,23 @@ +const util = require("util"); +const exec = util.promisify(require("child_process").exec); +const fs = require("fs"); + +test("verify that Mozilla's Readibility.js extracts a boilerplate-free text", async () => { + jest.setTimeout(30000); + + try { + await exec("docker-compose run crawler crawl --collection readibilitytest --url https://www.iana.org/about --timeout 10000 --text --readerView --limit 1"); + } + catch (error) { + console.log(error); + } + + const page = JSON.parse(fs.readFileSync("crawls/collections/readibilitytest/pages/pages.jsonl", + "utf8").split("\n")[1]); + console.log("title:", page.article.title, "\nexcerpt:", page.article.excerpt); + + // test whether excerpt is present + expect(page.article.excerpt.length > 0).toBe(true); + // test whether boilerplate-free text is shorter than DOM-constructed text + expect(page.article.textContent.length < page.text.length).toBe(true); +});