diff --git a/crawler.js b/crawler.js index 9d8de9688..252418c7a 100644 --- a/crawler.js +++ b/crawler.js @@ -14,6 +14,10 @@ const warcio = require("warcio"); const Redis = require("ioredis"); const TextExtract = require("./textextract"); + +const readabilityJs = fs.readFileSync("/app/node_modules/@mozilla/readability/Readability-readerable.js", "utf-8") + + fs.readFileSync("/app/node_modules/@mozilla/readability/Readability.js", "utf-8"); + const behaviors = fs.readFileSync("/app/node_modules/browsertrix-behaviors/dist/behaviors.js", "utf-8"); const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"]; @@ -281,6 +285,12 @@ class Crawler { default: false, }, + "readerView": { + describe: "If set, apply Mozilla's reader view and add the 'article' object to the pages.jsonl file, see https://github.com/mozilla/readability", + type: "boolean", + default: false, + }, + "cwd": { describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()", type: "string", @@ -571,14 +581,33 @@ class Crawler { const title = await page.title(); - let text = ""; + let text = null; + let article = null; + if (this.params.text) { const client = await page.target().createCDPSession(); const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true}); text = await new TextExtract(result).parseTextFromDom(); } - - await this.writePage(data.url, title, this.params.text, text); + + if (this.params.readerView) { + article = {}; + try { + // Note: DOM tree is cloned to avoid side effects + // because it is modified by @mozilla/readability + await page.exposeFunction("readabilityLog", (msg) => console.log(msg)); + article = await page.evaluate(`${readabilityJs};\n(async () => { + if (isProbablyReaderable(document)) { + return await new Readability(document.cloneNode(true)).parse(); + } else { + readabilityLog("Not readerable: " + document.URL); + }})();`); + } catch(e) { + console.log("Error applying reader view:", e); + } + } 
+ + await this.writePage(data.url, title, text, article); if (this.behaviorOpts) { await Promise.allSettled(page.frames().map(frame => frame.evaluate("self.__bx_behaviors.run();"))); @@ -792,14 +821,20 @@ class Crawler { if (createNew) { const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}; + header["hasText"] = this.params.text; + header["textSource"] = (this.params.readerView ? "readability" : "browser-dom"); + let msg = "creating pages "; if (this.params.text) { - console.log("creating pages with full text"); - header["hasText"] = true; - } - else{ - console.log("creating pages without full text"); - header["hasText"] = false; + msg += "with full text"; + if (this.params.readerView) { + msg += " and reader view"; + } + } else if (this.params.readerView) { + msg += "with reader view"; + } else { + msg += "without full text or reader view"; } + console.log(msg); const header_formatted = JSON.stringify(header).concat("\n"); await this.pagesFH.writeFile(header_formatted); } @@ -809,14 +844,18 @@ class Crawler { } } - async writePage(url, title, text, text_content){ + async writePage(url, title, text, article){ const id = uuidv4(); const row = {"id": id, "url": url, "title": title}; - if (text == true){ - row["text"] = text_content; + if (text) { + row["text"] = text; } - + + if (article) { + row["article"] = article; + } + const processedRow = JSON.stringify(row).concat("\n"); try { this.pagesFH.writeFile(processedRow); diff --git a/package.json b/package.json index 124e1b899..86c0ed432 100644 --- a/package.json +++ b/package.json @@ -18,7 +18,8 @@ "sitemapper": "^3.1.2", "uuid": "8.3.2", "ws": "^7.4.4", - "yargs": "^16.0.3" + "yargs": "^16.0.3", + "@mozilla/readability": "^0.4.1" }, "devDependencies": { "eslint": "^7.20.0", diff --git a/tests/mozilla_readability_test.js b/tests/mozilla_readability_test.js new file mode 100644 index 000000000..06f3dca66 --- /dev/null +++ b/tests/mozilla_readability_test.js @@ -0,0 +1,23 @@ +const util = 
require("util"); +const exec = util.promisify(require("child_process").exec); +const fs = require("fs"); + +test("verify that Mozilla's Readability.js extracts a boilerplate-free text", async () => { + jest.setTimeout(30000); + + try { + await exec("docker-compose run crawler crawl --collection readabilitytest --url https://www.iana.org/about --timeout 10000 --text --readerView --limit 1"); + } + catch (error) { + console.log(error); + } + + const page = JSON.parse(fs.readFileSync("crawls/collections/readabilitytest/pages/pages.jsonl", + "utf8").split("\n")[1]); + console.log("title:", page.article.title, "\nexcerpt:", page.article.excerpt); + + // test whether excerpt is present + expect(page.article.excerpt.length > 0).toBe(true); + // test whether boilerplate-free text is shorter than DOM-constructed text + expect(page.article.textContent.length < page.text.length).toBe(true); +});