From 2acffd65b16ece93949cafaacbfd7706809c12a0 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian.nagel@uni-konstanz.de>
Date: Tue, 13 Apr 2021 18:02:45 +0200
Subject: [PATCH 1/3] Integrate Mozilla's Readability.js - see
 https://github.com/mozilla/readability - if enabled (command-line flag
 --readerView): - remove boilerplate from text and HTML - (if available)
 extract article metadata (author, etc.) - add readable 'article' object to
 page records in pages.jsonl

---
 crawler.js   | 65 +++++++++++++++++++++++++++++++++++++++++-----------
 package.json |  3 ++-
 2 files changed, 54 insertions(+), 14 deletions(-)

diff --git a/crawler.js b/crawler.js
index 9d8de9688..368b4d0ab 100644
--- a/crawler.js
+++ b/crawler.js
@@ -14,6 +14,10 @@ const warcio = require("warcio");
 const Redis = require("ioredis");
 
 const TextExtract = require("./textextract");
+
+const readabilityJs = fs.readFileSync("/app/node_modules/@mozilla/readability/Readability-readerable.js", "utf-8")
+  + fs.readFileSync("/app/node_modules/@mozilla/readability/Readability.js", "utf-8");
+
 const behaviors = fs.readFileSync("/app/node_modules/browsertrix-behaviors/dist/behaviors.js", "utf-8");
 
 const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
@@ -281,6 +285,12 @@ class Crawler {
         default: false,
       },
       
+      "readerView": {
+        describe: "If set, apply Mozilla's reader view and add the 'article' object to the pages.jsonl file, see https://github.com/mozilla/readability",
+        type: "boolean",
+        default: false,
+      },
+
       "cwd": {
         describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
         type: "string",
@@ -571,14 +581,33 @@ class Crawler {
       
       
       const title = await page.title();
-      let text = "";
+      let text = null;
+      let article = null;
+
       if (this.params.text) {
         const client = await page.target().createCDPSession();
         const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true});
         text = await new TextExtract(result).parseTextFromDom();
       }
-    
-      await this.writePage(data.url, title, this.params.text, text);
+
+      if (this.params.readerView) {
+        article = {};
+        try {
+          // Note: DOM tree is cloned to avoid side effects
+          // because it is modified by @mozilla/readability
+          await page.exposeFunction("readabilityLog", (msg) => console.log(msg));
+          article = await page.evaluate(`${readabilityJs};\n(async () => {
+            if (isProbablyReaderable(document)) {
+              return await new Readability(document.cloneNode(true)).parse();
+            } else {
+              readabilityLog("Not readerable: " + document.URL);
+            }})();`);
+        } catch(e) {
+          console.log("Error applying reader view:", e);
+        }
+      }
+
+      await this.writePage(data.url, title, text, article);
 
       if (this.behaviorOpts) {
         await Promise.allSettled(page.frames().map(frame => frame.evaluate("self.__bx_behaviors.run();")));
@@ -792,14 +821,20 @@ class Crawler {
 
       if (createNew) {
         const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"};
+        header["hasText"] = this.params.text;
+        header["hasReaderView"] = this.params.readerView;
+        let msg = "creating pages ";
         if (this.params.text) {
-          console.log("creating pages with full text");
-          header["hasText"] = true;
-        }
-        else{
-          console.log("creating pages without full text");
-          header["hasText"] = false;
+          msg += "with full text";
+          if (this.params.readerView) {
+            msg += " and reader view";
+          }
+        } else if (this.params.readerView) {
+          msg += "with reader view";
+        } else {
+          msg += "without full text or reader view";
         }
+        console.log(msg);
         const header_formatted = JSON.stringify(header).concat("\n");
         await this.pagesFH.writeFile(header_formatted);
       }
@@ -809,14 +844,18 @@ class Crawler {
     }
   }
 
-  async writePage(url, title, text, text_content){
+  async writePage(url, title, text, article){
     const id = uuidv4();
     const row = {"id": id, "url": url, "title": title};
 
-    if (text == true){
-      row["text"] = text_content;
+    if (text) {
+      row["text"] = text;
     }
-    
+
+    if (article) {
+      row["article"] = article;
+    }
+
     const processedRow = JSON.stringify(row).concat("\n");
     try {
       this.pagesFH.writeFile(processedRow);
diff --git a/package.json b/package.json
index 124e1b899..86c0ed432 100644
--- a/package.json
+++ b/package.json
@@ -18,7 +18,8 @@
     "sitemapper": "^3.1.2",
     "uuid": "8.3.2",
     "ws": "^7.4.4",
-    "yargs": "^16.0.3"
+    "yargs": "^16.0.3",
+    "@mozilla/readability": "^0.4.1"
   },
   "devDependencies": {
     "eslint": "^7.20.0",

From b0050c293d5682146e2e62f3f8a51c3bf29d4154 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian.nagel@uni-konstanz.de>
Date: Tue, 22 Jun 2021 16:12:33 +0200
Subject: [PATCH 2/3] Integrate Mozilla's Readability.js - indicate in header
 "textSource" where the text extract can be taken from (via readability or
 via DOM dump)

---
 crawler.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crawler.js b/crawler.js
index 368b4d0ab..252418c7a 100644
--- a/crawler.js
+++ b/crawler.js
@@ -822,7 +822,7 @@ class Crawler {
       if (createNew) {
         const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"};
         header["hasText"] = this.params.text;
-        header["hasReaderView"] = this.params.readerView;
+        header["textSource"] = (this.params.readerView ? "readability" : "browser-dom");
         let msg = "creating pages ";
         if (this.params.text) {
           msg += "with full text";

From 7ab6709107c1fb2d1e8c827ef7d66bfea4f792c6 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel <sebastian.nagel@uni-konstanz.de>
Date: Tue, 22 Jun 2021 17:55:13 +0200
Subject: [PATCH 3/3] Integrate Mozilla's Readability.js - add unit test
 reading https://www.iana.org/about

---
 tests/mozilla_readability_test.js | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 tests/mozilla_readability_test.js

diff --git a/tests/mozilla_readability_test.js b/tests/mozilla_readability_test.js
new file mode 100644
index 000000000..06f3dca66
--- /dev/null
+++ b/tests/mozilla_readability_test.js
@@ -0,0 +1,23 @@
+const util = require("util");
+const exec = util.promisify(require("child_process").exec);
+const fs = require("fs");
+
+// Crawls one page with --text and --readerView, then checks its record in pages.jsonl.
+// The 30s timeout is passed as test()'s third argument so it applies to this test itself.
+test("verify that Mozilla's Readibility.js extracts a boilerplate-free text", async () => {
+  try {
+    await exec("docker-compose run crawler crawl --collection readibilitytest --url https://www.iana.org/about --timeout 10000 --text --readerView --limit 1");
+  }
+  catch (error) {
+    console.log(error);
+  }
+
+  const page = JSON.parse(fs.readFileSync("crawls/collections/readibilitytest/pages/pages.jsonl",
+    "utf8").split("\n")[1]);
+  console.log("title:", page.article.title, "\nexcerpt:", page.article.excerpt);
+
+  // test whether excerpt is present
+  expect(page.article.excerpt.length > 0).toBe(true);
+  // test whether boilerplate-free text is shorter than DOM-constructed text
+  expect(page.article.textContent.length < page.text.length).toBe(true);
+}, 30000);