Skip to content

Commit 2acffd6

Browse files
Integrate Mozilla's Readability.js
- see https://github.com/mozilla/readability - if enabled (command-line flag --readerView): - remove boilerplate from text and HTML - (if available) extract article metadata (author, etc.) - add readable 'article' object to page records in pages.jsonl
1 parent ae4ce97 commit 2acffd6

File tree

2 files changed

+54
-14
lines changed

2 files changed

+54
-14
lines changed

crawler.js

+52-13
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@ const warcio = require("warcio");
1414
const Redis = require("ioredis");
1515

1616
const TextExtract = require("./textextract");
17+
18+
const readabilityJs = fs.readFileSync("/app/node_modules/@mozilla/readability/Readability-readerable.js", "utf-8")
19+
+ fs.readFileSync("/app/node_modules/@mozilla/readability/Readability.js", "utf-8");
20+
1721
const behaviors = fs.readFileSync("/app/node_modules/browsertrix-behaviors/dist/behaviors.js", "utf-8");
1822

1923
const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
@@ -281,6 +285,12 @@ class Crawler {
281285
default: false,
282286
},
283287

288+
"readerView": {
289+
describe: "If set, apply Mozilla's reader view and add the 'article' object to the pages.jsonl file, see https://github.com/mozilla/readability",
290+
type: "boolean",
291+
default: false,
292+
},
293+
284294
"cwd": {
285295
describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
286296
type: "string",
@@ -571,14 +581,33 @@ class Crawler {
571581

572582

573583
const title = await page.title();
574-
let text = "";
584+
let text = null;
585+
let article = null;
586+
575587
if (this.params.text) {
576588
const client = await page.target().createCDPSession();
577589
const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true});
578590
text = await new TextExtract(result).parseTextFromDom();
579591
}
580-
581-
await this.writePage(data.url, title, this.params.text, text);
592+
593+
if (this.params.readerView) {
594+
article = {};
595+
try {
596+
// Note: DOM tree is cloned to avoid side effects
597+
// because it is modified by @mozilla/readability
598+
await page.exposeFunction("readabilityLog", (msg) => console.log(msg));
599+
article = await page.evaluate(`${readabilityJs};\n(async () => {
600+
if (isProbablyReaderable(document)) {
601+
return await new Readability(document.cloneNode(true)).parse();
602+
} else {
603+
readabilityLog("Not readerable: " + document.URL);
604+
}})();`);
605+
} catch(e) {
606+
console.log("Error applying reader view:", e);
607+
}
608+
}
609+
610+
await this.writePage(data.url, title, text, article);
582611

583612
if (this.behaviorOpts) {
584613
await Promise.allSettled(page.frames().map(frame => frame.evaluate("self.__bx_behaviors.run();")));
@@ -792,14 +821,20 @@ class Crawler {
792821

793822
if (createNew) {
794823
const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"};
824+
header["hasText"] = this.params.text;
825+
header["hasReaderView"] = this.params.readerView;
826+
let msg = "creating pages ";
795827
if (this.params.text) {
796-
console.log("creating pages with full text");
797-
header["hasText"] = true;
798-
}
799-
else{
800-
console.log("creating pages without full text");
801-
header["hasText"] = false;
828+
msg += "with full text";
829+
if (this.params.readerView) {
830+
msg += " and reader view";
831+
}
832+
} else if (this.params.readerView) {
833+
msg += "with reader view";
834+
} else {
835+
msg += "without full text or reader view";
802836
}
837+
console.log(msg);
803838
const header_formatted = JSON.stringify(header).concat("\n");
804839
await this.pagesFH.writeFile(header_formatted);
805840
}
@@ -809,14 +844,18 @@ class Crawler {
809844
}
810845
}
811846

812-
async writePage(url, title, text, text_content){
847+
async writePage(url, title, text, article){
813848
const id = uuidv4();
814849
const row = {"id": id, "url": url, "title": title};
815850

816-
if (text == true){
817-
row["text"] = text_content;
851+
if (text) {
852+
row["text"] = text;
818853
}
819-
854+
855+
if (article) {
856+
row["article"] = article;
857+
}
858+
820859
const processedRow = JSON.stringify(row).concat("\n");
821860
try {
822861
this.pagesFH.writeFile(processedRow);

package.json

+2-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@
1818
"sitemapper": "^3.1.2",
1919
"uuid": "8.3.2",
2020
"ws": "^7.4.4",
21-
"yargs": "^16.0.3"
21+
"yargs": "^16.0.3",
22+
"@mozilla/readability": "^0.4.1"
2223
},
2324
"devDependencies": {
2425
"eslint": "^7.20.0",

0 commit comments

Comments
 (0)