@@ -14,6 +14,10 @@ const warcio = require("warcio");
14
14
const Redis = require ( "ioredis" ) ;
15
15
16
16
const TextExtract = require ( "./textextract" ) ;
17
+
18
+ const readabilityJs = fs . readFileSync ( "/app/node_modules/@mozilla/readability/Readability-readerable.js" , "utf-8" )
19
+ + fs . readFileSync ( "/app/node_modules/@mozilla/readability/Readability.js" , "utf-8" ) ;
20
+
17
21
const behaviors = fs . readFileSync ( "/app/node_modules/browsertrix-behaviors/dist/behaviors.js" , "utf-8" ) ;
18
22
19
23
const HTML_TYPES = [ "text/html" , "application/xhtml" , "application/xhtml+xml" ] ;
@@ -281,6 +285,12 @@ class Crawler {
281
285
default : false ,
282
286
} ,
283
287
288
+ "readerView" : {
289
+ describe : "If set, apply Mozilla's reader view and add the 'article' object to the pages.jsonl file, see https://github.com/mozilla/readability" ,
290
+ type : "boolean" ,
291
+ default : false ,
292
+ } ,
293
+
284
294
"cwd" : {
285
295
describe : "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()" ,
286
296
type : "string" ,
@@ -571,14 +581,33 @@ class Crawler {
571
581
572
582
573
583
const title = await page . title ( ) ;
574
- let text = "" ;
584
+ let text = null ;
585
+ let article = null ;
586
+
575
587
if ( this . params . text ) {
576
588
const client = await page . target ( ) . createCDPSession ( ) ;
577
589
const result = await client . send ( "DOM.getDocument" , { "depth" : - 1 , "pierce" : true } ) ;
578
590
text = await new TextExtract ( result ) . parseTextFromDom ( ) ;
579
591
}
580
-
581
- await this . writePage ( data . url , title , this . params . text , text ) ;
592
+
593
+ if ( this . params . readerView ) {
594
+ article = { } ;
595
+ try {
596
+ // Note: DOM tree is cloned to avoid side effects
597
+ // because it is modified by @mozilla /readability
598
+ await page . exposeFunction ( "readabilityLog" , ( msg ) => console . log ( msg ) ) ;
599
+ article = await page . evaluate ( `${ readabilityJs } ;\n(async () => {
600
+ if (isProbablyReaderable(document)) {
601
+ return await new Readability(document.cloneNode(true)).parse();
602
+ } else {
603
+ readabilityLog("Not readerable: " + document.URL);
604
+ }})();` ) ;
605
+ } catch ( e ) {
606
+ console . log ( "Error applying reader view:" , e ) ;
607
+ }
608
+ }
609
+
610
+ await this . writePage ( data . url , title , text , article ) ;
582
611
583
612
if ( this . behaviorOpts ) {
584
613
await Promise . allSettled ( page . frames ( ) . map ( frame => frame . evaluate ( "self.__bx_behaviors.run();" ) ) ) ;
@@ -792,14 +821,20 @@ class Crawler {
792
821
793
822
if ( createNew ) {
794
823
const header = { "format" : "json-pages-1.0" , "id" : "pages" , "title" : "All Pages" } ;
824
+ header [ "hasText" ] = this . params . text ;
825
+ header [ "hasReaderView" ] = this . params . readerView ;
826
+ let msg = "creating pages " ;
795
827
if ( this . params . text ) {
796
- console . log ( "creating pages with full text" ) ;
797
- header [ "hasText" ] = true ;
798
- }
799
- else {
800
- console . log ( "creating pages without full text" ) ;
801
- header [ "hasText" ] = false ;
828
+ msg += "with full text" ;
829
+ if ( this . params . readerView ) {
830
+ msg += " and reader view" ;
831
+ }
832
+ } else if ( this . params . readerView ) {
833
+ msg += "with reader view" ;
834
+ } else {
835
+ msg += "without full text or reader view" ;
802
836
}
837
+ console . log ( msg ) ;
803
838
const header_formatted = JSON . stringify ( header ) . concat ( "\n" ) ;
804
839
await this . pagesFH . writeFile ( header_formatted ) ;
805
840
}
@@ -809,14 +844,18 @@ class Crawler {
809
844
}
810
845
}
811
846
812
- async writePage ( url , title , text , text_content ) {
847
+ async writePage ( url , title , text , article ) {
813
848
const id = uuidv4 ( ) ;
814
849
const row = { "id" : id , "url" : url , "title" : title } ;
815
850
816
- if ( text == true ) {
817
- row [ "text" ] = text_content ;
851
+ if ( text ) {
852
+ row [ "text" ] = text ;
818
853
}
819
-
854
+
855
+ if ( article ) {
856
+ row [ "article" ] = article ;
857
+ }
858
+
820
859
const processedRow = JSON . stringify ( row ) . concat ( "\n" ) ;
821
860
try {
822
861
this . pagesFH . writeFile ( processedRow ) ;
0 commit comments