adbar · hugoobauer · Jan 22, 2024 · Jan 24, 2024 · Jan 25, 2024
diff --git a/tests/comparison.py b/tests/comparison.py
@@ -29,7 +29,7 @@
 # from libextract.api import extract as lib_extract
 from newspaper import fulltext
 from newsplease import NewsPlease
-from readabilipy import simple_json_from_html_string
+# from readabilipy import simple_json_from_html_string
 from readability import Document
 from resiliparse.extract.html2text import extract_plain_text
 from resiliparse.parse.encoding import bytes_to_str, detect_encoding
@@ -290,7 +290,7 @@ def run_boilerpipe(htmlstring):
     try:
         content = boilerpipe_extractor.get_content(htmlstring)
         # sanitize(boilerpipe_extractor.get_content(htmlstring))
-    except Exception:
+    except Exception as e:
         #print('Boilerpipe exception:', err)
         content = ''
     return content
@@ -330,21 +330,24 @@ def run_newsplease(htmlstring):
 #    return sanitize(returnstring)
 
 
-def run_readabilipy(htmlstring):
-    '''try with the readability.py module'''
-    try:
-        article = simple_json_from_html_string(htmlstring, use_readability=True)
-        returnlist = [textelem['text'] for textelem in article['plain_text']]
-        return '\n'.join(returnlist) # sanitize(content)
-    except Exception as err:
-        #print('Readabilipy exception:', err)
-        return ''
+# def run_readabilipy(htmlstring):
+#     '''try with the readability.py module'''
+#     try:
+#         article = simple_json_from_html_string(htmlstring, use_readability=True)
+#         returnlist = [textelem['text'] for textelem in article['plain_text']]
+#         return '\n'.join(returnlist) # sanitize(content)
+#     except Exception as err:
+#         #print('Readabilipy exception:', err)
+#         return ''
 
 
 def run_resiliparse(htmlstring):
     '''try with the resiliparse package'''
-    decoded = bytes_to_str(htmlstring, detect_encoding(htmlstring))
-    tree = HTMLTree.parse(decoded)
+    # bytes input is decoded with its detected encoding; anything else is parsed as-is
+    if isinstance(htmlstring, bytes):
+        encoding = detect_encoding(htmlstring)
+        htmlstring = bytes_to_str(htmlstring, encoding)
+    tree = HTMLTree.parse(htmlstring)
     return extract_plain_text(tree, main_content=True)
 
 
@@ -427,7 +430,7 @@ def calculate_scores(mydict):
 html_text_result.update(template_dict)
 boilerpipe_result.update(template_dict)
 newsplease_result.update(template_dict)
-readabilipy_result.update(template_dict)
+# readabilipy_result.update(template_dict)
 resiliparse_result.update(template_dict)
 bs4_result.update(template_dict)
 # jparser_result.update(template_dict)
@@ -600,14 +603,14 @@ def calculate_scores(mydict):
     #jparser_result['true negatives'] += tn
     #jparser_result['false negatives'] += fn
     # readabilipy
-    start = time.time()
-    result = run_readabilipy(htmlstring)
-    readabilipy_result['time'] += time.time() - start
-    tp, fn, fp, tn = evaluate_result(result, EVAL_PAGES[item])
-    readabilipy_result['true positives'] += tp
-    readabilipy_result['false positives'] += fp
-    readabilipy_result['true negatives'] += tn
-    readabilipy_result['false negatives'] += fn
+    # start = time.time()
+    # result = run_readabilipy(htmlstring)
+    # readabilipy_result['time'] += time.time() - start
+    # tp, fn, fp, tn = evaluate_result(result, EVAL_PAGES[item])
+    # readabilipy_result['true positives'] += tp
+    # readabilipy_result['false positives'] += fp
+    # readabilipy_result['true negatives'] += tn
+    # readabilipy_result['false negatives'] += fn
     # resiliparse
     start = time.time()
     result = run_resiliparse(htmlstring)
@@ -700,10 +703,10 @@ def calculate_scores(mydict):
 print("precision: %.3f recall: %.3f accuracy: %.3f f-score: %.3f" % (calculate_scores(readability_result)))
 print(f"time diff.: {readability_result['time'] / baseline_result['time']:.2f}")
 
-print('readabilipy')
-print(readabilipy_result)
-print("precision: %.3f recall: %.3f accuracy: %.3f f-score: %.3f" % (calculate_scores(readabilipy_result)))
-print(f"time diff.: {readabilipy_result['time'] / baseline_result['time']:.2f}")
+# print('readabilipy')
+# print(readabilipy_result)
+# print("precision: %.3f recall: %.3f accuracy: %.3f f-score: %.3f" % (calculate_scores(readabilipy_result)))
+# print(f"time diff.: {readabilipy_result['time'] / baseline_result['time']:.2f}")
 
 print('resiliparse')
 print(resiliparse_result)

diff --git a/trafilatura/core.py b/trafilatura/core.py
@@ -14,7 +14,7 @@
 from copy import deepcopy
 
 from lxml.etree import Element, SubElement, strip_elements, strip_tags
-from lxml.html import tostring
+from lxml.html import tostring, Element as HtmlElement
 
 # own
 from .external import (SANITIZED_XPATH, justext_rescue, sanitize_tree,
@@ -545,7 +545,18 @@ def extract_content(tree, options):
     for expr in BODY_XPATH:
         # select tree if the expression has been found
         try:
-            subtree = tree.xpath(expr)[0]
+            subtrees = tree.xpath(expr)
+            if len(subtrees) > 1 and options.recall is True:
+                new_subtree = HtmlElement(subtrees[0].tag)
+                for _subtree in subtrees:
+                    for child in _subtree:
+                        if len(''.join(child.itertext()).strip()) > options.config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE'):
+                            new_subtree.append(child)
+                subtree = new_subtree
+            elif len(subtrees) == 1:
+                subtree = subtrees[0]
+            else:
+                continue
         except IndexError:
             continue
         # prune the subtree

diff --git a/trafilatura/xpaths.py b/trafilatura/xpaths.py
@@ -25,7 +25,7 @@
     contains(@id, "body-text") or contains(@class, "body-text") or
     contains(@class, "article__container") or contains(@id, "art-content") or contains(@class, "art-content")][1]''',
     # (…)[1] = first occurrence
-    '(.//article)[1]',
+    '(.//article)',
     """(.//*[(self::article or self::div or self::main or self::section)][
     contains(@class, 'post-bodycopy') or
     contains(@class, 'storycontent') or contains(@class, 'story-content') or