Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge multiple nodes returned by XPath #487

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 29 additions & 26 deletions tests/comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
# from libextract.api import extract as lib_extract
from newspaper import fulltext
from newsplease import NewsPlease
from readabilipy import simple_json_from_html_string
# from readabilipy import simple_json_from_html_string
from readability import Document
from resiliparse.extract.html2text import extract_plain_text
from resiliparse.parse.encoding import bytes_to_str, detect_encoding
Expand Down Expand Up @@ -290,7 +290,7 @@ def run_boilerpipe(htmlstring):
try:
content = boilerpipe_extractor.get_content(htmlstring)
# sanitize(boilerpipe_extractor.get_content(htmlstring))
except Exception:
except Exception as e:
#print('Boilerpipe exception:', err)
content = ''
return content
Expand Down Expand Up @@ -330,21 +330,24 @@ def run_newsplease(htmlstring):
# return sanitize(returnstring)


def run_readabilipy(htmlstring):
'''try with the readability.py module'''
try:
article = simple_json_from_html_string(htmlstring, use_readability=True)
returnlist = [textelem['text'] for textelem in article['plain_text']]
return '\n'.join(returnlist) # sanitize(content)
except Exception as err:
#print('Readabilipy exception:', err)
return ''
# def run_readabilipy(htmlstring):
# '''try with the readability.py module'''
# try:
# article = simple_json_from_html_string(htmlstring, use_readability=True)
# returnlist = [textelem['text'] for textelem in article['plain_text']]
# return '\n'.join(returnlist) # sanitize(content)
# except Exception as err:
# #print('Readabilipy exception:', err)
# return ''


def run_resiliparse(htmlstring):
'''try with the resiliparse package'''
decoded = bytes_to_str(htmlstring, detect_encoding(htmlstring))
tree = HTMLTree.parse(decoded)
if isinstance(htmlstring, bytes):
decoded = bytes_to_str(htmlstring, detect_encoding(htmlstring))
tree = HTMLTree.parse(decoded)
else:
tree = HTMLTree.parse(htmlstring)
return extract_plain_text(tree, main_content=True)


Expand Down Expand Up @@ -427,7 +430,7 @@ def calculate_scores(mydict):
html_text_result.update(template_dict)
boilerpipe_result.update(template_dict)
newsplease_result.update(template_dict)
readabilipy_result.update(template_dict)
# readabilipy_result.update(template_dict)
resiliparse_result.update(template_dict)
bs4_result.update(template_dict)
# jparser_result.update(template_dict)
Expand Down Expand Up @@ -600,14 +603,14 @@ def calculate_scores(mydict):
#jparser_result['true negatives'] += tn
#jparser_result['false negatives'] += fn
# readabilipy
start = time.time()
result = run_readabilipy(htmlstring)
readabilipy_result['time'] += time.time() - start
tp, fn, fp, tn = evaluate_result(result, EVAL_PAGES[item])
readabilipy_result['true positives'] += tp
readabilipy_result['false positives'] += fp
readabilipy_result['true negatives'] += tn
readabilipy_result['false negatives'] += fn
# start = time.time()
# result = run_readabilipy(htmlstring)
# readabilipy_result['time'] += time.time() - start
# tp, fn, fp, tn = evaluate_result(result, EVAL_PAGES[item])
# readabilipy_result['true positives'] += tp
# readabilipy_result['false positives'] += fp
# readabilipy_result['true negatives'] += tn
# readabilipy_result['false negatives'] += fn
# resiliparse
start = time.time()
result = run_resiliparse(htmlstring)
Expand Down Expand Up @@ -700,10 +703,10 @@ def calculate_scores(mydict):
print("precision: %.3f recall: %.3f accuracy: %.3f f-score: %.3f" % (calculate_scores(readability_result)))
print(f"time diff.: {readability_result['time'] / baseline_result['time']:.2f}")

print('readabilipy')
print(readabilipy_result)
print("precision: %.3f recall: %.3f accuracy: %.3f f-score: %.3f" % (calculate_scores(readabilipy_result)))
print(f"time diff.: {readabilipy_result['time'] / baseline_result['time']:.2f}")
# # print('readabilipy')
# # print(readabilipy_result)
# # print("precision: %.3f recall: %.3f accuracy: %.3f f-score: %.3f" % (calculate_scores(readabilipy_result)))
# print(f"time diff.: {readabilipy_result['time'] / baseline_result['time']:.2f}")

print('resiliparse')
print(resiliparse_result)
Expand Down
15 changes: 13 additions & 2 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from copy import deepcopy

from lxml.etree import Element, SubElement, strip_elements, strip_tags
from lxml.html import tostring
from lxml.html import tostring, Element as HtmlElement

# own
from .external import (SANITIZED_XPATH, justext_rescue, sanitize_tree,
Expand Down Expand Up @@ -545,7 +545,18 @@ def extract_content(tree, options):
for expr in BODY_XPATH:
# select tree if the expression has been found
try:
subtree = tree.xpath(expr)[0]
subtrees = tree.xpath(expr)
if len(subtrees) > 1 and options.recall is True:
new_subtree = HtmlElement(subtrees[0].tag)
for _subtree in subtrees:
for child in _subtree:
if len(''.join(child.itertext()).strip()) > options.config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE'):
new_subtree.append(child)
subtree = new_subtree
elif len(subtrees) == 1:
subtree = subtrees[0]
else:
continue
except IndexError:
continue
# prune the subtree
Expand Down
2 changes: 1 addition & 1 deletion trafilatura/xpaths.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
contains(@id, "body-text") or contains(@class, "body-text") or
contains(@class, "article__container") or contains(@id, "art-content") or contains(@class, "art-content")][1]''',
# (…)[1] = first occurrence
'(.//article)[1]',
'(.//article)',
"""(.//*[(self::article or self::div or self::main or self::section)][
contains(@class, 'post-bodycopy') or
contains(@class, 'storycontent') or contains(@class, 'story-content') or
Expand Down