Improvements for Chinese web pages (#186)
Co-authored-by: Adrien Barbaresi <[email protected]>
immortal-autumn and adbar authored Mar 17, 2022
1 parent e975f1b commit 658ee6e
Showing 6 changed files with 71 additions and 45 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -8,10 +8,12 @@
dist/
build/
*.egg-info/
.idea/

# tests
.cache/
.eggs/
.pytest_cache/
.tox/
.coverage

@@ -30,3 +32,4 @@ Pipfile*

# older stuff
old/

11 changes: 5 additions & 6 deletions setup.py
@@ -8,7 +8,6 @@
from setuptools import setup



def get_version(package):
"Return package version as listed in `__version__` in `init.py`"
# version = Path(package, '__init__.py').read_text() # Python >= 3.5
@@ -21,8 +20,8 @@ def get_long_description():
"Return the README"
with open('README.rst', 'r', encoding='utf-8') as filehandle:
long_description = filehandle.read()
#long_description += "\n\n"
#with open("CHANGELOG.md", encoding="utf8") as f:
# long_description += "\n\n"
# with open("CHANGELOG.md", encoding="utf8") as f:
# long_description += f.read()
return long_description

@@ -31,7 +30,7 @@ def get_long_description():
extras = {
'all': [
'cchardet >= 2.1.7',
'htmldate[speed] >= 1.1.1',
'htmldate[speed] >= 1.2.0',
'py3langid >= 0.2.0',
'pycurl >= 7.44.1',
'urllib3[brotli]',
@@ -95,13 +94,13 @@ def get_long_description():
'certifi',
'charset_normalizer >= 2.0.12',
'courlan >= 0.6.0',
'htmldate >= 1.1.1',
'htmldate >= 1.2.0',
'justext >= 3.0.0',
'lxml >= 4.6.4',
'urllib3 >= 1.26, < 2',
],
extras_require=extras,
entry_points = {
entry_points={
'console_scripts': [
'trafilatura=trafilatura.cli:main',
'trafilatura_gui=trafilatura.gui:main',
75 changes: 45 additions & 30 deletions trafilatura/core.py
@@ -20,7 +20,7 @@
# own
from .external import justext_rescue, sanitize_tree, SANITIZED_XPATH, try_readability
from .filters import (check_html_lang, content_fingerprint, duplicate_test,
language_filter, text_chars_test)
language_filter, text_chars_test)
from .htmlprocessing import (convert_tags, handle_textnode,
link_density_test, link_density_test_tables,
process_node, prune_unwanted_nodes, tree_cleaning)
@@ -33,7 +33,6 @@
ADDITIONAL_DISCARD_XPATH, PRECISION_DISCARD_XPATH,
DISCARD_IMAGE_ELEMENTS, REMOVE_COMMENTS_XPATH)


LOGGER = logging.getLogger(__name__)

FORMATTING_PROTECTED = {'cell', 'head', 'hi', 'item', 'p', 'quote', 'td'}
@@ -143,7 +142,8 @@ def handle_lists(element, dedupbool, config):
if processed_subchild is not None:
newchildelem.append(processed_subchild)
else:
processed_subchild = handle_textnode(subelem, comments_fix=False, deduplicate=dedupbool, config=config)
processed_subchild = handle_textnode(subelem, comments_fix=False, deduplicate=dedupbool,
config=config)
# add child element to processed_element
if processed_subchild is not None:
subchildelem = SubElement(newchildelem, processed_subchild.tag)
@@ -165,7 +165,7 @@ def handle_quotes(element, dedupbool, config):
'''Process quotes elements'''
processed_element = Element(element.tag)
for child in element.iter('*'):
processed_child = process_node(child, dedupbool, config) # handle_textnode(child, comments_fix=True)
processed_child = process_node(child, dedupbool, config) # handle_textnode(child, comments_fix=True)
if processed_child is not None:
newsub = SubElement(processed_element, child.tag)
newsub.text, newsub.tail = processed_child.text, processed_child.tail
@@ -218,11 +218,13 @@ def handle_paragraphs(element, potential_tags, dedupbool, config):
continue
# spacing = child.tag in SPACING_PROTECTED # todo: outputformat.startswith('xml')?
# todo: act on spacing here?
processed_child = handle_textnode(child, comments_fix=False, deduplicate=dedupbool, preserve_spaces=True, config=config)
processed_child = handle_textnode(child, comments_fix=False, deduplicate=dedupbool, preserve_spaces=True,
config=config)
if processed_child is not None:
# todo: needing attention!
if processed_child.tag == 'p':
LOGGER.debug('extra p within p: %s %s %s', processed_child.tag, processed_child.text, processed_child.tail)
LOGGER.debug('extra p within p: %s %s %s', processed_child.tag, processed_child.text,
processed_child.tail)
if processed_element.text:
processed_element.text += ' ' + processed_child.text
else:
@@ -268,8 +270,8 @@ def handle_paragraphs(element, potential_tags, dedupbool, config):
if len(processed_element) > 0:
# clean trailing lb-elements
if (
processed_element[-1].tag == 'lb'
and processed_element[-1].tail is None
processed_element[-1].tag == 'lb'
and processed_element[-1].tail is None
):
processed_element[-1].getparent().remove(processed_element[-1])
return processed_element
@@ -279,7 +281,6 @@ def handle_paragraphs(element, potential_tags, dedupbool, config):
return None



def define_cell_type(element):
'''Determine cell element type and mint new element'''
# define tag
@@ -319,7 +320,8 @@ def handle_table(table_elem, potential_tags, dedupbool, config):
if child.tag in TABLE_ELEMS:
# subcell_elem = define_cell_type(subelement)
child.tag = 'cell'
processed_subchild = handle_textnode(child, preserve_spaces=True, comments_fix=True, deduplicate=dedupbool, config=config)
processed_subchild = handle_textnode(child, preserve_spaces=True, comments_fix=True,
deduplicate=dedupbool, config=config)
# todo: lists in table cells
else:
# subcell_elem = Element(child.tag)
@@ -373,7 +375,8 @@ def handle_image(element):
return processed_element


def recover_wild_text(tree, result_body, favor_precision=False, favor_recall=False, potential_tags=TAG_CATALOG, deduplicate=True, config=None):
def recover_wild_text(tree, result_body, favor_precision=False, favor_recall=False, potential_tags=TAG_CATALOG,
deduplicate=True, config=None):
'''Look for all previously unconsidered wild elements, including outside of the determined
frame and throughout the document to recover potentially missing text parts'''
LOGGER.debug('Recovering wild text elements')
@@ -397,10 +400,10 @@ def recover_wild_text(tree, result_body, favor_precision=False, favor_recall=Fal
else:
strip_tags(search_tree, 'span')
result_body.extend(e for e in
[handle_textelem(
element, potential_tags, deduplicate, config)
for element in search_tree.iter(search_list)]
if e is not None)
[handle_textelem(
element, potential_tags, deduplicate, config)
for element in search_tree.iter(search_list)]
if e is not None)
return result_body


@@ -423,7 +426,7 @@ def handle_textelem(element, potential_tags, dedupbool, config):
new_element = Element('p')
new_element.text = element.tail
elif element.tag in FORMATTING:
new_element = handle_formatting(element, dedupbool, config) # process_node(element, dedupbool, config)
new_element = handle_formatting(element, dedupbool, config) # process_node(element, dedupbool, config)
elif element.tag == 'table' and 'table' in potential_tags:
new_element = handle_table(element, potential_tags, dedupbool, config)
elif element.tag == 'graphic' and 'graphic' in potential_tags:
@@ -461,7 +464,8 @@ def delete_by_link_density(subtree, tagname, backtracking=False):
return subtree


def extract_content(tree, favor_precision=False, favor_recall=False, include_tables=False, include_images=False, include_links=False, deduplicate=False, config=None):
def extract_content(tree, favor_precision=False, favor_recall=False, include_tables=False, include_images=False,
include_links=False, deduplicate=False, config=None):
'''Find the main content of a page using a set of XPath expressions,
then extract relevant elements, strip them of unwanted subparts and
convert them'''
@@ -495,7 +499,7 @@ def extract_content(tree, favor_precision=False, favor_recall=False, include_tab
subtree = delete_by_link_density(subtree, 'div', backtracking=True)
subtree = delete_by_link_density(subtree, 'list', backtracking=False)
subtree = delete_by_link_density(subtree, 'p', backtracking=False)
#subtree = delete_by_link_density(subtree, 'head', backtracking=False)
# subtree = delete_by_link_density(subtree, 'head', backtracking=False)
# also filter fw/head, table and quote elements?
if favor_precision is True:
subtree = delete_by_link_density(subtree, 'head', backtracking=False)
@@ -526,8 +530,8 @@ def extract_content(tree, favor_precision=False, favor_recall=False, include_tab
##strip_tags(subtree, 'lb') # BoingBoing-Bug
# extract content # list(filter(None.__ne__, processed_elems)) ?
result_body.extend(e for e in
[handle_textelem(e, potential_tags, deduplicate, config) for e in subtree.xpath('.//*')]
if e is not None)
[handle_textelem(e, potential_tags, deduplicate, config) for e in subtree.xpath('.//*')]
if e is not None)
# remove trailing titles
while len(result_body) > 0 and (result_body[-1].tag in NOT_AT_THE_END):
result_body[-1].getparent().remove(result_body[-1])
@@ -539,7 +543,9 @@ def extract_content(tree, favor_precision=False, favor_recall=False, include_tab
# try parsing wild <p> elements if nothing found or text too short
# todo: test precision and recall settings here
if len(result_body) == 0 or len(temp_text) < config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE'):
result_body = recover_wild_text(backup_tree, result_body, favor_precision=favor_precision, favor_recall=favor_recall, potential_tags=potential_tags, deduplicate=deduplicate, config=config)
result_body = recover_wild_text(backup_tree, result_body, favor_precision=favor_precision,
favor_recall=favor_recall, potential_tags=potential_tags,
deduplicate=deduplicate, config=config)
temp_text = trim(' '.join(result_body.itertext()))
# filter output
strip_elements(result_body, 'done')
@@ -583,7 +589,8 @@ def extract_comments(tree, dedupbool, config):
# processed_elem = process_comments_node(elem, potential_tags)
# if processed_elem is not None:
# comments_body.append(processed_elem)
processed_elems = (process_comments_node(elem, potential_tags, dedupbool, config) for elem in subtree.xpath('.//*'))
processed_elems = (process_comments_node(elem, potential_tags, dedupbool, config) for elem in
subtree.xpath('.//*'))
comments_body.extend(elem for elem in processed_elems if elem is not None)
# control
if len(comments_body) > 0: # if it has children
@@ -596,11 +603,12 @@ def extract_comments(tree, dedupbool, config):
return comments_body, temp_comments, len(temp_comments), tree


def compare_extraction(tree, backup_tree, url, body, text, len_text, target_language, favor_precision, favor_recall, include_formatting, include_links, include_images, include_tables, config):
def compare_extraction(tree, backup_tree, url, body, text, len_text, target_language, favor_precision, favor_recall,
include_formatting, include_links, include_images, include_tables, config):
'''Decide whether to choose own or external extraction
based on a series of heuristics'''
# bypass for recall
if favor_recall is True and len_text > config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE')*10:
if favor_recall is True and len_text > config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE') * 10:
return body, text, len_text
algo_flag, jt_result = False, False
# prior cleaning
@@ -625,7 +633,8 @@ def compare_extraction(tree, backup_tree, url, body, text, len_text, target_lang
else:
if not body.xpath('//p//text()') and len_algo > config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE') * 2:
algo_flag = True
elif len(body.xpath('//table')) > len(body.xpath('//p')) and len_algo > config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE') * 2:
elif len(body.xpath('//table')) > len(body.xpath('//p')) and len_algo > config.getint('DEFAULT',
'MIN_EXTRACTED_SIZE') * 2:
algo_flag = True
else:
LOGGER.debug('extraction values: %s %s for %s', len_text, len_algo, url)
@@ -822,7 +831,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False,
if only_with_metadata is True and any(
x is None for x in
[document.date, document.title, document.url]
):
):
LOGGER.error('no metadata for URL %s', url)
raise ValueError
else:
@@ -841,18 +850,23 @@ def bare_extraction(filecontent, url=None, no_fallback=False,

# comments first, then remove
if include_comments is True:
commentsbody, temp_comments, len_comments, cleaned_tree = extract_comments(cleaned_tree, deduplicate, config)
commentsbody, temp_comments, len_comments, cleaned_tree = extract_comments(cleaned_tree, deduplicate,
config)
else:
commentsbody, temp_comments, len_comments = None, '', 0
if favor_precision is True:
cleaned_tree = prune_unwanted_nodes(cleaned_tree, REMOVE_COMMENTS_XPATH)

# extract content
postbody, temp_text, len_text = extract_content(cleaned_tree, favor_precision, favor_recall, include_tables, include_images, include_links, deduplicate, config)
postbody, temp_text, len_text = extract_content(cleaned_tree, favor_precision, favor_recall, include_tables,
include_images, include_links, deduplicate, config)

# compare if necessary
if no_fallback is False:
postbody, temp_text, len_text = compare_extraction(cleaned_tree_backup, tree_backup_1, url, postbody, temp_text, len_text, target_language, favor_precision, favor_recall, include_formatting, include_links, include_images, include_tables, config)
postbody, temp_text, len_text = compare_extraction(cleaned_tree_backup, tree_backup_1, url, postbody,
temp_text, len_text, target_language, favor_precision,
favor_recall, include_formatting, include_links,
include_images, include_tables, config)
# add baseline as additional fallback
# rescue: try to use original/dirty tree # and favor_precision is False=?
if len_text < config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE'):
@@ -872,7 +886,8 @@ def bare_extraction(filecontent, url=None, no_fallback=False,
# size checks
if len_comments < config.getint('DEFAULT', 'MIN_EXTRACTED_COMM_SIZE'):
LOGGER.info('not enough comments %s', url)
if len_text < config.getint('DEFAULT', 'MIN_OUTPUT_SIZE') and len_comments < config.getint('DEFAULT', 'MIN_OUTPUT_COMM_SIZE'):
if len_text < config.getint('DEFAULT', 'MIN_OUTPUT_SIZE') and len_comments < config.getint('DEFAULT',
'MIN_OUTPUT_COMM_SIZE'):
LOGGER.info('text and comments not long enough: %s %s', len_text, len_comments)
raise ValueError

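The core.py changes above are mostly line-length reflows (long calls wrapped across lines) plus passing the favor_precision/favor_recall options through extract_content, recover_wild_text and compare_extraction; the public extraction API itself is unchanged. A minimal sketch of calling that API on a Chinese-language page (hypothetical URL):

import trafilatura

# hypothetical URL; any Chinese-language article page is handled the same way
downloaded = trafilatura.fetch_url('https://example.com/zh/news/article.html')
if downloaded is not None:
    # extract() returns the main text as a string, or None if extraction fails
    text = trafilatura.extract(downloaded, include_comments=True, include_tables=True)
    print(text)
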
6 changes: 5 additions & 1 deletion trafilatura/filters.py
@@ -27,7 +27,11 @@

RE_HTML_LANG = re.compile(r'([a-z]{2})', re.I)

RE_FILTER = re.compile(r'\W*(Drucken|E-?Mail|Facebook|Flipboard|Google|Instagram|Linkedin|Mail|PDF|Pinterest|Pocket|Print|Reddit|Twitter|Whatsapp|Xing|Mehr zum Thema:?|More on this.{,8}$)$', flags=re.IGNORECASE)
# Mostly filters for social media
RE_FILTER = re.compile(r'\W*(Drucken|E-?Mail|Facebook|Flipboard|Google|Instagram|'
'Linkedin|Mail|PDF|Pinterest|Pocket|Print|QQ|Reddit|Twitter|'
'WeChat|WeiBo|Whatsapp|Xing|Mehr zum Thema:?|More on this.{,8}$)$',
flags=re.IGNORECASE)
# COMMENTS_BLACKLIST = ('( Abmelden / Ändern )') # Fill in your details below|Trage deine Daten unten|Kommentar verfassen|Bitte logge dich|Hinterlasse einen Kommentar| to %s| mit %s)


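With QQ, WeChat and WeiBo added to the alternatives, RE_FILTER now also discards isolated share-widget labels that are common on Chinese pages. A small sketch of what the pattern matches, using the regex exactly as in the diff above (how trafilatura applies it to individual text nodes is not shown here):

import re

RE_FILTER = re.compile(r'\W*(Drucken|E-?Mail|Facebook|Flipboard|Google|Instagram|'
                       'Linkedin|Mail|PDF|Pinterest|Pocket|Print|QQ|Reddit|Twitter|'
                       'WeChat|WeiBo|Whatsapp|Xing|Mehr zum Thema:?|More on this.{,8}$)$',
                       flags=re.IGNORECASE)

for line in ('WeChat', 'WeiBo', ' QQ', 'Print', 'Read the full article'):
    print(repr(line), bool(RE_FILTER.match(line)))
# the first four lines are pure share/boilerplate labels and match; the last one does not
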
2 changes: 1 addition & 1 deletion trafilatura/metaxpaths.py
@@ -10,7 +10,7 @@
author_xpaths = [
'//*[(self::a or self::address or self::link or self::p or self::span or self::strong)][@rel="author" or @id="author" or @class="author" or @itemprop="author name" or rel="me"]|//author', # specific
'//*[(self::a or self::div or self::span or self::p or self::strong)][contains(@class, "author-name") or contains(@class, "AuthorName") or contains(@class, "authorName") or contains(@class, "author name")]', # almost specific
'//*[(self::a or self::div or self::span or self::p or self::h4 or self::h3)][contains(@class, "author") or contains(@id, "author") or contains(@itemprop, "author") or @class="byline"]', # almost generic
'//*[(self::a or self::div or self::span or self::p or self::h4 or self::h3)][contains(@class, "author") or contains(@id, "author") or contains(@itemprop, "author") or @class="byline" or contains(@id, "zuozhe") or contains(@class, "zuozhe") or contains(@id, "bianji") or contains(@class, "bianji") or contains(@id, "xiaobian") or contains(@class, "xiaobian")]', # almost generic
'//*[(self::a or self::div or self::span or self::p)][contains(@class, "authors") or contains(@class, "byline") or contains(@class, "ByLine") or contains(@class, "submitted-by") or contains(@class, "posted-by")]', # generic
'//*[contains(@class, "author") or contains(@class, "Author") or contains(@id, "Author") or contains(@class, "screenname") or contains(@data-component, "Byline") or contains(@itemprop, "author") or contains(@class, "writer") or contains(@class, "byline")]', # any element
'//*[(self::a or self::span)][@class="username" or @class="BBL"]', # not common
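The single change to author_xpaths extends the almost-generic rule with pinyin class and id values that frequently mark bylines on Chinese sites: zuozhe (作者, author), bianji (编辑, editor) and xiaobian (小编, staff editor). A minimal lxml sketch with a made-up snippet, using just that one expression from the diff:

from lxml import html

# hypothetical byline markup as found on many Chinese news pages
doc = html.fromstring('<div><span class="zuozhe">张三</span><p>正文内容</p></div>')

AUTHOR_XPATH = ('//*[(self::a or self::div or self::span or self::p or self::h4 or self::h3)]'
                '[contains(@class, "author") or contains(@id, "author") or contains(@itemprop, "author")'
                ' or @class="byline" or contains(@id, "zuozhe") or contains(@class, "zuozhe")'
                ' or contains(@id, "bianji") or contains(@class, "bianji")'
                ' or contains(@id, "xiaobian") or contains(@class, "xiaobian")]')

print([elem.text for elem in doc.xpath(AUTHOR_XPATH)])  # ['张三']
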
(diff for the sixth changed file not loaded)
