Improvements for Chinese web pages (#186)
Co-authored-by: Adrien Barbaresi <[email protected]>
immortal-autumn and adbar authored Mar 17, 2022
1 parent e975f1b commit 658ee6e
Showing 6 changed files with 71 additions and 45 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -8,10 +8,12 @@
dist/
build/
*.egg-info/
.idea/

# tests
.cache/
.eggs/
.pytest_cache/
.tox/
.coverage

@@ -30,3 +32,4 @@ Pipfile*

# older stuff
old/

11 changes: 5 additions & 6 deletions setup.py
@@ -8,7 +8,6 @@
from setuptools import setup



def get_version(package):
"Return package version as listed in `__version__` in `init.py`"
# version = Path(package, '__init__.py').read_text() # Python >= 3.5
@@ -21,8 +20,8 @@ def get_long_description():
"Return the README"
with open('README.rst', 'r', encoding='utf-8') as filehandle:
long_description = filehandle.read()
#long_description += "\n\n"
#with open("CHANGELOG.md", encoding="utf8") as f:
# long_description += "\n\n"
# with open("CHANGELOG.md", encoding="utf8") as f:
# long_description += f.read()
return long_description

@@ -31,7 +30,7 @@ def get_long_description():
extras = {
'all': [
'cchardet >= 2.1.7',
'htmldate[speed] >= 1.1.1',
'htmldate[speed] >= 1.2.0',
'py3langid >= 0.2.0',
'pycurl >= 7.44.1',
'urllib3[brotli]',
@@ -95,13 +94,13 @@ def get_long_description():
'certifi',
'charset_normalizer >= 2.0.12',
'courlan >= 0.6.0',
'htmldate >= 1.1.1',
'htmldate >= 1.2.0',
'justext >= 3.0.0',
'lxml >= 4.6.4',
'urllib3 >= 1.26, < 2',
],
extras_require=extras,
entry_points = {
entry_points={
'console_scripts': [
'trafilatura=trafilatura.cli:main',
'trafilatura_gui=trafilatura.gui:main',
75 changes: 45 additions & 30 deletions trafilatura/core.py
@@ -20,7 +20,7 @@
# own
from .external import justext_rescue, sanitize_tree, SANITIZED_XPATH, try_readability
from .filters import (check_html_lang, content_fingerprint, duplicate_test,
language_filter, text_chars_test)
language_filter, text_chars_test)
from .htmlprocessing import (convert_tags, handle_textnode,
link_density_test, link_density_test_tables,
process_node, prune_unwanted_nodes, tree_cleaning)
@@ -33,7 +33,6 @@
ADDITIONAL_DISCARD_XPATH, PRECISION_DISCARD_XPATH,
DISCARD_IMAGE_ELEMENTS, REMOVE_COMMENTS_XPATH)


LOGGER = logging.getLogger(__name__)

FORMATTING_PROTECTED = {'cell', 'head', 'hi', 'item', 'p', 'quote', 'td'}
@@ -143,7 +142,8 @@ def handle_lists(element, dedupbool, config):
if processed_subchild is not None:
newchildelem.append(processed_subchild)
else:
processed_subchild = handle_textnode(subelem, comments_fix=False, deduplicate=dedupbool, config=config)
processed_subchild = handle_textnode(subelem, comments_fix=False, deduplicate=dedupbool,
config=config)
# add child element to processed_element
if processed_subchild is not None:
subchildelem = SubElement(newchildelem, processed_subchild.tag)
@@ -165,7 +165,7 @@ def handle_quotes(element, dedupbool, config):
'''Process quotes elements'''
processed_element = Element(element.tag)
for child in element.iter('*'):
processed_child = process_node(child, dedupbool, config) # handle_textnode(child, comments_fix=True)
processed_child = process_node(child, dedupbool, config) # handle_textnode(child, comments_fix=True)
if processed_child is not None:
newsub = SubElement(processed_element, child.tag)
newsub.text, newsub.tail = processed_child.text, processed_child.tail
@@ -218,11 +218,13 @@ def handle_paragraphs(element, potential_tags, dedupbool, config):
continue
# spacing = child.tag in SPACING_PROTECTED # todo: outputformat.startswith('xml')?
# todo: act on spacing here?
processed_child = handle_textnode(child, comments_fix=False, deduplicate=dedupbool, preserve_spaces=True, config=config)
processed_child = handle_textnode(child, comments_fix=False, deduplicate=dedupbool, preserve_spaces=True,
config=config)
if processed_child is not None:
# todo: needing attention!
if processed_child.tag == 'p':
LOGGER.debug('extra p within p: %s %s %s', processed_child.tag, processed_child.text, processed_child.tail)
LOGGER.debug('extra p within p: %s %s %s', processed_child.tag, processed_child.text,
processed_child.tail)
if processed_element.text:
processed_element.text += ' ' + processed_child.text
else:
@@ -268,8 +270,8 @@ def handle_paragraphs(element, potential_tags, dedupbool, config):
if len(processed_element) > 0:
# clean trailing lb-elements
if (
processed_element[-1].tag == 'lb'
and processed_element[-1].tail is None
processed_element[-1].tag == 'lb'
and processed_element[-1].tail is None
):
processed_element[-1].getparent().remove(processed_element[-1])
return processed_element
@@ -279,7 +281,6 @@ def handle_paragraphs(element, potential_tags, dedupbool, config):
return None



def define_cell_type(element):
'''Determine cell element type and mint new element'''
# define tag
@@ -319,7 +320,8 @@ def handle_table(table_elem, potential_tags, dedupbool, config):
if child.tag in TABLE_ELEMS:
# subcell_elem = define_cell_type(subelement)
child.tag = 'cell'
processed_subchild = handle_textnode(child, preserve_spaces=True, comments_fix=True, deduplicate=dedupbool, config=config)
processed_subchild = handle_textnode(child, preserve_spaces=True, comments_fix=True,
deduplicate=dedupbool, config=config)
# todo: lists in table cells
else:
# subcell_elem = Element(child.tag)
@@ -373,7 +375,8 @@ def handle_image(element):
return processed_element


def recover_wild_text(tree, result_body, favor_precision=False, favor_recall=False, potential_tags=TAG_CATALOG, deduplicate=True, config=None):
def recover_wild_text(tree, result_body, favor_precision=False, favor_recall=False, potential_tags=TAG_CATALOG,
deduplicate=True, config=None):
'''Look for all previously unconsidered wild elements, including outside of the determined
frame and throughout the document to recover potentially missing text parts'''
LOGGER.debug('Recovering wild text elements')
@@ -397,10 +400,10 @@ def recover_wild_text(tree, result_body, favor_precision=False, favor_recall=Fal
else:
strip_tags(search_tree, 'span')
result_body.extend(e for e in
[handle_textelem(
element, potential_tags, deduplicate, config)
for element in search_tree.iter(search_list)]
if e is not None)
[handle_textelem(
element, potential_tags, deduplicate, config)
for element in search_tree.iter(search_list)]
if e is not None)
return result_body


@@ -423,7 +426,7 @@ def handle_textelem(element, potential_tags, dedupbool, config):
new_element = Element('p')
new_element.text = element.tail
elif element.tag in FORMATTING:
new_element = handle_formatting(element, dedupbool, config) # process_node(element, dedupbool, config)
new_element = handle_formatting(element, dedupbool, config) # process_node(element, dedupbool, config)
elif element.tag == 'table' and 'table' in potential_tags:
new_element = handle_table(element, potential_tags, dedupbool, config)
elif element.tag == 'graphic' and 'graphic' in potential_tags:
@@ -461,7 +464,8 @@ def delete_by_link_density(subtree, tagname, backtracking=False):
return subtree


def extract_content(tree, favor_precision=False, favor_recall=False, include_tables=False, include_images=False, include_links=False, deduplicate=False, config=None):
def extract_content(tree, favor_precision=False, favor_recall=False, include_tables=False, include_images=False,
include_links=False, deduplicate=False, config=None):
'''Find the main content of a page using a set of XPath expressions,
then extract relevant elements, strip them of unwanted subparts and
convert them'''
@@ -495,7 +499,7 @@ def extract_content(tree, favor_precision=False, favor_recall=False, include_tab
subtree = delete_by_link_density(subtree, 'div', backtracking=True)
subtree = delete_by_link_density(subtree, 'list', backtracking=False)
subtree = delete_by_link_density(subtree, 'p', backtracking=False)
#subtree = delete_by_link_density(subtree, 'head', backtracking=False)
# subtree = delete_by_link_density(subtree, 'head', backtracking=False)
# also filter fw/head, table and quote elements?
if favor_precision is True:
subtree = delete_by_link_density(subtree, 'head', backtracking=False)
@@ -526,8 +530,8 @@ def extract_content(tree, favor_precision=False, favor_recall=False, include_tab
##strip_tags(subtree, 'lb') # BoingBoing-Bug
# extract content # list(filter(None.__ne__, processed_elems)) ?
result_body.extend(e for e in
[handle_textelem(e, potential_tags, deduplicate, config) for e in subtree.xpath('.//*')]
if e is not None)
[handle_textelem(e, potential_tags, deduplicate, config) for e in subtree.xpath('.//*')]
if e is not None)
# remove trailing titles
while len(result_body) > 0 and (result_body[-1].tag in NOT_AT_THE_END):
result_body[-1].getparent().remove(result_body[-1])
@@ -539,7 +543,9 @@ def extract_content(tree, favor_precision=False, favor_recall=False, include_tab
# try parsing wild <p> elements if nothing found or text too short
# todo: test precision and recall settings here
if len(result_body) == 0 or len(temp_text) < config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE'):
result_body = recover_wild_text(backup_tree, result_body, favor_precision=favor_precision, favor_recall=favor_recall, potential_tags=potential_tags, deduplicate=deduplicate, config=config)
result_body = recover_wild_text(backup_tree, result_body, favor_precision=favor_precision,
favor_recall=favor_recall, potential_tags=potential_tags,
deduplicate=deduplicate, config=config)
temp_text = trim(' '.join(result_body.itertext()))
# filter output
strip_elements(result_body, 'done')
@@ -583,7 +589,8 @@ def extract_comments(tree, dedupbool, config):
# processed_elem = process_comments_node(elem, potential_tags)
# if processed_elem is not None:
# comments_body.append(processed_elem)
processed_elems = (process_comments_node(elem, potential_tags, dedupbool, config) for elem in subtree.xpath('.//*'))
processed_elems = (process_comments_node(elem, potential_tags, dedupbool, config) for elem in
subtree.xpath('.//*'))
comments_body.extend(elem for elem in processed_elems if elem is not None)
# control
if len(comments_body) > 0: # if it has children
@@ -596,11 +603,12 @@ def extract_comments(tree, dedupbool, config):
return comments_body, temp_comments, len(temp_comments), tree


def compare_extraction(tree, backup_tree, url, body, text, len_text, target_language, favor_precision, favor_recall, include_formatting, include_links, include_images, include_tables, config):
def compare_extraction(tree, backup_tree, url, body, text, len_text, target_language, favor_precision, favor_recall,
include_formatting, include_links, include_images, include_tables, config):
'''Decide whether to choose own or external extraction
based on a series of heuristics'''
# bypass for recall
if favor_recall is True and len_text > config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE')*10:
if favor_recall is True and len_text > config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE') * 10:
return body, text, len_text
algo_flag, jt_result = False, False
# prior cleaning
@@ -625,7 +633,8 @@ def compare_extraction(tree, backup_tree, url, body, text, len_text, target_lang
else:
if not body.xpath('//p//text()') and len_algo > config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE') * 2:
algo_flag = True
elif len(body.xpath('//table')) > len(body.xpath('//p')) and len_algo > config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE') * 2:
elif len(body.xpath('//table')) > len(body.xpath('//p')) and len_algo > config.getint('DEFAULT',
'MIN_EXTRACTED_SIZE') * 2:
algo_flag = True
else:
LOGGER.debug('extraction values: %s %s for %s', len_text, len_algo, url)
@@ -822,7 +831,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False,
if only_with_metadata is True and any(
x is None for x in
[document.date, document.title, document.url]
):
):
LOGGER.error('no metadata for URL %s', url)
raise ValueError
else:
@@ -841,18 +850,23 @@ def bare_extraction(filecontent, url=None, no_fallback=False,

# comments first, then remove
if include_comments is True:
commentsbody, temp_comments, len_comments, cleaned_tree = extract_comments(cleaned_tree, deduplicate, config)
commentsbody, temp_comments, len_comments, cleaned_tree = extract_comments(cleaned_tree, deduplicate,
config)
else:
commentsbody, temp_comments, len_comments = None, '', 0
if favor_precision is True:
cleaned_tree = prune_unwanted_nodes(cleaned_tree, REMOVE_COMMENTS_XPATH)

# extract content
postbody, temp_text, len_text = extract_content(cleaned_tree, favor_precision, favor_recall, include_tables, include_images, include_links, deduplicate, config)
postbody, temp_text, len_text = extract_content(cleaned_tree, favor_precision, favor_recall, include_tables,
include_images, include_links, deduplicate, config)

# compare if necessary
if no_fallback is False:
postbody, temp_text, len_text = compare_extraction(cleaned_tree_backup, tree_backup_1, url, postbody, temp_text, len_text, target_language, favor_precision, favor_recall, include_formatting, include_links, include_images, include_tables, config)
postbody, temp_text, len_text = compare_extraction(cleaned_tree_backup, tree_backup_1, url, postbody,
temp_text, len_text, target_language, favor_precision,
favor_recall, include_formatting, include_links,
include_images, include_tables, config)
# add baseline as additional fallback
# rescue: try to use original/dirty tree # and favor_precision is False=?
if len_text < config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE'):
@@ -872,7 +886,8 @@ def bare_extraction(filecontent, url=None, no_fallback=False,
# size checks
if len_comments < config.getint('DEFAULT', 'MIN_EXTRACTED_COMM_SIZE'):
LOGGER.info('not enough comments %s', url)
if len_text < config.getint('DEFAULT', 'MIN_OUTPUT_SIZE') and len_comments < config.getint('DEFAULT', 'MIN_OUTPUT_COMM_SIZE'):
if len_text < config.getint('DEFAULT', 'MIN_OUTPUT_SIZE') and len_comments < config.getint('DEFAULT',
'MIN_OUTPUT_COMM_SIZE'):
LOGGER.info('text and comments not long enough: %s %s', len_text, len_comments)
raise ValueError

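The core.py changes above are mostly line-length reflows (long calls wrapped across lines) plus passing the favor_precision/favor_recall options through extract_content, recover_wild_text and compare_extraction; the public extraction API itself is unchanged. A minimal sketch of calling that API on a Chinese-language page (hypothetical URL):

import trafilatura

# hypothetical URL; any Chinese-language article page is handled the same way
downloaded = trafilatura.fetch_url('https://example.com/zh/news/article.html')
if downloaded is not None:
    # extract() returns the main text as a string, or None if extraction fails
    text = trafilatura.extract(downloaded, include_comments=True, include_tables=True)
    print(text)
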
6 changes: 5 additions & 1 deletion trafilatura/filters.py
@@ -27,7 +27,11 @@

RE_HTML_LANG = re.compile(r'([a-z]{2})', re.I)

RE_FILTER = re.compile(r'\W*(Drucken|E-?Mail|Facebook|Flipboard|Google|Instagram|Linkedin|Mail|PDF|Pinterest|Pocket|Print|Reddit|Twitter|Whatsapp|Xing|Mehr zum Thema:?|More on this.{,8}$)$', flags=re.IGNORECASE)
# Mostly filters for social media
RE_FILTER = re.compile(r'\W*(Drucken|E-?Mail|Facebook|Flipboard|Google|Instagram|'
'Linkedin|Mail|PDF|Pinterest|Pocket|Print|QQ|Reddit|Twitter|'
'WeChat|WeiBo|Whatsapp|Xing|Mehr zum Thema:?|More on this.{,8}$)$',
flags=re.IGNORECASE)
# COMMENTS_BLACKLIST = ('( Abmelden / Ändern )') # Fill in your details below|Trage deine Daten unten|Kommentar verfassen|Bitte logge dich|Hinterlasse einen Kommentar| to %s| mit %s)


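With QQ, WeChat and WeiBo added to the alternatives, RE_FILTER now also discards isolated share-widget labels that are common on Chinese pages. A small sketch of what the pattern matches, using the regex exactly as in the diff above (how trafilatura applies it to individual text nodes is not shown here):

import re

RE_FILTER = re.compile(r'\W*(Drucken|E-?Mail|Facebook|Flipboard|Google|Instagram|'
                       'Linkedin|Mail|PDF|Pinterest|Pocket|Print|QQ|Reddit|Twitter|'
                       'WeChat|WeiBo|Whatsapp|Xing|Mehr zum Thema:?|More on this.{,8}$)$',
                       flags=re.IGNORECASE)

for line in ('WeChat', 'WeiBo', ' QQ', 'Print', 'Read the full article'):
    print(repr(line), bool(RE_FILTER.match(line)))
# the first four lines are pure share/boilerplate labels and match; the last one does not
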
2 changes: 1 addition & 1 deletion trafilatura/metaxpaths.py
@@ -10,7 +10,7 @@
author_xpaths = [
'//*[(self::a or self::address or self::link or self::p or self::span or self::strong)][@rel="author" or @id="author" or @class="author" or @itemprop="author name" or rel="me"]|//author', # specific
'//*[(self::a or self::div or self::span or self::p or self::strong)][contains(@class, "author-name") or contains(@class, "AuthorName") or contains(@class, "authorName") or contains(@class, "author name")]', # almost specific
'//*[(self::a or self::div or self::span or self::p or self::h4 or self::h3)][contains(@class, "author") or contains(@id, "author") or contains(@itemprop, "author") or @class="byline"]', # almost generic
'//*[(self::a or self::div or self::span or self::p or self::h4 or self::h3)][contains(@class, "author") or contains(@id, "author") or contains(@itemprop, "author") or @class="byline" or contains(@id, "zuozhe") or contains(@class, "zuozhe") or contains(@id, "bianji") or contains(@class, "bianji") or contains(@id, "xiaobian") or contains(@class, "xiaobian")]', # almost generic
'//*[(self::a or self::div or self::span or self::p)][contains(@class, "authors") or contains(@class, "byline") or contains(@class, "ByLine") or contains(@class, "submitted-by") or contains(@class, "posted-by")]', # generic
'//*[contains(@class, "author") or contains(@class, "Author") or contains(@id, "Author") or contains(@class, "screenname") or contains(@data-component, "Byline") or contains(@itemprop, "author") or contains(@class, "writer") or contains(@class, "byline")]', # any element
'//*[(self::a or self::span)][@class="username" or @class="BBL"]', # not common
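The single change to author_xpaths extends the almost-generic rule with pinyin class and id values that frequently mark bylines on Chinese sites: zuozhe (作者, author), bianji (编辑, editor) and xiaobian (小编, staff editor). A minimal lxml sketch with a made-up snippet, using just that one expression from the diff:

from lxml import html

# hypothetical byline markup as found on many Chinese news pages
doc = html.fromstring('<div><span class="zuozhe">张三</span><p>正文内容</p></div>')

AUTHOR_XPATH = ('//*[(self::a or self::div or self::span or self::p or self::h4 or self::h3)]'
                '[contains(@class, "author") or contains(@id, "author") or contains(@itemprop, "author")'
                ' or @class="byline" or contains(@id, "zuozhe") or contains(@class, "zuozhe")'
                ' or contains(@id, "bianji") or contains(@class, "bianji")'
                ' or contains(@id, "xiaobian") or contains(@class, "xiaobian")]')

print([elem.text for elem in doc.xpath(AUTHOR_XPATH)])  # ['张三']
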
(diff for the sixth changed file not loaded)
