From 0ad8c3d4a12756d1b603ff1a753e18e1eb276816 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Fri, 22 Nov 2024 18:39:58 +0100
Subject: [PATCH] type hinting: add remaining types and integrate into CI
 (#748)

* type hinting: add remaining types and integrate into CI
* change workflow order
* fix errors and update setup
* fix build
* try Python 3.14
* fix CI workflow
* add py.typed
* fix remaining strict warnings
---
 .github/workflows/tests.yml     | 33 +++++++++++------------
 MANIFEST.in                     |  1 +
 pyproject.toml                  |  3 +++
 trafilatura/downloads.py        | 11 ++++----
 trafilatura/external.py         |  2 +-
 trafilatura/htmlprocessing.py   | 15 ++++++-----
 trafilatura/main_extractor.py   | 46 ++++++++++++++++-----------------
 trafilatura/metadata.py         |  4 ++-
 trafilatura/py.typed            |  0
 trafilatura/readability_lxml.py |  3 ++-
 trafilatura/settings.py         |  8 +-----
 trafilatura/utils.py            |  6 ++---
 12 files changed, 65 insertions(+), 67 deletions(-)
 create mode 100644 trafilatura/py.typed

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index ca82b852..e990ae57 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -18,7 +18,7 @@ jobs:
       matrix:
         os: [ubuntu-latest]
         # https://github.com/actions/python-versions/blob/main/versions-manifest.json
-        python-version: ["3.9", "3.11"]  # "3.13", "3.14-dev"
+        python-version: ["3.9", "3.11", "3.13"]  # "3.14-dev"
         env:
           - MINIMAL: "true"
             PROXY_TEST: "false"
@@ -57,7 +57,7 @@ jobs:
           python-version: ${{ matrix.python-version }}

       - name: Upgrade pip
-        run: python -m pip install --upgrade pip setuptools wheel
+        run: python -m pip install --upgrade pip

       - name: Get pip cache dir
         id: pip-cache
@@ -75,35 +75,32 @@ jobs:
       # package setup
       - uses: actions/checkout@v4

-      # only where prebuilt wheels do not exist
-      # - name: Install LXML dependencies
-      #   if: ${{ matrix.python-version == '3.13-dev' }}
-      #   run: |
-      #     sudo apt-get update
-      #     sudo apt-get install libxml2-dev libxslt-dev
-
       - name: Install dependencies
         run: python -m pip install -e ".[dev]"

+      - name: Lint with flake8
+        run: |
+          # stop the build if there are Python syntax errors or undefined names
+          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+
       # pycurl installation fix
       - name: Install packages required by pycurl
-        if: ${{ matrix.env.MINIMAL == 'false'}}
+        if: ${{ matrix.env.MINIMAL == 'false' }}
         run: |
           sudo apt-get update
           sudo apt-get install libcurl4-gnutls-dev libgnutls28-dev
           # alternatively: sudo apt-get install libcurl4-openssl-dev libssl-dev

       - name: Install full dependencies
-        if: ${{ matrix.env.MINIMAL == 'false'}}
+        if: ${{ matrix.env.MINIMAL == 'false' }}
         run: python -m pip install -e ".[all]"

-      # tests
-      - name: Lint with flake8
+      - name: Type checking
+        if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.12' }}
         run: |
-          # stop the build if there are Python syntax errors or undefined names
-          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+          mypy -p trafilatura

       - name: Test with pytest
         run: |
@@ -113,7 +110,7 @@ jobs:

       # coverage
       - name: Upload coverage to Codecov
-        if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.11' }}
+        if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.12' }}
        uses: codecov/codecov-action@v4
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
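Note: the new "Type checking" step above is just `mypy -p trafilatura`. A minimal sketch for reproducing it locally, assuming the "dev" extra added below in pyproject.toml is installed; the script name check_types.py is made up for illustration, and mypy's documented Python API (mypy.api.run) mirrors the CLI call:

    # check_types.py - run the same check as the CI "Type checking" step
    from mypy import api

    def main() -> int:
        # equivalent to the shell command: mypy -p trafilatura
        stdout, stderr, exit_status = api.run(["-p", "trafilatura"])
        if stdout:
            print(stdout, end="")
        if stderr:
            print(stderr, end="")
        return exit_status

    if __name__ == "__main__":
        raise SystemExit(main())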
diff --git a/MANIFEST.in b/MANIFEST.in
index c4c8c161..91ba57d5 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,6 +1,7 @@
 include CITATION.cff CONTRIBUTING.md HISTORY.md README.rst LICENSE
 graft trafilatura/data/
 include trafilatura/settings.cfg
+include trafilatura/py.typed

 include tests/__init__.py
 include tests/*test*.py

diff --git a/pyproject.toml b/pyproject.toml
index a08a5943..0d352adc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -94,8 +94,11 @@ trafilatura = "trafilatura.cli:main"
 [project.optional-dependencies]
 dev = [
     "flake8",
+    "mypy",
     "pytest",
     "pytest-cov",
+    "types-lxml",
+    "types-urllib3",
 ]
 all = [
     "brotli",
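Note: shipping the empty py.typed marker (added further down) and listing it in MANIFEST.in is what makes the package PEP 561-compliant, so type checkers read trafilatura's inline annotations instead of treating it as untyped. A hypothetical downstream snippet that benefits from this:

    # consumer.py - once py.typed ships, mypy checks calls like this
    # against trafilatura's own annotations instead of ignoring them
    import trafilatura

    text = trafilatura.extract("<html><body><p>Hello</p></body></html>")
    if text is not None:  # extract() returns Optional[str], so narrow first
        print(text.upper())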
diff --git a/trafilatura/downloads.py b/trafilatura/downloads.py
index 2bae7fe7..9475b0d7 100644
--- a/trafilatura/downloads.py
+++ b/trafilatura/downloads.py
@@ -15,7 +15,6 @@
 from time import sleep
 from typing import (
     Any,
-    ByteString,
     Callable,
     Dict,
     Generator,
@@ -73,7 +72,7 @@ def create_pool(**args: Any) -> Union[urllib3.PoolManager, Any]:
     return manager_class(**manager_args, **args)  # type: ignore[arg-type]


-DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True)
+DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True)  # type: ignore[no-untyped-call]
 USER_AGENT = (
     "trafilatura/" + version("trafilatura") + " (+https://github.com/adbar/trafilatura)"
 )
@@ -106,7 +105,7 @@ class Response:
     "Store information gathered in a HTTP response object."
     __slots__ = ["data", "headers", "html", "status", "url"]

-    def __init__(self, data: ByteString, status: int, url: str) -> None:
+    def __init__(self, data: bytes, status: int, url: str) -> None:
         self.data = data
         self.headers: Optional[Dict[str, str]] = None
         self.html: Optional[str] = None
@@ -332,14 +331,14 @@ def _pycurl_is_live_page(url: str) -> bool:
     curl.setopt(pycurl.SSL_VERIFYPEER, 0)
     curl.setopt(pycurl.SSL_VERIFYHOST, 0)
     # Set option to avoid getting the response body
-    curl.setopt(curl.NOBODY, True)  # type: ignore[attr-defined]
+    curl.setopt(curl.NOBODY, True)
     if PROXY_URL:
         curl.setopt(pycurl.PRE_PROXY, PROXY_URL)
     # Perform the request
     try:
         curl.perform()
         # Get the response code
-        page_exists = curl.getinfo(curl.RESPONSE_CODE) < 400  # type: ignore[attr-defined]
+        page_exists = curl.getinfo(curl.RESPONSE_CODE) < 400
     except pycurl.error as err:
         LOGGER.debug("pycurl HEAD error: %s %s", url, err)
         page_exists = False
@@ -503,7 +502,7 @@ def _send_pycurl_request(
     # ip_info = curl.getinfo(curl.PRIMARY_IP)

     resp = Response(
-        bufferbytes, curl.getinfo(curl.RESPONSE_CODE), curl.getinfo(curl.EFFECTIVE_URL)  # type: ignore[attr-defined]
+        bufferbytes, curl.getinfo(curl.RESPONSE_CODE), curl.getinfo(curl.EFFECTIVE_URL)
     )
     curl.close()

diff --git a/trafilatura/external.py b/trafilatura/external.py
index 72c45741..49801869 100644
--- a/trafilatura/external.py
+++ b/trafilatura/external.py
@@ -103,7 +103,7 @@ def compare_extraction(tree: HtmlElement, backup_tree: HtmlElement, body: _Eleme
     # post-processing: remove unwanted sections
     if use_readability and not jt_result:
-        body, text, len_text = sanitize_tree(body, options)
+        body, text, len_text = sanitize_tree(body, options)  # type: ignore[arg-type]

     return body, text, len_text
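Note: swapping typing.ByteString for plain bytes matches what urllib3 and pycurl actually hand back, and ByteString was deprecated by PEP 585 and removed in Python 3.14, which this commit targets ("try Python 3.14"). A minimal usage sketch under that assumption:

    # sketch: the Response wrapper now expects real bytes
    from trafilatura.downloads import Response

    resp = Response(data=b"<html><body>test</body></html>", status=200,
                    url="https://example.org")
    assert isinstance(resp.data, bytes)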
diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py
index 34b228c7..af855ee2 100644
--- a/trafilatura/htmlprocessing.py
+++ b/trafilatura/htmlprocessing.py
@@ -291,7 +291,7 @@ def convert_lists(elem: _Element) -> None:
     for subelem in elem.iter("dd", "dt", "li"):
         # keep track of dd/dt items
         if subelem.tag in ("dd", "dt"):
-            subelem.set("rend", f"{subelem.tag}-{i}")
+            subelem.set("rend", f"{str(subelem.tag)}-{i}")
             # increment counter after dd in description list
             if subelem.tag == "dd":
                 i += 1
@@ -397,7 +397,7 @@ def convert_tags(
             convert_link(elem, base_url)

     if options.formatting:
-        for elem in tree.iter(REND_TAG_MAPPING.keys()):  # type: ignore[call-overload]
+        for elem in tree.iter(REND_TAG_MAPPING.keys()):
             elem.attrib.clear()
             elem.set("rend", REND_TAG_MAPPING[elem.tag])
             elem.tag = "hi"
@@ -405,7 +405,7 @@ def convert_tags(
         strip_tags(tree, *REND_TAG_MAPPING.keys())

     # iterate over all concerned elements
-    for elem in tree.iter(CONVERSIONS.keys()):  # type: ignore[call-overload]
+    for elem in tree.iter(CONVERSIONS.keys()):
         CONVERSIONS[elem.tag](elem)
     # images
     if options.images:
@@ -430,12 +430,13 @@ def convert_tags(

 def convert_to_html(tree: _Element) -> _Element:
     "Convert XML to simplified HTML."
-    for elem in tree.iter(HTML_CONVERSIONS.keys()):  # type: ignore[call-overload]
+    for elem in tree.iter(HTML_CONVERSIONS.keys()):
+        conversion = HTML_CONVERSIONS[str(elem.tag)]
         # apply function or straight conversion
-        if callable(HTML_CONVERSIONS[elem.tag]):
-            elem.tag = HTML_CONVERSIONS[elem.tag](elem)  # type: ignore[operator]
+        if callable(conversion):
+            elem.tag = conversion(elem)
         else:
-            elem.tag = HTML_CONVERSIONS[elem.tag]
+            elem.tag = conversion  # type: ignore[assignment]
         # handle attributes
         if elem.tag == "a":
             elem.set("href", elem.attrib.pop("target", ""))
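Note: the str(...) casts around elem.tag exist because, in the types-lxml stubs now pulled into the dev extra, .tag is not guaranteed to be a plain string: comment and processing-instruction nodes report a callable instead. A small illustration:

    # why str(elem.tag) is needed before dict lookups and f-strings
    from lxml import etree

    root = etree.fromstring("<dl><dt>term</dt><dd>meaning</dd><!-- x --></dl>")
    for el in root.iter():
        # the comment node's tag is a function, not a str, so cast first
        print(str(el.tag))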
diff --git a/trafilatura/main_extractor.py b/trafilatura/main_extractor.py
index d82c63e1..eb50338e 100644
--- a/trafilatura/main_extractor.py
+++ b/trafilatura/main_extractor.py
@@ -7,7 +7,7 @@
 import re  # import regex as re

 from copy import deepcopy
-from typing import Any, Optional, Tuple, Union
+from typing import Any, Optional, Tuple, Set, Union

 from lxml.etree import _Element, Element, SubElement, strip_elements, strip_tags, tostring
 from lxml.html import HtmlElement
@@ -16,7 +16,7 @@
 from .htmlprocessing import (delete_by_link_density, handle_textnode,
                              link_density_test_tables, process_node,
                              prune_unwanted_nodes)
-from .settings import TAG_CATALOG
+from .settings import TAG_CATALOG, Extractor
 from .utils import FORMATTING_PROTECTED, is_image_file, text_chars_test, trim
 from .xml import delete_element
 from .xpaths import (BODY_XPATH, COMMENTS_DISCARD_XPATH, COMMENTS_XPATH,
@@ -35,12 +35,12 @@
 NOT_AT_THE_END = {'head', 'ref'}


-def _log_event(msg: str, tag: str, text: Optional[Union[bytes, str]]) -> None:
+def _log_event(msg: str, tag: Any, text: Optional[Union[bytes, str]]) -> None:
     "Format extraction event for debugging purposes."
     LOGGER.debug("%s: %s %s", msg, tag, trim(text or "") or "None")


-def handle_titles(element: _Element, options: Any) -> Optional[_Element]:
+def handle_titles(element: _Element, options: Extractor) -> Optional[_Element]:
     '''Process head elements (titles)'''
     if len(element) == 0:
         # maybe needs attention?
@@ -66,7 +66,7 @@
     return None


-def handle_formatting(element: _Element, options: Any) -> Optional[_Element]:
+def handle_formatting(element: _Element, options: Extractor) -> Optional[_Element]:
     '''Process formatting elements (b, i, etc. converted to hi) found outside of paragraphs'''
     formatting = process_node(element, options)
@@ -124,7 +124,7 @@
         sub_child_elem.set(attr, subelem.attrib[attr])


-def process_nested_elements(child: _Element, new_child_elem: _Element, options: Any) -> None:
+def process_nested_elements(child: _Element, new_child_elem: _Element, options: Extractor) -> None:
     "Iterate through an element child and rewire its descendants."
     new_child_elem.text = child.text
     for subelem in child.iterdescendants("*"):
@@ -158,7 +158,7 @@
     childelem.text, childelem.tail = processed_elem.text, processed_elem.tail


-def handle_lists(element: _Element, options: Any) -> Optional[_Element]:
+def handle_lists(element: _Element, options: Extractor) -> Optional[_Element]:
     "Process lists elements including their descendants."
     processed_element = Element(element.tag)
@@ -224,7 +224,7 @@
     return processed_element


-def handle_quotes(element: _Element, options: Any) -> Optional[_Element]:
+def handle_quotes(element: _Element, options: Extractor) -> Optional[_Element]:
     "Process quotes elements."
     if is_code_block_element(element):
         return handle_code_blocks(element)
@@ -242,7 +242,7 @@
     return None


-def handle_other_elements(element: _Element, potential_tags: Any, options: Any) -> Optional[_Element]:
+def handle_other_elements(element: _Element, potential_tags: Set[str], options: Extractor) -> Optional[_Element]:
     "Handle diverse or unknown elements in the scope of relevant tags."
     # handle w3schools code
     if element.tag == "div" and "w3-code" in element.get("class", ""):
@@ -269,7 +269,7 @@
     return None


-def handle_paragraphs(element: _Element, potential_tags: Any, options: Any) -> Optional[_Element]:
+def handle_paragraphs(element: _Element, potential_tags: Set[str], options: Extractor) -> Optional[_Element]:
     "Process paragraphs along with their children, trim and clean the content."
     element.attrib.clear()  # todo: test if necessary
     # strip_tags(element, 'p')  # change in precision due to spaces?
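Note: the signature changes above replace options: Any with the concrete Extractor class and potential_tags: Any with Set[str], so mypy can now catch wrong arguments at call sites. A minimal sketch, assuming Extractor can be instantiated with its defaults:

    from typing import Set

    from trafilatura.settings import Extractor

    options = Extractor()  # default extraction options
    potential_tags: Set[str] = {"p", "head", "list"}
    # passing e.g. a List[str] for potential_tags now fails type checking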
@@ -355,7 +355,7 @@
     return cell_element


-def handle_table(table_elem: _Element, potential_tags: Any, options: Any) -> Optional[_Element]:
+def handle_table(table_elem: _Element, potential_tags: Set[str], options: Extractor) -> Optional[_Element]:
     "Process single table element."
     newtable = Element("table")
@@ -365,7 +365,7 @@ def handle_table(table_elem: _Element, potential_tags: Any, options: Any) -> Opt
     # calculate maximum number of columns per row, including colspan
     max_cols = 0
     for tr in table_elem.iter('tr'):
-        max_cols = max(max_cols, sum(int(td.get("colspan", 1)) for td in tr.iter(TABLE_ELEMS)))  # type: ignore
+        max_cols = max(max_cols, sum(int(td.get("colspan", 1)) for td in tr.iter(TABLE_ELEMS)))

     # explore sub-elements
     seen_header_row = False
@@ -471,7 +471,7 @@
     return processed_element


-def handle_textelem(element: _Element, potential_tags: Any, options: Any) -> Optional[_Element]:
+def handle_textelem(element: _Element, potential_tags: Set[str], options: Extractor) -> Optional[_Element]:
     '''Process text element and determine how to deal with its content'''
     new_element = None
     # bypass: nested elements
@@ -501,7 +501,7 @@
     return new_element


-def recover_wild_text(tree: HtmlElement, result_body: _Element, options: Any, potential_tags: Any = TAG_CATALOG) -> _Element:
+def recover_wild_text(tree: HtmlElement, result_body: _Element, options: Extractor, potential_tags: Any = TAG_CATALOG) -> _Element:
     '''Look for all previously unconsidered wild elements, including outside of the determined
        frame and throughout the document to recover potentially missing text parts'''
     LOGGER.debug('Recovering wild text elements')
@@ -518,11 +518,11 @@
     strip_tags(search_tree, 'span')
     subelems = search_tree.xpath(search_expr)
     result_body.extend(filter(lambda x: x is not None, (handle_textelem(e, potential_tags, options)
-                                                        for e in subelems)))
+                                                        for e in subelems)))  # type: ignore[arg-type]
     return result_body


-def prune_unwanted_sections(tree: HtmlElement, potential_tags: Any, options: Any) -> HtmlElement:
+def prune_unwanted_sections(tree: HtmlElement, potential_tags: Set[str], options: Extractor) -> HtmlElement:
     'Rule-based deletion of targeted document sections'
     favor_precision = options.focus == "precision"
     # prune the rest
@@ -556,7 +556,7 @@
     return tree


-def _extract(tree: HtmlElement, options: Any) -> Tuple[_Element, str, Any]:
+def _extract(tree: HtmlElement, options: Extractor) -> Tuple[_Element, str, Set[str]]:
     # init
     potential_tags = set(TAG_CATALOG)
     if options.tables is True:
@@ -583,7 +583,7 @@
         factor = 1
     else:
         factor = 3
-    if not ptest or len(''.join(ptest)) < options.min_extracted_size * factor:
+    if not ptest or len(''.join(ptest)) < options.min_extracted_size * factor:  # type: ignore[attr-defined]
         potential_tags.add('div')
     # polish list of potential tags
     if 'ref' not in potential_tags:
@@ -609,7 +609,7 @@
     return result_body, temp_text, potential_tags


-def extract_content(cleaned_tree: HtmlElement, options: Any) -> Tuple[_Element, str, int]:
+def extract_content(cleaned_tree: HtmlElement, options: Extractor) -> Tuple[_Element, str, int]:
     '''Find the main content of a page using a set of XPath expressions,
        then extract relevant elements, strip them of unwanted subparts and convert them'''
@@ -622,7 +622,7 @@
     # try parsing wild <p> elements if nothing found or text too short
     # todo: test precision and recall settings here
-    if len(result_body) == 0 or len(temp_text) < options.min_extracted_size:
+    if len(result_body) == 0 or len(temp_text) < options.min_extracted_size:  # type: ignore[attr-defined]
         result_body = recover_wild_text(backup_tree, result_body, options, potential_tags)
         temp_text = ' '.join(result_body.itertext()).strip()
     # filter output
@@ -632,7 +632,7 @@
     return result_body, temp_text, len(temp_text)


-def process_comments_node(elem: _Element, potential_tags: Any, options: Any) -> Optional[_Element]:
+def process_comments_node(elem: _Element, potential_tags: Set[str], options: Extractor) -> Optional[_Element]:
     '''Process comment node and determine how to deal with its content'''
     if elem.tag in potential_tags:
         # print(elem.tag, elem.text_content())
@@ -646,7 +646,7 @@
     return None


-def extract_comments(tree: HtmlElement, options: Any) -> Tuple[_Element, str, int, HtmlElement]:
+def extract_comments(tree: HtmlElement, options: Extractor) -> Tuple[_Element, str, int, HtmlElement]:
     "Try to extract comments out of potential sections in the HTML."
     comments_body = Element("body")
     # define iteration strategy
@@ -668,7 +668,7 @@
         # comments_body.append(processed_elem)
         # processed_elems = (process_comments_node(elem, potential_tags, options) for elem in
         #                    subtree.xpath('.//*'))
-        comments_body.extend(filter(lambda x: x is not None, (process_comments_node(e, potential_tags, options) for e in subtree.xpath(".//*"))))
+        comments_body.extend(filter(lambda x: x is not None, (process_comments_node(e, potential_tags, options) for e in subtree.xpath(".//*"))))  # type: ignore[arg-type]
         # control
         if len(comments_body) > 0:  # if it has children
             LOGGER.debug(expr)
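Note: the targeted `type: ignore[arg-type]` comments above are needed because mypy cannot see that filter(lambda x: x is not None, ...) removes the None values. A comprehension with an explicit None test expresses the same thing in a way mypy verifies without ignores; a generic, self-contained sketch:

    # sketch: explicit narrowing instead of filter() plus type: ignore
    from typing import Iterator, List, Optional

    def produce() -> Iterator[Optional[str]]:
        yield "kept"
        yield None

    kept: List[str] = [item for item in produce() if item is not None]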
diff --git a/trafilatura/metadata.py b/trafilatura/metadata.py
index 09dbd2dd..f6fe6d8c 100644
--- a/trafilatura/metadata.py
+++ b/trafilatura/metadata.py
@@ -38,6 +38,7 @@
     TITLE_XPATHS,
 )

+__all__ = ["Document"]

 LOGGER = logging.getLogger(__name__)
 logging.getLogger("htmldate").setLevel(logging.WARNING)
@@ -309,7 +310,8 @@ def examine_meta(tree: HtmlElement) -> Document:
     # backups
     metadata.sitename = metadata.sitename or backup_sitename
     # copy
-    metadata.set_attributes(tags=tags)
+    metadata.tags = tags
+    # metadata.set_attributes(tags=tags)

     return metadata

diff --git a/trafilatura/py.typed b/trafilatura/py.typed
new file mode 100644
index 00000000..e69de29b

diff --git a/trafilatura/readability_lxml.py b/trafilatura/readability_lxml.py
index 5ccfb9f4..96742bd0 100644
--- a/trafilatura/readability_lxml.py
+++ b/trafilatura/readability_lxml.py
@@ -269,7 +269,8 @@ def class_weight(self, elem: HtmlElement) -> float:

     def score_node(self, elem: HtmlElement) -> Candidate:
         score = self.class_weight(elem)
-        name = elem.tag.lower()
+        tag = str(elem.tag)
+        name = tag.lower()
         if name in DIV_SCORES:
             score += 5
         elif name in BLOCK_SCORES:
diff --git a/trafilatura/settings.py b/trafilatura/settings.py
index 2341f7e9..778543ae 100644
--- a/trafilatura/settings.py
+++ b/trafilatura/settings.py
@@ -279,19 +279,13 @@ def __init__(
         self.filedate: Optional[str] = filedate

     @classmethod
-    def from_dict(cls: Any, data: Dict[str, Any]) -> Any:
+    def from_dict(cls, data: Dict[str, Any]) -> 'Document':
         "Set a series of attributes using a dictionary."
         doc = cls()
         for key, value in data.items():
             setattr(doc, key, value)
         return doc

-    def set_attributes(self, **kwargs: Optional[Dict[str, Any]]) -> None:
-        "Helper function to (re-)set a series of attributes."
-        for key, value in kwargs.items():
-            if value:
-                setattr(self, key, value)
-
     def clean_and_trim(self) -> None:
         "Limit text length and trim the attributes."
         for slot in self.__slots__:
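Note: annotating from_dict with a bare cls and a 'Document' return type (instead of Any) means attribute access on the result is now checked; on Python 3.11+ typing.Self would express the same without the forward reference. A usage sketch:

    # sketch: the result of from_dict is now a typed Document
    from trafilatura.settings import Document

    doc = Document.from_dict({"title": "Test page", "author": "N. N."})
    print(doc.title)  # mypy knows doc is a Document here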
diff --git a/trafilatura/utils.py b/trafilatura/utils.py
index fce29955..aae37d7f 100644
--- a/trafilatura/utils.py
+++ b/trafilatura/utils.py
@@ -32,7 +32,7 @@
     HAS_BROTLI = False

 try:
-    import zstandard  # type: ignore
+    import zstandard
     HAS_ZSTD = True
 except ImportError:
     HAS_ZSTD = False
@@ -114,7 +114,7 @@ def handle_compressed_file(filecontent: bytes) -> bytes:
     # try brotli
     if HAS_BROTLI:
         try:
-            return brotli.decompress(filecontent)
+            return brotli.decompress(filecontent)  # type: ignore[no-any-return]
         except brotli.error:
             pass  # logging.debug('invalid Brotli file')
     # try zlib/deflate
@@ -408,7 +408,7 @@ def language_classifier(temp_text: str, temp_comments: str) -> Optional[str]:
     else:
         LOGGER.warning('Language detector not installed, skipping detection')
         result = None
-    return result
+    return result  # type: ignore[no-any-return]


 def language_filter(temp_text: str, temp_comments: str, target_language: str, docmeta: Any) -> Tuple[bool, Any]:
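Note: the remaining `type: ignore[no-any-return]` on brotli.decompress is needed because brotli ships without type stubs, so its return value is Any under mypy's strict settings. The guarded-import pattern used in utils.py, sketched for context:

    # guarded optional dependency, as used for brotli and zstandard
    try:
        import brotli
        HAS_BROTLI = True
    except ImportError:
        HAS_BROTLI = False

    def try_brotli(blob: bytes) -> bytes:
        # fall back to the raw payload when the decompressor is missing
        if HAS_BROTLI:
            try:
                return brotli.decompress(blob)  # type: ignore[no-any-return]
            except brotli.error:
                pass
        return blob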