From 0ad8c3d4a12756d1b603ff1a753e18e1eb276816 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Fri, 22 Nov 2024 18:39:58 +0100
Subject: [PATCH] type hinting: add remaining types and integrate into CI
 (#748)

* type hinting: add remaining types and integrate into CI
* change workflow order
* fix errors and update setup
* fix build
* try Python 3.14
* fix CI workflow
* add py.typed
* fix remaining strict warnings
---
 .github/workflows/tests.yml     | 33 +++++++++++------------
 MANIFEST.in                     |  1 +
 pyproject.toml                  |  3 +++
 trafilatura/downloads.py        | 11 ++++----
 trafilatura/external.py         |  2 +-
 trafilatura/htmlprocessing.py   | 15 ++++++-----
 trafilatura/main_extractor.py   | 46 ++++++++++++++++-----------------
 trafilatura/metadata.py         |  4 ++-
 trafilatura/py.typed            |  0
 trafilatura/readability_lxml.py |  3 ++-
 trafilatura/settings.py         |  8 +-----
 trafilatura/utils.py            |  6 ++---
 12 files changed, 65 insertions(+), 67 deletions(-)
 create mode 100644 trafilatura/py.typed

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index ca82b852..e990ae57 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -18,7 +18,7 @@ jobs:
       matrix:
         os: [ubuntu-latest]
         # https://github.com/actions/python-versions/blob/main/versions-manifest.json
-        python-version: ["3.9", "3.11"]  # "3.13", "3.14-dev"
+        python-version: ["3.9", "3.11", "3.13"]  # "3.14-dev"
         env:
           - MINIMAL: "true"
             PROXY_TEST: "false"
@@ -57,7 +57,7 @@ jobs:
           python-version: ${{ matrix.python-version }}

       - name: Upgrade pip
-        run: python -m pip install --upgrade pip setuptools wheel
+        run: python -m pip install --upgrade pip

       - name: Get pip cache dir
         id: pip-cache
@@ -75,35 +75,32 @@ jobs:
       # package setup
       - uses: actions/checkout@v4

-      # only where prebuilt wheels do not exist
-      # - name: Install LXML dependencies
-      #   if: ${{ matrix.python-version == '3.13-dev' }}
-      #   run: |
-      #     sudo apt-get update
-      #     sudo apt-get install libxml2-dev libxslt-dev
-
       - name: Install dependencies
         run: python -m pip install -e ".[dev]"

+      - name: Lint with flake8
+        run: |
+          # stop the build if there are Python syntax errors or undefined names
+          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+
       # pycurl installation fix
       - name: Install packages required by pycurl
-        if: ${{ matrix.env.MINIMAL == 'false'}}
+        if: ${{ matrix.env.MINIMAL == 'false' }}
         run: |
           sudo apt-get update
           sudo apt-get install libcurl4-gnutls-dev libgnutls28-dev
           # alternatively: sudo apt-get install libcurl4-openssl-dev libssl-dev

       - name: Install full dependencies
-        if: ${{ matrix.env.MINIMAL == 'false'}}
+        if: ${{ matrix.env.MINIMAL == 'false' }}
         run: python -m pip install -e ".[all]"

-      # tests
-      - name: Lint with flake8
+      - name: Type checking
+        if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.12' }}
         run: |
-          # stop the build if there are Python syntax errors or undefined names
-          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+          mypy -p trafilatura

       - name: Test with pytest
         run: |
@@ -113,7 +110,7 @@ jobs:

       # coverage
       - name: Upload coverage to Codecov
-        if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.11' }}
+        if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.12' }}
        uses: codecov/codecov-action@v4
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
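Note: the new "Type checking" step above is just `mypy -p trafilatura`. A minimal sketch for reproducing it locally, assuming the "dev" extra added below in pyproject.toml is installed; the script name check_types.py is made up for illustration, and mypy's documented Python API (mypy.api.run) mirrors the CLI call:

    # check_types.py - run the same check as the CI "Type checking" step
    from mypy import api

    def main() -> int:
        # equivalent to the shell command: mypy -p trafilatura
        stdout, stderr, exit_status = api.run(["-p", "trafilatura"])
        if stdout:
            print(stdout, end="")
        if stderr:
            print(stderr, end="")
        return exit_status

    if __name__ == "__main__":
        raise SystemExit(main())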
diff --git a/MANIFEST.in b/MANIFEST.in
index c4c8c161..91ba57d5 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,6 +1,7 @@
 include CITATION.cff CONTRIBUTING.md HISTORY.md README.rst LICENSE
 graft trafilatura/data/
 include trafilatura/settings.cfg
+include trafilatura/py.typed

 include tests/__init__.py
 include tests/*test*.py

diff --git a/pyproject.toml b/pyproject.toml
index a08a5943..0d352adc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -94,8 +94,11 @@ trafilatura = "trafilatura.cli:main"
 [project.optional-dependencies]
 dev = [
     "flake8",
+    "mypy",
     "pytest",
     "pytest-cov",
+    "types-lxml",
+    "types-urllib3",
 ]
 all = [
     "brotli",
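Note: shipping the empty py.typed marker (added further down) and listing it in MANIFEST.in is what makes the package PEP 561-compliant, so type checkers read trafilatura's inline annotations instead of treating it as untyped. A hypothetical downstream snippet that benefits from this:

    # consumer.py - once py.typed ships, mypy checks calls like this
    # against trafilatura's own annotations instead of ignoring them
    import trafilatura

    text = trafilatura.extract("<html><body><p>Hello</p></body></html>")
    if text is not None:  # extract() returns Optional[str], so narrow first
        print(text.upper())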
diff --git a/trafilatura/downloads.py b/trafilatura/downloads.py
index 2bae7fe7..9475b0d7 100644
--- a/trafilatura/downloads.py
+++ b/trafilatura/downloads.py
@@ -15,7 +15,6 @@
 from time import sleep
 from typing import (
     Any,
-    ByteString,
     Callable,
     Dict,
     Generator,
@@ -73,7 +72,7 @@ def create_pool(**args: Any) -> Union[urllib3.PoolManager, Any]:
     return manager_class(**manager_args, **args)  # type: ignore[arg-type]


-DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True)
+DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True)  # type: ignore[no-untyped-call]
 USER_AGENT = (
     "trafilatura/" + version("trafilatura") + " (+https://github.com/adbar/trafilatura)"
 )
@@ -106,7 +105,7 @@ class Response:
     "Store information gathered in a HTTP response object."
     __slots__ = ["data", "headers", "html", "status", "url"]

-    def __init__(self, data: ByteString, status: int, url: str) -> None:
+    def __init__(self, data: bytes, status: int, url: str) -> None:
         self.data = data
         self.headers: Optional[Dict[str, str]] = None
         self.html: Optional[str] = None
@@ -332,14 +331,14 @@ def _pycurl_is_live_page(url: str) -> bool:
     curl.setopt(pycurl.SSL_VERIFYPEER, 0)
     curl.setopt(pycurl.SSL_VERIFYHOST, 0)
     # Set option to avoid getting the response body
-    curl.setopt(curl.NOBODY, True)  # type: ignore[attr-defined]
+    curl.setopt(curl.NOBODY, True)
     if PROXY_URL:
         curl.setopt(pycurl.PRE_PROXY, PROXY_URL)
     # Perform the request
     try:
         curl.perform()
         # Get the response code
-        page_exists = curl.getinfo(curl.RESPONSE_CODE) < 400  # type: ignore[attr-defined]
+        page_exists = curl.getinfo(curl.RESPONSE_CODE) < 400
     except pycurl.error as err:
         LOGGER.debug("pycurl HEAD error: %s %s", url, err)
         page_exists = False
@@ -503,7 +502,7 @@ def _send_pycurl_request(
     # ip_info = curl.getinfo(curl.PRIMARY_IP)

     resp = Response(
-        bufferbytes, curl.getinfo(curl.RESPONSE_CODE), curl.getinfo(curl.EFFECTIVE_URL)  # type: ignore[attr-defined]
+        bufferbytes, curl.getinfo(curl.RESPONSE_CODE), curl.getinfo(curl.EFFECTIVE_URL)
     )
     curl.close()

diff --git a/trafilatura/external.py b/trafilatura/external.py
index 72c45741..49801869 100644
--- a/trafilatura/external.py
+++ b/trafilatura/external.py
@@ -103,7 +103,7 @@ def compare_extraction(tree: HtmlElement, backup_tree: HtmlElement, body: _Eleme
     # post-processing: remove unwanted sections
     if use_readability and not jt_result:
-        body, text, len_text = sanitize_tree(body, options)
+        body, text, len_text = sanitize_tree(body, options)  # type: ignore[arg-type]

     return body, text, len_text
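Note: swapping typing.ByteString for plain bytes matches what urllib3 and pycurl actually hand back, and ByteString was deprecated by PEP 585 and removed in Python 3.14, which this commit targets ("try Python 3.14"). A minimal usage sketch under that assumption:

    # sketch: the Response wrapper now expects real bytes
    from trafilatura.downloads import Response

    resp = Response(data=b"<html><body>test</body></html>", status=200,
                    url="https://example.org")
    assert isinstance(resp.data, bytes)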
diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py
index 34b228c7..af855ee2 100644
--- a/trafilatura/htmlprocessing.py
+++ b/trafilatura/htmlprocessing.py
@@ -291,7 +291,7 @@ def convert_lists(elem: _Element) -> None:
     for subelem in elem.iter("dd", "dt", "li"):
         # keep track of dd/dt items
         if subelem.tag in ("dd", "dt"):
-            subelem.set("rend", f"{subelem.tag}-{i}")
+            subelem.set("rend", f"{str(subelem.tag)}-{i}")
             # increment counter after dd in description list
             if subelem.tag == "dd":
                 i += 1
@@ -397,7 +397,7 @@ def convert_tags(
             convert_link(elem, base_url)

     if options.formatting:
-        for elem in tree.iter(REND_TAG_MAPPING.keys()):  # type: ignore[call-overload]
+        for elem in tree.iter(REND_TAG_MAPPING.keys()):
             elem.attrib.clear()
             elem.set("rend", REND_TAG_MAPPING[elem.tag])
             elem.tag = "hi"
@@ -405,7 +405,7 @@ def convert_tags(
         strip_tags(tree, *REND_TAG_MAPPING.keys())

     # iterate over all concerned elements
-    for elem in tree.iter(CONVERSIONS.keys()):  # type: ignore[call-overload]
+    for elem in tree.iter(CONVERSIONS.keys()):
         CONVERSIONS[elem.tag](elem)
     # images
     if options.images:
@@ -430,12 +430,13 @@ def convert_tags(

 def convert_to_html(tree: _Element) -> _Element:
     "Convert XML to simplified HTML."
-    for elem in tree.iter(HTML_CONVERSIONS.keys()):  # type: ignore[call-overload]
+    for elem in tree.iter(HTML_CONVERSIONS.keys()):
+        conversion = HTML_CONVERSIONS[str(elem.tag)]
         # apply function or straight conversion
-        if callable(HTML_CONVERSIONS[elem.tag]):
-            elem.tag = HTML_CONVERSIONS[elem.tag](elem)  # type: ignore[operator]
+        if callable(conversion):
+            elem.tag = conversion(elem)
         else:
-            elem.tag = HTML_CONVERSIONS[elem.tag]
+            elem.tag = conversion  # type: ignore[assignment]
         # handle attributes
         if elem.tag == "a":
             elem.set("href", elem.attrib.pop("target", ""))
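Note: the str(...) casts around elem.tag exist because, in the types-lxml stubs now pulled into the dev extra, .tag is not guaranteed to be a plain string: comment and processing-instruction nodes report a callable instead. A small illustration:

    # why str(elem.tag) is needed before dict lookups and f-strings
    from lxml import etree

    root = etree.fromstring("<dl><dt>term</dt><dd>meaning</dd><!-- x --></dl>")
    for el in root.iter():
        # the comment node's tag is a function, not a str, so cast first
        print(str(el.tag))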
diff --git a/trafilatura/main_extractor.py b/trafilatura/main_extractor.py
index d82c63e1..eb50338e 100644
--- a/trafilatura/main_extractor.py
+++ b/trafilatura/main_extractor.py
@@ -7,7 +7,7 @@
 import re  # import regex as re

 from copy import deepcopy
-from typing import Any, Optional, Tuple, Union
+from typing import Any, Optional, Tuple, Set, Union

 from lxml.etree import _Element, Element, SubElement, strip_elements, strip_tags, tostring
 from lxml.html import HtmlElement
@@ -16,7 +16,7 @@
 from .htmlprocessing import (delete_by_link_density, handle_textnode,
                              link_density_test_tables, process_node,
                              prune_unwanted_nodes)
-from .settings import TAG_CATALOG
+from .settings import TAG_CATALOG, Extractor
 from .utils import FORMATTING_PROTECTED, is_image_file, text_chars_test, trim
 from .xml import delete_element
 from .xpaths import (BODY_XPATH, COMMENTS_DISCARD_XPATH, COMMENTS_XPATH,
@@ -35,12 +35,12 @@
 NOT_AT_THE_END = {'head', 'ref'}


-def _log_event(msg: str, tag: str, text: Optional[Union[bytes, str]]) -> None:
+def _log_event(msg: str, tag: Any, text: Optional[Union[bytes, str]]) -> None:
     "Format extraction event for debugging purposes."
     LOGGER.debug("%s: %s %s", msg, tag, trim(text or "") or "None")


-def handle_titles(element: _Element, options: Any) -> Optional[_Element]:
+def handle_titles(element: _Element, options: Extractor) -> Optional[_Element]:
     '''Process head elements (titles)'''
     if len(element) == 0:
         # maybe needs attention?
@@ -66,7 +66,7 @@
     return None


-def handle_formatting(element: _Element, options: Any) -> Optional[_Element]:
+def handle_formatting(element: _Element, options: Extractor) -> Optional[_Element]:
     '''Process formatting elements (b, i, etc. converted to hi) found outside of paragraphs'''
     formatting = process_node(element, options)
@@ -124,7 +124,7 @@
         sub_child_elem.set(attr, subelem.attrib[attr])


-def process_nested_elements(child: _Element, new_child_elem: _Element, options: Any) -> None:
+def process_nested_elements(child: _Element, new_child_elem: _Element, options: Extractor) -> None:
     "Iterate through an element child and rewire its descendants."
     new_child_elem.text = child.text
     for subelem in child.iterdescendants("*"):
@@ -158,7 +158,7 @@
     childelem.text, childelem.tail = processed_elem.text, processed_elem.tail


-def handle_lists(element: _Element, options: Any) -> Optional[_Element]:
+def handle_lists(element: _Element, options: Extractor) -> Optional[_Element]:
     "Process lists elements including their descendants."
     processed_element = Element(element.tag)
@@ -224,7 +224,7 @@
     return processed_element


-def handle_quotes(element: _Element, options: Any) -> Optional[_Element]:
+def handle_quotes(element: _Element, options: Extractor) -> Optional[_Element]:
     "Process quotes elements."
     if is_code_block_element(element):
         return handle_code_blocks(element)
@@ -242,7 +242,7 @@
     return None


-def handle_other_elements(element: _Element, potential_tags: Any, options: Any) -> Optional[_Element]:
+def handle_other_elements(element: _Element, potential_tags: Set[str], options: Extractor) -> Optional[_Element]:
     "Handle diverse or unknown elements in the scope of relevant tags."
     # handle w3schools code
     if element.tag == "div" and "w3-code" in element.get("class", ""):
@@ -269,7 +269,7 @@
     return None


-def handle_paragraphs(element: _Element, potential_tags: Any, options: Any) -> Optional[_Element]:
+def handle_paragraphs(element: _Element, potential_tags: Set[str], options: Extractor) -> Optional[_Element]:
     "Process paragraphs along with their children, trim and clean the content."
     element.attrib.clear()  # todo: test if necessary
     # strip_tags(element, 'p')  # change in precision due to spaces?
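Note: the signature changes above replace options: Any with the concrete Extractor class and potential_tags: Any with Set[str], so mypy can now catch wrong arguments at call sites. A minimal sketch, assuming Extractor can be instantiated with its defaults:

    from typing import Set

    from trafilatura.settings import Extractor

    options = Extractor()  # default extraction options
    potential_tags: Set[str] = {"p", "head", "list"}
    # passing e.g. a List[str] for potential_tags now fails type checking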
@@ -355,7 +355,7 @@
     return cell_element


-def handle_table(table_elem: _Element, potential_tags: Any, options: Any) -> Optional[_Element]:
+def handle_table(table_elem: _Element, potential_tags: Set[str], options: Extractor) -> Optional[_Element]:
     "Process single table element."
     newtable = Element("table")
@@ -365,7 +365,7 @@ def handle_table(table_elem: _Element, potential_tags: Any, options: Any) -> Opt
     # calculate maximum number of columns per row, including colspan
     max_cols = 0
     for tr in table_elem.iter('tr'):
-        max_cols = max(max_cols, sum(int(td.get("colspan", 1)) for td in tr.iter(TABLE_ELEMS)))  # type: ignore
+        max_cols = max(max_cols, sum(int(td.get("colspan", 1)) for td in tr.iter(TABLE_ELEMS)))

     # explore sub-elements
     seen_header_row = False
@@ -471,7 +471,7 @@
     return processed_element


-def handle_textelem(element: _Element, potential_tags: Any, options: Any) -> Optional[_Element]:
+def handle_textelem(element: _Element, potential_tags: Set[str], options: Extractor) -> Optional[_Element]:
     '''Process text element and determine how to deal with its content'''
     new_element = None
     # bypass: nested elements
@@ -501,7 +501,7 @@
     return new_element


-def recover_wild_text(tree: HtmlElement, result_body: _Element, options: Any, potential_tags: Any = TAG_CATALOG) -> _Element:
+def recover_wild_text(tree: HtmlElement, result_body: _Element, options: Extractor, potential_tags: Any = TAG_CATALOG) -> _Element:
     '''Look for all previously unconsidered wild elements, including outside of the determined
        frame and throughout the document to recover potentially missing text parts'''
     LOGGER.debug('Recovering wild text elements')
@@ -518,11 +518,11 @@
     strip_tags(search_tree, 'span')
     subelems = search_tree.xpath(search_expr)
     result_body.extend(filter(lambda x: x is not None, (handle_textelem(e, potential_tags, options)
-                                                        for e in subelems)))
+                                                        for e in subelems)))  # type: ignore[arg-type]
     return result_body


-def prune_unwanted_sections(tree: HtmlElement, potential_tags: Any, options: Any) -> HtmlElement:
+def prune_unwanted_sections(tree: HtmlElement, potential_tags: Set[str], options: Extractor) -> HtmlElement:
     'Rule-based deletion of targeted document sections'
     favor_precision = options.focus == "precision"
     # prune the rest
@@ -556,7 +556,7 @@
     return tree


-def _extract(tree: HtmlElement, options: Any) -> Tuple[_Element, str, Any]:
+def _extract(tree: HtmlElement, options: Extractor) -> Tuple[_Element, str, Set[str]]:
     # init
     potential_tags = set(TAG_CATALOG)
     if options.tables is True:
@@ -583,7 +583,7 @@
         factor = 1
     else:
         factor = 3
-    if not ptest or len(''.join(ptest)) < options.min_extracted_size * factor:
+    if not ptest or len(''.join(ptest)) < options.min_extracted_size * factor:  # type: ignore[attr-defined]
         potential_tags.add('div')
     # polish list of potential tags
     if 'ref' not in potential_tags:
@@ -609,7 +609,7 @@
     return result_body, temp_text, potential_tags


-def extract_content(cleaned_tree: HtmlElement, options: Any) -> Tuple[_Element, str, int]:
+def extract_content(cleaned_tree: HtmlElement, options: Extractor) -> Tuple[_Element, str, int]:
     '''Find the main content of a page using a set of XPath expressions,
        then extract relevant elements, strip them of unwanted subparts and convert them'''
@@ -622,7 +622,7 @@
     # try parsing wild <p> elements if nothing found or text too short
     # todo: test precision and recall settings here
-    if len(result_body) == 0 or len(temp_text) < options.min_extracted_size:
+    if len(result_body) == 0 or len(temp_text) < options.min_extracted_size:  # type: ignore[attr-defined]
         result_body = recover_wild_text(backup_tree, result_body, options, potential_tags)
         temp_text = ' '.join(result_body.itertext()).strip()
     # filter output
@@ -632,7 +632,7 @@
     return result_body, temp_text, len(temp_text)


-def process_comments_node(elem: _Element, potential_tags: Any, options: Any) -> Optional[_Element]:
+def process_comments_node(elem: _Element, potential_tags: Set[str], options: Extractor) -> Optional[_Element]:
     '''Process comment node and determine how to deal with its content'''
     if elem.tag in potential_tags:
         # print(elem.tag, elem.text_content())
@@ -646,7 +646,7 @@
     return None


-def extract_comments(tree: HtmlElement, options: Any) -> Tuple[_Element, str, int, HtmlElement]:
+def extract_comments(tree: HtmlElement, options: Extractor) -> Tuple[_Element, str, int, HtmlElement]:
     "Try to extract comments out of potential sections in the HTML."
     comments_body = Element("body")
     # define iteration strategy
@@ -668,7 +668,7 @@
         # comments_body.append(processed_elem)
         # processed_elems = (process_comments_node(elem, potential_tags, options) for elem in
         #                    subtree.xpath('.//*'))
-        comments_body.extend(filter(lambda x: x is not None, (process_comments_node(e, potential_tags, options) for e in subtree.xpath(".//*"))))
+        comments_body.extend(filter(lambda x: x is not None, (process_comments_node(e, potential_tags, options) for e in subtree.xpath(".//*"))))  # type: ignore[arg-type]
         # control
         if len(comments_body) > 0:  # if it has children
             LOGGER.debug(expr)
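Note: the targeted `type: ignore[arg-type]` comments above are needed because mypy cannot see that filter(lambda x: x is not None, ...) removes the None values. A comprehension with an explicit None test expresses the same thing in a way mypy verifies without ignores; a generic, self-contained sketch:

    # sketch: explicit narrowing instead of filter() plus type: ignore
    from typing import Iterator, List, Optional

    def produce() -> Iterator[Optional[str]]:
        yield "kept"
        yield None

    kept: List[str] = [item for item in produce() if item is not None]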
diff --git a/trafilatura/metadata.py b/trafilatura/metadata.py
index 09dbd2dd..f6fe6d8c 100644
--- a/trafilatura/metadata.py
+++ b/trafilatura/metadata.py
@@ -38,6 +38,7 @@
     TITLE_XPATHS,
 )

+__all__ = ["Document"]

 LOGGER = logging.getLogger(__name__)
 logging.getLogger("htmldate").setLevel(logging.WARNING)
@@ -309,7 +310,8 @@ def examine_meta(tree: HtmlElement) -> Document:
     # backups
     metadata.sitename = metadata.sitename or backup_sitename
     # copy
-    metadata.set_attributes(tags=tags)
+    metadata.tags = tags
+    # metadata.set_attributes(tags=tags)

     return metadata

diff --git a/trafilatura/py.typed b/trafilatura/py.typed
new file mode 100644
index 00000000..e69de29b

diff --git a/trafilatura/readability_lxml.py b/trafilatura/readability_lxml.py
index 5ccfb9f4..96742bd0 100644
--- a/trafilatura/readability_lxml.py
+++ b/trafilatura/readability_lxml.py
@@ -269,7 +269,8 @@ def class_weight(self, elem: HtmlElement) -> float:

     def score_node(self, elem: HtmlElement) -> Candidate:
         score = self.class_weight(elem)
-        name = elem.tag.lower()
+        tag = str(elem.tag)
+        name = tag.lower()
         if name in DIV_SCORES:
             score += 5
         elif name in BLOCK_SCORES:
diff --git a/trafilatura/settings.py b/trafilatura/settings.py
index 2341f7e9..778543ae 100644
--- a/trafilatura/settings.py
+++ b/trafilatura/settings.py
@@ -279,19 +279,13 @@ def __init__(
         self.filedate: Optional[str] = filedate

     @classmethod
-    def from_dict(cls: Any, data: Dict[str, Any]) -> Any:
+    def from_dict(cls, data: Dict[str, Any]) -> 'Document':
         "Set a series of attributes using a dictionary."
         doc = cls()
         for key, value in data.items():
             setattr(doc, key, value)
         return doc

-    def set_attributes(self, **kwargs: Optional[Dict[str, Any]]) -> None:
-        "Helper function to (re-)set a series of attributes."
-        for key, value in kwargs.items():
-            if value:
-                setattr(self, key, value)
-
     def clean_and_trim(self) -> None:
         "Limit text length and trim the attributes."
         for slot in self.__slots__:
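Note: annotating from_dict with a bare cls and a 'Document' return type (instead of Any) means attribute access on the result is now checked; on Python 3.11+ typing.Self would express the same without the forward reference. A usage sketch:

    # sketch: the result of from_dict is now a typed Document
    from trafilatura.settings import Document

    doc = Document.from_dict({"title": "Test page", "author": "N. N."})
    print(doc.title)  # mypy knows doc is a Document here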
diff --git a/trafilatura/utils.py b/trafilatura/utils.py
index fce29955..aae37d7f 100644
--- a/trafilatura/utils.py
+++ b/trafilatura/utils.py
@@ -32,7 +32,7 @@
     HAS_BROTLI = False

 try:
-    import zstandard  # type: ignore
+    import zstandard
     HAS_ZSTD = True
 except ImportError:
     HAS_ZSTD = False
@@ -114,7 +114,7 @@ def handle_compressed_file(filecontent: bytes) -> bytes:
     # try brotli
     if HAS_BROTLI:
         try:
-            return brotli.decompress(filecontent)
+            return brotli.decompress(filecontent)  # type: ignore[no-any-return]
         except brotli.error:
             pass  # logging.debug('invalid Brotli file')
     # try zlib/deflate
@@ -408,7 +408,7 @@ def language_classifier(temp_text: str, temp_comments: str) -> Optional[str]:
     else:
         LOGGER.warning('Language detector not installed, skipping detection')
         result = None
-    return result
+    return result  # type: ignore[no-any-return]


 def language_filter(temp_text: str, temp_comments: str, target_language: str, docmeta: Any) -> Tuple[bool, Any]:
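Note: the remaining `type: ignore[no-any-return]` on brotli.decompress is needed because brotli ships without type stubs, so its return value is Any under mypy's strict settings. The guarded-import pattern used in utils.py, sketched for context:

    # guarded optional dependency, as used for brotli and zstandard
    try:
        import brotli
        HAS_BROTLI = True
    except ImportError:
        HAS_BROTLI = False

    def try_brotli(blob: bytes) -> bytes:
        # fall back to the raw payload when the decompressor is missing
        if HAS_BROTLI:
            try:
                return brotli.decompress(blob)  # type: ignore[no-any-return]
            except brotli.error:
                pass
        return blob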