From f98f557b236e8eaf646e24c68d7e28950e065473 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Wed, 15 May 2024 18:12:15 +0200
Subject: [PATCH] downloads: fix deflate decoding and add optional zstd to
 accepted encodings (#594)

* downloads: fix deflate and add optional zstd to accepted encodings

* polish

* better logging and minimal version
---
 setup.py                 |  1 +
 tests/downloads_tests.py | 54 ++++++++++++++++++++++++----------
 trafilatura/utils.py     | 63 +++++++++++++++++++++++++++-------------
 3 files changed, 83 insertions(+), 35 deletions(-)

diff --git a/setup.py b/setup.py
index 9bcaef43..62f1d3de 100644
--- a/setup.py
+++ b/setup.py
@@ -31,6 +31,7 @@ def get_long_description():
         "htmldate[speed] >= 1.8.1",
         "py3langid >= 0.2.2",
         "pycurl >= 7.45.3",
+        "zstandard >= 0.20.0",
     ],
     "gui": [
         "Gooey >= 1.0.1",
diff --git a/tests/downloads_tests.py b/tests/downloads_tests.py
index b9eaa0d8..1ddd17df 100644
--- a/tests/downloads_tests.py
+++ b/tests/downloads_tests.py
@@ -7,16 +7,25 @@
 import logging
 import os
 import sys
+import zlib
 
 try:
     import pycurl
+    HAS_PYCURL = True
 except ImportError:
-    pycurl = None
+    HAS_PYCURL = False
 
 try:
     import brotli
+    HAS_BROTLI = True
 except ImportError:
-    brotli = None
+    HAS_BROTLI = False
+
+try:
+    import zstandard
+    HAS_ZSTD = True
+except ImportError:
+    HAS_ZSTD = False
 
 from time import sleep
 from unittest.mock import patch
@@ -38,7 +47,7 @@
                                     add_to_compressed_dict, fetch_url,
                                     is_live_page, load_download_buffer)
 from trafilatura.settings import DEFAULT_CONFIG, args_to_extractor, use_config
-from trafilatura.utils import decode_file, decode_response, load_html
+from trafilatura.utils import decode_file, decode_response, handle_compressed_file, load_html
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 
@@ -86,7 +95,7 @@ def test_fetch():
     assert _urllib3_is_live_page('https://httpbun.com/status/404') is False
     assert is_live_page('https://httpbun.com/status/403') is False
     # is_live pycurl tests
-    if pycurl is not None:
+    if HAS_PYCURL:
         assert _pycurl_is_live_page('https://httpbun.com/status/301') is True
 
     # fetch_url
@@ -95,7 +104,7 @@ def test_fetch():
     # test if the functions default to no_ssl
     # doesn't work?
     # assert _send_urllib_request('https://expired.badssl.com/', False, False, DEFAULT_CONFIG) is not None
-    if pycurl is not None:
+    if HAS_PYCURL:
         assert _send_pycurl_request('https://expired.badssl.com/', False, False, DEFAULT_CONFIG) is not None
     # no SSL, no decoding
     url = 'https://httpbun.com/status/200'
@@ -103,7 +112,7 @@ def test_fetch():
     response = _send_urllib_request('https://httpbun.com/status/200', no_ssl, True, DEFAULT_CONFIG)
     assert b"200" in response.data and b"OK" in response.data  # JSON
     assert response.headers["x-powered-by"].startswith("httpbun")
-    if pycurl is not None:
+    if HAS_PYCURL:
         response1 = _send_pycurl_request('https://httpbun.com/status/200', True, True, DEFAULT_CONFIG)
         assert response1.headers["x-powered-by"].startswith("httpbun")
         assert _handle_response(url, response1, False, DEFAULT_OPTS).data == _handle_response(url, response, False, DEFAULT_OPTS).data
@@ -137,7 +146,7 @@ def test_fetch():
     res = fetch_url('https://httpbun.com/redirect/1', config=new_config)
     assert res is None
     # Also test max redir implementation on pycurl if available
-    if pycurl is not None:
+    if HAS_PYCURL:
         assert _send_pycurl_request('https://httpbun.com/redirect/1', True, False, new_config) is None
     _reset_downloads_global_objects()  # reset global objects again to avoid affecting other tests
 
@@ -147,10 +156,12 @@ def test_config():
     # default config is none
     assert _parse_config(DEFAULT_CONFIG) == (None, None)
     # default accept-encoding
-    if brotli is None:
-        assert DEFAULT_HEADERS['accept-encoding'].endswith(',deflate')
-    else:
-        assert DEFAULT_HEADERS['accept-encoding'].endswith(',br')
+    accepted = ['deflate', 'gzip']
+    if HAS_BROTLI:
+        accepted.append('br')
+    if HAS_ZSTD:
+        accepted.append('zstd')
+    assert sorted(DEFAULT_HEADERS['accept-encoding'].split(',')) == sorted(accepted)
     # default user-agent
     default = _determine_headers(DEFAULT_CONFIG)
     assert default['User-Agent'] == USER_AGENT
@@ -164,19 +175,32 @@ def test_config():
 
 def test_decode():
     '''Test how responses are being decoded.'''
+    html_string = "<html><head/><body><div>ABC</div></body></html>"
     # response type
-    data = b" "
-    assert decode_file(data) is not None
+    assert decode_file(b" ") is not None
     # GZip
-    html_string = "<html><head/><body><div>ABC</div></body></html>"
     gz_string = gzip.compress(html_string.encode("utf-8"))
+    assert handle_compressed_file(gz_string) == html_string.encode("utf-8")
     assert decode_file(gz_string) == html_string
     with pytest.raises(ValueError):
         decode_response(gz_string)
+    # Deflate
+    deflate_string = zlib.compress(html_string.encode("utf-8"))
+    assert handle_compressed_file(deflate_string) == html_string.encode("utf-8")
+    assert decode_file(deflate_string) == html_string
     # Brotli
-    if brotli is not None:
+    if HAS_BROTLI:
         brotli_string = brotli.compress(html_string.encode("utf-8"))
+        assert handle_compressed_file(brotli_string) == html_string.encode("utf-8")
         assert decode_file(brotli_string) == html_string
+    # ZStandard
+    if HAS_ZSTD:
+        zstd_string = zstandard.compress(html_string.encode("utf-8"))
+        assert handle_compressed_file(zstd_string) == html_string.encode("utf-8")
+        assert decode_file(zstd_string) == html_string
+    # errors
+    for bad_file in ("äöüß", b"\x1f\x8b\x08abc", b"\x28\xb5\x2f\xfdabc"):
+        assert handle_compressed_file(bad_file) == bad_file
 
 
 def test_queue():
diff --git a/trafilatura/utils.py b/trafilatura/utils.py
index 9f2aa3f4..0b5b4517 100644
--- a/trafilatura/utils.py
+++ b/trafilatura/utils.py
@@ -4,20 +4,28 @@
 content filtering and language detection.
 """
 
+import gzip
 import logging
 import re
+import zlib
 
 from functools import lru_cache
-from gzip import decompress
 from html import unescape
 from itertools import islice
 from unicodedata import normalize
 
-# if brotli is installed
+# response compression
 try:
     import brotli
+    HAS_BROTLI = True
 except ImportError:
-    brotli = None
+    HAS_BROTLI = False
+
+try:
+    import zstandard
+    HAS_ZSTD = True
+except ImportError:
+    HAS_ZSTD = False
 
 # language detection
 try:
@@ -93,23 +101,38 @@
 
 
 def handle_compressed_file(filecontent):
-    """Tell if a file's magic number corresponds to the GZip format
-       and try to decode it. Alternatively, try Brotli if the package
-       is installed."""
-    if isinstance(filecontent, bytes):
-        # source: https://stackoverflow.com/questions/3703276/how-to-tell-if-a-file-is-gzip-compressed
-        if filecontent[:2] == b'\x1f\x8b':
-            # decode GZipped data
-            try:
-                filecontent = decompress(filecontent)
-            except (EOFError, OSError):
-                logging.warning('invalid GZ file')
-        # try brotli
-        elif brotli is not None:
-            try:
-                filecontent = brotli.decompress(filecontent)
-            except brotli.error:
-                pass  # logging.debug('invalid Brotli file')
+    """
+    Don't trust response headers and try to decompress a binary string
+    with a cascade of installed packages. Use magic numbers when available.
+    """
+    if not isinstance(filecontent, bytes):
+        return filecontent
+
+    # source: https://stackoverflow.com/questions/3703276/how-to-tell-if-a-file-is-gzip-compressed
+    if filecontent[:3] == b"\x1f\x8b\x08":
+        try:
+            return gzip.decompress(filecontent)
+        except Exception:  # EOFError, OSError, gzip.BadGzipFile
+            LOGGER.warning("invalid GZ file")
+    # try zstandard
+    if HAS_ZSTD and filecontent[:4] == b"\x28\xb5\x2f\xfd":
+        try:
+            return zstandard.decompress(filecontent)  # max_output_size=???
+        except zstandard.ZstdError:
+            LOGGER.warning("invalid ZSTD file")
+    # try brotli
+    if HAS_BROTLI:
+        try:
+            return brotli.decompress(filecontent)
+        except brotli.error:
+            pass  # logging.debug('invalid Brotli file')
+    # try zlib/deflate
+    try:
+        return zlib.decompress(filecontent)
+    except zlib.error:
+        pass
+
+    # return content unchanged if decompression failed
     return filecontent