diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 82c3614f..afa2ad82 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -19,7 +19,11 @@ jobs:
         os: [ubuntu-latest]
         # https://github.com/actions/python-versions/blob/main/versions-manifest.json
         python-version: ["3.9", "3.11"] # "3.13-dev"
-        env: [{ MINIMAL: "true" }, { MINIMAL: "false" }]
+        env:
+          - MINIMAL: "true"
+            PROXY_TEST: "false"
+          - MINIMAL: "false"
+            PROXY_TEST: "true"
         include:
           # custom python versions
           - os: ubuntu-20.04
@@ -36,6 +40,19 @@ jobs:
             python-version: "3.10"
           - os: ubuntu-latest
             python-version: "3.12"
+    services:
+      socks_proxy:
+        image: ${{ matrix.os == 'ubuntu-latest' && 'serjs/go-socks5-proxy' || '' }}
+        ports:
+          - 1080:1080
+      socks_proxy_auth:
+        image: ${{ matrix.os == 'ubuntu-latest' && 'serjs/go-socks5-proxy' || '' }}
+        env:
+          PROXY_USER: user
+          PROXY_PASSWORD: pass
+        ports:
+          - 1081:1080
+
     steps:
     # Python and pip setup
     - name: Set up Python ${{ matrix.python-version }}
@@ -97,6 +114,8 @@ jobs:
       run: |
         python -m pip install pytest pytest-cov
         pytest --cov=./ --cov-report=xml
+      env:
+        PROXY_TEST: ${{ matrix.env.PROXY_TEST }}
 
     # coverage
     - name: Upload coverage to Codecov
diff --git a/compose.yml b/compose.yml
new file mode 100644
index 00000000..9b5b238e
--- /dev/null
+++ b/compose.yml
@@ -0,0 +1,18 @@
+services:
+  socks_proxy:
+    image: serjs/go-socks5-proxy
+    ports:
+      - 1080:1080
+  socks_proxy_auth:
+    image: serjs/go-socks5-proxy
+    ports:
+      - 1081:1080
+    environment:
+      PROXY_USER: user
+      PROXY_PASSWORD: pass
+#  tor_proxy:
+#    image: dperson/torproxy
+#    ports:
+#      - 9050:9050
+
+
diff --git a/setup.py b/setup.py
index 29c74109..13ae74f2 100644
--- a/setup.py
+++ b/setup.py
@@ -31,6 +31,7 @@ def get_long_description():
         "htmldate[speed] >= 1.8.1",
         "py3langid >= 0.2.2",
         "pycurl >= 7.45.3",
+        "urllib3[socks]",
         "zstandard >= 0.20.0",
     ],
     "gui": [
diff --git a/tests/downloads_tests.py b/tests/downloads_tests.py
index a443e922..a286bc8d 100644
--- a/tests/downloads_tests.py
+++ b/tests/downloads_tests.py
@@ -43,6 +43,7 @@
 from trafilatura.settings import DEFAULT_CONFIG, args_to_extractor, use_config
 from trafilatura.utils import decode_file, decode_response, handle_compressed_file, load_html
 
+
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 
 ZERO_CONFIG = DEFAULT_CONFIG
@@ -59,6 +60,7 @@ def _reset_downloads_global_objects():
     """
     Force global objects to be re-created
     """
+    trafilatura.downloads.PROXY_URL = None
     trafilatura.downloads.HTTP_POOL = None
     trafilatura.downloads.NO_CERT_POOL = None
     trafilatura.downloads.RETRY_STRATEGY = None
@@ -154,6 +156,50 @@ def test_fetch():
     _reset_downloads_global_objects()
 
 
+# Proxy tests need the SOCKS servers declared in compose.yml: opt-in only.
+IS_PROXY_TEST = os.environ.get("PROXY_TEST", "false") == "true"
+
+# (proxy URL, whether a download through it is expected to succeed)
+PROXY_URLS = (
+    ("socks5://localhost:1080", True),
+    ("socks5://user:pass@localhost:1081", True),
+    ("socks5://localhost:10/", False),
+    ("bogus://localhost:1080", False),
+)
+
+
+def proxied(f):
+    """Run f() once per entry of PROXY_URLS with that proxy configured.
+
+    f must pass with a working proxy and raise AssertionError with an
+    unusable one. The download globals are reset afterwards even when a
+    check fails, so that no proxy setting leaks into the following tests.
+    """
+    try:
+        for proxy_url, is_working in PROXY_URLS:
+            _reset_downloads_global_objects()
+            trafilatura.downloads.PROXY_URL = proxy_url
+            if is_working:
+                f()
+            else:
+                with pytest.raises(AssertionError):
+                    f()
+    finally:
+        _reset_downloads_global_objects()
+
+
+@pytest.mark.skipif(not IS_PROXY_TEST, reason="proxy tests disabled")
+def test_proxied_is_live_page():
+    "Run test_is_live_page() through every proxy in PROXY_URLS."
+    proxied(test_is_live_page)
+
+
+@pytest.mark.skipif(not IS_PROXY_TEST, reason="proxy tests disabled")
+def test_proxied_fetch():
+    "Run test_fetch() through every proxy in PROXY_URLS."
+    proxied(test_fetch)
+
+
 def test_config():
     '''Test how configuration options are read and stored.'''
     # default config is none
@@ -241,6 +287,9 @@ def test_queue():
     test_response_object()
     test_is_live_page()
     test_fetch()
+    if IS_PROXY_TEST:  # skipif marks have no effect on direct calls
+        test_proxied_is_live_page()
+        test_proxied_fetch()
     test_config()
     test_decode()
     test_queue()
diff --git a/trafilatura/downloads.py b/trafilatura/downloads.py
index 8f0e6858..5a27114f 100644
--- a/trafilatura/downloads.py
+++ b/trafilatura/downloads.py
@@ -4,6 +4,7 @@
 """
 
 import logging
+import os
 import random
 
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -16,9 +17,21 @@
 import certifi
 import urllib3
 
+from courlan import UrlStore
+from courlan.network import redirection_test
+
+from .settings import DEFAULT_CONFIG, Extractor
+from .utils import URL_BLACKLIST_REGEX, decode_file, is_acceptable_length, make_chunks
+
+
 try:
-    import pycurl
+    from urllib3.contrib.socks import SOCKSProxyManager
+    PROXY_URL = os.environ.get("http_proxy")
+except ImportError:
+    PROXY_URL = None
 
+try:
+    import pycurl
     CURL_SHARE = pycurl.CurlShare()
     # available options:
     # https://curl.se/libcurl/c/curl_share_setopt.html
@@ -30,27 +43,37 @@
 except ImportError:
     HAS_PYCURL = False
 
-from courlan import UrlStore
-from courlan.network import redirection_test
-
 try:
     # Python 3.8+
     from importlib.metadata import version
 except ImportError:
     from importlib_metadata import version
 
-from .settings import DEFAULT_CONFIG, Extractor
-from .utils import URL_BLACKLIST_REGEX, decode_file, is_acceptable_length, make_chunks
-
 LOGGER = logging.getLogger(__name__)
 
-NUM_CONNECTIONS = 50
-
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 HTTP_POOL = None
 NO_CERT_POOL = None
 RETRY_STRATEGY = None
 
+
+def create_pool(**args):
+    """Configure urllib3 download pool according to user-defined settings.
+
+    Keyword arguments are passed on to the pool manager. Without PROXY_URL
+    a plain PoolManager is created. URLs with a socks* scheme go through
+    SOCKSProxyManager; any other scheme is handed to urllib3.ProxyManager,
+    since SOCKSProxyManager only accepts SOCKS schemes while http_proxy
+    conventionally holds an http:// URL.
+    """
+    manager_args = {"num_pools": 50, **args}
+    if not PROXY_URL:
+        return urllib3.PoolManager(**manager_args)
+    if PROXY_URL.lower().startswith("socks"):
+        return SOCKSProxyManager(PROXY_URL, **manager_args)
+    return urllib3.ProxyManager(PROXY_URL, **manager_args)
+
+
 DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True)
 USER_AGENT = (
     "trafilatura/" + version("trafilatura") + " (+https://github.com/adbar/trafilatura)"
@@ -161,20 +184,18 @@ def _send_urllib_request(
     try:
         if no_ssl is False:
             if not HTTP_POOL:
-                HTTP_POOL = urllib3.PoolManager(
+                HTTP_POOL = create_pool(
                     retries=RETRY_STRATEGY,
                     timeout=config.getint("DEFAULT", "DOWNLOAD_TIMEOUT"),
-                    ca_certs=certifi.where(),
-                    num_pools=NUM_CONNECTIONS,
+                    ca_certs=certifi.where()
                 )  # cert_reqs='CERT_REQUIRED'
             pool_manager = HTTP_POOL
         else:
             if not NO_CERT_POOL:
-                NO_CERT_POOL = urllib3.PoolManager(
+                NO_CERT_POOL = create_pool(
                     retries=RETRY_STRATEGY,
                     timeout=config.getint("DEFAULT", "DOWNLOAD_TIMEOUT"),
-                    cert_reqs="CERT_NONE",
-                    num_pools=NUM_CONNECTIONS,
+                    cert_reqs="CERT_NONE"
                 )
             pool_manager = NO_CERT_POOL
         # execute request
@@ -288,6 +309,8 @@ def _pycurl_is_live_page(url: str) -> bool:
     curl.setopt(pycurl.SSL_VERIFYHOST, 0)
     # Set option to avoid getting the response body
     curl.setopt(curl.NOBODY, True)
+    if PROXY_URL:
+        curl.setopt(pycurl.PRE_PROXY, PROXY_URL)
     # Perform the request
     try:
         curl.perform()
@@ -410,6 +433,9 @@ def _send_pycurl_request(
     headerbytes = BytesIO()
     curl.setopt(pycurl.HEADERFUNCTION, headerbytes.write)
 
+    if PROXY_URL:
+        curl.setopt(pycurl.PRE_PROXY, PROXY_URL)
+
     # TCP_FASTOPEN
     # curl.setopt(pycurl.FAILONERROR, 1)
     # curl.setopt(pycurl.ACCEPT_ENCODING, '')