downloads: add support for SOCKS proxies (adbar#682)
* downloads: add SOCKS proxy support

* review settings and workflow

* refactor tests

* code linting

* fix tests

---------

Co-authored-by: Adrien Barbaresi <[email protected]>
gremid and adbar authored Aug 28, 2024
1 parent 14c79c0 commit b3aea4a
Showing 5 changed files with 108 additions and 16 deletions.
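A minimal usage sketch of the feature added here (illustrative, not part of the commit): the proxy address is read from the http_proxy environment variable when trafilatura.downloads is first imported, so it has to be set beforehand; the socks5:// addresses below are the ones used by the test services.

    import os

    # Set before importing trafilatura: PROXY_URL is read at module load time.
    os.environ["http_proxy"] = "socks5://localhost:1080"  # or "socks5://user:pass@localhost:1081"

    from trafilatura import fetch_url

    print(fetch_url("https://example.org") is not None)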
21 changes: 20 additions & 1 deletion .github/workflows/tests.yml
@@ -19,7 +19,11 @@ jobs:
os: [ubuntu-latest]
# https://github.com/actions/python-versions/blob/main/versions-manifest.json
python-version: ["3.9", "3.11"] # "3.13-dev"
env: [{ MINIMAL: "true" }, { MINIMAL: "false" }]
env:
- MINIMAL: "true"
PROXY_TEST: "false"
- MINIMAL: "false"
PROXY_TEST: "true"
include:
# custom python versions
- os: ubuntu-20.04
@@ -36,6 +40,19 @@ jobs:
python-version: "3.10"
- os: ubuntu-latest
python-version: "3.12"
services:
  socks_proxy:
    image: ${{ matrix.os == 'ubuntu-latest' && 'serjs/go-socks5-proxy' || '' }}
    ports:
      - 1080:1080
  socks_proxy_auth:
    image: ${{ matrix.os == 'ubuntu-latest' && 'serjs/go-socks5-proxy' || '' }}
    env:
      PROXY_USER: user
      PROXY_PASSWORD: pass
    ports:
      - 1081:1080

steps:
# Python and pip setup
- name: Set up Python ${{ matrix.python-version }}
@@ -97,6 +114,8 @@ jobs:
run: |
python -m pip install pytest pytest-cov
pytest --cov=./ --cov-report=xml
env:
PROXY_TEST: ${{ matrix.env.PROXY_TEST }}

# coverage
- name: Upload coverage to Codecov
18 changes: 18 additions & 0 deletions compose.yml
@@ -0,0 +1,18 @@
services:
  socks_proxy:
    image: serjs/go-socks5-proxy
    ports:
      - 1080:1080
  socks_proxy_auth:
    image: serjs/go-socks5-proxy
    ports:
      - 1081:1080
    environment:
      PROXY_USER: user
      PROXY_PASSWORD: pass
  # tor_proxy:
  #   image: dperson/torproxy
  #   ports:
  #     - 9050:9050
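The two services map to the proxy URLs exercised by the tests: an open SOCKS5 proxy on port 1080 and a password-protected one on port 1081. A quick reachability check once the services are up, e.g. via docker compose up -d (a sketch, assuming the urllib3[socks] extra from setup.py is installed; example.org is just a placeholder target):

    from urllib3.contrib.socks import SOCKSProxyManager

    # Credentials for the authenticated service are parsed from the proxy URL.
    for proxy_url in ("socks5://localhost:1080", "socks5://user:pass@localhost:1081"):
        with SOCKSProxyManager(proxy_url) as manager:
            response = manager.request("GET", "https://example.org")
            print(proxy_url, response.status)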


1 change: 1 addition & 0 deletions setup.py
@@ -31,6 +31,7 @@ def get_long_description():
"htmldate[speed] >= 1.8.1",
"py3langid >= 0.2.2",
"pycurl >= 7.45.3",
"urllib3[socks]",
"zstandard >= 0.20.0",
],
"gui": [
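The urllib3[socks] extra added above is what makes SOCKSProxyManager importable; it pulls in PySocks as the underlying SOCKS implementation. A quick sanity check after installation (a sketch):

    try:
        from urllib3.contrib.socks import SOCKSProxyManager  # requires the PySocks package
    except ImportError:
        print("SOCKS support missing; install with: pip install 'urllib3[socks]'")
    else:
        print("SOCKS support available")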
37 changes: 37 additions & 0 deletions tests/downloads_tests.py
@@ -43,6 +43,7 @@
from trafilatura.settings import DEFAULT_CONFIG, args_to_extractor, use_config
from trafilatura.utils import decode_file, decode_response, handle_compressed_file, load_html


logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

ZERO_CONFIG = DEFAULT_CONFIG
@@ -59,6 +60,7 @@ def _reset_downloads_global_objects():
"""
Force global objects to be re-created
"""
trafilatura.downloads.PROXY_URL = None
trafilatura.downloads.HTTP_POOL = None
trafilatura.downloads.NO_CERT_POOL = None
trafilatura.downloads.RETRY_STRATEGY = None
@@ -154,6 +156,39 @@ def test_fetch():
    _reset_downloads_global_objects()


IS_PROXY_TEST = os.environ.get("PROXY_TEST", "false") == "true"

PROXY_URLS = (
("socks5://localhost:1080", True),
("socks5://user:pass@localhost:1081", True),
("socks5://localhost:10/", False),
("bogus://localhost:1080", False),
)


def proxied(f):
    "Run the download using a potentially malformed proxy address."
    for proxy_url, is_working in PROXY_URLS:
        _reset_downloads_global_objects()
        trafilatura.downloads.PROXY_URL = proxy_url
        if is_working:
            f()
        else:
            with pytest.raises(AssertionError):
                f()
    _reset_downloads_global_objects()


@pytest.mark.skipif(not IS_PROXY_TEST, reason="proxy tests disabled")
def test_proxied_is_live_page():
    proxied(test_is_live_page)


@pytest.mark.skipif(not IS_PROXY_TEST, reason="proxy tests disabled")
def test_proxied_fetch():
    proxied(test_fetch)


def test_config():
    '''Test how configuration options are read and stored.'''
    # default config is none
@@ -241,6 +276,8 @@ def test_queue():
    test_response_object()
    test_is_live_page()
    test_fetch()
    test_proxied_is_live_page()
    test_proxied_fetch()
    test_config()
    test_decode()
    test_queue()
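The proxy tests are gated by the PROXY_TEST environment variable and run each wrapped test against every entry in PROXY_URLS, expecting the wrapped assertions to fail for the unreachable and malformed addresses. A possible way to run just these tests locally once the compose services are up (the -k filter is illustrative):

    import os
    import subprocess

    os.environ["PROXY_TEST"] = "true"  # enables the skipif-gated proxy tests
    subprocess.run(["pytest", "tests/downloads_tests.py", "-k", "proxied"], check=True)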
47 changes: 32 additions & 15 deletions trafilatura/downloads.py
@@ -4,6 +4,7 @@
"""

import logging
import os
import random

from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -16,9 +17,21 @@
import certifi
import urllib3

from courlan import UrlStore
from courlan.network import redirection_test

from .settings import DEFAULT_CONFIG, Extractor
from .utils import URL_BLACKLIST_REGEX, decode_file, is_acceptable_length, make_chunks


try:
    import pycurl
    from urllib3.contrib.socks import SOCKSProxyManager
    PROXY_URL = os.environ.get("http_proxy")
except ImportError:
    PROXY_URL = None

try:
    import pycurl
    CURL_SHARE = pycurl.CurlShare()
    # available options:
    # https://curl.se/libcurl/c/curl_share_setopt.html
@@ -30,27 +43,28 @@
except ImportError:
    HAS_PYCURL = False

from courlan import UrlStore
from courlan.network import redirection_test

try: # Python 3.8+
    from importlib.metadata import version
except ImportError:
    from importlib_metadata import version

from .settings import DEFAULT_CONFIG, Extractor
from .utils import URL_BLACKLIST_REGEX, decode_file, is_acceptable_length, make_chunks


LOGGER = logging.getLogger(__name__)

NUM_CONNECTIONS = 50

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
HTTP_POOL = None
NO_CERT_POOL = None
RETRY_STRATEGY = None


def create_pool(**args):
    "Configure urllib3 download pool according to user-defined settings."
    manager_class = SOCKSProxyManager if PROXY_URL else urllib3.PoolManager
    manager_args = {"proxy_url": PROXY_URL} if PROXY_URL else {}
    manager_args["num_pools"] = 50
    return manager_class(**manager_args, **args)


DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True)
USER_AGENT = (
"trafilatura/" + version("trafilatura") + " (+https://github.com/adbar/trafilatura)"
@@ -161,20 +175,18 @@ def _send_urllib_request(
try:
if no_ssl is False:
if not HTTP_POOL:
HTTP_POOL = urllib3.PoolManager(
HTTP_POOL = create_pool(
retries=RETRY_STRATEGY,
timeout=config.getint("DEFAULT", "DOWNLOAD_TIMEOUT"),
ca_certs=certifi.where(),
num_pools=NUM_CONNECTIONS,
ca_certs=certifi.where()
) # cert_reqs='CERT_REQUIRED'
pool_manager = HTTP_POOL
else:
if not NO_CERT_POOL:
NO_CERT_POOL = urllib3.PoolManager(
NO_CERT_POOL = create_pool(
retries=RETRY_STRATEGY,
timeout=config.getint("DEFAULT", "DOWNLOAD_TIMEOUT"),
cert_reqs="CERT_NONE",
num_pools=NUM_CONNECTIONS,
cert_reqs="CERT_NONE"
)
pool_manager = NO_CERT_POOL
# execute request
@@ -288,6 +300,8 @@ def _pycurl_is_live_page(url: str) -> bool:
    curl.setopt(pycurl.SSL_VERIFYHOST, 0)
    # Set option to avoid getting the response body
    curl.setopt(curl.NOBODY, True)
    if PROXY_URL:
        curl.setopt(pycurl.PRE_PROXY, PROXY_URL)
    # Perform the request
    try:
        curl.perform()
@@ -410,6 +424,9 @@ def _send_pycurl_request(
    headerbytes = BytesIO()
    curl.setopt(pycurl.HEADERFUNCTION, headerbytes.write)

    if PROXY_URL:
        curl.setopt(pycurl.PRE_PROXY, PROXY_URL)

    # TCP_FASTOPEN
    # curl.setopt(pycurl.FAILONERROR, 1)
    # curl.setopt(pycurl.ACCEPT_ENCODING, '')
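On the urllib3 side, create_pool above switches to SOCKSProxyManager whenever PROXY_URL is set; on the pycurl side the same address is passed through PRE_PROXY, libcurl's option for a SOCKS proxy used ahead of any HTTP proxy (here it is the only proxy configured). A standalone sketch of the pycurl path, assuming a SOCKS5 proxy on localhost:1080:

    from io import BytesIO

    import pycurl

    buffer = BytesIO()
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, "https://example.org")
    curl.setopt(pycurl.WRITEDATA, buffer)
    # Route the connection through the SOCKS proxy; no regular HTTP proxy is set.
    curl.setopt(pycurl.PRE_PROXY, "socks5://localhost:1080")
    curl.perform()
    print(curl.getinfo(pycurl.RESPONSE_CODE), len(buffer.getvalue()))
    curl.close()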
