Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dev(narugo): add resume for ranged headers of http_get function #2823

Merged
merged 14 commits into from
Feb 11, 2025
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 51 additions & 1 deletion src/huggingface_hub/file_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,57 @@ def http_get(
initial_headers = headers
headers = copy.deepcopy(headers) or {}
if resume_size > 0:
headers["Range"] = "bytes=%d-" % (resume_size,)
original_range = headers.get("Range")
if original_range:
if "," in original_range:
# Multiple ranges are valid HTTP, but there is no unambiguous way to resume them,
# so fall back to a plain resume range and emit a warning.
warnings.warn(
f"Multiple ranges detected - {original_range!r}, using full range after resume", UserWarning
)
headers["Range"] = f"bytes={resume_size}-"
else:
range_match = re.match(r"^\s*bytes\s*=\s*(\d*)\s*-\s*(\d*)\s*$", original_range, re.IGNORECASE)
# check the range format
if not range_match:
# invalid format, not supported by HTTP
raise RuntimeError(f"Invalid range format - {original_range!r}.")
else:
start_str, end_str = range_match.groups()

# suffix range (e.g. bytes=-500)
if not start_str:
if not end_str:
# invalid format, not supported by HTTP
raise RuntimeError(f"Invalid range format - {original_range!r}.")
else:
suffix_value = int(end_str)
new_suffix_value = suffix_value - resume_size
if new_suffix_value <= 0:
# If the file is already fully downloaded, we don't need to download it again.
return
else:
headers["Range"] = f"bytes=-{new_suffix_value}"
else:
# calculate new start position
start = int(start_str)
new_start = start + resume_size
end = int(end_str) if end_str else None

# process range check
if end is not None and new_start > end:
# If the file is already fully downloaded, we don't need to download it again.
return
else:
# set the new range
if end is not None:
new_range = f"bytes={new_start}-{end}"
else:
new_range = f"bytes={new_start}-"
headers["Range"] = new_range
else:
# simple resume case
headers["Range"] = f"bytes={resume_size}-"
narugo1992 marked this conversation as resolved.
Show resolved Hide resolved

r = _request_wrapper(
method="GET", url=url, stream=True, proxies=proxies, headers=headers, timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT
Expand Down
105 changes: 91 additions & 14 deletions tests/test_file_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import warnings
from contextlib import contextmanager
from pathlib import Path
from typing import Iterable
from typing import Iterable, List
from unittest.mock import Mock, patch

import pytest
Expand Down Expand Up @@ -932,8 +932,8 @@ def test_get_pointer_path_but_invalid_relative_filename(self) -> None:
_get_pointer_path("path/to/storage", "abcdef", relative_filename)


class TestHttpGet(unittest.TestCase):
def test_http_get_with_ssl_and_timeout_error(self):
class TestHttpGet:
def test_http_get_with_ssl_and_timeout_error(self, caplog):
def _iter_content_1() -> Iterable[bytes]:
yield b"0" * 10
yield b"0" * 10
Expand Down Expand Up @@ -966,23 +966,100 @@ def _iter_content_4() -> Iterable[bytes]:

temp_file = io.BytesIO()

with self.assertLogs("huggingface_hub.file_download", level="WARNING") as records:
http_get("fake_url", temp_file=temp_file)
http_get("fake_url", temp_file=temp_file)

# Check 3 warnings
self.assertEqual(len(records.records), 3)
assert len([r for r in caplog.records if r.levelname == "WARNING"]) == 3

# Check final value
self.assertEqual(temp_file.tell(), 100)
self.assertEqual(temp_file.getvalue(), b"0" * 100)
assert temp_file.tell() == 100
assert temp_file.getvalue() == b"0" * 100

# Check number of calls + correct range headers
self.assertEqual(len(mock.call_args_list), 4)
self.assertEqual(mock.call_args_list[0].kwargs["headers"], {})
self.assertEqual(mock.call_args_list[1].kwargs["headers"], {"Range": "bytes=20-"})
self.assertEqual(mock.call_args_list[2].kwargs["headers"], {"Range": "bytes=30-"})
self.assertEqual(mock.call_args_list[3].kwargs["headers"], {"Range": "bytes=60-"})
assert len(mock.call_args_list) == 4
assert mock.call_args_list[0].kwargs["headers"] == {}
assert mock.call_args_list[1].kwargs["headers"] == {"Range": "bytes=20-"}
assert mock.call_args_list[2].kwargs["headers"] == {"Range": "bytes=30-"}
assert mock.call_args_list[3].kwargs["headers"] == {"Range": "bytes=60-"}

@pytest.mark.parametrize(
    "initial_range,expected_ranges",
    [
        # Suffix range (bytes=-100): the suffix length shrinks by the bytes already received.
        ("bytes=-100", ["bytes=-100", "bytes=-80", "bytes=-70", "bytes=-40"]),
        # Open-ended range (bytes=15-): the start offset advances by the bytes already received.
        ("bytes=15-", ["bytes=15-", "bytes=35-", "bytes=45-", "bytes=75-"]),
        # Closed range (bytes=15-114): the start advances while the end stays fixed.
        ("bytes=15-114", ["bytes=15-114", "bytes=35-114", "bytes=45-114", "bytes=75-114"]),
    ],
)
def test_http_get_with_range_headers(self, caplog, initial_range: str, expected_ranges: List[str]):
    """Check the ``Range`` header sent on each request when the caller supplied one.

    ``_request_wrapper`` is mocked so that four successive streams deliver
    20, 10, 30 and 40 bytes; the first three end with a network error. The
    test expects three WARNING log records, 100 bytes written to the
    temporary file, and four requests whose ``Range`` headers match
    ``expected_ranges`` in order.
    """

    def _stream(chunk_count: int, failure=None) -> Iterable[bytes]:
        # Yield `chunk_count` chunks of 10 bytes, then fail if an error was given.
        for _ in range(chunk_count):
            yield b"0" * 10
        if failure is not None:
            raise failure

    with patch("huggingface_hub.file_download._request_wrapper") as mock:
        mock.return_value.headers = {"Content-Length": 100}
        mock.return_value.iter_content.side_effect = [
            _stream(2, requests.exceptions.SSLError("Fake SSLError")),
            _stream(1, requests.ReadTimeout("Fake ReadTimeout")),
            _stream(3, requests.ConnectionError("Fake ConnectionError")),
            _stream(4),
        ]

        temp_file = io.BytesIO()

        http_get("fake_url", temp_file=temp_file, headers={"Range": initial_range})

    # One warning is expected per interrupted stream.
    warning_records = [record for record in caplog.records if record.levelname == "WARNING"]
    assert len(warning_records) == 3

    # All 100 bytes must have been written.
    assert temp_file.tell() == 100
    assert temp_file.getvalue() == b"0" * 100

    # Four requests in total, each carrying the expected Range header.
    calls = mock.call_args_list
    assert len(calls) == 4
    for index, wanted_range in enumerate(expected_ranges):
        sent_headers = calls[index].kwargs["headers"]
        assert sent_headers == {"Range": wanted_range}

class CreateSymlinkTest(unittest.TestCase):
@unittest.skipIf(os.name == "nt", "No symlinks on Windows")
Expand Down
Loading