Deprecate HfApi.list_files_info #1910

Merged
merged 8 commits on Dec 15, 2023
Changes from 6 commits
7 changes: 4 additions & 3 deletions src/huggingface_hub/_commit_api.py
@@ -39,7 +39,7 @@

UploadMode = Literal["lfs", "regular"]

-# Max is 1,000 per request on the Hub for HfApi.list_files_info
+# Max is 1,000 per request on the Hub for HfApi.get_paths_info
# Otherwise we get:
# HfHubHTTPError: 413 Client Error: Payload Too Large for url: https://huggingface.co/api/datasets/xxx (Request ID: xxx)\n\ntoo many parameters
# See https://github.com/huggingface/huggingface_hub/issues/1503
@@ -555,21 +555,22 @@ def _fetch_lfs_files_to_copy(
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If the Hub API response is improperly formatted.
    """
-    from .hf_api import HfApi
+    from .hf_api import HfApi, RepoFile

    hf_api = HfApi(endpoint=endpoint, token=token)
    files_to_copy = {}
    for src_revision, operations in groupby(copies, key=lambda op: op.src_revision):
        operations = list(operations)  # type: ignore
        paths = [op.src_path_in_repo for op in operations]
        for offset in range(0, len(paths), FETCH_LFS_BATCH_SIZE):
-            src_repo_files = hf_api.list_files_info(
+            src_repo_files = hf_api.get_paths_info(
                repo_id=repo_id,
                paths=paths[offset : offset + FETCH_LFS_BATCH_SIZE],
                revision=src_revision or revision,
                repo_type=repo_type,
            )
            for src_repo_file in src_repo_files:
+                assert isinstance(src_repo_file, RepoFile)
                if not src_repo_file.lfs:
                    raise NotImplementedError("Copying a non-LFS file is not implemented")
                files_to_copy[(src_repo_file.rfilename, src_revision)] = src_repo_file
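For downstream code making the same migration, here is a minimal sketch of querying specific paths with `get_paths_info` (the repo id and file names are hypothetical placeholders). Like the diff above, it narrows results to `RepoFile` entries before reading LFS metadata; the dict-style `lfs` access mirrors the tests in this PR and may differ in later releases.

```python
from huggingface_hub import HfApi
from huggingface_hub.hf_api import RepoFile

api = HfApi()

# The Hub accepts at most ~1,000 paths per request (see the comment above),
# so longer lists should be chunked the way _fetch_lfs_files_to_copy does.
entries = api.get_paths_info(
    repo_id="username/my-model",  # hypothetical repo
    paths=["model.safetensors", "config.json"],
    repo_type="model",
)

for entry in entries:
    # Requested paths can resolve to folders (RepoFolder), so filter to files.
    if isinstance(entry, RepoFile):
        if entry.lfs:
            print(f"{entry.path}: LFS blob, sha256={entry.lfs['sha256']}")
        else:
            print(f"{entry.path}: regular (non-LFS) file")
```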
7 changes: 5 additions & 2 deletions src/huggingface_hub/hf_api.py
@@ -130,6 +130,7 @@
    validate_hf_hub_args,
)
from .utils import tqdm as hf_tqdm
+from .utils._deprecation import _deprecate_method
from .utils._typing import CallableT
from .utils.endpoint_helpers import (
    DatasetFilter,
@@ -2347,6 +2348,7 @@ def file_exists(
        return False

    @validate_hf_hub_args
+    @_deprecate_method(version="1.0", message="Use `list_repo_tree` and `get_paths_info` instead.")
    def list_files_info(
        self,
        repo_id: str,
@@ -2559,9 +2561,10 @@ def list_repo_files(
"""
return [
f.rfilename
for f in self.list_files_info(
repo_id=repo_id, paths=None, revision=revision, repo_type=repo_type, token=token
for f in self.list_repo_tree(
repo_id=repo_id, recursive=True, revision=revision, repo_type=repo_type, token=token
)
if isinstance(f, RepoFile)
]

    @validate_hf_hub_args
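Callers that used `list_files_info(repo_id)` with no `paths` can switch to `list_repo_tree` the same way `list_repo_files` now does internally. A minimal sketch with a hypothetical repo id:

```python
from huggingface_hub import HfApi
from huggingface_hub.hf_api import RepoFile

api = HfApi()

# list_repo_tree yields RepoFile and RepoFolder objects; recursive=True walks
# subfolders, mirroring the updated list_repo_files implementation above.
filenames = [
    entry.path
    for entry in api.list_repo_tree(
        repo_id="username/my-dataset",  # hypothetical repo
        repo_type="dataset",
        recursive=True,
    )
    if isinstance(entry, RepoFile)
]
print(filenames)
```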
19 changes: 17 additions & 2 deletions tests/test_hf_api.py
@@ -99,6 +99,7 @@
    DUMMY_MODEL_ID,
    DUMMY_MODEL_ID_REVISION_ONE_SPECIFIC_COMMIT,
    SAMPLE_DATASET_IDENTIFIER,
+    expect_deprecation,
    repo_name,
    require_git_lfs,
    rmtree_with_retry,
@@ -1000,7 +1001,7 @@ def test_commit_copy_file(self, repo_url: RepoUrl) -> None:
        self.assertIn("lfs Copy (1).bin", repo_files)

        # Check same LFS file
-        repo_file1, repo_file2 = self._api.list_files_info(repo_id=repo_id, paths=["lfs.bin", "lfs Copy.bin"])
+        repo_file1, repo_file2 = self._api.get_paths_info(repo_id=repo_id, paths=["lfs.bin", "lfs Copy.bin"])
        self.assertEqual(repo_file1.lfs["sha256"], repo_file2.lfs["sha256"])

    @use_tmp_repo()
@@ -1158,6 +1159,7 @@ def setUpClass(cls):
    def tearDownClass(cls):
        cls._api.delete_repo(repo_id=cls.repo_id)

+    @expect_deprecation("list_files_info")
    def test_get_regular_file_info(self):
        files = list(self._api.list_files_info(repo_id=self.repo_id, paths="file.md"))
        self.assertEqual(len(files), 1)
@@ -1168,6 +1170,7 @@ def test_get_regular_file_info(self):
        self.assertEqual(file.size, 4)
        self.assertEqual(file.blob_id, "6320cd248dd8aeaab759d5871f8781b5c0505172")

+    @expect_deprecation("list_files_info")
    def test_get_lfs_file_info(self):
        files = list(self._api.list_files_info(repo_id=self.repo_id, paths="lfs.bin"))
        self.assertEqual(len(files), 1)
@@ -1185,34 +1188,41 @@ def test_get_lfs_file_info(self):
        self.assertEqual(file.size, 4)
        self.assertEqual(file.blob_id, "0a828055346279420bd02a4221c177bbcdc045d8")

+    @expect_deprecation("list_files_info")
    def test_list_files(self):
        files = list(self._api.list_files_info(repo_id=self.repo_id, paths=["file.md", "lfs.bin", "2/file_2.md"]))
        self.assertEqual(len(files), 3)
        self.assertEqual({f.path for f in files}, {"file.md", "lfs.bin", "2/file_2.md"})

+    @expect_deprecation("list_files_info")
    def test_list_files_and_folder(self):
        files = list(self._api.list_files_info(repo_id=self.repo_id, paths=["file.md", "lfs.bin", "2"]))
        self.assertEqual(len(files), 3)
        self.assertEqual({f.path for f in files}, {"file.md", "lfs.bin", "2/file_2.md"})

+    @expect_deprecation("list_files_info")
    def test_list_unknown_path_among_other(self):
        files = list(self._api.list_files_info(repo_id=self.repo_id, paths=["file.md", "unknown"]))
        self.assertEqual(len(files), 1)

+    @expect_deprecation("list_files_info")
    def test_list_unknown_path_alone(self):
        files = list(self._api.list_files_info(repo_id=self.repo_id, paths="unknown"))
        self.assertEqual(len(files), 0)

+    @expect_deprecation("list_files_info")
    def test_list_folder_flat(self):
        files = list(self._api.list_files_info(repo_id=self.repo_id, paths=["2"]))
        self.assertEqual(len(files), 1)
        self.assertEqual(files[0].path, "2/file_2.md")

+    @expect_deprecation("list_files_info")
    def test_list_folder_recursively(self):
        files = list(self._api.list_files_info(repo_id=self.repo_id, paths=["1"]))
        self.assertEqual(len(files), 2)
        self.assertEqual({f.path for f in files}, {"1/2/file_1_2.md", "1/file_1.md"})

+    @expect_deprecation("list_files_info")
    def test_list_repo_files_manually(self):
        files = list(self._api.list_files_info(repo_id=self.repo_id))
        self.assertEqual(len(files), 7)
@@ -1221,22 +1231,26 @@ def test_list_repo_files_manually(self):
{".gitattributes", "1/2/file_1_2.md", "1/file_1.md", "2/file_2.md", "3/file_3.md", "file.md", "lfs.bin"},
)

@expect_deprecation("list_files_info")
def test_list_repo_files_alias(self):
self.assertEqual(
set(self._api.list_repo_files(repo_id=self.repo_id)),
set(f.path for f in self._api.list_files_info(repo_id=self.repo_id)),
{".gitattributes", "1/2/file_1_2.md", "1/file_1.md", "2/file_2.md", "3/file_3.md", "file.md", "lfs.bin"},
)

@expect_deprecation("list_files_info")
def test_list_with_root_path_is_ignored(self):
# must use `paths=None`
files = list(self._api.list_files_info(repo_id=self.repo_id, paths="/"))
self.assertEqual(len(files), 0)

@expect_deprecation("list_files_info")
def test_list_with_empty_path_is_invalid(self):
# must use `paths=None`
with self.assertRaises(BadRequestError):
list(self._api.list_files_info(repo_id=self.repo_id, paths=""))

@expect_deprecation("list_files_info")
@with_production_testing
def test_list_files_with_expand(self):
files = list(
@@ -1256,6 +1270,7 @@ def test_list_files_with_expand(self):
        self.assertTrue(vae_model.security["safe"])
        self.assertTrue(isinstance(vae_model.security["av_scan"], dict))  # all details in here

+    @expect_deprecation("list_files_info")
    @with_production_testing
    def test_list_files_without_expand(self):
        files = list(
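Outside the test suite's `expect_deprecation` helper, the new warning can also be checked with the standard library. A minimal sketch, assuming a pre-1.0 release where `list_files_info` still exists and that the message passed to `_deprecate_method` above ends up in the emitted warning text; `"gpt2"` is just a well-known public model repo, and the call requires network access.

```python
import warnings

from huggingface_hub import HfApi

api = HfApi()

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Deprecated call, scheduled for removal in 1.0.
    files = list(api.list_files_info("gpt2"))

# The decorator above passes "Use `list_repo_tree` and `get_paths_info` instead."
# as its message, so it should appear among the recorded warnings.
assert any("get_paths_info" in str(w.message) for w in caught)
print(f"{len(files)} files listed, {len(caught)} warning(s) caught")
```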