Skip to content

Commit

Permalink
Deprecate HfApi.list_files_info (#1910)
Browse files Browse the repository at this point in the history
* Deprecate `HfApi.list_files_info`

* Style

* Fix test

* Mypy fix

* Add missing import

* Last fix :)

* Apply suggestions from code review

Co-authored-by: Lucain <[email protected]>

* Style

---------

Co-authored-by: Lucain <[email protected]>
  • Loading branch information
mariosasko and Wauplin authored Dec 15, 2023
1 parent 8d917aa commit 7319ea4
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 7 deletions.
8 changes: 5 additions & 3 deletions src/huggingface_hub/_commit_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@

UploadMode = Literal["lfs", "regular"]

# Max is 1,000 per request on the Hub for HfApi.list_files_info
# Max is 1,000 per request on the Hub for HfApi.get_paths_info
# Otherwise we get:
# HfHubHTTPError: 413 Client Error: Payload Too Large for url: https://huggingface.co/api/datasets/xxx (Request ID: xxx)\n\ntoo many parameters
# See https://github.com/huggingface/huggingface_hub/issues/1503
Expand Down Expand Up @@ -555,21 +555,23 @@ def _fetch_lfs_files_to_copy(
[`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
If the Hub API response is improperly formatted.
"""
from .hf_api import HfApi
from .hf_api import HfApi, RepoFolder

hf_api = HfApi(endpoint=endpoint, token=token)
files_to_copy = {}
for src_revision, operations in groupby(copies, key=lambda op: op.src_revision):
operations = list(operations) # type: ignore
paths = [op.src_path_in_repo for op in operations]
for offset in range(0, len(paths), FETCH_LFS_BATCH_SIZE):
src_repo_files = hf_api.list_files_info(
src_repo_files = hf_api.get_paths_info(
repo_id=repo_id,
paths=paths[offset : offset + FETCH_LFS_BATCH_SIZE],
revision=src_revision or revision,
repo_type=repo_type,
)
for src_repo_file in src_repo_files:
if isinstance(src_repo_file, RepoFolder):
raise NotImplementedError("Copying a folder is not implemented.")
if not src_repo_file.lfs:
raise NotImplementedError("Copying a non-LFS file is not implemented")
files_to_copy[(src_repo_file.rfilename, src_revision)] = src_repo_file
Expand Down
7 changes: 5 additions & 2 deletions src/huggingface_hub/hf_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@
validate_hf_hub_args,
)
from .utils import tqdm as hf_tqdm
from .utils._deprecation import _deprecate_method
from .utils._typing import CallableT
from .utils.endpoint_helpers import (
DatasetFilter,
Expand Down Expand Up @@ -2376,6 +2377,7 @@ def file_exists(
return False

@validate_hf_hub_args
@_deprecate_method(version="0.23", message="Use `list_repo_tree` and `get_paths_info` instead.")
def list_files_info(
self,
repo_id: str,
Expand Down Expand Up @@ -2588,9 +2590,10 @@ def list_repo_files(
"""
return [
f.rfilename
for f in self.list_files_info(
repo_id=repo_id, paths=None, revision=revision, repo_type=repo_type, token=token
for f in self.list_repo_tree(
repo_id=repo_id, recursive=True, revision=revision, repo_type=repo_type, token=token
)
if isinstance(f, RepoFile)
]

@validate_hf_hub_args
Expand Down
19 changes: 17 additions & 2 deletions tests/test_hf_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@
DUMMY_MODEL_ID,
DUMMY_MODEL_ID_REVISION_ONE_SPECIFIC_COMMIT,
SAMPLE_DATASET_IDENTIFIER,
expect_deprecation,
repo_name,
require_git_lfs,
rmtree_with_retry,
Expand Down Expand Up @@ -1003,7 +1004,7 @@ def test_commit_copy_file(self, repo_url: RepoUrl) -> None:
self.assertIn("lfs Copy (1).bin", repo_files)

# Check same LFS file
repo_file1, repo_file2 = self._api.list_files_info(repo_id=repo_id, paths=["lfs.bin", "lfs Copy.bin"])
repo_file1, repo_file2 = self._api.get_paths_info(repo_id=repo_id, paths=["lfs.bin", "lfs Copy.bin"])
self.assertEqual(repo_file1.lfs["sha256"], repo_file2.lfs["sha256"])

@use_tmp_repo()
Expand Down Expand Up @@ -1161,6 +1162,7 @@ def setUpClass(cls):
def tearDownClass(cls):
cls._api.delete_repo(repo_id=cls.repo_id)

@expect_deprecation("list_files_info")
def test_get_regular_file_info(self):
files = list(self._api.list_files_info(repo_id=self.repo_id, paths="file.md"))
self.assertEqual(len(files), 1)
Expand All @@ -1171,6 +1173,7 @@ def test_get_regular_file_info(self):
self.assertEqual(file.size, 4)
self.assertEqual(file.blob_id, "6320cd248dd8aeaab759d5871f8781b5c0505172")

@expect_deprecation("list_files_info")
def test_get_lfs_file_info(self):
files = list(self._api.list_files_info(repo_id=self.repo_id, paths="lfs.bin"))
self.assertEqual(len(files), 1)
Expand All @@ -1188,34 +1191,41 @@ def test_get_lfs_file_info(self):
self.assertEqual(file.size, 4)
self.assertEqual(file.blob_id, "0a828055346279420bd02a4221c177bbcdc045d8")

@expect_deprecation("list_files_info")
def test_list_files(self):
files = list(self._api.list_files_info(repo_id=self.repo_id, paths=["file.md", "lfs.bin", "2/file_2.md"]))
self.assertEqual(len(files), 3)
self.assertEqual({f.path for f in files}, {"file.md", "lfs.bin", "2/file_2.md"})

@expect_deprecation("list_files_info")
def test_list_files_and_folder(self):
files = list(self._api.list_files_info(repo_id=self.repo_id, paths=["file.md", "lfs.bin", "2"]))
self.assertEqual(len(files), 3)
self.assertEqual({f.path for f in files}, {"file.md", "lfs.bin", "2/file_2.md"})

@expect_deprecation("list_files_info")
def test_list_unknown_path_among_other(self):
files = list(self._api.list_files_info(repo_id=self.repo_id, paths=["file.md", "unknown"]))
self.assertEqual(len(files), 1)

@expect_deprecation("list_files_info")
def test_list_unknown_path_alone(self):
files = list(self._api.list_files_info(repo_id=self.repo_id, paths="unknown"))
self.assertEqual(len(files), 0)

@expect_deprecation("list_files_info")
def test_list_folder_flat(self):
files = list(self._api.list_files_info(repo_id=self.repo_id, paths=["2"]))
self.assertEqual(len(files), 1)
self.assertEqual(files[0].path, "2/file_2.md")

@expect_deprecation("list_files_info")
def test_list_folder_recursively(self):
files = list(self._api.list_files_info(repo_id=self.repo_id, paths=["1"]))
self.assertEqual(len(files), 2)
self.assertEqual({f.path for f in files}, {"1/2/file_1_2.md", "1/file_1.md"})

@expect_deprecation("list_files_info")
def test_list_repo_files_manually(self):
files = list(self._api.list_files_info(repo_id=self.repo_id))
self.assertEqual(len(files), 7)
Expand All @@ -1224,22 +1234,26 @@ def test_list_repo_files_manually(self):
{".gitattributes", "1/2/file_1_2.md", "1/file_1.md", "2/file_2.md", "3/file_3.md", "file.md", "lfs.bin"},
)

@expect_deprecation("list_files_info")
def test_list_repo_files_alias(self):
self.assertEqual(
set(self._api.list_repo_files(repo_id=self.repo_id)),
set(f.path for f in self._api.list_files_info(repo_id=self.repo_id)),
{".gitattributes", "1/2/file_1_2.md", "1/file_1.md", "2/file_2.md", "3/file_3.md", "file.md", "lfs.bin"},
)

@expect_deprecation("list_files_info")
def test_list_with_root_path_is_ignored(self):
# must use `paths=None`
files = list(self._api.list_files_info(repo_id=self.repo_id, paths="/"))
self.assertEqual(len(files), 0)

@expect_deprecation("list_files_info")
def test_list_with_empty_path_is_invalid(self):
# must use `paths=None`
with self.assertRaises(BadRequestError):
list(self._api.list_files_info(repo_id=self.repo_id, paths=""))

@expect_deprecation("list_files_info")
@with_production_testing
def test_list_files_with_expand(self):
files = list(
Expand All @@ -1259,6 +1273,7 @@ def test_list_files_with_expand(self):
self.assertTrue(vae_model.security["safe"])
self.assertTrue(isinstance(vae_model.security["av_scan"], dict)) # all details in here

@expect_deprecation("list_files_info")
@with_production_testing
def test_list_files_without_expand(self):
files = list(
Expand Down

0 comments on commit 7319ea4

Please sign in to comment.