Skip to content

Commit 86d737f

Browse files
committed
Modify getsize to return total size, not just the top level
1 parent e1d98cd commit 86d737f

File tree

4 files changed

+51
-49
lines changed

4 files changed

+51
-49
lines changed

docs/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ Enhancements
2828

2929
Maintenance
3030
~~~~~~~~~~~
31+
* ``getsize`` now returns the total size of everything stored under a path, including all nested arrays, not just the top level.
32+
By :user:`Ben Jeffery <benjeffery>` :issue:`253`.
3133

3234
Deprecations
3335
~~~~~~~~~~~~

zarr/storage.py

+37-33
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
from collections import OrderedDict
3131
from collections.abc import MutableMapping
3232
from functools import lru_cache
33-
from os import scandir
3433
from pickle import PicklingError
3534
from threading import Lock, RLock
3635
from typing import Sequence, Mapping, Optional, Union, List, Tuple, Dict, Any
@@ -270,9 +269,15 @@ def _getsize(store: BaseStore, path: Path = None) -> int:
270269
# also include zarr.json?
271270
# members += ['zarr.json']
272271
else:
273-
members = listdir(store, path)
274-
prefix = _path_to_prefix(path)
275-
members = [prefix + k for k in members]
272+
to_visit = [path]
273+
members = []
274+
while to_visit:
275+
print(to_visit)
276+
current_path = to_visit.pop()
277+
current_members = listdir(store, current_path)
278+
prefix = _path_to_prefix(current_path)
279+
members.extend([prefix + k for k in current_members])
280+
to_visit.extend([prefix + k for k in current_members])
276281
for k in members:
277282
try:
278283
v = store[k]
@@ -976,8 +981,12 @@ def getsize(self, path: Path = None):
976981
elif isinstance(value, self.cls):
977982
# total size for directory
978983
size = 0
979-
for v in value.values():
980-
if not isinstance(v, self.cls):
984+
to_visit = list(value.values())
985+
while to_visit:
986+
v = to_visit.pop()
987+
if isinstance(v, self.cls):
988+
to_visit.extend(v.values())
989+
else:
981990
size += buffer_size(v)
982991
return size
983992

@@ -1274,9 +1283,13 @@ def getsize(self, path=None):
12741283
return os.path.getsize(fs_path)
12751284
elif os.path.isdir(fs_path):
12761285
size = 0
1277-
for child in scandir(fs_path):
1278-
if child.is_file():
1279-
size += child.stat().st_size
1286+
for root, _, files in os.walk(fs_path):
1287+
# Include the size of the directory itself, as this can be substantial
1288+
# for directories with many files.
1289+
size += os.path.getsize(root)
1290+
for file in files:
1291+
file_path = os.path.join(root, file)
1292+
size += os.path.getsize(file_path)
12801293
return size
12811294
else:
12821295
return 0
@@ -1921,29 +1934,19 @@ def listdir(self, path=None):
19211934
def getsize(self, path=None):
19221935
path = normalize_storage_path(path)
19231936
with self.mutex:
1924-
children = self.listdir(path)
1925-
if children:
1926-
size = 0
1927-
for child in children:
1928-
if path:
1929-
name = path + "/" + child
1930-
else:
1931-
name = child
1932-
try:
1933-
info = self.zf.getinfo(name)
1934-
except KeyError:
1935-
pass
1936-
else:
1937-
size += info.compress_size
1938-
return size
1939-
elif path:
1937+
to_visit = [path] if path else self.listdir(path)
1938+
total_size = 0
1939+
while to_visit:
1940+
current_path = to_visit.pop()
19401941
try:
1941-
info = self.zf.getinfo(path)
1942-
return info.compress_size
1942+
info = self.zf.getinfo(current_path)
1943+
total_size += info.compress_size
19431944
except KeyError:
1944-
return 0
1945-
else:
1946-
return 0
1945+
children = self.listdir(current_path)
1946+
for child in children:
1947+
full_path = current_path + "/" + child if current_path else child
1948+
to_visit.append(full_path)
1949+
return total_size
19471950

19481951
def clear(self):
19491952
if self.mode == "r":
@@ -2527,6 +2530,8 @@ def listdir(self, path: Path = None):
25272530
return listing
25282531

25292532
def getsize(self, path=None) -> int:
2533+
print("WYF")
2534+
print(self._store, path)
25302535
return getsize(self._store, path=path)
25312536

25322537
def _pop_value(self):
@@ -2795,10 +2800,9 @@ def getsize(self, path=None):
27952800
size = self.cursor.execute(
27962801
"""
27972802
SELECT COALESCE(SUM(LENGTH(v)), 0) FROM zarr
2798-
WHERE k LIKE (? || "%") AND
2799-
0 == INSTR(LTRIM(SUBSTR(k, LENGTH(?) + 1), "/"), "/")
2803+
WHERE k LIKE (? || "%")
28002804
""",
2801-
(path, path),
2805+
(path,),
28022806
)
28032807
for (s,) in size:
28042808
return s

zarr/tests/test_core.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import sys
44
import pickle
55
import shutil
6+
import tempfile
67
from typing import Any, Literal, Optional, Tuple, Union, Sequence
78
import unittest
89
from itertools import zip_longest
@@ -100,6 +101,7 @@ class TestArray:
100101
write_empty_chunks = True
101102
read_only = False
102103
storage_transformers: Tuple[Any, ...] = ()
104+
group_size = 0
103105

104106
def create_store(self) -> BaseStore:
105107
return KVStore(dict())
@@ -229,15 +231,15 @@ def test_nbytes_stored(self):
229231
buffer_size(v) for k, v in z.store.items() if k != "zarr.json"
230232
)
231233
else:
232-
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values())
234+
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + self.group_size
233235
assert expect_nbytes_stored == z.nbytes_stored
234236
z[:] = 42
235237
if self.version == 3:
236238
expect_nbytes_stored = sum(
237239
buffer_size(v) for k, v in z.store.items() if k != "zarr.json"
238240
)
239241
else:
240-
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values())
242+
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + self.group_size
241243
assert expect_nbytes_stored == z.nbytes_stored
242244

243245
# mess with store
@@ -1677,6 +1679,8 @@ def test_nbytes_stored(self):
16771679

16781680

16791681
class TestArrayWithDirectoryStore(TestArray):
1682+
group_size = 4096
1683+
16801684
def create_store(self):
16811685
path = mkdtemp()
16821686
atexit.register(shutil.rmtree, path)
@@ -1686,10 +1690,10 @@ def create_store(self):
16861690
def test_nbytes_stored(self):
16871691
# dict as store
16881692
z = self.create_array(shape=1000, chunks=100)
1689-
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values())
1693+
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + self.group_size
16901694
assert expect_nbytes_stored == z.nbytes_stored
16911695
z[:] = 42
1692-
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values())
1696+
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + self.group_size
16931697
assert expect_nbytes_stored == z.nbytes_stored
16941698

16951699

@@ -2028,6 +2032,7 @@ def expected(self):
20282032

20292033
@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec")
20302034
class TestArrayWithN5FSStore(TestArrayWithN5Store):
2035+
group_size = 0
20312036
def create_store(self):
20322037
path = mkdtemp()
20332038
atexit.register(shutil.rmtree, path)

zarr/tests/test_storage.py

+3-12
Original file line numberDiff line numberDiff line change
@@ -366,19 +366,10 @@ def test_hierarchy(self):
366366

367367
# test getsize (optional)
368368
if hasattr(store, "getsize"):
369-
# TODO: proper behavior of getsize?
370-
# v3 returns size of all nested arrays, not just the
371-
# size of the arrays in the current folder.
372-
if self.version == 2:
373-
assert 6 == store.getsize()
374-
else:
375-
assert 15 == store.getsize()
369+
assert 15 == store.getsize()
376370
assert 3 == store.getsize("a")
377371
assert 3 == store.getsize("b")
378-
if self.version == 2:
379-
assert 3 == store.getsize("c")
380-
else:
381-
assert 9 == store.getsize("c")
372+
assert 3 == store.getsize("c")
382373
assert 3 == store.getsize("c/d")
383374
assert 6 == store.getsize("c/e")
384375
assert 3 == store.getsize("c/e/f")
@@ -2256,7 +2247,7 @@ def test_getsize():
22562247
store["foo"] = b"aaa"
22572248
store["bar"] = b"bbbb"
22582249
store["baz/quux"] = b"ccccc"
2259-
assert 7 == getsize(store)
2250+
assert 12 == getsize(store)
22602251
assert 5 == getsize(store, "baz")
22612252

22622253
store = KVStore(dict())

0 commit comments

Comments
 (0)