Skip to content

Commit 4892daf

Browse files
committed
Added buffer_size= option to apply_over_* functions for convenience.
The value is passed along to the block size/shape chooser functions when the block size/shape is not explicitly supplied to the apply_over_* function. Also renamed memory= to buffer_size= as it is more self-explanatory.
1 parent 585feb7 commit 4892daf

5 files changed

+45
-32
lines changed

src/delayedarray/DelayedArray.py

+4-5
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from .extract_dense_array import extract_dense_array
1717
from .to_dense_array import to_dense_array
1818
from .extract_sparse_array import extract_sparse_array
19-
from .apply_over_blocks import apply_over_blocks, choose_block_shape_for_iteration
19+
from .apply_over_blocks import apply_over_blocks
2020
from .create_dask_array import create_dask_array
2121
from .chunk_shape import chunk_shape
2222
from .is_sparse import is_sparse
@@ -934,19 +934,18 @@ def _reduce_SparseNdarray(x: SparseNdarray, multipliers: List[int], axes: List[i
934934

935935

936936
def _reduce(x: DelayedArray, axes: List[int], operation: Callable, buffer_size: int):
937-
block_shape = choose_block_shape_for_iteration(x, memory = buffer_size)
938937
multipliers = _create_offset_multipliers(x.shape, axes)
939938
if is_sparse(x):
940939
apply_over_blocks(
941940
x,
942941
lambda position, block : _reduce_SparseNdarray(block, multipliers, axes, position, operation),
943-
block_shape=block_shape,
944-
allow_sparse=True
942+
buffer_size=buffer_size,
943+
allow_sparse=True,
945944
)
946945
else:
947946
apply_over_blocks(
948947
x,
949948
lambda position, block : _reduce_ndarray(block, multipliers, axes, position, operation),
950-
block_shape=block_shape
949+
buffer_size=buffer_size,
951950
)
952951
return

src/delayedarray/apply_over_blocks.py

+12-5
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
__license__ = "MIT"
1212

1313

14-
def choose_block_shape_for_iteration(x, memory: int = 10000000) -> Tuple[int, ...]:
14+
def choose_block_shape_for_iteration(x, buffer_size: int = 10000000) -> Tuple[int, ...]:
1515
"""
1616
Choose the block dimensions for blockwise iteration through an array, see
1717
`~apply_over_blocks` for details.
@@ -21,7 +21,9 @@ def choose_block_shape_for_iteration(x, memory: int = 10000000) -> Tuple[int, ..
2121
2222
dimension: Dimension to iterate over.
2323
24-
memory: Available memory in bytes, to hold a single block in memory.
24+
buffer_size:
25+
Size of the buffer in bytes, to hold a single block per iteration. Larger
26+
values generally improve speed at the cost of memory.
2527
2628
Returns:
2729
Dimensions of the blocks. All values are guaranteed to be positive,
@@ -32,7 +34,7 @@ def choose_block_shape_for_iteration(x, memory: int = 10000000) -> Tuple[int, ..
3234
if d == 0:
3335
return (*(max(1, d) for d in x.shape),)
3436

35-
num_elements = memory / x.dtype.itemsize
37+
num_elements = buffer_size / x.dtype.itemsize
3638
chunk_dims = chunk_shape(x)
3739
block_size = 1
3840
for s in chunk_dims:
@@ -64,7 +66,7 @@ def choose_block_shape_for_iteration(x, memory: int = 10000000) -> Tuple[int, ..
6466
return (*block_dims,)
6567

6668

67-
def apply_over_blocks(x, fun: Callable, block_shape: Optional[Tuple] = None, allow_sparse: bool = False) -> list:
69+
def apply_over_blocks(x, fun: Callable, block_shape: Optional[Tuple] = None, allow_sparse: bool = False, buffer_size: int = 1e8) -> list:
6870
"""
6971
Iterate over an array by blocks. We apply a user-provided function and
7072
collect the results before proceeding to the next block.
@@ -88,11 +90,16 @@ def apply_over_blocks(x, fun: Callable, block_shape: Optional[Tuple] = None, all
8890
``x`` contains a sparse array, the block contents are instead
8991
represented by a :py:class:`~SparseNdarray.SparseNdarray`.
9092
93+
buffer_size:
94+
Size of the buffer in bytes, to hold a single block per iteration. Larger
95+
values generally improve speed at the cost of memory. Only used
96+
if ``block_shape`` is not provided.
97+
9198
Returns:
9299
List containing the output of ``fun`` on each block.
93100
"""
94101
if block_shape is None:
95-
block_shape = choose_block_shape_for_iteration(x)
102+
block_shape = choose_block_shape_for_iteration(x, buffer_size = buffer_size)
96103

97104
num_tasks_total = 1
98105
num_tasks_by_dim = []

src/delayedarray/apply_over_dimension.py

+12-5
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def guess_iteration_block_size(x, dimension, memory: int = 10000000) -> int:
1818
return choose_block_size_for_1d_iteration(x, dimension, memory)
1919

2020

21-
def choose_block_size_for_1d_iteration(x, dimension: int, memory: int = 10000000) -> int:
21+
def choose_block_size_for_1d_iteration(x, dimension: int, buffer_size: int = 10000000) -> int:
2222
"""
2323
Choose a block size for iterating over an array on a certain dimension,
2424
see `~apply_over_dimension` for more details.
@@ -28,7 +28,9 @@ def choose_block_size_for_1d_iteration(x, dimension: int, memory: int = 10000000
2828
2929
dimension: Dimension to iterate over.
3030
31-
memory: Available memory in bytes, to hold a single block in memory.
31+
buffer_size:
32+
Size of the buffer in bytes, to hold a single block per iteration. Larger
33+
values generally improve speed at the cost of memory.
3234
3335
Returns:
3436
Size of the block on the iteration dimension. This is guaranteed to be
@@ -46,7 +48,7 @@ def choose_block_size_for_1d_iteration(x, dimension: int, memory: int = 10000000
4648
if i != dimension:
4749
prod_other *= s
4850

49-
num_elements = memory / x.dtype.itemsize
51+
num_elements = buffer_size / x.dtype.itemsize
5052
ideal = int(num_elements / prod_other)
5153
if ideal == 0:
5254
return 1
@@ -59,7 +61,7 @@ def choose_block_size_for_1d_iteration(x, dimension: int, memory: int = 10000000
5961
return int(ideal / curdim) * curdim
6062

6163

62-
def apply_over_dimension(x, dimension: int, fun: Callable, block_size: Optional[int] = None, allow_sparse: bool = False) -> list:
64+
def apply_over_dimension(x, dimension: int, fun: Callable, block_size: Optional[int] = None, allow_sparse: bool = False, buffer_size: int = 1e8) -> list:
6365
"""
6466
Iterate over an array on a certain dimension. At each iteration, the block
6567
of observations consists of the full extent of all dimensions other than
@@ -87,11 +89,16 @@ def apply_over_dimension(x, dimension: int, fun: Callable, block_size: Optional[
8789
``x`` contains a sparse array, the block contents are instead
8890
represented by a :py:class:`~SparseNdarray.SparseNdarray`.
8991
92+
buffer_size:
93+
Size of the buffer in bytes, to hold a single block per iteration. Larger
94+
values generally improve speed at the cost of memory. Only used
95+
if ``block_size`` is not provided.
96+
9097
Returns:
9198
List containing the output of ``fun`` on each block.
9299
"""
93100
if block_size is None:
94-
block_size = choose_block_size_for_1d_iteration(x, dimension)
101+
block_size = choose_block_size_for_1d_iteration(x, dimension, buffer_size = buffer_size)
95102

96103
limit = x.shape[dimension]
97104
tasks = math.ceil(limit / block_size)

tests/test_apply_over_blocks.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -27,21 +27,21 @@ def chunk_shape_ChunkyBoi(x: _ChunkyBoi):
2727

2828
def test_choose_block_shape_for_iteration():
2929
x = np.random.rand(100, 10)
30-
assert da.choose_block_shape_for_iteration(x, memory=200) == (2, 10)
31-
assert da.choose_block_shape_for_iteration(x, memory=800) == (10, 10)
30+
assert da.choose_block_shape_for_iteration(x, buffer_size=200) == (2, 10)
31+
assert da.choose_block_shape_for_iteration(x, buffer_size=800) == (10, 10)
3232

33-
# Not enough memory.
34-
assert da.choose_block_shape_for_iteration(x, memory=0) == (1, 1)
35-
assert da.choose_block_shape_for_iteration(x, memory=40) == (1, 5)
33+
# Buffer size is too small to hold a full row.
34+
assert da.choose_block_shape_for_iteration(x, buffer_size=0) == (1, 1)
35+
assert da.choose_block_shape_for_iteration(x, buffer_size=40) == (1, 5)
3636

3737
# Behaves correctly with empty objects.
3838
empty = np.random.rand(100, 0)
3939
assert da.choose_block_shape_for_iteration(empty) == (100, 1)
4040

4141
x = _ChunkyBoi((100, 200), (20, 25))
42-
assert da.choose_block_shape_for_iteration(x, memory=4000) == (20, 25)
43-
assert da.choose_block_shape_for_iteration(x, memory=40000) == (100, 50)
44-
assert da.choose_block_shape_for_iteration(x, memory=80000) == (100, 100)
42+
assert da.choose_block_shape_for_iteration(x, buffer_size=4000) == (20, 25)
43+
assert da.choose_block_shape_for_iteration(x, buffer_size=40000) == (100, 50)
44+
assert da.choose_block_shape_for_iteration(x, buffer_size=80000) == (100, 100)
4545

4646

4747
def _dense_sum(position, block):

tests/test_apply_over_dimension.py

+9-9
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,12 @@ def chunk_shape_ChunkyBoi(x: _ChunkyBoi):
2727

2828
def test_choose_block_size_for_1d_iteration():
2929
x = np.random.rand(100, 10)
30-
assert da.choose_block_size_for_1d_iteration(x, 0, memory=800) == 10
31-
assert da.choose_block_size_for_1d_iteration(x, 1, memory=800) == 1
30+
assert da.choose_block_size_for_1d_iteration(x, 0, buffer_size=800) == 10
31+
assert da.choose_block_size_for_1d_iteration(x, 1, buffer_size=800) == 1
3232

33-
# No memory.
34-
assert da.choose_block_size_for_1d_iteration(x, 0, memory=0) == 1
35-
assert da.choose_block_size_for_1d_iteration(x, 1, memory=0) == 1
33+
# Buffer size of zero.
34+
assert da.choose_block_size_for_1d_iteration(x, 0, buffer_size=0) == 1
35+
assert da.choose_block_size_for_1d_iteration(x, 1, buffer_size=0) == 1
3636

3737
# Behaves correctly with empty objects.
3838
empty = np.random.rand(100, 0)
@@ -41,10 +41,10 @@ def test_choose_block_size_for_1d_iteration():
4141

4242
# Making a slightly more complex situation.
4343
x = _ChunkyBoi((100, 200), (20, 25))
44-
assert da.choose_block_size_for_1d_iteration(x, 0, memory=4000) == 2
45-
assert da.choose_block_size_for_1d_iteration(x, 1, memory=4000) == 5
46-
assert da.choose_block_size_for_1d_iteration(x, 0, memory=40000) == 20
47-
assert da.choose_block_size_for_1d_iteration(x, 1, memory=40000) == 50
44+
assert da.choose_block_size_for_1d_iteration(x, 0, buffer_size=4000) == 2
45+
assert da.choose_block_size_for_1d_iteration(x, 1, buffer_size=4000) == 5
46+
assert da.choose_block_size_for_1d_iteration(x, 0, buffer_size=40000) == 20
47+
assert da.choose_block_size_for_1d_iteration(x, 1, buffer_size=40000) == 50
4848

4949

5050
def _dense_sum(position, block):

0 commit comments

Comments
 (0)