Skip to content

Commit 6b0d9cd

Browse files
committed
Exported the converter from chunk shape to a SimpleGrid.
Also explained the various choices for the cost factor, now that we have to document the converter's arguments.
1 parent 87b7525 commit 6b0d9cd

File tree

3 files changed

+70
-18
lines changed

3 files changed

+70
-18
lines changed

src/delayedarray/Grid.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,8 @@ def __init__(self, boundaries: Tuple[Sequence[int], ...], cost_factor: float, in
7474
Positive number representing the cost of iteration over each
7575
element of the grid's array. The actual cost is defined by the
7676
product of the cost factor by the array size. This is used to
77-
choose between iteration schemes.
77+
choose between iteration schemes; as a reference, extraction
78+
from an in-memory NumPy array has a cost factor of 1.
7879
7980
internals:
8081
Internal use only.

src/delayedarray/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,6 @@
4141
from .create_dask_array import create_dask_array
4242
from .is_sparse import is_sparse
4343
from .is_masked import is_masked
44-
from .chunk_grid import chunk_grid
44+
from .chunk_grid import chunk_grid, chunk_shape_to_grid
4545
from .is_pristine import is_pristine
4646
from .wrap import wrap

src/delayedarray/chunk_grid.py

+67-16
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,30 @@
1212
__license__ = "MIT"
1313

1414

15-
def _chunk_shape_to_grid(chunks: Sequence[int], shape: Tuple[int, ...], cost_factor: int):
15+
def chunk_shape_to_grid(chunks: Sequence[int], shape: Tuple[int, ...], cost_factor: int) -> SimpleGrid:
16+
"""
17+
Convert a chunk shape to a :py:class:`~delayedarray.Grid.SimpleGrid`.
18+
This assumes that the underlying array is split up into regular intervals
19+
on each dimension; the first chunk should start from zero, and only the
20+
last chunk may be of a different size (bounded by the dimension extent).
21+
22+
Args:
23+
chunks:
24+
Chunk size for each dimension. These should be positive.
25+
26+
shape:
27+
Extent of each dimension of the array. Each extent should be non-negative,
28+
and ``shape`` should have the same length as ``chunks``.
29+
30+
cost_factor:
31+
Cost factor for iterating over each element of the associated
32+
array. This is used to decide between iteration schemes and can be
33+
increased for more expensive types, e.g., file-backed arrays. As a
34+
reference, in-memory NumPy arrays are assigned a cost factor of 1.
35+
36+
Returns:
37+
A ``SimpleGrid`` object whose boundaries on each dimension are derived from the chunk shape.
38+
"""
1639
out = []
1740
for i, ch in enumerate(chunks):
1841
sh = shape[i]
@@ -42,8 +65,13 @@ def chunk_grid(x: Any) -> AbstractGrid:
4265

4366

4467
@chunk_grid.register
45-
def chunk_grid_ndarray(x: ndarray):
46-
"""See :py:meth:`~delayedarray.chunk_grid.chunk_grid`."""
68+
def chunk_grid_ndarray(x: ndarray) -> SimpleGrid:
69+
"""
70+
See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.
71+
72+
The cost factor for iteration is set to 1, which is considered the lowest
73+
cost for data extraction given that everything is stored in memory.
74+
"""
4775
raw = [1] * len(x.shape)
4876
if x.flags.f_contiguous:
4977
raw[0] = x.shape[0]
@@ -52,15 +80,21 @@ def chunk_grid_ndarray(x: ndarray):
5280
# to figure that out from NumPy flags. Guess we should just assume
5381
# that it's C-contiguous, given that most things are.
5482
raw[-1] = x.shape[-1]
55-
return _chunk_shape_to_grid(raw, x.shape, cost_factor=1)
83+
return chunk_shape_to_grid(raw, x.shape, cost_factor=1)
5684

5785

5886
@chunk_grid.register
59-
def chunk_grid_SparseNdarray(x: SparseNdarray):
60-
"""See :py:meth:`~delayedarray.chunk_grid.chunk_grid`."""
87+
def chunk_grid_SparseNdarray(x: SparseNdarray) -> SimpleGrid:
88+
"""
89+
See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.
90+
91+
The cost factor for iteration is set to 1.5. This is slightly higher than
92+
that of dense NumPy arrays as the ``SparseNdarray`` is a bit more expensive
93+
for random access on the first dimension.
94+
"""
6195
raw = [1] * len(x.shape)
6296
raw[0] = x.shape[0]
63-
return _chunk_shape_to_grid(raw, x.shape, cost_factor=1.5)
97+
return chunk_shape_to_grid(raw, x.shape, cost_factor=1.5)
6498

6599

66100
# If scipy is installed, we add all the methods for the various scipy.sparse matrices.
@@ -70,19 +104,36 @@ def chunk_grid_SparseNdarray(x: SparseNdarray):
70104

71105

72106
@chunk_grid.register
73-
def chunk_grid_csc_matrix(x: sp.csc_matrix):
74-
"""See :py:meth:`~delayedarray.chunk_grid.chunk_grid`."""
75-
return _chunk_shape_to_grid((x.shape[0], 1), x.shape, cost_factor=1.5)
107+
def chunk_grid_csc_matrix(x: sp.csc_matrix) -> SimpleGrid:
108+
"""
109+
See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.
110+
111+
The cost factor for iteration is set to 1.5. This is slightly higher
112+
than that of dense NumPy arrays as CSC matrices are a bit more
113+
expensive for random row access.
114+
"""
115+
return chunk_shape_to_grid((x.shape[0], 1), x.shape, cost_factor=1.5)
76116

77117

78118
@chunk_grid.register
79-
def chunk_grid_csr_matrix(x: sp.csr_matrix):
80-
"""See :py:meth:`~delayedarray.chunk_grid.chunk_grid`."""
81-
return _chunk_shape_to_grid((1, x.shape[1]), x.shape, cost_factor=1.5)
119+
def chunk_grid_csr_matrix(x: sp.csr_matrix) -> SimpleGrid:
120+
"""
121+
See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.
122+
123+
The cost factor for iteration is set to 1.5. This is slightly higher
124+
than that of dense NumPy arrays as CSR matrices are a bit more
125+
expensive for random column access.
126+
"""
127+
return chunk_shape_to_grid((1, x.shape[1]), x.shape, cost_factor=1.5)
82128

83129

84130
@chunk_grid.register
85-
def chunk_grid_coo_matrix(x: sp.coo_matrix):
86-
"""See :py:meth:`~delayedarray.chunk_grid.chunk_grid`."""
131+
def chunk_grid_coo_matrix(x: sp.coo_matrix) -> SimpleGrid:
132+
"""
133+
See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.
134+
135+
The cost factor for iteration is set to 5, as any extraction from a COO
136+
matrix requires a full scan through all elements.
137+
"""
87138
# ???? let's just do our best here, there's no nice way to access COO.
88-
return _chunk_shape_to_grid(x.shape, x.shape, cost_factor=1.5)
139+
return chunk_shape_to_grid(x.shape, x.shape, cost_factor=5)

0 commit comments

Comments
 (0)