Exported the converter from chunk shape to a SimpleGrid.

LTLA · LTLA · commit 6b0d9cd1ed7c · 2024-02-01T08:27:58.000-08:00
Also explained the various choices for the cost factor, now that
we have to document the converter's arguments.
diff --git a/src/delayedarray/Grid.py b/src/delayedarray/Grid.py
@@ -74,7 +74,8 @@ def __init__(self, boundaries: Tuple[Sequence[int], ...], cost_factor: float, in
                 Positive number representing the cost of iteration over each
                 element of the grid's array. The actual cost is defined by the
                 product of the cost factor by the array size. This is used to
-                choose between iteration schemes.
+                choose between iteration schemes; as a reference, extraction
+                from an in-memory NumPy array has a cost factor of 1.
 
             internals:
                 Internal use only.
diff --git a/src/delayedarray/__init__.py b/src/delayedarray/__init__.py
@@ -41,6 +41,6 @@
 from .create_dask_array import create_dask_array
 from .is_sparse import is_sparse
 from .is_masked import is_masked
-from .chunk_grid import chunk_grid
+from .chunk_grid import chunk_grid, chunk_shape_to_grid
 from .is_pristine import is_pristine
 from .wrap import wrap
diff --git a/src/delayedarray/chunk_grid.py b/src/delayedarray/chunk_grid.py
@@ -12,7 +12,30 @@
 __license__ = "MIT"
 
 
-def _chunk_shape_to_grid(chunks: Sequence[int], shape: Tuple[int, ...], cost_factor: int):
+def chunk_shape_to_grid(chunks: Sequence[int], shape: Tuple[int, ...], cost_factor: int) -> SimpleGrid:
+    """
+    Convert a chunk shape to a :py:class:`~delayedarray.Grid.SimpleGrid`.
+    This assumes that the underlying array is split up into regular intervals
+    on each dimension; the first chunk should start from zero, and only the
+    last chunk may be of a different size (bounded by the dimension extent).
+
+    Args:
+        chunks:
+            Chunk size for each dimension. These should be positive.
+
+        shape:
+            Extent of each dimension of the array. These should be non-negative
+            and of the same length as ``chunks``.
+
+        cost_factor:
+            Cost factor for iterating over each element of the associated
+            array. This is used to decide between iteration schemes and can be
+            increased for more expensive types, e.g., file-backed arrays. As a
+            reference, in-memory NumPy arrays are assigned a cost factor of 1.
+
+    Returns:
+        A ``SimpleGrid`` object with boundaries derived from the chunk shape.
+    """
     out = []
     for i, ch in enumerate(chunks):
         sh = shape[i]
@@ -42,8 +65,13 @@ def chunk_grid(x: Any) -> AbstractGrid:
 
 
 @chunk_grid.register
-def chunk_grid_ndarray(x: ndarray):
-    """See :py:meth:`~delayedarray.chunk_grid.chunk_grid`."""
+def chunk_grid_ndarray(x: ndarray) -> SimpleGrid:
+    """
+    See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.
+
+    The cost factor for iteration is set to 1, which is considered the lowest
+    cost for data extraction given that everything is stored in memory.
+    """
     raw = [1] * len(x.shape)
     if x.flags.f_contiguous:
         raw[0] = x.shape[0]
@@ -52,15 +80,21 @@ def chunk_grid_ndarray(x: ndarray):
         # to figure that out from NumPy flags. Guess we should just assume
         # that it's C-contiguous, given that most things are.
         raw[-1] = x.shape[-1]
-    return _chunk_shape_to_grid(raw, x.shape, cost_factor=1)
+    return chunk_shape_to_grid(raw, x.shape, cost_factor=1)
 
 
 @chunk_grid.register
-def chunk_grid_SparseNdarray(x: SparseNdarray):
-    """See :py:meth:`~delayedarray.chunk_grid.chunk_grid`."""
+def chunk_grid_SparseNdarray(x: SparseNdarray) -> SimpleGrid:
+    """
+    See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.
+
+    The cost factor for iteration is set to 1.5. This is slightly higher than
+    that of dense NumPy arrays as the ``SparseNdarray`` is a bit more expensive
+    for random access on the first dimension.
+    """
     raw = [1] * len(x.shape)
     raw[0] = x.shape[0]
-    return _chunk_shape_to_grid(raw, x.shape, cost_factor=1.5)
+    return chunk_shape_to_grid(raw, x.shape, cost_factor=1.5)
 
 
 # If scipy is installed, we add all the methods for the various scipy.sparse matrices.
@@ -70,19 +104,36 @@ def chunk_grid_SparseNdarray(x: SparseNdarray):
 
 
     @chunk_grid.register
-    def chunk_grid_csc_matrix(x: sp.csc_matrix):
-        """See :py:meth:`~delayedarray.chunk_grid.chunk_grid`."""
-        return _chunk_shape_to_grid((x.shape[0], 1), x.shape, cost_factor=1.5)
+    def chunk_grid_csc_matrix(x: sp.csc_matrix) -> SimpleGrid:
+        """
+        See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.
+
+        The cost factor for iteration is set to 1.5. This is slightly higher
+        than that of dense NumPy arrays as CSC matrices are a bit more
+        expensive for random row access.
+        """
+        return chunk_shape_to_grid((x.shape[0], 1), x.shape, cost_factor=1.5)
 
 
     @chunk_grid.register
-    def chunk_grid_csr_matrix(x: sp.csr_matrix):
-        """See :py:meth:`~delayedarray.chunk_grid.chunk_grid`."""
-        return _chunk_shape_to_grid((1, x.shape[1]), x.shape, cost_factor=1.5)
+    def chunk_grid_csr_matrix(x: sp.csr_matrix) -> SimpleGrid:
+        """
+        See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.
+
+        The cost factor for iteration is set to 1.5. This is slightly higher
+        than that of dense NumPy arrays as CSR matrices are a bit more
+        expensive for random column access.
+        """
+        return chunk_shape_to_grid((1, x.shape[1]), x.shape, cost_factor=1.5)
 
 
     @chunk_grid.register
-    def chunk_grid_coo_matrix(x: sp.coo_matrix):
-        """See :py:meth:`~delayedarray.chunk_grid.chunk_grid`."""
+    def chunk_grid_coo_matrix(x: sp.coo_matrix) -> SimpleGrid:
+        """
+        See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.
+
+        The cost factor for iteration is set to 5, as any extraction from a COO
+        matrix requires a full scan through all elements.
+        """
         # ???? let's just do our best here, there's no nice way to access COO.
-        return _chunk_shape_to_grid(x.shape, x.shape, cost_factor=1.5)
+        return chunk_shape_to_grid(x.shape, x.shape, cost_factor=5)