Skip to content

Commit 18570e5

Browse files
committed
Added basic statistics (mean/sum/var) for SparseNdarrays.
1 parent a7b29d2 commit 18570e5

File tree

4 files changed

+393
-11
lines changed

4 files changed

+393
-11
lines changed

src/delayedarray/SparseNdarray.py

+233-7
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
_concatenate_unmasked_ndarrays,
1515
_concatenate_maybe_masked_ndarrays
1616
)
17+
from ._statistics import _find_useful_axes, _expected_sample_size, _choose_output_type, _create_offset_multipliers
1718

1819
__author__ = "ltla"
1920
__copyright__ = "ltla"
@@ -778,6 +779,76 @@ def T(self) -> "SparseNdarray":
778779
return _transpose_SparseNdarray(self, axes)
779780

780781

782+
def sum(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Optional[numpy.dtype] = None) -> numpy.ndarray:
783+
"""
784+
Take the sum of values across the ``SparseNdarray``, possibly over a
785+
given axis or set of axes.
786+
787+
Args:
788+
axis:
789+
A single integer specifying the axis to perform the calculation.
790+
Alternatively a tuple or None, see ``numpy.sum`` for details.
791+
792+
dtype:
793+
NumPy type for the output array. If None, this is automatically
794+
chosen based on the type of the ``SparseNdarray``, see
795+
``numpy.sum`` for details.
796+
797+
Returns:
798+
A NumPy array containing the summed values. If ``axis = None``,
799+
this will be a NumPy scalar instead.
800+
"""
801+
return _sum(self, axis=axis, dtype=dtype)
802+
803+
804+
def mean(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Optional[numpy.dtype] = None) -> numpy.ndarray:
805+
"""
806+
Take the mean of values across the ``SparseNdarray``, possibly over a
807+
given axis or set of axes.
808+
809+
Args:
810+
axis:
811+
A single integer specifying the axis to perform the calculation.
812+
Alternatively a tuple or None, see ``numpy.mean`` for details.
813+
814+
dtype:
815+
NumPy type for the output array. If None, this is automatically
816+
chosen based on the type of the ``SparseNdarray``, see
817+
``numpy.mean`` for details.
818+
819+
Returns:
820+
A NumPy array containing the mean values. If ``axis = None``,
821+
this will be a NumPy scalar instead.
822+
"""
823+
return _mean(self, axis=axis, dtype=dtype)
824+
825+
826+
def var(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Optional[numpy.dtype] = None, ddof: int = 0) -> numpy.ndarray:
827+
"""
828+
Take the variances of values across the ``SparseNdarray``, possibly
829+
over a given axis or set of axes.
830+
831+
Args:
832+
axis:
833+
A single integer specifying the axis to perform the calculation.
834+
Alternatively a tuple or None, see ``numpy.mean`` for details.
835+
836+
dtype:
837+
NumPy type for the output array. If None, this is automatically
838+
chosen based on the type of the ``SparseNdarray``, see
839+
``numpy.mean`` for details.
840+
841+
ddof:
842+
Delta in the degrees of freedom to subtract from the denominator.
843+
Typically set to 1 to obtain the sample variance.
844+
845+
Returns:
846+
A NumPy array containing the variances. If ``axis = None``,
847+
this will be a NumPy scalar instead.
848+
"""
849+
return _var(self, axis=axis, dtype=dtype, ddof=ddof)
850+
851+
781852
# Other stuff
782853
def __copy__(self) -> "SparseNdarray":
783854
"""
@@ -1008,19 +1079,14 @@ def _extract_dense_array_from_SparseNdarray(x: SparseNdarray, subset: Tuple[Sequ
10081079
output = numpy.zeros((*reversed(idims),), dtype=x._dtype)
10091080

10101081
if x._contents is not None:
1011-
# Wrapping it in a masked array so that masked values are respected.
1012-
output = numpy.ma.MaskedArray(output, mask=False)
1013-
1082+
if x._is_masked:
1083+
output = numpy.ma.MaskedArray(output, mask=False)
10141084
ndim = len(x._shape)
10151085
if ndim > 1:
10161086
_recursive_extract_dense_array(x._contents, subset, subset_summary=subset_summary, output=output, dim=ndim-1)
10171087
else:
10181088
_extract_sparse_vector_to_dense(x._contents[0], x._contents[1], subset_summary=subset_summary, output=output)
10191089

1020-
# Stripping out the mask if there were, in fact, no masked values.
1021-
if isinstance(output.mask, bool) and not output.mask:
1022-
output = output.data
1023-
10241090
return output.T
10251091

10261092

@@ -1614,3 +1680,163 @@ def _concatenate_SparseNdarrays(xs: List[SparseNdarray], along: int):
16141680
new_contents = _concatenate_sparse_vectors(outidx, outval, payload)
16151681

16161682
return SparseNdarray(shape=(*new_shape,), contents=new_contents, dtype=output_dtype, index_dtype=output_index_dtype, is_masked=output_is_masked, check=False)
1683+
1684+
1685+
#########################################################
1686+
#########################################################
1687+
1688+
1689+
_ReductionPayload = namedtuple("_ReductionPayload", [ "multipliers", "operation" ])
1690+
1691+
1692+
def _reduce_sparse_vector(idx: numpy.ndarray, val: numpy.ndarray, payload: _ReductionPayload, offset: int = 0):
1693+
for j, ix in enumerate(idx):
1694+
extra = payload.multipliers[0] * ix
1695+
payload.operation(offset + extra, val[j])
1696+
return
1697+
1698+
1699+
def _recursive_reduce_SparseNdarray(contents, payload: _ReductionPayload, dim: int, offset: int = 0):
1700+
mult = payload.multipliers[dim]
1701+
if dim == 1:
1702+
for i, con in enumerate(contents):
1703+
if con is not None:
1704+
_reduce_sparse_vector(con[0], con[1], payload, offset = offset + mult * i)
1705+
else:
1706+
for i, con in enumerate(contents):
1707+
if con is not None:
1708+
_recursive_reduce_SparseNdarray(con, payload, dim = dim - 1, offset = offset + mult * i)
1709+
return
1710+
1711+
1712+
def _reduce_SparseNdarray(x: SparseNdarray, axes: List[int], operation: Callable):
1713+
if x.contents is not None:
1714+
multipliers = _create_offset_multipliers(x.shape, axes)
1715+
payload = _ReductionPayload(operation=operation, multipliers=multipliers)
1716+
ndim = len(x.shape)
1717+
if ndim == 1:
1718+
_reduce_sparse_vector(x.contents[0], x.contents[1], payload)
1719+
else:
1720+
_recursive_reduce_SparseNdarray(x.contents, payload, dim=ndim - 1)
1721+
return
1722+
1723+
1724+
def _allocate_output_array(shape: Tuple[int, ...], axes: List[int], dtype: numpy.dtype) -> numpy.ndarray:
1725+
# Either returning a scalar or not.
1726+
if len(axes) == 0:
1727+
return numpy.zeros(1, dtype=dtype)
1728+
else:
1729+
shape = [shape[i] for i in axes]
1730+
return numpy.zeros((*shape,), dtype=dtype, order="F")
1731+
1732+
1733+
def _sum(x: SparseNdarray, axis: Optional[Union[int, Tuple[int, ...]]], dtype: Optional[numpy.dtype]) -> numpy.ndarray:
1734+
axes = _find_useful_axes(len(x.shape), axis)
1735+
if dtype is None:
1736+
dtype = _choose_output_type(x.dtype, preserve_integer = True)
1737+
output = _allocate_output_array(x.shape, axes, dtype)
1738+
buffer = output.ravel(order="F")
1739+
1740+
if x._is_masked:
1741+
masked = numpy.zeros(output.shape, dtype=numpy.uint, order="F")
1742+
mask_buffer = masked.ravel(order="F")
1743+
def op(offset, value):
1744+
if value is not numpy.ma.masked:
1745+
buffer[offset] += value
1746+
else:
1747+
mask_buffer[offset] += 1
1748+
_reduce_SparseNdarray(x, axes, op)
1749+
size = _expected_sample_size(x.shape, axes)
1750+
output = numpy.ma.MaskedArray(output, mask=(masked == size))
1751+
else:
1752+
def op(offset, value):
1753+
buffer[offset] += value
1754+
_reduce_SparseNdarray(x, axes, op)
1755+
1756+
if len(axes) == 0:
1757+
return output[0]
1758+
else:
1759+
return output
1760+
1761+
1762+
def _mean(x: SparseNdarray, axis: Optional[Union[int, Tuple[int, ...]]], dtype: Optional[numpy.dtype]) -> numpy.ndarray:
1763+
axes = _find_useful_axes(len(x.shape), axis)
1764+
if dtype is None:
1765+
dtype = _choose_output_type(x.dtype, preserve_integer = False)
1766+
output = _allocate_output_array(x.shape, axes, dtype)
1767+
buffer = output.ravel(order="F")
1768+
size = _expected_sample_size(x.shape, axes)
1769+
1770+
if x._is_masked:
1771+
masked = numpy.zeros(output.shape, dtype=numpy.uint, order="F")
1772+
mask_buffer = masked.ravel(order="F")
1773+
def op(offset, value):
1774+
if value is not numpy.ma.masked:
1775+
buffer[offset] += value
1776+
else:
1777+
mask_buffer[offset] += 1
1778+
_reduce_SparseNdarray(x, axes, op)
1779+
denom = size - masked
1780+
output = numpy.ma.MaskedArray(output, mask=(denom==0))
1781+
else:
1782+
def op(offset, value):
1783+
buffer[offset] += value
1784+
_reduce_SparseNdarray(x, axes, op)
1785+
denom = size
1786+
1787+
output /= denom
1788+
if len(axes) == 0:
1789+
return output[0]
1790+
else:
1791+
return output
1792+
1793+
1794+
def _var(x: SparseNdarray, axis: Optional[Union[int, Tuple[int, ...]]], dtype: Optional[numpy.dtype], ddof: int) -> numpy.ndarray:
1795+
axes = _find_useful_axes(len(x.shape), axis)
1796+
if dtype is None:
1797+
dtype = _choose_output_type(x.dtype, preserve_integer = False)
1798+
size = _expected_sample_size(x.shape, axes)
1799+
1800+
# Using Welford's online algorithm.
1801+
sumsq = _allocate_output_array(x.shape, axes, dtype)
1802+
sumsq_buffer = sumsq.ravel(order="F")
1803+
means = numpy.zeros(sumsq.shape, dtype=dtype, order="F")
1804+
means_buffer = means.ravel(order="F")
1805+
counts = numpy.zeros(sumsq.shape, dtype=numpy.int64, order="F")
1806+
counts_buffer = counts.ravel(order="F")
1807+
1808+
def raw_op(offset, value):
1809+
counts_buffer[offset] += 1
1810+
delta = value - means_buffer[offset]
1811+
means_buffer[offset] += delta / counts_buffer[offset]
1812+
delta_2 = value - means_buffer[offset]
1813+
sumsq_buffer[offset] += delta * delta_2
1814+
1815+
if x._is_masked:
1816+
masked = numpy.zeros(sumsq.shape, dtype=numpy.int64, order="F")
1817+
mask_buffer = masked.ravel(order="F")
1818+
def op(offset, value):
1819+
if value is not numpy.ma.masked:
1820+
raw_op(offset, value)
1821+
else:
1822+
mask_buffer[offset] += 1
1823+
_reduce_SparseNdarray(x, axes, op)
1824+
actual_size = size - masked
1825+
denom = actual_size - ddof
1826+
num_zero = actual_size - counts
1827+
sumsq = numpy.ma.MaskedArray(sumsq, mask = (denom <= 0))
1828+
else:
1829+
_reduce_SparseNdarray(x, axes, raw_op)
1830+
actual_size = size
1831+
denom = max(0, size - ddof)
1832+
num_zero = size - counts
1833+
1834+
old_means = means.copy()
1835+
means *= counts / actual_size
1836+
sumsq += num_zero * (old_means * means)
1837+
1838+
sumsq /= denom
1839+
if len(axes) == 0:
1840+
return sumsq[0]
1841+
else:
1842+
return sumsq

src/delayedarray/_statistics.py

+61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
from typing import List, Tuple
2+
import numpy
3+
4+
5+
def _find_useful_axes(ndim, axis) -> List[int]:
6+
output = []
7+
if axis is not None:
8+
if isinstance(axis, int):
9+
if axis < 0:
10+
axis = ndim + axis
11+
for i in range(ndim):
12+
if i != axis:
13+
output.append(i)
14+
else:
15+
used = set()
16+
for a in axis:
17+
if a < 0:
18+
a = ndim + a
19+
used.add(a)
20+
for i in range(ndim):
21+
if i not in used:
22+
output.append(i)
23+
return output
24+
25+
26+
def _expected_sample_size(shape: Tuple[int, ...], axes: List[int]) -> int:
27+
size = 1
28+
j = 0
29+
for i, d in enumerate(shape):
30+
if j == len(axes) or i < axes[j]:
31+
size *= d
32+
else:
33+
j += 1
34+
return size
35+
36+
37+
def _choose_output_type(dtype: numpy.dtype, preserve_integer: bool) -> numpy.dtype:
38+
# Mimic numpy.sum's method for choosing the type.
39+
if numpy.issubdtype(dtype, numpy.integer):
40+
if preserve_integer:
41+
xinfo = numpy.iinfo(dtype)
42+
if xinfo.kind == "i":
43+
pinfo = numpy.iinfo(numpy.int_)
44+
if xinfo.bits < pinfo.bits:
45+
dtype = numpy.dtype(numpy.int_)
46+
else:
47+
pinfo = numpy.iinfo(numpy.uint)
48+
if xinfo.bits < pinfo.bits:
49+
dtype = numpy.dtype(numpy.uint)
50+
else:
51+
dtype = numpy.dtype("float64")
52+
return dtype
53+
54+
55+
def _create_offset_multipliers(shape: Tuple[int, ...], axes: List[int]) -> List[int]:
56+
multipliers = [0] * len(shape)
57+
sofar = 1
58+
for a in axes:
59+
multipliers[a] = sofar
60+
sofar *= shape[a]
61+
return multipliers

0 commit comments

Comments
 (0)