|
14 | 14 | _concatenate_unmasked_ndarrays,
|
15 | 15 | _concatenate_maybe_masked_ndarrays
|
16 | 16 | )
|
| 17 | +from ._statistics import _find_useful_axes, _expected_sample_size, _choose_output_type, _create_offset_multipliers |
17 | 18 |
|
18 | 19 | __author__ = "ltla"
|
19 | 20 | __copyright__ = "ltla"
|
@@ -778,6 +779,76 @@ def T(self) -> "SparseNdarray":
|
778 | 779 | return _transpose_SparseNdarray(self, axes)
|
779 | 780 |
|
780 | 781 |
|
| 782 | + def sum(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Optional[numpy.dtype] = None) -> numpy.ndarray: |
| 783 | + """ |
| 784 | + Take the sum of values across the ``SparseNdarray``, possibly over a |
| 785 | + given axis or set of axes. |
| 786 | +
|
| 787 | + Args: |
| 788 | + axis: |
| 789 | + A single integer specifying the axis to perform the calculation. |
| 790 | + Alternatively a tuple or None, see ``numpy.sum`` for details. |
| 791 | +
|
| 792 | + dtype: |
| 793 | + NumPy type for the output array. If None, this is automatically |
| 794 | + chosen based on the type of the ``SparseNdarray``, see |
| 795 | + ``numpy.sum`` for details. |
| 796 | +
|
| 797 | + Returns: |
| 798 | + A NumPy array containing the summed values. If ``axis = None``, |
| 799 | + this will be a NumPy scalar instead. |
| 800 | + """ |
| 801 | + return _sum(self, axis=axis, dtype=dtype) |
| 802 | + |
| 803 | + |
| 804 | + def mean(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Optional[numpy.dtype] = None) -> numpy.ndarray: |
| 805 | + """ |
| 806 | + Take the mean of values across the ``SparseNdarray``, possibly over a |
| 807 | + given axis or set of axes. |
| 808 | +
|
| 809 | + Args: |
| 810 | + axis: |
| 811 | + A single integer specifying the axis to perform the calculation. |
| 812 | + Alternatively a tuple or None, see ``numpy.mean`` for details. |
| 813 | +
|
| 814 | + dtype: |
| 815 | + NumPy type for the output array. If None, this is automatically |
| 816 | + chosen based on the type of the ``SparseNdarray``, see |
| 817 | + ``numpy.mean`` for details. |
| 818 | +
|
| 819 | + Returns: |
| 820 | + A NumPy array containing the mean values. If ``axis = None``, |
| 821 | + this will be a NumPy scalar instead. |
| 822 | + """ |
| 823 | + return _mean(self, axis=axis, dtype=dtype) |
| 824 | + |
| 825 | + |
| 826 | + def var(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Optional[numpy.dtype] = None, ddof: int = 0) -> numpy.ndarray: |
| 827 | + """ |
| 828 | + Take the variances of values across the ``SparseNdarray``, possibly |
| 829 | + over a given axis or set of axes. |
| 830 | +
|
| 831 | + Args: |
| 832 | + axis: |
| 833 | + A single integer specifying the axis to perform the calculation. |
| 834 | + Alternatively a tuple or None, see ``numpy.mean`` for details. |
| 835 | +
|
| 836 | + dtype: |
| 837 | + NumPy type for the output array. If None, this is automatically |
| 838 | + chosen based on the type of the ``SparseNdarray``, see |
| 839 | + ``numpy.mean`` for details. |
| 840 | +
|
| 841 | + ddof: |
| 842 | + Delta in the degrees of freedom to subtract from the denominator. |
| 843 | + Typically set to 1 to obtain the sample variance. |
| 844 | +
|
| 845 | + Returns: |
| 846 | + A NumPy array containing the variances. If ``axis = None``, |
| 847 | + this will be a NumPy scalar instead. |
| 848 | + """ |
| 849 | + return _var(self, axis=axis, dtype=dtype, ddof=ddof) |
| 850 | + |
| 851 | + |
781 | 852 | # Other stuff
|
782 | 853 | def __copy__(self) -> "SparseNdarray":
|
783 | 854 | """
|
@@ -1008,19 +1079,14 @@ def _extract_dense_array_from_SparseNdarray(x: SparseNdarray, subset: Tuple[Sequ
|
1008 | 1079 | output = numpy.zeros((*reversed(idims),), dtype=x._dtype)
|
1009 | 1080 |
|
1010 | 1081 | if x._contents is not None:
|
1011 |
| - # Wrapping it in a masked array so that masked values are respected. |
1012 |
| - output = numpy.ma.MaskedArray(output, mask=False) |
1013 |
| - |
| 1082 | + if x._is_masked: |
| 1083 | + output = numpy.ma.MaskedArray(output, mask=False) |
1014 | 1084 | ndim = len(x._shape)
|
1015 | 1085 | if ndim > 1:
|
1016 | 1086 | _recursive_extract_dense_array(x._contents, subset, subset_summary=subset_summary, output=output, dim=ndim-1)
|
1017 | 1087 | else:
|
1018 | 1088 | _extract_sparse_vector_to_dense(x._contents[0], x._contents[1], subset_summary=subset_summary, output=output)
|
1019 | 1089 |
|
1020 |
| - # Stripping out the mask if there were, in fact, no masked values. |
1021 |
| - if isinstance(output.mask, bool) and not output.mask: |
1022 |
| - output = output.data |
1023 |
| - |
1024 | 1090 | return output.T
|
1025 | 1091 |
|
1026 | 1092 |
|
@@ -1614,3 +1680,163 @@ def _concatenate_SparseNdarrays(xs: List[SparseNdarray], along: int):
|
1614 | 1680 | new_contents = _concatenate_sparse_vectors(outidx, outval, payload)
|
1615 | 1681 |
|
1616 | 1682 | return SparseNdarray(shape=(*new_shape,), contents=new_contents, dtype=output_dtype, index_dtype=output_index_dtype, is_masked=output_is_masked, check=False)
|
| 1683 | + |
| 1684 | + |
| 1685 | +######################################################### |
| 1686 | +######################################################### |
| 1687 | + |
| 1688 | + |
| 1689 | +_ReductionPayload = namedtuple("_ReductionPayload", [ "multipliers", "operation" ]) |
| 1690 | + |
| 1691 | + |
| 1692 | +def _reduce_sparse_vector(idx: numpy.ndarray, val: numpy.ndarray, payload: _ReductionPayload, offset: int = 0): |
| 1693 | + for j, ix in enumerate(idx): |
| 1694 | + extra = payload.multipliers[0] * ix |
| 1695 | + payload.operation(offset + extra, val[j]) |
| 1696 | + return |
| 1697 | + |
| 1698 | + |
| 1699 | +def _recursive_reduce_SparseNdarray(contents, payload: _ReductionPayload, dim: int, offset: int = 0): |
| 1700 | + mult = payload.multipliers[dim] |
| 1701 | + if dim == 1: |
| 1702 | + for i, con in enumerate(contents): |
| 1703 | + if con is not None: |
| 1704 | + _reduce_sparse_vector(con[0], con[1], payload, offset = offset + mult * i) |
| 1705 | + else: |
| 1706 | + for i, con in enumerate(contents): |
| 1707 | + if con is not None: |
| 1708 | + _recursive_reduce_SparseNdarray(con, payload, dim = dim - 1, offset = offset + mult * i) |
| 1709 | + return |
| 1710 | + |
| 1711 | + |
| 1712 | +def _reduce_SparseNdarray(x: SparseNdarray, axes: List[int], operation: Callable): |
| 1713 | + if x.contents is not None: |
| 1714 | + multipliers = _create_offset_multipliers(x.shape, axes) |
| 1715 | + payload = _ReductionPayload(operation=operation, multipliers=multipliers) |
| 1716 | + ndim = len(x.shape) |
| 1717 | + if ndim == 1: |
| 1718 | + _reduce_sparse_vector(x.contents[0], x.contents[1], payload) |
| 1719 | + else: |
| 1720 | + _recursive_reduce_SparseNdarray(x.contents, payload, dim=ndim - 1) |
| 1721 | + return |
| 1722 | + |
| 1723 | + |
| 1724 | +def _allocate_output_array(shape: Tuple[int, ...], axes: List[int], dtype: numpy.dtype) -> numpy.ndarray: |
| 1725 | + # Either returning a scalar or not. |
| 1726 | + if len(axes) == 0: |
| 1727 | + return numpy.zeros(1, dtype=dtype) |
| 1728 | + else: |
| 1729 | + shape = [shape[i] for i in axes] |
| 1730 | + return numpy.zeros((*shape,), dtype=dtype, order="F") |
| 1731 | + |
| 1732 | + |
| 1733 | +def _sum(x: SparseNdarray, axis: Optional[Union[int, Tuple[int, ...]]], dtype: Optional[numpy.dtype]) -> numpy.ndarray: |
| 1734 | + axes = _find_useful_axes(len(x.shape), axis) |
| 1735 | + if dtype is None: |
| 1736 | + dtype = _choose_output_type(x.dtype, preserve_integer = True) |
| 1737 | + output = _allocate_output_array(x.shape, axes, dtype) |
| 1738 | + buffer = output.ravel(order="F") |
| 1739 | + |
| 1740 | + if x._is_masked: |
| 1741 | + masked = numpy.zeros(output.shape, dtype=numpy.uint, order="F") |
| 1742 | + mask_buffer = masked.ravel(order="F") |
| 1743 | + def op(offset, value): |
| 1744 | + if value is not numpy.ma.masked: |
| 1745 | + buffer[offset] += value |
| 1746 | + else: |
| 1747 | + mask_buffer[offset] += 1 |
| 1748 | + _reduce_SparseNdarray(x, axes, op) |
| 1749 | + size = _expected_sample_size(x.shape, axes) |
| 1750 | + output = numpy.ma.MaskedArray(output, mask=(masked == size)) |
| 1751 | + else: |
| 1752 | + def op(offset, value): |
| 1753 | + buffer[offset] += value |
| 1754 | + _reduce_SparseNdarray(x, axes, op) |
| 1755 | + |
| 1756 | + if len(axes) == 0: |
| 1757 | + return output[0] |
| 1758 | + else: |
| 1759 | + return output |
| 1760 | + |
| 1761 | + |
| 1762 | +def _mean(x: SparseNdarray, axis: Optional[Union[int, Tuple[int, ...]]], dtype: Optional[numpy.dtype]) -> numpy.ndarray: |
| 1763 | + axes = _find_useful_axes(len(x.shape), axis) |
| 1764 | + if dtype is None: |
| 1765 | + dtype = _choose_output_type(x.dtype, preserve_integer = False) |
| 1766 | + output = _allocate_output_array(x.shape, axes, dtype) |
| 1767 | + buffer = output.ravel(order="F") |
| 1768 | + size = _expected_sample_size(x.shape, axes) |
| 1769 | + |
| 1770 | + if x._is_masked: |
| 1771 | + masked = numpy.zeros(output.shape, dtype=numpy.uint, order="F") |
| 1772 | + mask_buffer = masked.ravel(order="F") |
| 1773 | + def op(offset, value): |
| 1774 | + if value is not numpy.ma.masked: |
| 1775 | + buffer[offset] += value |
| 1776 | + else: |
| 1777 | + mask_buffer[offset] += 1 |
| 1778 | + _reduce_SparseNdarray(x, axes, op) |
| 1779 | + denom = size - masked |
| 1780 | + output = numpy.ma.MaskedArray(output, mask=(denom==0)) |
| 1781 | + else: |
| 1782 | + def op(offset, value): |
| 1783 | + buffer[offset] += value |
| 1784 | + _reduce_SparseNdarray(x, axes, op) |
| 1785 | + denom = size |
| 1786 | + |
| 1787 | + output /= denom |
| 1788 | + if len(axes) == 0: |
| 1789 | + return output[0] |
| 1790 | + else: |
| 1791 | + return output |
| 1792 | + |
| 1793 | + |
| 1794 | +def _var(x: SparseNdarray, axis: Optional[Union[int, Tuple[int, ...]]], dtype: Optional[numpy.dtype], ddof: int) -> numpy.ndarray: |
| 1795 | + axes = _find_useful_axes(len(x.shape), axis) |
| 1796 | + if dtype is None: |
| 1797 | + dtype = _choose_output_type(x.dtype, preserve_integer = False) |
| 1798 | + size = _expected_sample_size(x.shape, axes) |
| 1799 | + |
| 1800 | + # Using Welford's online algorithm. |
| 1801 | + sumsq = _allocate_output_array(x.shape, axes, dtype) |
| 1802 | + sumsq_buffer = sumsq.ravel(order="F") |
| 1803 | + means = numpy.zeros(sumsq.shape, dtype=dtype, order="F") |
| 1804 | + means_buffer = means.ravel(order="F") |
| 1805 | + counts = numpy.zeros(sumsq.shape, dtype=numpy.int64, order="F") |
| 1806 | + counts_buffer = counts.ravel(order="F") |
| 1807 | + |
| 1808 | + def raw_op(offset, value): |
| 1809 | + counts_buffer[offset] += 1 |
| 1810 | + delta = value - means_buffer[offset] |
| 1811 | + means_buffer[offset] += delta / counts_buffer[offset] |
| 1812 | + delta_2 = value - means_buffer[offset] |
| 1813 | + sumsq_buffer[offset] += delta * delta_2 |
| 1814 | + |
| 1815 | + if x._is_masked: |
| 1816 | + masked = numpy.zeros(sumsq.shape, dtype=numpy.int64, order="F") |
| 1817 | + mask_buffer = masked.ravel(order="F") |
| 1818 | + def op(offset, value): |
| 1819 | + if value is not numpy.ma.masked: |
| 1820 | + raw_op(offset, value) |
| 1821 | + else: |
| 1822 | + mask_buffer[offset] += 1 |
| 1823 | + _reduce_SparseNdarray(x, axes, op) |
| 1824 | + actual_size = size - masked |
| 1825 | + denom = actual_size - ddof |
| 1826 | + num_zero = actual_size - counts |
| 1827 | + sumsq = numpy.ma.MaskedArray(sumsq, mask = (denom <= 0)) |
| 1828 | + else: |
| 1829 | + _reduce_SparseNdarray(x, axes, raw_op) |
| 1830 | + actual_size = size |
| 1831 | + denom = max(0, size - ddof) |
| 1832 | + num_zero = size - counts |
| 1833 | + |
| 1834 | + old_means = means.copy() |
| 1835 | + means *= counts / actual_size |
| 1836 | + sumsq += num_zero * (old_means * means) |
| 1837 | + |
| 1838 | + sumsq /= denom |
| 1839 | + if len(axes) == 0: |
| 1840 | + return sumsq[0] |
| 1841 | + else: |
| 1842 | + return sumsq |
0 commit comments