xarray-contrib · jemmajeffree · Jul 18, 2025 · Jul 18, 2025 · Jul 18, 2025 · Jul 18, 2025
diff --git a/flox/aggregations.py b/flox/aggregations.py
@@ -15,6 +15,8 @@
 from . import aggregate_flox, aggregate_npg, xrutils
 from . import xrdtypes as dtypes
 from .lib import dask_array_type, sparse_array_type
+from .multiarray import MultiArray
+from .xrutils import notnull
 
 if TYPE_CHECKING:
     FuncTuple = tuple[Callable | str, ...]
@@ -161,8 +163,8 @@ def __init__(
         self,
         name: str,
         *,
-        numpy: str | None = None,
-        chunk: str | FuncTuple | None,
+        numpy: partial | str | None = None,
+        chunk: partial | str | FuncTuple | None,
         combine: str | FuncTuple | None,
         preprocess: Callable | None = None,
         finalize: Callable | None = None,
@@ -343,57 +345,178 @@ def _mean_finalize(sum_, count):
 )
 
 
-# TODO: fix this for complex numbers
-def _var_finalize(sumsq, sum_, count, ddof=0):
-    with np.errstate(invalid="ignore", divide="ignore"):
-        result = (sumsq - (sum_**2 / count)) / (count - ddof)
-    result[count <= ddof] = np.nan
-    return result
+def var_chunk(
+    group_idx, array, *, skipna: bool, engine: str, axis=-1, size=None, fill_value=None, dtype=None
+):
+    # Calculate length and sum - important for the adjustment terms to sum squared deviations
+    array_lens = generic_aggregate(
+        group_idx,
+        array,
+        func="nanlen",
+        engine=engine,
+        axis=axis,
+        size=size,
+        fill_value=0,  # Unpack fill value bc it's currently defined for multiarray
+        dtype=dtype,
+    )
+
+    array_sums = generic_aggregate(
+        group_idx,
+        array,
+        func="nansum" if skipna else "sum",
+        engine=engine,
+        axis=axis,
+        size=size,
+        fill_value=0,  # Unpack fill value bc it's currently defined for multiarray
+        dtype=dtype,
+    )
+
+    # Calculate sum squared deviations - the main part of variance sum
+    array_means = array_sums / array_lens
+
+    sum_squared_deviations = generic_aggregate(
+        group_idx,
+        (array - array_means[..., group_idx]) ** 2,
+        func="nansum" if skipna else "sum",
+        engine=engine,
+        axis=axis,
+        size=size,
+        fill_value=0,  # Unpack fill value bc it's currently defined for multiarray
+        dtype=dtype,
+    )
+
+    return MultiArray((sum_squared_deviations, array_sums, array_lens))
+
+
+def _var_combine(array, axis, keepdims=True):
+    def clip_last(array, ax, n=1):
+        """Return array except the last element along axis
+        Purely included to tidy up the adj_terms line
+        """
+        assert n > 0, "Clipping nothing off the end isn't implemented"
+        not_last = [slice(None, None) for i in range(array.ndim)]
+        not_last[ax] = slice(None, -n)
+        return array[*not_last]
+
+    def clip_first(array, ax, n=1):
+        """Return array except the first element along axis
+        Purely included to tidy up the adj_terms line
+        """
+        not_first = [slice(None, None) for i in range(array.ndim)]
+        not_first[ax] = slice(n, None)
+        return array[*not_first]
+
+    for ax in axis:
+        if array.shape[ax] == 1:
+            continue
+
+        sum_deviations, sum_X, sum_len = array.arrays
+
+        # Calculate parts needed for cascading combination
+        cumsum_X = np.cumsum(sum_X, axis=ax)
+        cumsum_len = np.cumsum(sum_len, axis=ax)
+
+        # There will be instances in which one or both chunks being merged are empty
+        # In which case, the adjustment term should be zero, but will throw a divide-by-zero error
+        # We're going to add a constant to the bottom of the adjustment term equation on those instances
+        # and count on the zeros on the top making our adjustment term still zero
+        zero_denominator = (clip_last(cumsum_len, ax) == 0) | (clip_first(sum_len, ax) == 0)
+
+        # Adjustment terms to tweak the sum of squared deviations because not every chunk has the same mean
+        adj_terms = (
+            clip_last(cumsum_len, ax) * clip_first(sum_X, ax)
+            - clip_first(sum_len, ax) * clip_last(cumsum_X, ax)
+        ) ** 2 / (
+            clip_last(cumsum_len, ax)
+            * clip_first(sum_len, ax)
+            * (clip_last(cumsum_len, ax) + clip_first(sum_len, ax))
+            + zero_denominator.astype(int)
+        )
+
+        check = adj_terms * zero_denominator
+        assert np.all(check[notnull(check)] == 0), (
+            "Instances where we add something to the denominator must come out to zero"
+        )
+
+        array = MultiArray(
+            (
+                np.sum(sum_deviations, axis=ax, keepdims=keepdims)
+                + np.sum(adj_terms, axis=ax, keepdims=keepdims),  # sum of squared deviations
+                np.sum(sum_X, axis=ax, keepdims=keepdims),  # sum of array items
+                np.sum(sum_len, axis=ax, keepdims=keepdims),  # sum of array lengths
+            )
+        )
+    return array
+
+
+def is_var_chunk_reduction(agg: Callable) -> bool:
+    if isinstance(agg, partial):
+        agg = agg.func
+    return agg is blockwise_or_numpy_var or agg is var_chunk
+
+
+def _var_finalize(multiarray, ddof=0):
+    den = multiarray.arrays[2] - ddof
+    # preserve nans for groups with 0 obs; so these values are -ddof
+    ret = multiarray.arrays[0] / den
+    ret[den < 0] = np.nan
+    return ret
 
 
-def _std_finalize(sumsq, sum_, count, ddof=0):
-    return np.sqrt(_var_finalize(sumsq, sum_, count, ddof))
+def _std_finalize(multiarray, ddof=0):
+    return np.sqrt(_var_finalize(multiarray, ddof))
+
+
+def blockwise_or_numpy_var(*args, skipna: bool, ddof=0, std=False, **kwargs):
+    res = _var_finalize(var_chunk(*args, skipna=skipna, **kwargs), ddof)
+    return np.sqrt(res) if std else res
 
 
 # var, std always promote to float, so we set nan
 var = Aggregation(
     "var",
-    chunk=("sum_of_squares", "sum", "nanlen"),
-    combine=("sum", "sum", "sum"),
+    chunk=partial(var_chunk, skipna=False),
+    numpy=partial(blockwise_or_numpy_var, skipna=False),
+    combine=(_var_combine,),
     finalize=_var_finalize,
-    fill_value=0,
+    fill_value=((0, 0, 0),),
     final_fill_value=np.nan,
-    dtypes=(None, None, np.intp),
+    dtypes=(None,),
     final_dtype=np.floating,
 )
+
 nanvar = Aggregation(
     "nanvar",
-    chunk=("nansum_of_squares", "nansum", "nanlen"),
-    combine=("sum", "sum", "sum"),
+    chunk=partial(var_chunk, skipna=True),
+    numpy=partial(blockwise_or_numpy_var, skipna=True),
+    combine=(_var_combine,),
     finalize=_var_finalize,
-    fill_value=0,
+    fill_value=((0, 0, 0),),
     final_fill_value=np.nan,
-    dtypes=(None, None, np.intp),
+    dtypes=(None,),
     final_dtype=np.floating,
 )
+
 std = Aggregation(
     "std",
-    chunk=("sum_of_squares", "sum", "nanlen"),
-    combine=("sum", "sum", "sum"),
+    chunk=partial(var_chunk, skipna=False),
+    numpy=partial(blockwise_or_numpy_var, skipna=False, std=True),
+    combine=(_var_combine,),
     finalize=_std_finalize,
-    fill_value=0,
+    fill_value=((0, 0, 0),),
     final_fill_value=np.nan,
-    dtypes=(None, None, np.intp),
+    dtypes=(None,),
     final_dtype=np.floating,
 )
 nanstd = Aggregation(
     "nanstd",
-    chunk=("nansum_of_squares", "nansum", "nanlen"),
-    combine=("sum", "sum", "sum"),
+    chunk=partial(var_chunk, skipna=True),
+    numpy=partial(blockwise_or_numpy_var, skipna=True, std=True),
+    combine=(_var_combine,),
     finalize=_std_finalize,
-    fill_value=0,
+    fill_value=((0, 0, 0),),
     final_fill_value=np.nan,
-    dtypes=(None, None, np.intp),
+    dtypes=(None,),
     final_dtype=np.floating,
 )
 

diff --git a/flox/core.py b/flox/core.py
@@ -44,6 +44,7 @@
     _atleast_1d,
     _initialize_aggregation,
     generic_aggregate,
+    is_var_chunk_reduction,
     quantile_new_dims_func,
 )
 from .cache import memoize
@@ -1288,7 +1289,8 @@ def chunk_reduce(
     # optimize that out.
     previous_reduction: T_Func = ""
     for reduction, fv, kw, dt in zip(funcs, fill_values, kwargss, dtypes):
-        if empty:
+        # UGLY! but this is because the `var` breaks our design assumptions
+        if empty and not is_var_chunk_reduction(reduction):
             result = np.full(shape=final_array_shape, fill_value=fv, like=array)
         elif is_nanlen(reduction) and is_nanlen(previous_reduction):
             result = results["intermediates"][-1]
@@ -1297,6 +1299,10 @@ def chunk_reduce(
             kw_func = dict(size=size, dtype=dt, fill_value=fv)
             kw_func.update(kw)
 
+            # UGLY! but this is because the `var` breaks our design assumptions
+            if is_var_chunk_reduction(reduction):
+                kw_func.update(engine=engine)
+
             if callable(reduction):
                 # passing a custom reduction for npg to apply per-group is really slow!
                 # So this `reduction` has to do the groupby-aggregation

diff --git a/flox/multiarray.py b/flox/multiarray.py
@@ -0,0 +1,97 @@
+from collections.abc import Callable
+from typing import Self
+
+import numpy as np
+
+MULTIARRAY_HANDLED_FUNCTIONS: dict[Callable, Callable] = {}
+
+
+class MultiArray:
+    arrays: tuple[np.ndarray, ...]
+
+    def __init__(self, arrays):
+        self.arrays = arrays
+        assert all(arrays[0].shape == a.shape for a in arrays), "Expect all arrays to have the same shape"
+
+    def astype(self, dt, **kwargs) -> Self:
+        return type(self)(tuple(array.astype(dt, **kwargs) for array in self.arrays))
+
+    def reshape(self, shape, **kwargs) -> Self:
+        return type(self)(tuple(array.reshape(shape, **kwargs) for array in self.arrays))
+
+    def squeeze(self, axis=None) -> Self:
+        return type(self)(tuple(array.squeeze(axis) for array in self.arrays))
+
+    def __setitem__(self, key, value) -> None:
+        assert len(value) == len(self.arrays)
+        for array, val in zip(self.arrays, value):
+            array[key] = val
+
+    def __array_function__(self, func, types, args, kwargs):
+        if func not in MULTIARRAY_HANDLED_FUNCTIONS:
+            return NotImplemented
+        # Note: this allows subclasses that don't override
+        # __array_function__ to handle MyArray objects
+        # if not all(issubclass(t, MyArray) for t in types): # I can't see this being relevant at all for this code, but maybe it's safer to leave it in?
+        # return NotImplemented
+        return MULTIARRAY_HANDLED_FUNCTIONS[func](*args, **kwargs)
+
+    # Shape is needed, seems likely that the other two might be
+    # Making some strong assumptions here that all the arrays are the same shape, and I don't really like this
+    @property
+    def dtype(self) -> np.dtype:
+        return self.arrays[0].dtype
+
+    @property
+    def shape(self) -> tuple[int, ...]:
+        return self.arrays[0].shape
+
+    @property
+    def ndim(self) -> int:
+        return self.arrays[0].ndim
+
+    def __getitem__(self, key) -> Self:
+        return type(self)([array[key] for array in self.arrays])
+
+
+def implements(numpy_function):
+    """Register an __array_function__ implementation for MyArray objects."""
+
+    def decorator(func):
+        MULTIARRAY_HANDLED_FUNCTIONS[numpy_function] = func
+        return func
+
+    return decorator
+
+
+@implements(np.expand_dims)
+def expand_dims(multiarray, axis) -> MultiArray:
+    return MultiArray(tuple(np.expand_dims(a, axis) for a in multiarray.arrays))
+
+
+@implements(np.concatenate)
+def concatenate(multiarrays, axis) -> MultiArray:
+    n_arrays = len(multiarrays[0].arrays)
+    for ma in multiarrays[1:]:
+        assert len(ma.arrays) == n_arrays
+    return MultiArray(
+        tuple(np.concatenate(tuple(ma.arrays[i] for ma in multiarrays), axis) for i in range(n_arrays))
+    )
+
+
+@implements(np.transpose)
+def transpose(multiarray, axes) -> MultiArray:
+    return MultiArray(tuple(np.transpose(a, axes) for a in multiarray.arrays))
+
+
+@implements(np.squeeze)
+def squeeze(multiarray, axis) -> MultiArray:
+    return MultiArray(tuple(np.squeeze(a, axis) for a in multiarray.arrays))
+
+
+@implements(np.full)
+def full(shape, fill_values, *args, **kwargs) -> MultiArray:
+    """All arguments except fill_value are shared by each array in the MultiArray.
+    Iterate over fill_values to create arrays
+    """
+    return MultiArray(tuple(np.full(shape, fv, *args, **kwargs) for fv in fill_values))
diff --git a/flox/xrutils.py b/flox/xrutils.py
@@ -146,6 +146,9 @@ def is_scalar(value: Any, include_0d: bool = True) -> bool:
 
 
 def notnull(data):
+    if isinstance(data, tuple) and len(data) == 3 and data == (0, 0, 0):
+        # boo: another special case for Var
+        return True
     if not is_duck_array(data):
         data = np.asarray(data)
 
@@ -163,6 +166,9 @@ def notnull(data):
 
 
 def isnull(data: Any):
+    if isinstance(data, tuple) and len(data) == 3 and data == (0, 0, 0):
+        # boo: another special case for Var
+        return False
     if data is None:
         return False
     if not is_duck_array(data):

diff --git a/tests/strategies.py b/tests/strategies.py
@@ -108,9 +108,8 @@ def insert_nans(draw: st.DrawFn, array: np.ndarray) -> np.ndarray:
     "any",
     "all",
 ] + list(SCIPY_STATS_FUNCS)
-SKIPPED_FUNCS = ["var", "std", "nanvar", "nanstd"]
 def test_groupby_reduce(data, array, func: str) -> None: 
 def test_groupby_reduce(data, array, func: str) -> None: 
 
-func_st = st.sampled_from([f for f in ALL_FUNCS if f not in NON_NUMPY_FUNCS and f not in SKIPPED_FUNCS])
+func_st = st.sampled_from([f for f in ALL_FUNCS if f not in NON_NUMPY_FUNCS])
 
 
 @st.composite