(feat): Support for pandas ExtensionArray (#8723)
Changes from 63 commits
```diff
@@ -129,6 +129,7 @@ module = [
     "opt_einsum.*",
     "pandas.*",
     "pooch.*",
+    "pyarrow.*",
     "pydap.*",
     "pytest.*",
     "scipy.*",
```
```diff
@@ -24,6 +24,7 @@
 from typing import IO, TYPE_CHECKING, Any, Callable, Generic, Literal, cast, overload

 import numpy as np
+from pandas.api.types import is_extension_array_dtype

 # remove once numpy 2.0 is the oldest supported version
 try:
```
```diff
@@ -6835,10 +6836,13 @@ def reduce(
         if (
             # Some reduction functions (e.g. std, var) need to run on variables
             # that don't have the reduce dims: PR5393
-            not reduce_dims
-            or not numeric_only
-            or np.issubdtype(var.dtype, np.number)
-            or (var.dtype == np.bool_)
+            (
+                not reduce_dims
+                or not numeric_only
+                or np.issubdtype(var.dtype, np.number)
+                or (var.dtype == np.bool_)
+            )
+            and not is_extension_array_dtype(var.dtype)
         ):
             # prefer to aggregate over axis=None rather than
             # axis=(0, 1) if they will be equivalent, because
```
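The effect of this gate can be exercised in isolation. The sketch below uses a hypothetical `participates_in_reduction` helper (not part of the PR) that mirrors the condition; it checks `is_extension_array_dtype` first so that `np.issubdtype` never receives a pandas dtype:

```python
import numpy as np
import pandas as pd
from pandas.api.types import is_extension_array_dtype


def participates_in_reduction(dtype, reduce_dims=True, numeric_only=True):
    # Hypothetical helper mirroring the gate in Dataset.reduce():
    # extension-array-backed variables never participate in a reduction.
    if is_extension_array_dtype(dtype):
        return False
    # Otherwise, numeric/bool variables (or any variable, when reduce_dims
    # or numeric_only relax the check) are included.
    return bool(
        not reduce_dims
        or not numeric_only
        or np.issubdtype(dtype, np.number)
        or dtype == np.bool_
    )


print(participates_in_reduction(np.dtype("float64")))   # numeric: included
print(participates_in_reduction(pd.CategoricalDtype())) # extension array: excluded
```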
```diff
@@ -7151,13 +7155,37 @@ def to_pandas(self) -> pd.Series | pd.DataFrame:
         )

     def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
-        columns = [k for k in self.variables if k not in self.dims]
+        columns_in_order = [k for k in self.variables if k not in self.dims]
+        non_extension_array_columns = [
+            k
+            for k in columns_in_order
+            if not is_extension_array_dtype(self.variables[k].data)
+        ]
+        extension_array_columns = [
+            k
+            for k in columns_in_order
+            if is_extension_array_dtype(self.variables[k].data)
+        ]
         data = [
             self._variables[k].set_dims(ordered_dims).values.reshape(-1)
-            for k in columns
+            for k in non_extension_array_columns
         ]
         index = self.coords.to_index([*ordered_dims])
-        return pd.DataFrame(dict(zip(columns, data)), index=index)
+        broadcasted_df = pd.DataFrame(
+            dict(zip(non_extension_array_columns, data)), index=index
+        )
+        for extension_array_column in extension_array_columns:
+            extension_array = self.variables[extension_array_column].data.array
+            index = self[self.variables[extension_array_column].dims[0]].data
+            extension_array_df = pd.DataFrame(
+                {extension_array_column: extension_array},
+                index=self[self.variables[extension_array_column].dims[0]].data,
+            )
+            extension_array_df.index.name = self.variables[extension_array_column].dims[
+                0
+            ]
+            broadcasted_df = broadcasted_df.join(extension_array_df)
+        return broadcasted_df[columns_in_order]

     def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFrame:
         """Convert this dataset into a pandas.DataFrame.
```

Review thread on lines +7194 to +7204 (some comments were truncated in extraction; gaps marked […]):

- Calling […]
- pandas-dev/pandas#57676. Not sure what to do. I don't think […]
- Also not sure […]
- It'd be good to sort this out.
- @shoyer Could you maybe give some details on using […]
- Let's open an issue to remind ourselves to make this more efficient. I guess the core problem is that extension arrays cannot be broadcast to nD with […]
- #8950 done!
- I think this is true.
- I think this currently handles the case where this is >1, so why error out? I think […]
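The join-based strategy in `_to_dataframe` can be sketched at the pandas level: plain columns are broadcast into one frame, and each extension-array column is kept in its own 1-D frame and attached via `DataFrame.join` on the shared index, so the extension dtype survives. The variable names below are illustrative, not from the PR:

```python
import numpy as np
import pandas as pd

# Stand-in for the broadcast result over the "x" dimension.
index = pd.Index(["a", "b", "c"], name="x")
broadcasted_df = pd.DataFrame({"temp": np.array([1.0, 2.0, 3.0])}, index=index)

# Extension-array column kept in its own frame, indexed by its one
# dimension, then joined on rather than reshaped through NumPy.
cat = pd.Categorical(["low", "high", "low"])
extension_array_df = pd.DataFrame({"label": cat}, index=index)
extension_array_df.index.name = "x"

result = broadcasted_df.join(extension_array_df)

# The categorical dtype is preserved through the join.
print(result["label"].dtype)  # category
```

Reordering at the end (`broadcasted_df[columns_in_order]` in the hunk) then restores the original column order.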
```diff
@@ -7303,11 +7331,14 @@ def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> Self:
                 "cannot convert a DataFrame with a non-unique MultiIndex into xarray"
             )

-        # Cast to a NumPy array first, in case the Series is a pandas Extension
-        # array (which doesn't have a valid NumPy dtype)
-        # TODO: allow users to control how this casting happens, e.g., by
-        # forwarding arguments to pandas.Series.to_numpy?
-        arrays = [(k, np.asarray(v)) for k, v in dataframe.items()]
+        arrays = [
+            (k, np.asarray(v))
+            for k, v in dataframe.items()
+            if not is_extension_array_dtype(v)
+        ]
+        extension_arrays = [
+            (k, v) for k, v in dataframe.items() if is_extension_array_dtype(v)
+        ]

         indexes: dict[Hashable, Index] = {}
         index_vars: dict[Hashable, Variable] = {}
```
```diff
@@ -7321,6 +7352,8 @@ def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> Self:
                 xr_idx = PandasIndex(lev, dim)
                 indexes[dim] = xr_idx
                 index_vars.update(xr_idx.create_variables())
+            arrays += [(k, np.asarray(v)) for k, v in extension_arrays]
+            extension_arrays = []
         else:
             index_name = idx.name if idx.name is not None else "index"
             dims = (index_name,)
```
```diff
@@ -7334,7 +7367,9 @@ def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> Self:
             obj._set_sparse_data_from_dataframe(idx, arrays, dims)
         else:
             obj._set_numpy_data_from_dataframe(idx, arrays, dims)
-        return obj
+        for name, extension_array in extension_arrays:
+            obj[name] = (dims, extension_array)
+        return obj[dataframe.columns] if len(dataframe.columns) else obj

     def to_dask_dataframe(
         self, dim_order: Sequence[Hashable] | None = None, set_index: bool = False
```
Review thread:

- What happens if there's an ExtensionArray data var in a dataset and you call `ds.mean()`? Are we silently dropping it or raising an error? I think we should raise a nice error and ask the user to drop it themselves.
- Ok, but this is different from the behavior with a numpy object array, isn't it?
- Good point. Thanks!
- I guess we'd want to be careful about allowing extension arrays with ints, floats, datetimes, etc., looking at the list here: https://pandas.pydata.org/docs/reference/api/pandas.array.html — but this can be a followup.
- Agreed. For now I will implement the drop rather than an error, then.
- I think this is fine for now?
- @dcherian implemented, but is there a test case for dropping non-numerics I can add this to? I couldn't find one.
- xarray/xarray/tests/test_dataset.py, line 5459 in cf36559
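The drop behavior the thread settles on matches what pandas itself does under `numeric_only=True`: non-numeric columns are silently excluded from the reduction rather than raising. A pandas-level illustration (the data is made up for the example):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "value": np.array([1.0, 2.0, 3.0]),
        "label": pd.Categorical(["x", "y", "x"]),  # extension-array column
    }
)

# With numeric_only=True, pandas drops the categorical column silently,
# which is the precedent for Dataset.mean() dropping extension-array
# variables instead of erroring.
means = df.mean(numeric_only=True)
print(list(means.index))  # ['value']
print(means["value"])     # 2.0
```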