From b4252c369204caa3d025f3719ba62170ea5590cb Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 4 Jan 2025 02:07:03 -0500 Subject: [PATCH] Assorted cleanups --- pandas/core/arrays/list_.py | 26 ++++++-------------------- pandas/core/internals/construction.py | 3 --- pandas/core/internals/managers.py | 5 +---- 3 files changed, 7 insertions(+), 27 deletions(-) diff --git a/pandas/core/arrays/list_.py b/pandas/core/arrays/list_.py index eeb62b3e50656..7a15b41739f79 100644 --- a/pandas/core/arrays/list_.py +++ b/pandas/core/arrays/list_.py @@ -74,7 +74,7 @@ class ListDtype(ArrowDtype): An ExtensionDtype suitable for storing homogeneous lists of data. """ - _is_immutable = True # TODO(wayd): should we allow mutability? + _is_immutable = True def __init__(self, value_dtype: pa.DataType) -> None: super().__init__(pa.large_list(value_dtype)) @@ -100,10 +100,7 @@ def name(self) -> str: # type: ignore[override] """ A string identifying the data type. """ - # TODO: reshaping tests require the name list to match the large_list - # implementation; assumedly there are some astype(str(dtype)) casts - # going on. Should fix so this can just be "list[...]" for end user - return f"large_list[{self.pyarrow_dtype.value_type!s}]" + return f"list[{self.pyarrow_dtype.value_type!s}]" @property def kind(self) -> str: @@ -124,7 +121,6 @@ def construct_array_type(cls) -> type_t[ListArray]: return ListArray def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: - # TODO(wayd): should we implemented value type support? for dtype in dtypes: if ( isinstance(dtype, ListDtype) @@ -153,8 +149,7 @@ def __init__( if isinstance(values, (pa.Array, pa.ChunkedArray)): parent_type = values.type if not isinstance(parent_type, (pa.ListType, pa.LargeListType)): - # Ideally could cast here, but I don't think pyarrow implements - # many list casts + # TODO: maybe implement native casts in pyarrow new_values = [ [x.as_py()] if x.is_valid else None for x in values ] @@ -164,12 +159,10 @@ def __init__( else: value_type = pa.array(values).type.value_type - # Internally always use large_string instead of string if value_type == pa.string(): value_type = pa.large_string() if not isinstance(values, pa.ChunkedArray): - # To support NA, we need to create an Array first :-( arr = pa.array(values, type=pa.large_list(value_type), from_pandas=True) self._pa_array = pa.chunked_array(arr, type=pa.large_list(value_type)) else: @@ -200,8 +193,6 @@ def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False): values = pa.array(scalars, from_pandas=True) if values.type == "null" and dtype is not None: - # TODO: the sequencing here seems wrong; just making the tests pass for now - # but this needs a comprehensive review pa_type = string_to_pyarrow_type(str(dtype)) values = pa.array(values, type=pa_type) @@ -232,8 +223,6 @@ def _box_pa( return cls._box_pa_array(value, pa_type) def __getitem__(self, item): - # PyArrow does not support NumPy's selection with an equal length - # mask, so let's convert those to integral positions if needed if isinstance(item, (np.ndarray, ExtensionArray)): if is_bool_dtype(item.dtype): mask_len = len(item) @@ -305,9 +294,6 @@ def _empty(cls, shape: Shape, dtype: ExtensionDtype): ExtensionDtype.empty ExtensionDtype.empty is the 'official' public version of this API. """ - # Implementer note: while ExtensionDtype.empty is the public way to - # call this method, it is still required to implement this `_empty` - # method as well (it is called internally in pandas) if isinstance(shape, tuple): if len(shape) > 1: raise ValueError("ListArray may only be 1-D") @@ -334,9 +320,9 @@ def __eq__(self, other): elif isinstance(other, (pa.ListScalar, pa.LargeListScalar)): from pandas.arrays import BooleanArray - # TODO: pyarrow.compute does not implement broadcasting equality - # for an array of lists to a listscalar - # TODO: pyarrow doesn't compare missing values as missing??? + # TODO: pyarrow.compute does not implement equal for lists + # https://github.com/apache/arrow/issues/45167 + # TODO: pyarrow doesn't compare missing values in Python as missing??? # arr = pa.array([1, 2, None]) # pc.equal(arr, arr[2]) returns all nulls but # arr[2] == arr[2] returns True diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 873d373e8bf59..af038c2d6751f 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -13,7 +13,6 @@ import numpy as np from numpy import ma -import pyarrow as pa from pandas._config import using_string_dtype @@ -462,8 +461,6 @@ def treat_as_nested(data, dtype) -> bool: len(data) > 0 and is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1 - # TODO(wayd): hack so pyarrow list elements don't expand - and not isinstance(data[0], pa.ListScalar) and not isinstance(dtype, ListDtype) and not (isinstance(data, ExtensionArray) and data.ndim == 2) ) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 9dc31c3cbf86f..a3738bb25f56c 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1976,10 +1976,7 @@ def from_blocks( @classmethod def from_array( - cls, - array: ArrayLike, - index: Index, - refs: BlockValuesRefs | None = None, + cls, array: ArrayLike, index: Index, refs: BlockValuesRefs | None = None ) -> SingleBlockManager: """ Constructor for if we have an array that is not yet a Block.