-
Notifications
You must be signed in to change notification settings - Fork 136
Feat: let nw.Enum
accept categories, map pandas ordered categorical to Enum (only in main namespace, not stable.v1)
#2192
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. Weβll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
3581985
69e9d88
9e63e68
3465fbf
a570751
6f22771
04b2ee6
1393ac7
6e85c1a
70d5c67
e4f2d87
86eb3a2
3e1db9e
5a996b1
1ce56ca
a8f6e42
342ea83
5dd7c72
1bf98d0
da1a455
a79554a
0712557
4fed6d1
f06df25
a0a8c39
21d03dd
36fd178
105e394
0d5b5cc
02b60e0
07d4a25
5e53281
0018a5b
99ad2b5
000013c
6aed0db
3357c55
3b40237
15cac8e
84ea789
833bc3c
7aca1b8
eb0b328
d2504a4
def83a3
f89e331
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -211,7 +211,9 @@ def rename( | |
|
||
|
||
@functools.lru_cache(maxsize=16) | ||
def non_object_native_to_narwhals_dtype(dtype: str, version: Version) -> DType: | ||
def non_object_native_to_narwhals_dtype(native_dtype: Any, version: Version) -> DType: | ||
dtype = str(native_dtype) | ||
|
||
Comment on lines
+214
to
+216
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This change seems to have been there since the first commit (3581985), but doesn't seem to be documented? It looks like this part is related Which would mean we do the Are all non-object |
||
dtypes = import_dtypes_module(version) | ||
if dtype in {"int64", "Int64", "Int64[pyarrow]", "int64[pyarrow]"}: | ||
return dtypes.Int64() | ||
|
@@ -249,7 +251,13 @@ def non_object_native_to_narwhals_dtype(dtype: str, version: Version) -> DType: | |
return dtypes.String() | ||
if dtype in {"bool", "boolean", "boolean[pyarrow]", "bool[pyarrow]"}: | ||
return dtypes.Boolean() | ||
if dtype == "category" or dtype.startswith("dictionary<"): | ||
if dtype.startswith("dictionary<"): | ||
return dtypes.Categorical() | ||
if dtype == "category": | ||
if version is Version.V1: | ||
return dtypes.Categorical() | ||
if native_dtype.ordered: | ||
return dtypes.Enum(native_dtype.categories) | ||
return dtypes.Categorical() | ||
if (match_ := PATTERN_PD_DATETIME.match(dtype)) or ( | ||
match_ := PATTERN_PA_DATETIME.match(dtype) | ||
|
@@ -310,7 +318,7 @@ def native_to_narwhals_dtype( | |
return arrow_native_to_narwhals_dtype(native_dtype.to_arrow(), version) | ||
return arrow_native_to_narwhals_dtype(native_dtype.pyarrow_dtype, version) | ||
if str_dtype != "object": | ||
return non_object_native_to_narwhals_dtype(str_dtype, version) | ||
return non_object_native_to_narwhals_dtype(native_dtype, version) | ||
elif implementation is Implementation.DASK: | ||
# Per conversations with their maintainers, they don't support arbitrary | ||
# objects, so we can just return String. | ||
|
@@ -471,8 +479,15 @@ def narwhals_to_native_dtype( # noqa: PLR0915 | |
msg = "PyArrow>=11.0.0 is required for `Date` dtype." | ||
return "date32[pyarrow]" | ||
if isinstance_or_issubclass(dtype, dtypes.Enum): | ||
msg = "Converting to Enum is not (yet) supported" | ||
raise NotImplementedError(msg) | ||
if version is Version.V1: | ||
msg = "Converting to Enum is not supported in narwhals.stable.v1" | ||
raise NotImplementedError(msg) | ||
if isinstance(dtype, dtypes.Enum): | ||
ns = implementation.to_native_namespace() | ||
return ns.CategoricalDtype(dtype.categories, ordered=True) | ||
msg = "Can not cast / initialize Enum without categories present" | ||
raise ValueError(msg) | ||
|
||
if isinstance_or_issubclass( | ||
dtype, (dtypes.Struct, dtypes.Array, dtypes.List, dtypes.Time, dtypes.Binary) | ||
): | ||
|
Original file line number | Diff line number | Diff line change | ||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -1,5 +1,6 @@ | ||||||||||||||||||||
from __future__ import annotations | ||||||||||||||||||||
|
||||||||||||||||||||
import enum | ||||||||||||||||||||
from collections import OrderedDict | ||||||||||||||||||||
from datetime import timezone | ||||||||||||||||||||
from itertools import starmap | ||||||||||||||||||||
|
@@ -9,6 +10,7 @@ | |||||||||||||||||||
from narwhals.utils import isinstance_or_issubclass | ||||||||||||||||||||
|
||||||||||||||||||||
if TYPE_CHECKING: | ||||||||||||||||||||
from typing import Iterable | ||||||||||||||||||||
from typing import Iterator | ||||||||||||||||||||
from typing import Sequence | ||||||||||||||||||||
|
||||||||||||||||||||
|
@@ -443,9 +445,38 @@ class Enum(DType): | |||||||||||||||||||
>>> data = ["beluga", "narwhal", "orca"] | ||||||||||||||||||||
>>> s_native = pl.Series(data, dtype=pl.Enum(data)) | ||||||||||||||||||||
>>> nw.from_native(s_native, series_only=True).dtype | ||||||||||||||||||||
Enum | ||||||||||||||||||||
Enum(categories=['beluga', 'narwhal', 'orca']) | ||||||||||||||||||||
""" | ||||||||||||||||||||
Comment on lines
445
to
449
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What do you think about adapting this to be more like the example(s) in I believe this style was originally used because we didn't have a constructor for
Suggested change
|
||||||||||||||||||||
|
||||||||||||||||||||
categories: Sequence[str] | ||||||||||||||||||||
|
||||||||||||||||||||
def __init__(self, categories: Iterable[str] | type[enum.Enum]) -> None: | ||||||||||||||||||||
if isinstance(categories, type) and issubclass(categories, enum.Enum): | ||||||||||||||||||||
categories = (v.value for v in categories) | ||||||||||||||||||||
sequence: tuple[str, ...] = tuple(categories) | ||||||||||||||||||||
seen: set[str] = set() | ||||||||||||||||||||
for cat in sequence: | ||||||||||||||||||||
if cat in seen: | ||||||||||||||||||||
msg = f"{type(self).__name__} categories must be unique; found duplicate {cat!r}" | ||||||||||||||||||||
raise ValueError(msg) | ||||||||||||||||||||
if not isinstance(cat, str): | ||||||||||||||||||||
msg = f"{type(self).__name__} categories must be strings; found data of type {type(cat).__name__!r}" | ||||||||||||||||||||
raise TypeError(msg) | ||||||||||||||||||||
seen.add(cat) | ||||||||||||||||||||
self.categories = sequence | ||||||||||||||||||||
|
||||||||||||||||||||
def __eq__(self: Self, other: object) -> bool: | ||||||||||||||||||||
# allow comparing object instances to class | ||||||||||||||||||||
if type(other) is type: | ||||||||||||||||||||
return other is Enum | ||||||||||||||||||||
return isinstance(other, type(self)) and self.categories == other.categories | ||||||||||||||||||||
|
||||||||||||||||||||
def __hash__(self: Self) -> int: # pragma: no cover | ||||||||||||||||||||
return hash((self.__class__, tuple(self.categories))) | ||||||||||||||||||||
|
||||||||||||||||||||
def __repr__(self: Self) -> str: # pragma: no cover | ||||||||||||||||||||
return f"{type(self).__name__}(categories={list(self.categories)!r})" | ||||||||||||||||||||
|
||||||||||||||||||||
|
||||||||||||||||||||
class Field: | ||||||||||||||||||||
"""Definition of a single field within a `Struct` DataType. | ||||||||||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,7 +11,7 @@ | |
from narwhals.dtypes import Decimal | ||
from narwhals.dtypes import DType | ||
from narwhals.dtypes import Duration as NwDuration | ||
from narwhals.dtypes import Enum | ||
from narwhals.dtypes import Enum as NwEnum | ||
from narwhals.dtypes import Field | ||
from narwhals.dtypes import Float32 | ||
from narwhals.dtypes import Float64 | ||
|
@@ -72,6 +72,35 @@ def __hash__(self: Self) -> int: | |
return hash(self.__class__) | ||
|
||
|
||
class Enum(NwEnum): | ||
"""A fixed categorical encoding of a unique set of strings. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @dangotbanned the typing here gets a bit wonky as we currently need If feels like I may have something backwards here though? Would love to hear you thoughts. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the ping @camriddell, will take a look in the morning There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Definitely a tricky one, but I have a few ideas I'm gonna try out today. I did a search of existing usage and looked at what we allow in tests. I think our main concern should be preserving the behavior of I haven't tried out customizing-instance-and-subclass-checks yet - but have thought about it for another There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm wanting to avoid subclassing, since this is a pretty clear Liskov substitution principle violation (not your fault, just how There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think it's ok to allow There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Lol @MarcoGorelli the timing on this π (105e394) |
||
|
||
Polars has an Enum data type, while pandas and PyArrow do not. | ||
|
||
Examples: | ||
>>> import polars as pl | ||
>>> import narwhals.stable.v1 as nw | ||
>>> data = ["beluga", "narwhal", "orca"] | ||
>>> s_native = pl.Series(data, dtype=pl.Enum(data)) | ||
>>> nw.from_native(s_native, series_only=True).dtype | ||
Enum | ||
""" | ||
|
||
def __init__(self: Self) -> None: | ||
super(NwEnum, self).__init__() | ||
|
||
def __eq__(self, other: DType | type[DType]) -> bool: # type: ignore[override] | ||
if type(other) is type: | ||
return other in {type(self), NwEnum} | ||
return isinstance(other, type(self)) | ||
|
||
def __hash__(self: Self) -> int: # pragma: no cover | ||
return super(NwEnum, self).__hash__() | ||
|
||
def __repr__(self: Self) -> str: # pragma: no cover | ||
return super(NwEnum, self).__repr__() | ||
|
||
|
||
__all__ = [ | ||
"Array", | ||
"Binary", | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@MarcoGorelli does this seem like it could be widened upstream (https://github.com/pandas-dev/pandas-stubs)?
I've traced back the runtime check to
is_list_like
:The current annotation only permits
list[Any]
as a non-pandas
Sequence
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@camriddell ignore this, I only meant to add as a comment - not the review π«£
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@MarcoGorelli gentle nudge on this, in case it was missed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
hey - yeah, probably, the pandas stubs definitely don't get all the attention they probably deserve