-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[WIP] df.apply: add support for engine='bodo'
#60622
Changes from all commits
1e62d38
7e2c2c3
27fbc0a
4c2e94a
4349d61
0872285
cd94be9
9a90fa0
dcdd00e
d077892
8276283
5711ad4
ad226c8
829e879
539c6ba
cf213bd
6f7bf12
f1bdbd7
cfecfbf
edf2f48
468fdf4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,7 +27,10 @@ | |
npt, | ||
) | ||
from pandas.compat._optional import import_optional_dependency | ||
from pandas.errors import SpecificationError | ||
from pandas.errors import ( | ||
ExecutionError, | ||
SpecificationError, | ||
) | ||
from pandas.util._decorators import cache_readonly | ||
|
||
from pandas.core.dtypes.cast import is_nested_object | ||
|
@@ -598,9 +601,9 @@ def apply_list_or_dict_like(self) -> DataFrame | Series: | |
Result when self.func is a list-like or dict-like, None otherwise. | ||
""" | ||
|
||
if self.engine == "numba": | ||
if self.engine in ("numba", "bodo"): | ||
raise NotImplementedError( | ||
"The 'numba' engine doesn't support list-like/" | ||
f"The '{self.engine}' engine doesn't support list-like/" | ||
"dict likes of callables yet." | ||
) | ||
|
||
|
@@ -853,9 +856,9 @@ def apply(self) -> DataFrame | Series: | |
|
||
# dispatch to handle list-like or dict-like | ||
if is_list_like(self.func): | ||
if self.engine == "numba": | ||
if self.engine in ("numba", "bodo"): | ||
raise NotImplementedError( | ||
"the 'numba' engine doesn't support lists of callables yet" | ||
f"the '{self.engine}' engine doesn't support lists of callables yet" | ||
) | ||
return self.apply_list_or_dict_like() | ||
|
||
|
@@ -870,13 +873,16 @@ def apply(self) -> DataFrame | Series: | |
"the 'numba' engine doesn't support using " | ||
"a string as the callable function" | ||
) | ||
elif self.engine == "bodo": | ||
return self.apply_series_bodo() | ||
|
||
return self.apply_str() | ||
|
||
# ufunc | ||
elif isinstance(self.func, np.ufunc): | ||
if self.engine == "numba": | ||
if self.engine in ("numba", "bodo"): | ||
raise NotImplementedError( | ||
"the 'numba' engine doesn't support " | ||
f"the '{self.engine}' engine doesn't support " | ||
"using a numpy ufunc as the callable function" | ||
) | ||
with np.errstate(all="ignore"): | ||
|
@@ -886,9 +892,10 @@ def apply(self) -> DataFrame | Series: | |
|
||
# broadcasting | ||
if self.result_type == "broadcast": | ||
if self.engine == "numba": | ||
if self.engine in ("numba", "bodo"): | ||
raise NotImplementedError( | ||
"the 'numba' engine doesn't support result_type='broadcast'" | ||
f"the '{self.engine}' engine doesn't support " | ||
"result_type='broadcast'" | ||
) | ||
return self.apply_broadcast(self.obj) | ||
|
||
|
@@ -1007,6 +1014,8 @@ def wrapper(*args, **kwargs): | |
result = nb_looper(self.values, self.axis, *args) | ||
# If we made the result 2-D, squeeze it back to 1-D | ||
result = np.squeeze(result) | ||
elif self.engine == "bodo": | ||
raise NotImplementedError("the 'bodo' engine does not support raw=True.") | ||
else: | ||
result = np.apply_along_axis( | ||
wrap_function(self.func), | ||
|
@@ -1051,10 +1060,17 @@ def apply_broadcast(self, target: DataFrame) -> DataFrame: | |
return result | ||
|
||
def apply_standard(self): | ||
if self.engine == "python": | ||
if self.engine == "numba": | ||
results, res_index = self.apply_series_numba() | ||
elif self.engine == "bodo": | ||
return self.apply_series_bodo() | ||
elif self.engine == "python": | ||
results, res_index = self.apply_series_generator() | ||
else: | ||
results, res_index = self.apply_series_numba() | ||
raise ValueError( | ||
"invalid value for engine, must be one " | ||
"of {'python', 'numba', 'bodo'}" | ||
) | ||
|
||
# wrap results | ||
return self.wrap_results(results, res_index) | ||
|
@@ -1089,6 +1105,36 @@ def apply_series_numba(self): | |
results = self.apply_with_numba() | ||
return results, self.result_index | ||
|
||
def apply_series_bodo(self) -> DataFrame | Series: | ||
if self.result_type is not None: | ||
raise NotImplementedError( | ||
"the 'bodo' engine does not support result_type yet." | ||
) | ||
|
||
if self.axis != 1 and not isinstance(self.func, str): | ||
raise NotImplementedError( | ||
"the 'bodo' engine only supports axis=1 for user-defined functions." | ||
) | ||
|
||
if self.args or self.kwargs: | ||
raise NotImplementedError( | ||
"the 'bodo' engine does not support passing additional args/kwargs " | ||
"to apply function yet." | ||
) | ||
|
||
bodo = import_optional_dependency("bodo") | ||
|
||
@bodo.jit(**self.engine_kwargs) | ||
def do_apply(obj, func, axis): | ||
return obj.apply(func, axis) | ||
|
||
try: | ||
result = do_apply(self.obj, self.func, self.axis) | ||
except bodo.utils.typing.BodoError as e: | ||
raise ExecutionError("Execution with engine='bodo' failed.") from e | ||
Comment on lines
+1133
to
+1134
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. IMO I don't think pandas should need to define and raise a new exception class for BodoError. We don't do this for numba, for example. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think there is definitely value in offering up an abstracted error type as it makes writing generic code easier as an end user. Maybe this should be done as a precursor with just Numba? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you mind sharing a specific example of what you have in mind? I fail to understand your point, and it does seem like it makes things more complex if users receive a pandas exception when Bodo can't run the user function. Unless the point is the error message, to let users know that it's not a problem in pandas and rather it's in Bodo being able to execute the user function. That could make more sense if the errors the user receives from Bodo don't point the user in this direction. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The analogy would be in SQL where we have many different engines and connections. Whether someone is using SQLAlchemy, ADBC, or SQLite3 we lean towards raising just
If we required them to catch the specific error raised by the various connection types, they would have to adjust their error handling as well, which I think is less than ideal |
||
|
||
return result | ||
|
||
def wrap_results(self, results: ResType, res_index: Index) -> DataFrame | Series: | ||
from pandas import Series | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does `bodo.jit` take the same keyword arguments as `numba.jit`? I couldn't easily find the API docs for `bodo.jit`.