From f787764c43fda0a76ad9dc0dd7e0e65f6c69d907 Mon Sep 17 00:00:00 2001
From: Nitish Satyavolu
Date: Mon, 13 Jan 2025 14:20:50 -0800
Subject: [PATCH 01/67] ENH: Support pipe() method in Rolling and Expanding (#60697)

* ENH: Support pipe() method in Rolling and Expanding

* Fix mypy errors

* Fix docstring errors

* Add pipe method to doc reference
---
 doc/source/reference/window.rst |  2 +
 doc/source/whatsnew/v3.0.0.rst  |  1 +
 pandas/core/window/doc.py       | 57 ++++++++++++++++++++++
 pandas/core/window/expanding.py | 61 ++++++++++++++++++++++-
 pandas/core/window/rolling.py   | 85 ++++++++++++++++++++++++++++++++-
 pandas/tests/window/test_api.py | 32 +++++++++++++
 6 files changed, 236 insertions(+), 2 deletions(-)

diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst
index 14af2b8a120e0..fb89fd2a5ffb2 100644
--- a/doc/source/reference/window.rst
+++ b/doc/source/reference/window.rst
@@ -35,6 +35,7 @@ Rolling window functions
    Rolling.skew
    Rolling.kurt
    Rolling.apply
+   Rolling.pipe
    Rolling.aggregate
    Rolling.quantile
    Rolling.sem
@@ -76,6 +77,7 @@ Expanding window functions
    Expanding.skew
    Expanding.kurt
    Expanding.apply
+   Expanding.pipe
    Expanding.aggregate
    Expanding.quantile
    Expanding.sem
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 47838d1e49d61..34df7fc2027a5 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -44,6 +44,7 @@ Other enhancements
 - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
 - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
 - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
+- :class:`Rolling` and :class:`Expanding` now support the ``pipe`` method (:issue:`57076`)
 - :class:`Series` now supports the Arrow PyCapsule Interface for export (:issue:`59518`)
 - :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header cells (:issue:`35384`)
 - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
diff --git a/pandas/core/window/doc.py b/pandas/core/window/doc.py
index cdb670ee218b4..6dbc52a99e70c 100644
--- a/pandas/core/window/doc.py
+++ b/pandas/core/window/doc.py
@@ -85,6 +85,63 @@ def create_section_header(header: str) -> str:
     """
 ).replace("\n", "", 1)
 
+template_pipe = """
+Apply a ``func`` with arguments to this %(klass)s object and return its result.
+
+Use `.pipe` when you want to improve readability by chaining together
+functions that expect Series, DataFrames, GroupBy, Rolling, Expanding or Resampler
+objects.
+Instead of writing
+
+>>> h = lambda x, arg2, arg3: x + 1 - arg2 * arg3
+>>> g = lambda x, arg1: x * 5 / arg1
+>>> f = lambda x: x ** 4
+>>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, index=pd.date_range('2012-08-02', periods=4))
+>>> h(g(f(df.rolling('2D')), arg1=1), arg2=2, arg3=3)  # doctest: +SKIP
+
+You can write
+
+>>> (df.rolling('2D')
+...    .pipe(f)
+...    .pipe(g, arg1=1)
+...    .pipe(h, arg2=2, arg3=3))  # doctest: +SKIP
+
+which is much more readable.
+ +Parameters +---------- +func : callable or tuple of (callable, str) + Function to apply to this %(klass)s object or, alternatively, + a `(callable, data_keyword)` tuple where `data_keyword` is a + string indicating the keyword of `callable` that expects the + %(klass)s object. +*args : iterable, optional + Positional arguments passed into `func`. +**kwargs : dict, optional + A dictionary of keyword arguments passed into `func`. + +Returns +------- +%(klass)s + The original object with the function `func` applied. + +See Also +-------- +Series.pipe : Apply a function with arguments to a series. +DataFrame.pipe: Apply a function with arguments to a dataframe. +apply : Apply function to each group instead of to the + full %(klass)s object. + +Notes +----- +See more `here +`_ + +Examples +-------- +%(examples)s +""" + numba_notes = ( "See :ref:`window.numba_engine` and :ref:`enhancingperf.numba` for " "extended documentation and performance considerations for the Numba engine.\n\n" diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index bff3a1660eba9..6a7d0329ab6da 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -5,9 +5,15 @@ TYPE_CHECKING, Any, Literal, + final, + overload, ) -from pandas.util._decorators import doc +from pandas.util._decorators import ( + Appender, + Substitution, + doc, +) from pandas.core.indexers.objects import ( BaseIndexer, @@ -20,6 +26,7 @@ kwargs_numeric_only, numba_notes, template_header, + template_pipe, template_returns, template_see_also, window_agg_numba_parameters, @@ -34,7 +41,11 @@ from collections.abc import Callable from pandas._typing import ( + Concatenate, + P, QuantileInterpolation, + Self, + T, WindowingRankType, ) @@ -241,6 +252,54 @@ def apply( kwargs=kwargs, ) + @overload + def pipe( + self, + func: Callable[Concatenate[Self, P], T], + *args: P.args, + **kwargs: P.kwargs, + ) -> T: ... + + @overload + def pipe( + self, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: ... + + @final + @Substitution( + klass="Expanding", + examples=""" + >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, + ... 
index=pd.date_range('2012-08-02', periods=4)) + >>> df + A + 2012-08-02 1 + 2012-08-03 2 + 2012-08-04 3 + 2012-08-05 4 + + To get the difference between each expanding window's maximum and minimum + value in one pass, you can do + + >>> df.expanding().pipe(lambda x: x.max() - x.min()) + A + 2012-08-02 0.0 + 2012-08-03 1.0 + 2012-08-04 2.0 + 2012-08-05 3.0""", + ) + @Appender(template_pipe) + def pipe( + self, + func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: + return super().pipe(func, *args, **kwargs) + @doc( template_header, create_section_header("Parameters"), diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 385ffb901acf0..90c3cff975ff0 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -14,6 +14,8 @@ TYPE_CHECKING, Any, Literal, + final, + overload, ) import numpy as np @@ -26,7 +28,11 @@ import pandas._libs.window.aggregations as window_aggregations from pandas.compat._optional import import_optional_dependency from pandas.errors import DataError -from pandas.util._decorators import doc +from pandas.util._decorators import ( + Appender, + Substitution, + doc, +) from pandas.core.dtypes.common import ( ensure_float64, @@ -81,6 +87,7 @@ kwargs_scipy, numba_notes, template_header, + template_pipe, template_returns, template_see_also, window_agg_numba_parameters, @@ -102,8 +109,12 @@ from pandas._typing import ( ArrayLike, + Concatenate, NDFrameT, QuantileInterpolation, + P, + Self, + T, WindowingRankType, npt, ) @@ -1529,6 +1540,30 @@ def apply_func(values, begin, end, min_periods, raw=raw): return apply_func + @overload + def pipe( + self, + func: Callable[Concatenate[Self, P], T], + *args: P.args, + **kwargs: P.kwargs, + ) -> T: ... + + @overload + def pipe( + self, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: ... + + def pipe( + self, + func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: + return com.pipe(self, func, *args, **kwargs) + def sum( self, numeric_only: bool = False, @@ -2044,6 +2079,54 @@ def apply( kwargs=kwargs, ) + @overload + def pipe( + self, + func: Callable[Concatenate[Self, P], T], + *args: P.args, + **kwargs: P.kwargs, + ) -> T: ... + + @overload + def pipe( + self, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: ... + + @final + @Substitution( + klass="Rolling", + examples=""" + >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, + ... 
index=pd.date_range('2012-08-02', periods=4)) + >>> df + A + 2012-08-02 1 + 2012-08-03 2 + 2012-08-04 3 + 2012-08-05 4 + + To get the difference between each rolling 2-day window's maximum and minimum + value in one pass, you can do + + >>> df.rolling('2D').pipe(lambda x: x.max() - x.min()) + A + 2012-08-02 0.0 + 2012-08-03 1.0 + 2012-08-04 1.0 + 2012-08-05 1.0""", + ) + @Appender(template_pipe) + def pipe( + self, + func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: + return super().pipe(func, *args, **kwargs) + @doc( template_header, create_section_header("Parameters"), diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 15eaa8c167487..877b50e37670c 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -177,6 +177,38 @@ def test_agg_nested_dicts(): r.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) +@pytest.mark.parametrize( + "func,window_size", + [ + ( + "rolling", + 2, + ), + ( + "expanding", + None, + ), + ], +) +def test_pipe(func, window_size): + # Issue #57076 + df = DataFrame( + { + "B": np.random.default_rng(2).standard_normal(10), + "C": np.random.default_rng(2).standard_normal(10), + } + ) + r = getattr(df, func)(window_size) + + expected = r.max() - r.mean() + result = r.pipe(lambda x: x.max() - x.mean()) + tm.assert_frame_equal(result, expected) + + expected = r.max() - 2 * r.min() + result = r.pipe(lambda x, k: x.max() - k * x.min(), k=2) + tm.assert_frame_equal(result, expected) + + def test_count_nonnumeric_types(step): # GH12541 cols = [ From 221ad46d193c9a5e46fb50aea3d91efdf1310c31 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 13 Jan 2025 17:24:39 -0500 Subject: [PATCH 02/67] Remove sizeof(char) uses (#60717) --- pandas/_libs/src/parser/tokenizer.c | 7 +++--- .../src/vendored/ujson/python/objToJSON.c | 22 +++++++++---------- pandas/_libs/tslibs/period.pyx | 2 +- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index c9f7a796a9b1c..61e96fc835e4d 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -148,7 +148,7 @@ int parser_init(parser_t *self) { self->warn_msg = NULL; // token stream - self->stream = malloc(STREAM_INIT_SIZE * sizeof(char)); + self->stream = malloc(STREAM_INIT_SIZE); if (self->stream == NULL) { parser_cleanup(self); return PARSER_OUT_OF_MEMORY; @@ -221,9 +221,8 @@ static int make_stream_space(parser_t *self, size_t nbytes) { char *orig_ptr = (void *)self->stream; TRACE(("\n\nmake_stream_space: nbytes = %zu. 
grow_buffer(self->stream...)\n", nbytes)) - self->stream = - (char *)grow_buffer((void *)self->stream, self->stream_len, - &self->stream_cap, nbytes * 2, sizeof(char), &status); + self->stream = (char *)grow_buffer((void *)self->stream, self->stream_len, + &self->stream_cap, nbytes * 2, 1, &status); TRACE(("make_stream_space: self->stream=%p, self->stream_len = %zu, " "self->stream_cap=%zu, status=%zu\n", self->stream, self->stream_len, self->stream_cap, status)) diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 5f35860c59cb7..6b957148f94da 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -984,7 +984,7 @@ static char *List_iterGetName(JSOBJ Py_UNUSED(obj), //============================================================================= static void Index_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); + GET_TC(tc)->cStr = PyObject_Malloc(20); if (!GET_TC(tc)->cStr) { PyErr_NoMemory(); } @@ -998,10 +998,10 @@ static int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) { const Py_ssize_t index = GET_TC(tc)->index; Py_XDECREF(GET_TC(tc)->itemValue); if (index == 0) { - memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5); + memcpy(GET_TC(tc)->cStr, "name", 5); GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); } else if (index == 1) { - memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + memcpy(GET_TC(tc)->cStr, "data", 5); GET_TC(tc)->itemValue = get_values(obj); if (!GET_TC(tc)->itemValue) { return 0; @@ -1033,7 +1033,7 @@ static char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, static void Series_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); + GET_TC(tc)->cStr = PyObject_Malloc(20); enc->outputFormat = VALUES; // for contained series if (!GET_TC(tc)->cStr) { PyErr_NoMemory(); @@ -1048,13 +1048,13 @@ static int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) { const Py_ssize_t index = GET_TC(tc)->index; Py_XDECREF(GET_TC(tc)->itemValue); if (index == 0) { - memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5); + memcpy(GET_TC(tc)->cStr, "name", 5); GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); } else if (index == 1) { - memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6); + memcpy(GET_TC(tc)->cStr, "index", 6); GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); } else if (index == 2) { - memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + memcpy(GET_TC(tc)->cStr, "data", 5); GET_TC(tc)->itemValue = get_values(obj); if (!GET_TC(tc)->itemValue) { return 0; @@ -1088,7 +1088,7 @@ static char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, static void DataFrame_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); + GET_TC(tc)->cStr = PyObject_Malloc(20); enc->outputFormat = VALUES; // for contained series & index if (!GET_TC(tc)->cStr) { PyErr_NoMemory(); @@ -1103,13 +1103,13 @@ static int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { const Py_ssize_t index = GET_TC(tc)->index; Py_XDECREF(GET_TC(tc)->itemValue); if (index == 0) { - memcpy(GET_TC(tc)->cStr, "columns", sizeof(char) * 8); + memcpy(GET_TC(tc)->cStr, 
"columns", 8); GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns"); } else if (index == 1) { - memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6); + memcpy(GET_TC(tc)->cStr, "index", 6); GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); } else if (index == 2) { - memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + memcpy(GET_TC(tc)->cStr, "data", 5); Py_INCREF(obj); GET_TC(tc)->itemValue = obj; } else { diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index d6d69a49c9539..f697180da5eeb 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -679,7 +679,7 @@ cdef char* c_strftime(npy_datetimestruct *dts, char *fmt): c_date.tm_yday = get_day_of_year(dts.year, dts.month, dts.day) - 1 c_date.tm_isdst = -1 - result = malloc(result_len * sizeof(char)) + result = malloc(result_len) if result is NULL: raise MemoryError() From 1708e9020c418e91fae430cf6a7a6ec09c466429 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:27:47 -0500 Subject: [PATCH 03/67] ENH: Enable .mode to sort with NA values (#60702) --- pandas/core/algorithms.py | 2 +- pandas/tests/frame/test_reductions.py | 17 ++--------------- pandas/tests/reductions/test_reductions.py | 13 +++---------- 3 files changed, 6 insertions(+), 26 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 56f8adda93251..eefe08859c1e9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1012,7 +1012,7 @@ def mode( return npresult, res_mask # type: ignore[return-value] try: - npresult = np.sort(npresult) + npresult = safe_sort(npresult) except TypeError as err: warnings.warn( f"Unable to sort modes: {err}", diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index fde4dfeed9c55..04b1456cdbea6 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -672,23 +672,10 @@ def test_mode_dropna(self, dropna, expected): expected = DataFrame(expected) tm.assert_frame_equal(result, expected) - def test_mode_sortwarning(self, using_infer_string): - # Check for the warning that is raised when the mode - # results cannot be sorted - + def test_mode_sort_with_na(self, using_infer_string): df = DataFrame({"A": [np.nan, np.nan, "a", "a"]}) expected = DataFrame({"A": ["a", np.nan]}) - - # TODO(infer_string) avoid this UserWarning for python storage - warning = ( - None - if using_infer_string and df.A.dtype.storage == "pyarrow" - else UserWarning - ) - with tm.assert_produces_warning(warning, match="Unable to sort modes"): - result = df.mode(dropna=False) - result = result.sort_values(by="A").reset_index(drop=True) - + result = df.mode(dropna=False) tm.assert_frame_equal(result, expected) def test_mode_empty_df(self): diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 476978aeab15a..a7bb80727206e 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1607,17 +1607,10 @@ def test_mode_intoverflow(self, dropna, expected1, expected2): expected2 = Series(expected2, dtype=np.uint64) tm.assert_series_equal(result, expected2) - def test_mode_sortwarning(self): - # Check for the warning that is raised when the mode - # results cannot be sorted - - expected = Series(["foo", np.nan], dtype=object) + def test_mode_sort_with_na(self): s = Series([1, "foo", "foo", np.nan, np.nan]) - - with 
tm.assert_produces_warning(UserWarning, match="Unable to sort modes"): - result = s.mode(dropna=False) - result = result.sort_values().reset_index(drop=True) - + expected = Series(["foo", np.nan], dtype=object) + result = s.mode(dropna=False) tm.assert_series_equal(result, expected) def test_mode_boolean_with_na(self): From b5d4e89d378e69a87b1b9ac7f3d6fa6867840fff Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:28:28 -0500 Subject: [PATCH 04/67] ENH: Implement cum* methods for PyArrow strings (#60633) * ENH: Implement cum* methods for PyArrow strings * cleanup * Cleanup * fixup * Fix extension tests * xfail test when there is no pyarrow * mypy fixups * Change logic & whatsnew * Change logic & whatsnew * Fix fixture * Fixup --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/conftest.py | 16 +++++++ pandas/core/arrays/arrow/array.py | 55 +++++++++++++++++++++++ pandas/tests/apply/test_str.py | 9 ++-- pandas/tests/extension/base/accumulate.py | 5 ++- pandas/tests/extension/test_arrow.py | 15 ++++--- pandas/tests/extension/test_string.py | 10 +++++ pandas/tests/series/test_cumulative.py | 54 ++++++++++++++++++++++ 8 files changed, 155 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index b107a5d3ba100..9e0e095eb4de8 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -35,8 +35,8 @@ Other enhancements - The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been updated to work correctly with NumPy >= 2 (:issue:`57739`) +- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) -- .. --------------------------------------------------------------------------- .. _whatsnew_230.notable_bug_fixes: diff --git a/pandas/conftest.py b/pandas/conftest.py index 106518678df6a..f9c10a7758bd2 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1317,6 +1317,22 @@ def nullable_string_dtype(request): return request.param +@pytest.fixture( + params=[ + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + ] +) +def pyarrow_string_dtype(request): + """ + Parametrized fixture for string dtypes backed by Pyarrow. 
+ + * 'str[pyarrow]' + * 'string[pyarrow]' + """ + return pd.StringDtype(*request.param) + + @pytest.fixture( params=[ "python", diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4d9c8eb3a41b6..900548a239c8e 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -41,6 +41,7 @@ is_list_like, is_numeric_dtype, is_scalar, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -1619,6 +1620,9 @@ def _accumulate( ------ NotImplementedError : subclass does not define accumulations """ + if is_string_dtype(self): + return self._str_accumulate(name=name, skipna=skipna, **kwargs) + pyarrow_name = { "cummax": "cumulative_max", "cummin": "cumulative_min", @@ -1654,6 +1658,57 @@ def _accumulate( return type(self)(result) + def _str_accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> ArrowExtensionArray | ExtensionArray: + """ + Accumulate implementation for strings, see `_accumulate` docstring for details. + + pyarrow.compute does not implement these methods for strings. + """ + if name == "cumprod": + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) + + # We may need to strip out trailing NA values + tail: pa.array | None = None + na_mask: pa.array | None = None + pa_array = self._pa_array + np_func = { + "cumsum": np.cumsum, + "cummin": np.minimum.accumulate, + "cummax": np.maximum.accumulate, + }[name] + + if self._hasna: + na_mask = pc.is_null(pa_array) + if pc.all(na_mask) == pa.scalar(True): + return type(self)(pa_array) + if skipna: + if name == "cumsum": + pa_array = pc.fill_null(pa_array, "") + else: + # We can retain the running min/max by forward/backward filling. + pa_array = pc.fill_null_forward(pa_array) + pa_array = pc.fill_null_backward(pa_array) + else: + # When not skipping NA values, the result should be null from + # the first NA value onward. + idx = pc.index(na_mask, True).as_py() + tail = pa.nulls(len(pa_array) - idx, type=pa_array.type) + pa_array = pa_array[:idx] + + # error: Cannot call function of unknown type + pa_result = pa.array(np_func(pa_array), type=pa_array.type) # type: ignore[operator] + + if tail is not None: + pa_result = pa.concat_arrays([pa_result, tail]) + elif na_mask is not None: + pa_result = pc.if_else(na_mask, None, pa_result) + + result = type(self)(pa_result) + return result + def _reduce_pyarrow(self, name: str, *, skipna: bool = True, **kwargs) -> pa.Scalar: """ Return a pyarrow scalar result of performing the reduction operation. 
diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index c52168ae48ca8..ce71cfec535e4 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -4,7 +4,10 @@ import numpy as np import pytest -from pandas.compat import WASM +from pandas.compat import ( + HAS_PYARROW, + WASM, +) from pandas.core.dtypes.common import is_number @@ -163,10 +166,10 @@ def test_agg_cython_table_transform_series(request, series, func, expected): # GH21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) - if series.dtype == "string" and func == "cumsum": + if series.dtype == "string" and func == "cumsum" and not HAS_PYARROW: request.applymarker( pytest.mark.xfail( - raises=(TypeError, NotImplementedError), + raises=NotImplementedError, reason="TODO(infer_string) cumsum not yet implemented for string", ) ) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 9a41a3a582c4a..9a2f186c2a00b 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -18,8 +18,9 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): try: alt = ser.astype("float64") - except TypeError: - # e.g. Period can't be cast to float64 + except (TypeError, ValueError): + # e.g. Period can't be cast to float64 (TypeError) + # String can't be cast to float64 (ValueError) alt = ser.astype(object) result = getattr(ser, op_name)(skipna=skipna) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index c5f5a65b77eea..4fccf02e08bd6 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -393,13 +393,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: # attribute "pyarrow_dtype" pa_type = ser.dtype.pyarrow_dtype # type: ignore[union-attr] - if ( - pa.types.is_string(pa_type) - or pa.types.is_binary(pa_type) - or pa.types.is_decimal(pa_type) - ): + if pa.types.is_binary(pa_type) or pa.types.is_decimal(pa_type): if op_name in ["cumsum", "cumprod", "cummax", "cummin"]: return False + elif pa.types.is_string(pa_type): + if op_name == "cumprod": + return False elif pa.types.is_boolean(pa_type): if op_name in ["cumprod", "cummax", "cummin"]: return False @@ -414,6 +413,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: def test_accumulate_series(self, data, all_numeric_accumulations, skipna, request): pa_type = data.dtype.pyarrow_dtype op_name = all_numeric_accumulations + + if pa.types.is_string(pa_type) and op_name in ["cumsum", "cummin", "cummax"]: + # https://github.com/pandas-dev/pandas/pull/60633 + # Doesn't fit test structure, tested in series/test_cumulative.py instead. 
+ return + ser = pd.Series(data) if not self._supports_accumulation(ser, op_name): diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index e19351b2ad058..6ce48e434d329 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -24,6 +24,8 @@ from pandas.compat import HAS_PYARROW +from pandas.core.dtypes.base import StorageExtensionDtype + import pandas as pd import pandas._testing as tm from pandas.api.types import is_string_dtype @@ -192,6 +194,14 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: and op_name in ("any", "all") ) + def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: + assert isinstance(ser.dtype, StorageExtensionDtype) + return ser.dtype.storage == "pyarrow" and op_name in [ + "cummin", + "cummax", + "cumsum", + ] + def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): dtype = cast(StringDtype, tm.get_dtype(obj)) if op_name in ["__add__", "__radd__"]: diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index a9d5486139b46..89882d9d797c5 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -6,6 +6,8 @@ tests.frame.test_cumulative """ +import re + import numpy as np import pytest @@ -227,3 +229,55 @@ def test_cumprod_timedelta(self): ser = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=3)]) with pytest.raises(TypeError, match="cumprod not supported for Timedelta"): ser.cumprod() + + @pytest.mark.parametrize( + "data, op, skipna, expected_data", + [ + ([], "cumsum", True, []), + ([], "cumsum", False, []), + (["x", "z", "y"], "cumsum", True, ["x", "xz", "xzy"]), + (["x", "z", "y"], "cumsum", False, ["x", "xz", "xzy"]), + (["x", pd.NA, "y"], "cumsum", True, ["x", pd.NA, "xy"]), + (["x", pd.NA, "y"], "cumsum", False, ["x", pd.NA, pd.NA]), + ([pd.NA, "x", "y"], "cumsum", True, [pd.NA, "x", "xy"]), + ([pd.NA, "x", "y"], "cumsum", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cumsum", True, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cumsum", False, [pd.NA, pd.NA, pd.NA]), + ([], "cummin", True, []), + ([], "cummin", False, []), + (["y", "z", "x"], "cummin", True, ["y", "y", "x"]), + (["y", "z", "x"], "cummin", False, ["y", "y", "x"]), + (["y", pd.NA, "x"], "cummin", True, ["y", pd.NA, "x"]), + (["y", pd.NA, "x"], "cummin", False, ["y", pd.NA, pd.NA]), + ([pd.NA, "y", "x"], "cummin", True, [pd.NA, "y", "x"]), + ([pd.NA, "y", "x"], "cummin", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummin", True, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummin", False, [pd.NA, pd.NA, pd.NA]), + ([], "cummax", True, []), + ([], "cummax", False, []), + (["x", "z", "y"], "cummax", True, ["x", "z", "z"]), + (["x", "z", "y"], "cummax", False, ["x", "z", "z"]), + (["x", pd.NA, "y"], "cummax", True, ["x", pd.NA, "y"]), + (["x", pd.NA, "y"], "cummax", False, ["x", pd.NA, pd.NA]), + ([pd.NA, "x", "y"], "cummax", True, [pd.NA, "x", "y"]), + ([pd.NA, "x", "y"], "cummax", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummax", True, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]), + ], + ) + def test_cum_methods_pyarrow_strings( + self, pyarrow_string_dtype, data, op, skipna, expected_data + ): + # https://github.com/pandas-dev/pandas/pull/60633 + ser = pd.Series(data, dtype=pyarrow_string_dtype) + method = getattr(ser, op) + expected = pd.Series(expected_data, 
dtype=pyarrow_string_dtype) + result = method(skipna=skipna) + tm.assert_series_equal(result, expected) + + def test_cumprod_pyarrow_strings(self, pyarrow_string_dtype, skipna): + # https://github.com/pandas-dev/pandas/pull/60633 + ser = pd.Series(list("xyz"), dtype=pyarrow_string_dtype) + msg = re.escape(f"operation 'cumprod' not supported for dtype '{ser.dtype}'") + with pytest.raises(TypeError, match=msg): + ser.cumprod(skipna=skipna) From fa5c2550e81c3e745eb7948b56adac45454853d5 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Mon, 13 Jan 2025 17:56:32 -0800 Subject: [PATCH 05/67] ENH: Expose NoDefault in pandas.api.extensions (#60696) * ENH: Expose NoDefault in pandas.api.extensions * Add entry to whatsnew * Address review comment --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/api/typing/__init__.py | 2 ++ pandas/tests/api/test_api.py | 1 + 3 files changed, 4 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 34df7fc2027a5..1ca0bb9c653a4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -30,6 +30,7 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`) - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) +- :class:`pandas.api.typing.NoDefault` is available for typing ``no_default`` - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`) - :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`) diff --git a/pandas/api/typing/__init__.py b/pandas/api/typing/__init__.py index a18a1e9d5cbb7..c1178c72f3edc 100644 --- a/pandas/api/typing/__init__.py +++ b/pandas/api/typing/__init__.py @@ -3,6 +3,7 @@ """ from pandas._libs import NaTType +from pandas._libs.lib import NoDefault from pandas._libs.missing import NAType from pandas.core.groupby import ( @@ -44,6 +45,7 @@ "JsonReader", "NAType", "NaTType", + "NoDefault", "PeriodIndexResamplerGroupby", "Resampler", "Rolling", diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index c1d9f5ea4d25c..4a05259a98087 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -261,6 +261,7 @@ class TestApi(Base): "JsonReader", "NaTType", "NAType", + "NoDefault", "PeriodIndexResamplerGroupby", "Resampler", "Rolling", From 8bc8c0a6119b053e520f5018dc1350863f7277e4 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Tue, 14 Jan 2025 09:34:40 -0500 Subject: [PATCH 06/67] TST(string dtype): Resolve xfail in test_base.py (#60713) --- pandas/core/arrays/string_.py | 5 +++++ pandas/tests/indexes/test_base.py | 9 +++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 3b881cfd2df2f..623a6a10c75b5 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -533,6 +533,11 @@ def _str_map_nan_semantics( else: return self._str_map_str_or_object(dtype, na_value, arr, f, mask) + def view(self, dtype: Dtype | None = None) -> ArrayLike: + if dtype is not None: + raise TypeError("Cannot change data-type for string array.") + return super().view(dtype=dtype) + 
 
     # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is
     # incompatible with definition in base class "ExtensionArray"
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index 06df8902f319c..608158d40cf23 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -351,14 +351,11 @@ def test_view_with_args_object_array_raises(self, index):
             msg = "When changing to a larger dtype"
             with pytest.raises(ValueError, match=msg):
                 index.view("i8")
-        elif index.dtype == "str" and not index.dtype.storage == "python":
-            # TODO(infer_string): Make the errors consistent
-            with pytest.raises(NotImplementedError, match="i8"):
-                index.view("i8")
         else:
             msg = (
-                "Cannot change data-type for array of references.|"
-                "Cannot change data-type for object array.|"
+                r"Cannot change data-type for array of references\.|"
+                r"Cannot change data-type for object array\.|"
+                r"Cannot change data-type for array of strings\.|"
             )
             with pytest.raises(TypeError, match=msg):
                 index.view("i8")

From 817b7069bd9fc014232c066dc79dafbf5463137e Mon Sep 17 00:00:00 2001
From: Tolker-KU <55140581+Tolker-KU@users.noreply.github.com>
Date: Tue, 14 Jan 2025 19:00:43 +0100
Subject: [PATCH 07/67] ENH: Format `decimal.Decimal` as full precision strings in `.to_json(...)` (#60698)

* Format decimal.Decimal as full precision strings in .to_json(...)

* Fix failing tests

* Clean up Decimal to utf8 conversion and switch to using PyObject_Format() to suppress scientific notation

* Add whatsnew entry
---
 doc/source/whatsnew/v3.0.0.rst                |  1 +
 .../src/vendored/ujson/python/objToJSON.c     | 35 +++++++++++++++++--
 .../json/test_json_table_schema_ext_dtype.py  |  4 +--
 pandas/tests/io/json/test_pandas.py           |  7 +---
 pandas/tests/io/json/test_ujson.py            | 30 ++++++++--------
 5 files changed, 52 insertions(+), 25 deletions(-)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 1ca0bb9c653a4..bf1b52d3a0957 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -53,6 +53,7 @@ Other enhancements
 - :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`)
 - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
 - :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`)
+- :meth:`DataFrame.to_json` now encodes ``Decimal`` as strings instead of floats (:issue:`60698`)
 - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
 - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c
index 5f35860c59cb7..4adc32ba0fed9 100644
--- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c
+++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c
@@ -373,6 +373,27 @@ static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) {
   return outValue;
 }
 
+static char *PyDecimalToUTF8Callback(JSOBJ _obj, JSONTypeContext *tc,
+                                     size_t *len) {
+  PyObject *obj = (PyObject *)_obj;
+
PyObject *format_spec = PyUnicode_FromStringAndSize("f", 1); + PyObject *str = PyObject_Format(obj, format_spec); + Py_DECREF(format_spec); + + if (str == NULL) { + ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; + return NULL; + } + + GET_TC(tc)->newObj = str; + + Py_ssize_t s_len; + char *outValue = (char *)PyUnicode_AsUTF8AndSize(str, &s_len); + *len = s_len; + + return outValue; +} + //============================================================================= // Numpy array iteration functions //============================================================================= @@ -1467,8 +1488,18 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { tc->type = JT_UTF8; return; } else if (object_is_decimal_type(obj)) { - pc->doubleValue = PyFloat_AsDouble(obj); - tc->type = JT_DOUBLE; + PyObject *is_nan_py = PyObject_RichCompare(obj, obj, Py_NE); + if (is_nan_py == NULL) { + goto INVALID; + } + int is_nan = (is_nan_py == Py_True); + Py_DECREF(is_nan_py); + if (is_nan) { + tc->type = JT_NULL; + return; + } + pc->PyTypeToUTF8 = PyDecimalToUTF8Callback; + tc->type = JT_UTF8; return; } else if (PyDateTime_Check(obj) || PyDate_Check(obj)) { if (object_is_nat_type(obj)) { diff --git a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py index 8de289afe9ff9..12ae24b064c9d 100644 --- a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py +++ b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py @@ -159,7 +159,7 @@ def test_build_decimal_series(self, dc): expected = OrderedDict( [ ("schema", schema), - ("data", [OrderedDict([("id", 0), ("a", 10.0)])]), + ("data", [OrderedDict([("id", 0), ("a", "10")])]), ] ) @@ -245,7 +245,7 @@ def test_to_json(self, da, dc, sa, ia): [ ("idx", 0), ("A", "2021-10-10T00:00:00.000"), - ("B", 10.0), + ("B", "10"), ("C", "pandas"), ("D", 10), ] diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index ad9dbf7554a8b..59997d52179e6 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1,6 +1,5 @@ import datetime from datetime import timedelta -from decimal import Decimal from io import StringIO import json import os @@ -2025,12 +2024,8 @@ def test_to_s3(self, s3_public_bucket, s3so): timeout -= 0.1 assert timeout > 0, "Timed out waiting for file to appear on moto" - def test_json_pandas_nulls(self, nulls_fixture, request): + def test_json_pandas_nulls(self, nulls_fixture): # GH 31615 - if isinstance(nulls_fixture, Decimal): - mark = pytest.mark.xfail(reason="not implemented") - request.applymarker(mark) - expected_warning = None msg = ( "The default 'epoch' date format is deprecated and will be removed " diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 62118f1c82ebb..c5ccc3b3f7184 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -57,56 +57,56 @@ def test_encode_decimal(self): sut = decimal.Decimal("1337.1337") encoded = ujson.ujson_dumps(sut, double_precision=15) decoded = ujson.ujson_loads(encoded) - assert decoded == 1337.1337 + assert decoded == "1337.1337" sut = decimal.Decimal("0.95") encoded = ujson.ujson_dumps(sut, double_precision=1) - assert encoded == "1.0" + assert encoded == '"0.95"' decoded = ujson.ujson_loads(encoded) - assert decoded == 1.0 + assert decoded == "0.95" sut = decimal.Decimal("0.94") encoded = ujson.ujson_dumps(sut, double_precision=1) - assert encoded == "0.9" + assert encoded == '"0.94"' 
         decoded = ujson.ujson_loads(encoded)
-        assert decoded == 0.9
+        assert decoded == "0.94"
 
         sut = decimal.Decimal("1.95")
         encoded = ujson.ujson_dumps(sut, double_precision=1)
-        assert encoded == "2.0"
+        assert encoded == '"1.95"'
         decoded = ujson.ujson_loads(encoded)
-        assert decoded == 2.0
+        assert decoded == "1.95"
 
         sut = decimal.Decimal("-1.95")
         encoded = ujson.ujson_dumps(sut, double_precision=1)
-        assert encoded == "-2.0"
+        assert encoded == '"-1.95"'
         decoded = ujson.ujson_loads(encoded)
-        assert decoded == -2.0
+        assert decoded == "-1.95"
 
         sut = decimal.Decimal("0.995")
         encoded = ujson.ujson_dumps(sut, double_precision=2)
-        assert encoded == "1.0"
+        assert encoded == '"0.995"'
         decoded = ujson.ujson_loads(encoded)
-        assert decoded == 1.0
+        assert decoded == "0.995"
 
         sut = decimal.Decimal("0.9995")
         encoded = ujson.ujson_dumps(sut, double_precision=3)
-        assert encoded == "1.0"
+        assert encoded == '"0.9995"'
         decoded = ujson.ujson_loads(encoded)
-        assert decoded == 1.0
+        assert decoded == "0.9995"
 
         sut = decimal.Decimal("0.99999999999999944")
         encoded = ujson.ujson_dumps(sut, double_precision=15)
-        assert encoded == "1.0"
+        assert encoded == '"0.99999999999999944"'
         decoded = ujson.ujson_loads(encoded)
-        assert decoded == 1.0
+        assert decoded == "0.99999999999999944"
 
     @pytest.mark.parametrize("ensure_ascii", [True, False])
     def test_encode_string_conversion(self, ensure_ascii):

From fa5c2550e81c3e745eb7948b56adac45454853d5 Mon Sep 17 00:00:00 2001
From: Nitish Satyavolu
Date: Wed, 15 Jan 2025 08:19:37 -0800
Subject: [PATCH 08/67] BUG: Fix DataFrame binary arithmetic operation handling of unaligned … (#60538)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* BUG: Fix DataFrame binary arithmetic operation handling of unaligned MultiIndex columns

* Address review comment
---
 doc/source/whatsnew/v3.0.0.rst        |  1 +
 pandas/core/frame.py                  | 17 +++++++++++++++++
 pandas/tests/frame/test_arithmetic.py | 25 +++++++++++++++++++++++++
 3 files changed, 43 insertions(+)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index bf1b52d3a0957..b3df52fe1758a 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -689,6 +689,7 @@ MultiIndex
 - :meth:`DataFrame.melt` would not accept multiple names in ``var_name`` when the columns were a :class:`MultiIndex` (:issue:`58033`)
 - :meth:`MultiIndex.insert` would not insert NA value correctly at unified location of index -1 (:issue:`59003`)
 - :func:`MultiIndex.get_level_values` accessing a :class:`DatetimeIndex` does not carry the frequency attribute along (:issue:`58327`, :issue:`57949`)
+- Bug in :class:`DataFrame` arithmetic operations in case of unaligned MultiIndex columns (:issue:`60498`)
 -
 
 I/O
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 851bc1ce4075c..ffffaeba4196e 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -7967,6 +7967,16 @@ def _arith_method_with_reindex(self, right: DataFrame, op) -> DataFrame:
 
         new_left = left if lcol_indexer is None else left.iloc[:, lcol_indexer]
         new_right = right if rcol_indexer is None else right.iloc[:, rcol_indexer]
+
+        # GH#60498 For MultiIndex column alignment
+        if isinstance(cols, MultiIndex):
+            # When overwriting column names, make a shallow copy so as to not modify
+            # the input DFs
+            new_left = new_left.copy(deep=False)
+            new_right = new_right.copy(deep=False)
+            new_left.columns = cols
+            new_right.columns = cols
+
         result = op(new_left, new_right)
 
         # Do the join on the columns instead of using left._align_for_op
@@ -7997,6 +8007,13 @@ def _should_reindex_frame_op(self, right, op, axis: int, fill_value, level) -> bool:
         if not isinstance(right, DataFrame):
             return False
 
+        if (
+            isinstance(self.columns, MultiIndex)
+            or isinstance(right.columns, MultiIndex)
+        ) and not self.columns.equals(right.columns):
+            # GH#60498 Reindex if MultiIndex columns are not matching
+            return True
+
         if fill_value is None and level is None and axis == 1:
             # TODO: any other cases we should handle here?
diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py
index 6b61fe8b05219..7ada1884feb90 100644
--- a/pandas/tests/frame/test_arithmetic.py
+++ b/pandas/tests/frame/test_arithmetic.py
@@ -2033,6 +2033,31 @@ def test_arithmetic_multiindex_align():
     tm.assert_frame_equal(result, expected)
 
 
+def test_arithmetic_multiindex_column_align():
+    # GH#60498
+    df1 = DataFrame(
+        data=100,
+        columns=MultiIndex.from_product(
+            [["1A", "1B"], ["2A", "2B"]], names=["Lev1", "Lev2"]
+        ),
+        index=["C1", "C2"],
+    )
+    df2 = DataFrame(
+        data=np.array([[0.1, 0.25], [0.2, 0.45]]),
+        columns=MultiIndex.from_product([["1A", "1B"]], names=["Lev1"]),
+        index=["C1", "C2"],
+    )
+    expected = DataFrame(
+        data=np.array([[10.0, 10.0, 25.0, 25.0], [20.0, 20.0, 45.0, 45.0]]),
+        columns=MultiIndex.from_product(
+            [["1A", "1B"], ["2A", "2B"]], names=["Lev1", "Lev2"]
+        ),
+        index=["C1", "C2"],
+    )
+    result = df1 * df2
+    tm.assert_frame_equal(result, expected)
+
+
 def test_bool_frame_mult_float():
     # GH 18549
     df = DataFrame(True, list("ab"), list("cd"))

From a15a4b5e6f0397906f619ce8888670eadcf3af55 Mon Sep 17 00:00:00 2001
From: Tuhin Sharma
Date: Thu, 16 Jan 2025 05:30:58 +0530
Subject: [PATCH 09/67] DOC: fix PR01,SA01,ES01 for pandas.RangeIndex.from_range (#60720)

---
 ci/code_checks.sh            |  1 -
 pandas/core/indexes/range.py | 21 +++++++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 56cb22741b9a3..ec6dba05b2b0e 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -72,7 +72,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
         -i "pandas.Period.freq GL08" \
         -i "pandas.Period.ordinal GL08" \
-        -i "pandas.RangeIndex.from_range PR01,SA01" \
        -i "pandas.Timedelta.max PR02" \
        -i "pandas.Timedelta.min PR02" \
        -i "pandas.Timedelta.resolution PR02" \
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
index 935762d0455c5..2db50bbbdfa37 100644
--- a/pandas/core/indexes/range.py
+++ b/pandas/core/indexes/range.py
@@ -190,10 +190,31 @@ def from_range(cls, data: range, name=None, dtype: Dtype | None = None) -> Self:
         """
         Create :class:`pandas.RangeIndex` from a ``range`` object.
 
+        This method provides a way to create a :class:`pandas.RangeIndex` directly
+        from a Python ``range`` object. The resulting :class:`RangeIndex` will have
+        the same start, stop, and step values as the input ``range`` object.
+        It is particularly useful for constructing indices in an efficient and
+        memory-friendly manner.
+
+        Parameters
+        ----------
+        data : range
+            The range object to be converted into a RangeIndex.
+        name : str, default None
+            Name to be stored in the index.
+        dtype : Dtype or None
+            Data type for the RangeIndex. If None, the default integer type will
+            be used.
+ Returns ------- RangeIndex + See Also + -------- + RangeIndex : Immutable Index implementing a monotonic integer range. + Index : Immutable sequence used for indexing and alignment. + Examples -------- >>> pd.RangeIndex.from_range(range(5)) From fb6c4e33c45938d7675d4c9a132324cd08df2f3c Mon Sep 17 00:00:00 2001 From: William Ayd Date: Thu, 16 Jan 2025 12:11:45 -0500 Subject: [PATCH 10/67] Use const char* for JSON key name (#60721) --- .../pandas/vendored/ujson/lib/ultrajson.h | 4 +- .../src/vendored/ujson/lib/ultrajsonenc.c | 2 +- .../src/vendored/ujson/python/objToJSON.c | 135 ++++++++---------- pandas/tests/io/json/test_compression.py | 1 + pandas/tests/io/json/test_pandas.py | 2 + 5 files changed, 62 insertions(+), 82 deletions(-) diff --git a/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h b/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h index 0d62bb0ba915c..51fdbc50bba57 100644 --- a/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h +++ b/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h @@ -170,8 +170,8 @@ typedef void (*JSPFN_ITERBEGIN)(JSOBJ obj, JSONTypeContext *tc); typedef int (*JSPFN_ITERNEXT)(JSOBJ obj, JSONTypeContext *tc); typedef void (*JSPFN_ITEREND)(JSOBJ obj, JSONTypeContext *tc); typedef JSOBJ (*JSPFN_ITERGETVALUE)(JSOBJ obj, JSONTypeContext *tc); -typedef char *(*JSPFN_ITERGETNAME)(JSOBJ obj, JSONTypeContext *tc, - size_t *outLen); +typedef const char *(*JSPFN_ITERGETNAME)(JSOBJ obj, JSONTypeContext *tc, + size_t *outLen); typedef void *(*JSPFN_MALLOC)(size_t size); typedef void (*JSPFN_FREE)(void *pptr); typedef void *(*JSPFN_REALLOC)(void *base, size_t size); diff --git a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c index c8d8b5ab6bd6e..1564ecb64b01d 100644 --- a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c @@ -920,7 +920,7 @@ Perhaps implement recursion detection */ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t cbName) { const char *value; - char *objName; + const char *objName; int count; JSOBJ iterObj; size_t szlen; diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 4adc32ba0fed9..8342dbcd1763d 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -53,8 +53,8 @@ Numeric decoder derived from TCL library npy_int64 get_nat(void) { return NPY_MIN_INT64; } -typedef char *(*PFN_PyTypeToUTF8)(JSOBJ obj, JSONTypeContext *ti, - size_t *_outLen); +typedef const char *(*PFN_PyTypeToUTF8)(JSOBJ obj, JSONTypeContext *ti, + size_t *_outLen); int object_is_decimal_type(PyObject *obj); int object_is_dataframe_type(PyObject *obj); @@ -106,7 +106,7 @@ typedef struct __TypeContext { double doubleValue; JSINT64 longValue; - char *cStr; + const char *cStr; NpyArrContext *npyarr; PdBlockContext *pdblock; int transpose; @@ -301,14 +301,15 @@ static npy_float64 total_seconds(PyObject *td) { return double_val; } -static char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc), - size_t *_outLen) { +static const char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc), + size_t *_outLen) { PyObject *obj = (PyObject *)_obj; *_outLen = PyBytes_GET_SIZE(obj); return PyBytes_AS_STRING(obj); } -static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, size_t *_outLen) { +static const char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, + size_t 
*_outLen) { char *encoded = (char *)PyUnicode_AsUTF8AndSize(_obj, (Py_ssize_t *)_outLen); if (encoded == NULL) { /* Something went wrong. @@ -321,8 +322,8 @@ static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, size_t *_outLen) { } /* JSON callback. returns a char* and mutates the pointer to *len */ -static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), - JSONTypeContext *tc, size_t *len) { +static const char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), + JSONTypeContext *tc, size_t *len) { NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; NPY_DATETIMEUNIT valueUnit = ((PyObjectEncoder *)tc->encoder)->valueUnit; GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, valueUnit, base, len); @@ -330,15 +331,15 @@ static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), } /* JSON callback. returns a char* and mutates the pointer to *len */ -static char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused), - JSONTypeContext *tc, size_t *len) { +static const char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused), + JSONTypeContext *tc, size_t *len) { GET_TC(tc)->cStr = int64ToIsoDuration(GET_TC(tc)->longValue, len); return GET_TC(tc)->cStr; } /* JSON callback */ -static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, - size_t *len) { +static const char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, + size_t *len) { if (!PyDate_Check(obj) && !PyDateTime_Check(obj)) { PyErr_SetString(PyExc_TypeError, "Expected date or datetime object"); ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; @@ -349,7 +350,8 @@ static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, return PyDateTimeToIso(obj, base, len); } -static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) { +static const char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, + size_t *outLen) { PyObject *obj = (PyObject *)_obj; PyObject *str = PyObject_CallMethod(obj, "isoformat", NULL); if (str == NULL) { @@ -373,8 +375,8 @@ static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) { return outValue; } -static char *PyDecimalToUTF8Callback(JSOBJ _obj, JSONTypeContext *tc, - size_t *len) { +static const char *PyDecimalToUTF8Callback(JSOBJ _obj, JSONTypeContext *tc, + size_t *len) { PyObject *obj = (PyObject *)_obj; PyObject *format_spec = PyUnicode_FromStringAndSize("f", 1); PyObject *str = PyObject_Format(obj, format_spec); @@ -558,10 +560,10 @@ static JSOBJ NpyArr_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *NpyArr_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *NpyArr_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { NpyArrContext *npyarr = GET_TC(tc)->npyarr; - char *cStr; + const char *cStr; if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { const npy_intp idx = npyarr->index[npyarr->stridedim] - 1; @@ -609,11 +611,11 @@ static int PdBlock_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { return NpyArr_iterNextItem(obj, tc); } -static char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *tc, size_t *outLen) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; - char *cStr; + const char *cStr; if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) { const npy_intp idx = blkCtxt->colIdx - 1; @@ -631,12 +633,12 @@ static char 
*PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, return cStr; } -static char *PdBlock_iterGetName_Transpose(JSOBJ Py_UNUSED(obj), - JSONTypeContext *tc, - size_t *outLen) { +static const char *PdBlock_iterGetName_Transpose(JSOBJ Py_UNUSED(obj), + JSONTypeContext *tc, + size_t *outLen) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; - char *cStr; + const char *cStr; if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { const npy_intp idx = npyarr->index[npyarr->stridedim] - 1; @@ -817,9 +819,9 @@ static JSOBJ Tuple_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Tuple_iterGetName(JSOBJ Py_UNUSED(obj), - JSONTypeContext *Py_UNUSED(tc), - size_t *Py_UNUSED(outLen)) { +static const char *Tuple_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { return NULL; } @@ -864,9 +866,9 @@ static JSOBJ Set_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Set_iterGetName(JSOBJ Py_UNUSED(obj), - JSONTypeContext *Py_UNUSED(tc), - size_t *Py_UNUSED(outLen)) { +static const char *Set_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { return NULL; } @@ -962,8 +964,8 @@ static JSOBJ Dir_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Dir_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *Dir_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); return PyBytes_AS_STRING(GET_TC(tc)->itemName); } @@ -994,9 +996,9 @@ static JSOBJ List_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *List_iterGetName(JSOBJ Py_UNUSED(obj), - JSONTypeContext *Py_UNUSED(tc), - size_t *Py_UNUSED(outLen)) { +static const char *List_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { return NULL; } @@ -1005,24 +1007,16 @@ static char *List_iterGetName(JSOBJ Py_UNUSED(obj), //============================================================================= static void Index_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20); - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - } } static int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) { - if (!GET_TC(tc)->cStr) { - return 0; - } - const Py_ssize_t index = GET_TC(tc)->index; Py_XDECREF(GET_TC(tc)->itemValue); if (index == 0) { - memcpy(GET_TC(tc)->cStr, "name", 5); + GET_TC(tc)->cStr = "name"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); } else if (index == 1) { - memcpy(GET_TC(tc)->cStr, "data", 5); + GET_TC(tc)->cStr = "data"; GET_TC(tc)->itemValue = get_values(obj); if (!GET_TC(tc)->itemValue) { return 0; @@ -1042,8 +1036,8 @@ static JSOBJ Index_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } @@ -1054,28 +1048,20 @@ static char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, static void Series_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { 
PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20); enc->outputFormat = VALUES; // for contained series - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - } } static int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) { - if (!GET_TC(tc)->cStr) { - return 0; - } - const Py_ssize_t index = GET_TC(tc)->index; Py_XDECREF(GET_TC(tc)->itemValue); if (index == 0) { - memcpy(GET_TC(tc)->cStr, "name", 5); + GET_TC(tc)->cStr = "name"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); } else if (index == 1) { - memcpy(GET_TC(tc)->cStr, "index", 6); + GET_TC(tc)->cStr = "index"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); } else if (index == 2) { - memcpy(GET_TC(tc)->cStr, "data", 5); + GET_TC(tc)->cStr = "data"; GET_TC(tc)->itemValue = get_values(obj); if (!GET_TC(tc)->itemValue) { return 0; @@ -1097,8 +1083,8 @@ static JSOBJ Series_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } @@ -1109,28 +1095,20 @@ static char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, static void DataFrame_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20); enc->outputFormat = VALUES; // for contained series & index - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - } } static int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { - if (!GET_TC(tc)->cStr) { - return 0; - } - const Py_ssize_t index = GET_TC(tc)->index; Py_XDECREF(GET_TC(tc)->itemValue); if (index == 0) { - memcpy(GET_TC(tc)->cStr, "columns", 8); + GET_TC(tc)->cStr = "columns"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns"); } else if (index == 1) { - memcpy(GET_TC(tc)->cStr, "index", 6); + GET_TC(tc)->cStr = "index"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); } else if (index == 2) { - memcpy(GET_TC(tc)->cStr, "data", 5); + GET_TC(tc)->cStr = "data"; Py_INCREF(obj); GET_TC(tc)->itemValue = obj; } else { @@ -1150,8 +1128,8 @@ static JSOBJ DataFrame_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *DataFrame_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *DataFrame_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *tc, size_t *outLen) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } @@ -1201,8 +1179,8 @@ static JSOBJ Dict_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Dict_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *Dict_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); return PyBytes_AS_STRING(GET_TC(tc)->itemName); } @@ -1902,7 +1880,6 @@ static void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { GET_TC(tc)->rowLabels = NULL; NpyArr_freeLabels(GET_TC(tc)->columnLabels, GET_TC(tc)->columnLabelsLen); GET_TC(tc)->columnLabels = NULL; - PyObject_Free(GET_TC(tc)->cStr); GET_TC(tc)->cStr = NULL; PyObject_Free(tc->prv); tc->prv = NULL; @@ -1953,8 +1930,8 @@ static JSOBJ 
Object_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { return GET_TC(tc)->iterGetValue(obj, tc); } -static char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, - size_t *outLen) { +static const char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, + size_t *outLen) { return GET_TC(tc)->iterGetName(obj, tc, outLen); } diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index ff7d34c85c015..953a9246da1cd 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -41,6 +41,7 @@ def test_read_zipped_json(datapath): @td.skip_if_not_us_locale @pytest.mark.single_cpu +@pytest.mark.network def test_with_s3_url(compression, s3_public_bucket, s3so): # Bucket created in tests/io/conftest.py df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}')) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 59997d52179e6..5dc1272880c9b 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1412,6 +1412,7 @@ def test_read_inline_jsonl(self): tm.assert_frame_equal(result, expected) @pytest.mark.single_cpu + @pytest.mark.network @td.skip_if_not_us_locale def test_read_s3_jsonl(self, s3_public_bucket_with_data, s3so): # GH17200 @@ -2011,6 +2012,7 @@ def test_json_multiindex(self): assert result == expected @pytest.mark.single_cpu + @pytest.mark.network def test_to_s3(self, s3_public_bucket, s3so): # GH 28375 mock_bucket_name, target_file = s3_public_bucket.name, "test.json" From 8ea7c5609b537a864626f415fecda17537e6748d Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Thu, 16 Jan 2025 16:22:09 -0500 Subject: [PATCH 11/67] DOC: fix PR07,SA01 for pandas.arrays.ArrowExtensionArray (#60724) --- ci/code_checks.sh | 1 - pandas/core/arrays/arrow/array.py | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ec6dba05b2b0e..948d8bee8ba5b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -79,7 +79,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.min PR02" \ -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.tzinfo GL08" \ - -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ -i "pandas.arrays.TimedeltaArray PR07,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 900548a239c8e..5c32b05868383 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -258,6 +258,7 @@ class ArrowExtensionArray( Parameters ---------- values : pyarrow.Array or pyarrow.ChunkedArray + The input data to initialize the ArrowExtensionArray. Attributes ---------- @@ -271,6 +272,12 @@ class ArrowExtensionArray( ------- ArrowExtensionArray + See Also + -------- + array : Create a Pandas array with a specified dtype. + DataFrame.to_feather : Write a DataFrame to the binary Feather format. + read_feather : Load a feather-format object from the file path. + Notes ----- Most methods are implemented using `pyarrow compute functions. 
`__ From 50767f803ff5e98be5c569fe442b670b1ffe5180 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Thu, 16 Jan 2025 18:01:22 -0800 Subject: [PATCH 12/67] DOC: Update doc for newly added groupby method kurt (#60725) --- doc/source/reference/groupby.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst index 3b02ffe20c10e..fc180c8161a7e 100644 --- a/doc/source/reference/groupby.rst +++ b/doc/source/reference/groupby.rst @@ -104,6 +104,7 @@ Function application DataFrameGroupBy.shift DataFrameGroupBy.size DataFrameGroupBy.skew + DataFrameGroupBy.kurt DataFrameGroupBy.std DataFrameGroupBy.sum DataFrameGroupBy.var @@ -159,6 +160,7 @@ Function application SeriesGroupBy.shift SeriesGroupBy.size SeriesGroupBy.skew + SeriesGroupBy.kurt SeriesGroupBy.std SeriesGroupBy.sum SeriesGroupBy.var From 72fd708761f1598f1a8ce9b693529b81fd8ca252 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Thu, 16 Jan 2025 18:10:41 -0800 Subject: [PATCH 13/67] ENH: Add first and last aggregations to Rolling and Expanding (#60579) * ENH: Add first and last aggregations to Rolling and Expanding * Update reference doc * Set 'See Also' section in doc * Fix docstring * Retry fixing docstring * Fix missing period in docstring * Another missing period --- doc/source/reference/window.rst | 4 + doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_libs/window/aggregations.pyi | 12 +++ pandas/_libs/window/aggregations.pyx | 83 +++++++++++++++ pandas/core/window/expanding.py | 72 +++++++++++++ pandas/core/window/rolling.py | 88 +++++++++++++++ .../tests/window/test_cython_aggregations.py | 2 + pandas/tests/window/test_expanding.py | 100 ++++++++++++++++++ pandas/tests/window/test_groupby.py | 4 +- pandas/tests/window/test_rolling.py | 76 +++++++++++++ pandas/tests/window/test_rolling_functions.py | 4 + pandas/tests/window/test_timeseries_window.py | 38 +++++++ 12 files changed, 483 insertions(+), 1 deletion(-) diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst index fb89fd2a5ffb2..2aeb57faac112 100644 --- a/doc/source/reference/window.rst +++ b/doc/source/reference/window.rst @@ -30,6 +30,8 @@ Rolling window functions Rolling.std Rolling.min Rolling.max + Rolling.first + Rolling.last Rolling.corr Rolling.cov Rolling.skew @@ -72,6 +74,8 @@ Expanding window functions Expanding.std Expanding.min Expanding.max + Expanding.first + Expanding.last Expanding.corr Expanding.cov Expanding.skew diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index b3df52fe1758a..1e33971acac1a 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -57,6 +57,7 @@ Other enhancements - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) +- :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`) - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) - 
:meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi index a6cfbec9b15b9..ee735761e3dc6 100644 --- a/pandas/_libs/window/aggregations.pyi +++ b/pandas/_libs/window/aggregations.pyi @@ -60,6 +60,18 @@ def roll_min( end: np.ndarray, # np.ndarray[np.int64] minp: int, # int64_t ) -> np.ndarray: ... # np.ndarray[float] +def roll_first( + values: np.ndarray, # np.ndarray[np.float64] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... # np.ndarray[float] +def roll_last( + values: np.ndarray, # np.ndarray[np.float64] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... # np.ndarray[float] def roll_quantile( values: np.ndarray, # const float64_t[:] start: np.ndarray, # np.ndarray[np.int64] diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 5b9ee095d4643..d33c840371d2a 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1133,6 +1133,89 @@ cdef _roll_min_max(ndarray[float64_t] values, return output +# ---------------------------------------------------------------------- +# Rolling first, last + + +def roll_first(const float64_t[:] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp) -> np.ndarray: + return _roll_first_last(values, start, end, minp, is_first=1) + + +def roll_last(const float64_t[:] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp) -> np.ndarray: + return _roll_first_last(values, start, end, minp, is_first=0) + + +cdef _roll_first_last(const float64_t[:] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, bint is_first): + cdef: + Py_ssize_t i, j, fl_idx + bint is_monotonic_increasing_bounds + int64_t nobs = 0, N = len(start), s, e + float64_t val, res + ndarray[float64_t] output + + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( + start, end + ) + + output = np.empty(N, dtype=np.float64) + + if (end - start).max() == 0: + output[:] = NaN + return output + + with nogil: + for i in range(0, N): + s = start[i] + e = end[i] + + if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]: + fl_idx = -1 + nobs = 0 + for j in range(s, e): + val = values[j] + if val == val: + if not is_first or fl_idx < s: + fl_idx = j + nobs += 1 + else: + # handle deletes + for j in range(start[i - 1], s): + val = values[j] + if val == val: + nobs -= 1 + + # update fl_idx if out of range, if first + if is_first and fl_idx < s: + fl_idx = -1 + for j in range(s, end[i - 1]): + val = values[j] + if val == val: + fl_idx = j + break + + # handle adds + for j in range(end[i - 1], e): + val = values[j] + if val == val: + if not is_first or fl_idx < s: + fl_idx = j + nobs += 1 + + if nobs >= minp and fl_idx >= s: + res = values[fl_idx] + else: + res = NaN + + output[i] = res + 
+ if not is_monotonic_increasing_bounds: + nobs = 0 + + return output + cdef enum InterpolationType: LINEAR, diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 6a7d0329ab6da..81c89e1ef5428 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -723,6 +723,78 @@ def skew(self, numeric_only: bool = False): def kurt(self, numeric_only: bool = False): return super().kurt(numeric_only=numeric_only) + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + dedent( + """ + GroupBy.first : Similar method for GroupBy objects. + Expanding.last : Method to get the last element in each window.\n + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + The example below will show an expanding calculation with a window size of + three. + + >>> s = pd.Series(range(5)) + >>> s.expanding(3).first() + 0 NaN + 1 NaN + 2 0.0 + 3 0.0 + 4 0.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="expanding", + aggregation_description="First (left-most) element of the window", + agg_method="first", + ) + def first(self, numeric_only: bool = False): + return super().first(numeric_only=numeric_only) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + dedent( + """ + GroupBy.last : Similar method for GroupBy objects. + Expanding.first : Method to get the first element in each window.\n + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + The example below will show an expanding calculation with a window size of + three. + + >>> s = pd.Series(range(5)) + >>> s.expanding(3).last() + 0 NaN + 1 NaN + 2 2.0 + 3 3.0 + 4 4.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="expanding", + aggregation_description="Last (right-most) element of the window", + agg_method="last", + ) + def last(self, numeric_only: bool = False): + return super().last(numeric_only=numeric_only) + @doc( template_header, create_section_header("Parameters"), diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 90c3cff975ff0..631ab15464942 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1740,6 +1740,22 @@ def kurt(self, numeric_only: bool = False): numeric_only=numeric_only, ) + def first(self, numeric_only: bool = False): + window_func = window_aggregations.roll_first + return self._apply( + window_func, + name="first", + numeric_only=numeric_only, + ) + + def last(self, numeric_only: bool = False): + window_func = window_aggregations.roll_last + return self._apply( + window_func, + name="last", + numeric_only=numeric_only, + ) + def quantile( self, q: float, @@ -2622,6 +2638,78 @@ def sem(self, ddof: int = 1, numeric_only: bool = False): def kurt(self, numeric_only: bool = False): return super().kurt(numeric_only=numeric_only) + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + dedent( + """ + GroupBy.first : Similar method for GroupBy objects. 
+ Rolling.last : Method to get the last element in each window.\n + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + The example below will show a rolling calculation with a window size of + three. + + >>> s = pd.Series(range(5)) + >>> s.rolling(3).first() + 0 NaN + 1 NaN + 2 0.0 + 3 1.0 + 4 2.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="First (left-most) element of the window", + agg_method="first", + ) + def first(self, numeric_only: bool = False): + return super().first(numeric_only=numeric_only) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + dedent( + """ + GroupBy.last : Similar method for GroupBy objects. + Rolling.first : Method to get the first element in each window.\n + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + The example below will show a rolling calculation with a window size of + three. + + >>> s = pd.Series(range(5)) + >>> s.rolling(3).last() + 0 NaN + 1 NaN + 2 2.0 + 3 3.0 + 4 4.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="Last (right-most) element of the window", + agg_method="last", + ) + def last(self, numeric_only: bool = False): + return super().last(numeric_only=numeric_only) + @doc( template_header, create_section_header("Parameters"), diff --git a/pandas/tests/window/test_cython_aggregations.py b/pandas/tests/window/test_cython_aggregations.py index c60cb6ea74ec0..feb25a294c540 100644 --- a/pandas/tests/window/test_cython_aggregations.py +++ b/pandas/tests/window/test_cython_aggregations.py @@ -30,6 +30,8 @@ def _get_rolling_aggregations(): ("roll_median_c", window_aggregations.roll_median_c), ("roll_max", window_aggregations.roll_max), ("roll_min", window_aggregations.roll_min), + ("roll_first", window_aggregations.roll_first), + ("roll_last", window_aggregations.roll_last), ] + [ ( diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index b2f76bdd0e2ad..39cedc3b692da 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -451,6 +451,8 @@ def test_moment_functions_zero_length_pairwise(f): lambda x: x.expanding(min_periods=5).corr(x, pairwise=False), lambda x: x.expanding(min_periods=5).max(), lambda x: x.expanding(min_periods=5).min(), + lambda x: x.expanding(min_periods=5).first(), + lambda x: x.expanding(min_periods=5).last(), lambda x: x.expanding(min_periods=5).sum(), lambda x: x.expanding(min_periods=5).mean(), lambda x: x.expanding(min_periods=5).std(), @@ -596,6 +598,104 @@ def test_expanding_corr_pairwise_diff_length(): tm.assert_frame_equal(result4, expected) +@pytest.mark.parametrize( + "values,method,expected", + [ + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "first", + [float("nan"), float("nan"), 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "first", + [ + float("nan"), + float("nan"), + float("nan"), + float("nan"), + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + ], + ), + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "last", + [float("nan"), float("nan"), 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "last", + [ + float("nan"), + float("nan"), + float("nan"), + 
float("nan"), + 5.0, + 5.0, + 7.0, + 7.0, + 9.0, + 9.0, + ], + ), + ], +) +def test_expanding_first_last(values, method, expected): + # GH#33155 + x = Series(values) + result = getattr(x.expanding(3), method)() + expected = Series(expected) + tm.assert_almost_equal(result, expected) + + x = DataFrame({"A": values}) + result = getattr(x.expanding(3), method)() + expected = DataFrame({"A": expected}) + tm.assert_almost_equal(result, expected) + + +@pytest.mark.parametrize( + "values,method,expected", + [ + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "first", + [1.0] * 10, + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "first", + [1.0] * 10, + ), + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "last", + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "last", + [1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0, 9.0], + ), + ], +) +def test_expanding_first_last_no_minp(values, method, expected): + # GH#33155 + x = Series(values) + result = getattr(x.expanding(min_periods=0), method)() + expected = Series(expected) + tm.assert_almost_equal(result, expected) + + x = DataFrame({"A": values}) + result = getattr(x.expanding(min_periods=0), method)() + expected = DataFrame({"A": expected}) + tm.assert_almost_equal(result, expected) + + def test_expanding_apply_args_kwargs(engine_and_raw): def mean_w_arg(x, const): return np.mean(x) + const diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index f53250378e33c..392239b8adadd 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -91,6 +91,8 @@ def test_getitem_multiple(self, roll_frame): "mean", "min", "max", + "first", + "last", "count", "kurt", "skew", @@ -1032,7 +1034,7 @@ def frame(self): return DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) @pytest.mark.parametrize( - "f", ["sum", "mean", "min", "max", "count", "kurt", "skew"] + "f", ["sum", "mean", "min", "max", "first", "last", "count", "kurt", "skew"] ) def test_expanding(self, f, frame): g = frame.groupby("A", group_keys=False) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index af3194b5085c4..2aaa35ec5ec2c 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1326,6 +1326,82 @@ def test_rolling_corr_timedelta_index(index, window): tm.assert_almost_equal(result, expected) +@pytest.mark.parametrize( + "values,method,expected", + [ + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "first", + [float("nan"), float("nan"), 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "first", + [float("nan")] * 10, + ), + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "last", + [float("nan"), float("nan"), 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "last", + [float("nan")] * 10, + ), + ], +) +def test_rolling_first_last(values, method, expected): + # GH#33155 + x = Series(values) + result = getattr(x.rolling(3), method)() + expected = Series(expected) + tm.assert_almost_equal(result, expected) + + x = DataFrame({"A": values}) + result = getattr(x.rolling(3), method)() + expected = DataFrame({"A": expected}) + tm.assert_almost_equal(result, expected) + + +@pytest.mark.parametrize( + "values,method,expected", + [ + ( + [1.0, 
2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "first", + [1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "first", + [1.0, 1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0], + ), + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "last", + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "last", + [1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0, 9.0], + ), + ], +) +def test_rolling_first_last_no_minp(values, method, expected): + # GH#33155 + x = Series(values) + result = getattr(x.rolling(3, min_periods=0), method)() + expected = Series(expected) + tm.assert_almost_equal(result, expected) + + x = DataFrame({"A": values}) + result = getattr(x.rolling(3, min_periods=0), method)() + expected = DataFrame({"A": expected}) + tm.assert_almost_equal(result, expected) + + def test_groupby_rolling_nan_included(): # GH 35542 data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]} diff --git a/pandas/tests/window/test_rolling_functions.py b/pandas/tests/window/test_rolling_functions.py index f77a98ae9a7d9..6820ab7332975 100644 --- a/pandas/tests/window/test_rolling_functions.py +++ b/pandas/tests/window/test_rolling_functions.py @@ -340,6 +340,8 @@ def test_center_reindex_frame(frame, roll_func, kwargs, minp, fill_value): lambda x: x.rolling(window=10, min_periods=5).var(), lambda x: x.rolling(window=10, min_periods=5).skew(), lambda x: x.rolling(window=10, min_periods=5).kurt(), + lambda x: x.rolling(window=10, min_periods=5).first(), + lambda x: x.rolling(window=10, min_periods=5).last(), lambda x: x.rolling(window=10, min_periods=5).quantile(q=0.5), lambda x: x.rolling(window=10, min_periods=5).median(), lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), @@ -501,6 +503,8 @@ def test_rolling_min_max_numeric_types(any_real_numpy_dtype): lambda x: x.rolling(window=10, min_periods=5).var(), lambda x: x.rolling(window=10, min_periods=5).skew(), lambda x: x.rolling(window=10, min_periods=5).kurt(), + lambda x: x.rolling(window=10, min_periods=5).first(), + lambda x: x.rolling(window=10, min_periods=5).last(), lambda x: x.rolling(window=10, min_periods=5).quantile(0.5), lambda x: x.rolling(window=10, min_periods=5).median(), lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index eacdaddfa28b0..043f369566a5d 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -541,6 +541,42 @@ def test_ragged_max(self, ragged): expected["B"] = [0.0, 1, 2, 3, 4] tm.assert_frame_equal(result, expected) + def test_ragged_first(self, ragged): + df = ragged + + result = df.rolling(window="1s", min_periods=1).first() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).first() + expected = df.copy() + expected["B"] = [0.0, 1, 1, 3, 3] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).first() + expected = df.copy() + expected["B"] = [0.0, 0, 0, 1, 1] + tm.assert_frame_equal(result, expected) + + def test_ragged_last(self, ragged): + df = ragged + + result = df.rolling(window="1s", min_periods=1).last() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, 
expected) + + result = df.rolling(window="2s", min_periods=1).last() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).last() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "freq, op, result_data", [ @@ -586,6 +622,8 @@ def test_freqs_ops(self, freq, op, result_data): "skew", "min", "max", + "first", + "last", ], ) def test_all(self, f, regular): From a4e814954b6f1c41528c071b028df62def7765c0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 17 Jan 2025 20:11:37 +0100 Subject: [PATCH 14/67] REGR: from_records not initializing subclasses properly (#60726) * REGR: from_records not initializing subclasses properly * Move whatsnew --- doc/source/whatsnew/v2.3.0.rst | 1 - doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/frame.py | 5 ++++- pandas/tests/frame/test_subclass.py | 7 +++++++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 9e0e095eb4de8..96eed72823e72 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -175,7 +175,6 @@ Other ^^^^^ - Fixed usage of ``inspect`` when the optional dependencies ``pyarrow`` or ``jinja2`` are not installed (:issue:`60196`) -- .. --------------------------------------------------------------------------- .. _whatsnew_230.contributors: diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 1e33971acac1a..102628257d6f2 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -812,6 +812,7 @@ Other - Bug in ``Series.list`` methods not preserving the original name. (:issue:`60522`) - Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`) - Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raised a ``ValueError`` (:issue:`60568`) +- Fixed regression in :meth:`DataFrame.from_records` not initializing subclasses properly (:issue:`57008`) .. 
***DO NOT USE THIS SECTION*** diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ffffaeba4196e..863465ca1565c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2317,7 +2317,10 @@ def maybe_reorder( columns = columns.drop(exclude) mgr = arrays_to_mgr(arrays, columns, result_index) - return cls._from_mgr(mgr, axes=mgr.axes) + df = DataFrame._from_mgr(mgr, axes=mgr.axes) + if cls is not DataFrame: + return cls(df, copy=False) + return df def to_records( self, index: bool = True, column_dtypes=None, index_dtypes=None diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 7d18ef28a722d..cbd563a03b908 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -769,6 +769,13 @@ def test_constructor_with_metadata(): assert isinstance(subset, MySubclassWithMetadata) +def test_constructor_with_metadata_from_records(): + # GH#57008 + df = MySubclassWithMetadata.from_records([{"a": 1, "b": 2}]) + assert df.my_metadata is None + assert type(df) is MySubclassWithMetadata + + class SimpleDataFrameSubClass(DataFrame): """A subclass of DataFrame that does not define a constructor.""" From 27baf4887a1b7d4f4a378c2f951cdc95fb1ab2b8 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 21 Jan 2025 12:18:58 -0500 Subject: [PATCH 15/67] DOC: fix ES01 for pandas.read_feather (#60746) --- pandas/io/feather_format.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 7b4c81853eba3..565c53f0f3fc5 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -78,6 +78,14 @@ def read_feather( """ Load a feather-format object from the file path. + Feather is particularly useful for scenarios that require efficient + serialization and deserialization of tabular data. It supports + schema preservation, making it a reliable choice for use cases + such as sharing data between Python and R, or persisting intermediate + results during data processing pipelines. This method provides additional + flexibility with options for selective column reading, thread parallelism, + and choosing the backend for data types. 
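As a quick illustration of the options this paragraph refers to — selective column reading, thread parallelism, and the dtype backend — here is a minimal sketch (the file name is made up, and pyarrow must be installed for Feather I/O):

>>> df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
>>> df.to_feather("data.feather")  # doctest: +SKIP
>>> pd.read_feather(
...     "data.feather", columns=["a"], use_threads=True, dtype_backend="pyarrow"
... )  # doctest: +SKIP

``columns`` restricts the read to the listed columns, ``use_threads`` controls thread parallelism, and ``dtype_backend`` chooses the backend for the resulting dtypes.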
+ Parameters ---------- path : str, path object, or file-like object From bbd6526461b6e9fc7783bd51298db5cb2ae0c679 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 21 Jan 2025 17:26:09 +0000 Subject: [PATCH 16/67] ENH: `pandas.api.interchange.from_dataframe` now uses the Arrow PyCapsule Interface if available, only falling back to the Dataframe Interchange Protocol if that fails (#60739) * add test for list dtype * catch arrowinvalid and keep raising runtimeerror * use rst hyperlink --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/interchange/from_dataframe.py | 16 +++++++++++++++- pandas/tests/interchange/test_impl.py | 14 +++++++++++--- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 102628257d6f2..8471630511e32 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -30,6 +30,7 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`) - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) +- :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface `_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`) - :class:`pandas.api.typing.NoDefault` is available for typing ``no_default`` - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 5c9b8ac8ea085..b990eca39b3dd 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -41,7 +41,9 @@ def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame: .. note:: For new development, we highly recommend using the Arrow C Data Interface - alongside the Arrow PyCapsule Interface instead of the interchange protocol + alongside the Arrow PyCapsule Interface instead of the interchange protocol. + From pandas 3.0 onwards, `from_dataframe` uses the PyCapsule Interface, + only falling back to the interchange protocol if that fails. .. 
warning:: @@ -90,6 +92,18 @@ def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame: if isinstance(df, pd.DataFrame): return df + if hasattr(df, "__arrow_c_stream__"): + try: + pa = import_optional_dependency("pyarrow", min_version="14.0.0") + except ImportError: + # fallback to _from_dataframe + pass + else: + try: + return pa.table(df).to_pandas(zero_copy_only=not allow_copy) + except pa.ArrowInvalid as e: + raise RuntimeError(e) from e + if not hasattr(df, "__dataframe__"): raise ValueError("`df` does not support __dataframe__") diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index b80b4b923c247..a41d7dec8b496 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -278,7 +278,7 @@ def test_empty_pyarrow(data): expected = pd.DataFrame(data) arrow_df = pa_from_dataframe(expected) result = from_dataframe(arrow_df) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_column_type=False) def test_multi_chunk_pyarrow() -> None: @@ -288,8 +288,7 @@ def test_multi_chunk_pyarrow() -> None: table = pa.table([n_legs], names=names) with pytest.raises( RuntimeError, - match="To join chunks a copy is required which is " - "forbidden by allow_copy=False", + match="Cannot do zero copy conversion into multi-column DataFrame block", ): pd.api.interchange.from_dataframe(table, allow_copy=False) @@ -641,3 +640,12 @@ def test_buffer_dtype_categorical( col = dfi.get_column_by_name("data") assert col.dtype == expected_dtype assert col.get_buffers()["data"][1] == expected_buffer_dtype + + +def test_from_dataframe_list_dtype(): + pa = pytest.importorskip("pyarrow", "14.0.0") + data = {"a": [[1, 2], [4, 5, 6]]} + tbl = pa.table(data) + result = from_dataframe(tbl) + expected = pd.DataFrame(data) + tm.assert_frame_equal(result, expected) From 297a19eeebe64b1df9abeedccd2bdbc9dbc94693 Mon Sep 17 00:00:00 2001 From: William Andrea <22385371+wjandrea@users.noreply.github.com> Date: Tue, 21 Jan 2025 13:27:50 -0400 Subject: [PATCH 17/67] DOC: Fix typo "numpy.ndarray.putmask" (#60731) Should be "numpy.putmask" --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 165fe109c4c94..e2f9c5e9868a9 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5348,7 +5348,7 @@ def putmask(self, mask, value) -> Index: See Also -------- - numpy.ndarray.putmask : Changes elements of an array + numpy.putmask : Changes elements of an array based on conditional and input values. Examples From 7234104f104883092a97474ac3eda98e8a5ea35c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9?= <99898527+grossardt@users.noreply.github.com> Date: Tue, 21 Jan 2025 11:29:26 -0600 Subject: [PATCH 18/67] DOC: Clarify deprecation warning for iloc (#60745) api-doc rewrite deprecation warning iloc --- pandas/core/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index e0bc0a23acd9f..656ee54cbc5d4 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -160,7 +160,7 @@ def iloc(self) -> _iLocIndexer: .. versionchanged:: 3.0 - Returning a tuple from a callable is deprecated. + Callables which return a tuple are deprecated as input. 
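A short sketch of what the reworded note is about (illustrative only): a callable passed to ``.iloc`` may return any otherwise-valid indexer, and it is specifically a callable that returns a tuple which is deprecated.

>>> df = pd.DataFrame({"A": [1, 2, 3]})
>>> df.iloc[lambda x: [0, 2]]  # callable returning a list: supported
   A
0  1
2  3
>>> df.iloc[lambda x: (0, 0)]  # callable returning a tuple: deprecated  # doctest: +SKIP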
``.iloc[]`` is primarily integer position based (from ``0`` to ``length-1`` of the axis), but may also be used with a boolean From 42bf3751a3b6354907c30f435717b9708b2661b4 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Tue, 21 Jan 2025 10:59:54 -0800 Subject: [PATCH 19/67] ENH: Support skipna parameter in GroupBy mean and sum (#60741) * ENH: Support skipna parameter in GroupBy mean and sum * Move numba tests to test_numba.py * Fix docstring and failing future string test --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_libs/groupby.pyi | 2 + pandas/_libs/groupby.pyx | 48 +++++++++- pandas/core/_numba/kernels/mean_.py | 3 +- pandas/core/_numba/kernels/sum_.py | 14 ++- pandas/core/groupby/groupby.py | 74 ++++++++++++++- pandas/tests/groupby/aggregate/test_numba.py | 17 ++++ pandas/tests/groupby/test_api.py | 10 +- pandas/tests/groupby/test_reductions.py | 96 +++++++++++++++++++- 9 files changed, 255 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 8471630511e32..fea269ac4555e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -60,6 +60,7 @@ Other enhancements - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) - :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`) - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) +- :meth:`.DataFrameGroupBy.mean`, :meth:`.DataFrameGroupBy.sum`, :meth:`.SeriesGroupBy.mean` and :meth:`.SeriesGroupBy.sum` now accept ``skipna`` parameter (:issue:`15675`) - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 34367f55d2bbb..e3909203d1f5a 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -66,6 +66,7 @@ def group_sum( result_mask: np.ndarray | None = ..., min_count: int = ..., is_datetimelike: bool = ..., + skipna: bool = ..., ) -> None: ... def group_prod( out: np.ndarray, # int64float_t[:, ::1] @@ -115,6 +116,7 @@ def group_mean( is_datetimelike: bool = ..., # bint mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., + skipna: bool = ..., ) -> None: ... 
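At the Python level, the ``skipna`` flag threaded through these signatures behaves as follows (a minimal sketch, assuming the usual ``import pandas as pd`` and ``import numpy as np``):

>>> df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 2.0]})
>>> df.groupby("key")["val"].sum()  # default skipna=True
key
a    1.0
b    2.0
Name: val, dtype: float64
>>> df.groupby("key")["val"].sum(skipna=False)  # NaN now propagates
key
a    NaN
b    2.0
Name: val, dtype: float64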
 def group_ohlc(
     out: np.ndarray,  # floatingintuint_t[:, ::1]
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 59bc59135a8ff..fd288dff01f32 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -700,13 +700,14 @@ def group_sum(
     uint8_t[:, ::1] result_mask=None,
     Py_ssize_t min_count=0,
     bint is_datetimelike=False,
+    bint skipna=True,
 ) -> None:
     """
     Only aggregates on axis=0 using Kahan summation
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        sum_t val, t, y
+        sum_t val, t, y, nan_val
         sum_t[:, ::1] sumx, compensation
         int64_t[:, ::1] nobs
         Py_ssize_t len_values = len(values), len_labels = len(labels)
@@ -722,6 +723,15 @@ def group_sum(
     compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
 
     N, K = (<object>values).shape
+    if uses_mask:
+        nan_val = 0
+    elif is_datetimelike:
+        nan_val = NPY_NAT
+    elif sum_t is int64_t or sum_t is uint64_t:
+        # This has no effect as int64 can't be nan. Setting to 0 to avoid type error
+        nan_val = 0
+    else:
+        nan_val = NAN
 
     with nogil(sum_t is not object):
         for i in range(N):
@@ -734,6 +744,16 @@ def group_sum(
             for j in range(K):
                 val = values[i, j]
 
+                if not skipna and (
+                    (uses_mask and result_mask[lab, j]) or
+                    (is_datetimelike and sumx[lab, j] == NPY_NAT) or
+                    _treat_as_na(sumx[lab, j], False)
+                ):
+                    # If sum is already NA, don't add to it. This is important for
+                    # datetimelike because adding a value to NPY_NAT may not result
+                    # in a NPY_NAT
+                    continue
+
                 if uses_mask:
                     isna_entry = mask[i, j]
                 else:
@@ -765,6 +785,11 @@ def group_sum(
                         # because of no gil
                         compensation[lab, j] = 0
                         sumx[lab, j] = t
+                elif not skipna:
+                    if uses_mask:
+                        result_mask[lab, j] = True
+                    else:
+                        sumx[lab, j] = nan_val
 
     _check_below_mincount(
         out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx
@@ -1100,6 +1125,7 @@ def group_mean(
     bint is_datetimelike=False,
     const uint8_t[:, ::1] mask=None,
     uint8_t[:, ::1] result_mask=None,
+    bint skipna=True,
 ) -> None:
     """
     Compute the mean per label given a label assignment for each value.
@@ -1125,6 +1151,8 @@ def group_mean(
         Mask of the input values.
     result_mask : ndarray[bool, ndim=2], optional
         Mask of the out array
+    skipna : bool, optional
+        If True, ignore nans in `values`.
 
     Notes
     -----
@@ -1168,6 +1196,16 @@ def group_mean(
             for j in range(K):
                 val = values[i, j]
 
+                if not skipna and (
+                    (uses_mask and result_mask[lab, j]) or
+                    (is_datetimelike and sumx[lab, j] == NPY_NAT) or
+                    _treat_as_na(sumx[lab, j], False)
+                ):
+                    # If sum is already NA, don't add to it. This is important for
+                    # datetimelike because adding a value to NPY_NAT may not result
+                    # in NPY_NAT
+                    continue
+
                 if uses_mask:
                     isna_entry = mask[i, j]
                 elif is_datetimelike:
@@ -1191,6 +1229,14 @@ def group_mean(
                         # because of no gil
                         compensation[lab, j] = 0.
sumx[lab, j] = t + elif not skipna: + # Set the nobs to 0 so that in case of datetimelike, + # dividing NPY_NAT by nobs may not result in a NPY_NAT + nobs[lab, j] = 0 + if uses_mask: + result_mask[lab, j] = True + else: + sumx[lab, j] = nan_val for i in range(ncounts): for j in range(K): diff --git a/pandas/core/_numba/kernels/mean_.py b/pandas/core/_numba/kernels/mean_.py index cc10bd003af7e..2b59ea2fe12a5 100644 --- a/pandas/core/_numba/kernels/mean_.py +++ b/pandas/core/_numba/kernels/mean_.py @@ -169,9 +169,10 @@ def grouped_mean( labels: npt.NDArray[np.intp], ngroups: int, min_periods: int, + skipna: bool, ) -> tuple[np.ndarray, list[int]]: output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum( - values, result_dtype, labels, ngroups + values, result_dtype, labels, ngroups, skipna ) # Post-processing, replace sums that don't satisfy min_periods diff --git a/pandas/core/_numba/kernels/sum_.py b/pandas/core/_numba/kernels/sum_.py index 76f4e22b43c4b..9f2e9541b31d0 100644 --- a/pandas/core/_numba/kernels/sum_.py +++ b/pandas/core/_numba/kernels/sum_.py @@ -165,6 +165,7 @@ def grouped_kahan_sum( result_dtype: np.dtype, labels: npt.NDArray[np.intp], ngroups: int, + skipna: bool, ) -> tuple[ np.ndarray, npt.NDArray[np.int64], np.ndarray, npt.NDArray[np.int64], np.ndarray ]: @@ -180,7 +181,15 @@ def grouped_kahan_sum( lab = labels[i] val = values[i] - if lab < 0: + if lab < 0 or np.isnan(output[lab]): + continue + + if not skipna and np.isnan(val): + output[lab] = np.nan + nobs_arr[lab] += 1 + comp_arr[lab] = np.nan + consecutive_counts[lab] = 1 + prev_vals[lab] = np.nan continue sum_x = output[lab] @@ -219,11 +228,12 @@ def grouped_sum( labels: npt.NDArray[np.intp], ngroups: int, min_periods: int, + skipna: bool, ) -> tuple[np.ndarray, list[int]]: na_pos = [] output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum( - values, result_dtype, labels, ngroups + values, result_dtype, labels, ngroups, skipna ) # Post-processing, replace sums that don't satisfy min_periods diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f4ba40e275a8d..f9059e6e8896f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -214,6 +214,61 @@ class providing the base-class of operations. {example} """ +_groupby_agg_method_skipna_engine_template = """ +Compute {fname} of group values. + +Parameters +---------- +numeric_only : bool, default {no} + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None``. + +min_count : int, default {mc} + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + +skipna : bool, default {s} + Exclude NA/null values. If the entire group is NA and ``skipna`` is + ``True``, the result will be NA. + + .. versionchanged:: 3.0.0 + +engine : str, default None {e} + * ``'cython'`` : Runs rolling apply through C-extensions from cython. + * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. + Only available when ``raw`` is set to ``True``. + * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` + +engine_kwargs : dict, default None {ek} + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. 
The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be + applied to both the ``func`` and the ``apply`` groupby aggregation. + +Returns +------- +Series or DataFrame + Computed {fname} of values within each group. + +See Also +-------- +SeriesGroupBy.min : Return the min of the group values. +DataFrameGroupBy.min : Return the min of the group values. +SeriesGroupBy.max : Return the max of the group values. +DataFrameGroupBy.max : Return the max of the group values. +SeriesGroupBy.sum : Return the sum of the group values. +DataFrameGroupBy.sum : Return the sum of the group values. + +Examples +-------- +{example} +""" + _pipe_template = """ Apply a ``func`` with arguments to this %(klass)s object and return its result. @@ -2091,6 +2146,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: def mean( self, numeric_only: bool = False, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -2106,6 +2162,12 @@ def mean( numeric_only no longer accepts ``None`` and defaults to ``False``. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 3.0.0 + engine : str, default None * ``'cython'`` : Runs the operation through C-extensions from cython. * ``'numba'`` : Runs the operation through JIT compiled code from numba. @@ -2172,12 +2234,16 @@ def mean( executor.float_dtype_mapping, engine_kwargs, min_periods=0, + skipna=skipna, ) else: result = self._cython_agg_general( "mean", - alt=lambda x: Series(x, copy=False).mean(numeric_only=numeric_only), + alt=lambda x: Series(x, copy=False).mean( + numeric_only=numeric_only, skipna=skipna + ), numeric_only=numeric_only, + skipna=skipna, ) return result.__finalize__(self.obj, method="groupby") @@ -2817,10 +2883,11 @@ def size(self) -> DataFrame | Series: @final @doc( - _groupby_agg_method_engine_template, + _groupby_agg_method_skipna_engine_template, fname="sum", no=False, mc=0, + s=True, e=None, ek=None, example=dedent( @@ -2862,6 +2929,7 @@ def sum( self, numeric_only: bool = False, min_count: int = 0, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -2873,6 +2941,7 @@ def sum( executor.default_dtype_mapping, engine_kwargs, min_periods=min_count, + skipna=skipna, ) else: # If we are grouping on categoricals we want unobserved categories to @@ -2884,6 +2953,7 @@ def sum( min_count=min_count, alias="sum", npfunc=np.sum, + skipna=skipna, ) return result diff --git a/pandas/tests/groupby/aggregate/test_numba.py b/pandas/tests/groupby/aggregate/test_numba.py index 15c1efe5fd1ff..ca265a1d1108b 100644 --- a/pandas/tests/groupby/aggregate/test_numba.py +++ b/pandas/tests/groupby/aggregate/test_numba.py @@ -186,6 +186,23 @@ def test_multifunc_numba_vs_cython_frame(agg_kwargs): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("func", ["sum", "mean"]) +def test_multifunc_numba_vs_cython_frame_noskipna(func): + pytest.importorskip("numba") + data = DataFrame( + { + 0: ["a", "a", "b", "b", "a"], + 1: [1.0, np.nan, 3.0, 4.0, 5.0], + 2: [1, 2, 3, 4, 5], + }, + columns=[0, 1, 2], + ) + grouped = data.groupby(0) + result = grouped.agg(func, skipna=False, engine="numba") + expected = grouped.agg(func, skipna=False, engine="cython") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "agg_kwargs,expected_func", [ diff --git 
a/pandas/tests/groupby/test_api.py b/pandas/tests/groupby/test_api.py index baec3ed1a5024..cc69de2581a79 100644 --- a/pandas/tests/groupby/test_api.py +++ b/pandas/tests/groupby/test_api.py @@ -176,7 +176,10 @@ def test_frame_consistency(groupby_func): elif groupby_func in ("max", "min"): exclude_expected = {"axis", "kwargs", "skipna"} exclude_result = {"min_count", "engine", "engine_kwargs"} - elif groupby_func in ("mean", "std", "sum", "var"): + elif groupby_func in ("sum", "mean"): + exclude_expected = {"axis", "kwargs"} + exclude_result = {"engine", "engine_kwargs"} + elif groupby_func in ("std", "var"): exclude_expected = {"axis", "kwargs", "skipna"} exclude_result = {"engine", "engine_kwargs"} elif groupby_func in ("median", "prod", "sem"): @@ -234,7 +237,10 @@ def test_series_consistency(request, groupby_func): elif groupby_func in ("max", "min"): exclude_expected = {"axis", "kwargs", "skipna"} exclude_result = {"min_count", "engine", "engine_kwargs"} - elif groupby_func in ("mean", "std", "sum", "var"): + elif groupby_func in ("sum", "mean"): + exclude_expected = {"axis", "kwargs"} + exclude_result = {"engine", "engine_kwargs"} + elif groupby_func in ("std", "var"): exclude_expected = {"axis", "kwargs", "skipna"} exclude_result = {"engine", "engine_kwargs"} elif groupby_func in ("median", "prod", "sem"): diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index a17200c123d22..1db12f05e821f 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -422,6 +422,98 @@ def test_mean_on_timedelta(): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "values, dtype, result_dtype", + [ + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "float64", "float64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "Float64", "Float64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "Int64", "Float64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "timedelta64[ns]", "timedelta64[ns]"), + ( + pd.to_datetime( + [ + "2019-05-09", + pd.NaT, + "2019-05-11", + "2019-05-12", + "2019-05-13", + "2019-05-14", + "2019-05-15", + "2019-05-16", + "2019-05-17", + "2019-05-18", + ] + ), + "datetime64[ns]", + "datetime64[ns]", + ), + ], +) +def test_mean_skipna(values, dtype, result_dtype, skipna): + # GH#15675 + df = DataFrame( + { + "val": values, + "cat": ["A", "B"] * 5, + } + ).astype({"val": dtype}) + # We need to recast the expected values to the result_dtype because + # Series.mean() changes the dtype to float64/object depending on the input dtype + expected = ( + df.groupby("cat")["val"] + .apply(lambda x: x.mean(skipna=skipna)) + .astype(result_dtype) + ) + result = df.groupby("cat")["val"].mean(skipna=skipna) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "values, dtype", + [ + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "float64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "Float64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "Int64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "timedelta64[ns]"), + ], +) +def test_sum_skipna(values, dtype, skipna): + # GH#15675 + df = DataFrame( + { + "val": values, + "cat": ["A", "B"] * 5, + } + ).astype({"val": dtype}) + # We need to recast the expected values to the original dtype because + # Series.sum() changes the dtype + expected = ( + df.groupby("cat")["val"].apply(lambda x: x.sum(skipna=skipna)).astype(dtype) + ) + result = df.groupby("cat")["val"].sum(skipna=skipna) + tm.assert_series_equal(result, expected) + + +def test_sum_skipna_object(skipna): + # GH#15675 + df = 
DataFrame( + { + "val": ["a", "b", np.nan, "d", "e", "f", "g", "h", "i", "j"], + "cat": ["A", "B"] * 5, + } + ).astype({"val": object}) + if skipna: + expected = Series( + ["aegi", "bdfhj"], index=pd.Index(["A", "B"], name="cat"), name="val" + ).astype(object) + else: + expected = Series( + [np.nan, "bdfhj"], index=pd.Index(["A", "B"], name="cat"), name="val" + ).astype(object) + result = df.groupby("cat")["val"].sum(skipna=skipna) + tm.assert_series_equal(result, expected) + + def test_cython_median(): arr = np.random.default_rng(2).standard_normal(1000) arr[::2] = np.nan @@ -1128,8 +1220,8 @@ def test_regression_allowlist_methods(op, skipna, sort): grouped = frame.groupby(level=0, sort=sort) - if op in ["skew", "kurt"]: - # skew and kurt have skipna + if op in ["skew", "kurt", "sum", "mean"]: + # skew, kurt, sum, mean have skipna result = getattr(grouped, op)(skipna=skipna) expected = frame.groupby(level=0).apply(lambda h: getattr(h, op)(skipna=skipna)) if sort: From 31704e3a69a65f0508c40c4881453757334c217e Mon Sep 17 00:00:00 2001 From: Pranav Raghu <73378019+Impaler343@users.noreply.github.com> Date: Wed, 22 Jan 2025 00:41:34 +0530 Subject: [PATCH 20/67] DOC: Add line clarifying sorting using sort_values() (#60734) fix docs --- pandas/core/frame.py | 3 ++- pandas/core/generic.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 863465ca1565c..af66bb54610f1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6890,7 +6890,8 @@ def sort_values( builtin :meth:`sorted` function, with the notable difference that this `key` function should be *vectorized*. It should expect a ``Series`` and return a Series with the same shape as the input. - It will be applied to each column in `by` independently. + It will be applied to each column in `by` independently. The values in the + returned Series will be used as the keys for sorting. Returns ------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index de7fb3682fb4f..e0a4f9d9c546a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4884,7 +4884,8 @@ def sort_values( builtin :meth:`sorted` function, with the notable difference that this `key` function should be *vectorized*. It should expect a ``Series`` and return a Series with the same shape as the input. - It will be applied to each column in `by` independently. + It will be applied to each column in `by` independently. The values in the + returned Series will be used as the keys for sorting. 
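For example, a minimal sketch mirroring the documented ``key`` semantics:

>>> df = pd.DataFrame({"a": ["B", "a", "C"]})
>>> df.sort_values("a")  # default byte-wise order puts uppercase first
   a
0  B
2  C
1  a
>>> df.sort_values("a", key=lambda col: col.str.lower())
   a
1  a
0  B
2  C

The lowercased Series returned by ``key`` supplies the sort keys, while the original values appear in the result.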
Returns ------- From b98336653128790661d4c66d398f3e44d481dd3b Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 21 Jan 2025 11:18:40 -0800 Subject: [PATCH 21/67] CI: Test Github Actions Arm64 Runners (#60722) * CI: Test Github Actions Arm64 Runners * try using platform as cache key * fixes * add platform for includes jobs --- .github/workflows/unit-tests.yml | 16 +++++++++++++--- .github/workflows/wheels.yml | 1 + 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 899b49cc4eff5..e0fa7f7421f13 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -22,10 +22,11 @@ defaults: jobs: ubuntu: - runs-on: ubuntu-22.04 + runs-on: ${{ matrix.platform }} timeout-minutes: 90 strategy: matrix: + platform: [ubuntu-22.04, ubuntu-24.04-arm] env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml] # Prevent the include jobs from overriding other jobs pattern: [""] @@ -35,9 +36,11 @@ jobs: env_file: actions-311-downstream_compat.yaml pattern: "not slow and not network and not single_cpu" pytest_target: "pandas/tests/test_downstream.py" + platform: ubuntu-22.04 - name: "Minimum Versions" env_file: actions-310-minimum_versions.yaml pattern: "not slow and not network and not single_cpu" + platform: ubuntu-22.04 - name: "Locale: it_IT" env_file: actions-311.yaml pattern: "not slow and not network and not single_cpu" @@ -48,6 +51,7 @@ jobs: # Also install it_IT (its encoding is ISO8859-1) but do not activate it. # It will be temporarily activated during tests with locale.setlocale extra_loc: "it_IT" + platform: ubuntu-22.04 - name: "Locale: zh_CN" env_file: actions-311.yaml pattern: "not slow and not network and not single_cpu" @@ -58,25 +62,31 @@ jobs: # Also install zh_CN (its encoding is gb2312) but do not activate it. 
# It will be temporarily activated during tests with locale.setlocale extra_loc: "zh_CN" + platform: ubuntu-22.04 - name: "Future infer strings" env_file: actions-312.yaml pandas_future_infer_string: "1" + platform: ubuntu-22.04 - name: "Future infer strings (without pyarrow)" env_file: actions-311.yaml pandas_future_infer_string: "1" + platform: ubuntu-22.04 - name: "Pypy" env_file: actions-pypy-39.yaml pattern: "not slow and not network and not single_cpu" test_args: "--max-worker-restart 0" + platform: ubuntu-22.04 - name: "Numpy Dev" env_file: actions-311-numpydev.yaml pattern: "not slow and not network and not single_cpu" test_args: "-W error::DeprecationWarning -W error::FutureWarning" + platform: ubuntu-22.04 - name: "Pyarrow Nightly" env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" + platform: ubuntu-22.04 fail-fast: false - name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }} + name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }}-${{ matrix.platform }} env: PATTERN: ${{ matrix.pattern }} LANG: ${{ matrix.lang || 'C.UTF-8' }} @@ -91,7 +101,7 @@ jobs: REMOVE_PYARROW: ${{ matrix.name == 'Future infer strings (without pyarrow)' && '1' || '0' }} concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_future_infer_string }} + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_future_infer_string }}-${{ matrix.platform }} cancel-in-progress: true services: diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 3314e645509d1..a4c2a732f9fc8 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -94,6 +94,7 @@ jobs: buildplat: - [ubuntu-22.04, manylinux_x86_64] - [ubuntu-22.04, musllinux_x86_64] + - [ubuntu-24.04-arm, manylinux_aarch64] - [macos-13, macosx_x86_64] # Note: M1 images on Github Actions start from macOS 14 - [macos-14, macosx_arm64] From 14dcb7b330b8ee5fb22e971807dd85df25c6ef2c Mon Sep 17 00:00:00 2001 From: William Ayd Date: Tue, 21 Jan 2025 17:46:21 -0500 Subject: [PATCH 22/67] Fix group_sum NaN comparison warnings (#60749) --- pandas/_libs/groupby.pyx | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index fd288dff01f32..70af22f514ce0 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -712,7 +712,7 @@ def group_sum( int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) bint uses_mask = mask is not None - bint isna_entry + bint isna_entry, isna_result if len_values != len_labels: raise ValueError("len(index) != len(labels)") @@ -744,20 +744,18 @@ def group_sum( for j in range(K): val = values[i, j] - if not skipna and ( - (uses_mask and result_mask[lab, j]) or - (is_datetimelike and sumx[lab, j] == NPY_NAT) or - _treat_as_na(sumx[lab, j], False) - ): - # If sum is already NA, don't add to it. 
This is important for
-            # datetimelike because adding a value to NPY_NAT may not result
-            # in a NPY_NAT
-            continue
-
         if uses_mask:
             isna_entry = mask[i, j]
+            isna_result = result_mask[lab, j]
         else:
             isna_entry = _treat_as_na(val, is_datetimelike)
+            isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)
+
+            if not skipna and isna_result:
+                # If sum is already NA, don't add to it. This is important for
+                # datetimelike because adding a value to NPY_NAT may not result
+                # in a NPY_NAT
+                continue

             if not isna_entry:
                 nobs[lab, j] += 1

From ae42f3e1c1b7af55059921b41fd61710fe2dd785 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 21 Jan 2025 17:06:56 -0800
Subject: [PATCH 23/67] CI: Rename ubuntu unit test jobs (#60751)

---
 .github/workflows/unit-tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index e0fa7f7421f13..fe9ec7f40a54b 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -86,7 +86,7 @@ jobs:
           pattern: "not slow and not network and not single_cpu"
           platform: ubuntu-22.04
       fail-fast: false
-    name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }}-${{ matrix.platform }}
+    name: ${{ matrix.name || format('{0} {1}', matrix.platform, matrix.env_file) }}
    env:
      PATTERN: ${{ matrix.pattern }}
      LANG: ${{ matrix.lang || 'C.UTF-8' }}

From 5efac8250787414ec580f0472e2b563032ec7d53 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Wed, 22 Jan 2025 03:07:08 +0100
Subject: [PATCH 24/67] Update PyArrow conversion and arrow/parquet tests for pyarrow 19.0 (#60716)

* Update PyArrow conversion and arrow/parquet tests for pyarrow 19.0
* update pypi index
* extra filterwarnings
* more test updates
* temp enable infer_string option
* Adapt test_get_handle_pyarrow_compat for pyarrow 19
* Use pa_version_under19p0 in test_get_handle_pyarrow_compat
* Adjust test_string_inference for using_infer_string
* Fix test_string_inference for feather

---------

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
---
 .github/workflows/unit-tests.yml           |  1 +
 ci/deps/actions-311-pyarrownightly.yaml    |  2 +-
 pandas/compat/__init__.py                  |  2 +
 pandas/compat/pyarrow.py                   |  2 +
 pandas/io/_util.py                         | 10 +++-
 pandas/tests/arrays/string_/test_string.py | 22 +++++++-
 pandas/tests/io/test_common.py             |  5 +-
 pandas/tests/io/test_feather.py            | 18 +++++-
 pandas/tests/io/test_parquet.py            | 65 ++++++++++++++--------
 9 files changed, 96 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index fe9ec7f40a54b..d2e2a170a1d04 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -84,6 +84,7 @@ jobs:
          - name: "Pyarrow Nightly"
            env_file: actions-311-pyarrownightly.yaml
            pattern: "not slow and not network and not single_cpu"
+           pandas_future_infer_string: "1"
            platform: ubuntu-22.04
      fail-fast: false
    name: ${{ matrix.name || format('{0} {1}', matrix.platform, matrix.env_file) }}

diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml
index 22e4907e5a6e5..2d3d11c294e12 100644
--- a/ci/deps/actions-311-pyarrownightly.yaml
+++ b/ci/deps/actions-311-pyarrownightly.yaml
@@ -23,7 +23,7 @@ dependencies:
  - pip:
    - "tzdata>=2022.7"
-    - "--extra-index-url https://pypi.fury.io/arrow-nightlies/"
+    - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple"
    - "--prefer-binary"
    - "--pre"
    - 
"pyarrow" diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index e7674386408f7..138456f877c5f 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -34,6 +34,7 @@ pa_version_under16p0, pa_version_under17p0, pa_version_under18p0, + pa_version_under19p0, ) if TYPE_CHECKING: @@ -166,4 +167,5 @@ def is_ci_environment() -> bool: "pa_version_under16p0", "pa_version_under17p0", "pa_version_under18p0", + "pa_version_under19p0", ] diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index bd009b544f31e..c501c06b93813 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -18,6 +18,7 @@ pa_version_under16p0 = _palv < Version("16.0.0") pa_version_under17p0 = _palv < Version("17.0.0") pa_version_under18p0 = _palv < Version("18.0.0") + pa_version_under19p0 = _palv < Version("19.0.0") HAS_PYARROW = True except ImportError: pa_version_under10p1 = True @@ -30,4 +31,5 @@ pa_version_under16p0 = True pa_version_under17p0 = True pa_version_under18p0 = True + pa_version_under19p0 = True HAS_PYARROW = False diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 9778a404e23e0..6827fbe9c998e 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -10,7 +10,10 @@ from pandas._config import using_string_dtype from pandas._libs import lib -from pandas.compat import pa_version_under18p0 +from pandas.compat import ( + pa_version_under18p0, + pa_version_under19p0, +) from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -77,7 +80,10 @@ def arrow_table_to_pandas( elif dtype_backend == "pyarrow": types_mapper = pd.ArrowDtype elif using_string_dtype(): - types_mapper = _arrow_string_types_mapper() + if pa_version_under19p0: + types_mapper = _arrow_string_types_mapper() + else: + types_mapper = None elif dtype_backend is lib.no_default or dtype_backend == "numpy": types_mapper = None else: diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index a32ac7db4656a..f875873863b4d 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -10,7 +10,10 @@ from pandas._config import using_string_dtype -from pandas.compat.pyarrow import pa_version_under12p0 +from pandas.compat.pyarrow import ( + pa_version_under12p0, + pa_version_under19p0, +) from pandas.core.dtypes.common import is_dtype_equal @@ -539,7 +542,7 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string): assert table.field("a").type == "large_string" with pd.option_context("string_storage", string_storage): result = table.to_pandas() - if dtype.na_value is np.nan and not using_string_dtype(): + if dtype.na_value is np.nan and not using_infer_string: assert result["a"].dtype == "object" else: assert isinstance(result["a"].dtype, pd.StringDtype) @@ -553,6 +556,21 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string): assert result.loc[2, "a"] is result["a"].dtype.na_value +@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") +def test_arrow_from_string(using_infer_string): + # not roundtrip, but starting with pyarrow table without pandas metadata + pa = pytest.importorskip("pyarrow") + table = pa.table({"a": pa.array(["a", "b", None], type=pa.string())}) + + result = table.to_pandas() + + if using_infer_string and not pa_version_under19p0: + expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="str") + else: + expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="object") + tm.assert_frame_equal(result, 
expected) + + @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): # GH-41040 diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 7ff3d24336f00..e162815271ab3 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -23,6 +23,7 @@ WASM, is_platform_windows, ) +from pandas.compat.pyarrow import pa_version_under19p0 import pandas.util._test_decorators as td import pandas as pd @@ -152,8 +153,8 @@ def test_get_handle_pyarrow_compat(self): s = StringIO(data) with icom.get_handle(s, "rb", is_text=False) as handles: df = pa_csv.read_csv(handles.handle).to_pandas() - # TODO will have to update this when pyarrow' to_pandas() is fixed - expected = expected.astype("object") + if pa_version_under19p0: + expected = expected.astype("object") tm.assert_frame_equal(df, expected) assert not s.closed diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 69354066dd5ef..24af0a014dd50 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -6,7 +6,10 @@ import numpy as np import pytest -from pandas.compat.pyarrow import pa_version_under18p0 +from pandas.compat.pyarrow import ( + pa_version_under18p0, + pa_version_under19p0, +) import pandas as pd import pandas._testing as tm @@ -239,16 +242,27 @@ def test_invalid_dtype_backend(self): with pytest.raises(ValueError, match=msg): read_feather(path, dtype_backend="numpy") - def test_string_inference(self, tmp_path): + def test_string_inference(self, tmp_path, using_infer_string): # GH#54431 path = tmp_path / "test_string_inference.p" df = pd.DataFrame(data={"a": ["x", "y"]}) df.to_feather(path) with pd.option_context("future.infer_string", True): result = read_feather(path) + dtype = pd.StringDtype(na_value=np.nan) expected = pd.DataFrame( data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan) ) + expected = pd.DataFrame( + data={"a": ["x", "y"]}, + dtype=dtype, + columns=pd.Index( + ["a"], + dtype=object + if pa_version_under19p0 and not using_infer_string + else dtype, + ), + ) tm.assert_frame_equal(result, expected) @pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0") diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 7919bb956dc7a..91580c31ea081 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -17,6 +17,7 @@ pa_version_under13p0, pa_version_under15p0, pa_version_under17p0, + pa_version_under19p0, ) import pandas as pd @@ -254,8 +255,10 @@ def test_invalid_engine(df_compat): check_round_trip(df_compat, "foo", "bar") -def test_options_py(df_compat, pa): +def test_options_py(df_compat, pa, using_infer_string): # use the set option + if using_infer_string and not pa_version_under19p0: + df_compat.columns = df_compat.columns.astype("str") with pd.option_context("io.parquet.engine", "pyarrow"): check_round_trip(df_compat) @@ -784,18 +787,21 @@ def test_unsupported_float16_cleanup(self, pa, path_type): def test_categorical(self, pa): # supported in >= 0.7.0 - df = pd.DataFrame() - df["a"] = pd.Categorical(list("abcdef")) - - # test for null, out-of-order values, and unobserved category - df["b"] = pd.Categorical( - ["bar", "foo", "foo", "bar", None, "bar"], - dtype=pd.CategoricalDtype(["foo", "bar", "baz"]), - ) - - # test for ordered flag - df["c"] = pd.Categorical( - ["a", "b", "c", "a", "c", "b"], categories=["b", "c", "d"], ordered=True + 
df = pd.DataFrame( + { + "a": pd.Categorical(list("abcdef")), + # test for null, out-of-order values, and unobserved category + "b": pd.Categorical( + ["bar", "foo", "foo", "bar", None, "bar"], + dtype=pd.CategoricalDtype(["foo", "bar", "baz"]), + ), + # test for ordered flag + "c": pd.Categorical( + ["a", "b", "c", "a", "c", "b"], + categories=["b", "c", "d"], + ordered=True, + ), + } ) check_round_trip(df, pa) @@ -858,11 +864,13 @@ def test_s3_roundtrip_for_dir( repeat=1, ) - def test_read_file_like_obj_support(self, df_compat): + def test_read_file_like_obj_support(self, df_compat, using_infer_string): pytest.importorskip("pyarrow") buffer = BytesIO() df_compat.to_parquet(buffer) df_from_buf = read_parquet(buffer) + if using_infer_string and not pa_version_under19p0: + df_compat.columns = df_compat.columns.astype("str") tm.assert_frame_equal(df_compat, df_from_buf) def test_expand_user(self, df_compat, monkeypatch): @@ -929,7 +937,7 @@ def test_additional_extension_arrays(self, pa, using_infer_string): "c": pd.Series(["a", None, "c"], dtype="string"), } ) - if using_infer_string: + if using_infer_string and pa_version_under19p0: check_round_trip(df, pa, expected=df.astype({"c": "str"})) else: check_round_trip(df, pa) @@ -943,7 +951,10 @@ def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_strin df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")}) with pd.option_context("string_storage", string_storage): if using_infer_string: - expected = df.astype("str") + if pa_version_under19p0: + expected = df.astype("str") + else: + expected = df.astype(f"string[{string_storage}]") expected.columns = expected.columns.astype("str") else: expected = df.astype(f"string[{string_storage}]") @@ -1099,17 +1110,24 @@ def test_df_attrs_persistence(self, tmp_path, pa): new_df = read_parquet(path, engine=pa) assert new_df.attrs == df.attrs - def test_string_inference(self, tmp_path, pa): + def test_string_inference(self, tmp_path, pa, using_infer_string): # GH#54431 path = tmp_path / "test_string_inference.p" df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"]) - df.to_parquet(path, engine="pyarrow") + df.to_parquet(path, engine=pa) with pd.option_context("future.infer_string", True): - result = read_parquet(path, engine="pyarrow") + result = read_parquet(path, engine=pa) + dtype = pd.StringDtype(na_value=np.nan) expected = pd.DataFrame( data={"a": ["x", "y"]}, - dtype=pd.StringDtype(na_value=np.nan), - index=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), + dtype=dtype, + index=pd.Index(["a", "b"], dtype=dtype), + columns=pd.Index( + ["a"], + dtype=object + if pa_version_under19p0 and not using_infer_string + else dtype, + ), ) tm.assert_frame_equal(result, expected) @@ -1122,7 +1140,10 @@ def test_roundtrip_decimal(self, tmp_path, pa): df = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="string[pyarrow]") df.to_parquet(path, schema=pa.schema([("a", pa.decimal128(5))])) result = read_parquet(path) - expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]") + if pa_version_under19p0: + expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]") + else: + expected = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="object") tm.assert_frame_equal(result, expected) def test_infer_string_large_string_type(self, tmp_path, pa): From 1bb264c443f6be64ac28ff9afc0341eed0bcc455 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Wed, 22 Jan 2025 04:55:49 -0500 Subject: [PATCH 25/67] API(str 
dtype): Raise on StringDtype for unary op + (#60710) --- doc/source/whatsnew/v2.3.0.rst | 1 + pandas/core/arrays/string_arrow.py | 3 +++ pandas/tests/frame/test_unary.py | 6 ------ 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 96eed72823e72..f2b6f70a3138c 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -105,6 +105,7 @@ Conversion Strings ^^^^^^^ +- Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` did not raise for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`) - Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`) - Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`) - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 27c1425d11ac6..d35083fd892a8 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -481,6 +481,9 @@ def _cmp_method(self, other, op): return result.to_numpy(np.bool_, na_value=False) return result + def __pos__(self) -> Self: + raise TypeError(f"bad operand type for unary +: '{self.dtype}'") + class ArrowStringArrayNumpySemantics(ArrowStringArray): _na_value = np.nan diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py index 217255e73b450..652f52bd226af 100644 --- a/pandas/tests/frame/test_unary.py +++ b/pandas/tests/frame/test_unary.py @@ -3,9 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gte1p25 import pandas as pd @@ -122,9 +119,6 @@ def test_pos_object(self, df_data): tm.assert_frame_equal(+df, df) tm.assert_series_equal(+df["a"], df["a"]) - @pytest.mark.xfail( - using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)" - ) @pytest.mark.filterwarnings("ignore:Applying:DeprecationWarning") def test_pos_object_raises(self): # GH#21380 From f95558fab024a3b2da7d7111a9bd75079287b385 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 22 Jan 2025 12:38:57 -0500 Subject: [PATCH 26/67] DOC: fix PR07,SA01 for pandas.arrays.TimedeltaArray (#60757) --- ci/code_checks.sh | 1 - pandas/core/arrays/timedeltas.py | 8 +++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 948d8bee8ba5b..1b0e555c38b69 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -79,7 +79,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.min PR02" \ -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.tzinfo GL08" \ - -i "pandas.arrays.TimedeltaArray PR07,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ -i "pandas.core.resample.Resampler.quantile PR01,PR07" \ diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index a8a0037d0bbb9..c5b3129c506c8 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -115,10 +115,10 @@ class TimedeltaArray(dtl.TimelikeOps): ---------- data : array-like The timedelta data. 
- dtype : numpy.dtype Currently, only ``numpy.dtype("timedelta64[ns]")`` is accepted. freq : Offset, optional + Frequency of the data. copy : bool, default False Whether to copy the underlying array of data. @@ -130,6 +130,12 @@ class TimedeltaArray(dtl.TimelikeOps): ------- None + See Also + -------- + Timedelta : Represents a duration, the difference between two dates or times. + TimedeltaIndex : Immutable Index of timedelta64 data. + to_timedelta : Convert argument to timedelta. + Examples -------- >>> pd.arrays.TimedeltaArray._from_sequence(pd.TimedeltaIndex(["1h", "2h"])) From 1039bd9aa2f090c5db5608843ce62807ed6e1e29 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 22 Jan 2025 12:39:35 -0500 Subject: [PATCH 27/67] DOC: fix RT03,SA01 for pandas.plotting.andrews_curves (#60759) --- ci/code_checks.sh | 1 - pandas/plotting/_misc.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 1b0e555c38b69..c7e644bd30cd3 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -83,7 +83,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ -i "pandas.core.resample.Resampler.quantile PR01,PR07" \ -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \ - -i "pandas.plotting.andrews_curves RT03,SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.n GL08" \ diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index b20f8ac5f4796..3f839cefe798e 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -389,6 +389,12 @@ def andrews_curves( Returns ------- :class:`matplotlib.axes.Axes` + The matplotlib Axes object with the plot. + + See Also + -------- + plotting.parallel_coordinates : Plot parallel coordinates chart. + DataFrame.plot : Make plots of Series or DataFrame. 
Examples -------- From c168c0649169dd48b3349a3425c32640737cf070 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Wed, 22 Jan 2025 12:53:47 -0500 Subject: [PATCH 28/67] DOC: Whatsnew for sorting mode result (#60718) * DOC: Whatsnew for sorting mode result * Reverts --- doc/source/whatsnew/v2.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index f2b6f70a3138c..de1118b56dc81 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -95,7 +95,7 @@ Timezones Numeric ^^^^^^^ -- +- Enabled :class:`Series.mode` and :class:`DataFrame.mode` with ``dropna=False`` to sort the result for all dtypes in the presence of NA values; previously only certain dtypes would sort (:issue:`60702`) - Conversion From b60e222634b759e0c36ee3f97ba83211e5017b76 Mon Sep 17 00:00:00 2001 From: Agriya Khetarpal <74401230+agriyakhetarpal@users.noreply.github.com> Date: Wed, 22 Jan 2025 23:28:55 +0530 Subject: [PATCH 29/67] Miscellaneous updates for Pyodide 0.27: bump WASM CI and revise Arrow compatibility note (#60756) * Update Pyodide versions for CI * Git-ignore Pyodide xbuildenv folder * Pin to Pyodide 0.27.1 * Drop "WASM (pyodide and pyscript)" from Arrow compatibility notes * `TestCoercionFloat32.test_setitem` now xpasses --- .github/workflows/unit-tests.yml | 13 ++++++++----- .gitignore | 4 ++++ pandas/tests/series/indexing/test_setitem.py | 2 -- .../pdeps/0010-required-pyarrow-dependency.md | 1 - 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index d2e2a170a1d04..842629ba331d6 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -430,20 +430,20 @@ jobs: with: fetch-depth: 0 - - name: Set up Python for Pyodide + - name: Set up Python for pyodide-build id: setup-python uses: actions/setup-python@v5 with: - python-version: '3.11.3' + python-version: '3.12' - name: Set up Emscripten toolchain uses: mymindstorm/setup-emsdk@v14 with: - version: '3.1.46' + version: '3.1.58' actions-cache-folder: emsdk-cache - name: Install pyodide-build - run: pip install "pyodide-build==0.25.1" + run: pip install "pyodide-build>=0.29.2" - name: Build pandas for Pyodide run: | @@ -452,10 +452,13 @@ jobs: - name: Set up Node.js uses: actions/setup-node@v4 with: - node-version: '18' + node-version: '20' - name: Set up Pyodide virtual environment + env: + pyodide-version: '0.27.1' run: | + pyodide xbuildenv install ${{ env.pyodide-version }} pyodide venv .venv-pyodide source .venv-pyodide/bin/activate pip install dist/*.whl diff --git a/.gitignore b/.gitignore index a188e216d9f70..d951f3fb9cbad 100644 --- a/.gitignore +++ b/.gitignore @@ -137,3 +137,7 @@ doc/source/savefig/ # Interactive terminal generated files # ######################################## .jupyterlite.doit.db + +# Pyodide/WASM related files # +############################## +/.pyodide-xbuildenv-* diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 158198239ba75..49c933c308235 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -9,7 +9,6 @@ import numpy as np import pytest -from pandas.compat import WASM from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import IndexingError @@ -1449,7 +1448,6 @@ def obj(self): np_version_gte1p24 and 
os.environ.get("NPY_PROMOTION_STATE", "weak") != "weak" ) - or WASM ), reason="np.float32(1.1) ends up as 1.100000023841858, so " "np_can_hold_element raises and we cast to float64", diff --git a/web/pandas/pdeps/0010-required-pyarrow-dependency.md b/web/pandas/pdeps/0010-required-pyarrow-dependency.md index d586c46e243f8..0c3bf3c776988 100644 --- a/web/pandas/pdeps/0010-required-pyarrow-dependency.md +++ b/web/pandas/pdeps/0010-required-pyarrow-dependency.md @@ -185,7 +185,6 @@ Additionally, if a user is installing pandas in an environment where wheels are the user will need to also build Arrow C++ and related dependencies when installing from source. These environments include - Alpine linux (commonly used as a base for Docker containers) -- WASM (pyodide and pyscript) - Python development versions Lastly, pandas development and releases will need to be mindful of PyArrow's development and release cadance. For example when From fef01c5c58a72dd58e20c776bc30b21924131303 Mon Sep 17 00:00:00 2001 From: Jacob Lazar <129856302+Jacob-Lazar@users.noreply.github.com> Date: Thu, 23 Jan 2025 00:11:45 +0530 Subject: [PATCH 30/67] DOC: add SPSS comparison guide structure (#60738) * DOC: add SPSS comparison guide structure - Create SPSS comparison documentation - Add header and introduction sections - Terminology translation table - Create template for common operations comparison Part of #60727 * DOC: edit SPSS comparison guide to documentation - Added file to doc/source/getting_started/comparison/index.rst toctree - Fixed formatting and whitespace issues to meet documentation standards * DOC: edit minor whitespaces in SPSS comparison guide * DOC: standardize class references in SPSS guide * DOC: Fix RST section underline lengths in SPSS comparison --------- Co-authored-by: jl_win_a --- .../comparison/comparison_with_spss.rst | 229 ++++++++++++++++++ .../getting_started/comparison/index.rst | 1 + 2 files changed, 230 insertions(+) create mode 100644 doc/source/getting_started/comparison/comparison_with_spss.rst diff --git a/doc/source/getting_started/comparison/comparison_with_spss.rst b/doc/source/getting_started/comparison/comparison_with_spss.rst new file mode 100644 index 0000000000000..12c64bfd180a3 --- /dev/null +++ b/doc/source/getting_started/comparison/comparison_with_spss.rst @@ -0,0 +1,229 @@ +.. _compare_with_spss: + +{{ header }} + +Comparison with SPSS +******************** +For potential users coming from `SPSS `__, this page is meant to demonstrate +how various SPSS operations would be performed using pandas. + +.. include:: includes/introduction.rst + +Data structures +--------------- + +General terminology translation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. csv-table:: + :header: "pandas", "SPSS" + :widths: 20, 20 + + :class:`DataFrame`, data file + column, variable + row, case + groupby, split file + :class:`NaN`, system-missing + +:class:`DataFrame` +~~~~~~~~~~~~~~~~~~ + +A :class:`DataFrame` in pandas is analogous to an SPSS data file - a two-dimensional +data source with labeled columns that can be of different types. As will be shown in this +document, almost any operation that can be performed in SPSS can also be accomplished in pandas. + +:class:`Series` +~~~~~~~~~~~~~~~ + +A :class:`Series` is the data structure that represents one column of a :class:`DataFrame`. SPSS doesn't have a +separate data structure for a single variable, but in general, working with a :class:`Series` is analogous +to working with a variable in SPSS. 
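+
+For example, selecting a single column of a :class:`DataFrame` returns a :class:`Series`
+(a minimal sketch with made-up data; the ``tips`` dataset introduced below works the same way):
+
+.. code-block:: python
+
+    df = pd.DataFrame({"total_bill": [16.99, 10.34], "tip": [1.01, 1.66]})
+    df["tip"]  # a Series, comparable to a single SPSS variable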
+ +:class:`Index` +~~~~~~~~~~~~~~ + +Every :class:`DataFrame` and :class:`Series` has an :class:`Index` -- labels on the *rows* of the data. SPSS does not +have an exact analogue, as cases are simply numbered sequentially from 1. In pandas, if no index is +specified, a :class:`RangeIndex` is used by default (first row = 0, second row = 1, and so on). + +While using a labeled :class:`Index` or :class:`MultiIndex` can enable sophisticated analyses and is ultimately an +important part of pandas to understand, for this comparison we will essentially ignore the :class:`Index` and +just treat the :class:`DataFrame` as a collection of columns. Please see the :ref:`indexing documentation` +for much more on how to use an :class:`Index` effectively. + + +Copies vs. in place operations +------------------------------ + +.. include:: includes/copies.rst + + +Data input / output +------------------- + +Reading external data +~~~~~~~~~~~~~~~~~~~~~ + +Like SPSS, pandas provides utilities for reading in data from many formats. The ``tips`` dataset, found within +the pandas tests (`csv `_) +will be used in many of the following examples. + +In SPSS, you would use File > Open > Data to import a CSV file: + +.. code-block:: text + + FILE > OPEN > DATA + /TYPE=CSV + /FILE='tips.csv' + /DELIMITERS="," + /FIRSTCASE=2 + /VARIABLES=col1 col2 col3. + +The pandas equivalent would use :func:`read_csv`: + +.. code-block:: python + + url = ( + "https://raw.githubusercontent.com/pandas-dev" + "/pandas/main/pandas/tests/io/data/csv/tips.csv" + ) + tips = pd.read_csv(url) + tips + +Like SPSS's data import wizard, ``read_csv`` can take a number of parameters to specify how the data should be parsed. +For example, if the data was instead tab delimited, and did not have column names, the pandas command would be: + +.. code-block:: python + + tips = pd.read_csv("tips.csv", sep="\t", header=None) + + # alternatively, read_table is an alias to read_csv with tab delimiter + tips = pd.read_table("tips.csv", header=None) + + +Data operations +--------------- + +Filtering +~~~~~~~~~ + +In SPSS, filtering is done through Data > Select Cases: + +.. code-block:: text + + SELECT IF (total_bill > 10). + EXECUTE. + +In pandas, boolean indexing can be used: + +.. code-block:: python + + tips[tips["total_bill"] > 10] + + +Sorting +~~~~~~~ + +In SPSS, sorting is done through Data > Sort Cases: + +.. code-block:: text + + SORT CASES BY sex total_bill. + EXECUTE. + +In pandas, this would be written as: + +.. code-block:: python + + tips.sort_values(["sex", "total_bill"]) + + +String processing +----------------- + +Finding length of string +~~~~~~~~~~~~~~~~~~~~~~~~ + +In SPSS: + +.. code-block:: text + + COMPUTE length = LENGTH(time). + EXECUTE. + +.. include:: includes/length.rst + + +Changing case +~~~~~~~~~~~~~ + +In SPSS: + +.. code-block:: text + + COMPUTE upper = UPCASE(time). + COMPUTE lower = LOWER(time). + EXECUTE. + +.. include:: includes/case.rst + + +Merging +------- + +In SPSS, merging data files is done through Data > Merge Files. + +.. include:: includes/merge_setup.rst +.. include:: includes/merge.rst + + +GroupBy operations +------------------ + +Split-file processing +~~~~~~~~~~~~~~~~~~~~~ + +In SPSS, split-file analysis is done through Data > Split File: + +.. code-block:: text + + SORT CASES BY sex. + SPLIT FILE BY sex. + DESCRIPTIVES VARIABLES=total_bill tip + /STATISTICS=MEAN STDDEV MIN MAX. + +The pandas equivalent would be: + +.. 
code-block:: python + + tips.groupby("sex")[["total_bill", "tip"]].agg(["mean", "std", "min", "max"]) + + +Missing data +------------ + +SPSS uses the period (``.``) for numeric missing values and blank spaces for string missing values. +pandas uses ``NaN`` (Not a Number) for numeric missing values and ``None`` or ``NaN`` for string +missing values. + +.. include:: includes/missing.rst + + +Other considerations +-------------------- + +Output management +----------------- + +While pandas does not have a direct equivalent to SPSS's Output Management System (OMS), you can +capture and export results in various ways: + +.. code-block:: python + + # Save summary statistics to CSV + tips.groupby('sex')[['total_bill', 'tip']].mean().to_csv('summary.csv') + + # Save multiple results to Excel sheets + with pd.ExcelWriter('results.xlsx') as writer: + tips.describe().to_excel(writer, sheet_name='Descriptives') + tips.groupby('sex').mean().to_excel(writer, sheet_name='Means by Gender') diff --git a/doc/source/getting_started/comparison/index.rst b/doc/source/getting_started/comparison/index.rst index c3f58ce1f3d6d..3133d74afa3db 100644 --- a/doc/source/getting_started/comparison/index.rst +++ b/doc/source/getting_started/comparison/index.rst @@ -14,3 +14,4 @@ Comparison with other tools comparison_with_spreadsheets comparison_with_sas comparison_with_stata + comparison_with_spss From 1d33e4cedbb21b16917048358659bd96d1b8c8b6 Mon Sep 17 00:00:00 2001 From: Akshay Jain Date: Wed, 22 Jan 2025 13:28:29 -0800 Subject: [PATCH 31/67] BUG: Fixed TypeError for Series.isin() when large series and values contains NA (#60678) (#60736) * BUG: Fixed TypeError for Series.isin() when large series and values contains NA (#60678) * Add entry to whatsnew/v3.0.0.rst for bug fixing * Replaced np.vectorize() with any() for minor performance improvement and add new test cases * Fixed failed pre-commit.ci hooks : Formatting errors in algorithms.py, inconsistent-namespace-usage in test_isin.py, sorted whatsnew entry * Combined redundant if-statements to improve readability and performance --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/algorithms.py | 6 ++++++ pandas/tests/series/methods/test_isin.py | 24 ++++++++++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index fea269ac4555e..517ac7a4b44b9 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -804,6 +804,7 @@ Other - Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`) - Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`) +- Bug in :meth:`Series.isin` raising ``TypeError`` when series is large (>10**6) and ``values`` contains NA (:issue:`60678`) - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` throwing ``ValueError`` when ``regex=True`` and all NA values. 
(:issue:`60688`)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index eefe08859c1e9..aafd802b827a5 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -23,6 +23,7 @@
     iNaT,
     lib,
 )
+from pandas._libs.missing import NA
 from pandas._typing import (
     AnyArrayLike,
     ArrayLike,
@@ -544,10 +545,15 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]:
     # Ensure np.isin doesn't get object types or it *may* throw an exception
     # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array),
     # isin is faster for small sizes
+
+    # GH60678
+    # Ensure values don't contain <NA>, otherwise it throws exception with np.in1d
+
     if (
         len(comps_array) > _MINIMUM_COMP_ARR_LEN
         and len(values) <= 26
         and comps_array.dtype != object
+        and not any(v is NA for v in values)
     ):
         # If the values include nan we need to check for nan explicitly
         # since np.nan it not equal to np.nan
diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py
index e997ae32cf2e2..4f8484252ba8f 100644
--- a/pandas/tests/series/methods/test_isin.py
+++ b/pandas/tests/series/methods/test_isin.py
@@ -211,6 +211,30 @@ def test_isin_large_series_mixed_dtypes_and_nan(monkeypatch):
     tm.assert_series_equal(result, expected)


+@pytest.mark.parametrize(
+    "dtype, data, values, expected",
+    [
+        ("boolean", [pd.NA, False, True], [False, pd.NA], [True, True, False]),
+        ("Int64", [pd.NA, 2, 1], [1, pd.NA], [True, False, True]),
+        ("boolean", [pd.NA, False, True], [pd.NA, True, "a", 20], [True, False, True]),
+        ("boolean", [pd.NA, False, True], [], [False, False, False]),
+        ("Float64", [20.0, 30.0, pd.NA], [pd.NA], [False, False, True]),
+    ],
+)
+def test_isin_large_series_and_pdNA(dtype, data, values, expected, monkeypatch):
+    # https://github.com/pandas-dev/pandas/issues/60678
+    # combination of large series (> _MINIMUM_COMP_ARR_LEN elements) and
+    # values contains pdNA
+    min_isin_comp = 2
+    ser = Series(data, dtype=dtype)
+    expected = Series(expected, dtype="boolean")
+
+    with monkeypatch.context() as m:
+        m.setattr(algorithms, "_MINIMUM_COMP_ARR_LEN", min_isin_comp)
+        result = ser.isin(values)
+    tm.assert_series_equal(result, expected)
+
+
 def test_isin_complex_numbers():
     # GH 17927
     array = [0, 1j, 1j, 1, 1 + 1j, 1 + 2j, 1 + 1j]

From 4c3b968a0a4de483c00d15bd267bc776a218337e Mon Sep 17 00:00:00 2001
From: aaronchucarroll <120818400+aaronchucarroll@users.noreply.github.com>
Date: Wed, 22 Jan 2025 17:48:22 -0500
Subject: [PATCH 32/67] ENH: Series.str.get_dummies() raise on string type (#59786)

---
 doc/source/whatsnew/v3.0.0.rst           |  2 +-
 pandas/core/arrays/arrow/array.py        |  2 --
 pandas/core/strings/accessor.py          |  5 +++-
 pandas/core/strings/object_array.py      |  2 +-
 pandas/tests/strings/test_get_dummies.py | 38 ++++--------------------
 5 files changed, 11 insertions(+), 38 deletions(-)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 517ac7a4b44b9..1d8d0f6a74cb1 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -65,8 +65,8 @@ Other enhancements
 - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
 - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` 
aggregations through ``**kwargs`` (:issue:`28333`) - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) +- :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) -- :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) - Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` (:issue:`59091`) - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`) - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 5c32b05868383..e7f6b911f2fb1 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2531,8 +2531,6 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): else: dummies_dtype = np.bool_ dummies = np.zeros(n_rows * n_cols, dtype=dummies_dtype) - if dtype == str: - dummies[:] = False dummies[indices] = True dummies = dummies.reshape((n_rows, n_cols)) result = type(self)(pa.array(list(dummies))) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index d3ccd11281a77..5b35b5e393012 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -29,6 +29,7 @@ is_extension_array_dtype, is_integer, is_list_like, + is_numeric_dtype, is_object_dtype, is_re, ) @@ -2524,10 +2525,12 @@ def get_dummies( """ from pandas.core.frame import DataFrame + if dtype is not None and not (is_numeric_dtype(dtype) or is_bool_dtype(dtype)): + raise ValueError("Only numeric or boolean dtypes are supported for 'dtype'") # we need to cast to Series of strings as only that has all # methods available for making the dummies... 
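        # For example (hypothetical usage, not part of this patch):
        #   Series(["a|b", "a"]).str.get_dummies("|", dtype="int8") yields an
        #   int8 dummy frame, while a non-numeric dtype such as ``str`` raises
        #   the ValueError added above.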
result, name = self._data.array._str_get_dummies(sep, dtype) - if is_extension_array_dtype(dtype) or isinstance(dtype, ArrowDtype): + if is_extension_array_dtype(dtype): return self._wrap_result( DataFrame(result, columns=name, dtype=dtype), name=name, diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index a07ab9534f491..0adb7b51cf2b7 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -434,7 +434,7 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): dummies_dtype = _dtype else: dummies_dtype = np.bool_ - dummies = np.empty((len(arr), len(tags2)), dtype=dummies_dtype) + dummies = np.empty((len(arr), len(tags2)), dtype=dummies_dtype, order="F") def _isin(test_elements: str, element: str) -> bool: return element in test_elements diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 541b0ea150ba6..16e10c6fcdccd 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -1,12 +1,9 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td from pandas import ( - ArrowDtype, DataFrame, Index, MultiIndex, @@ -14,11 +11,6 @@ _testing as tm, ) -try: - import pyarrow as pa -except ImportError: - pa = None - def test_get_dummies(any_string_dtype): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) @@ -99,32 +91,12 @@ def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype): # GH#47872 -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_get_dummies_with_str_dtype(any_string_dtype): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = s.str.get_dummies("|", dtype=str) - expected = DataFrame( - [["T", "T", "F"], ["T", "F", "T"], ["F", "F", "F"]], - columns=list("abc"), - dtype=str, - ) - tm.assert_frame_equal(result, expected) - -# GH#47872 -@td.skip_if_no("pyarrow") -def test_get_dummies_with_pa_str_dtype(any_string_dtype): - import pyarrow as pa + msg = "Only numeric or boolean dtypes are supported for 'dtype'" + with pytest.raises(ValueError, match=msg): + s.str.get_dummies("|", dtype=str) - s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = s.str.get_dummies("|", dtype=ArrowDtype(pa.string())) - expected = DataFrame( - [ - ["true", "true", "false"], - ["true", "false", "true"], - ["false", "false", "false"], - ], - columns=list("abc"), - dtype=ArrowDtype(pa.string()), - ) - tm.assert_frame_equal(result, expected) + with pytest.raises(ValueError, match=msg): + s.str.get_dummies("|", dtype="datetime64[ns]") From 60325b86e28edf40cb02444367efbc8deb2b5231 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Thu, 23 Jan 2025 02:38:26 -0500 Subject: [PATCH 33/67] ENH: Enable pytables to round-trip with StringDtype (#60663) Co-authored-by: William Ayd --- doc/source/whatsnew/v2.3.0.rst | 1 + pandas/io/pytables.py | 36 +++++++++++--- pandas/tests/io/pytables/test_put.py | 70 ++++++++++++++++++++++------ 3 files changed, 87 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index de1118b56dc81..108ee62d88409 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -35,6 +35,7 @@ Other enhancements - The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. 
called when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been updated to work correctly with NumPy >= 2 (:issue:`57739`) +- :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`) - The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index b75dc6c3a43b4..2f8096746318b 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -86,12 +86,16 @@ PeriodArray, ) from pandas.core.arrays.datetimes import tz_to_dtype +from pandas.core.arrays.string_ import BaseStringArray import pandas.core.common as com from pandas.core.computation.pytables import ( PyTablesExpr, maybe_expression, ) -from pandas.core.construction import extract_array +from pandas.core.construction import ( + array as pd_array, + extract_array, +) from pandas.core.indexes.api import ensure_index from pandas.io.common import stringify_path @@ -3023,6 +3027,9 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None if isinstance(node, tables.VLArray): ret = node[0][start:stop] + dtype = getattr(attrs, "value_type", None) + if dtype is not None: + ret = pd_array(ret, dtype=dtype) else: dtype = getattr(attrs, "value_type", None) shape = getattr(attrs, "shape", None) @@ -3262,6 +3269,11 @@ def write_array( elif lib.is_np_dtype(value.dtype, "m"): self._handle.create_array(self.group, key, value.view("i8")) getattr(self.group, key)._v_attrs.value_type = "timedelta64" + elif isinstance(value, BaseStringArray): + vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom()) + vlarr.append(value.to_numpy()) + node = getattr(self.group, key) + node._v_attrs.value_type = str(value.dtype) elif empty_array: self.write_array_empty(key, value) else: @@ -3294,7 +3306,11 @@ def read( index = self.read_index("index", start=start, stop=stop) values = self.read_array("values", start=start, stop=stop) result = Series(values, index=index, name=self.name, copy=False) - if using_string_dtype() and is_string_array(values, skipna=True): + if ( + using_string_dtype() + and isinstance(values, np.ndarray) + and is_string_array(values, skipna=True) + ): result = result.astype(StringDtype(na_value=np.nan)) return result @@ -3363,7 +3379,11 @@ def read( columns = items[items.get_indexer(blk_items)] df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) - if using_string_dtype() and is_string_array(values, skipna=True): + if ( + using_string_dtype() + and isinstance(values, np.ndarray) + and is_string_array(values, skipna=True) + ): df = df.astype(StringDtype(na_value=np.nan)) dfs.append(df) @@ -4737,9 +4757,13 @@ def read( df = DataFrame._from_arrays([values], columns=cols_, index=index_) if not (using_string_dtype() and values.dtype.kind == "O"): assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) - if using_string_dtype() and is_string_array( - values, # type: ignore[arg-type] - skipna=True, + if ( + using_string_dtype() + and isinstance(values, np.ndarray) + and is_string_array( + values, + skipna=True, + ) ): df = df.astype(StringDtype(na_value=np.nan)) frames.append(df) diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py index a4257b54dd6db..66596f1138b96 100644 --- 
a/pandas/tests/io/pytables/test_put.py +++ b/pandas/tests/io/pytables/test_put.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import Timestamp import pandas as pd @@ -26,7 +24,6 @@ pytestmark = [ pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), ] @@ -54,8 +51,8 @@ def test_api_default_format(tmp_path, setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) with pd.option_context("io.hdf.default_format", "fixed"): @@ -79,8 +76,8 @@ def test_api_default_format(tmp_path, setup_path): path = tmp_path / setup_path df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) with pd.option_context("io.hdf.default_format", "fixed"): @@ -106,7 +103,7 @@ def test_put(setup_path): ) df = DataFrame( np.random.default_rng(2).standard_normal((20, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=20, freq="B"), ) store["a"] = ts @@ -166,7 +163,7 @@ def test_put_compression(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -183,7 +180,7 @@ def test_put_compression(setup_path): def test_put_compression_blosc(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -197,10 +194,20 @@ def test_put_compression_blosc(setup_path): tm.assert_frame_equal(store["c"], df) -def test_put_mixed_type(setup_path, performance_warning): +def test_put_datetime_ser(setup_path, performance_warning, using_infer_string): + # https://github.com/pandas-dev/pandas/pull/60663 + ser = Series(3 * [Timestamp("20010102").as_unit("ns")]) + with ensure_clean_store(setup_path) as store: + store.put("ser", ser) + expected = ser.copy() + result = store.get("ser") + tm.assert_series_equal(result, expected) + + +def test_put_mixed_type(setup_path, performance_warning, using_infer_string): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df["obj1"] = "foo" @@ -220,13 +227,42 @@ def test_put_mixed_type(setup_path, performance_warning): with ensure_clean_store(setup_path) as store: _maybe_remove(store, "df") - with tm.assert_produces_warning(performance_warning): + warning = None if using_infer_string else performance_warning + with tm.assert_produces_warning(warning): store.put("df", df) expected = store.get("df") tm.assert_frame_equal(expected, df) +def test_put_str_frame(setup_path, performance_warning, string_dtype_arguments): + # https://github.com/pandas-dev/pandas/pull/60663 + dtype = pd.StringDtype(*string_dtype_arguments) + df = DataFrame({"a": pd.array(["x", pd.NA, "y"], dtype=dtype)}) + with 
ensure_clean_store(setup_path) as store: + _maybe_remove(store, "df") + + store.put("df", df) + expected_dtype = "str" if dtype.na_value is np.nan else "string" + expected = df.astype(expected_dtype) + result = store.get("df") + tm.assert_frame_equal(result, expected) + + +def test_put_str_series(setup_path, performance_warning, string_dtype_arguments): + # https://github.com/pandas-dev/pandas/pull/60663 + dtype = pd.StringDtype(*string_dtype_arguments) + ser = Series(["x", pd.NA, "y"], dtype=dtype) + with ensure_clean_store(setup_path) as store: + _maybe_remove(store, "df") + + store.put("ser", ser) + expected_dtype = "str" if dtype.na_value is np.nan else "string" + expected = ser.astype(expected_dtype) + result = store.get("ser") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("format", ["table", "fixed"]) @pytest.mark.parametrize( "index", @@ -253,7 +289,7 @@ def test_store_index_types(setup_path, format, index): tm.assert_frame_equal(df, store["df"]) -def test_column_multiindex(setup_path): +def test_column_multiindex(setup_path, using_infer_string): # GH 4710 # recreate multi-indexes properly @@ -264,6 +300,12 @@ def test_column_multiindex(setup_path): expected = df.set_axis(df.index.to_numpy()) with ensure_clean_store(setup_path) as store: + if using_infer_string: + # TODO(infer_string) make this work for string dtype + msg = "Saving a MultiIndex with an extension dtype is not supported." + with pytest.raises(NotImplementedError, match=msg): + store.put("df", df) + return store.put("df", df) tm.assert_frame_equal( store["df"], expected, check_index_type=True, check_column_type=True From 222d7c7c5e3cc13d67facfa2d9bb7b6b03620a07 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 23 Jan 2025 17:03:46 +0100 Subject: [PATCH 34/67] TST (string dtype): follow-up fix for pyarrow 19.0 update (#60764) * TST (string dtype): follow-up fix for pyarrow 19.0 update * fix test --- pandas/tests/io/test_parquet.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 91580c31ea081..56a8e4c439164 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -104,10 +104,7 @@ def fp(request): @pytest.fixture def df_compat(): - # TODO(infer_string) should this give str columns? 
- return pd.DataFrame( - {"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"], dtype=object) - ) + return pd.DataFrame({"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"])) @pytest.fixture @@ -686,7 +683,11 @@ def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine): with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f: httpserver.serve_content(content=f.read()) df = read_parquet(httpserver.url, engine=engine) - tm.assert_frame_equal(df, df_compat) + + expected = df_compat + if pa_version_under19p0: + expected.columns = expected.columns.astype(object) + tm.assert_frame_equal(df, expected) class TestParquetPyArrow(Base): From be538ef0d07055113cbdbf9b3a22c4852c7fd6d7 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Thu, 23 Jan 2025 23:49:44 +0530 Subject: [PATCH 35/67] =?UTF-8?q?DOC:=20fix=20ES01,SA01=20for=20pandas.tse?= =?UTF-8?q?ries.offsets.CustomBusinessMonthEnd.=E2=80=A6=20(#60775)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DOC: fix ES01,SA01 for pandas.tseries.offsets.CustomBusinessMonthEnd.is_on_offset and pandas.tseries.offsets.CustomBusinessMonthBegin.is_on_offset --- ci/code_checks.sh | 2 -- pandas/_libs/tslibs/offsets.pyx | 13 +++++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index c7e644bd30cd3..cf7809c70296c 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -146,7 +146,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.CustomBusinessMonthBegin PR02" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.calendar GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.holidays GL08" \ - -i "pandas.tseries.offsets.CustomBusinessMonthBegin.is_on_offset SA01" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.m_offset GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.n GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.normalize GL08" \ @@ -154,7 +153,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.CustomBusinessMonthEnd PR02" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.calendar GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.holidays GL08" \ - -i "pandas.tseries.offsets.CustomBusinessMonthEnd.is_on_offset SA01" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.m_offset GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.n GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.normalize GL08" \ diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 7569f8e8864a0..3b02bf46c2f82 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -720,11 +720,24 @@ cdef class BaseOffset: """ Return boolean whether a timestamp intersects with this frequency. + This method determines if a given timestamp aligns with the start + of a custom business month, as defined by this offset. It accounts + for custom rules, such as skipping weekends or other non-business days, + and checks whether the provided datetime falls on a valid business day + that marks the beginning of the custom business month. + Parameters ---------- dt : datetime.datetime Timestamp to check intersections with frequency. + See Also + -------- + tseries.offsets.CustomBusinessMonthBegin : Represents the start of a custom + business month. + tseries.offsets.CustomBusinessMonthEnd : Represents the end of a custom + business month. 
+ Examples -------- >>> ts = pd.Timestamp(2022, 1, 1) From 0c4ca3a9e4baa9b4fa8cbc81c57f2e2996636c10 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 24 Jan 2025 00:23:55 +0530 Subject: [PATCH 36/67] DOC: fix SA01 for pandas.tseries.offsets.LastWeekOfMonth (#60776) --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/offsets.pyx | 9 +++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index cf7809c70296c..2d0fcce47d2a5 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -189,7 +189,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.Hour.is_on_offset GL08" \ -i "pandas.tseries.offsets.Hour.n GL08" \ -i "pandas.tseries.offsets.Hour.normalize GL08" \ - -i "pandas.tseries.offsets.LastWeekOfMonth SA01" \ -i "pandas.tseries.offsets.LastWeekOfMonth.is_on_offset GL08" \ -i "pandas.tseries.offsets.LastWeekOfMonth.n GL08" \ -i "pandas.tseries.offsets.LastWeekOfMonth.normalize GL08" \ diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 3b02bf46c2f82..36b431974c121 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -3723,6 +3723,15 @@ cdef class LastWeekOfMonth(WeekOfMonthMixin): - 5 is Saturday - 6 is Sunday. + See Also + -------- + tseries.offsets.WeekOfMonth : + Date offset for a specific weekday in a month. + tseries.offsets.MonthEnd : + Date offset for the end of the month. + tseries.offsets.BMonthEnd : + Date offset for the last business day of the month. + Examples -------- >>> ts = pd.Timestamp(2022, 1, 1) From c168883f8f10e312e6d596d8d750a1e4647393c6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 24 Jan 2025 10:04:42 -0800 Subject: [PATCH 37/67] PERF: Avoid a numpy array copy in ArrowExtensionArray._to_datetimearray (#60778) --- pandas/core/arrays/arrow/array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e7f6b911f2fb1..0b546bed1c2b7 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1398,7 +1398,7 @@ def _to_datetimearray(self) -> DatetimeArray: np_dtype = np.dtype(f"M8[{pa_type.unit}]") dtype = tz_to_dtype(pa_type.tz, pa_type.unit) np_array = self._pa_array.to_numpy() - np_array = np_array.astype(np_dtype) + np_array = np_array.astype(np_dtype, copy=False) return DatetimeArray._simple_new(np_array, dtype=dtype) def _to_timedeltaarray(self) -> TimedeltaArray: @@ -1409,7 +1409,7 @@ def _to_timedeltaarray(self) -> TimedeltaArray: assert pa.types.is_duration(pa_type) np_dtype = np.dtype(f"m8[{pa_type.unit}]") np_array = self._pa_array.to_numpy() - np_array = np_array.astype(np_dtype) + np_array = np_array.astype(np_dtype, copy=False) return TimedeltaArray._simple_new(np_array, dtype=np_dtype) def _values_for_json(self) -> np.ndarray: From d38706af66249ef74e42671a480261c68bedfbce Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 24 Jan 2025 15:21:29 -0500 Subject: [PATCH 38/67] TST(string dtype): Fix xfails in test_block_internals.py (#60765) --- pandas/tests/frame/conftest.py | 2 +- .../frame/constructors/test_from_dict.py | 1 - pandas/tests/frame/test_block_internals.py | 35 ++++++------------- 3 files changed, 11 insertions(+), 27 deletions(-) diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index ea8e2e8ecc194..b3140bad8276b 100644 --- a/pandas/tests/frame/conftest.py +++ 
b/pandas/tests/frame/conftest.py @@ -33,7 +33,7 @@ def float_string_frame(): df = DataFrame( np.random.default_rng(2).standard_normal((30, 4)), index=Index([f"foo_{i}" for i in range(30)], dtype=object), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), ) df["foo"] = "bar" return df diff --git a/pandas/tests/frame/constructors/test_from_dict.py b/pandas/tests/frame/constructors/test_from_dict.py index fc7c03dc25839..1509c47ba65c7 100644 --- a/pandas/tests/frame/constructors/test_from_dict.py +++ b/pandas/tests/frame/constructors/test_from_dict.py @@ -108,7 +108,6 @@ def test_constructor_list_of_series(self): expected = DataFrame.from_dict(sdict, orient="index") tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken") def test_constructor_orient(self, float_string_frame): data_dict = float_string_frame.T._series recons = DataFrame.from_dict(data_dict, orient="index") diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 25e66a0e1c03d..6fdbfac8f4e0a 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( Categorical, @@ -162,21 +160,7 @@ def test_constructor_with_convert(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_construction_with_mixed(self, float_string_frame, using_infer_string): - # test construction edge cases with mixed types - - # f7u12, this does not work without extensive workaround - data = [ - [datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)], - [datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)], - ] - df = DataFrame(data) - - # check dtypes - result = df.dtypes - expected = Series({"datetime64[us]": 3}) - # mixed-type frames float_string_frame["datetime"] = datetime.now() float_string_frame["timedelta"] = timedelta(days=1, seconds=1) @@ -196,13 +180,11 @@ def test_construction_with_mixed(self, float_string_frame, using_infer_string): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_construction_with_conversions(self): # convert from a numpy array of non-ns timedelta64; as of 2.0 this does # *not* convert arr = np.array([1, 2, 3], dtype="timedelta64[s]") - df = DataFrame(index=range(3)) - df["A"] = arr + df = DataFrame({"A": arr}) expected = DataFrame( {"A": pd.timedelta_range("00:00:01", periods=3, freq="s")}, index=range(3) ) @@ -220,11 +202,11 @@ def test_construction_with_conversions(self): assert expected.dtypes["dt1"] == "M8[s]" assert expected.dtypes["dt2"] == "M8[s]" - df = DataFrame(index=range(3)) - df["dt1"] = np.datetime64("2013-01-01") - df["dt2"] = np.array( + dt1 = np.datetime64("2013-01-01") + dt2 = np.array( ["2013-01-01", "2013-01-02", "2013-01-03"], dtype="datetime64[D]" ) + df = DataFrame({"dt1": dt1, "dt2": dt2}) # df['dt3'] = np.array(['2013-01-01 00:00:01','2013-01-01 # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]') @@ -401,14 +383,17 @@ def test_update_inplace_sets_valid_block_values(): assert isinstance(df._mgr.blocks[0].values, Categorical) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_nonconsolidated_item_cache_take(): # https://github.com/pandas-dev/pandas/issues/35521 # create non-consolidated dataframe 
with object dtype columns - df = DataFrame() - df["col1"] = Series(["a"], dtype=object) + df = DataFrame( + { + "col1": Series(["a"], dtype=object), + } + ) df["col2"] = Series([0], dtype=object) + assert not df._mgr.is_consolidated() # access column (item cache) df["col1"] == "A" From 354b61f88bc0523d4bb9f3cfe1d6c12f9a3d6567 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Fri, 24 Jan 2025 15:39:30 -0500 Subject: [PATCH 39/67] TST(string dtype): Resolve xfail in groupby.test_size (#60711) --- pandas/tests/groupby/methods/test_size.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index 2dc89bc75746f..6664563bd2272 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Index, @@ -76,18 +74,16 @@ def test_size_series_masked_type_returns_Int64(dtype): tm.assert_series_equal(result, expected) -# TODO(infer_string) in case the column is object dtype, it should preserve that dtype -# for the result's index -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -def test_size_strings(any_string_dtype): +def test_size_strings(any_string_dtype, using_infer_string): # GH#55627 dtype = any_string_dtype df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype) result = df.groupby("a")["b"].size() exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64" + exp_index_dtype = "str" if using_infer_string and dtype == "object" else dtype expected = Series( [2, 1], - index=Index(["a", "b"], name="a", dtype=dtype), + index=Index(["a", "b"], name="a", dtype=exp_index_dtype), name="b", dtype=exp_dtype, ) From e3b2de852a87dc7b530302e0039730e7745b2fcf Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 24 Jan 2025 18:16:18 -0500 Subject: [PATCH 40/67] TST(string_dtype): Fix minor issue with CSV parser and column dtype (#60784) --- pandas/io/parsers/arrow_parser_wrapper.py | 3 ++- pandas/tests/io/parser/common/test_index.py | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 672672490996d..8cadde1ad6537 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -165,7 +165,8 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: # The only way self.names is not the same length as number of cols is # if we have int index_col. We should just pad the names(they will get # removed anyways) to expected length then. 
- self.names = list(range(num_cols - len(self.names))) + self.names + columns_prefix = [str(x) for x in range(num_cols - len(self.names))] + self.names = columns_prefix + self.names multi_index_named = False frame.columns = self.names diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index 8352cc80f5e62..cfa8785b24bde 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -90,9 +90,6 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): def test_multi_index_no_level_names( request, all_parsers, index_col, using_infer_string ): - if using_infer_string and all_parsers.engine == "pyarrow": - # result should have string columns instead of object dtype - request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) data = """index1,index2,A,B,C,D foo,one,2,3,4,5 foo,two,7,8,9,10 From 8fe27200e2d4ba1f9781f704becf889d7aa43c28 Mon Sep 17 00:00:00 2001 From: "Christine P. Chai" Date: Sat, 25 Jan 2025 09:00:26 -0800 Subject: [PATCH 41/67] DOC: Update a link in tutorials.rst (#60787) --- doc/source/getting_started/tutorials.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/getting_started/tutorials.rst b/doc/source/getting_started/tutorials.rst index 4393c3716bdad..eae7771418485 100644 --- a/doc/source/getting_started/tutorials.rst +++ b/doc/source/getting_started/tutorials.rst @@ -112,7 +112,7 @@ Various tutorials * `Wes McKinney's (pandas BDFL) blog `_ * `Statistical analysis made easy in Python with SciPy and pandas DataFrames, by Randal Olson `_ -* `Statistical Data Analysis in Python, tutorial videos, by Christopher Fonnesbeck from SciPy 2013 `_ +* `Statistical Data Analysis in Python, tutorial by Christopher Fonnesbeck from SciPy 2013 `_ * `Financial analysis in Python, by Thomas Wiecki `_ * `Intro to pandas data structures, by Greg Reda `_ * `Pandas DataFrames Tutorial, by Karlijn Willems `_ From f3045db91dbb89306c15b1673987cc70912a76b5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sun, 26 Jan 2025 00:44:11 -0800 Subject: [PATCH 42/67] CI: Remove CircleCI in favor of GHA ARM builds (#60761) --- .circleci/config.yml | 155 ---------------------------------- .gitattributes | 1 - ci/deps/circle-311-arm64.yaml | 61 ------------- pandas/tests/io/conftest.py | 7 +- 4 files changed, 3 insertions(+), 221 deletions(-) delete mode 100644 .circleci/config.yml delete mode 100644 ci/deps/circle-311-arm64.yaml diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 139ea9d220453..0000000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,155 +0,0 @@ -version: 2.1 - -jobs: - test-linux-arm: - machine: - image: default - resource_class: arm.large - environment: - ENV_FILE: ci/deps/circle-311-arm64.yaml - PYTEST_WORKERS: auto - PATTERN: "not single_cpu and not slow and not network and not clipboard and not arm_slow and not db" - PYTEST_TARGET: "pandas" - PANDAS_CI: "1" - steps: - - checkout - - run: - name: Install Environment and Run Tests - shell: /bin/bash -exo pipefail - # https://pytest-qt.readthedocs.io/en/latest/troubleshooting.html#github-actions-azure-pipelines-travis-ci-and-gitlab-ci-cd - command: | - MINI_URL="https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-aarch64.sh" - wget -q $MINI_URL -O Miniforge3.sh - chmod +x Miniforge3.sh - MINI_DIR="$HOME/miniconda3" - rm -rf $MINI_DIR - ./Miniforge3.sh -b -p 
$MINI_DIR - export PATH=$MINI_DIR/bin:$PATH - conda info -a - conda env create -q -n pandas-dev -f $ENV_FILE - conda list -n pandas-dev - source activate pandas-dev - if pip show pandas 1>/dev/null; then - pip uninstall -y pandas - fi - python -m pip install --no-build-isolation -ve . -Csetup-args="--werror" - PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH - ci/run_tests.sh - test-linux-musl: - docker: - - image: quay.io/pypa/musllinux_1_1_aarch64 - resource_class: arm.large - steps: - # Install pkgs first to have git in the image - # (needed for checkout) - - run: - name: Install System Packages - command: | - apk update - apk add git - apk add musl-locales - - checkout - - run: - name: Install Environment and Run Tests - command: | - /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev - . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 - python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror" - python -m pip list --no-cache-dir - export PANDAS_CI=1 - python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml - build-aarch64: - parameters: - cibw-build: - type: string - machine: - image: default - resource_class: arm.large - environment: - TRIGGER_SOURCE: << pipeline.trigger_source >> - steps: - - checkout - - run: - name: Check if build is necessary - command: | - # Check if tag is defined or TRIGGER_SOURCE is scheduled - if [[ -n "$CIRCLE_TAG" ]]; then - echo 'export IS_PUSH="true"' >> "$BASH_ENV" - elif [[ $TRIGGER_SOURCE == "scheduled_pipeline" ]]; then - echo 'export IS_SCHEDULE_DISPATCH="true"' >> "$BASH_ENV" - # Look for the build label/[wheel build] in commit - # grep takes a regex, so need to escape brackets - elif (git log --format=oneline -n 1 $CIRCLE_SHA1) | grep -q '\[wheel build\]'; then - : # Do nothing - elif ! 
(curl https://api.github.com/repos/pandas-dev/pandas/issues/$CIRCLE_PR_NUMBER | jq '.labels' | grep -q 'Build'); then - circleci-agent step halt - fi - - run: - name: Build aarch64 wheels - no_output_timeout: 30m # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that - command: | - pip3 install cibuildwheel==2.20.0 - if [[ $CIBW_BUILD == cp313t* ]]; then - # TODO: temporarily run 3.13 free threaded builds without build isolation - # since we need pre-release cython - CIBW_BUILD_FRONTEND="pip; args: --no-build-isolation" cibuildwheel --output-dir wheelhouse - else - cibuildwheel --output-dir wheelhouse - fi - - environment: - CIBW_BUILD: << parameters.cibw-build >> - - - run: - name: Install Anaconda Client & Upload Wheels - shell: /bin/bash -exo pipefail - command: | - MINI_URL="https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-aarch64.sh" - wget -q $MINI_URL -O Miniforge3.sh - chmod +x Miniforge3.sh - MINI_DIR="$HOME/miniconda3" - rm -rf $MINI_DIR - ./Miniforge3.sh -b -p $MINI_DIR - export PATH=$MINI_DIR/bin:$PATH - conda install -y -c conda-forge anaconda-client - source ci/upload_wheels.sh - set_upload_vars - upload_wheels - - store_artifacts: - path: wheelhouse/ - -workflows: - test: - # Don't run trigger this one when scheduled pipeline runs - when: - not: - equal: [ scheduled_pipeline, << pipeline.trigger_source >> ] - jobs: - - test-linux-arm - test-musl: - # Don't run trigger this one when scheduled pipeline runs - when: - not: - equal: [ scheduled_pipeline, << pipeline.trigger_source >> ] - jobs: - - test-linux-musl - build-wheels: - jobs: - - build-aarch64: - filters: - tags: - only: /^v.*/ - matrix: - parameters: - cibw-build: ["cp310-manylinux_aarch64", - "cp311-manylinux_aarch64", - "cp312-manylinux_aarch64", - "cp313-manylinux_aarch64", - "cp313t-manylinux_aarch64", - "cp310-musllinux_aarch64", - "cp311-musllinux_aarch64", - "cp312-musllinux_aarch64", - "cp313-musllinux_aarch64", - "cp313t-musllinux_aarch64"] diff --git a/.gitattributes b/.gitattributes index f77da2339b20f..d94c19e7edb1f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -61,7 +61,6 @@ pandas/_version.py export-subst *.pxi export-ignore # Ignoring stuff from the top level -.circleci export-ignore .github export-ignore asv_bench export-ignore ci export-ignore diff --git a/ci/deps/circle-311-arm64.yaml b/ci/deps/circle-311-arm64.yaml deleted file mode 100644 index 3f09e27d0fe4b..0000000000000 --- a/ci/deps/circle-311-arm64.yaml +++ /dev/null @@ -1,61 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.11 - - # build dependencies - - versioneer - - cython>=0.29.33 - - meson=1.2.1 - - meson-python=0.13.1 - - # test dependencies - - pytest>=7.3.2 - - pytest-cov - - pytest-xdist>=3.4.0 - - pytest-localserver>=0.8.1 - - pytest-qt>=4.4.0 - - boto3 - - # required dependencies - - python-dateutil - - numpy - - # optional dependencies - - beautifulsoup4>=4.11.2 - - blosc>=1.21.3 - - bottleneck>=1.3.6 - - fastparquet>=2023.10.0 - - fsspec>=2022.11.0 - - html5lib>=1.1 - - hypothesis>=6.84.0 - - gcsfs>=2022.11.0 - - jinja2>=3.1.2 - - lxml>=4.9.2 - - matplotlib>=3.6.3 - - numba>=0.56.4 - - numexpr>=2.8.4 - - odfpy>=1.4.1 - - qtpy>=2.3.0 - - openpyxl>=3.1.0 - - psycopg2>=2.9.6 - - pyarrow>=10.0.1 - - pymysql>=1.0.2 - - pyqt>=5.15.9 - - pyreadstat>=1.2.0 - - pytables>=3.8.0 - - python-calamine>=0.1.7 - - pytz>=2023.4 - - pyxlsb>=1.0.10 - - s3fs>=2022.11.0 - - scipy>=1.10.0 - - sqlalchemy>=2.0.0 - - tabulate>=0.9.0 
- - xarray>=2022.12.0, <2024.10.0 - - xlrd>=2.0.1 - - xlsxwriter>=3.0.5 - - zstandard>=0.19.0 - - pip: - - adbc-driver-postgresql>=0.8.0 - - adbc-driver-sqlite>=0.8.0 diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index bdefadf3dbec0..a5ddda9d66e7a 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -67,14 +67,13 @@ def s3_base(worker_id, monkeypatch): monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") if is_ci_environment(): if is_platform_arm() or is_platform_mac() or is_platform_windows(): - # NOT RUN on Windows/macOS/ARM, only Ubuntu + # NOT RUN on Windows/macOS, only Ubuntu # - subprocess in CI can cause timeouts # - GitHub Actions do not support # container services for the above OSs - # - CircleCI will probably hit the Docker rate pull limit pytest.skip( - "S3 tests do not have a corresponding service in " - "Windows, macOS or ARM platforms" + "S3 tests do not have a corresponding service on " + "Windows or macOS platforms" ) else: # set in .github/workflows/unit-tests.yml From 84bf1ef82912ebf497a304b0ffd90914bfc41ea9 Mon Sep 17 00:00:00 2001 From: tasfia8 <117693390+tasfia8@users.noreply.github.com> Date: Sun, 26 Jan 2025 06:29:25 -0500 Subject: [PATCH 43/67] BUG: fix construction of Series / Index from dict keys when "str" dtype is specified explicitly (#60436) Co-authored-by: Joris Van den Bossche --- pandas/core/construction.py | 2 ++ pandas/tests/base/test_constructors.py | 11 +++++++++++ pandas/tests/io/test_fsspec.py | 1 - 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 8df4f7e3e08f9..50088804e0245 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -596,6 +596,8 @@ def sanitize_array( # create an extension array from its dtype _sanitize_non_ordered(data) cls = dtype.construct_array_type() + if not hasattr(data, "__array__"): + data = list(data) subarr = cls._from_sequence(data, dtype=dtype, copy=copy) # GH#846 diff --git a/pandas/tests/base/test_constructors.py b/pandas/tests/base/test_constructors.py index c4b02423f8cf0..dffd2009ef373 100644 --- a/pandas/tests/base/test_constructors.py +++ b/pandas/tests/base/test_constructors.py @@ -179,3 +179,14 @@ def test_constructor_datetime_nonns(self, constructor): arr.flags.writeable = False result = constructor(arr) tm.assert_equal(result, expected) + + def test_constructor_from_dict_keys(self, constructor, using_infer_string): + # https://github.com/pandas-dev/pandas/issues/60343 + d = {"a": 1, "b": 2} + result = constructor(d.keys(), dtype="str") + if using_infer_string: + assert result.dtype == "str" + else: + assert result.dtype == "object" + expected = constructor(list(d.keys()), dtype="str") + tm.assert_equal(result, expected) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 5340560884afe..2e3e74a9d31ff 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -209,7 +209,6 @@ def test_arrowparquet_options(fsspectest): assert fsspectest.test[0] == "parquet_read" -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") def test_fastparquet_options(fsspectest): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") From e36b00035665d416fe10a3950880a6532eaf6131 Mon Sep 17 00:00:00 2001 From: Xiao Yuan Date: Mon, 27 Jan 2025 22:54:25 +0200 Subject: [PATCH 44/67] BUG: fix combine_first reorders columns (#60791) * Add test * Fix 
combine_first reorders columns * Add whatsnew * Fix corner case when self is empty and future.infer_string is True * Update --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/frame.py | 5 +++-- pandas/tests/frame/methods/test_combine_first.py | 12 +++++++++++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 1d8d0f6a74cb1..a7f63d75a047e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -758,6 +758,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ - Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`) +- Bug in :meth:`DataFrame.combine_first` not preserving the column order (:issue:`60427`) - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) - Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`) - Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index af66bb54610f1..3669d8249dd27 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8671,6 +8671,7 @@ def combine( 2 NaN 3.0 1.0 """ other_idxlen = len(other.index) # save for compare + other_columns = other.columns this, other = self.align(other) new_index = this.index @@ -8681,8 +8682,8 @@ def combine( if self.empty and len(other) == other_idxlen: return other.copy() - # sorts if possible; otherwise align above ensures that these are set-equal - new_columns = this.columns.union(other.columns) + # preserve column order + new_columns = self.columns.union(other_columns, sort=False) do_fill = fill_value is not None result = {} for col in new_columns: diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index a70876b5a96ca..1e594043510ea 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -380,7 +380,7 @@ def test_combine_first_with_asymmetric_other(self, val): df2 = DataFrame({"isBool": [True]}) res = df1.combine_first(df2) - exp = DataFrame({"isBool": [True], "isNum": [val]}) + exp = DataFrame({"isNum": [val], "isBool": [True]}) tm.assert_frame_equal(res, exp) @@ -555,3 +555,13 @@ def test_combine_first_empty_columns(): result = left.combine_first(right) expected = DataFrame(columns=["a", "b", "c"]) tm.assert_frame_equal(result, expected) + + +def test_combine_first_preserve_column_order(): + # GH#60427 + df1 = DataFrame({"B": [1, 2, 3], "A": [4, None, 6]}) + df2 = DataFrame({"A": [5]}, index=[1]) + + result = df1.combine_first(df2) + expected = DataFrame({"B": [1, 2, 3], "A": [4.0, 5.0, 6.0]}) + tm.assert_frame_equal(result, expected) From c0c778bdb75a54cf03cdfe76f5b3dadae6a67054 Mon Sep 17 00:00:00 2001 From: Matteo Paltenghi Date: Tue, 28 Jan 2025 00:29:37 +0100 Subject: [PATCH 45/67] TST: Add test for exceptional behavior when calling `view()` on `BaseStringArray` (#60799) * add test for when str array raises type error * fix formatting: ruff-format * moved test to tests/arrays/string_/test_string.py file --- pandas/tests/arrays/string_/test_string.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 
f875873863b4d..336a0fef69170 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -758,3 +758,9 @@ def test_tolist(dtype): result = arr.tolist() expected = vals tm.assert_equal(result, expected) + + +def test_string_array_view_type_error(): + arr = pd.array(["a", "b", "c"], dtype="string") + with pytest.raises(TypeError, match="Cannot change data-type for string array."): + arr.view("i8") From 8973c551895c2cd3619cadf554362e802b27e02a Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Tue, 28 Jan 2025 01:59:07 -0500 Subject: [PATCH 46/67] BUG: is_*_array returns true on empty object dtype (#60796) --- pandas/_libs/lib.pyx | 36 +++++++++++++-------------- pandas/tests/dtypes/test_inference.py | 25 +++++++++++++++++++ pandas/tests/io/test_feather.py | 4 +-- 3 files changed, 45 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index de603beff7836..5239aa2c61dc5 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1882,7 +1882,7 @@ cdef class BoolValidator(Validator): cpdef bint is_bool_array(ndarray values, bint skipna=False): cdef: - BoolValidator validator = BoolValidator(len(values), + BoolValidator validator = BoolValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1900,7 +1900,7 @@ cdef class IntegerValidator(Validator): # Note: only python-exposed for tests cpdef bint is_integer_array(ndarray values, bint skipna=True): cdef: - IntegerValidator validator = IntegerValidator(len(values), + IntegerValidator validator = IntegerValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1915,7 +1915,7 @@ cdef class IntegerNaValidator(Validator): cdef bint is_integer_na_array(ndarray values, bint skipna=True): cdef: - IntegerNaValidator validator = IntegerNaValidator(len(values), + IntegerNaValidator validator = IntegerNaValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1931,7 +1931,7 @@ cdef class IntegerFloatValidator(Validator): cdef bint is_integer_float_array(ndarray values, bint skipna=True): cdef: - IntegerFloatValidator validator = IntegerFloatValidator(len(values), + IntegerFloatValidator validator = IntegerFloatValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1949,7 +1949,7 @@ cdef class FloatValidator(Validator): # Note: only python-exposed for tests cpdef bint is_float_array(ndarray values): cdef: - FloatValidator validator = FloatValidator(len(values), values.dtype) + FloatValidator validator = FloatValidator(values.size, values.dtype) return validator.validate(values) @@ -1967,7 +1967,7 @@ cdef class ComplexValidator(Validator): cdef bint is_complex_array(ndarray values): cdef: - ComplexValidator validator = ComplexValidator(len(values), values.dtype) + ComplexValidator validator = ComplexValidator(values.size, values.dtype) return validator.validate(values) @@ -1980,7 +1980,7 @@ cdef class DecimalValidator(Validator): cdef bint is_decimal_array(ndarray values, bint skipna=False): cdef: DecimalValidator validator = DecimalValidator( - len(values), values.dtype, skipna=skipna + values.size, values.dtype, skipna=skipna ) return validator.validate(values) @@ -1996,7 +1996,7 @@ cdef class StringValidator(Validator): cpdef bint is_string_array(ndarray values, bint skipna=False): cdef: - StringValidator validator = StringValidator(len(values), + StringValidator validator = 
StringValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -2013,7 +2013,7 @@ cdef class BytesValidator(Validator): cdef bint is_bytes_array(ndarray values, bint skipna=False): cdef: - BytesValidator validator = BytesValidator(len(values), values.dtype, + BytesValidator validator = BytesValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -2064,7 +2064,7 @@ cdef class DatetimeValidator(TemporalValidator): cpdef bint is_datetime_array(ndarray values, bint skipna=True): cdef: - DatetimeValidator validator = DatetimeValidator(len(values), + DatetimeValidator validator = DatetimeValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2078,7 +2078,7 @@ cdef class Datetime64Validator(DatetimeValidator): # Note: only python-exposed for tests cpdef bint is_datetime64_array(ndarray values, bint skipna=True): cdef: - Datetime64Validator validator = Datetime64Validator(len(values), + Datetime64Validator validator = Datetime64Validator(values.size, skipna=skipna) return validator.validate(values) @@ -2093,7 +2093,7 @@ cdef class AnyDatetimeValidator(DatetimeValidator): cdef bint is_datetime_or_datetime64_array(ndarray values, bint skipna=True): cdef: - AnyDatetimeValidator validator = AnyDatetimeValidator(len(values), + AnyDatetimeValidator validator = AnyDatetimeValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2105,7 +2105,7 @@ def is_datetime_with_singletz_array(values: ndarray) -> bool: Doesn't check values are datetime-like types. """ cdef: - Py_ssize_t i = 0, j, n = len(values) + Py_ssize_t i = 0, j, n = values.size object base_val, base_tz, val, tz if n == 0: @@ -2153,7 +2153,7 @@ cpdef bint is_timedelta_or_timedelta64_array(ndarray values, bint skipna=True): Infer with timedeltas and/or nat/none. """ cdef: - AnyTimedeltaValidator validator = AnyTimedeltaValidator(len(values), + AnyTimedeltaValidator validator = AnyTimedeltaValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2167,7 +2167,7 @@ cdef class DateValidator(Validator): # Note: only python-exposed for tests cpdef bint is_date_array(ndarray values, bint skipna=False): cdef: - DateValidator validator = DateValidator(len(values), skipna=skipna) + DateValidator validator = DateValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2180,7 +2180,7 @@ cdef class TimeValidator(Validator): # Note: only python-exposed for tests cpdef bint is_time_array(ndarray values, bint skipna=False): cdef: - TimeValidator validator = TimeValidator(len(values), skipna=skipna) + TimeValidator validator = TimeValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2231,14 +2231,14 @@ cpdef bint is_interval_array(ndarray values): Is this an ndarray of Interval (or np.nan) with a single dtype? 
""" cdef: - Py_ssize_t i, n = len(values) + Py_ssize_t i, n = values.size str closed = None bint numeric = False bint dt64 = False bint td64 = False object val - if len(values) == 0: + if n == 0: return False for i in range(n): diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index da444b55490f0..db98751324ebc 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1582,6 +1582,31 @@ def test_is_string_array(self): ) assert not lib.is_string_array(np.array([1, 2])) + @pytest.mark.parametrize( + "func", + [ + "is_bool_array", + "is_date_array", + "is_datetime_array", + "is_datetime64_array", + "is_float_array", + "is_integer_array", + "is_interval_array", + "is_string_array", + "is_time_array", + "is_timedelta_or_timedelta64_array", + ], + ) + def test_is_dtype_array_empty_obj(self, func): + # https://github.com/pandas-dev/pandas/pull/60796 + func = getattr(lib, func) + + arr = np.empty((2, 0), dtype=object) + assert not func(arr) + + arr = np.empty((0, 2), dtype=object) + assert not func(arr) + def test_to_object_array_tuples(self): r = (5, 6) values = [r] diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 24af0a014dd50..e778193c147c1 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -143,8 +143,8 @@ def test_rw_use_threads(self): def test_path_pathlib(self): df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ).reset_index() result = tm.round_trip_pathlib(df.to_feather, read_feather) tm.assert_frame_equal(df, result) From dec6eb29b35c884e78c82525e1bb30280208714c Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 28 Jan 2025 23:41:54 +0530 Subject: [PATCH 47/67] DOC: fix PR01,RT03,SA01 for pandas.core.resample.Resampler.transform (#60805) * DOC: fix PR01,RT03,SA01 for pandas.core.resample.Resampler.transform * DOC: fix PR01,RT03,SA01 for pandas.core.resample.Resampler.transform --- ci/code_checks.sh | 1 - pandas/core/resample.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 2d0fcce47d2a5..ee5b7eb4f09fb 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -82,7 +82,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ -i "pandas.core.resample.Resampler.quantile PR01,PR07" \ - -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.n GL08" \ diff --git a/pandas/core/resample.py b/pandas/core/resample.py index b1b8aef31d3c4..4b3b7a72b5a5c 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -378,10 +378,20 @@ def transform(self, arg, *args, **kwargs): ---------- arg : function To apply to each group. Should return a Series with the same index. + *args, **kwargs + Additional arguments and keywords. Returns ------- Series + A Series with the transformed values, maintaining the same index as + the original object. + + See Also + -------- + core.resample.Resampler.apply : Apply a function along each group. 
+ core.resample.Resampler.aggregate : Aggregate using one or more operations + over the specified axis. Examples -------- From c430c613e6c712a39d07146b8adb083d55943840 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Tue, 28 Jan 2025 19:17:22 -0500 Subject: [PATCH 48/67] TST(string_dtype): Refine scope of string xfail in test_http_headers (#60811) --- pandas/tests/io/test_http_headers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_http_headers.py b/pandas/tests/io/test_http_headers.py index b11fe931f46e5..3b9c8769ad9dc 100644 --- a/pandas/tests/io/test_http_headers.py +++ b/pandas/tests/io/test_http_headers.py @@ -86,7 +86,6 @@ def stata_responder(df): return bio.getvalue() -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "responder, read_method", [ @@ -107,6 +106,7 @@ def stata_responder(df): marks=[ td.skip_if_no("fastparquet"), td.skip_if_no("fsspec"), + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string"), ], ), (pickle_respnder, pd.read_pickle), From c36da3f6ded4141add4b3b16c252cedf4641e5ea Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Wed, 29 Jan 2025 17:07:04 -0500 Subject: [PATCH 49/67] ENH(string dtype): Make str.decode return str dtype (#60709) * TST(string dtype): Make str.decode return str dtype * Test fixups * pytables fixup * Simplify * whatsnew * fix implementation --- doc/source/whatsnew/v2.3.0.rst | 1 + pandas/core/strings/accessor.py | 10 +++++++--- pandas/io/pytables.py | 4 +++- pandas/io/sas/sas7bdat.py | 6 ++++++ pandas/tests/io/sas/test_sas7bdat.py | 16 ++++++---------- pandas/tests/strings/test_strings.py | 9 +++++---- 6 files changed, 28 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 108ee62d88409..8bdddb5b7f85d 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -35,6 +35,7 @@ Other enhancements - The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been updated to work correctly with NumPy >= 2 (:issue:`57739`) +- :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`) - :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`) - The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 5b35b5e393012..b854338c2d1d7 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -12,6 +12,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs import lib from pandas._typing import ( AlignJoin, @@ -400,7 +402,9 @@ def cons_row(x): # This is a mess. 
_dtype: DtypeObj | str | None = dtype vdtype = getattr(result, "dtype", None) - if self._is_string: + if _dtype is not None: + pass + elif self._is_string: if is_bool_dtype(vdtype): _dtype = result.dtype elif returns_string: @@ -2141,9 +2145,9 @@ def decode(self, encoding, errors: str = "strict"): decoder = codecs.getdecoder(encoding) f = lambda x: decoder(x, errors)[0] arr = self._data.array - # assert isinstance(arr, (StringArray,)) result = arr._str_map(f) - return self._wrap_result(result) + dtype = "str" if get_option("future.infer_string") else None + return self._wrap_result(result, dtype=dtype) @forbid_nonstring_types(["bytes"]) def encode(self, encoding, errors: str = "strict"): diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 2f8096746318b..e18db2e53113f 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -5233,7 +5233,9 @@ def _unconvert_string_array( dtype = f"U{itemsize}" if isinstance(data[0], bytes): - data = Series(data, copy=False).str.decode(encoding, errors=errors)._values + ser = Series(data, copy=False).str.decode(encoding, errors=errors) + data = ser.to_numpy() + data.flags.writeable = True else: data = data.astype(dtype, copy=False).astype(object, copy=False) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index c5aab4d967cd4..792af5ff713a3 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -22,6 +22,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs.byteswap import ( read_double_with_byteswap, read_float_with_byteswap, @@ -699,6 +701,7 @@ def _chunk_to_dataframe(self) -> DataFrame: rslt = {} js, jb = 0, 0 + infer_string = get_option("future.infer_string") for j in range(self.column_count): name = self.column_names[j] @@ -715,6 +718,9 @@ def _chunk_to_dataframe(self) -> DataFrame: rslt[name] = pd.Series(self._string_chunk[js, :], index=ix, copy=False) if self.convert_text and (self.encoding is not None): rslt[name] = self._decode_string(rslt[name].str) + if infer_string: + rslt[name] = rslt[name].astype("str") + js += 1 else: self.close() diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 3f5b73f4aa8a4..a17cd27f8284e 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat._constants import ( IS64, WASM, @@ -20,10 +18,6 @@ from pandas.io.sas.sas7bdat import SAS7BDATReader -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - @pytest.fixture def dirpath(datapath): @@ -246,11 +240,13 @@ def test_zero_variables(datapath): pd.read_sas(fname) -def test_zero_rows(datapath): +@pytest.mark.parametrize("encoding", [None, "utf8"]) +def test_zero_rows(datapath, encoding): # GH 18198 fname = datapath("io", "sas", "data", "zero_rows.sas7bdat") - result = pd.read_sas(fname) - expected = pd.DataFrame([{"char_field": "a", "num_field": 1.0}]).iloc[:0] + result = pd.read_sas(fname, encoding=encoding) + str_value = b"a" if encoding is None else "a" + expected = pd.DataFrame([{"char_field": str_value, "num_field": 1.0}]).iloc[:0] tm.assert_frame_equal(result, expected) @@ -409,7 +405,7 @@ def test_0x40_control_byte(datapath): fname = datapath("io", "sas", "data", "0x40controlbyte.sas7bdat") df = pd.read_sas(fname, encoding="ascii") fname = datapath("io", "sas", "data", "0x40controlbyte.csv") - df0 = pd.read_csv(fname, dtype="object") + df0 = 
pd.read_csv(fname, dtype="str") tm.assert_frame_equal(df, df0) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 0598e5f80e6d6..ee531b32aa82d 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -95,6 +95,7 @@ def test_repeat_with_null(any_string_dtype, arg, repeat): def test_empty_str_methods(any_string_dtype): empty_str = empty = Series(dtype=any_string_dtype) + empty_inferred_str = Series(dtype="str") if is_object_or_nan_string_dtype(any_string_dtype): empty_int = Series(dtype="int64") empty_bool = Series(dtype=bool) @@ -154,7 +155,7 @@ def test_empty_str_methods(any_string_dtype): tm.assert_series_equal(empty_str, empty.str.rstrip()) tm.assert_series_equal(empty_str, empty.str.wrap(42)) tm.assert_series_equal(empty_str, empty.str.get(0)) - tm.assert_series_equal(empty_object, empty_bytes.str.decode("ascii")) + tm.assert_series_equal(empty_inferred_str, empty_bytes.str.decode("ascii")) tm.assert_series_equal(empty_bytes, empty.str.encode("ascii")) # ismethods should always return boolean (GH 29624) tm.assert_series_equal(empty_bool, empty.str.isalnum()) @@ -566,7 +567,7 @@ def test_string_slice_out_of_bounds(any_string_dtype): def test_encode_decode(any_string_dtype): ser = Series(["a", "b", "a\xe4"], dtype=any_string_dtype).str.encode("utf-8") result = ser.str.decode("utf-8") - expected = ser.map(lambda x: x.decode("utf-8")).astype(object) + expected = Series(["a", "b", "a\xe4"], dtype="str") tm.assert_series_equal(result, expected) @@ -596,7 +597,7 @@ def test_decode_errors_kwarg(): ser.str.decode("cp1252") result = ser.str.decode("cp1252", "ignore") - expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype(object) + expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype("str") tm.assert_series_equal(result, expected) @@ -751,5 +752,5 @@ def test_get_with_dict_label(): def test_series_str_decode(): # GH 22613 result = Series([b"x", b"y"]).str.decode(encoding="UTF-8", errors="strict") - expected = Series(["x", "y"], dtype="object") + expected = Series(["x", "y"], dtype="str") tm.assert_series_equal(result, expected) From ea7ff0ea4606f47a672f75793f4ea2b3eb0b87f5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 30 Jan 2025 09:31:45 -0800 Subject: [PATCH 50/67] BUG(string): from_dummies, dropna (#60818) --- pandas/tests/frame/methods/test_dropna.py | 8 ++++---- pandas/tests/frame/test_arithmetic.py | 13 ++++++++++--- pandas/tests/reshape/test_from_dummies.py | 7 +++---- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/pandas/tests/frame/methods/test_dropna.py b/pandas/tests/frame/methods/test_dropna.py index 4a60dc09cfe07..d4f5629e6ba4b 100644 --- a/pandas/tests/frame/methods/test_dropna.py +++ b/pandas/tests/frame/methods/test_dropna.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -184,10 +182,12 @@ def test_dropna_multiple_axes(self): with pytest.raises(TypeError, match="supplying multiple axes"): inp.dropna(how="all", axis=(0, 1), inplace=True) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_dropna_tz_aware_datetime(self): + def test_dropna_tz_aware_datetime(self, using_infer_string): # GH13407 + df = DataFrame() + if using_infer_string: + df.columns = df.columns.astype("str") dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc()) dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc()) df["Time"] = 
[dt1] diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 7ada1884feb90..aa2d5e9d23815 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,7 +11,7 @@ import numpy as np import pytest -from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW import pandas as pd from pandas import ( @@ -2126,12 +2126,19 @@ def test_enum_column_equality(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_mixed_col_index_dtype(): +def test_mixed_col_index_dtype(using_infer_string): # GH 47382 df1 = DataFrame(columns=list("abc"), data=1.0, index=[0]) df2 = DataFrame(columns=list("abc"), data=0.0, index=[0]) df1.columns = df2.columns.astype("string") result = df1 + df2 expected = DataFrame(columns=list("abc"), data=1.0, index=[0]) + if using_infer_string: + # df2.columns.dtype will be "str" instead of object, + # so the aligned result will be "string", not object + if HAS_PYARROW: + dtype = "string[pyarrow]" + else: + dtype = "string" + expected.columns = expected.columns.astype(dtype) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index da1930323f464..c7b7992a78232 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Series, @@ -364,7 +362,6 @@ def test_with_prefix_contains_get_dummies_NaN_column(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "default_category, expected", [ @@ -401,12 +398,14 @@ def test_with_prefix_contains_get_dummies_NaN_column(): ], ) def test_with_prefix_default_category( - dummies_with_unassigned, default_category, expected + dummies_with_unassigned, default_category, expected, using_infer_string ): result = from_dummies( dummies_with_unassigned, sep="_", default_category=default_category ) expected = DataFrame(expected) + if using_infer_string: + expected = expected.astype("str") tm.assert_frame_equal(result, expected) From 9b03dd4d22550403b75d74f8b54b422bd31c55f2 Mon Sep 17 00:00:00 2001 From: 3w36zj6 <52315048+3w36zj6@users.noreply.github.com> Date: Sun, 2 Feb 2025 04:16:46 +0900 Subject: [PATCH 51/67] ENH: Add `Styler.to_typst()` (#60733) * ENH: Add `to_typst` method to `Styler` * TST: Add `Styler.to_typst()` test cases * STY: Apply Ruff suggestions * DOC: Update What's new * DOC: Update reference * CI: Add `Styler.template_typst` to validation ignore list * DOC: Update docstring format for `Styler.to_typst()` example * DOC: Update versionadded for `Styler.to_typst()` to 3.0.0 in documentation --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/reference/style.rst | 2 + doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/formats/style.py | 105 ++++++++++++++++++ pandas/io/formats/style_render.py | 16 +++ pandas/io/formats/templates/typst.tpl | 12 ++ .../tests/io/formats/style/test_to_typst.py | 96 ++++++++++++++++ scripts/validate_docstrings.py | 1 + 7 files changed, 233 insertions(+) create mode 100644 pandas/io/formats/templates/typst.tpl create mode 100644 pandas/tests/io/formats/style/test_to_typst.py diff --git a/doc/source/reference/style.rst 
b/doc/source/reference/style.rst index 0e1d93841d52f..742263c788c2f 100644 --- a/doc/source/reference/style.rst +++ b/doc/source/reference/style.rst @@ -27,6 +27,7 @@ Styler properties Styler.template_html_style Styler.template_html_table Styler.template_latex + Styler.template_typst Styler.template_string Styler.loader @@ -77,6 +78,7 @@ Style export and import Styler.to_html Styler.to_latex + Styler.to_typst Styler.to_excel Styler.to_string Styler.export diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a7f63d75a047e..64f4a66a109f5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -31,6 +31,7 @@ Other enhancements - :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`) - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) - :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface `_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`) +- Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`) - :class:`pandas.api.typing.NoDefault` is available for typing ``no_default`` - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 6f164c4b97514..3f37556867954 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -1228,6 +1228,111 @@ def to_latex( ) return save_to_buffer(latex, buf=buf, encoding=encoding) + @overload + def to_typst( + self, + buf: FilePath | WriteBuffer[str], + *, + encoding: str | None = ..., + sparse_index: bool | None = ..., + sparse_columns: bool | None = ..., + max_rows: int | None = ..., + max_columns: int | None = ..., + ) -> None: ... + + @overload + def to_typst( + self, + buf: None = ..., + *, + encoding: str | None = ..., + sparse_index: bool | None = ..., + sparse_columns: bool | None = ..., + max_rows: int | None = ..., + max_columns: int | None = ..., + ) -> str: ... + + @Substitution(buf=buffering_args, encoding=encoding_args) + def to_typst( + self, + buf: FilePath | WriteBuffer[str] | None = None, + *, + encoding: str | None = None, + sparse_index: bool | None = None, + sparse_columns: bool | None = None, + max_rows: int | None = None, + max_columns: int | None = None, + ) -> str | None: + """ + Write Styler to a file, buffer or string in Typst format. + + .. versionadded:: 3.0.0 + + Parameters + ---------- + %(buf)s + %(encoding)s + sparse_index : bool, optional + Whether to sparsify the display of a hierarchical index. Setting to False + will display each explicit level element in a hierarchical key for each row. + Defaults to ``pandas.options.styler.sparse.index`` value. + sparse_columns : bool, optional + Whether to sparsify the display of a hierarchical index. Setting to False + will display each explicit level element in a hierarchical key for each + column. Defaults to ``pandas.options.styler.sparse.columns`` value. + max_rows : int, optional + The maximum number of rows that will be rendered. Defaults to + ``pandas.options.styler.render.max_rows``, which is None. 
+ max_columns : int, optional + The maximum number of columns that will be rendered. Defaults to + ``pandas.options.styler.render.max_columns``, which is None. + + Rows and columns may be reduced if the number of total elements is + large. This value is set to ``pandas.options.styler.render.max_elements``, + which is 262144 (18 bit browser rendering). + + Returns + ------- + str or None + If `buf` is None, returns the result as a string. Otherwise returns `None`. + + See Also + -------- + DataFrame.to_typst : Write a DataFrame to a file, + buffer or string in Typst format. + + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) + >>> df.style.to_typst() # doctest: +SKIP + + .. code-block:: typst + + #table( + columns: 3, + [], [A], [B], + + [0], [1], [3], + [1], [2], [4], + ) + """ + obj = self._copy(deepcopy=True) + + if sparse_index is None: + sparse_index = get_option("styler.sparse.index") + if sparse_columns is None: + sparse_columns = get_option("styler.sparse.columns") + + text = obj._render_typst( + sparse_columns=sparse_columns, + sparse_index=sparse_index, + max_rows=max_rows, + max_cols=max_columns, + ) + return save_to_buffer( + text, buf=buf, encoding=(encoding if buf is not None else None) + ) + @overload def to_html( self, diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index c0f0608f1ab32..2d1218b007d19 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -77,6 +77,7 @@ class StylerRenderer: template_html_table = env.get_template("html_table.tpl") template_html_style = env.get_template("html_style.tpl") template_latex = env.get_template("latex.tpl") + template_typst = env.get_template("typst.tpl") template_string = env.get_template("string.tpl") def __init__( @@ -232,6 +233,21 @@ def _render_latex( d.update(kwargs) return self.template_latex.render(**d) + def _render_typst( + self, + sparse_index: bool, + sparse_columns: bool, + max_rows: int | None = None, + max_cols: int | None = None, + **kwargs, + ) -> str: + """ + Render a Styler in typst format + """ + d = self._render(sparse_index, sparse_columns, max_rows, max_cols) + d.update(kwargs) + return self.template_typst.render(**d) + def _render_string( self, sparse_index: bool, diff --git a/pandas/io/formats/templates/typst.tpl b/pandas/io/formats/templates/typst.tpl new file mode 100644 index 0000000000000..66de8f31b405e --- /dev/null +++ b/pandas/io/formats/templates/typst.tpl @@ -0,0 +1,12 @@ +#table( + columns: {{ head[0] | length }}, +{% for r in head %} + {% for c in r %}[{% if c["is_visible"] %}{{ c["display_value"] }}{% endif %}],{% if not loop.last %} {% endif%}{% endfor %} + +{% endfor %} + +{% for r in body %} + {% for c in r %}[{% if c["is_visible"] %}{{ c["display_value"] }}{% endif %}],{% if not loop.last %} {% endif%}{% endfor %} + +{% endfor %} +) diff --git a/pandas/tests/io/formats/style/test_to_typst.py b/pandas/tests/io/formats/style/test_to_typst.py new file mode 100644 index 0000000000000..2365119c9c4dc --- /dev/null +++ b/pandas/tests/io/formats/style/test_to_typst.py @@ -0,0 +1,96 @@ +from textwrap import dedent + +import pytest + +from pandas import ( + DataFrame, + Series, +) + +pytest.importorskip("jinja2") +from pandas.io.formats.style import Styler + + +@pytest.fixture +def df(): + return DataFrame( + {"A": [0, 1], "B": [-0.61, -1.22], "C": Series(["ab", "cd"], dtype=object)} + ) + + +@pytest.fixture +def styler(df): + return Styler(df, uuid_len=0, precision=2) + + +def test_basic_table(styler): + 
result = styler.to_typst() + expected = dedent( + """\ + #table( + columns: 4, + [], [A], [B], [C], + + [0], [0], [-0.61], [ab], + [1], [1], [-1.22], [cd], + )""" + ) + assert result == expected + + +def test_concat(styler): + result = styler.concat(styler.data.agg(["sum"]).style).to_typst() + expected = dedent( + """\ + #table( + columns: 4, + [], [A], [B], [C], + + [0], [0], [-0.61], [ab], + [1], [1], [-1.22], [cd], + [sum], [1], [-1.830000], [abcd], + )""" + ) + assert result == expected + + +def test_concat_recursion(styler): + df = styler.data + styler1 = styler + styler2 = Styler(df.agg(["sum"]), uuid_len=0, precision=3) + styler3 = Styler(df.agg(["sum"]), uuid_len=0, precision=4) + result = styler1.concat(styler2.concat(styler3)).to_typst() + expected = dedent( + """\ + #table( + columns: 4, + [], [A], [B], [C], + + [0], [0], [-0.61], [ab], + [1], [1], [-1.22], [cd], + [sum], [1], [-1.830], [abcd], + [sum], [1], [-1.8300], [abcd], + )""" + ) + assert result == expected + + +def test_concat_chain(styler): + df = styler.data + styler1 = styler + styler2 = Styler(df.agg(["sum"]), uuid_len=0, precision=3) + styler3 = Styler(df.agg(["sum"]), uuid_len=0, precision=4) + result = styler1.concat(styler2).concat(styler3).to_typst() + expected = dedent( + """\ + #table( + columns: 4, + [], [A], [B], [C], + + [0], [0], [-0.61], [ab], + [1], [1], [-1.22], [cd], + [sum], [1], [-1.830], [abcd], + [sum], [1], [-1.8300], [abcd], + )""" + ) + assert result == expected diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 55acfaac4d843..944575dcc8659 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -45,6 +45,7 @@ "Styler.template_html_style", "Styler.template_html_table", "Styler.template_latex", + "Styler.template_typst", "Styler.template_string", "Styler.loader", "errors.InvalidComparison", From d72f165eb327898b1597efe75ff8b54032c3ae7b Mon Sep 17 00:00:00 2001 From: "Christine P. Chai" Date: Sat, 1 Feb 2025 11:18:25 -0800 Subject: [PATCH 52/67] DOC: Move NumPy Byte Order page in gotchas.rst (#60822) --- doc/source/user_guide/gotchas.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index 842f30f06676e..e85eead4e0f09 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -372,5 +372,5 @@ constructors using something similar to the following: s = pd.Series(newx) See `the NumPy documentation on byte order -`__ for more +`__ for more details. 
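For context, the gotchas.rst hunk above only shows the tail of the byte-order example; the full pattern the page describes looks roughly like the following sketch (the ``x``/``newx`` names follow the doc, the dtype and sample values here are illustrative):

    import numpy as np
    import pandas as pd

    # Data read from legacy binary formats may arrive big-endian; pandas
    # constructors expect the platform's native byte order.
    x = np.array(list(range(10)), dtype=">i4")  # big-endian int32

    # Swap the raw bytes, then reinterpret the buffer as native-endian.
    newx = x.byteswap().view(x.dtype.newbyteorder())

    s = pd.Series(newx)  # accepted; the values are unchanged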
From f1441b218271178ebe18acecc3657f6549fb6c54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quang=20Nguy=E1=BB=85n?= <30631476+quangngd@users.noreply.github.com> Date: Mon, 3 Feb 2025 03:12:30 +0700 Subject: [PATCH 53/67] CHORE: Enable mistakenly ignored tests (#60827) Enable ignored tests --- pandas/tests/io/formats/test_to_string.py | 25 ++++++++++------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py index af3cdf2d44af3..1e8598c918efe 100644 --- a/pandas/tests/io/formats/test_to_string.py +++ b/pandas/tests/io/formats/test_to_string.py @@ -132,20 +132,17 @@ def test_to_string_with_formatters_unicode(self): ) assert result == expected - def test_to_string_index_formatter(self): - df = DataFrame([range(5), range(5, 10), range(10, 15)]) - - rs = df.to_string(formatters={"__index__": lambda x: "abc"[x]}) - - xp = dedent( - """\ - 0 1 2 3 4 - a 0 1 2 3 4 - b 5 6 7 8 9 - c 10 11 12 13 14\ - """ - ) - assert rs == xp + def test_to_string_index_formatter(self): + df = DataFrame([range(5), range(5, 10), range(10, 15)]) + rs = df.to_string(formatters={"__index__": lambda x: "abc"[x]}) + xp = dedent( + """\ + 0 1 2 3 4 + a 0 1 2 3 4 + b 5 6 7 8 9 + c 10 11 12 13 14""" + ) + assert rs == xp def test_no_extra_space(self): # GH#52690: Check that no extra space is given From a68048ea026f09fc56e1a9963c489ff0beaae651 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Mon, 3 Feb 2025 06:23:23 -0800 Subject: [PATCH 54/67] ENH: Support skipna parameter in GroupBy min, max, prod, median, var, std and sem methods (#60752) --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/_libs/groupby.pyi | 5 + pandas/_libs/groupby.pyx | 99 ++++++++++--- pandas/core/_numba/kernels/min_max_.py | 8 +- pandas/core/_numba/kernels/var_.py | 7 +- pandas/core/groupby/groupby.py | 76 ++++++++-- pandas/core/resample.py | 98 ++++++++++++- pandas/tests/groupby/aggregate/test_numba.py | 2 +- pandas/tests/groupby/test_api.py | 18 +-- pandas/tests/groupby/test_reductions.py | 141 +++++++++++++++++++ 10 files changed, 405 insertions(+), 51 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 64f4a66a109f5..9089b9cdd2185 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -59,9 +59,9 @@ Other enhancements - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) +- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`) - :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`) - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) -- :meth:`.DataFrameGroupBy.mean`, :meth:`.DataFrameGroupBy.sum`, :meth:`.SeriesGroupBy.mean` and :meth:`.SeriesGroupBy.sum` now accept ``skipna`` 
parameter (:issue:`15675`) - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index e3909203d1f5a..163fc23535022 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -13,6 +13,7 @@ def group_median_float64( mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., is_datetimelike: bool = ..., # bint + skipna: bool = ..., ) -> None: ... def group_cumprod( out: np.ndarray, # float64_t[:, ::1] @@ -76,6 +77,7 @@ def group_prod( mask: np.ndarray | None, result_mask: np.ndarray | None = ..., min_count: int = ..., + skipna: bool = ..., ) -> None: ... def group_var( out: np.ndarray, # floating[:, ::1] @@ -88,6 +90,7 @@ def group_var( result_mask: np.ndarray | None = ..., is_datetimelike: bool = ..., name: str = ..., + skipna: bool = ..., ) -> None: ... def group_skew( out: np.ndarray, # float64_t[:, ::1] @@ -183,6 +186,7 @@ def group_max( is_datetimelike: bool = ..., mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., + skipna: bool = ..., ) -> None: ... def group_min( out: np.ndarray, # groupby_t[:, ::1] @@ -193,6 +197,7 @@ def group_min( is_datetimelike: bool = ..., mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., + skipna: bool = ..., ) -> None: ... 
def group_idxmin_idxmax( out: npt.NDArray[np.intp], diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 70af22f514ce0..16a104a46ed3d 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -62,7 +62,12 @@ cdef enum InterpolationEnumType: INTERPOLATION_MIDPOINT -cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept nogil: +cdef float64_t median_linear_mask( + float64_t* a, + int n, + uint8_t* mask, + bint skipna=True +) noexcept nogil: cdef: int i, j, na_count = 0 float64_t* tmp @@ -77,7 +82,7 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n na_count += 1 if na_count: - if na_count == n: + if na_count == n or not skipna: return NaN tmp = malloc((n - na_count) * sizeof(float64_t)) @@ -104,7 +109,8 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n cdef float64_t median_linear( float64_t* a, int n, - bint is_datetimelike=False + bint is_datetimelike=False, + bint skipna=True, ) noexcept nogil: cdef: int i, j, na_count = 0 @@ -125,7 +131,7 @@ cdef float64_t median_linear( na_count += 1 if na_count: - if na_count == n: + if na_count == n or not skipna: return NaN tmp = malloc((n - na_count) * sizeof(float64_t)) @@ -186,6 +192,7 @@ def group_median_float64( const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 @@ -229,7 +236,7 @@ def group_median_float64( for j in range(ngroups): size = _counts[j + 1] - result = median_linear_mask(ptr, size, ptr_mask) + result = median_linear_mask(ptr, size, ptr_mask, skipna) out[j, i] = result if result != result: @@ -244,7 +251,7 @@ def group_median_float64( ptr += _counts[0] for j in range(ngroups): size = _counts[j + 1] - out[j, i] = median_linear(ptr, size, is_datetimelike) + out[j, i] = median_linear(ptr, size, is_datetimelike, skipna) ptr += size @@ -804,17 +811,18 @@ def group_prod( const uint8_t[:, ::1] mask, uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=0, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - int64float_t val + int64float_t val, nan_val int64float_t[:, ::1] prodx int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) - bint isna_entry, uses_mask = mask is not None + bint isna_entry, isna_result, uses_mask = mask is not None if len_values != len_labels: raise ValueError("len(index) != len(labels)") @@ -823,6 +831,7 @@ def group_prod( prodx = np.ones((out).shape, dtype=(out).base.dtype) N, K = (values).shape + nan_val = _get_na_val(0, False) with nogil: for i in range(N): @@ -836,12 +845,23 @@ def group_prod( if uses_mask: isna_entry = mask[i, j] + isna_result = result_mask[lab, j] else: isna_entry = _treat_as_na(val, False) + isna_result = _treat_as_na(prodx[lab, j], False) + + if not skipna and isna_result: + # If prod is already NA, no need to update it + continue if not isna_entry: nobs[lab, j] += 1 prodx[lab, j] *= val + elif not skipna: + if uses_mask: + result_mask[lab, j] = True + else: + prodx[lab, j] = nan_val _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx @@ -862,6 +882,7 @@ def group_var( uint8_t[:, ::1] result_mask=None, bint is_datetimelike=False, str name="var", + bint skipna=True, ) -> None: cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) @@ -869,7 +890,7 @@ def group_var( floating[:, ::1] mean int64_t[:, ::1] nobs Py_ssize_t len_values = 
len(values), len_labels = len(labels) - bint isna_entry, uses_mask = mask is not None + bint isna_entry, isna_result, uses_mask = mask is not None bint is_std = name == "std" bint is_sem = name == "sem" @@ -898,19 +919,34 @@ def group_var( if uses_mask: isna_entry = mask[i, j] + isna_result = result_mask[lab, j] elif is_datetimelike: # With group_var, we cannot just use _treat_as_na bc # datetimelike dtypes get cast to float64 instead of # to int64. isna_entry = val == NPY_NAT + isna_result = out[lab, j] == NPY_NAT else: isna_entry = _treat_as_na(val, is_datetimelike) + isna_result = _treat_as_na(out[lab, j], is_datetimelike) + + if not skipna and isna_result: + # If aggregate is already NA, don't add to it. This is important for + # datetimelike because adding a value to NPY_NAT may not result + # in a NPY_NAT + continue if not isna_entry: nobs[lab, j] += 1 oldmean = mean[lab, j] mean[lab, j] += (val - oldmean) / nobs[lab, j] out[lab, j] += (val - mean[lab, j]) * (val - oldmean) + elif not skipna: + nobs[lab, j] = 0 + if uses_mask: + result_mask[lab, j] = True + else: + out[lab, j] = NAN for i in range(ncounts): for j in range(K): @@ -1164,7 +1200,7 @@ def group_mean( mean_t[:, ::1] sumx, compensation int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) - bint isna_entry, uses_mask = mask is not None + bint isna_entry, isna_result, uses_mask = mask is not None assert min_count == -1, "'min_count' only used in sum and prod" @@ -1194,25 +1230,24 @@ def group_mean( for j in range(K): val = values[i, j] - if not skipna and ( - (uses_mask and result_mask[lab, j]) or - (is_datetimelike and sumx[lab, j] == NPY_NAT) or - _treat_as_na(sumx[lab, j], False) - ): - # If sum is already NA, don't add to it. This is important for - # datetimelike because adding a value to NPY_NAT may not result - # in NPY_NAT - continue - if uses_mask: isna_entry = mask[i, j] + isna_result = result_mask[lab, j] elif is_datetimelike: # With group_mean, we cannot just use _treat_as_na bc # datetimelike dtypes get cast to float64 instead of # to int64. isna_entry = val == NPY_NAT + isna_result = sumx[lab, j] == NPY_NAT else: isna_entry = _treat_as_na(val, is_datetimelike) + isna_result = _treat_as_na(sumx[lab, j], is_datetimelike) + + if not skipna and isna_result: + # If sum is already NA, don't add to it. This is important for + # datetimelike because adding a value to NPY_NAT may not result + # in NPY_NAT + continue if not isna_entry: nobs[lab, j] += 1 @@ -1806,6 +1841,7 @@ cdef group_min_max( bint compute_max=True, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, + bint skipna=True, ): """ Compute minimum/maximum of columns of `values`, in row groups `labels`. @@ -1833,6 +1869,8 @@ cdef group_min_max( result_mask : ndarray[bool, ndim=2], optional If not None, these specify locations in the output that are NA. Modified in-place. + skipna : bool, default True + If True, ignore nans in `values`. 
Notes ----- @@ -1841,17 +1879,18 @@ cdef group_min_max( """ cdef: Py_ssize_t i, j, N, K, lab, ngroups = len(counts) - numeric_t val + numeric_t val, nan_val numeric_t[:, ::1] group_min_or_max int64_t[:, ::1] nobs bint uses_mask = mask is not None - bint isna_entry + bint isna_entry, isna_result if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) + nan_val = _get_na_val(0, is_datetimelike) group_min_or_max = np.empty_like(out) group_min_or_max[:] = _get_min_or_max(0, compute_max, is_datetimelike) @@ -1870,8 +1909,15 @@ cdef group_min_max( if uses_mask: isna_entry = mask[i, j] + isna_result = result_mask[lab, j] else: isna_entry = _treat_as_na(val, is_datetimelike) + isna_result = _treat_as_na(group_min_or_max[lab, j], + is_datetimelike) + + if not skipna and isna_result: + # If current min/max is already NA, it will always be NA + continue if not isna_entry: nobs[lab, j] += 1 @@ -1881,6 +1927,11 @@ cdef group_min_max( else: if val < group_min_or_max[lab, j]: group_min_or_max[lab, j] = val + elif not skipna: + if uses_mask: + result_mask[lab, j] = True + else: + group_min_or_max[lab, j] = nan_val _check_below_mincount( out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max @@ -2012,6 +2063,7 @@ def group_max( bint is_datetimelike=False, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, + bint skipna=True, ) -> None: """See group_min_max.__doc__""" group_min_max( @@ -2024,6 +2076,7 @@ def group_max( compute_max=True, mask=mask, result_mask=result_mask, + skipna=skipna, ) @@ -2038,6 +2091,7 @@ def group_min( bint is_datetimelike=False, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, + bint skipna=True, ) -> None: """See group_min_max.__doc__""" group_min_max( @@ -2050,6 +2104,7 @@ def group_min( compute_max=False, mask=mask, result_mask=result_mask, + skipna=skipna, ) diff --git a/pandas/core/_numba/kernels/min_max_.py b/pandas/core/_numba/kernels/min_max_.py index 59d36732ebae6..d56453e4e5abf 100644 --- a/pandas/core/_numba/kernels/min_max_.py +++ b/pandas/core/_numba/kernels/min_max_.py @@ -88,6 +88,7 @@ def grouped_min_max( ngroups: int, min_periods: int, is_max: bool, + skipna: bool = True, ) -> tuple[np.ndarray, list[int]]: N = len(labels) nobs = np.zeros(ngroups, dtype=np.int64) @@ -97,13 +98,16 @@ def grouped_min_max( for i in range(N): lab = labels[i] val = values[i] - if lab < 0: + if lab < 0 or (nobs[lab] >= 1 and np.isnan(output[lab])): continue if values.dtype.kind == "i" or not np.isnan(val): nobs[lab] += 1 else: - # NaN value cannot be a min/max value + if not skipna: + # If skipna is False and we encounter a NaN, + # both min and max of the group will be NaN + output[lab] = np.nan continue if nobs[lab] == 1: diff --git a/pandas/core/_numba/kernels/var_.py b/pandas/core/_numba/kernels/var_.py index 69aec4d6522c4..5d720c877815d 100644 --- a/pandas/core/_numba/kernels/var_.py +++ b/pandas/core/_numba/kernels/var_.py @@ -176,6 +176,7 @@ def grouped_var( ngroups: int, min_periods: int, ddof: int = 1, + skipna: bool = True, ) -> tuple[np.ndarray, list[int]]: N = len(labels) @@ -190,7 +191,11 @@ def grouped_var( lab = labels[i] val = values[i] - if lab < 0: + if lab < 0 or np.isnan(output[lab]): + continue + + if not skipna and np.isnan(val): + output[lab] = np.nan continue mean_x = means[lab] diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f9059e6e8896f..7c3088bea4b76 100644 --- 
a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2248,7 +2248,7 @@ def mean( return result.__finalize__(self.obj, method="groupby") @final - def median(self, numeric_only: bool = False) -> NDFrameT: + def median(self, numeric_only: bool = False, skipna: bool = True) -> NDFrameT: """ Compute median of groups, excluding missing values. @@ -2263,6 +2263,12 @@ def median(self, numeric_only: bool = False) -> NDFrameT: numeric_only no longer accepts ``None`` and defaults to False. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 3.0.0 + Returns ------- Series or DataFrame @@ -2335,8 +2341,11 @@ def median(self, numeric_only: bool = False) -> NDFrameT: """ result = self._cython_agg_general( "median", - alt=lambda x: Series(x, copy=False).median(numeric_only=numeric_only), + alt=lambda x: Series(x, copy=False).median( + numeric_only=numeric_only, skipna=skipna + ), numeric_only=numeric_only, + skipna=skipna, ) return result.__finalize__(self.obj, method="groupby") @@ -2349,6 +2358,7 @@ def std( engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, numeric_only: bool = False, + skipna: bool = True, ): """ Compute standard deviation of groups, excluding missing values. @@ -2387,6 +2397,12 @@ def std( numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 3.0.0 + Returns ------- Series or DataFrame @@ -2441,14 +2457,16 @@ def std( engine_kwargs, min_periods=0, ddof=ddof, + skipna=skipna, ) ) else: return self._cython_agg_general( "std", - alt=lambda x: Series(x, copy=False).std(ddof=ddof), + alt=lambda x: Series(x, copy=False).std(ddof=ddof, skipna=skipna), numeric_only=numeric_only, ddof=ddof, + skipna=skipna, ) @final @@ -2460,6 +2478,7 @@ def var( engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, numeric_only: bool = False, + skipna: bool = True, ): """ Compute variance of groups, excluding missing values. @@ -2497,6 +2516,12 @@ def var( numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 3.0.0 + Returns ------- Series or DataFrame @@ -2550,13 +2575,15 @@ def var( engine_kwargs, min_periods=0, ddof=ddof, + skipna=skipna, ) else: return self._cython_agg_general( "var", - alt=lambda x: Series(x, copy=False).var(ddof=ddof), + alt=lambda x: Series(x, copy=False).var(ddof=ddof, skipna=skipna), numeric_only=numeric_only, ddof=ddof, + skipna=skipna, ) @final @@ -2686,7 +2713,9 @@ def _value_counts( return result.__finalize__(self.obj, method="value_counts") @final - def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: + def sem( + self, ddof: int = 1, numeric_only: bool = False, skipna: bool = True + ) -> NDFrameT: """ Compute standard error of the mean of groups, excluding missing values. @@ -2706,6 +2735,12 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. 
versionadded:: 3.0.0 + Returns ------- Series or DataFrame @@ -2780,9 +2815,10 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: ) return self._cython_agg_general( "sem", - alt=lambda x: Series(x, copy=False).sem(ddof=ddof), + alt=lambda x: Series(x, copy=False).sem(ddof=ddof, skipna=skipna), numeric_only=numeric_only, ddof=ddof, + skipna=skipna, ) @final @@ -2959,7 +2995,9 @@ def sum( return result @final - def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: + def prod( + self, numeric_only: bool = False, min_count: int = 0, skipna: bool = True + ) -> NDFrameT: """ Compute prod of group values. @@ -2976,6 +3014,12 @@ def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: The required number of valid values to perform the operation. If fewer than ``min_count`` non-NA values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 3.0.0 + Returns ------- Series or DataFrame @@ -3024,17 +3068,22 @@ def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: 2 30 72 """ return self._agg_general( - numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod + numeric_only=numeric_only, + min_count=min_count, + skipna=skipna, + alias="prod", + npfunc=np.prod, ) @final @doc( - _groupby_agg_method_engine_template, + _groupby_agg_method_skipna_engine_template, fname="min", no=False, mc=-1, e=None, ek=None, + s=True, example=dedent( """\ For SeriesGroupBy: @@ -3074,6 +3123,7 @@ def min( self, numeric_only: bool = False, min_count: int = -1, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -3086,23 +3136,26 @@ def min( engine_kwargs, min_periods=min_count, is_max=False, + skipna=skipna, ) else: return self._agg_general( numeric_only=numeric_only, min_count=min_count, + skipna=skipna, alias="min", npfunc=np.min, ) @final @doc( - _groupby_agg_method_engine_template, + _groupby_agg_method_skipna_engine_template, fname="max", no=False, mc=-1, e=None, ek=None, + s=True, example=dedent( """\ For SeriesGroupBy: @@ -3142,6 +3195,7 @@ def max( self, numeric_only: bool = False, min_count: int = -1, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -3154,11 +3208,13 @@ def max( engine_kwargs, min_periods=min_count, is_max=True, + skipna=skipna, ) else: return self._agg_general( numeric_only=numeric_only, min_count=min_count, + skipna=skipna, alias="max", npfunc=np.max, ) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 4b3b7a72b5a5c..1cfc75ea11725 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1269,8 +1269,53 @@ def last( ) @final - @doc(GroupBy.median) def median(self, numeric_only: bool = False): + """ + Compute median of groups, excluding missing values. + + For multiple groupings, the result index will be a MultiIndex + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None`` and defaults to False. + + Returns + ------- + Series or DataFrame + Median of values within each group. + + See Also + -------- + Series.groupby : Apply a function groupby to a Series. + DataFrame.groupby : Apply a function groupby to each row or column of a + DataFrame. 
+ + Examples + -------- + + >>> ser = pd.Series( + ... [1, 2, 3, 3, 4, 5], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-10", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-10", + ... "2023-02-15", + ... ] + ... ), + ... ) + >>> ser.resample("MS").median() + 2023-01-01 2.0 + 2023-02-01 4.0 + Freq: MS, dtype: float64 + """ return self._downsample("median", numeric_only=numeric_only) @final @@ -1450,12 +1495,61 @@ def var( return self._downsample("var", ddof=ddof, numeric_only=numeric_only) @final - @doc(GroupBy.sem) def sem( self, ddof: int = 1, numeric_only: bool = False, ): + """ + Compute standard error of the mean of groups, excluding missing values. + + For multiple groupings, the result index will be a MultiIndex. + + Parameters + ---------- + ddof : int, default 1 + Degrees of freedom. + + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. + + Returns + ------- + Series or DataFrame + Standard error of the mean of values within each group. + + See Also + -------- + DataFrame.sem : Return unbiased standard error of the mean over requested axis. + Series.sem : Return unbiased standard error of the mean over requested axis. + + Examples + -------- + + >>> ser = pd.Series( + ... [1, 3, 2, 4, 3, 8], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-10", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-10", + ... "2023-02-15", + ... ] + ... ), + ... ) + >>> ser.resample("MS").sem() + 2023-01-01 0.577350 + 2023-02-01 1.527525 + Freq: MS, dtype: float64 + """ return self._downsample("sem", ddof=ddof, numeric_only=numeric_only) @final diff --git a/pandas/tests/groupby/aggregate/test_numba.py b/pandas/tests/groupby/aggregate/test_numba.py index ca265a1d1108b..0cd8a14d97eb0 100644 --- a/pandas/tests/groupby/aggregate/test_numba.py +++ b/pandas/tests/groupby/aggregate/test_numba.py @@ -186,7 +186,7 @@ def test_multifunc_numba_vs_cython_frame(agg_kwargs): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("func", ["sum", "mean"]) +@pytest.mark.parametrize("func", ["sum", "mean", "var", "std", "min", "max"]) def test_multifunc_numba_vs_cython_frame_noskipna(func): pytest.importorskip("numba") data = DataFrame( diff --git a/pandas/tests/groupby/test_api.py b/pandas/tests/groupby/test_api.py index cc69de2581a79..215e627abb018 100644 --- a/pandas/tests/groupby/test_api.py +++ b/pandas/tests/groupby/test_api.py @@ -174,16 +174,13 @@ def test_frame_consistency(groupby_func): elif groupby_func in ("nunique",): exclude_expected = {"axis"} elif groupby_func in ("max", "min"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} exclude_result = {"min_count", "engine", "engine_kwargs"} - elif groupby_func in ("sum", "mean"): + elif groupby_func in ("sum", "mean", "std", "var"): exclude_expected = {"axis", "kwargs"} exclude_result = {"engine", "engine_kwargs"} - elif groupby_func in ("std", "var"): - exclude_expected = {"axis", "kwargs", "skipna"} - exclude_result = {"engine", "engine_kwargs"} elif groupby_func in ("median", "prod", "sem"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} elif groupby_func in ("bfill", "ffill"): exclude_expected = {"inplace", "axis", "limit_area"} elif groupby_func in ("cummax", "cummin"): @@ -235,16 +232,13 @@ def test_series_consistency(request, groupby_func): if groupby_func in 
("any", "all"): exclude_expected = {"kwargs", "bool_only", "axis"} elif groupby_func in ("max", "min"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} exclude_result = {"min_count", "engine", "engine_kwargs"} - elif groupby_func in ("sum", "mean"): + elif groupby_func in ("sum", "mean", "std", "var"): exclude_expected = {"axis", "kwargs"} exclude_result = {"engine", "engine_kwargs"} - elif groupby_func in ("std", "var"): - exclude_expected = {"axis", "kwargs", "skipna"} - exclude_result = {"engine", "engine_kwargs"} elif groupby_func in ("median", "prod", "sem"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} elif groupby_func in ("bfill", "ffill"): exclude_expected = {"inplace", "axis", "limit_area"} elif groupby_func in ("cummax", "cummin"): diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 1db12f05e821f..ea876cfdf4933 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -514,6 +514,147 @@ def test_sum_skipna_object(skipna): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "func, values, dtype, result_dtype", + [ + ("prod", [0, 1, 3, np.nan, 4, 5, 6, 7, -8, 9], "float64", "float64"), + ("prod", [0, -1, 3, 4, 5, np.nan, 6, 7, 8, 9], "Float64", "Float64"), + ("prod", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Int64", "Int64"), + ("prod", [np.nan] * 10, "float64", "float64"), + ("prod", [np.nan] * 10, "Float64", "Float64"), + ("prod", [np.nan] * 10, "Int64", "Int64"), + ("var", [0, -1, 3, 4, np.nan, 5, 6, 7, 8, 9], "float64", "float64"), + ("var", [0, 1, 3, -4, 5, 6, 7, -8, 9, np.nan], "Float64", "Float64"), + ("var", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "Int64", "Float64"), + ("var", [np.nan] * 10, "float64", "float64"), + ("var", [np.nan] * 10, "Float64", "Float64"), + ("var", [np.nan] * 10, "Int64", "Float64"), + ("std", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "float64", "float64"), + ("std", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "Float64", "Float64"), + ("std", [0, 1, 3, -4, 5, 6, 7, -8, 9, np.nan], "Int64", "Float64"), + ("std", [np.nan] * 10, "float64", "float64"), + ("std", [np.nan] * 10, "Float64", "Float64"), + ("std", [np.nan] * 10, "Int64", "Float64"), + ("sem", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"), + ("sem", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Float64", "Float64"), + ("sem", [0, -1, 3, 4, 5, -6, 7, 8, 9, np.nan], "Int64", "Float64"), + ("sem", [np.nan] * 10, "float64", "float64"), + ("sem", [np.nan] * 10, "Float64", "Float64"), + ("sem", [np.nan] * 10, "Int64", "Float64"), + ("min", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"), + ("min", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Float64", "Float64"), + ("min", [0, -1, 3, 4, 5, -6, 7, 8, 9, np.nan], "Int64", "Int64"), + ( + "min", + [0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], + "timedelta64[ns]", + "timedelta64[ns]", + ), + ( + "min", + pd.to_datetime( + [ + "2019-05-09", + pd.NaT, + "2019-05-11", + "2019-05-12", + "2019-05-13", + "2019-05-14", + "2019-05-15", + "2019-05-16", + "2019-05-17", + "2019-05-18", + ] + ), + "datetime64[ns]", + "datetime64[ns]", + ), + ("min", [np.nan] * 10, "float64", "float64"), + ("min", [np.nan] * 10, "Float64", "Float64"), + ("min", [np.nan] * 10, "Int64", "Int64"), + ("max", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"), + ("max", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Float64", "Float64"), + ("max", [0, -1, 3, 4, 5, -6, 7, 8, 9, np.nan], 
"Int64", "Int64"), + ( + "max", + [0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], + "timedelta64[ns]", + "timedelta64[ns]", + ), + ( + "max", + pd.to_datetime( + [ + "2019-05-09", + pd.NaT, + "2019-05-11", + "2019-05-12", + "2019-05-13", + "2019-05-14", + "2019-05-15", + "2019-05-16", + "2019-05-17", + "2019-05-18", + ] + ), + "datetime64[ns]", + "datetime64[ns]", + ), + ("max", [np.nan] * 10, "float64", "float64"), + ("max", [np.nan] * 10, "Float64", "Float64"), + ("max", [np.nan] * 10, "Int64", "Int64"), + ("median", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"), + ("median", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Float64", "Float64"), + ("median", [0, -1, 3, 4, 5, -6, 7, 8, 9, np.nan], "Int64", "Float64"), + ( + "median", + [0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], + "timedelta64[ns]", + "timedelta64[ns]", + ), + ( + "median", + pd.to_datetime( + [ + "2019-05-09", + pd.NaT, + "2019-05-11", + "2019-05-12", + "2019-05-13", + "2019-05-14", + "2019-05-15", + "2019-05-16", + "2019-05-17", + "2019-05-18", + ] + ), + "datetime64[ns]", + "datetime64[ns]", + ), + ("median", [np.nan] * 10, "float64", "float64"), + ("median", [np.nan] * 10, "Float64", "Float64"), + ("median", [np.nan] * 10, "Int64", "Float64"), + ], +) +def test_multifunc_skipna(func, values, dtype, result_dtype, skipna): + # GH#15675 + df = DataFrame( + { + "val": values, + "cat": ["A", "B"] * 5, + } + ).astype({"val": dtype}) + # We need to recast the expected values to the result_dtype as some operations + # change the dtype + expected = ( + df.groupby("cat")["val"] + .apply(lambda x: getattr(x, func)(skipna=skipna)) + .astype(result_dtype) + ) + result = getattr(df.groupby("cat")["val"], func)(skipna=skipna) + tm.assert_series_equal(result, expected) + + def test_cython_median(): arr = np.random.default_rng(2).standard_normal(1000) arr[::2] = np.nan From e4f6270a7b9338c439a6352fca8029be26d8e211 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Feb 2025 23:15:34 +0530 Subject: [PATCH 55/67] DOC: fix ES01 for pandas.reset_option (#60834) --- pandas/_config/config.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 35139979f92fe..0d06e6fa8e96c 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -321,6 +321,11 @@ def reset_option(pat: str) -> None: """ Reset one or more options to their default value. + This method resets the specified pandas option(s) back to their default + values. It allows partial string matching for convenience, but users should + exercise caution to avoid unintended resets due to changes in option names + in future versions. + Parameters ---------- pat : str/regex From 2a49a4f218c3819e128cd1c8ea7fc9c1f2bdf92b Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Feb 2025 23:16:08 +0530 Subject: [PATCH 56/67] DOC: fix ES01 for pandas.core.resample.Resampler.indices (#60835) --- pandas/core/groupby/groupby.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7c3088bea4b76..549e76ebc15eb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -570,6 +570,13 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: """ Dict {group name -> group indices}. + The dictionary keys represent the group labels (e.g., timestamps for a + time-based resampling operation), and the values are arrays of integer + positions indicating where the elements of each group are located in the + original data. 
This property is particularly useful when working with + resampled data, as it provides insight into how the original time-series data + has been grouped. + See Also -------- core.groupby.DataFrameGroupBy.indices : Provides a mapping of group rows to From 569f94da9ecf0cd7c5eb565f5041b883726f6d3a Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Feb 2025 23:16:36 +0530 Subject: [PATCH 57/67] DOC: fix ES01 for pandas.DataFrame.columns (#60836) --- pandas/core/frame.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3669d8249dd27..d9f7623064e05 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -13673,6 +13673,10 @@ def isin_(x): doc=""" The column labels of the DataFrame. + This property holds the column names as a pandas ``Index`` object. + It provides an immutable sequence of column labels that can be + used for data selection, renaming, and alignment in DataFrame operations. + Returns ------- pandas.Index From 4f664f156badac017c3775242559953a4da50b40 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Feb 2025 23:17:10 +0530 Subject: [PATCH 58/67] DOC: fix ES01 for pandas.Series.array (#60837) --- pandas/core/base.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/base.py b/pandas/core/base.py index 61a7c079d87f8..a64cd8633c1db 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -506,6 +506,11 @@ def array(self) -> ExtensionArray: """ The ExtensionArray of the data backing this Series or Index. + This property provides direct access to the underlying array data of a + Series or Index without requiring conversion to a NumPy array. It + returns an ExtensionArray, which is the native storage format for + pandas extension dtypes. + Returns ------- ExtensionArray From 3bd27ffa296398c974c19571ccacd1eea76ca034 Mon Sep 17 00:00:00 2001 From: Florian Bourgey Date: Mon, 3 Feb 2025 12:51:31 -0500 Subject: [PATCH 59/67] DOC: Update parameter descriptions in `cut` function for clarity (#60839) --- pandas/core/reshape/tile.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index b3f946f289891..034b861a83f43 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -73,7 +73,7 @@ def cut( Parameters ---------- - x : array-like + x : 1d ndarray or Series The input array to be binned. Must be 1-dimensional. bins : int, sequence of scalars, or IntervalIndex The criteria to bin by. @@ -126,7 +126,7 @@ def cut( Categorical for all other inputs. The values stored within are whatever the type in the sequence is. - * False : returns an ndarray of integers. + * False : returns a 1d ndarray or Series of integers. bins : numpy.ndarray or IntervalIndex. The computed or specified bins. Only returned when `retbins=True`. 
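The reworded ``cut`` parameters draw a type distinction that is easy to miss; a quick sketch of how the input type steers the output type (sample values are arbitrary, behavior as documented in the hunk above):

    import numpy as np
    import pandas as pd

    arr = np.array([1, 7, 5, 4, 6, 3])

    # 1d ndarray in -> Categorical out; retbins=True also returns the bin edges.
    cats, bins = pd.cut(arr, bins=3, retbins=True)

    # labels=False swaps the Categorical for a 1d ndarray of integer bin codes.
    codes, _ = pd.cut(arr, bins=3, labels=False, retbins=True)

    # Series in -> Series out, with the original index preserved.
    binned = pd.cut(pd.Series(arr), bins=3)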
From c6fc6d0d7978f3958264fd372f56edf686614dac Mon Sep 17 00:00:00 2001
From: SebastianOuslis
Date: Mon, 3 Feb 2025 12:53:01 -0500
Subject: [PATCH 60/67] DOC: Closed parameter not intuitively documented in
 DataFrame.rolling (#60832)

* change docs

* format

* format

---
 pandas/core/groupby/groupby.py | 17 ++++++++++++-----
 pandas/core/window/rolling.py  | 17 ++++++++++++-----
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 549e76ebc15eb..9c27df4ed8c1b 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -3717,14 +3717,21 @@ def rolling(
         an integer index is not used to calculate the rolling window.

     closed : str, default None
-        If ``'right'``, the first point in the window is excluded from calculations.
+        Determines the inclusivity of points in the window.
+        If ``'right'``, (First, Last] the last point in the window
+        is included in the calculations.

-        If ``'left'``, the last point in the window is excluded from calculations.
+        If ``'left'``, [First, Last) the first point in the window
+        is included in the calculations.

-        If ``'both'``, no points in the window are excluded from calculations.
+        If ``'both'``, [First, Last] all points in the window
+        are included in the calculations.

-        If ``'neither'``, the first and last points in the window are excluded
-        from calculations.
+        If ``'neither'``, (First, Last) the first and last points
+        in the window are excluded from calculations.
+
+        () and [] reference open and closed interval
+        notation respectively.

         Default ``None`` (``'right'``).

diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
index 631ab15464942..b954ce2584c13 100644
--- a/pandas/core/window/rolling.py
+++ b/pandas/core/window/rolling.py
@@ -929,14 +929,21 @@ class Window(BaseWindow):
         an integer index is not used to calculate the rolling window.

     closed : str, default None
-        If ``'right'``, the first point in the window is excluded from calculations.
+        Determines the inclusivity of points in the window.
+        If ``'right'``, (First, Last] the last point in the window
+        is included in the calculations.

-        If ``'left'``, the last point in the window is excluded from calculations.
+        If ``'left'``, [First, Last) the first point in the window
+        is included in the calculations.

-        If ``'both'``, no point in the window is excluded from calculations.
+        If ``'both'``, [First, Last] all points in the window
+        are included in the calculations.

-        If ``'neither'``, the first and last points in the window are excluded
-        from calculations.
+        If ``'neither'``, (First, Last) the first and last points
+        in the window are excluded from calculations.
+
+        () and [] reference open and closed interval
+        notation respectively.

         Default ``None`` (``'right'``).

From e58bf26fa4d806f40624fb80d8321f2cc43d62a1 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 3 Feb 2025 10:08:43 -0800
Subject: [PATCH 61/67] CI: Update some CI configurations (#60762)

* CI: Update some CI configurations

* Freeze Python dev

* Add actions-313.yaml

* Add 3.13 yaml

* Move to pyside6 instead of pyqt

* Revert "Move to pyside6 instead of pyqt"

This reverts commit c04039fff983db3a94f42e7e16c79cd824672757.

* Revert "Add 3.13 yaml"

This reverts commit 0f888e1476da8f46cacaf6e63b4a5cfc2a1a8365.
* Revert "Freeze Python dev" This reverts commit c685af4d5871c2ce455d81f8bf212dc0e2e31aa9. * Move back to python 3.13 dev --- .github/workflows/unit-tests.yml | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 842629ba331d6..08c41a1eeb21f 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -107,7 +107,7 @@ jobs: services: mysql: - image: mysql:8 + image: mysql:9 env: MYSQL_ALLOW_EMPTY_PASSWORD: yes MYSQL_DATABASE: pandas @@ -120,7 +120,7 @@ jobs: - 3306:3306 postgres: - image: postgres:16 + image: postgres:17 env: PGUSER: postgres POSTGRES_USER: postgres @@ -135,7 +135,7 @@ jobs: - 5432:5432 moto: - image: motoserver/moto:5.0.0 + image: motoserver/moto:5.0.27 env: AWS_ACCESS_KEY_ID: foobar_key AWS_SECRET_ACCESS_KEY: foobar_secret @@ -242,15 +242,14 @@ jobs: - name: Build environment and Run Tests # https://github.com/numpy/numpy/issues/24703#issuecomment-1722379388 run: | - /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev + /opt/python/cp313-cp313/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install numpy -Csetup-args="-Dallow-noblas=true" python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror" python -m pip list --no-cache-dir - export PANDAS_CI=1 - python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml + PANDAS_CI=1 python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-32bit @@ -259,7 +258,7 @@ jobs: Linux-Musl: runs-on: ubuntu-22.04 container: - image: quay.io/pypa/musllinux_1_1_x86_64 + image: quay.io/pypa/musllinux_1_2_x86_64 steps: - name: Checkout pandas Repo # actions/checkout does not work since it requires node @@ -281,7 +280,7 @@ jobs: apk add musl-locales - name: Build environment run: | - /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev + /opt/python/cp313-cp313/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 @@ -291,8 +290,7 @@ jobs: - name: Run Tests run: | . 
~/virtualenvs/pandas-dev/bin/activate - export PANDAS_CI=1 - python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml + PANDAS_CI=1 python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-musl @@ -357,8 +355,7 @@ jobs: python --version python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy - python -m pip install versioneer[toml] - python -m pip install python-dateutil tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov + python -m pip install versioneer[toml] python-dateutil tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov python -m pip install -ve . --no-build-isolation --no-index --no-deps -Csetup-args="--werror" python -m pip list @@ -375,7 +372,7 @@ jobs: concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-python-freethreading-dev + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-python-freethreading-dev cancel-in-progress: true env: @@ -396,14 +393,11 @@ jobs: nogil: true - name: Build Environment - # TODO: Once numpy 2.2.1 is out, don't install nightly version - # Tests segfault with numpy 2.2.0: https://github.com/numpy/numpy/pull/27955 run: | python --version - python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 - python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython numpy - python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz tzdata hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov + python -m pip install --upgrade pip setuptools wheel numpy meson[ninja]==1.2.1 meson-python==0.13.1 + python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython + python -m pip install versioneer[toml] python-dateutil pytz tzdata hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov python -m pip install -ve . 
--no-build-isolation --no-index --no-deps -Csetup-args="--werror" python -m pip list From e84a7f7b521d52812b227d9dab038f138373866f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 3 Feb 2025 11:20:07 -0800 Subject: [PATCH 62/67] [pre-commit.ci] pre-commit autoupdate (#60840) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [pre-commit.ci] pre-commit autoupdate updates: - [github.com/astral-sh/ruff-pre-commit: v0.8.6 → v0.9.4](https://github.com/astral-sh/ruff-pre-commit/compare/v0.8.6...v0.9.4) - [github.com/codespell-project/codespell: v2.3.0 → v2.4.1](https://github.com/codespell-project/codespell/compare/v2.3.0...v2.4.1) - [github.com/PyCQA/isort: 5.13.2 → 6.0.0](https://github.com/PyCQA/isort/compare/5.13.2...6.0.0) - [github.com/pre-commit/mirrors-clang-format: v19.1.6 → v19.1.7](https://github.com/pre-commit/mirrors-clang-format/compare/v19.1.6...v19.1.7) - [github.com/trim21/pre-commit-mirror-meson: v1.6.1 → v1.7.0](https://github.com/trim21/pre-commit-mirror-meson/compare/v1.6.1...v1.7.0) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Address ruff/codespell failures * Run ruff again --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .pre-commit-config.yaml | 10 ++++---- asv_bench/benchmarks/io/style.py | 4 ++-- doc/make.py | 6 ++--- doc/source/user_guide/style.ipynb | 2 +- pandas/core/apply.py | 3 +-- pandas/core/arrays/base.py | 6 +++-- pandas/core/arrays/datetimes.py | 3 +-- pandas/core/computation/eval.py | 2 +- pandas/core/computation/expr.py | 2 +- pandas/core/computation/ops.py | 3 +-- pandas/core/dtypes/cast.py | 2 +- pandas/core/dtypes/dtypes.py | 3 +-- pandas/core/generic.py | 3 +-- pandas/core/groupby/groupby.py | 3 +-- pandas/core/groupby/grouper.py | 3 +-- pandas/core/indexers/objects.py | 6 ++--- pandas/core/indexing.py | 12 ++++------ pandas/core/interchange/buffer.py | 3 +-- pandas/core/internals/blocks.py | 3 +-- pandas/core/internals/construction.py | 3 +-- pandas/core/ops/array_ops.py | 2 +- pandas/core/reshape/encoding.py | 3 +-- pandas/core/reshape/merge.py | 6 ++--- pandas/core/tools/datetimes.py | 6 ++--- pandas/io/excel/_odswriter.py | 2 +- pandas/io/formats/printing.py | 4 ++-- pandas/io/formats/style.py | 12 ++++++---- pandas/io/formats/style_render.py | 11 ++++----- pandas/io/formats/xml.py | 6 ++--- pandas/io/json/_json.py | 2 +- pandas/io/parsers/base_parser.py | 3 +-- pandas/io/parsers/python_parser.py | 6 ++--- pandas/io/parsers/readers.py | 6 ++--- pandas/io/sas/sas_xport.py | 9 +++----- pandas/plotting/_core.py | 23 ++++++++++++------- pandas/plotting/_matplotlib/boxplot.py | 3 +-- pandas/tests/arrays/interval/test_formats.py | 4 +--- pandas/tests/dtypes/cast/test_downcast.py | 4 ++-- pandas/tests/dtypes/test_dtypes.py | 3 +-- pandas/tests/dtypes/test_missing.py | 2 +- pandas/tests/extension/base/getitem.py | 4 ++-- pandas/tests/extension/json/array.py | 3 +-- pandas/tests/extension/list/array.py | 3 +-- pandas/tests/extension/test_arrow.py | 3 +-- pandas/tests/frame/methods/test_info.py | 2 +- pandas/tests/frame/methods/test_sample.py | 3 +-- pandas/tests/frame/methods/test_set_axis.py | 2 +- pandas/tests/groupby/test_categorical.py | 2 +- pandas/tests/groupby/test_raises.py | 5 +--- .../indexes/categorical/test_indexing.py | 6 ++--- 
.../indexes/datetimes/methods/test_round.py | 6 ++--- .../tests/indexes/datetimes/test_formats.py | 13 ++--------- .../tests/indexes/datetimes/test_indexing.py | 6 ++--- .../indexes/interval/test_constructors.py | 9 +++----- pandas/tests/indexes/interval/test_formats.py | 7 +----- pandas/tests/indexes/multi/test_indexing.py | 3 +-- pandas/tests/indexes/numeric/test_indexing.py | 3 +-- pandas/tests/indexes/period/test_formats.py | 3 +-- pandas/tests/indexes/period/test_indexing.py | 3 +-- pandas/tests/indexes/test_base.py | 3 +-- pandas/tests/indexes/test_index_new.py | 3 +-- .../tests/indexes/timedeltas/test_indexing.py | 3 +-- pandas/tests/indexing/test_iloc.py | 3 +-- pandas/tests/io/excel/test_readers.py | 3 +-- pandas/tests/io/excel/test_style.py | 6 ++--- pandas/tests/io/formats/style/test_style.py | 2 +- pandas/tests/io/formats/test_css.py | 3 +-- pandas/tests/io/formats/test_to_csv.py | 5 +--- pandas/tests/io/formats/test_to_html.py | 3 +-- pandas/tests/io/formats/test_to_markdown.py | 6 ++--- pandas/tests/io/formats/test_to_string.py | 18 +++------------ pandas/tests/io/json/test_pandas.py | 12 +++------- pandas/tests/io/json/test_readlines.py | 9 ++++---- pandas/tests/io/json/test_ujson.py | 4 ++-- .../io/parser/common/test_read_errors.py | 3 +-- pandas/tests/io/parser/test_mangle_dupes.py | 2 +- pandas/tests/io/parser/test_parse_dates.py | 4 ++-- pandas/tests/io/xml/test_xml.py | 3 +-- pandas/tests/plotting/test_series.py | 2 +- pandas/tests/resample/test_time_grouper.py | 2 +- .../tests/reshape/merge/test_merge_cross.py | 6 ++--- .../scalar/timedelta/test_constructors.py | 3 +-- pandas/tests/series/methods/test_between.py | 3 +-- pandas/tests/tools/test_to_datetime.py | 4 ++-- pandas/tests/tools/test_to_numeric.py | 6 ++--- pandas/tests/tseries/offsets/test_offsets.py | 6 ++--- pandas/tests/tseries/offsets/test_ticks.py | 3 +-- pandas/tests/tslibs/test_parsing.py | 5 +--- pandas/tseries/frequencies.py | 3 +-- pyproject.toml | 2 +- 90 files changed, 165 insertions(+), 260 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1dd8dfc54111e..77bcadf57dd2d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ ci: skip: [pyright, mypy] repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.6 + rev: v0.9.4 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -41,7 +41,7 @@ repos: pass_filenames: true require_serial: false - repo: https://github.com/codespell-project/codespell - rev: v2.3.0 + rev: v2.4.1 hooks: - id: codespell types_or: [python, rst, markdown, cython, c] @@ -70,7 +70,7 @@ repos: - id: trailing-whitespace args: [--markdown-linebreak-ext=md] - repo: https://github.com/PyCQA/isort - rev: 5.13.2 + rev: 6.0.0 hooks: - id: isort - repo: https://github.com/asottile/pyupgrade @@ -95,14 +95,14 @@ repos: - id: sphinx-lint args: ["--enable", "all", "--disable", "line-too-long"] - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v19.1.6 + rev: v19.1.7 hooks: - id: clang-format files: ^pandas/_libs/src|^pandas/_libs/include args: [-i] types_or: [c, c++] - repo: https://github.com/trim21/pre-commit-mirror-meson - rev: v1.6.1 + rev: v1.7.0 hooks: - id: meson-fmt args: ['--inplace'] diff --git a/asv_bench/benchmarks/io/style.py b/asv_bench/benchmarks/io/style.py index 24fd8a0d20aba..0486cabb29845 100644 --- a/asv_bench/benchmarks/io/style.py +++ b/asv_bench/benchmarks/io/style.py @@ -13,8 +13,8 @@ class Render: def setup(self, cols, rows): self.df = DataFrame( np.random.randn(rows, cols), - 
columns=[f"float_{i+1}" for i in range(cols)], - index=[f"row_{i+1}" for i in range(rows)], + columns=[f"float_{i + 1}" for i in range(cols)], + index=[f"row_{i + 1}" for i in range(rows)], ) def time_apply_render(self, cols, rows): diff --git a/doc/make.py b/doc/make.py index 02deb5002fea1..9542563dc037b 100755 --- a/doc/make.py +++ b/doc/make.py @@ -260,8 +260,7 @@ def latex(self, force=False): for i in range(3): self._run_os("pdflatex", "-interaction=nonstopmode", "pandas.tex") raise SystemExit( - "You should check the file " - '"build/latex/pandas.pdf" for problems.' + 'You should check the file "build/latex/pandas.pdf" for problems.' ) self._run_os("make") return ret_code @@ -343,8 +342,7 @@ def main(): dest="verbosity", default=0, help=( - "increase verbosity (can be repeated), " - "passed to the sphinx build command" + "increase verbosity (can be repeated), passed to the sphinx build command" ), ) argparser.add_argument( diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index abb7181fc8d72..9cda1486eb48b 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -1288,7 +1288,7 @@ "outputs": [], "source": [ "df2.loc[:4].style.highlight_max(\n", - " axis=1, props=(\"color:white; \" \"font-weight:bold; \" \"background-color:darkblue;\")\n", + " axis=1, props=(\"color:white; font-weight:bold; background-color:darkblue;\")\n", ")" ] }, diff --git a/pandas/core/apply.py b/pandas/core/apply.py index af513d49bcfe0..f36fc82fb1a11 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1645,8 +1645,7 @@ def reconstruct_func( # GH 28426 will raise error if duplicated function names are used and # there is no reassigned name raise SpecificationError( - "Function names must be unique if there is no new column names " - "assigned" + "Function names must be unique if there is no new column names assigned" ) if func is None: # nicer error message diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index e831883998098..33745438e2aea 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1791,9 +1791,11 @@ def take(self, indices, allow_fill=False, fill_value=None): # type for the array, to the physical storage type for # the data, before passing to take. - result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill) + result = take( + data, indices, fill_value=fill_value, allow_fill=allow_fill + ) return self._from_sequence(result, dtype=self.dtype) - """ # noqa: E501 + """ # Implementer note: The `fill_value` parameter should be a user-facing # value, an instance of self.dtype.type. When passed `fill_value=None`, # the default of `self.dtype.na_value` should be used. 
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 43cc492f82885..df40c9c11b117 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2707,8 +2707,7 @@ def _maybe_infer_tz(tz: tzinfo | None, inferred_tz: tzinfo | None) -> tzinfo | N pass elif not timezones.tz_compare(tz, inferred_tz): raise TypeError( - f"data is already tz-aware {inferred_tz}, unable to " - f"set specified tz: {tz}" + f"data is already tz-aware {inferred_tz}, unable to set specified tz: {tz}" ) return tz diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 9d844e590582a..f8e3200ef2ba0 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -204,7 +204,7 @@ def eval( By default, with the numexpr engine, the following operations are supported: - - Arthimetic operations: ``+``, ``-``, ``*``, ``/``, ``**``, ``%`` + - Arithmetic operations: ``+``, ``-``, ``*``, ``/``, ``**``, ``%`` - Boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not) - Comparison operators: ``<``, ``<=``, ``==``, ``!=``, ``>=``, ``>`` diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 010fad1bbf0b6..14a393b02409c 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -698,7 +698,7 @@ def visit_Call(self, node, side=None, **kwargs): if not isinstance(key, ast.keyword): # error: "expr" has no attribute "id" raise ValueError( - "keyword error in function call " f"'{node.func.id}'" # type: ignore[attr-defined] + f"keyword error in function call '{node.func.id}'" # type: ignore[attr-defined] ) if key.arg: diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index 9b26de42e119b..f06ded6d9f98e 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -512,8 +512,7 @@ def __init__(self, op: Literal["+", "-", "~", "not"], operand) -> None: self.func = _unary_ops_dict[op] except KeyError as err: raise ValueError( - f"Invalid unary operator {op!r}, " - f"valid operators are {UNARY_OPS_SYMS}" + f"Invalid unary operator {op!r}, valid operators are {UNARY_OPS_SYMS}" ) from err def __call__(self, env) -> MathCall: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 02b9291da9b31..94531c2ac87e8 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1651,7 +1651,7 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n # (test_constructor_coercion_signed_to_unsigned) so safe to ignore. 
warnings.filterwarnings( "ignore", - "NumPy will stop allowing conversion of " "out-of-bound Python int", + "NumPy will stop allowing conversion of out-of-bound Python int", DeprecationWarning, ) casted = np.asarray(arr, dtype=dtype) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 1eb1a630056a2..d8dd6441913b5 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -605,8 +605,7 @@ def update_dtype(self, dtype: str_type | CategoricalDtype) -> CategoricalDtype: return self elif not self.is_dtype(dtype): raise ValueError( - f"a CategoricalDtype must be passed to perform an update, " - f"got {dtype!r}" + f"a CategoricalDtype must be passed to perform an update, got {dtype!r}" ) else: # from here on, dtype is a CategoricalDtype diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e0a4f9d9c546a..f376518d4d3b8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5537,8 +5537,7 @@ def filter( nkw = common.count_not_none(items, like, regex) if nkw > 1: raise TypeError( - "Keyword arguments `items`, `like`, or `regex` " - "are mutually exclusive" + "Keyword arguments `items`, `like`, or `regex` are mutually exclusive" ) if axis is None: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9c27df4ed8c1b..fdf2aab434695 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2632,8 +2632,7 @@ def _value_counts( doesnt_exist = subsetted - unique_cols if doesnt_exist: raise ValueError( - f"Keys {doesnt_exist} in subset do not " - f"exist in the DataFrame." + f"Keys {doesnt_exist} in subset do not exist in the DataFrame." ) else: subsetted = unique_cols diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 5f9ebdcea4a2d..c9d874fc08dbe 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -516,8 +516,7 @@ def __init__( ): grper = pprint_thing(grouping_vector) errmsg = ( - "Grouper result violates len(labels) == " - f"len(data)\nresult: {grper}" + f"Grouper result violates len(labels) == len(data)\nresult: {grper}" ) raise AssertionError(errmsg) diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index 0064aa91056e8..88379164534f2 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -478,9 +478,9 @@ def get_window_bounds( ) start = start.astype(np.int64) end = end.astype(np.int64) - assert len(start) == len( - end - ), "these should be equal in length from get_window_bounds" + assert len(start) == len(end), ( + "these should be equal in length from get_window_bounds" + ) # Cannot use groupby_indices as they might not be monotonic with the object # we're rolling over window_indices = np.arange( diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 656ee54cbc5d4..8a493fef54d3b 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -975,8 +975,7 @@ def _validate_tuple_indexer(self, key: tuple) -> tuple: self._validate_key(k, i) except ValueError as err: raise ValueError( - "Location based indexing can only have " - f"[{self._valid_types}] types" + f"Location based indexing can only have [{self._valid_types}] types" ) from err return key @@ -1589,8 +1588,7 @@ def _validate_key(self, key, axis: AxisInt) -> None: "is not available" ) raise ValueError( - "iLocation based boolean indexing cannot use " - "an indexable as a mask" + "iLocation based boolean indexing cannot use an indexable as a mask" ) return @@ -1994,8 +1992,7 @@ 
def _setitem_with_indexer_split_path(self, indexer, value, name: str): return self._setitem_with_indexer((pi, info_axis[0]), value[0]) raise ValueError( - "Must have equal len keys and value " - "when setting with an iterable" + "Must have equal len keys and value when setting with an iterable" ) elif lplane_indexer == 0 and len(value) == len(self.obj.index): @@ -2023,8 +2020,7 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): else: raise ValueError( - "Must have equal len keys and value " - "when setting with an iterable" + "Must have equal len keys and value when setting with an iterable" ) else: diff --git a/pandas/core/interchange/buffer.py b/pandas/core/interchange/buffer.py index 62bf396256f2a..8953360a91c8e 100644 --- a/pandas/core/interchange/buffer.py +++ b/pandas/core/interchange/buffer.py @@ -31,8 +31,7 @@ def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None: x = x.copy() else: raise RuntimeError( - "Exports cannot be zero-copy in the case " - "of a non-contiguous buffer" + "Exports cannot be zero-copy in the case of a non-contiguous buffer" ) # Store the numpy array in which the data resides as a private diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f44ad926dda5c..d1a9081b234de 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2264,8 +2264,7 @@ def check_ndim(values, placement: BlockPlacement, ndim: int) -> None: if values.ndim > ndim: # Check for both np.ndarray and ExtensionArray raise ValueError( - "Wrong number of dimensions. " - f"values.ndim > ndim [{values.ndim} > {ndim}]" + f"Wrong number of dimensions. values.ndim > ndim [{values.ndim} > {ndim}]" ) if not is_1d_only_ea_dtype(values.dtype): diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index dfff34656f82b..69da2be0306f6 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -907,8 +907,7 @@ def _validate_or_indexify_columns( if not is_mi_list and len(columns) != len(content): # pragma: no cover # caller's responsibility to check for this... 
raise AssertionError( - f"{len(columns)} columns passed, passed data had " - f"{len(content)} columns" + f"{len(columns)} columns passed, passed data had {len(content)} columns" ) if is_mi_list: # check if nested list column, length of each sub-list should be equal diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 983a3df57e369..3a466b6fc7fc8 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -164,7 +164,7 @@ def _masked_arith_op(x: np.ndarray, y, op) -> np.ndarray: else: if not is_scalar(y): raise TypeError( - f"Cannot broadcast np.ndarray with operand of type { type(y) }" + f"Cannot broadcast np.ndarray with operand of type {type(y)}" ) # mask is only meaningful for x diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 33ff182f5baee..6a590ee5b227e 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -495,8 +495,7 @@ def from_dummies( if col_isna_mask.any(): raise ValueError( - "Dummy DataFrame contains NA value in column: " - f"'{col_isna_mask.idxmax()}'" + f"Dummy DataFrame contains NA value in column: '{col_isna_mask.idxmax()}'" ) # index data with a list of all columns that are dummies diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 5fddd9f9aca5b..ab056c8cc7e37 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1929,9 +1929,9 @@ def get_join_indexers( np.ndarray[np.intp] or None Indexer into the right_keys. """ - assert len(left_keys) == len( - right_keys - ), "left_keys and right_keys must be the same length" + assert len(left_keys) == len(right_keys), ( + "left_keys and right_keys must be the same length" + ) # fast-path for empty left/right left_n = len(left_keys[0]) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 30487de7bafd5..0a10001a3113f 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -192,9 +192,9 @@ def should_cache( else: check_count = 500 else: - assert ( - 0 <= check_count <= len(arg) - ), "check_count must be in next bounds: [0; len(arg)]" + assert 0 <= check_count <= len(arg), ( + "check_count must be in next bounds: [0; len(arg)]" + ) if check_count == 0: return False diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index 10a06aec72a57..ba4919c9298ed 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -270,7 +270,7 @@ def _process_style(self, style: dict[str, Any] | None) -> str | None: style_key = json.dumps(style) if style_key in self._style_dict: return self._style_dict[style_key] - name = f"pd{len(self._style_dict)+1}" + name = f"pd{len(self._style_dict) + 1}" self._style_dict[style_key] = name odf_style = Style(name=name, family="table-cell") if "font" in style: diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index a9936ba8c8f2c..b466e986450b1 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -336,8 +336,8 @@ def format_object_summary( if indent_for_name: name_len = len(name) - space1 = f'\n{(" " * (name_len + 1))}' - space2 = f'\n{(" " * (name_len + 2))}' + space1 = f"\n{(' ' * (name_len + 1))}" + space2 = f"\n{(' ' * (name_len + 2))}" else: space1 = "\n" space2 = "\n " # space for the opening '[' diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 3f37556867954..b4c55da3eddd6 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -2588,7 +2588,7 @@ def 
set_sticky( for i, level in enumerate(levels_): styles.append( { - "selector": f"thead tr:nth-child({level+1}) th", + "selector": f"thead tr:nth-child({level + 1}) th", "props": props + ( f"top:{i * pixel_size}px; height:{pixel_size}px; " @@ -2599,7 +2599,7 @@ def set_sticky( if not all(name is None for name in self.index.names): styles.append( { - "selector": f"thead tr:nth-child({obj.nlevels+1}) th", + "selector": f"thead tr:nth-child({obj.nlevels + 1}) th", "props": props + ( f"top:{(len(levels_)) * pixel_size}px; " @@ -2619,7 +2619,7 @@ def set_sticky( styles.extend( [ { - "selector": f"thead tr th:nth-child({level+1})", + "selector": f"thead tr th:nth-child({level + 1})", "props": props_ + "z-index:3 !important;", }, { @@ -4214,8 +4214,10 @@ def css_bar(start: float, end: float, color: str) -> str: if end > start: cell_css += "background: linear-gradient(90deg," if start > 0: - cell_css += f" transparent {start*100:.1f}%, {color} {start*100:.1f}%," - cell_css += f" {color} {end*100:.1f}%, transparent {end*100:.1f}%)" + cell_css += ( + f" transparent {start * 100:.1f}%, {color} {start * 100:.1f}%," + ) + cell_css += f" {color} {end * 100:.1f}%, transparent {end * 100:.1f}%)" return cell_css def css_calc(x, left: float, right: float, align: str, color: str | list | tuple): diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index 2d1218b007d19..482ed316c7ce4 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -850,10 +850,7 @@ def _generate_body_row( data_element = _element( "td", - ( - f"{self.css['data']} {self.css['row']}{r} " - f"{self.css['col']}{c}{cls}" - ), + (f"{self.css['data']} {self.css['row']}{r} {self.css['col']}{c}{cls}"), value, data_element_visible, attributes="", @@ -973,7 +970,7 @@ def concatenated_visible_rows(obj): idx_len = d["index_lengths"].get((lvl, r), None) if idx_len is not None: # i.e. not a sparsified entry d["clines"][rn + idx_len].append( - f"\\cline{{{lvln+1}-{len(visible_index_levels)+data_len}}}" + f"\\cline{{{lvln + 1}-{len(visible_index_levels) + data_len}}}" # noqa: E501 ) def format( @@ -1557,7 +1554,7 @@ def relabel_index( >>> df = pd.DataFrame({"samples": np.random.rand(10)}) >>> styler = df.loc[np.random.randint(0, 10, 3)].style - >>> styler.relabel_index([f"sample{i+1} ({{}})" for i in range(3)]) + >>> styler.relabel_index([f"sample{i + 1} ({{}})" for i in range(3)]) ... 
# doctest: +SKIP samples sample1 (5) 0.315811 @@ -2520,7 +2517,7 @@ def color(value, user_arg, command, comm_arg): if value[0] == "#" and len(value) == 7: # color is hex code return command, f"[HTML]{{{value[1:].upper()}}}{arg}" if value[0] == "#" and len(value) == 4: # color is short hex code - val = f"{value[1].upper()*2}{value[2].upper()*2}{value[3].upper()*2}" + val = f"{value[1].upper() * 2}{value[2].upper() * 2}{value[3].upper() * 2}" return command, f"[HTML]{{{val}}}{arg}" elif value[:3] == "rgb": # color is rgb or rgba r = re.findall("(?<=\\()[0-9\\s%]+(?=,)", value)[0].strip() diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index 47f162e93216d..febf43b9a1018 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -260,7 +260,7 @@ def _other_namespaces(self) -> dict: nmsp_dict: dict[str, str] = {} if self.namespaces: nmsp_dict = { - f"xmlns{p if p=='' else f':{p}'}": n + f"xmlns{p if p == '' else f':{p}'}": n for p, n in self.namespaces.items() if n != self.prefix_uri[1:-1] } @@ -404,7 +404,7 @@ def _get_prefix_uri(self) -> str: f"{self.prefix} is not included in namespaces" ) from err elif "" in self.namespaces: - uri = f'{{{self.namespaces[""]}}}' + uri = f"{{{self.namespaces['']}}}" else: uri = "" @@ -502,7 +502,7 @@ def _get_prefix_uri(self) -> str: f"{self.prefix} is not included in namespaces" ) from err elif "" in self.namespaces: - uri = f'{{{self.namespaces[""]}}}' + uri = f"{{{self.namespaces['']}}}" else: uri = "" diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 237518b3c8d92..703a2b3656c9c 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -917,7 +917,7 @@ def _combine_lines(self, lines) -> str: Combines a list of JSON objects into one JSON object. """ return ( - f'[{",".join([line for line in (line.strip() for line in lines) if line])}]' + f"[{','.join([line for line in (line.strip() for line in lines) if line])}]" ) @overload diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index e263c69376d05..c283f600eb971 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -112,8 +112,7 @@ def __init__(self, kwds) -> None: parse_dates = bool(parse_dates) elif not isinstance(parse_dates, list): raise TypeError( - "Only booleans and lists are accepted " - "for the 'parse_dates' parameter" + "Only booleans and lists are accepted for the 'parse_dates' parameter" ) self.parse_dates: bool | list = parse_dates self.date_parser = kwds.pop("date_parser", lib.no_default) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index db9547a18b600..e7b5c7f06a79a 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -595,8 +595,7 @@ def _infer_columns( joi = list(map(str, header[:-1] if have_mi_columns else header)) msg = f"[{','.join(joi)}], len of {len(joi)}, " raise ValueError( - f"Passed header={msg}" - f"but only {self.line_pos} lines in file" + f"Passed header={msg}but only {self.line_pos} lines in file" ) from err # We have an empty file, so check @@ -1219,8 +1218,7 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]: for row_num, actual_len in bad_lines: msg = ( - f"Expected {col_len} fields in line {row_num + 1}, saw " - f"{actual_len}" + f"Expected {col_len} fields in line {row_num + 1}, saw {actual_len}" ) if ( self.delimiter diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 54877017f76fc..67193f930b4dc 100644 --- 
a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1219,8 +1219,7 @@ def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]: and value != getattr(value, "value", default) ): raise ValueError( - f"The {argname!r} option is not supported with the " - f"'pyarrow' engine" + f"The {argname!r} option is not supported with the 'pyarrow' engine" ) options[argname] = value @@ -1396,8 +1395,7 @@ def _clean_options( if not is_integer(skiprows) and skiprows is not None: # pyarrow expects skiprows to be passed as an integer raise ValueError( - "skiprows argument must be an integer when using " - "engine='pyarrow'" + "skiprows argument must be an integer when using engine='pyarrow'" ) else: if is_integer(skiprows): diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index 89dbdab64c23c..a9c45e720fd56 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -33,19 +33,16 @@ ReadBuffer, ) _correct_line1 = ( - "HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!" - "000000000000000000000000000000 " + "HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!000000000000000000000000000000 " ) _correct_header1 = ( "HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!000000000000000001600000000" ) _correct_header2 = ( - "HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!" - "000000000000000000000000000000 " + "HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!000000000000000000000000000000 " ) _correct_obs_header = ( - "HEADER RECORD*******OBS HEADER RECORD!!!!!!!" - "000000000000000000000000000000 " + "HEADER RECORD*******OBS HEADER RECORD!!!!!!!000000000000000000000000000000 " ) _fieldkeys = [ "ntype", diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index aee872f9ae50a..9670b5439c87e 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -247,11 +247,14 @@ def hist_frame( .. plot:: :context: close-figs - >>> data = {"length": [1.5, 0.5, 1.2, 0.9, 3], "width": [0.7, 0.2, 0.15, 0.2, 1.1]} + >>> data = { + ... "length": [1.5, 0.5, 1.2, 0.9, 3], + ... "width": [0.7, 0.2, 0.15, 0.2, 1.1], + ... } >>> index = ["pig", "rabbit", "duck", "chicken", "horse"] >>> df = pd.DataFrame(data, index=index) >>> hist = df.hist(bins=3) - """ # noqa: E501 + """ plot_backend = _get_plot_backend(backend) return plot_backend.hist_frame( data, @@ -845,7 +848,10 @@ class PlotAccessor(PandasObject): :context: close-figs >>> df = pd.DataFrame( - ... {"length": [1.5, 0.5, 1.2, 0.9, 3], "width": [0.7, 0.2, 0.15, 0.2, 1.1]}, + ... { + ... "length": [1.5, 0.5, 1.2, 0.9, 3], + ... "width": [0.7, 0.2, 0.15, 0.2, 1.1], + ... }, ... index=["pig", "rabbit", "duck", "chicken", "horse"], ... ) >>> plot = df.plot(title="DataFrame Plot") @@ -866,7 +872,7 @@ class PlotAccessor(PandasObject): >>> df = pd.DataFrame({"col1": [1, 2, 3, 4], "col2": ["A", "B", "A", "B"]}) >>> plot = df.groupby("col2").plot(kind="bar", title="DataFrameGroupBy Plot") - """ # noqa: E501 + """ _common_kinds = ("line", "bar", "barh", "kde", "density", "area", "hist", "box") _series_kinds = ("pie",) @@ -993,8 +999,7 @@ def __call__(self, *args, **kwargs): if kind not in self._all_kinds: raise ValueError( - f"{kind} is not a valid plot kind " - f"Valid plot kinds: {self._all_kinds}" + f"{kind} is not a valid plot kind Valid plot kinds: {self._all_kinds}" ) data = self._parent @@ -1630,7 +1635,9 @@ def area( ... "signups": [5, 5, 6, 12, 14, 13], ... "visits": [20, 42, 28, 62, 81, 50], ... }, - ... index=pd.date_range(start="2018/01/01", end="2018/07/01", freq="ME"), + ... 
index=pd.date_range( + ... start="2018/01/01", end="2018/07/01", freq="ME" + ... ), ... ) >>> ax = df.plot.area() @@ -1662,7 +1669,7 @@ def area( ... } ... ) >>> ax = df.plot.area(x="day") - """ # noqa: E501 + """ return self(kind="area", x=x, y=y, stacked=stacked, **kwargs) def pie(self, y: IndexLabel | None = None, **kwargs) -> PlotAccessor: diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 5ad30a68ae3c9..af77972da8634 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -123,8 +123,7 @@ def _validate_color_args(self, color, colormap): if colormap is not None: warnings.warn( - "'color' and 'colormap' cannot be used " - "simultaneously. Using 'color'", + "'color' and 'colormap' cannot be used simultaneously. Using 'color'", stacklevel=find_stack_level(), ) diff --git a/pandas/tests/arrays/interval/test_formats.py b/pandas/tests/arrays/interval/test_formats.py index 535efee519374..88c9bf81d718c 100644 --- a/pandas/tests/arrays/interval/test_formats.py +++ b/pandas/tests/arrays/interval/test_formats.py @@ -6,8 +6,6 @@ def test_repr(): arr = IntervalArray.from_tuples([(0, 1), (1, 2)]) result = repr(arr) expected = ( - "\n" - "[(0, 1], (1, 2]]\n" - "Length: 2, dtype: interval[int64, right]" + "\n[(0, 1], (1, 2]]\nLength: 2, dtype: interval[int64, right]" ) assert result == expected diff --git a/pandas/tests/dtypes/cast/test_downcast.py b/pandas/tests/dtypes/cast/test_downcast.py index 9430ba2c478ae..69200b2e5fc96 100644 --- a/pandas/tests/dtypes/cast/test_downcast.py +++ b/pandas/tests/dtypes/cast/test_downcast.py @@ -33,9 +33,9 @@ ( # This is a judgement call, but we do _not_ downcast Decimal # objects - np.array([decimal.Decimal(0.0)]), + np.array([decimal.Decimal("0.0")]), "int64", - np.array([decimal.Decimal(0.0)]), + np.array([decimal.Decimal("0.0")]), ), ( # GH#45837 diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index b7e37ff270e60..621217a8c9317 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -660,8 +660,7 @@ def test_construction_generic(self, subtype): def test_construction_not_supported(self, subtype): # GH 19016 msg = ( - "category, object, and string subtypes are not supported " - "for IntervalDtype" + "category, object, and string subtypes are not supported for IntervalDtype" ) with pytest.raises(TypeError, match=msg): IntervalDtype(subtype) diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 73c462d492d2d..c61cda83cf6e0 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -321,7 +321,7 @@ def test_period(self): def test_decimal(self): # scalars GH#23530 - a = Decimal(1.0) + a = Decimal("1.0") assert isna(a) is False assert notna(a) is True diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 27fa1206f6f7f..1f3680bf67e90 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -139,8 +139,8 @@ def test_getitem_invalid(self, data): "index out of bounds", # pyarrow "Out of bounds access", # Sparse f"loc must be an integer between -{ub} and {ub}", # Sparse - f"index {ub+1} is out of bounds for axis 0 with size {ub}", - f"index -{ub+1} is out of bounds for axis 0 with size {ub}", + f"index {ub + 1} is out of bounds for axis 0 with size {ub}", + f"index -{ub + 1} is out of bounds for axis 0 with size {ub}", ] ) with pytest.raises(IndexError, 
match=msg): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index a68c8a06e1d18..b110911bda400 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -176,8 +176,7 @@ def take(self, indexer, allow_fill=False, fill_value=None): # an ndarary. indexer = np.asarray(indexer) msg = ( - "Index is out of bounds or cannot do a " - "non-empty take from an empty array." + "Index is out of bounds or cannot do a non-empty take from an empty array." ) if allow_fill: diff --git a/pandas/tests/extension/list/array.py b/pandas/tests/extension/list/array.py index da53bdcb4e37e..8b4728c7d6292 100644 --- a/pandas/tests/extension/list/array.py +++ b/pandas/tests/extension/list/array.py @@ -81,8 +81,7 @@ def take(self, indexer, allow_fill=False, fill_value=None): # an ndarary. indexer = np.asarray(indexer) msg = ( - "Index is out of bounds or cannot do a " - "non-empty take from an empty array." + "Index is out of bounds or cannot do a non-empty take from an empty array." ) if allow_fill: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 4fccf02e08bd6..d6f428f4938a6 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -964,8 +964,7 @@ def _get_arith_xfail_marker(self, opname, pa_dtype): mark = pytest.mark.xfail( raises=TypeError, reason=( - f"{opname} not supported between" - f"pd.NA and {pa_dtype} Python scalar" + f"{opname} not supported betweenpd.NA and {pa_dtype} Python scalar" ), ) elif opname == "__rfloordiv__" and ( diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index 74e4383950174..462d86cadde88 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -522,7 +522,7 @@ def test_info_int_columns(using_infer_string): 0 1 2 non-null int64 1 2 2 non-null int64 dtypes: int64(2) - memory usage: {'50.0' if using_infer_string and HAS_PYARROW else '48.0+'} bytes + memory usage: {"50.0" if using_infer_string and HAS_PYARROW else "48.0+"} bytes """ ) assert result == expected diff --git a/pandas/tests/frame/methods/test_sample.py b/pandas/tests/frame/methods/test_sample.py index 91d735a8b2fa7..a9d56cbfd2b46 100644 --- a/pandas/tests/frame/methods/test_sample.py +++ b/pandas/tests/frame/methods/test_sample.py @@ -198,8 +198,7 @@ def test_sample_upsampling_without_replacement(self, frame_or_series): obj = tm.get_obj(obj, frame_or_series) msg = ( - "Replace has to be set to `True` when " - "upsampling the population `frac` > 1." + "Replace has to be set to `True` when upsampling the population `frac` > 1." 
) with pytest.raises(ValueError, match=msg): obj.sample(frac=2, replace=False) diff --git a/pandas/tests/frame/methods/test_set_axis.py b/pandas/tests/frame/methods/test_set_axis.py index 1967941bca9f0..7b75bcf4f348d 100644 --- a/pandas/tests/frame/methods/test_set_axis.py +++ b/pandas/tests/frame/methods/test_set_axis.py @@ -93,7 +93,7 @@ def test_set_axis_setattr_index_wrong_length(self, obj): # wrong length msg = ( f"Length mismatch: Expected axis has {len(obj)} elements, " - f"new values have {len(obj)-1} elements" + f"new values have {len(obj) - 1} elements" ) with pytest.raises(ValueError, match=msg): obj.index = np.arange(len(obj) - 1) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 20309e852a556..e49be8c00b426 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -990,7 +990,7 @@ def test_sort(): # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar') df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 10)}) - labels = [f"{i} - {i+499}" for i in range(0, 10000, 500)] + labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)] cat_labels = Categorical(labels, labels) df = df.sort_values(by=["value"], ascending=True) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index ba13d3bd7278f..864b9e5d55991 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -263,10 +263,7 @@ def test_groupby_raises_string_np( if using_infer_string: if groupby_func_np is np.mean: klass = TypeError - msg = ( - f"Cannot perform reduction '{groupby_func_np.__name__}' " - "with string dtype" - ) + msg = f"Cannot perform reduction '{groupby_func_np.__name__}' with string dtype" _call_and_check(klass, msg, how, gb, groupby_func_np, ()) diff --git a/pandas/tests/indexes/categorical/test_indexing.py b/pandas/tests/indexes/categorical/test_indexing.py index 49eb79da616e7..25232075a07d9 100644 --- a/pandas/tests/indexes/categorical/test_indexing.py +++ b/pandas/tests/indexes/categorical/test_indexing.py @@ -64,8 +64,7 @@ def test_take_fill_value(self): tm.assert_categorical_equal(result.values, expected.values) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) @@ -103,8 +102,7 @@ def test_take_fill_value_datetime(self): tm.assert_index_equal(result, expected) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) diff --git a/pandas/tests/indexes/datetimes/methods/test_round.py b/pandas/tests/indexes/datetimes/methods/test_round.py index cde4a3a65804d..b023542ba0a4c 100644 --- a/pandas/tests/indexes/datetimes/methods/test_round.py +++ b/pandas/tests/indexes/datetimes/methods/test_round.py @@ -216,6 +216,6 @@ def test_round_int64(self, start, index_freq, periods, round_freq): assert (mod == 0).all(), f"round not a {round_freq} multiple" assert (diff <= unit // 2).all(), "round error" if unit % 2 == 0: - assert ( - result.asi8[diff == unit // 2] % 2 == 0 - ).all(), "round half to even error" + assert (result.asi8[diff == unit // 2] % 2 == 0).all(), ( + "round half to even error" + ) 
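Alongside the pure formatting, several test hunks (test_downcast.py and test_missing.py above, test_to_numeric.py further down) switch Decimal construction from a float argument to a string argument. That is a real cleanup, not just style: a float argument hands Decimal the float's exact binary value, rounding error included, whereas a string preserves the digits as written. A short standard-library illustration:

>>> from decimal import Decimal
>>> Decimal("0.1")
Decimal('0.1')
>>> Decimal(0.1)  # the float's exact binary expansion leaks through
Decimal('0.1000000000000000055511151231257827021181583404541015625')

For exactly representable values such as 0.0 and 1.0 the two spellings compare equal, but the string form keeps the tests independent of binary float representation (3.14, for instance, is not exactly representable).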
diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py index 4551fdf073193..f4e0a63043335 100644 --- a/pandas/tests/indexes/datetimes/test_formats.py +++ b/pandas/tests/indexes/datetimes/test_formats.py @@ -205,12 +205,7 @@ def test_dti_representation_to_series(self, unit): exp3 = "0 2011-01-01\n1 2011-01-02\ndtype: datetime64[ns]" - exp4 = ( - "0 2011-01-01\n" - "1 2011-01-02\n" - "2 2011-01-03\n" - "dtype: datetime64[ns]" - ) + exp4 = "0 2011-01-01\n1 2011-01-02\n2 2011-01-03\ndtype: datetime64[ns]" exp5 = ( "0 2011-01-01 09:00:00+09:00\n" @@ -226,11 +221,7 @@ def test_dti_representation_to_series(self, unit): "dtype: datetime64[ns, US/Eastern]" ) - exp7 = ( - "0 2011-01-01 09:00:00\n" - "1 2011-01-02 10:15:00\n" - "dtype: datetime64[ns]" - ) + exp7 = "0 2011-01-01 09:00:00\n1 2011-01-02 10:15:00\ndtype: datetime64[ns]" with pd.option_context("display.width", 300): for idx, expected in zip( diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index bfbcdcff51ee6..c44345273466c 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -338,8 +338,7 @@ def test_take_fill_value(self): tm.assert_index_equal(result, expected) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) @@ -375,8 +374,7 @@ def test_take_fill_value_with_timezone(self): tm.assert_index_equal(result, expected) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) diff --git a/pandas/tests/indexes/interval/test_constructors.py b/pandas/tests/indexes/interval/test_constructors.py index 8db483751438c..90423149658ab 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -154,8 +154,7 @@ def test_constructor_empty(self, constructor, breaks, closed): def test_constructor_string(self, constructor, breaks): # GH 19016 msg = ( - "category, object, and string subtypes are not supported " - "for IntervalIndex" + "category, object, and string subtypes are not supported for IntervalIndex" ) with pytest.raises(TypeError, match=msg): constructor(**self.get_kwargs_from_breaks(breaks)) @@ -224,8 +223,7 @@ def test_constructor_errors(self): # GH 19016: categorical data data = Categorical(list("01234abcde"), ordered=True) msg = ( - "category, object, and string subtypes are not supported " - "for IntervalIndex" + "category, object, and string subtypes are not supported for IntervalIndex" ) with pytest.raises(TypeError, match=msg): IntervalIndex.from_arrays(data[:-1], data[1:]) @@ -297,8 +295,7 @@ def test_constructor_errors(self): # GH 19016: categorical data data = Categorical(list("01234abcde"), ordered=True) msg = ( - "category, object, and string subtypes are not supported " - "for IntervalIndex" + "category, object, and string subtypes are not supported for IntervalIndex" ) with pytest.raises(TypeError, match=msg): IntervalIndex.from_breaks(data) diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index 73bbfc91028b3..d45d894c485c9 
100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -21,12 +21,7 @@ class TestIntervalIndexRendering: [ ( Series, - ( - "(0.0, 1.0] a\n" - "NaN b\n" - "(2.0, 3.0] c\n" - "dtype: object" - ), + ("(0.0, 1.0] a\nNaN b\n(2.0, 3.0] c\ndtype: object"), ), (DataFrame, (" 0\n(0.0, 1.0] a\nNaN b\n(2.0, 3.0] c")), ], diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index d82203a53a60f..f098690be2afa 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -259,8 +259,7 @@ def test_get_indexer(self): def test_get_indexer_nearest(self): midx = MultiIndex.from_tuples([("a", 1), ("b", 2)]) msg = ( - "method='nearest' not implemented yet for MultiIndex; " - "see GitHub issue 9365" + "method='nearest' not implemented yet for MultiIndex; see GitHub issue 9365" ) with pytest.raises(NotImplementedError, match=msg): midx.get_indexer(["a"], method="nearest") diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py index 43adc09774914..3c1b98d57b2a0 100644 --- a/pandas/tests/indexes/numeric/test_indexing.py +++ b/pandas/tests/indexes/numeric/test_indexing.py @@ -479,8 +479,7 @@ def test_take_fill_value_float64(self): tm.assert_index_equal(result, expected) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py index 9f36eb1e7a1d1..dc95e19523842 100644 --- a/pandas/tests/indexes/period/test_formats.py +++ b/pandas/tests/indexes/period/test_formats.py @@ -63,8 +63,7 @@ def test_representation(self, method): exp3 = "PeriodIndex(['2011-01-01', '2011-01-02'], dtype='period[D]')" exp4 = ( - "PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], " - "dtype='period[D]')" + "PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], dtype='period[D]')" ) exp5 = "PeriodIndex(['2011', '2012', '2013'], dtype='period[Y-DEC]')" diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 2683e25eda618..00e8262ddfa4c 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -700,8 +700,7 @@ def test_take_fill_value(self): tm.assert_index_equal(result, expected) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 608158d40cf23..5b75bd9afd6df 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1112,8 +1112,7 @@ def test_take_fill_value(self): def test_take_fill_value_none_raises(self): index = Index(list("ABC"), name="xxx") msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index 4a31ae88a757a..dd228e6b713b5 100644 --- 
a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -419,8 +419,7 @@ class TestIndexConstructionErrors: def test_constructor_overflow_int64(self): # see GH#15832 msg = ( - "The elements provided in the data cannot " - "all be casted to the dtype int64" + "The elements provided in the data cannot all be casted to the dtype int64" ) with pytest.raises(OverflowError, match=msg): Index([np.iinfo(np.uint64).max - 1], dtype="int64") diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index e411555c65bea..426083cb6b67c 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -262,8 +262,7 @@ def test_take_fill_value(self): tm.assert_index_equal(result, expected) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index dc95e1bb1b8a0..2f6998a85c80b 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -763,8 +763,7 @@ def test_iloc_mask(self): "(index of the boolean Series and of the " "indexed object do not match).", ("locs", ".iloc"): ( - "iLocation based boolean indexing on an " - "integer type is not available" + "iLocation based boolean indexing on an integer type is not available" ), } diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 34824f0a67985..140cf39b26556 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -910,8 +910,7 @@ def test_corrupt_bytes_raises(self, engine): error = XLRDError msg = ( - "Unsupported format, or corrupt file: Expected BOF " - "record; found b'foo'" + "Unsupported format, or corrupt file: Expected BOF record; found b'foo'" ) elif engine == "calamine": from python_calamine import CalamineError diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py index 71ef1201e523f..0e13b2f94ed58 100644 --- a/pandas/tests/io/excel/test_style.py +++ b/pandas/tests/io/excel/test_style.py @@ -356,6 +356,6 @@ def test_format_hierarchical_rows_periodindex(merge_cells): for cell in formatted_cells: if cell.row != 0 and cell.col == 0: - assert isinstance( - cell.val, Timestamp - ), "Period should be converted to Timestamp" + assert isinstance(cell.val, Timestamp), ( + "Period should be converted to Timestamp" + ) diff --git a/pandas/tests/io/formats/style/test_style.py b/pandas/tests/io/formats/style/test_style.py index ff8a1b9f570ab..b7dcfde327b83 100644 --- a/pandas/tests/io/formats/style/test_style.py +++ b/pandas/tests/io/formats/style/test_style.py @@ -933,7 +933,7 @@ def test_trim(self, df): def test_export(self, df, styler): f = lambda x: "color: red" if x > 0 else "color: blue" - g = lambda x, z: f"color: {z}" if x > 0 else f"color: {z}" + g = lambda x, z: f"color: {z}" style1 = styler style1.map(f).map(g, z="b").highlight_max()._compute() # = render result = style1.export() diff --git a/pandas/tests/io/formats/test_css.py b/pandas/tests/io/formats/test_css.py index c4ecb48006cb1..642a562704344 100644 --- a/pandas/tests/io/formats/test_css.py +++ b/pandas/tests/io/formats/test_css.py @@ -193,8 +193,7 @@ def test_css_border_shorthands(prop, expected): ( "margin: 1px; 
margin-top: 2px", "", - "margin-left: 1px; margin-right: 1px; " - "margin-bottom: 1px; margin-top: 2px", + "margin-left: 1px; margin-right: 1px; margin-bottom: 1px; margin-top: 2px", ), ("margin-top: 2px", "margin: 1px", "margin: 1px; margin-top: 2px"), ("margin: 1px", "margin-top: 2px", "margin: 1px"), diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 7bf041a50b745..6d762fdeb8d79 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -482,10 +482,7 @@ def test_to_csv_string_with_crlf(self): # case 3: CRLF as line terminator # 'lineterminator' should not change inner element expected_crlf = ( - b"int,str_crlf\r\n" - b"1,abc\r\n" - b'2,"d\r\nef"\r\n' - b'3,"g\r\nh\r\n\r\ni"\r\n' + b'int,str_crlf\r\n1,abc\r\n2,"d\r\nef"\r\n3,"g\r\nh\r\n\r\ni"\r\n' ) df.to_csv(path, lineterminator="\r\n", index=False) with open(path, "rb") as f: diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index b1a437bfdbd8a..9c75314b66fa2 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -94,8 +94,7 @@ def test_to_html_with_column_specific_col_space_raises(): ) msg = ( - "Col_space length\\(\\d+\\) should match " - "DataFrame number of columns\\(\\d+\\)" + "Col_space length\\(\\d+\\) should match DataFrame number of columns\\(\\d+\\)" ) with pytest.raises(ValueError, match=msg): df.to_html(col_space=[30, 40]) diff --git a/pandas/tests/io/formats/test_to_markdown.py b/pandas/tests/io/formats/test_to_markdown.py index 7aa7cebb5120f..f3d9b88cc91e2 100644 --- a/pandas/tests/io/formats/test_to_markdown.py +++ b/pandas/tests/io/formats/test_to_markdown.py @@ -35,8 +35,7 @@ def test_empty_frame(): df.to_markdown(buf=buf) result = buf.getvalue() assert result == ( - "| id | first_name | last_name |\n" - "|------|--------------|-------------|" + "| id | first_name | last_name |\n|------|--------------|-------------|" ) @@ -65,8 +64,7 @@ def test_series(): s.to_markdown(buf=buf) result = buf.getvalue() assert result == ( - "| | foo |\n|---:|------:|\n| 0 | 1 " - "|\n| 1 | 2 |\n| 2 | 3 |" + "| | foo |\n|---:|------:|\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |" ) diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py index 1e8598c918efe..63c975fd831e7 100644 --- a/pandas/tests/io/formats/test_to_string.py +++ b/pandas/tests/io/formats/test_to_string.py @@ -377,17 +377,11 @@ def test_to_string_small_float_values(self): # sadness per above if _three_digit_exp(): expected = ( - " a\n" - "0 1.500000e+000\n" - "1 1.000000e-017\n" - "2 -5.500000e-007" + " a\n0 1.500000e+000\n1 1.000000e-017\n2 -5.500000e-007" ) else: expected = ( - " a\n" - "0 1.500000e+00\n" - "1 1.000000e-17\n" - "2 -5.500000e-07" + " a\n0 1.500000e+00\n1 1.000000e-17\n2 -5.500000e-07" ) assert result == expected @@ -1210,13 +1204,7 @@ def test_to_string_float_na_spacing(self): ser[::2] = np.nan result = ser.to_string() - expected = ( - "0 NaN\n" - "1 1.5678\n" - "2 NaN\n" - "3 -3.0000\n" - "4 NaN" - ) + expected = "0 NaN\n1 1.5678\n2 NaN\n3 -3.0000\n4 NaN" assert result == expected def test_to_string_with_datetimeindex(self): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 5dc1272880c9b..144b36166261b 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1267,9 +1267,7 @@ def test_default_handler_numpy_unsupported_dtype(self): columns=["a", "b"], ) expected = ( 
- '[["(1+0j)","(nan+0j)"],' - '["(2.3+0j)","(nan+0j)"],' - '["(4-5j)","(1.2+0j)"]]' + '[["(1+0j)","(nan+0j)"],["(2.3+0j)","(nan+0j)"],["(4-5j)","(1.2+0j)"]]' ) assert df.to_json(default_handler=str, orient="values") == expected @@ -1372,11 +1370,7 @@ def test_tz_is_naive(self): ) def test_tz_range_is_utc(self, tz_range): exp = '["2013-01-01T05:00:00.000Z","2013-01-02T05:00:00.000Z"]' - dfexp = ( - '{"DT":{' - '"0":"2013-01-01T05:00:00.000Z",' - '"1":"2013-01-02T05:00:00.000Z"}}' - ) + dfexp = '{"DT":{"0":"2013-01-01T05:00:00.000Z","1":"2013-01-02T05:00:00.000Z"}}' assert ujson_dumps(tz_range, iso_dates=True) == exp dti = DatetimeIndex(tz_range) @@ -1775,7 +1769,7 @@ def test_read_json_with_url_value(self, url): ) def test_read_json_with_very_long_file_path(self, compression): # GH 46718 - long_json_path = f'{"a" * 1000}.json{compression}' + long_json_path = f"{'a' * 1000}.json{compression}" with pytest.raises( FileNotFoundError, match=f"File {long_json_path} does not exist" ): diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 3c843479b446a..d482eb5fa1a06 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -236,9 +236,9 @@ def test_readjson_chunks_closes(chunksize): ) with reader: reader.read() - assert ( - reader.handles.handle.closed - ), f"didn't close stream with chunksize = {chunksize}" + assert reader.handles.handle.closed, ( + f"didn't close stream with chunksize = {chunksize}" + ) @pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"]) @@ -435,8 +435,7 @@ def test_to_json_append_mode(mode_): # Test ValueError when mode is not supported option df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]}) msg = ( - f"mode={mode_} is not a valid option." - "Only 'w' and 'a' are currently supported." + f"mode={mode_} is not a valid option.Only 'w' and 'a' are currently supported." 
) with pytest.raises(ValueError, match=msg): df.to_json(mode=mode_, lines=False, orient="records") diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index c5ccc3b3f7184..8f49afdb1f289 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -991,7 +991,7 @@ def test_decode_array(self, arr): def test_decode_extreme_numbers(self, extreme_num): assert extreme_num == ujson.ujson_loads(str(extreme_num)) - @pytest.mark.parametrize("too_extreme_num", [f"{2**64}", f"{-2**63-1}"]) + @pytest.mark.parametrize("too_extreme_num", [f"{2**64}", f"{-(2**63) - 1}"]) def test_decode_too_extreme_numbers(self, too_extreme_num): with pytest.raises( ValueError, @@ -1006,7 +1006,7 @@ def test_decode_with_trailing_non_whitespaces(self): with pytest.raises(ValueError, match="Trailing data"): ujson.ujson_loads("{}\n\t a") - @pytest.mark.parametrize("value", [f"{2**64}", f"{-2**63-1}"]) + @pytest.mark.parametrize("value", [f"{2**64}", f"{-(2**63) - 1}"]) def test_decode_array_with_big_int(self, value): with pytest.raises( ValueError, diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index ed2e729430b01..a73327beea8bb 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -131,8 +131,7 @@ def test_catch_too_many_names(all_parsers): msg = ( "Too many columns specified: expected 4 and found 3" if parser.engine == "c" - else "Number of passed names did not match " - "number of header fields in the file" + else "Number of passed names did not match number of header fields in the file" ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index d3789cd387c05..55c8bbc4bb9e1 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -136,7 +136,7 @@ def test_mangled_unnamed_placeholders(all_parsers): expected = DataFrame(columns=Index([], dtype="str")) for j in range(i + 1): - col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1) + col_name = "Unnamed: 0" + f".{1 * j}" * min(j, 1) expected.insert(loc=0, column=col_name, value=[0, 1, 2]) expected[orig_key] = orig_value diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 1411ed5019766..9a15d9bc84a2e 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -228,7 +228,7 @@ def test_parse_tz_aware(all_parsers): def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs): # see gh-5636 parser = all_parsers - msg = "Only booleans and lists " "are accepted for the 'parse_dates' parameter" + msg = "Only booleans and lists are accepted for the 'parse_dates' parameter" data = """A,B,C 1,2,2003-11-1""" @@ -239,7 +239,7 @@ def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs): @pytest.mark.parametrize("parse_dates", [(1,), np.array([4, 5]), {1, 3}]) def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): parser = all_parsers - msg = "Only booleans and lists " "are accepted for the 'parse_dates' parameter" + msg = "Only booleans and lists are accepted for the 'parse_dates' parameter" data = """A,B,C 1,2,2003-11-1""" diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 5c07a56c9fb3f..d897d251909fe 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ 
@@ -1503,8 +1503,7 @@ def test_bad_xml(parser):
     with pytest.raises(
         SyntaxError,
         match=(
-            "Extra content at the end of the document|"
-            "junk after document element"
+            "Extra content at the end of the document|junk after document element"
         ),
     ):
         read_xml(
diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py
index 9675b936c171e..c3b0219971446 100644
--- a/pandas/tests/plotting/test_series.py
+++ b/pandas/tests/plotting/test_series.py
@@ -427,7 +427,7 @@ def test_pie_series_autopct_and_fontsize(self):
         ax = _check_plot_works(
             series.plot.pie, colors=color_args, autopct="%.2f", fontsize=7
         )
-        pcts = [f"{s*100:.2f}" for s in series.values / series.sum()]
+        pcts = [f"{s * 100:.2f}" for s in series.values / series.sum()]
         expected_texts = list(chain.from_iterable(zip(series.index, pcts)))
         _check_text_labels(ax.texts, expected_texts)
         for t in ax.texts:
diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py
index 30e2c9dfe3d30..3cc95922e7f2f 100644
--- a/pandas/tests/resample/test_time_grouper.py
+++ b/pandas/tests/resample/test_time_grouper.py
@@ -353,7 +353,7 @@ def test_groupby_resample_interpolate_raises(groupy_test_df):
     for df in dfs:
         with pytest.raises(
             NotImplementedError,
-            match="Direct interpolation of MultiIndex data frames is " "not supported",
+            match="Direct interpolation of MultiIndex data frames is not supported",
         ):
             df.groupby("volume").resample("1D").interpolate(method="linear")
diff --git a/pandas/tests/reshape/merge/test_merge_cross.py b/pandas/tests/reshape/merge/test_merge_cross.py
index 14f9036e43fce..6ab80cf0e0823 100644
--- a/pandas/tests/reshape/merge/test_merge_cross.py
+++ b/pandas/tests/reshape/merge/test_merge_cross.py
@@ -42,8 +42,7 @@ def test_merge_cross_error_reporting(kwargs):
     left = DataFrame({"a": [1, 3]})
     right = DataFrame({"b": [3, 4]})
     msg = (
-        "Can not pass on, right_on, left_on or set right_index=True or "
-        "left_index=True"
+        "Can not pass on, right_on, left_on or set right_index=True or left_index=True"
     )
     with pytest.raises(MergeError, match=msg):
         merge(left, right, how="cross", **kwargs)
@@ -94,8 +93,7 @@ def test_join_cross_error_reporting():
     left = DataFrame({"a": [1, 3]})
     right = DataFrame({"a": [3, 4]})
     msg = (
-        "Can not pass on, right_on, left_on or set right_index=True or "
-        "left_index=True"
+        "Can not pass on, right_on, left_on or set right_index=True or left_index=True"
     )
     with pytest.raises(MergeError, match=msg):
         left.join(right, how="cross", on="a")
diff --git a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py
index e029dfc3b2703..45caeb1733590 100644
--- a/pandas/tests/scalar/timedelta/test_constructors.py
+++ b/pandas/tests/scalar/timedelta/test_constructors.py
@@ -353,8 +353,7 @@ def test_construction():
         Timedelta("foo")

     msg = (
-        "cannot construct a Timedelta from "
-        "the passed arguments, allowed keywords are "
+        "cannot construct a Timedelta from the passed arguments, allowed keywords are "
     )
     with pytest.raises(ValueError, match=msg):
         Timedelta(day=10)
diff --git a/pandas/tests/series/methods/test_between.py b/pandas/tests/series/methods/test_between.py
index e67eafbd118ce..f035767e2ce0e 100644
--- a/pandas/tests/series/methods/test_between.py
+++ b/pandas/tests/series/methods/test_between.py
@@ -66,8 +66,7 @@ def test_between_error_args(self, inclusive):
         left, right = series[[2, 7]]

         value_error_msg = (
-            "Inclusive has to be either string of 'both',"
-            "'left', 'right', or 'neither'."
+            "Inclusive has to be either string of 'both','left', 'right', or 'neither'."
         )

         series = Series(date_range("1/1/2000", periods=10))
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
index 74b051aec71a4..566fd8d901569 100644
--- a/pandas/tests/tools/test_to_datetime.py
+++ b/pandas/tests/tools/test_to_datetime.py
@@ -1935,7 +1935,7 @@ def test_to_datetime_unit_na_values(self):
     @pytest.mark.parametrize("bad_val", ["foo", 111111111])
     def test_to_datetime_unit_invalid(self, bad_val):
         if bad_val == "foo":
-            msg = "Unknown datetime string format, unable to parse: " f"{bad_val}"
+            msg = f"Unknown datetime string format, unable to parse: {bad_val}"
         else:
             msg = "cannot convert input 111111111 with the unit 'D'"
         with pytest.raises(ValueError, match=msg):
@@ -2258,7 +2258,7 @@ def test_to_datetime_iso8601_exact_fails(self, input, format):
         [
             '^unconverted data remains when parsing with format ".*": ".*". '
             f"{PARSING_ERR_MSG}$",
-            f'^time data ".*" doesn\'t match format ".*". ' f"{PARSING_ERR_MSG}$",
+            f'^time data ".*" doesn\'t match format ".*". {PARSING_ERR_MSG}$',
         ]
     )
     with pytest.raises(
diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py
index f3645bf0649bd..893f526fb3eb0 100644
--- a/pandas/tests/tools/test_to_numeric.py
+++ b/pandas/tests/tools/test_to_numeric.py
@@ -192,7 +192,7 @@ def test_numeric_df_columns(columns):
     # see gh-14827
     df = DataFrame(
         {
-            "a": [1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), "0.1"],
+            "a": [1.2, decimal.Decimal("3.14"), decimal.Decimal("infinity"), "0.1"],
             "b": [1.0, 2.0, 3.0, 4.0],
         }
     )
@@ -207,10 +207,10 @@ def test_numeric_df_columns(columns):
     "data,exp_data",
     [
         (
-            [[decimal.Decimal(3.14), 1.0], decimal.Decimal(1.6), 0.1],
+            [[decimal.Decimal("3.14"), 1.0], decimal.Decimal("1.6"), 0.1],
             [[3.14, 1.0], 1.6, 0.1],
         ),
-        ([np.array([decimal.Decimal(3.14), 1.0]), 0.1], [[3.14, 1.0], 0.1]),
+        ([np.array([decimal.Decimal("3.14"), 1.0]), 0.1], [[3.14, 1.0], 0.1]),
     ],
 )
 def test_numeric_embedded_arr_likes(data, exp_data):
diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py
index d0192c12f9518..7480b99595066 100644
--- a/pandas/tests/tseries/offsets/test_offsets.py
+++ b/pandas/tests/tseries/offsets/test_offsets.py
@@ -798,9 +798,9 @@ def test_get_offset():

     for name, expected in pairs:
         offset = _get_offset(name)
-        assert (
-            offset == expected
-        ), f"Expected {name!r} to yield {expected!r} (actual: {offset!r})"
+        assert offset == expected, (
+            f"Expected {name!r} to yield {expected!r} (actual: {offset!r})"
+        )


 def test_get_offset_legacy():
diff --git a/pandas/tests/tseries/offsets/test_ticks.py b/pandas/tests/tseries/offsets/test_ticks.py
index f91230e1460c4..46b6846ad1ec2 100644
--- a/pandas/tests/tseries/offsets/test_ticks.py
+++ b/pandas/tests/tseries/offsets/test_ticks.py
@@ -289,8 +289,7 @@ def test_tick_rdiv(cls):
     td64 = delta.to_timedelta64()
     instance__type = ".".join([cls.__module__, cls.__name__])
     msg = (
-        "unsupported operand type\\(s\\) for \\/: 'int'|'float' and "
-        f"'{instance__type}'"
+        f"unsupported operand type\\(s\\) for \\/: 'int'|'float' and '{instance__type}'"
     )

     with pytest.raises(TypeError, match=msg):
diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py
index 07425af8ed37a..bc5cd5fcccbf8 100644
--- a/pandas/tests/tslibs/test_parsing.py
+++ b/pandas/tests/tslibs/test_parsing.py
@@ -134,10 +134,7 @@ def test_does_not_convert_mixed_integer(date_string, expected):
         (
             "2013Q1",
             {"freq": "INVLD-L-DEC-SAT"},
-            (
-                "Unable to retrieve month information "
-                "from given freq: INVLD-L-DEC-SAT"
-            ),
+            ("Unable to retrieve month information from given freq: INVLD-L-DEC-SAT"),
         ),
     ],
 )
diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py
index 9a01568971af8..88ea1bfa3c6ed 100644
--- a/pandas/tseries/frequencies.py
+++ b/pandas/tseries/frequencies.py
@@ -145,8 +145,7 @@ def infer_freq(
         pass
     elif isinstance(index.dtype, PeriodDtype):
         raise TypeError(
-            "PeriodIndex given. Check the `freq` attribute "
-            "instead of using infer_freq."
+            "PeriodIndex given. Check the `freq` attribute instead of using infer_freq."
         )
     elif lib.is_np_dtype(index.dtype, "m"):
         # Allow TimedeltaIndex and TimedeltaArray
diff --git a/pyproject.toml b/pyproject.toml
index 7ab9cd2c17669..c6af69438f849 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -746,5 +746,5 @@ exclude_lines = [
 directory = "coverage_html_report"

 [tool.codespell]
-ignore-words-list = "blocs, coo, hist, nd, sav, ser, recuse, nin, timere, expec, expecs, indext, SME, NotIn, tructures, tru"
+ignore-words-list = "blocs, coo, hist, nd, sav, ser, recuse, nin, timere, expec, expecs, indext, SME, NotIn, tructures, tru, indx, abd, ABD"
 ignore-regex = 'https://([\w/\.])+'

From 3866b98121e84b6fd01ed08de008372aa50e0841 Mon Sep 17 00:00:00 2001
From: Nitish Satyavolu
Date: Mon, 3 Feb 2025 14:07:49 -0800
Subject: [PATCH 63/67] DOC: Fix description of skipna parameter in groupby
 reductions (#60842)

---
 pandas/core/groupby/groupby.py | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index fdf2aab434695..27865a60f6ea3 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -2170,8 +2170,7 @@ def mean(
             numeric_only no longer accepts ``None`` and defaults to ``False``.

         skipna : bool, default True
-            Exclude NA/null values. If an entire row/column is NA, the result
-            will be NA.
+            Exclude NA/null values. If an entire group is NA, the result will be NA.

             .. versionadded:: 3.0.0
@@ -2271,8 +2270,7 @@ def median(self, numeric_only: bool = False, skipna: bool = True) -> NDFrameT:
             numeric_only no longer accepts ``None`` and defaults to False.

         skipna : bool, default True
-            Exclude NA/null values. If an entire row/column is NA, the result
-            will be NA.
+            Exclude NA/null values. If an entire group is NA, the result will be NA.

             .. versionadded:: 3.0.0
@@ -2405,8 +2403,7 @@ def std(
             numeric_only now defaults to ``False``.

         skipna : bool, default True
-            Exclude NA/null values. If an entire row/column is NA, the result
-            will be NA.
+            Exclude NA/null values. If an entire group is NA, the result will be NA.

             .. versionadded:: 3.0.0
@@ -2524,8 +2521,7 @@ def var(
             numeric_only now defaults to ``False``.

         skipna : bool, default True
-            Exclude NA/null values. If an entire row/column is NA, the result
-            will be NA.
+            Exclude NA/null values. If an entire group is NA, the result will be NA.

             .. versionadded:: 3.0.0
@@ -2742,8 +2738,7 @@ def sem(
             numeric_only now defaults to ``False``.

         skipna : bool, default True
-            Exclude NA/null values. If an entire row/column is NA, the result
-            will be NA.
+            Exclude NA/null values. If an entire group is NA, the result will be NA.

             .. versionadded:: 3.0.0
@@ -3021,8 +3016,7 @@ def prod(
             than ``min_count`` non-NA values are present the result will be NA.

         skipna : bool, default True
-            Exclude NA/null values. If an entire row/column is NA, the result
-            will be NA.
+            Exclude NA/null values. If an entire group is NA, the result will be NA.

             .. versionadded:: 3.0.0
@@ -3242,8 +3236,7 @@ def first(
             The required number of valid values to perform the operation. If fewer
             than ``min_count`` valid values are present the result will be NA.
         skipna : bool, default True
-            Exclude NA/null values. If an entire row/column is NA, the result
-            will be NA.
+            Exclude NA/null values. If an entire group is NA, the result will be NA.

             .. versionadded:: 2.2.1
@@ -3329,8 +3322,7 @@ def last(
             The required number of valid values to perform the operation. If fewer
             than ``min_count`` valid values are present the result will be NA.
         skipna : bool, default True
-            Exclude NA/null values. If an entire row/column is NA, the result
-            will be NA.
+            Exclude NA/null values. If an entire group is NA, the result will be NA.

             .. versionadded:: 2.2.1
@@ -5530,8 +5522,7 @@ def _idxmax_idxmin(
         numeric_only : bool, default False
             Include only float, int, boolean columns.
         skipna : bool, default True
-            Exclude NA/null values. If an entire row/column is NA, the result
-            will be NA.
+            Exclude NA/null values. If an entire group is NA, the result will be NA.
         ignore_unobserved : bool, default False
             When True and an unobserved group is encountered, do not raise. This used
             for transform where unobserved groups do not play an impact on the result.
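The wording corrected above can be illustrated with a small, hypothetical example (the ``skipna`` keyword on groupby reductions is new in pandas 3.0.0, as the docstrings note; data and expected outputs here are illustrative, not part of the patch):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "key": ["a", "a", "b", "b"],
        "val": [1.0, np.nan, np.nan, np.nan],  # group "b" is entirely NA
    }
)

# skipna=True (default): NA values are dropped within each group,
# and a group that is entirely NA yields NA.
df.groupby("key")["val"].mean()
# key
# a    1.0
# b    NaN

# skipna=False: any NA in a group propagates to that group's result.
df.groupby("key")["val"].mean(skipna=False)
# key
# a    NaN
# b    NaN
```

The old phrasing ("an entire row/column") described DataFrame reductions; for grouped reductions the NA unit is the group, which is what the patch corrects.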
double_precision", + [ + ("1337.1337", 15), + ("0.95", 1), + ("0.94", 1), + ("1.95", 1), + ("-1.95", 1), + ("0.995", 2), + ("0.9995", 3), + ("0.99999999999999944", 15), + ], + ) + def test_encode_decimal(self, value, double_precision): + sut = decimal.Decimal(value) + encoded = ujson.ujson_dumps(sut, double_precision=double_precision) decoded = ujson.ujson_loads(encoded) - assert decoded == "0.99999999999999944" + assert decoded == value @pytest.mark.parametrize("ensure_ascii", [True, False]) def test_encode_string_conversion(self, ensure_ascii): From e8306037a3a5782b18d3f8db81ae1dbde8ec21bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quang=20Nguy=E1=BB=85n?= <30631476+quangngd@users.noreply.github.com> Date: Tue, 4 Feb 2025 10:27:32 +0700 Subject: [PATCH 65/67] BUG: stack with empty level list (#60826) * return early if set_levels is empty * add test * add whatsnew * check empty before make set --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/reshape/reshape.py | 2 ++ pandas/tests/frame/test_stack_unstack.py | 19 +++++++++++++++++++ 3 files changed, 22 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 9089b9cdd2185..95b5f7eea5eeb 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -766,6 +766,7 @@ Reshaping - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`) - Bug in :meth:`DataFrame.merge` when merging two :class:`DataFrame` on ``intc`` or ``uintc`` types on Windows (:issue:`60091`, :issue:`58713`) - Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`) +- Bug in :meth:`DataFrame.stack` with the new implementation where ``ValueError`` is raised when ``level=[]`` (:issue:`60740`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`) Sparse diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 9b7b768fe7adb..c60fe71a7ff28 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -929,6 +929,8 @@ def _reorder_for_extension_array_stack( def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: if frame.columns.nunique() != len(frame.columns): raise ValueError("Columns with duplicate values are not supported in stack") + if not len(level): + return frame set_levels = set(level) stack_cols = frame.columns._drop_level_numbers( [k for k in range(frame.columns.nlevels - 1, -1, -1) if k not in set_levels] diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index abc14d10514fa..22fdfd3a01408 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1452,6 +1452,25 @@ def test_stack_empty_frame(dropna, future_stack): tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated") +@pytest.mark.parametrize("dropna", [True, False, lib.no_default]) +def test_stack_empty_level(dropna, future_stack, int_frame): + # GH 60740 + if future_stack and dropna is not lib.no_default: + with pytest.raises(ValueError, match="dropna must be unspecified"): + DataFrame(dtype=np.int64).stack(dropna=dropna, future_stack=future_stack) + else: + expected = int_frame + result = int_frame.copy().stack( + level=[], dropna=dropna, future_stack=future_stack + ) + 
tm.assert_frame_equal(result, expected) + + expected = DataFrame() + result = DataFrame().stack(level=[], dropna=dropna, future_stack=future_stack) + tm.assert_frame_equal(result, expected) + + @pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated") @pytest.mark.parametrize("dropna", [True, False, lib.no_default]) @pytest.mark.parametrize("fill_value", [None, 0]) From b2a7a262977391a09a49295dec4bebe0a120e316 Mon Sep 17 00:00:00 2001 From: Shashwat Agrawal <72117025+ShashwatAgrawal20@users.noreply.github.com> Date: Tue, 4 Feb 2025 23:20:48 +0530 Subject: [PATCH 66/67] DOC: `pandas.DataFrame.to_html` additional description for the border parameter (#60830) * should work * fix: proper backticks --- pandas/core/frame.py | 10 +++++++--- pandas/io/formats/format.py | 10 +++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d9f7623064e05..b715e526e0f33 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3205,9 +3205,13 @@ def to_html( Convert the characters <, >, and & to HTML-safe sequences. notebook : {True, False}, default False Whether the generated HTML is for IPython Notebook. - border : int - A ``border=border`` attribute is included in the opening - `` tag. Default ``pd.options.display.html.border``. + border : int or bool + When an integer value is provided, it sets the border attribute in + the opening tag, specifying the thickness of the border. + If ``False`` or ``0`` is passed, the border attribute will not + be present in the ``
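A minimal sketch of the fixed behavior, assuming the new stack implementation that this early return patches (hypothetical frame; mirrors the test above):

```python
import pandas as pd

df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})

# With the early return added in stack_v3, stacking on an empty level
# list is a no-op that returns the frame unchanged, instead of raising
# ValueError as before (GH 60740).
result = df.stack(level=[])
assert result.equals(df)
```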
From b2a7a262977391a09a49295dec4bebe0a120e316 Mon Sep 17 00:00:00 2001
From: Shashwat Agrawal <72117025+ShashwatAgrawal20@users.noreply.github.com>
Date: Tue, 4 Feb 2025 23:20:48 +0530
Subject: [PATCH 66/67] DOC: `pandas.DataFrame.to_html` additional description
 for the border parameter (#60830)

* should work

* fix: proper backticks
---
 pandas/core/frame.py        | 10 +++++++---
 pandas/io/formats/format.py | 10 +++++++---
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index d9f7623064e05..b715e526e0f33 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -3205,9 +3205,13 @@ def to_html(
             Convert the characters <, >, and & to HTML-safe sequences.
         notebook : {True, False}, default False
             Whether the generated HTML is for IPython Notebook.
-        border : int
-            A ``border=border`` attribute is included in the opening
-            `<table>` tag. Default ``pd.options.display.html.border``.
+        border : int or bool
+            When an integer value is provided, it sets the border attribute in
+            the opening tag, specifying the thickness of the border.
+            If ``False`` or ``0`` is passed, the border attribute will not
+            be present in the ``<table>`` tag.
+            The default value for this parameter is governed by
+            ``pd.options.display.html.border``.
         table_id : str, optional
             A css id is included in the opening `<table>` tag if specified.
         render_links : bool, default False
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index 46ecb2b9a8f12..b7fbc4e5e22b7 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -897,9 +897,13 @@ def to_html(
         ``<table>`` tag, in addition to the default "dataframe".
     notebook : {True, False}, optional, default False
         Whether the generated HTML is for IPython Notebook.
-    border : int
-        A ``border=border`` attribute is included in the opening
-        ``<table>`` tag. Default ``pd.options.display.html.border``.
+    border : int or bool
+        When an integer value is provided, it sets the border attribute in
+        the opening tag, specifying the thickness of the border.
+        If ``False`` or ``0`` is passed, the border attribute will not
+        be present in the ``<table>`` tag.
+        The default value for this parameter is governed by
+        ``pd.options.display.html.border``.
     table_id : str, optional
         A css id is included in the opening `<table>` tag if specified.
     render_links : bool, default False
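Per the clarified description above, the rendered output behaves roughly as in this sketch (illustrative values and assertions based on the docstring, not part of the patch):

```python
import pandas as pd

df = pd.DataFrame({"A": [1, 2]})

# An integer sets the border attribute on the opening <table> tag,
# e.g. <table border="5" class="dataframe">.
assert 'border="5"' in df.to_html(border=5)

# False (or 0) omits the border attribute entirely.
assert "border=" not in df.to_html(border=0)

# With no argument, the attribute follows pd.options.display.html.border,
# which defaults to 1.
assert 'border="1"' in df.to_html()
```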
From 68569a683a9e1068a82397a113b6dd2d8fa9cdd1 Mon Sep 17 00:00:00 2001
From: Ehsan Totoni
Date: Tue, 4 Feb 2025 12:55:17 -0500
Subject: [PATCH 67/67] DOC: Update Bodo project description in ecosystem page
 (#60846)

---
 web/pandas/community/ecosystem.md | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md
index dc7b9bc947214..29297488da64f 100644
--- a/web/pandas/community/ecosystem.md
+++ b/web/pandas/community/ecosystem.md
@@ -496,17 +496,29 @@ You can find more information about the Hugging Face Dataset Hub in the [documen

 ## Out-of-core

-### [Bodo](https://bodo.ai/)
+### [Bodo](https://github.com/bodo-ai/Bodo)

-Bodo is a high-performance Python computing engine that automatically parallelizes and
-optimizes your code through compilation using HPC (high-performance computing) techniques.
-Designed to operate with native pandas dataframes, Bodo compiles your pandas code to execute
-across multiple cores on a single machine or distributed clusters of multiple compute nodes efficiently.
-Bodo also makes distributed pandas dataframes queryable with SQL.
-The community edition of Bodo is free to use on up to 8 cores. Beyond that, Bodo offers a paid
-enterprise edition. Free licenses of Bodo (for more than 8 cores) are available
-[upon request](https://www.bodo.ai/contact) for academic and non-profit use.
+Bodo is a high-performance compute engine for Python data processing.
+Using an auto-parallelizing just-in-time (JIT) compiler, Bodo simplifies scaling Pandas
+workloads from laptops to clusters without major code changes.
+Under the hood, Bodo relies on MPI-based high-performance computing (HPC) technology—making it
+both easier to use and often much faster than alternatives.
+Bodo also provides a SQL engine that can query distributed pandas dataframes efficiently.
+
+```python
+import pandas as pd
+import bodo
+
+@bodo.jit
+def process_data():
+    df = pd.read_parquet("my_data.pq")
+    df2 = pd.DataFrame({"A": df.apply(lambda r: 0 if r.A == 0 else (r.B // r.A), axis=1)})
+    df2.to_parquet("out.pq")
+
+process_data()
+```

 ### [Cylon](https://cylondata.org/)