Skip to content

Commit a81d52f

Browse files
authored
ENH: Support kurtosis (kurt) in DataFrameGroupBy and SeriesGroupBy (#60433)
* ENH: Support kurtosis (kurt) in DataFrameGroupBy and SeriesGroupBy * ENH: Address review comments * ENH: Fix comments in new test cases * ENH: Skip pyarrow test case if no pyarrow available * ENH: Update to intp instead of np.intp * ENH: Change intp to int64 * Address review comments
1 parent 11cc7e0 commit a81d52f

18 files changed

+436
-24
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ Other enhancements
5555
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
5656
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
5757
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
58+
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
5859
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now support positional arguments passed as kwargs (:issue:`58995`)
5960
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
6061
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)

pandas/_libs/groupby.pyi

+9
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,15 @@ def group_skew(
9797
result_mask: np.ndarray | None = ...,
9898
skipna: bool = ...,
9999
) -> None: ...
100+
def group_kurt(
101+
out: np.ndarray, # float64_t[:, ::1]
102+
counts: np.ndarray, # int64_t[::1]
103+
values: np.ndarray, # ndarray[float64_t, ndim=2]
104+
labels: np.ndarray, # const intp_t[::1]
105+
mask: np.ndarray | None = ..., # const uint8_t[:, ::1]
106+
result_mask: np.ndarray | None = ..., # uint8_t[:, ::1]
107+
skipna: bool = ...,
108+
) -> None: ...
100109
def group_mean(
101110
out: np.ndarray, # floating[:, ::1]
102111
counts: np.ndarray, # int64_t[::1]

pandas/_libs/groupby.pyx

+96-2
Original file line numberDiff line numberDiff line change
@@ -910,7 +910,7 @@ def group_var(
910910
@cython.wraparound(False)
911911
@cython.boundscheck(False)
912912
@cython.cdivision(True)
913-
@cython.cpow
913+
@cython.cpow(True)
914914
def group_skew(
915915
float64_t[:, ::1] out,
916916
int64_t[::1] counts,
@@ -961,7 +961,7 @@ def group_skew(
961961
isna_entry = _treat_as_na(val, False)
962962

963963
if not isna_entry:
964-
# Based on RunningStats::Push from
964+
# Running stats update based on RunningStats::Push from
965965
# https://www.johndcook.com/blog/skewness_kurtosis/
966966
n1 = nobs[lab, j]
967967
n = n1 + 1
@@ -995,6 +995,100 @@ def group_skew(
995995
)
996996

997997

998+
# Compute the unbiased (bias-corrected) sample excess kurtosis of `values`
# for every (group, column) pair and write it into `out` in place.
#
# out         : result buffer, indexed as out[group, column]; overwritten.
# counts      : incremented in place once per row whose label is >= 0.
# values      : 2-D float64 data; row i belongs to group labels[i].
# labels      : group index of each row; rows with a negative label are skipped.
# mask        : optional; when given, mask[i, j] marks values[i, j] as NA,
#               otherwise NA-ness is decided by _treat_as_na.
# result_mask : optional; set to 1 where a (group, column) has fewer than
#               4 valid observations.
# skipna      : when False, an NA entry makes that (group, column) result NaN
#               instead of being ignored.
#
# Raises ValueError when `values` and `labels` differ in length.
@cython.wraparound(False)
999+
@cython.boundscheck(False)
1000+
@cython.cdivision(True)
1001+
@cython.cpow(True)
1002+
def group_kurt(
1003+
float64_t[:, ::1] out,
1004+
int64_t[::1] counts,
1005+
ndarray[float64_t, ndim=2] values,
1006+
const intp_t[::1] labels,
1007+
const uint8_t[:, ::1] mask=None,
1008+
uint8_t[:, ::1] result_mask=None,
1009+
bint skipna=True,
1010+
) -> None:
1011+
cdef:
1012+
Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
1013+
int64_t[:, ::1] nobs
1014+
Py_ssize_t len_values = len(values), len_labels = len(labels)
1015+
bint isna_entry, uses_mask = mask is not None
1016+
float64_t[:, ::1] M1, M2, M3, M4
1017+
float64_t delta, delta_n, delta_n2, term1, val
1018+
int64_t n1, n
1019+
float64_t ct, num, den, adj
1020+
1021+
if len_values != len_labels:
1022+
raise ValueError("len(index) != len(labels)")
1023+
1024+
# nobs[group, column] counts the non-NA values seen so far.
nobs = np.zeros((<object>out).shape, dtype=np.int64)
1025+
1026+
# M1 is the running mean; M2, M3 and M4 are the running sums of the 2nd,
# 3rd and 4th powers of the deviations from that mean, per (group, column).
1027+
M1 = np.zeros((<object>out).shape, dtype=np.float64)
1028+
M2 = np.zeros((<object>out).shape, dtype=np.float64)
1029+
M3 = np.zeros((<object>out).shape, dtype=np.float64)
1030+
M4 = np.zeros((<object>out).shape, dtype=np.float64)
1031+
1032+
N, K = (<object>values).shape
1033+
1034+
out[:, :] = 0.0
1035+
1036+
with nogil:
1037+
for i in range(N):
1038+
lab = labels[i]
1039+
if lab < 0:
1040+
continue
1041+
1042+
# Counts every labelled row, including rows whose entries are NA.
counts[lab] += 1
1043+
1044+
for j in range(K):
1045+
val = values[i, j]
1046+
1047+
if uses_mask:
1048+
isna_entry = mask[i, j]
1049+
else:
1050+
isna_entry = _treat_as_na(val, False)
1051+
1052+
if not isna_entry:
1053+
# Running stats update based on RunningStats::Push from
1054+
# https://www.johndcook.com/blog/skewness_kurtosis/
1055+
n1 = nobs[lab, j]
1056+
n = n1 + 1
1057+
1058+
nobs[lab, j] = n
1059+
delta = val - M1[lab, j]
1060+
delta_n = delta / n
1061+
delta_n2 = delta_n * delta_n
1062+
term1 = delta * delta_n * n1
1063+
1064+
# Update order matters: M4 needs the previous M2 and M3, and M3 needs
# the previous M2, so M2 is updated last.
M1[lab, j] += delta_n
1065+
M4[lab, j] += (term1 * delta_n2 * (n*n - 3*n + 3)
1066+
+ 6 * delta_n2 * M2[lab, j]
1067+
- 4 * delta_n * M3[lab, j])
1068+
M3[lab, j] += term1 * delta_n * (n - 2) - 3 * delta_n * M2[lab, j]
1069+
M2[lab, j] += term1
1070+
elif not skipna:
1071+
# Poison the accumulators: NaN propagates through every later update,
# so this (group, column) ends up NaN.
M1[lab, j] = NaN
1072+
M2[lab, j] = NaN
1073+
M3[lab, j] = NaN
1074+
M4[lab, j] = NaN
1075+
1076+
for i in range(ngroups):
1077+
for j in range(K):
1078+
ct = <float64_t>nobs[i, j]
1079+
# The estimator divides by (ct - 2) * (ct - 3), so it needs at least
# 4 valid observations.
if ct < 4:
1080+
if result_mask is not None:
1081+
result_mask[i, j] = 1
1082+
out[i, j] = NaN
1083+
elif M2[i, j] == 0:
1084+
# Zero sum of squared deviations (no spread): report 0 rather than
# dividing by zero.
out[i, j] = 0
1085+
else:
1086+
# Bias-corrected excess kurtosis:
#   n (n + 1) (n - 1) M4 / ((n - 2) (n - 3) M2^2)
#   - 3 (n - 1)^2 / ((n - 2) (n - 3))
num = ct * (ct + 1) * (ct - 1) * M4[i, j]
1087+
den = (ct - 2) * (ct - 3) * M2[i, j] ** 2
1088+
adj = 3.0 * (ct - 1) ** 2 / ((ct - 2) * (ct - 3))
1089+
out[i, j] = num / den - adj
1090+
1091+
9981092
@cython.wraparound(False)
9991093
@cython.boundscheck(False)
10001094
def group_mean(

pandas/core/arrays/base.py

+1
Original file line numberDiff line numberDiff line change
@@ -2618,6 +2618,7 @@ def _groupby_op(
26182618
"sem",
26192619
"var",
26202620
"skew",
2621+
"kurt",
26212622
]:
26222623
raise TypeError(
26232624
f"dtype '{self.dtype}' does not support operation '{how}'"

pandas/core/arrays/categorical.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2736,7 +2736,7 @@ def _groupby_op(
27362736
op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na)
27372737

27382738
dtype = self.dtype
2739-
if how in ["sum", "prod", "cumsum", "cumprod", "skew"]:
2739+
if how in ["sum", "prod", "cumsum", "cumprod", "skew", "kurt"]:
27402740
raise TypeError(f"{dtype} type does not support {how} operations")
27412741
if how in ["min", "max", "rank", "idxmin", "idxmax"] and not dtype.ordered:
27422742
# raise TypeError instead of NotImplementedError to ensure we

pandas/core/arrays/datetimelike.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -1656,7 +1656,7 @@ def _groupby_op(
16561656
dtype = self.dtype
16571657
if dtype.kind == "M":
16581658
# Adding/multiplying datetimes is not valid
1659-
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]:
1659+
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew", "kurt"]:
16601660
raise TypeError(f"datetime64 type does not support operation '{how}'")
16611661
if how in ["any", "all"]:
16621662
# GH#34479
@@ -1667,7 +1667,7 @@ def _groupby_op(
16671667

16681668
elif isinstance(dtype, PeriodDtype):
16691669
# Adding/multiplying Periods is not valid
1670-
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]:
1670+
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew", "kurt"]:
16711671
raise TypeError(f"Period type does not support {how} operations")
16721672
if how in ["any", "all"]:
16731673
# GH#34479
@@ -1677,7 +1677,7 @@ def _groupby_op(
16771677
)
16781678
else:
16791679
# timedeltas we can add but not multiply
1680-
if how in ["prod", "cumprod", "skew", "var"]:
1680+
if how in ["prod", "cumprod", "skew", "kurt", "var"]:
16811681
raise TypeError(f"timedelta64 type does not support {how} operations")
16821682

16831683
# All of the functions implemented here are ordinal, so we can

pandas/core/groupby/base.py

+1
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ class OutputKey:
5050
"sem",
5151
"size",
5252
"skew",
53+
"kurt",
5354
"std",
5455
"sum",
5556
"var",

pandas/core/groupby/generic.py

+180-2
Original file line numberDiff line numberDiff line change
@@ -1272,13 +1272,86 @@ def skew(
12721272
Name: Max Speed, dtype: float64
12731273
"""
12741274

1275+
return self._cython_agg_general(
1276+
"skew", alt=None, skipna=skipna, numeric_only=numeric_only, **kwargs
1277+
)
1278+
1279+
def kurt(
1280+
self,
1281+
skipna: bool = True,
1282+
numeric_only: bool = False,
1283+
**kwargs,
1284+
) -> Series:
1285+
"""
1286+
Return unbiased kurtosis within groups.
1287+
1288+
Parameters
1289+
----------
1290+
skipna : bool, default True
1291+
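Exclude NA/null values when computing the result. If False, the result for a group that contains an NA value is missing (NaN in the last example below).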
1292+
1293+
numeric_only : bool, default False
1294+
Include only float, int, boolean columns. Not implemented for Series.
1295+
1296+
**kwargs
1297+
Additional keyword arguments to be passed to the function.
1298+
1299+
Returns
1300+
-------
1301+
Series
1302+
Unbiased kurtosis within groups.
1303+
1304+
See Also
1305+
--------
1306+
Series.kurt : Return unbiased kurtosis over requested axis.
1307+
1308+
Examples
1309+
--------
1310+
>>> ser = pd.Series(
1311+
... [390.0, 350.0, 357.0, 333.0, np.nan, 22.0, 20.0, 30.0, 40.0, 41.0],
1312+
... index=[
1313+
... "Falcon",
1314+
... "Falcon",
1315+
... "Falcon",
1316+
... "Falcon",
1317+
... "Falcon",
1318+
... "Parrot",
1319+
... "Parrot",
1320+
... "Parrot",
1321+
... "Parrot",
1322+
... "Parrot",
1323+
... ],
1324+
... name="Max Speed",
1325+
... )
1326+
>>> ser
1327+
Falcon 390.0
1328+
Falcon 350.0
1329+
Falcon 357.0
1330+
Falcon 333.0
1331+
Falcon NaN
1332+
Parrot 22.0
1333+
Parrot 20.0
1334+
Parrot 30.0
1335+
Parrot 40.0
1336+
Parrot 41.0
1337+
Name: Max Speed, dtype: float64
1338+
>>> ser.groupby(level=0).kurt()
1339+
Falcon 1.622109
1340+
Parrot -2.878714
1341+
Name: Max Speed, dtype: float64
1342+
>>> ser.groupby(level=0).kurt(skipna=False)
1343+
Falcon NaN
1344+
Parrot -2.878714
1345+
Name: Max Speed, dtype: float64
1346+
"""
1347+
12751348
def alt(obj):
12761349
# This should not be reached since the cython path should raise
12771350
# TypeError and not NotImplementedError.
1278-
raise TypeError(f"'skew' is not supported for dtype={obj.dtype}")
1351+
raise TypeError(f"'kurt' is not supported for dtype={obj.dtype}")
12791352

12801353
return self._cython_agg_general(
1281-
"skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs
1354+
"kurt", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs
12821355
)
12831356

12841357
@property
@@ -2921,6 +2994,111 @@ def alt(obj):
29212994
"skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs
29222995
)
29232996

2997+
def kurt(
2998+
self,
2999+
skipna: bool = True,
3000+
numeric_only: bool = False,
3001+
**kwargs,
3002+
) -> DataFrame:
3003+
"""
3004+
Return unbiased kurtosis within groups.
3005+
3006+
Parameters
3007+
----------
3008+
skipna : bool, default True
3009+
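Exclude NA/null values when computing the result. If False, the result for a group that contains an NA value is missing (NaN in the last example below).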
3010+
3011+
numeric_only : bool, default False
3012+
Include only float, int, boolean columns.
3013+
3014+
**kwargs
3015+
Additional keyword arguments to be passed to the function.
3016+
3017+
Returns
3018+
-------
3019+
DataFrame
3020+
Unbiased kurtosis within groups.
3021+
3022+
See Also
3023+
--------
3024+
DataFrame.kurt : Return unbiased kurtosis over requested axis.
3025+
3026+
Examples
3027+
--------
3028+
>>> arrays = [
3029+
... [
3030+
... "falcon",
3031+
... "parrot",
3032+
... "cockatoo",
3033+
... "kiwi",
3034+
... "eagle",
3035+
... "lion",
3036+
... "monkey",
3037+
... "rabbit",
3038+
... "dog",
3039+
... "wolf",
3040+
... ],
3041+
... [
3042+
... "bird",
3043+
... "bird",
3044+
... "bird",
3045+
... "bird",
3046+
... "bird",
3047+
... "mammal",
3048+
... "mammal",
3049+
... "mammal",
3050+
... "mammal",
3051+
... "mammal",
3052+
... ],
3053+
... ]
3054+
>>> index = pd.MultiIndex.from_arrays(arrays, names=("name", "class"))
3055+
>>> df = pd.DataFrame(
3056+
... {
3057+
... "max_speed": [
3058+
... 389.0,
3059+
... 24.0,
3060+
... 70.0,
3061+
... np.nan,
3062+
... 350.0,
3063+
... 80.5,
3064+
... 21.5,
3065+
... 15.0,
3066+
... 40.0,
3067+
... 50.0,
3068+
... ]
3069+
... },
3070+
... index=index,
3071+
... )
3072+
>>> df
3073+
max_speed
3074+
name class
3075+
falcon bird 389.0
3076+
parrot bird 24.0
3077+
cockatoo bird 70.0
3078+
kiwi bird NaN
3079+
eagle bird 350.0
3080+
lion mammal 80.5
3081+
monkey mammal 21.5
3082+
rabbit mammal 15.0
3083+
dog mammal 40.0
3084+
wolf mammal 50.0
3085+
>>> gb = df.groupby(["class"])
3086+
>>> gb.kurt()
3087+
max_speed
3088+
class
3089+
bird -5.493277
3090+
mammal 0.204125
3091+
>>> gb.kurt(skipna=False)
3092+
max_speed
3093+
class
3094+
bird NaN
3095+
mammal 0.204125
3096+
"""
3097+
3098+
return self._cython_agg_general(
3099+
"kurt", alt=None, skipna=skipna, numeric_only=numeric_only, **kwargs
3100+
)
3101+
29243102
@property
29253103
@doc(DataFrame.plot.__doc__)
29263104
def plot(self) -> GroupByPlot:

0 commit comments

Comments
 (0)