Skip to content

Commit e876285

Browse files
samukwekusamuel.oranyeli
and
samuel.oranyeli
authored
[ENH] summarize function (#1450)
* add summarise function * add examples to docs * changelog * use concat where necessary * add info in docs * add info in docs * clean up docs * clean up docs * clean up docs * add test for grouped object * handle coverage * update docs --------- Co-authored-by: samuel.oranyeli <[email protected]>
1 parent e1b64c1 commit e876285

File tree

5 files changed

+435
-0
lines changed

5 files changed

+435
-0
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
- [ENH] Added support for pd.Series.select - Issue #1394 @samukweku
66
- [ENH] Added support for janitor.mutate - Issue #1226 @samukweku
7+
- [ENH] Added support for janitor.summarise - Issue #1225 @samukweku
78

89
## [v0.30.0] - 2024-12-04
910

janitor/functions/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@
8282
from .shuffle import shuffle
8383
from .sort_column_value_order import sort_column_value_order
8484
from .sort_naturally import sort_naturally
85+
from .summarise import summarise
8586
from .take_first import take_first
8687
from .then import then
8788
from .to_datetime import to_datetime
@@ -160,6 +161,7 @@
160161
"shuffle",
161162
"sort_column_value_order",
162163
"sort_naturally",
164+
"summarise",
163165
"take_first",
164166
"then",
165167
"to_datetime",

janitor/functions/summarise.py

+265
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,265 @@
1+
"""Implementation of summarise."""
2+
3+
from __future__ import annotations

from collections.abc import Callable
from functools import singledispatch
from typing import Any

import pandas as pd
import pandas_flavor as pf
from pandas.api.types import is_scalar
from pandas.core.groupby.generic import DataFrameGroupBy

from janitor.functions.select import get_index_labels
14+
15+
16+
@pf.register_dataframe_method
def summarise(
    df: pd.DataFrame,
    *args: dict | tuple | Callable,
    by: Any = None,
) -> pd.DataFrame | pd.Series:
    """Summarise a DataFrame with one or more aggregations.

    !!! info "New in version 0.31.0"

    !!!note

        Before reaching for `summarise`, try `pd.DataFrame.agg`.

    summarise creates a new dataframe;
    it returns one row for each combination of grouping columns.
    If there are no grouping variables,
    the output will have a single row
    summarising all observations in the input.

    Each argument provided to *args* should be a dictionary,
    a tuple or a callable.

    - **dictionary argument**:
    If the argument is a dictionary,
    the value in the `{key:value}` pairing
    should be either a string, a callable or a tuple.

        - If the value in the dictionary
        is a string or a callable,
        the key of the dictionary
        should be an existing column name.

        !!!note

            - If the value is a string,
            the string should be a pandas string function,
            e.g "sum", "mean", etc.

        - If the value of the dictionary is a tuple,
        it should be of length 2, and of the form
        `(column_name, agg_func)`,
        where `column_name` should exist in the DataFrame,
        and `agg_func` should be either a string or a callable.

        !!!note

            - If `agg_func` is a string,
            the string should be a pandas string function,
            e.g "sum", "mean", etc.

        The key in the dictionary can be a new column name.

    - **tuple argument**:
    If the argument is a tuple, it should be of length 2,
    and of the form
    `(column_name, agg_func)`,
    where `column_name` should exist in the DataFrame,
    and `agg_func` should be either a string or a callable.

        !!!note

            - If `agg_func` is a string,
            the string should be a pandas string function,
            e.g "sum", "mean", etc.

        !!!note

            - `column_name` can be anything supported by the
            [`select`][janitor.functions.select.select] syntax;
            as such multiple columns can be processed here -
            they will be processed individually.

    - **callable argument**:
    If the argument is a callable, the callable is applied
    on the DataFrame or GroupBy object.
    The result from the callable should be a pandas Series
    or DataFrame.

    Aggregated columns cannot be reused in `summarise`.

    `by` can be a `DataFrameGroupBy` object; it is assumed that
    `by` was created from `df` - the onus is on the user to
    ensure that, or the aggregations may yield incorrect results.

    `by` accepts anything supported by `pd.DataFrame.groupby`.
    When `by` is neither a dictionary nor a `DataFrameGroupBy`,
    the grouping is done with `sort=False` and `observed=True`.

    Arguments supported in `pd.DataFrame.groupby`
    can also be passed to `by` via a dictionary.

    Examples:
        >>> import pandas as pd
        >>> import janitor
        >>> data = {'avg_jump': [3, 4, 1, 2, 3, 4],
        ...         'avg_run': [3, 4, 1, 3, 2, 4],
        ...         'combine_id': [100200, 100200,
        ...                        101200, 101200,
        ...                        102201, 103202]}
        >>> df = pd.DataFrame(data)
        >>> df
           avg_jump  avg_run  combine_id
        0         3        3      100200
        1         4        4      100200
        2         1        1      101200
        3         2        3      101200
        4         3        2      102201
        5         4        4      103202

        Aggregation via a callable:
        >>> df.summarise(lambda df: df.sum(),by='combine_id')
                    avg_jump  avg_run
        combine_id
        100200             7        7
        101200             3        4
        102201             3        2
        103202             4        4

        Aggregation via a tuple:
        >>> df.summarise(("avg_run","mean"), by='combine_id')
                    avg_run
        combine_id
        100200          3.5
        101200          2.0
        102201          2.0
        103202          4.0

        Aggregation via a dictionary:
        >>> df.summarise({"avg_run":"mean"}, by='combine_id')
                    avg_run
        combine_id
        100200          3.5
        101200          2.0
        102201          2.0
        103202          4.0
        >>> df.summarise({"avg_run_2":("avg_run","mean")}, by='combine_id')
                    avg_run_2
        combine_id
        100200            3.5
        101200            2.0
        102201            2.0
        103202            4.0

    Args:
        df: A pandas DataFrame.
        args: One or more dictionaries, tuples or callables
            describing the aggregations to compute.
        by: Column(s) to group by. Can also be a `DataFrameGroupBy`
            object, or a dictionary of keyword arguments
            for `pd.DataFrame.groupby`.

    Raises:
        ValueError: If a tuple is passed and the length is not 2,
            or if a callable returns a pandas Series without a name.
        NotImplementedError: If an argument is not a dictionary,
            a tuple or a callable.

    Returns:
        A pandas DataFrame or Series with aggregated columns.
            A Series is returned when every aggregate is a scalar.
    """  # noqa: E501

    # A DataFrameGroupBy is used as-is: it is assumed to have been
    # created from df, and the onus is on the user to ensure that.
    if by is not None and not isinstance(by, DataFrameGroupBy):
        if isinstance(by, dict):
            by = df.groupby(**by)
        else:
            keys = [by] if is_scalar(by) else by
            by = df.groupby(keys, sort=False, observed=True)

    aggregates = {}
    for arg in args:
        aggregates.update(_mutator(arg, df=df, by=by))

    # Only scalars means there were no groups to align on;
    # a labelled Series is enough to hold the result.
    if all(map(is_scalar, aggregates.values())):
        return pd.Series(aggregates)
    return pd.concat(aggregates, axis=1, sort=False, copy=False)
191+
192+
193+
@singledispatch
194+
def _mutator(arg, df, by):
195+
if not callable(arg):
196+
raise NotImplementedError(
197+
f"janitor.summarise is not supported for {type(arg)}"
198+
)
199+
if by is None:
200+
val = df
201+
else:
202+
val = by
203+
outcome = _process_maybe_callable(func=arg, obj=val)
204+
if isinstance(outcome, pd.Series):
205+
if not outcome.name:
206+
raise ValueError("Ensure the pandas Series object has a name")
207+
return {outcome.name: outcome}
208+
# assumption: should return a DataFrame
209+
outcome = {key: outcome[key] for key in outcome}
210+
return outcome
211+
212+
213+
@_mutator.register(dict)
def _(arg, df, by):
    """Dispatch function for dictionary

    Each key is the label of an output column. The value is either
    an aggregation (a string or a callable) applied to the column
    named by the key, or a `(column, aggregation)` tuple naming
    the source column explicitly.

    Raises:
        ValueError: If a tuple value is not of length 2.
    """
    target = df if by is None else by

    aggregates = {}
    for column_name, mutator in arg.items():
        if isinstance(mutator, tuple):
            # Same check and message as the tuple dispatcher, instead of
            # an opaque unpacking error for a tuple of the wrong length.
            if len(mutator) != 2:
                raise ValueError("the tuple has to be a length of 2")
            source, func = mutator
        else:
            source, func = column_name, mutator
        aggregates[column_name] = _process_within_dict(
            mutator=func, obj=target[source]
        )
    return aggregates
232+
233+
234+
@_mutator.register(tuple)
def _(arg, df, by):
    """Dispatch function for tuple

    Expands a `(column selection, aggregation)` pair into a dictionary
    with one entry per selected column label, and hands that dictionary
    back to `_mutator`.
    """
    if len(arg) != 2:
        raise ValueError("the tuple has to be a length of 2")
    selection, func = arg
    labels = get_index_labels(arg=[selection], df=df, axis="columns")
    # every selected label gets the same aggregation
    return _mutator(dict.fromkeys(labels, func), df=df, by=by)
243+
244+
245+
def _process_maybe_callable(func: callable, obj):
246+
"""Function to handle callables"""
247+
try:
248+
column = obj.agg(func)
249+
except: # noqa: E722
250+
column = func(obj)
251+
return column
252+
253+
254+
def _process_maybe_string(func: str, obj):
255+
"""Function to handle pandas string functions"""
256+
# treat as a pandas approved string function
257+
# https://pandas.pydata.org/docs/user_guide/groupby.html#built-in-aggregation-methods
258+
return obj.agg(func)
259+
260+
261+
def _process_within_dict(mutator, obj):
    """Route a dictionary value to the string or the callable handler"""
    # strings go to the string handler; everything else
    # is treated as a callable
    if isinstance(mutator, str):
        handler = _process_maybe_string
    else:
        handler = _process_maybe_callable
    return handler(func=mutator, obj=obj)

mkdocs/api/functions.md

+1
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
- shuffle
5757
- sort_column_value_order
5858
- sort_naturally
59+
- summarise
5960
- take_first
6061
- then
6162
- to_datetime

0 commit comments

Comments
 (0)