pyjanitor-devs · samukweku · Mar 2, 2025 · Mar 1, 2025 · Mar 1, 2025 · Mar 1, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,7 @@
 
 -  [ENH] Added support for pd.Series.select - Issue #1394 @samukweku
 -  [ENH] Added suport for janitor.mutate - Issue #1226 @samukweku
+-  [ENH] Added support for janitor.summarise - Issue #1225 @samukweku
 
 ## [v0.30.0] - 2024-12-04
 

diff --git a/janitor/functions/__init__.py b/janitor/functions/__init__.py
@@ -82,6 +82,7 @@
 from .shuffle import shuffle
 from .sort_column_value_order import sort_column_value_order
 from .sort_naturally import sort_naturally
+from .summarise import summarise
 from .take_first import take_first
 from .then import then
 from .to_datetime import to_datetime
@@ -160,6 +161,7 @@
     "shuffle",
     "sort_column_value_order",
     "sort_naturally",
+    "summarise",
     "take_first",
     "then",
     "to_datetime",

diff --git a/janitor/functions/summarise.py b/janitor/functions/summarise.py
@@ -0,0 +1,265 @@
+"""Implementation of summarise."""
+
+from __future__ import annotations
+
+from functools import singledispatch
+from typing import Any
+
+import pandas as pd
+import pandas_flavor as pf
+from pandas.api.types import is_scalar
+from pandas.core.groupby.generic import DataFrameGroupBy
+
+from janitor.functions.select import get_index_labels
+
+
+@pf.register_dataframe_method
+def summarise(
+    df: pd.DataFrame,
+    *args: tuple[dict | tuple],
+    by: Any = None,
+) -> pd.DataFrame | pd.Series:
+    """
+
+    !!! info "New in version 0.31.0"
+
+    !!!note
+
+        Before reaching for `summarise`, try `pd.DataFrame.agg`.
+
+    summarise creates a new dataframe;
+    it returns one row for each combination of grouping columns.
+    If there are no grouping variables,
+    the output will have a single row
+    summarising all observations in the input.
+
+    The argument provided to *args* should be either a dictionary or a tuple.
+
+    - **dictionary argument**:
+    If the argument is a dictionary,
+    the value in the `{key:value}` pairing
+    should be either a string, a callable or a tuple.
+
+        - If the value in the dictionary
+        is a string or a callable,
+        the key of the dictionary
+        should be an existing column name.
+
+        !!!note
+
+            - If the value is a string,
+            the string should be a pandas string function,
+            e.g "sum", "mean", etc.
+
+        - If the value of the dictionary is a tuple,
+        it should be of length 2, and of the form
+        `(column_name, mutation_func)`,
+        where `column_name` should exist in the DataFrame,
+        and `mutation_func` should be either a string or a callable.
+
+        !!!note
+
+            - If `mutation_func` is a string,
+            the string should be a pandas string function,
+            e.g "sum", "mean", etc.
+
+        The key in the dictionary can be a new column name.
+
+    - **tuple argument**:
+    If the argument is a tuple, it should be of length 2,
+    and of the form
+    `(column_name, mutation_func)`,
+    where column_name should exist in the DataFrame,
+    and `mutation_func` should be either a string or a callable.
+
+        !!!note
+
+            - if `mutation_func` is a string,
+            the string should be a pandas string function,
+            e.g "sum", "mean", etc.
+
+        !!!note
+
+            - `column_name` can be anything supported by the
+            [`select`][janitor.functions.select.select] syntax;
+            as such multiple columns can be processed here -
+            they will be processed individually.
+
+    - **callable argument**:
+    If the argument is a callable, the callable is applied
+    on the DataFrame or GroupBy object.
+    The result from the callable should be a pandas Series
+    or DataFrame.
+
+
+    Aggregated columns cannot be reused in `summarise`.
+
+
+    `by` can be a `DataFrameGroupBy` object; it is assumed that
+    `by` was created from `df` - the onus is on the user to
+    ensure that, or the aggregations may yield incorrect results.
+
+    `by` accepts anything supported by `pd.DataFrame.groupby`.
+
+    Arguments supported in `pd.DataFrame.groupby`
+    can also be passed to `by` via a dictionary.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import janitor
+        >>> data = {'avg_jump': [3, 4, 1, 2, 3, 4],
+        ...         'avg_run': [3, 4, 1, 3, 2, 4],
+        ...         'combine_id': [100200, 100200,
+        ...                        101200, 101200,
+        ...                        102201, 103202]}
+        >>> df = pd.DataFrame(data)
+        >>> df
+           avg_jump  avg_run  combine_id
+        0         3        3      100200
+        1         4        4      100200
+        2         1        1      101200
+        3         2        3      101200
+        4         3        2      102201
+        5         4        4      103202
+
+        Aggregation via a callable:
+        >>> df.summarise(lambda df: df.sum(),by='combine_id')
+                    avg_jump  avg_run
+        combine_id
+        100200             7        7
+        101200             3        4
+        102201             3        2
+        103202             4        4
+
+        Aggregation via a tuple:
+        >>> df.summarise(("avg_run","mean"), by='combine_id')
+                    avg_run
+        combine_id
+        100200          3.5
+        101200          2.0
+        102201          2.0
+        103202          4.0
+
+        Aggregation via a dictionary:
+        >>> df.summarise({"avg_run":"mean"}, by='combine_id')
+                    avg_run
+        combine_id
+        100200          3.5
+        101200          2.0
+        102201          2.0
+        103202          4.0
+        >>> df.summarise({"avg_run_2":("avg_run","mean")}, by='combine_id')
+                    avg_run_2
+        combine_id
+        100200            3.5
+        101200            2.0
+        102201            2.0
+        103202            4.0
+
+    Args:
+        df: A pandas DataFrame.
+        args: Either a dictionary or a tuple.
+        by: Column(s) to group by.
+
+    Raises:
+        ValueError: If a tuple is passed and the length is not 2.
+
+    Returns:
+        A pandas DataFrame or Series with aggregated columns.
+
+    """  # noqa: E501
+
+    if by is not None:
+        # it is assumed that by is created from df
+        # onus is on user to ensure that
+        if isinstance(by, DataFrameGroupBy):
+            pass
+        elif isinstance(by, dict):
+            by = df.groupby(**by)
+        else:
+            if is_scalar(by):
+                by = [by]
+            by = df.groupby(by, sort=False, observed=True)
+    dictionary = {}
+    for arg in args:
+        aggregate = _mutator(arg, df=df, by=by)
+        dictionary.update(aggregate)
+    values = map(is_scalar, dictionary.values())
+    if all(values):
+        return pd.Series(dictionary)
+    return pd.concat(dictionary, axis=1, sort=False, copy=False)
+
+
+@singledispatch
+def _mutator(arg, df, by):
+    if not callable(arg):
+        raise NotImplementedError(
+            f"janitor.summarise is not supported for {type(arg)}"
+        )
+    if by is None:
+        val = df
+    else:
+        val = by
+    outcome = _process_maybe_callable(func=arg, obj=val)
+    if isinstance(outcome, pd.Series):
+        if not outcome.name:
+            raise ValueError("Ensure the pandas Series object has a name")
+        return {outcome.name: outcome}
+    # assumption: should return a DataFrame
+    outcome = {key: outcome[key] for key in outcome}
+    return outcome
+
+
+@_mutator.register(dict)
+def _(arg, df, by):
+    """Dispatch function for dictionary"""
+    if by is None:
+        val = df
+    else:
+        val = by
+
+    dictionary = {}
+    for column_name, mutator in arg.items():
+        if isinstance(mutator, tuple):
+            column, func = mutator
+            column = _process_within_dict(mutator=func, obj=val[column])
+        else:
+            column = _process_within_dict(
+                mutator=mutator, obj=val[column_name]
+            )
+        dictionary[column_name] = column
+    return dictionary
+
+
+@_mutator.register(tuple)
+def _(arg, df, by):
+    """Dispatch function for tuple"""
+    if len(arg) != 2:
+        raise ValueError("the tuple has to be a length of 2")
+    column_names, mutator = arg
+    column_names = get_index_labels(arg=[column_names], df=df, axis="columns")
+    mapping = {column_name: mutator for column_name in column_names}
+    return _mutator(mapping, df=df, by=by)
+
+
+def _process_maybe_callable(func: callable, obj):
+    """Function to handle callables"""
+    try:
+        column = obj.agg(func)
+    except:  # noqa: E722
+        column = func(obj)
+    return column
+
+
+def _process_maybe_string(func: str, obj):
+    """Function to handle pandas string functions"""
+    # treat as a pandas approved string function
+    # https://pandas.pydata.org/docs/user_guide/groupby.html#built-in-aggregation-methods
+    return obj.agg(func)
+
+
+def _process_within_dict(mutator, obj):
+    """Handle str/callables within a dictionary"""
+    if isinstance(mutator, str):
+        return _process_maybe_string(func=mutator, obj=obj)
+    return _process_maybe_callable(func=mutator, obj=obj)
diff --git a/mkdocs/api/functions.md b/mkdocs/api/functions.md
@@ -56,6 +56,7 @@
         - shuffle
         - sort_column_value_order
         - sort_naturally
+        - summarise
         - take_first
         - then
         - to_datetime