pyjanitor-devs · Sarra99 · Jan 24, 2025 · Jan 24, 2025 · Jan 24, 2025 · Jan 24, 2025
diff --git a/AUTHORS.md b/AUTHORS.md
@@ -114,3 +114,4 @@ Contributors
 - [@joranbeasley](https://github.com/joranbeasley) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%joranbeasley)
 -[@kianmeng](https://github.com/kianmeng) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/pull/1290#issue-1906020324)
 - [@lbeltrame](https://github.com/lbeltrame) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/pull/1401)
+- [@Sarra99](https://github.com/Sarra99) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%3ASarra99)
diff --git a/janitor/functions/adorn.py b/janitor/functions/adorn.py
@@ -0,0 +1,251 @@
+from typing import Optional
+
+import pandas as pd
+
+
+def tabyl(
+    df: pd.DataFrame,
+    col1: str,
+    col2: Optional[str] = None,
+    col3: Optional[str] = None,
+    show_counts: bool = True,
+    show_percentages: bool = False,
+    percentage_axis: Optional[str] = None,  # 'row', 'col', or 'all'
+) -> pd.DataFrame:
+    """
+    Create a frequency table similar to R's `tabyl`.
+
+    Rows of `df` are counted per combination of the given columns. With only
+    `col1` the result has one `count` column; with one extra column the
+    counts are spread into one column per value of that column; with two
+    extra columns the counts are spread over two-level columns.
+
+    Args:
+        df: Input DataFrame.
+        col1: Name of the column whose values become the rows (required).
+        col2: Name of a column whose values become the columns (optional).
+        col3: Name of a further column to spread over the columns
+            (optional). If given without `col2` it is used on its own.
+        show_counts: When percentages are shown, whether to keep the raw
+            count in front of each percentage. Has no effect when
+            `show_percentages` is False: counts are returned regardless.
+        show_percentages: Whether to show percentages in the table.
+        percentage_axis: Denominator for percentages: 'row', 'col', or
+            'all'. Required (and only used) if `show_percentages` is True.
+
+    Returns:
+        A DataFrame with `col1` as its first column followed by the count
+        columns. Cells are integers by default and strings such as
+        `'3.0 (75.00%)'` or `'75.00%'` when percentages are requested.
+
+    Raises:
+        ValueError: If a named column is not in `df`, or if
+            `show_percentages` is True and `percentage_axis` is not one of
+            'row', 'col', or 'all'.
+
+    Example:
+        >>> import pandas as pd
+        >>> data = {
+        ...     "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"],
+        ...     "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"],
+        ... }
+        >>> df = pd.DataFrame(data)
+
+        >>> result = tabyl(df, "Category", "Subcategory")
+        >>> result["Category"].tolist()
+        ['A', 'B', 'C']
+        >>> result["X"].tolist()
+        [3, 1, 2]
+
+        >>> result = tabyl(df, "Category", "Subcategory",
+        ...      show_percentages=True, percentage_axis="row")
+        >>> result["X"].tolist()
+        ['3.0 (75.00%)', '1.0 (33.33%)', '2.0 (66.67%)']
+
+        >>> result = tabyl(df, "Category", "Subcategory",
+        ...      show_percentages=True, percentage_axis="col")
+        >>> result["Y"].tolist()
+        ['1.0 (25.00%)', '2.0 (50.00%)', '1.0 (25.00%)']
+
+    """
+
+    # Test against None rather than truthiness so that a falsy column label
+    # (for example the integer 0) is not silently ignored.
+    spread_cols = [col for col in (col2, col3) if col is not None]
+    group_cols = [col1, *spread_cols]
+
+    for col in group_cols:
+        if col not in df.columns:
+            raise ValueError(f"Column '{col}' is not in the DataFrame.")
+
+    # Fail fast on a bad axis instead of after the grouping work is done.
+    if show_percentages and percentage_axis not in ("row", "col", "all"):
+        raise ValueError(
+            "`percentage_axis` must be one of 'row', 'col', or 'all'."
+        )
+
+    # Step 1: one row per observed combination, with its frequency.
+    grouped = df.groupby(group_cols).size().reset_index(name="count")
+
+    # Step 2: spread the extra grouping column(s) over the column axis.
+    if spread_cols:
+        pivot = grouped.pivot_table(
+            index=col1,
+            # A list yields multi-level columns; a scalar keeps them flat.
+            columns=spread_cols if len(spread_cols) > 1 else spread_cols[0],
+            values="count",
+            aggfunc="sum",
+            fill_value=0,
+        )
+    else:
+        pivot = grouped.set_index(col1)[["count"]]
+
+    if show_percentages:
+        # Counts are rendered from floats (e.g. "3.0") in the combined cells.
+        pivot = pivot.astype(float)
+
+        if percentage_axis == "row":
+            shares = pivot.div(pivot.sum(axis=1), axis=0)
+        elif percentage_axis == "col":
+            shares = pivot.div(pivot.sum(axis=0), axis=1)
+        else:  # "all"
+            shares = pivot / pivot.to_numpy().sum()
+
+        # Column-wise Series.map instead of the deprecated
+        # DataFrame.applymap; works on old and new pandas alike.
+        formatted = shares.apply(
+            lambda column: column.map(lambda x: f"{x:.2%}")
+        )
+
+        if show_counts:
+            pivot = pivot.astype(str) + " (" + formatted + ")"
+        else:
+            pivot = formatted
+
+    return pivot.reset_index()
+
+
+def adorn_totals(
+    df: pd.DataFrame, col1: str, col2: str, axis: int = 0
+) -> pd.DataFrame:
+    """
+    Add a 'Total' row or column to a crosstab generated by `tabyl`.
+
+    The crosstab is built with `tabyl(df, col1, col2)`; totals are computed
+    over its numeric count columns only. The `col1` label column is never
+    summed, even when its values are numeric.
+
+    Args:
+        df: DataFrame used to generate the crosstab.
+        col1: Column whose values become the rows of the crosstab.
+        col2: Column whose values become the columns of the crosstab.
+        axis: 0 to append a row labelled 'Total' (its `col1` cell is NaN),
+            1 to add a column named 'Total'.
+
+    Returns:
+        The crosstab with the total row/column added. An empty crosstab is
+        returned unchanged.
+
+    Raises:
+        ValueError: If `axis` is neither 0 nor 1 (only checked when the
+            crosstab is non-empty).
+
+    Example:
+        >>> import pandas as pd
+        >>> data = {
+        ...     "Category": ["A", "B", "A", "B", "A", "B", "A", "B"],
+        ...     "Subcategory": ["X", "X", "Y", "Y", "X", "X", "Y", "Y"],
+        ...     "Value": [1, 2, 3, 4, 5, 6, 7, 8],
+        ... }
+        >>> df = pd.DataFrame(data)
+
+        >>> result = adorn_totals(df, "Category", "Subcategory", axis=0)
+        >>> result.index.tolist()
+        [0, 1, 'Total']
+        >>> result.loc["Total", ["X", "Y"]].tolist()
+        [4, 4]
+
+        >>> result = adorn_totals(df, "Category", "Subcategory", axis=1)
+        >>> result["Total"].tolist()
+        [4, 4]
+
+    """
+
+    # Generate the crosstab using tabyl with the two specified columns
+    pivot = tabyl(df, col1, col2)
+
+    if pivot.empty:  # Nothing to total
+        return pivot
+
+    if axis not in (0, 1):
+        raise ValueError(
+            "The 'axis' argument must be 0 (to add a row) or 1 (to add a column)"
+        )
+
+    # Drop the label column before picking numeric columns: with a numeric
+    # `col1` it would otherwise be summed into the totals as if it were a
+    # count.
+    counts = pivot.drop(columns=[col1]).select_dtypes(include="number")
+
+    if axis == 0:
+        total_row = counts.sum(axis=0)
+        total_row.name = "Total"  # Becomes the index label of the new row
+        return pd.concat([pivot, total_row.to_frame().T])
+
+    pivot["Total"] = counts.sum(axis=1)
+    return pivot
+
+
+def adorn_percentages(
+    df: pd.DataFrame,
+    col1: str,
+    col2: str,
+    axis: str = "row",
+    fmt: bool = True,
+    include_ns: bool = False,
+    decimal_places: int = 1,
+    values: Optional[str] = "Value",
+) -> pd.DataFrame:
+    """
+    Build a `col1` x `col2` crosstab and express its cells as percentages.
+
+    Each cell of the underlying crosstab is the sum of `values` for that
+    combination of `col1` and `col2` (or the number of rows when `values`
+    is None). Cells are then divided by their row total, column total or
+    the grand total.
+
+    Args:
+        df: DataFrame used to generate the crosstab.
+        col1: Column whose values become the rows of the crosstab.
+        col2: Column whose values become the columns of the crosstab.
+        axis: 'row' for row percentages, 'col' for column percentages,
+            'all' for percentages of the grand total.
+        fmt: If True, cells are strings such as "12.5%". If False, cells
+            are proportions (floats) rounded to `decimal_places`.
+        include_ns: If True, append the underlying total in parentheses,
+            e.g. "12.5% (3)"; cells are then always strings.
+        decimal_places: Number of decimal places to keep.
+        values: Column summed to fill the crosstab. Pass None to count
+            rows instead.
+
+    Returns:
+        A DataFrame with `col1` as its first column followed by one
+        column per value of `col2`. An empty crosstab is returned
+        unchanged.
+
+    Raises:
+        KeyError: If `values` is not None and not a column of `df`.
+        ValueError: If `axis` is not 'row', 'col', or 'all' (only checked
+            when the crosstab is non-empty).
+
+    Example:
+        >>> import pandas as pd
+        >>> data = {
+        ...     "Category": ["A", "B", "A", "B", "A", "B", "A", "B"],
+        ...     "Subcategory": ["X", "X", "Y", "Y", "X", "X", "Y", "Y"],
+        ...     "Value": [1, 2, 3, 4, 5, 6, 7, 8],
+        ... }
+        >>> df = pd.DataFrame(data)
+        >>> result = adorn_percentages(df, "Category", "Subcategory")
+        >>> result["Category"].tolist()
+        ['A', 'B']
+        >>> result["X"].tolist()
+        ['37.5%', '40.0%']
+
+    """
+    if values is None:
+        pivot = pd.crosstab(df[col1], df[col2])
+    else:
+        if values not in df.columns:
+            raise KeyError(f"Column '{values}' is not in the DataFrame.")
+        pivot = pd.pivot_table(
+            df,
+            values=values,
+            index=col1,
+            columns=col2,
+            aggfunc="sum",
+            fill_value=0,
+        )
+
+    if pivot.empty:  # If the crosstab is empty, return it as-is
+        return pivot
+
+    # Only numeric cells can be turned into shares
+    totals = pivot.select_dtypes(include="number")
+
+    if axis == "row":
+        shares = totals.div(totals.sum(axis=1), axis=0)
+    elif axis == "col":
+        shares = totals.div(totals.sum(axis=0), axis=1)
+    elif axis == "all":
+        shares = totals / totals.sum().sum()
+    else:
+        raise ValueError("The 'axis' argument must be 'row', 'col', or 'all'.")
+
+    def _render(share):
+        """Format one share as text; missing values pass through as-is."""
+        if pd.isnull(share):
+            return share
+        if fmt:
+            return f"{share * 100:.{decimal_places}f}%"
+        return f"{share:.{decimal_places}f}"
+
+    if fmt or include_ns:
+        # Column-wise Series.map instead of the deprecated
+        # DataFrame.applymap; works on old and new pandas alike.
+        result = shares.apply(lambda column: column.map(_render))
+        if include_ns:
+            result = result.astype(str) + " (" + totals.astype(str) + ")"
+    else:
+        # Unformatted output stays numeric, as documented.
+        result = shares.round(decimal_places)
+
+    # The categories live in the index; expose them as the first column.
+    return result.reset_index()