[ENH] pivot_wider_spec pandas (#1427)

samukweku · samuel.oranyeli · web-flow · commit d7fa35b2bd3a · 2024-12-17T09:41:32.000+11:00
* add pivot_wider_spec for pandas

* add examples

* cleanup

* fix example

* fix example

* fix failing test

* fix example

---------

Co-authored-by: samuel.oranyeli &lt;samuel.oranyeli@grow.inc&gt;
diff --git a/janitor/functions/__init__.py b/janitor/functions/__init__.py
@@ -57,7 +57,12 @@
 from .limit_column_characters import limit_column_characters
 from .min_max_scale import min_max_scale
 from .move import move
-from .pivot import pivot_longer, pivot_longer_spec, pivot_wider
+from .pivot import (
+    pivot_longer,
+    pivot_longer_spec,
+    pivot_wider,
+    pivot_wider_spec,
+)
 from .process_text import process_text
 from .remove_columns import remove_columns
 from .remove_empty import remove_empty
@@ -138,6 +143,7 @@
     "pivot_longer",
     "pivot_longer_spec",
     "pivot_wider",
+    "pivot_wider_spec",
     "process_text",
     "remove_columns",
     "remove_empty",
diff --git a/janitor/functions/pivot.py b/janitor/functions/pivot.py
@@ -327,10 +327,14 @@ def pivot_longer(
             Should be either a single column name, or a list/tuple of
             column names.
             `index` should be a list of tuples if the columns are a MultiIndex.
+            Column selection is possible using the
+            [`select`][janitor.functions.select.select] syntax.
         column_names: Name(s) of columns to unpivot. Should be either
             a single column name or a list/tuple of column names.
             `column_names` should be a list of tuples
             if the columns are a MultiIndex.
+            Column selection is possible using the
+            [`select`][janitor.functions.select.select] syntax.
         names_to: Name of new column as a string that will contain
             what were previously the column names in `column_names`.
             The default is `variable` if no value is provided. It can
@@ -420,10 +424,13 @@ def pivot_longer_spec(
 ) -> pd.DataFrame:
     """A declarative interface to pivot a DataFrame from wide to long form,
     where you describe how the data will be unpivoted,
-    using a DataFrame. This gives you, the user,
+    using a DataFrame.
+
+    This gives you, the user,
     more control over unpivoting, where you create a “spec”
     data frame that describes exactly how data stored
     in the column names becomes variables.
+
     It can come in handy for situations where
     [`pivot_longer`][janitor.functions.pivot.pivot_longer]
     seems inadequate for the transformation.
@@ -2380,3 +2387,151 @@ def _check_tuples_multiindex(indexer, args, param):
         )
 
     return args
+
+
+def pivot_wider_spec(
+    df: pd.DataFrame,
+    spec: pd.DataFrame,
+    index: list | tuple | str | Pattern = None,
+    reset_index: bool = True,
+) -> pd.DataFrame:
+    """A declarative interface to pivot a DataFrame from long to wide form,
+    where you describe how the data will be pivoted,
+    using a DataFrame.
+
+    This gives you, the user,
+    more control over pivoting, where you create a “spec”
+    data frame that describes exactly how data stored
+    in the column names becomes variables.
+
+    It can come in handy for situations where
+    `pd.DataFrame.pivot`
+    seems inadequate for the transformation.
+
+    !!! info "New in version 0.31.0"
+
+    Examples:
+        >>> import pandas as pd
+        >>> from janitor import pivot_wider_spec
+        >>> df = pd.DataFrame(
+        ... [
+        ...    {"famid": 1, "birth": 1, "age": 1, "ht": 2.8},
+        ...    {"famid": 1, "birth": 1, "age": 2, "ht": 3.4},
+        ...    {"famid": 1, "birth": 2, "age": 1, "ht": 2.9},
+        ...    {"famid": 1, "birth": 2, "age": 2, "ht": 3.8},
+        ...    {"famid": 1, "birth": 3, "age": 1, "ht": 2.2},
+        ...    {"famid": 1, "birth": 3, "age": 2, "ht": 2.9},
+        ...    {"famid": 2, "birth": 1, "age": 1, "ht": 2.0},
+        ...    {"famid": 2, "birth": 1, "age": 2, "ht": 3.2},
+        ...    {"famid": 2, "birth": 2, "age": 1, "ht": 1.8},
+        ...    {"famid": 2, "birth": 2, "age": 2, "ht": 2.8},
+        ...    {"famid": 2, "birth": 3, "age": 1, "ht": 1.9},
+        ...    {"famid": 2, "birth": 3, "age": 2, "ht": 2.4},
+        ...    {"famid": 3, "birth": 1, "age": 1, "ht": 2.2},
+        ...    {"famid": 3, "birth": 1, "age": 2, "ht": 3.3},
+        ...    {"famid": 3, "birth": 2, "age": 1, "ht": 2.3},
+        ...    {"famid": 3, "birth": 2, "age": 2, "ht": 3.4},
+        ...    {"famid": 3, "birth": 3, "age": 1, "ht": 2.1},
+        ...    {"famid": 3, "birth": 3, "age": 2, "ht": 2.9},
+        ... ]
+        ... )
+        >>> df
+            famid  birth  age   ht
+        0       1      1    1  2.8
+        1       1      1    2  3.4
+        2       1      2    1  2.9
+        3       1      2    2  3.8
+        4       1      3    1  2.2
+        5       1      3    2  2.9
+        6       2      1    1  2.0
+        7       2      1    2  3.2
+        8       2      2    1  1.8
+        9       2      2    2  2.8
+        10      2      3    1  1.9
+        11      2      3    2  2.4
+        12      3      1    1  2.2
+        13      3      1    2  3.3
+        14      3      2    1  2.3
+        15      3      2    2  3.4
+        16      3      3    1  2.1
+        17      3      3    2  2.9
+        >>> spec = {".name": ["ht1", "ht2"],
+        ...         ".value": ["ht", "ht"],
+        ...         "age": [1, 2]}
+        >>> spec = pd.DataFrame(spec)
+        >>> spec
+          .name .value  age
+        0   ht1     ht    1
+        1   ht2     ht    2
+        >>> pivot_wider_spec(df=df,spec=spec, index=['famid','birth'])
+           famid  birth  ht1  ht2
+        0      1      1  2.8  3.4
+        1      1      2  2.9  3.8
+        2      1      3  2.2  2.9
+        3      2      1  2.0  3.2
+        4      2      2  1.8  2.8
+        5      2      3  1.9  2.4
+        6      3      1  2.2  3.3
+        7      3      2  2.3  3.4
+        8      3      3  2.1  2.9
+
+    Args:
+        df: A pandas DataFrame.
+        spec: A specification DataFrame.
+            At a minimum, the spec DataFrame
+            must have a '.name' and a '.value' columns.
+            The '.name' column  should contain the
+            the names of the columns in the output DataFrame.
+            The '.value' column should contain the name of the column(s)
+            in the source DataFrame that will be serve as the values.
+            Additional columns in spec will serves as the columns
+            to be flipped to wide form.
+            Note that these additional columns should already exist
+            in the source DataFrame.
+        index: Name(s) of columns to use as identifier variables.
+            It should be either a single column name, or a list of column names.
+            If `index` is not provided, the DataFrame's index is used.
+            Column selection is possible using the
+            [`select`][janitor.functions.select.select] syntax.
+        reset_index: Determines whether to reset the `index`.
+            Applicable only if `index` is provided.
+
+    Returns:
+        A pandas DataFrame that has been unpivoted from long to wide form.
+    """  # noqa: E501
+    check("spec", spec, [pd.DataFrame])
+    check("reset_index", reset_index, [bool])
+    if not spec.columns.is_unique:
+        raise ValueError("Kindly ensure the spec's columns is unique.")
+    if ".name" not in spec.columns:
+        raise KeyError(
+            "Kindly ensure the spec DataFrame has a `.name` column."
+        )
+    if ".value" not in spec.columns:
+        raise KeyError(
+            "Kindly ensure the spec DataFrame has a `.value` column."
+        )
+    if spec.columns.tolist()[:2] != [".name", ".value"]:
+        raise ValueError(
+            "The first two columns of the spec DataFrame "
+            "should be '.name' and '.value', "
+            "with '.name' coming before '.value'."
+        )
+    if spec.columns.size == 2:
+        raise ValueError(
+            "Kindly provide the column(s) "
+            "to use to make new frame’s columns"
+        )
+    columns = spec.columns[2:]
+    values = spec[".value"].unique()
+    if index is not None:
+        index = _select_index([index], df, axis="columns")
+        index = df.columns[index].tolist()
+    df = df.pivot(index=index, columns=columns, values=values)
+    _index = spec.columns[1:].tolist()
+    spec = spec.set_index(_index).squeeze()
+    df = df.reindex(columns=spec.index)
+    df.columns = df.columns.map(spec)
+    if reset_index and index:
+        return df.reset_index()
+    return df
diff --git a/tests/functions/test_pivot_wider_spec.py b/tests/functions/test_pivot_wider_spec.py
@@ -0,0 +1,136 @@
+import re
+
+import pandas as pd
+import pytest
+from pandas.testing import assert_frame_equal
+
+from janitor import pivot_wider_spec
+
+
+@pytest.fixture
+def df_checks():
+    """Long-form frame with one `ht` value per (famid, birth, age) row."""
+    return pd.DataFrame(
+        [
+            {"famid": 1, "birth": 1, "age": 1, "ht": 2.8},
+            {"famid": 1, "birth": 1, "age": 2, "ht": 3.4},
+            {"famid": 1, "birth": 2, "age": 1, "ht": 2.9},
+            {"famid": 1, "birth": 2, "age": 2, "ht": 3.8},
+            {"famid": 1, "birth": 3, "age": 1, "ht": 2.2},
+            {"famid": 1, "birth": 3, "age": 2, "ht": 2.9},
+            {"famid": 2, "birth": 1, "age": 1, "ht": 2.0},
+            {"famid": 2, "birth": 1, "age": 2, "ht": 3.2},
+            {"famid": 2, "birth": 2, "age": 1, "ht": 1.8},
+            {"famid": 2, "birth": 2, "age": 2, "ht": 2.8},
+            {"famid": 2, "birth": 3, "age": 1, "ht": 1.9},
+            {"famid": 2, "birth": 3, "age": 2, "ht": 2.4},
+            {"famid": 3, "birth": 1, "age": 1, "ht": 2.2},
+            {"famid": 3, "birth": 1, "age": 2, "ht": 3.3},
+            {"famid": 3, "birth": 2, "age": 1, "ht": 2.3},
+            {"famid": 3, "birth": 2, "age": 2, "ht": 3.4},
+            {"famid": 3, "birth": 3, "age": 1, "ht": 2.1},
+            {"famid": 3, "birth": 3, "age": 2, "ht": 2.9},
+        ]
+    )
+
+
+spec = {".name": ["ht1", "ht2"], ".value": ["ht", "ht"], "age": [1, 2]}
+spec = pd.DataFrame(spec)  # shared by the tests: `ht` at age 1 -> ht1, age 2 -> ht2
+
+
+def test_spec_is_a_dataframe(df_checks):
+    """Raise TypeError if spec is not a DataFrame (here, a dict)."""
+    with pytest.raises(
+        TypeError,
+        match="spec should be one of.+",
+    ):
+        df_checks.pipe(pivot_wider_spec, spec={".name": "name"})
+
+
+def test_spec_columns_has_dot_name(df_checks):
+    """Raise KeyError if '.name' is missing from spec's columns."""
+    with pytest.raises(
+        KeyError,
+        match="Kindly ensure the spec DataFrame has a `.name` column.",
+    ):
+        df_checks.pipe(
+            pivot_wider_spec,
+            spec=spec.set_axis(labels=[".value", ".blabla", "age"], axis=1),  # no '.name' label
+        )
+
+
+def test_spec_columns_has_dot_value(df_checks):
+    """Raise KeyError if '.value' is missing from spec's columns."""
+    with pytest.raises(
+        KeyError,
+        match="Kindly ensure the spec DataFrame has a `.value` column.",
+    ):
+        df_checks.pipe(
+            pivot_wider_spec,
+            spec=spec.set_axis(labels=[".name", ".blabla", "age"], axis=1),  # no '.value' label
+        )
+
+
+def test_spec_columns_name_value_order(df_checks):
+    """
+    Raise ValueError if '.name' and '.value'
+    are not the first two labels, in that order,
+    in spec's columns.
+    """
+    msg = "The first two columns of the spec DataFrame "
+    msg += "should be '.name' and '.value',.+"
+    with pytest.raises(
+        ValueError,
+        match=msg,
+    ):
+        df_checks.pipe(
+            pivot_wider_spec,
+            spec=spec.loc[:, [".value", ".name", "age"]],  # first two columns swapped
+        )
+
+
+def test_spec_columns_len_2(df_checks):
+    """
+    Raise ValueError if '.name' and '.value'
+    are the only columns in spec.
+    """
+    msg = "Kindly provide the column(s) "
+    msg += "to use to make new frame’s columns"
+    with pytest.raises(
+        ValueError,
+        match=re.escape(msg),  # escape so "(s)" is matched literally, not as a regex group
+    ):
+        df_checks.pipe(
+            pivot_wider_spec,
+            spec=spec.loc[:, [".name", ".value"]],  # 'age' dropped: nothing left to pivot on
+        )
+
+
+def test_spec_columns_not_unique(df_checks):
+    """Raise ValueError if spec's column labels are not unique."""
+    with pytest.raises(
+        ValueError, match="Kindly ensure the spec's columns is unique."
+    ):
+        df_checks.pipe(
+            pivot_wider_spec,
+            spec=spec.set_axis(labels=[".name", ".name", "age"], axis=1),  # duplicated '.name'
+        )
+
+
+def test_pivot_wider_spec(df_checks):
+    """
+    Output equals `DataFrame.pivot` on `age` with columns prefixed by "ht".
+    """
+    expected = (
+        df_checks.pivot(index=["famid", "birth"], columns="age", values="ht")
+        .add_prefix("ht")
+        .rename_axis(columns=None)
+        .reset_index()
+    )
+    actual = df_checks.pipe(
+        pivot_wider_spec, spec=spec, index=["famid", "birth"]
+    )
+    assert_frame_equal(
+        actual.sort_values(expected.columns.tolist(), ignore_index=True),
+        expected.sort_values(expected.columns.tolist(), ignore_index=True),
+    )