Skip to content

Commit d7fa35b

Browse files
samukwekusamuel.oranyeli
and
samuel.oranyeli
authored
[ENH] pivot_wider_spec pandas (#1427)
* add pivot_wider_spec for pandas * add examples * cleanup * fix example * fix example * fix failing test * fix example --------- Co-authored-by: samuel.oranyeli <[email protected]>
1 parent 3b8f104 commit d7fa35b

File tree

3 files changed

+299
-2
lines changed

3 files changed

+299
-2
lines changed

janitor/functions/__init__.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,12 @@
5757
from .limit_column_characters import limit_column_characters
5858
from .min_max_scale import min_max_scale
5959
from .move import move
60-
from .pivot import pivot_longer, pivot_longer_spec, pivot_wider
60+
from .pivot import (
61+
pivot_longer,
62+
pivot_longer_spec,
63+
pivot_wider,
64+
pivot_wider_spec,
65+
)
6166
from .process_text import process_text
6267
from .remove_columns import remove_columns
6368
from .remove_empty import remove_empty
@@ -138,6 +143,7 @@
138143
"pivot_longer",
139144
"pivot_longer_spec",
140145
"pivot_wider",
146+
"pivot_wider_spec",
141147
"process_text",
142148
"remove_columns",
143149
"remove_empty",

janitor/functions/pivot.py

+156-1
Original file line numberDiff line numberDiff line change
@@ -327,10 +327,14 @@ def pivot_longer(
327327
Should be either a single column name, or a list/tuple of
328328
column names.
329329
`index` should be a list of tuples if the columns are a MultiIndex.
330+
Column selection is possible using the
331+
[`select`][janitor.functions.select.select] syntax.
330332
column_names: Name(s) of columns to unpivot. Should be either
331333
a single column name or a list/tuple of column names.
332334
`column_names` should be a list of tuples
333335
if the columns are a MultiIndex.
336+
Column selection is possible using the
337+
[`select`][janitor.functions.select.select] syntax.
334338
names_to: Name of new column as a string that will contain
335339
what were previously the column names in `column_names`.
336340
The default is `variable` if no value is provided. It can
@@ -420,10 +424,13 @@ def pivot_longer_spec(
420424
) -> pd.DataFrame:
421425
"""A declarative interface to pivot a DataFrame from wide to long form,
422426
where you describe how the data will be unpivoted,
423-
using a DataFrame. This gives you, the user,
427+
using a DataFrame.
428+
429+
This gives you, the user,
424430
more control over unpivoting, where you create a “spec”
425431
data frame that describes exactly how data stored
426432
in the column names becomes variables.
433+
427434
It can come in handy for situations where
428435
[`pivot_longer`][janitor.functions.pivot.pivot_longer]
429436
seems inadequate for the transformation.
@@ -2380,3 +2387,151 @@ def _check_tuples_multiindex(indexer, args, param):
23802387
)
23812388

23822389
return args
2390+
2391+
2392+
def pivot_wider_spec(
    df: pd.DataFrame,
    spec: pd.DataFrame,
    index: list | tuple | str | Pattern = None,
    reset_index: bool = True,
) -> pd.DataFrame:
    """A declarative interface to pivot a DataFrame from long to wide form,
    where you describe how the data will be pivoted,
    using a DataFrame.

    This gives you, the user,
    more control over pivoting, where you create a “spec”
    data frame that describes exactly how data stored
    in the column names becomes variables.

    It can come in handy for situations where
    `pd.DataFrame.pivot`
    seems inadequate for the transformation.

    !!! info "New in version 0.31.0"

    Examples:
        >>> import pandas as pd
        >>> from janitor import pivot_wider_spec
        >>> df = pd.DataFrame(
        ...     [
        ...         {"famid": 1, "birth": 1, "age": 1, "ht": 2.8},
        ...         {"famid": 1, "birth": 1, "age": 2, "ht": 3.4},
        ...         {"famid": 1, "birth": 2, "age": 1, "ht": 2.9},
        ...         {"famid": 1, "birth": 2, "age": 2, "ht": 3.8},
        ...         {"famid": 1, "birth": 3, "age": 1, "ht": 2.2},
        ...         {"famid": 1, "birth": 3, "age": 2, "ht": 2.9},
        ...         {"famid": 2, "birth": 1, "age": 1, "ht": 2.0},
        ...         {"famid": 2, "birth": 1, "age": 2, "ht": 3.2},
        ...         {"famid": 2, "birth": 2, "age": 1, "ht": 1.8},
        ...         {"famid": 2, "birth": 2, "age": 2, "ht": 2.8},
        ...         {"famid": 2, "birth": 3, "age": 1, "ht": 1.9},
        ...         {"famid": 2, "birth": 3, "age": 2, "ht": 2.4},
        ...         {"famid": 3, "birth": 1, "age": 1, "ht": 2.2},
        ...         {"famid": 3, "birth": 1, "age": 2, "ht": 3.3},
        ...         {"famid": 3, "birth": 2, "age": 1, "ht": 2.3},
        ...         {"famid": 3, "birth": 2, "age": 2, "ht": 3.4},
        ...         {"famid": 3, "birth": 3, "age": 1, "ht": 2.1},
        ...         {"famid": 3, "birth": 3, "age": 2, "ht": 2.9},
        ...     ]
        ... )
        >>> df
            famid  birth  age   ht
        0       1      1    1  2.8
        1       1      1    2  3.4
        2       1      2    1  2.9
        3       1      2    2  3.8
        4       1      3    1  2.2
        5       1      3    2  2.9
        6       2      1    1  2.0
        7       2      1    2  3.2
        8       2      2    1  1.8
        9       2      2    2  2.8
        10      2      3    1  1.9
        11      2      3    2  2.4
        12      3      1    1  2.2
        13      3      1    2  3.3
        14      3      2    1  2.3
        15      3      2    2  3.4
        16      3      3    1  2.1
        17      3      3    2  2.9
        >>> spec = {".name": ["ht1", "ht2"],
        ...         ".value": ["ht", "ht"],
        ...         "age": [1, 2]}
        >>> spec = pd.DataFrame(spec)
        >>> spec
          .name .value  age
        0   ht1     ht    1
        1   ht2     ht    2
        >>> pivot_wider_spec(df=df,spec=spec, index=['famid','birth'])
           famid  birth  ht1  ht2
        0      1      1  2.8  3.4
        1      1      2  2.9  3.8
        2      1      3  2.2  2.9
        3      2      1  2.0  3.2
        4      2      2  1.8  2.8
        5      2      3  1.9  2.4
        6      3      1  2.2  3.3
        7      3      2  2.3  3.4
        8      3      3  2.1  2.9

    Args:
        df: A pandas DataFrame.
        spec: A specification DataFrame.
            At a minimum, the spec DataFrame
            must have '.name' and '.value' columns,
            in that order, as its first two columns.
            The '.name' column should contain
            the names of the columns in the output DataFrame.
            The '.value' column should contain the name of the column(s)
            in the source DataFrame that will serve as the values.
            Additional columns in spec will serve as the columns
            to be flipped to wide form; at least one is required.
            Note that these additional columns should already exist
            in the source DataFrame.
        index: Name(s) of columns to use as identifier variables.
            It should be either a single column name, or a list of column names.
            If `index` is not provided, the DataFrame's index is used.
            Column selection is possible using the
            [`select`][janitor.functions.select.select] syntax.
        reset_index: Determines whether to reset the `index`.
            Applicable only if `index` is provided.

    Raises:
        TypeError: If `spec` is not a DataFrame,
            or `reset_index` is not a boolean.
        ValueError: If the spec's columns are not unique,
            if '.name' and '.value' are not its first two columns
            (in that order), or if they are its only columns.
        KeyError: If the spec has no '.name' or no '.value' column.

    Returns:
        A pandas DataFrame that has been pivoted from long to wide form.
    """  # noqa: E501
    check("spec", spec, [pd.DataFrame])
    check("reset_index", reset_index, [bool])
    if not spec.columns.is_unique:
        raise ValueError("Kindly ensure the spec's columns is unique.")
    if ".name" not in spec.columns:
        raise KeyError(
            "Kindly ensure the spec DataFrame has a `.name` column."
        )
    if ".value" not in spec.columns:
        raise KeyError(
            "Kindly ensure the spec DataFrame has a `.value` column."
        )
    if spec.columns.tolist()[:2] != [".name", ".value"]:
        raise ValueError(
            "The first two columns of the spec DataFrame "
            "should be '.name' and '.value', "
            "with '.name' coming before '.value'."
        )
    if spec.columns.size == 2:
        raise ValueError(
            "Kindly provide the column(s) "
            "to use to make new frame’s columns"
        )
    # Every spec column after '.name' and '.value' names a source column
    # whose values get spread across the output's column labels.
    columns = spec.columns[2:]
    values = spec[".value"].unique()
    if index is not None:
        # Resolve `select`-style selectors into concrete column labels.
        index = _select_index([index], df, axis="columns")
        index = df.columns[index].tolist()
    df = df.pivot(index=index, columns=columns, values=values)
    # Build a lookup keyed on ('.value', *columns) -> '.name'.
    # '.name' is selected explicitly rather than via `.squeeze()`:
    # squeezing a single-row spec collapses the 1x1 frame to a scalar,
    # which has no `.index` and cannot be used as a mapper below.
    _index = spec.columns[1:].tolist()
    name_lookup = spec.set_index(_index)[".name"]
    # Keep only the combinations listed in the spec, in spec order,
    # then replace each combination with its requested output name.
    df = df.reindex(columns=name_lookup.index)
    df.columns = df.columns.map(name_lookup)
    if reset_index and index:
        return df.reset_index()
    return df
+136
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
import re
2+
3+
import pandas as pd
4+
import pytest
5+
from pandas.testing import assert_frame_equal
6+
7+
from janitor import pivot_wider_spec
8+
9+
10+
@pytest.fixture
def df_checks():
    """Long-form frame: one height per (famid, birth, age) combination."""
    # Heights listed in row order: famid varies slowest, age fastest.
    heights = [
        2.8, 3.4, 2.9, 3.8, 2.2, 2.9,
        2.0, 3.2, 1.8, 2.8, 1.9, 2.4,
        2.2, 3.3, 2.3, 3.4, 2.1, 2.9,
    ]  # fmt: skip
    keys = [
        (famid, birth, age)
        for famid in (1, 2, 3)
        for birth in (1, 2, 3)
        for age in (1, 2)
    ]
    records = [
        {"famid": famid, "birth": birth, "age": age, "ht": height}
        for (famid, birth, age), height in zip(keys, heights)
    ]
    return pd.DataFrame(records)
35+
36+
37+
# Shared spec: rows with age 1 / age 2 spread the `ht` values
# into output columns `ht1` / `ht2`.
spec = pd.DataFrame(
    {".name": ["ht1", "ht2"], ".value": ["ht", "ht"], "age": [1, 2]}
)
39+
40+
41+
def test_spec_is_a_dataframe(df_checks):
    """A spec that is not a DataFrame triggers a TypeError."""
    not_a_frame = {".name": "name"}
    with pytest.raises(TypeError, match="spec should be one of.+"):
        df_checks.pipe(pivot_wider_spec, spec=not_a_frame)
48+
49+
50+
def test_spec_columns_has_dot_name(df_checks):
    """A spec lacking a '.name' column triggers a KeyError."""
    expected_msg = "Kindly ensure the spec DataFrame has a `.name` column."
    # Relabel the shared spec so that no column is called '.name'.
    bad_spec = spec.set_axis(labels=[".value", ".blabla", "age"], axis=1)
    with pytest.raises(KeyError, match=expected_msg):
        df_checks.pipe(pivot_wider_spec, spec=bad_spec)
60+
61+
62+
def test_spec_columns_has_dot_value(df_checks):
    """A spec lacking a '.value' column triggers a KeyError."""
    expected_msg = "Kindly ensure the spec DataFrame has a `.value` column."
    # Relabel the shared spec so that no column is called '.value'.
    bad_spec = spec.set_axis(labels=[".name", ".blabla", "age"], axis=1)
    with pytest.raises(KeyError, match=expected_msg):
        df_checks.pipe(pivot_wider_spec, spec=bad_spec)
72+
73+
74+
def test_spec_columns_name_value_order(df_checks):
    """
    A spec whose first two columns are not
    '.name' followed by '.value' triggers a ValueError.
    """
    expected_msg = (
        "The first two columns of the spec DataFrame "
        "should be '.name' and '.value',.+"
    )
    # Same columns as the shared spec, but '.value' precedes '.name'.
    swapped_spec = spec.loc[:, [".value", ".name", "age"]]
    with pytest.raises(ValueError, match=expected_msg):
        df_checks.pipe(pivot_wider_spec, spec=swapped_spec)
90+
91+
92+
def test_spec_columns_len_2(df_checks):
    """
    A spec holding only the '.name' and '.value'
    columns triggers a ValueError.
    """
    expected_msg = (
        "Kindly provide the column(s) "
        "to use to make new frame’s columns"
    )
    # The message contains parentheses, so escape it for regex matching.
    pattern = re.escape(expected_msg)
    two_column_spec = spec.loc[:, [".name", ".value"]]
    with pytest.raises(ValueError, match=pattern):
        df_checks.pipe(pivot_wider_spec, spec=two_column_spec)
107+
108+
109+
def test_spec_columns_not_unique(df_checks):
    """A spec with duplicated column labels triggers a ValueError."""
    expected_msg = "Kindly ensure the spec's columns is unique."
    # Relabel the shared spec so that '.name' appears twice.
    duplicated_spec = spec.set_axis(labels=[".name", ".name", "age"], axis=1)
    with pytest.raises(ValueError, match=expected_msg):
        df_checks.pipe(pivot_wider_spec, spec=duplicated_spec)
118+
119+
120+
def test_pivot_wider_spec(df_checks):
    """
    The spec-driven pivot matches a plain `DataFrame.pivot`
    whose columns are prefixed with 'ht'.
    """
    id_columns = ["famid", "birth"]
    pivoted = df_checks.pivot(
        index=list(id_columns), columns="age", values="ht"
    )
    expected = (
        pivoted.add_prefix("ht").rename_axis(columns=None).reset_index()
    )
    actual = df_checks.pipe(
        pivot_wider_spec, spec=spec, index=list(id_columns)
    )
    # Sort both frames on every expected column so row order is irrelevant.
    sort_keys = expected.columns.tolist()
    actual_sorted = actual.sort_values(sort_keys, ignore_index=True)
    expected_sorted = expected.sort_values(sort_keys, ignore_index=True)
    assert_frame_equal(actual_sorted, expected_sorted)

0 commit comments

Comments
 (0)