Skip to content

Commit a14061c

Browse files
authored
row_to_names improvement (#1379)
This commit improves `row_to_names` for polars DataFrames, primarily with speed enhancements.
1 parent bbb5891 commit a14061c

File tree

6 files changed

+284
-171
lines changed

6 files changed

+284
-171
lines changed

janitor/functions/row_to_names.py

+141-46
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22

33
from __future__ import annotations
44

5-
import warnings
5+
from functools import singledispatch
66

77
import numpy as np
88
import pandas as pd
@@ -15,7 +15,7 @@
1515
@deprecated_alias(row_number="row_numbers", remove_row="remove_rows")
1616
def row_to_names(
1717
df: pd.DataFrame,
18-
row_numbers: int | list = 0,
18+
row_numbers: int | list | slice = 0,
1919
remove_rows: bool = False,
2020
remove_rows_above: bool = False,
2121
reset_index: bool = False,
@@ -47,7 +47,7 @@ def row_to_names(
4747
1 9 y
4848
>>> df.row_to_names([0,1], remove_rows=True, reset_index=True)
4949
nums chars
50-
6 x
50+
6 x
5151
0 9 y
5252
5353
Remove rows above the elevated row and the elevated row itself.
@@ -72,8 +72,7 @@ def row_to_names(
7272
Args:
7373
df: A pandas DataFrame.
7474
row_numbers: Position of the row(s) containing the variable names.
75-
Note that indexing starts from 0. It can also be a list,
76-
in which case, a MultiIndex column is created.
75+
It can be an integer, a list or a slice.
7776
Defaults to 0 (first row).
7877
remove_rows: Whether the row(s) should be removed from the DataFrame.
7978
remove_rows_above: Whether the row(s) above the selected row should
@@ -83,53 +82,149 @@ def row_to_names(
8382
Returns:
8483
A pandas DataFrame with set column names.
8584
""" # noqa: E501
86-
if not pd.options.mode.copy_on_write:
87-
df = df.copy()
88-
89-
check("row_numbers", row_numbers, [int, list])
90-
if isinstance(row_numbers, list):
91-
for entry in row_numbers:
92-
check("entry in the row_numbers argument", entry, [int])
93-
94-
warnings.warn(
95-
"The function row_to_names will, in the official 1.0 release, "
96-
"change its behaviour to reset the dataframe's index by default. "
97-
"You can prepare for this change right now by explicitly setting "
98-
"`reset_index=True` when calling on `row_to_names`."
85+
86+
return _row_to_names(
87+
row_numbers,
88+
df=df,
89+
remove_rows=remove_rows,
90+
remove_rows_above=remove_rows_above,
91+
reset_index=reset_index,
92+
)
93+
94+
95+
@singledispatch
96+
def _row_to_names(
97+
row_numbers, df, remove_rows, remove_rows_above, reset_index
98+
) -> pd.DataFrame:
99+
"""
100+
Base function for row_to_names.
101+
"""
102+
raise TypeError(
103+
"row_numbers should be either an integer, "
104+
"a slice or a list; "
105+
f"instead got type {type(row_numbers).__name__}"
99106
)
100-
# should raise if positional indexers are missing
101-
# IndexError: positional indexers are out-of-bounds
102-
headers = df.iloc[row_numbers]
107+
108+
109+
@_row_to_names.register(int)  # noqa: F811
def _row_to_names_dispatch(  # noqa: F811
    row_numbers, df, remove_rows, remove_rows_above, reset_index
):
    """
    Promote the row at integer position `row_numbers` to the column names.

    Args:
        row_numbers: Integer position of the header row. Negative values
            count from the end of the DataFrame, as with `.iloc`.
        df: A pandas DataFrame.
        remove_rows: Whether to drop the header row from the result.
        remove_rows_above: Whether to drop every row above the header row.
        reset_index: Whether to replace the index with a fresh RangeIndex.

    Returns:
        A pandas DataFrame whose columns are the values of the header row.

    Raises:
        IndexError: If `row_numbers` is out of bounds (raised by `.iloc`).
    """
    # Work on a new DataFrame object so that the column assignment below
    # is not applied to the caller's DataFrame.
    df_ = df[:]
    headers = df_.iloc[row_numbers]
    df_.columns = headers
    # Drop the name the columns pick up from the header row.
    df_.columns.name = None
    if not remove_rows and not remove_rows_above:
        return df_.reset_index(drop=True) if reset_index else df_

    len_df = len(df_)
    # `.iloc` has already validated the position. Normalise a negative
    # position so the ranges below start at the intended row instead of
    # at a negative number (which would wrap around and keep wrong rows).
    if row_numbers < 0:
        row_numbers += len_df
    arrays = [arr._values for _, arr in df_.items()]
    if remove_rows_above and remove_rows:
        indexer = np.arange(row_numbers + 1, len_df)
    elif remove_rows_above:
        indexer = np.arange(row_numbers, len_df)
    else:
        # Only remove_rows is set: keep everything except the header row.
        indexer = np.arange(len_df)
        mask = np.ones(len_df, dtype=np.bool_)
        mask[row_numbers] = False
        indexer = indexer[mask]
    # Subset the underlying arrays directly and rebuild the frame; integer
    # keys are placeholders, the real labels are assigned afterwards.
    arrays = {num: arr[indexer] for num, arr in enumerate(arrays)}
    if reset_index:
        df_index = pd.RangeIndex(start=0, stop=indexer.size)
    else:
        df_index = df_.index[indexer]
    _df = pd.DataFrame(data=arrays, index=df_index, copy=False)
    _df.columns = df_.columns
    return _df
141+
142+
143+
@_row_to_names.register(slice)  # noqa: F811
def _row_to_names_dispatch(  # noqa: F811
    row_numbers, df, remove_rows, remove_rows_above, reset_index
):
    """
    Promote the rows selected by a slice to the column names.

    A slice selecting one row yields flat column names; a slice selecting
    several rows yields MultiIndex columns, one level per selected row.

    Args:
        row_numbers: Slice of row positions. `start`/`stop` may be None or
            negative; a `step` is not supported.
        df: A pandas DataFrame.
        remove_rows: Whether to drop the header rows from the result.
        remove_rows_above: Whether to drop every row above the header rows.
        reset_index: Whether to replace the index with a fresh RangeIndex.

    Returns:
        A pandas DataFrame with the new column names.

    Raises:
        ValueError: If the slice has a step.
    """
    if row_numbers.step is not None:
        raise ValueError(
            "The step argument for slice is not supported in row_to_names."
        )
    # Work on a new DataFrame object so that the column assignment below
    # is not applied to the caller's DataFrame.
    df_ = df[:]
    headers = df_.iloc[row_numbers]
    if isinstance(headers, pd.DataFrame) and (len(headers) == 1):
        # Take the single row as a Series. `.squeeze()` would collapse a
        # 1x1 frame (single-column DataFrame) to a scalar, which cannot be
        # assigned to `.columns`.
        headers = headers.iloc[0]
        df_.columns = headers
        df_.columns.name = None
    else:
        # Each column's values across the selected rows form one tuple
        # of the MultiIndex.
        headers = [array._values for _, array in headers.items()]
        headers = pd.MultiIndex.from_tuples(headers)
        df_.columns = headers
    if not remove_rows and not remove_rows_above:
        return df_.reset_index(drop=True) if reset_index else df_

    len_df = len(df_)
    # Resolve None and negative bounds to concrete positions within the
    # frame; the raw `start`/`stop` attributes cannot be fed to np.arange.
    start, stop, _ = row_numbers.indices(len_df)
    arrays = [arr._values for _, arr in df_.items()]
    if remove_rows_above and remove_rows:
        indexer = np.arange(stop, len_df)
    elif remove_rows_above:
        indexer = np.arange(start, len_df)
    else:
        # Only remove_rows is set: keep everything except the header rows.
        indexer = np.arange(len_df)
        mask = np.ones(len_df, dtype=np.bool_)
        mask[row_numbers] = False
        indexer = indexer[mask]
    # Subset the underlying arrays directly and rebuild the frame; integer
    # keys are placeholders, the real labels are assigned afterwards.
    arrays = {num: arr[indexer] for num, arr in enumerate(arrays)}
    if reset_index:
        df_index = pd.RangeIndex(start=0, stop=indexer.size)
    else:
        df_index = df_.index[indexer]
    _df = pd.DataFrame(data=arrays, index=df_index, copy=False)
    _df.columns = df_.columns
    return _df
110184

111-
df.columns = headers
112-
df.columns.name = None
113185

114-
df_index = df.index
186+
@_row_to_names.register(list)  # noqa: F811
def _row_to_names_dispatch(  # noqa: F811
    row_numbers, df, remove_rows, remove_rows_above, reset_index
):
    """
    Promote the rows at the listed integer positions to the column names.

    A one-element list yields flat column names; a longer list yields
    MultiIndex columns, one level per listed row.

    Args:
        row_numbers: List of integer row positions.
        df: A pandas DataFrame.
        remove_rows: Whether to drop the header rows from the result.
        remove_rows_above: Not supported for lists; must be falsy.
        reset_index: Whether to replace the index with a fresh RangeIndex.

    Returns:
        A pandas DataFrame with the new column names.

    Raises:
        ValueError: If `remove_rows_above` is set.
    """
    if remove_rows_above:
        raise ValueError(
            "The remove_rows_above argument is applicable "
            "only if the row_numbers argument is an integer "
            "or a slice."
        )

    for entry in row_numbers:
        check("entry in the row_numbers argument", entry, [int])

    # Work on a new DataFrame object so that the column assignment below
    # is not applied to the caller's DataFrame.
    df_ = df[:]
    headers = df_.iloc[row_numbers]
    if isinstance(headers, pd.DataFrame) and (len(headers) == 1):
        # Take the single row as a Series. `.squeeze()` would collapse a
        # 1x1 frame (single-column DataFrame) to a scalar, which cannot be
        # assigned to `.columns`.
        headers = headers.iloc[0]
        df_.columns = headers
        df_.columns.name = None
    else:
        # Each column's values across the selected rows form one tuple
        # of the MultiIndex.
        headers = [array._values for _, array in headers.items()]
        headers = pd.MultiIndex.from_tuples(headers)
        df_.columns = headers

    if not remove_rows:
        return df_.reset_index(drop=True) if reset_index else df_

    len_df = len(df_)
    arrays = [arr._values for _, arr in df_.items()]
    # Keep every position that is not one of the header rows.
    indexer = np.arange(len_df)
    mask = np.ones(len_df, dtype=np.bool_)
    mask[row_numbers] = False
    indexer = indexer[mask]

    # Subset the underlying arrays directly and rebuild the frame; integer
    # keys are placeholders, the real labels are assigned afterwards.
    arrays = {num: arr[indexer] for num, arr in enumerate(arrays)}
    if reset_index:
        df_index = pd.RangeIndex(start=0, stop=indexer.size)
    else:
        df_index = df_.index[indexer]
    _df = pd.DataFrame(data=arrays, index=df_index, copy=False)
    _df.columns = df_.columns
    return _df

janitor/polars/complete.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
try:
1212
import polars as pl
1313
import polars.selectors as cs
14-
from polars.type_aliases import ColumnNameOrSelector
14+
from polars._typing import ColumnNameOrSelector
1515
except ImportError:
1616
import_message(
1717
submodule="polars",

janitor/polars/pivot_longer.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88

99
try:
1010
import polars as pl
11-
from polars.type_aliases import ColumnNameOrSelector
11+
from polars._typing import ColumnNameOrSelector
1212
except ImportError:
1313
import_message(
1414
submodule="polars",

0 commit comments

Comments
 (0)