Skip to content

Commit fa4ad4a

Browse files
authored
[ENH] generic select function (#1187)
* fix blank note output * return whatever user passes, even if they are duplicates * add DropLabel for dropping columns * generic select with tests * update test_select.py * update changelog * update docs in select * add version notifications * update docs for DropLabel class * update admonition * ensure booleans are converted into arrays
1 parent fe9fa5a commit fa4ad4a

9 files changed

+276
-22
lines changed

CHANGELOG.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
- [ENH] Fix error when `sort_by_appearance=True` is combined with `dropna=True`. Issue #1168 @samukweku
2929
- [ENH] Add explicit default parameter to `case_when` function. Issue #1159 @samukweku
3030
- [BUG] pandas 1.5.x `_MergeOperation` doesn't have `copy` keyword anymore. Issue #1174 @Zeroto521
31-
- [ENH] `select_rows` function added for flexible row selection. Add support for MultiIndex selection via dictionary. Issue #1124 @samukweku
31+
- [ENH] `select_rows` function added for flexible row selection. Generic `select` function added as well. Add support for MultiIndex selection via dictionary. Issue #1124 @samukweku
3232
- [TST] Compat with macos and window, to fix `FailedHealthCheck` Issue #1181 @Zeroto521
3333
- [INF] Merge two docs CIs (`docs-preview.yml` and `docs.yml`) to one. And add `documentation` pytest mark. PR #1183 @Zeroto521
3434
- [INF] Merge `codecov.yml` (only works for the dev branch pushing event) into `tests.yml` (only works for PR event). PR #1185 @Zeroto521

janitor/functions/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -75,4 +75,4 @@
7575
from .transform_columns import transform_column, transform_columns
7676
from .truncate_datetime import truncate_datetime_dataframe
7777
from .update_where import update_where
78-
from .utils import patterns, unionize_dataframe_categories
78+
from .utils import patterns, unionize_dataframe_categories, DropLabel

janitor/functions/case_when.py

+5
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,11 @@ def case_when(
9090
else:
9191
default
9292
```
93+
!!! abstract "Version Changed"
94+
95+
- 0.24.0
96+
- Added `default` parameter.
97+
9398
9499
:param df: A pandas DataFrame.
95100
:param args: Variable argument of conditions and expected values.

janitor/functions/conditional_join.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,12 @@ def conditional_join(
115115
3 4 3 5
116116
4 4 3 6
117117
118+
!!! abstract "Version Changed"
119+
120+
- 0.24.0
121+
- Added `df_columns`, `right_columns`, `keep` and `use_numba` parameters.
122+
123+
118124
119125
:param df: A pandas DataFrame.
120126
:param right: Named Series or DataFrame to join to.
@@ -145,7 +151,7 @@ def conditional_join(
145151
:param use_numba: Use numba, if installed, to accelerate the computation.
146152
Applicable only to strictly non-equi joins. Default is `False`.
147153
:returns: A pandas DataFrame of the two merged Pandas objects.
148-
"""
154+
""" # noqa: E501
149155

150156
return _conditional_join_compute(
151157
df,

janitor/functions/pivot.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,13 @@ def pivot_longer(
220220
7 Austin Texas Watermelon 99 None NaN
221221
8 Hoover Alabama Watermelon 43 None NaN
222222
223+
224+
!!! abstract "Version Changed"
225+
226+
- 0.24.0
227+
- Added `dropna` parameter.
228+
229+
223230
:param df: A pandas DataFrame.
224231
:param index: Name(s) of columns to use as identifier variables.
225232
Should be either a single column name, or a list/tuple of
@@ -1259,6 +1266,13 @@ def pivot_wider(
12591266
0 5.5 20 25 30 37
12601267
1 6.1 22 18 19 29
12611268
1269+
1270+
!!! abstract "Version Changed"
1271+
1272+
- 0.24.0
1273+
- Added `reset_index`, `names_expand` and `index_expand` parameters.
1274+
1275+
12621276
:param df: A pandas DataFrame.
12631277
:param index: Name(s) of columns to use as identifier variables.
12641278
It should be either a single column name, or a list of column names.
@@ -1293,7 +1307,7 @@ def pivot_wider(
12931307
Applies only if `index` is a categorical column. Default is `False`.
12941308
:returns: A pandas DataFrame that has been unpivoted from long to wide
12951309
form.
1296-
"""
1310+
""" # noqa: E501
12971311

12981312
df = df.copy()
12991313

janitor/functions/select.py

+81-5
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import pandas_flavor as pf
22
import pandas as pd
33
from janitor.utils import deprecated_alias
4-
from janitor.functions.utils import _select
4+
from janitor.functions.utils import _select, DropLabel # noqa: F401
55

66

77
@pf.register_dataframe_method
@@ -24,7 +24,8 @@ def select_columns(
2424
2525
Optional ability to invert selection of columns available as well.
2626
27-
!!! Note
27+
!!!note
28+
2829
The preferred option when selecting columns or rows in a Pandas DataFrame
2930
is with `.loc` or `.iloc` methods, as they are generally performant.
3031
`select_columns` is primarily for convenience.
@@ -57,7 +58,7 @@ def select_columns(
5758
:returns: A pandas DataFrame with the specified columns selected.
5859
""" # noqa: E501
5960

60-
return _select(df, args, invert, axis="columns")
61+
return _select(df, args=args, invert=invert, axis="columns")
6162

6263

6364
@pf.register_dataframe_method
@@ -79,11 +80,17 @@ def select_rows(
7980
8081
Optional ability to invert selection of rows available as well.
8182
82-
!!! Note
83+
84+
!!! info "New in version 0.24.0"
85+
86+
87+
!!!note
88+
8389
The preferred option when selecting columns or rows in a Pandas DataFrame
8490
is with `.loc` or `.iloc` methods, as they are generally performant.
8591
`select_rows` is primarily for convenience.
8692
93+
8794
Example:
8895
8996
>>> import pandas as pd
@@ -113,5 +120,74 @@ def select_rows(
113120
provided.
114121
:returns: A pandas DataFrame with the specified rows selected.
115122
""" # noqa: E501
123+
return _select(df, args=args, invert=invert, axis="index")
124+
125+
126+
@pf.register_dataframe_method
127+
def select(df: pd.DataFrame, *, rows=None, columns=None) -> pd.DataFrame:
128+
"""
129+
Method-chainable selection of rows and columns.
130+
131+
It accepts a string, shell-like glob strings `(*string*)`,
132+
regex, slice, array-like object, or a list of the previous options.
133+
134+
Selection on a MultiIndex on a level, or multiple levels,
135+
is possible with a dictionary.
136+
137+
This method does not mutate the original DataFrame.
138+
139+
Selection can be inverted with the `DropLabel` class.
140+
141+
142+
!!! info "New in version 0.24.0"
143+
144+
145+
!!!note
146+
147+
The preferred option when selecting columns or rows in a Pandas DataFrame
148+
is with `.loc` or `.iloc` methods, as they are generally performant.
149+
`select` is primarily for convenience.
150+
151+
152+
Example:
153+
154+
>>> import pandas as pd
155+
>>> import janitor
156+
>>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
157+
... index=['cobra', 'viper', 'sidewinder'],
158+
... columns=['max_speed', 'shield'])
159+
>>> df
160+
max_speed shield
161+
cobra 1 2
162+
viper 4 5
163+
sidewinder 7 8
164+
>>> df.select(rows='cobra', columns='shield')
165+
shield
166+
cobra 2
167+
168+
Labels can be dropped with the `DropLabel` class:
169+
170+
>>> df.select(rows=DropLabel('cobra'))
171+
max_speed shield
172+
viper 4 5
173+
sidewinder 7 8
174+
175+
:param df: A pandas DataFrame.
176+
:param rows: Valid inputs include: an exact label to look for,
177+
a shell-style glob string (e.g. `*_thing_*`),
178+
a regular expression,
179+
a callable,
180+
or variable arguments of all the aforementioned.
181+
A sequence of booleans is also acceptable.
182+
A dictionary can be used for selection on a MultiIndex on different levels.
183+
:param columns: Valid inputs include: an exact label to look for,
184+
a shell-style glob string (e.g. `*_thing_*`),
185+
a regular expression,
186+
a callable,
187+
or variable arguments of all the aforementioned.
188+
A sequence of booleans is also acceptable.
189+
A dictionary can be used for selection on a MultiIndex on different levels.
190+
:returns: A pandas DataFrame with the specified rows and/or columns selected.
191+
""" # noqa: E501
116192

117-
return _select(df, args, invert, axis="index")
193+
return _select(df, args=None, rows=rows, columns=columns, axis="both")

janitor/functions/utils.py

+77-10
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,11 @@
1111
Pattern,
1212
Union,
1313
Callable,
14+
Any,
1415
)
1516
from pandas.core.dtypes.generic import ABCPandasArray, ABCExtensionArray
1617
from pandas.core.common import is_bool_indexer
17-
18+
from dataclasses import dataclass
1819

1920
import pandas as pd
2021
from janitor.utils import check, _expand_grid
@@ -269,6 +270,23 @@ def _select_callable(arg, func: Callable, axis=None):
269270
return bools
270271

271272

273+
@dataclass
274+
class DropLabel:
275+
"""
276+
Helper class for removing labels within the `select` syntax.
277+
`label` can be any of the types supported in the `select`,
278+
`select_rows` and `select_columns` functions.
279+
An array of integers not matching the labels is returned.
280+
281+
!!! info "New in version 0.24.0"
282+
283+
:param label: Label(s) to be dropped from the index.
284+
:returns: A dataclass.
285+
"""
286+
287+
label: Any
288+
289+
272290
@singledispatch
273291
def _select_index(arg, df, axis):
274292
"""
@@ -284,6 +302,27 @@ def _select_index(arg, df, axis):
284302
raise KeyError(f"No match was returned for {arg}") from exc
285303

286304

305+
@_select_index.register(DropLabel) # noqa: F811
306+
def _column_sel_dispatch(cols, df, axis): # noqa: F811
307+
"""
308+
Base function for selection on a Pandas Index object.
309+
Returns the inverse of the passed label(s).
310+
311+
Returns an array of integers.
312+
"""
313+
arr = _select_index(cols.label, df, axis)
314+
index = np.arange(getattr(df, axis).size)
315+
if isinstance(arr, int):
316+
arr = [arr]
317+
elif isinstance(arr, slice):
318+
arr = index[arr]
319+
elif is_list_like(arr):
320+
arr = np.asanyarray(arr)
321+
if is_bool_dtype(arr):
322+
return index[~arr]
323+
return np.setdiff1d(index, arr)
324+
325+
287326
@_select_index.register(str) # noqa: F811
288327
def _index_dispatch(arg, df, axis): # noqa: F811
289328
"""
@@ -437,7 +476,7 @@ def _index_dispatch(arg, df, axis): # noqa: F811
437476
f"{arg} is a boolean dtype and has wrong length: "
438477
f"{len(arg)} instead of {len(index)}"
439478
)
440-
return arg
479+
return np.asanyarray(arg)
441480
try:
442481

443482
if isinstance(arg, pd.Series):
@@ -486,17 +525,27 @@ def _index_dispatch(arg, df, axis): # noqa: F811
486525

487526
return arg
488527

528+
# treat multiple DropLabel instances as a single unit
529+
checks = (isinstance(entry, DropLabel) for entry in arg)
530+
if sum(checks) > 1:
531+
drop_labels = (entry for entry in arg if isinstance(entry, DropLabel))
532+
drop_labels = [entry.label for entry in drop_labels]
533+
drop_labels = DropLabel(drop_labels)
534+
arg = [entry for entry in arg if not isinstance(entry, DropLabel)]
535+
arg.append(drop_labels)
536+
489537
indices = [_select_index(entry, df, axis) for entry in arg]
490538

491539
# single entry does not need to be combined
492540
# or materialized if possible;
493541
# this offers more performance
494542
if len(indices) == 1:
495-
if isinstance(indices[0], int):
543+
if is_scalar(indices[0]):
496544
return indices
497-
if is_list_like(indices[0]):
498-
return np.asanyarray(indices[0])
499-
return indices[0]
545+
indices = indices[0]
546+
if is_list_like(indices):
547+
indices = np.asanyarray(indices)
548+
return indices
500549
contents = []
501550
for arr in indices:
502551
if is_list_like(arr):
@@ -508,19 +557,37 @@ def _index_dispatch(arg, df, axis): # noqa: F811
508557
elif isinstance(arr, int):
509558
arr = [arr]
510559
contents.append(arr)
511-
contents = np.concatenate(contents)
512-
# remove possible duplicates
513-
return pd.unique(contents)
560+
return np.concatenate(contents)
514561

515562

516563
def _select(
517-
df: pd.DataFrame, args: tuple, invert: bool, axis: str
564+
df: pd.DataFrame,
565+
args: tuple,
566+
invert: bool = False,
567+
axis: str = "index",
568+
rows=None,
569+
columns=None,
518570
) -> pd.DataFrame:
519571
"""
520572
Index DataFrame on the index or columns.
521573
522574
Returns a DataFrame.
523575
"""
576+
assert axis in {"both", "index", "columns"}
577+
if axis == "both":
578+
if rows is None:
579+
rows = slice(None)
580+
else:
581+
if not is_list_like(rows):
582+
rows = [rows]
583+
rows = _select_index(rows, df, axis="index")
584+
if columns is None:
585+
columns = slice(None)
586+
else:
587+
if not is_list_like(columns):
588+
columns = [columns]
589+
columns = _select_index(columns, df, axis="columns")
590+
return df.iloc[rows, columns]
524591
indices = _select_index(list(args), df, axis)
525592
if invert:
526593
rev = np.ones(getattr(df, axis).size, dtype=np.bool8)

0 commit comments

Comments
 (0)