Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BUGFIX] droplabel when combined with other selectors does not exclude labels #1444

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ jobs:
- name: Build docs
run: mkdocs build

- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
with:
name: website
path: site/
Expand Down
75 changes: 46 additions & 29 deletions janitor/functions/select.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,13 +152,13 @@ def select_columns(

Exclude columns with the `DropLabel` class:
>>> from janitor import DropLabel
>>> df.select_columns(DropLabel(slice("name", "awake")), "conservation")
brainwt bodywt conservation
0 NaN 50.000 lc
1 0.01550 0.480 NaN
2 NaN 1.350 nt
3 0.00029 0.019 lc
4 0.42300 600.000 domesticated
>>> df.select_columns(DropLabel(slice("name", "awake")))
brainwt bodywt
0 NaN 50.000
1 0.01550 0.480
2 NaN 1.350
3 0.00029 0.019
4 0.42300 600.000

Selection on MultiIndex columns:
>>> d = {'num_legs': [4, 4, 2, 2],
Expand Down Expand Up @@ -673,7 +673,7 @@ def _index_dispatch(arg, df, axis): # noqa: F811

Returns an array of integers.
"""
level_label = {}

index = getattr(df, axis)
if not isinstance(index, pd.MultiIndex):
return _select_index(list(arg), df, axis)
Expand All @@ -687,6 +687,7 @@ def _index_dispatch(arg, df, axis): # noqa: F811
"in the MultiIndex, and should either be all "
"strings or integers."
)
level_label = {}
for key, value in arg.items():
if isinstance(value, dispatch_callable):
indexer = index.get_level_values(key)
Expand Down Expand Up @@ -757,14 +758,14 @@ def _index_dispatch(arg, df, axis): # noqa: F811
def _column_sel_dispatch(cols, df, axis): # noqa: F811
"""
Base function for selection on a Pandas Index object.
Returns the inverse of the passed label(s).
Returns the inverse of the passed label(s),
or the set difference if it is part of a list of labels.

Returns an array of integers.
"""
arr = _select_index(cols.label, df, axis)
index = np.arange(getattr(df, axis).size)
arr = _index_converter(arr, index)
return np.delete(index, arr)
return _index_converter(arr, index)


@_select_index.register(set)
Expand Down Expand Up @@ -797,27 +798,43 @@ def _index_dispatch(arg, df, axis): # noqa: F811
indices = index.get_indexer_for(list(arg))
if (indices != -1).all():
return indices
# treat multiple DropLabel instances as a single unit
checks = (isinstance(entry, DropLabel) for entry in arg)
if sum(checks) > 1:
drop_labels = (entry for entry in arg if isinstance(entry, DropLabel))
drop_labels = [entry.label for entry in drop_labels]
drop_labels = DropLabel(drop_labels)
arg = [entry for entry in arg if not isinstance(entry, DropLabel)]
arg.append(drop_labels)
indices = [_select_index(entry, df, axis) for entry in arg]

include = []
exclude = []
for entry in arg:
if isinstance(entry, DropLabel):
exclude.append(entry)
else:
outcome = _select_index(entry, df, axis)
include.append(outcome)
if exclude:
if len(exclude) > 1:
exclude = [entry.label for entry in exclude]
exclude = DropLabel(exclude)
else:
exclude = exclude[0]
exclude = _select_index(exclude, df, axis)
len_exclude = len(exclude)
if len_exclude and not include:
index_arr = np.arange(getattr(df, axis).size)
return np.delete(index_arr, exclude)
if include and len_exclude:
include = [_index_converter(arr, index) for arr in include]
include = np.concatenate(include)
mask = np.isin(include, exclude)
return include[~mask]
# a single entry does not need to be combined with others,
# and is left unmaterialized where possible;
# this avoids unnecessary work and is faster
if len(indices) == 1:
if is_scalar(indices[0]):
return indices
indices = indices[0]
if is_list_like(indices):
indices = np.asanyarray(indices)
return indices
indices = [_index_converter(arr, index) for arr in indices]
return np.concatenate(indices)
if len(include) == 1:
if is_scalar(include[0]):
return include
include = include[0]
if is_list_like(include):
include = np.asanyarray(include)
return include
include = [_index_converter(arr, index) for arr in include]
return np.concatenate(include)


def _index_converter(arr, index):
Expand Down
21 changes: 21 additions & 0 deletions tests/functions/test_select_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,27 @@ def test_select_column_names_droplabel(dataframe, invert, expected):
assert_frame_equal(df, dataframe[expected])


def test_select_column_names_droplabel_mix():
    """
    DropLabel combined with another selector in the same list
    should remove its labels from what the other selector matched.
    """
    penguins = pd.DataFrame(
        {
            "rowid": [1, 2],
            "species": ["Adelie", "Adelie"],
            "island": ["Torgersen", "Torgersen"],
            "bill_length_mm": [39.1, 39.5],
            "bill_depth_mm": [18.7, 17.4],
            "flipper_length_mm": [181.0, 186.0],
            "body_mass_g": [3750.0, 3800.0],
            "sex": ["male", "female"],
            "year": [2007, 2007],
        }
    )

    # numeric columns are selected by the callable,
    # then "year" and "rowid" are excluded via DropLabel
    result = penguins.select(
        [pd.api.types.is_numeric_dtype, DropLabel(["year", "rowid"])]
    )

    # reference result built with plain pandas
    numeric_only = penguins.select_dtypes("number")
    reference = numeric_only.drop(columns=["year", "rowid"])

    assert_frame_equal(result, reference)


@pytest.mark.functions
def test_select_column_names_droplabel_multiple(dataframe):
"Base DataFrame"
Expand Down
Loading