Skip to content

Commit 161c290

Browse files
samukweku, samuel.oranyeli, and pre-commit-ci[bot]
authored
[ENH] minor fixes (#1219)
* minor fix for drop_constant_columns
* minor fix for get_dupes
* minor fix for collapse_levels, primarily for speed
* fix test fails
* fix test fails
* vectorise collapse_levels some more for performance sake
* allow for mutation
* leave collapse_levels as is
* Update test_collapse_levels.py
* Update test_collapse_levels.py
* Update test_collapse_levels.py
* restor collapse_levels to before
* shortcut if all entries are strings in a list in a select call
* use get_indexer_for for lists that contain only strings in select
* make more robust by checking on scalar, instead of just strings
* improve comments
* rebase
* more edits
* remove extra check
* shortcut for *
* exclude api/utils from mkdocs
* exclude api/utils from mkdocs
* simplify move
* avoid mutation in collapse_levels
* make move more robust with select syntax
* docs
* fix docstring
* replicate fill_empty in impute - reduce duplication
* add tests
* fix doctest
* fix docstrings
* defer copy in pivot_wider to pd.pivot
* fix np.bool8 deprecation
* simplify dtype column selection
* fix warning msg output for change_type
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* rebase
* expose _select_index
* add parameters
* use get_index_labels where possible
* add test for multiple columns
* make column selection more robust for sequences
* add test for set/dict selection
* add test for move - both source and target are lists
* exclude utils from docs
* fix test fails

---------

Co-authored-by: samuel.oranyeli <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 9245f76 commit 161c290

19 files changed

+394
-258
lines changed

janitor/functions/__init__.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@
6464
from .reorder_columns import reorder_columns
6565
from .round_to_fraction import round_to_fraction
6666
from .row_to_names import row_to_names
67-
from .select import select_columns, select_rows
67+
from .select import select_columns, select_rows, select
6868
from .shuffle import shuffle
6969
from .sort_column_value_order import sort_column_value_order
7070
from .sort_naturally import sort_naturally
@@ -75,4 +75,9 @@
7575
from .transform_columns import transform_column, transform_columns
7676
from .truncate_datetime import truncate_datetime_dataframe
7777
from .update_where import update_where
78-
from .utils import patterns, unionize_dataframe_categories, DropLabel
78+
from .utils import (
79+
patterns,
80+
unionize_dataframe_categories,
81+
DropLabel,
82+
get_index_labels,
83+
)

janitor/functions/change_type.py

+50-50
Original file line numberDiff line numberDiff line change
@@ -24,65 +24,65 @@ def change_type(
2424
) -> pd.DataFrame:
2525
"""Change the type of a column.
2626
27-
This method does not mutate the original DataFrame.
27+
This method does not mutate the original DataFrame.
2828
29-
Exceptions that are raised can be ignored. For example, if one has a mixed
30-
dtype column that has non-integer strings and integers, and you want to
31-
coerce everything to integers, you can optionally ignore the non-integer
32-
strings and replace them with `NaN` or keep the original value.
29+
Exceptions that are raised can be ignored. For example, if one has a mixed
30+
dtype column that has non-integer strings and integers, and you want to
31+
coerce everything to integers, you can optionally ignore the non-integer
32+
strings and replace them with `NaN` or keep the original value.
3333
34-
Intended to be the method-chaining alternative to:
34+
Intended to be the method-chaining alternative to:
3535
36-
```python
37-
df[col] = df[col].astype(dtype)
36+
```python
37+
df[col] = df[col].astype(dtype)
3838
```
3939
40-
!!!note
41-
42-
This function will be deprecated in a 1.x release.
43-
Please use `pd.DataFrame.astype` instead.
44-
45-
Example: Change the type of a column.
46-
47-
>>> import pandas as pd
48-
>>> import janitor
49-
>>> df = pd.DataFrame({"col1": range(3), "col2": ["m", 5, True]})
50-
>>> df
51-
col1 col2
52-
0 0 m
53-
1 1 5
54-
2 2 True
55-
>>> df.change_type(
56-
... "col1", dtype=str,
57-
... ).change_type(
58-
... "col2", dtype=float, ignore_exception="fillna",
59-
... )
60-
col1 col2
61-
0 0 NaN
62-
1 1 5.0
63-
2 2 1.0
40+
!!!note
6441
65-
Example: Change the type of multiple columns.
42+
This function will be deprecated in a 1.x release.
43+
Please use `pd.DataFrame.astype` instead.
6644
67-
Change the type of all columns, please use `DataFrame.astype` instead.
45+
Example: Change the type of a column.
6846
69-
>>> import pandas as pd
70-
>>> import janitor
71-
>>> df = pd.DataFrame({"col1": range(3), "col2": ["m", 5, True]})
72-
>>> df.change_type(['col1', 'col2'], str)
47+
>>> import pandas as pd
48+
>>> import janitor
49+
>>> df = pd.DataFrame({"col1": range(3), "col2": ["m", 5, True]})
50+
>>> df
7351
col1 col2
74-
0 0 m
75-
1 1 5
76-
2 2 True
77-
78-
:param df: A pandas DataFrame.
79-
:param column_name: The column(s) in the dataframe.
80-
:param dtype: The datatype to convert to. Should be one of the standard
81-
Python types, or a numpy datatype.
82-
:param ignore_exception: one of `{False, "fillna", "keep_values"}`.
83-
:returns: A pandas DataFrame with changed column types.
84-
:raises ValueError: If unknown option provided for
85-
`ignore_exception`.
52+
0 0 m
53+
1 1 5
54+
2 2 True
55+
>>> df.change_type(
56+
... "col1", dtype=str,
57+
... ).change_type(
58+
... "col2", dtype=float, ignore_exception="fillna",
59+
... )
60+
col1 col2
61+
0 0 NaN
62+
1 1 5.0
63+
2 2 1.0
64+
65+
Example: Change the type of multiple columns.
66+
67+
Change the type of all columns, please use `DataFrame.astype` instead.
68+
69+
>>> import pandas as pd
70+
>>> import janitor
71+
>>> df = pd.DataFrame({"col1": range(3), "col2": ["m", 5, True]})
72+
>>> df.change_type(['col1', 'col2'], str)
73+
col1 col2
74+
0 0 m
75+
1 1 5
76+
2 2 True
77+
78+
:param df: A pandas DataFrame.
79+
:param column_name: The column(s) in the dataframe.
80+
:param dtype: The datatype to convert to. Should be one of the standard
81+
Python types, or a numpy datatype.
82+
:param ignore_exception: one of `{False, "fillna", "keep_values"}`.
83+
:returns: A pandas DataFrame with changed column types.
84+
:raises ValueError: If unknown option provided for
85+
`ignore_exception`.
8686
"""
8787

8888
df = df.copy() # avoid mutating the original DataFrame

janitor/functions/coalesce.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import pandas_flavor as pf
55

66
from janitor.utils import check, deprecated_alias
7-
from janitor.functions.utils import _select_index
7+
from janitor.functions.utils import get_index_labels
88

99

1010
@pf.register_dataframe_method
@@ -95,8 +95,7 @@ def coalesce(
9595
"The number of columns to coalesce should be a minimum of 2."
9696
)
9797

98-
indices = _select_index([*column_names], df, axis="columns")
99-
column_names = df.columns[indices]
98+
column_names = get_index_labels([*column_names], df, axis="columns")
10099

101100
if target_column_name:
102101
check("target_column_name", target_column_name, [str])

janitor/functions/collapse_levels.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
def collapse_levels(df: pd.DataFrame, sep: str = "_") -> pd.DataFrame:
1010
"""Flatten multi-level column dataframe to a single level.
1111
12-
This method mutates the original DataFrame.
12+
This method does not mutate the original DataFrame.
1313
1414
Given a DataFrame containing multi-level columns, flatten to single-level
1515
by string-joining the column labels in each level.
@@ -72,6 +72,8 @@ class max_speed type
7272
if not isinstance(df.columns, pd.MultiIndex):
7373
return df
7474

75+
df = df[:]
76+
7577
df.columns = [
7678
sep.join(str(el) for el in tup if str(el) != "")
7779
for tup in df # noqa: PD011

janitor/functions/drop_constant_columns.py

+1-8
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,4 @@ def drop_constant_columns(df: pd.DataFrame) -> pd.DataFrame:
3434
:param df: Input Pandas DataFrame
3535
:returns: The Pandas DataFrame with the constant columns dropped.
3636
"""
37-
# Find the constant columns
38-
constant_columns = []
39-
for col in df.columns:
40-
if len(df[col].unique()) == 1:
41-
constant_columns.append(col)
42-
43-
# Drop constant columns from df and return it
44-
return df.drop(labels=constant_columns, axis=1)
37+
return df.loc[:, df.nunique().ne(1)]

janitor/functions/encode_categorical.py

+3-5
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
import pandas_flavor as pf
88
from pandas.api.types import is_list_like
99

10-
from janitor.utils import check, check_column, deprecated_alias
10+
from janitor.utils import check_column, deprecated_alias
11+
from janitor.functions.utils import get_index_labels
1112

1213

1314
@pf.register_dataframe_method
@@ -112,10 +113,7 @@ def encode_categorical(
112113
# kwargs takes care of scenarios where user wants an ordered category
113114
# or user supplies specific categories to create the categorical
114115
if column_names is not None:
115-
check("column_names", column_names, [list, tuple, Hashable])
116-
if isinstance(column_names, Hashable):
117-
column_names = [column_names]
118-
check_column(df, column_names)
116+
column_names = get_index_labels([column_names], df, axis="columns")
119117
dtypes = {col: "category" for col in column_names}
120118
return df.astype(dtypes)
121119

janitor/functions/get_dupes.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -75,5 +75,4 @@ def get_dupes(
7575
all columns.
7676
:returns: The duplicate rows, as a pandas DataFrame.
7777
"""
78-
dupes = df.duplicated(subset=column_names, keep=False)
79-
return df[dupes == True] # noqa: E712
78+
return df.loc[df.duplicated(subset=column_names, keep=False)]

janitor/functions/impute.py

+47-40
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,29 @@
11
"""Implementation of `impute` function"""
2-
from typing import Any, Hashable, Optional
2+
from typing import Any, Optional
3+
34

4-
import lazy_loader as lazy
5-
import numpy as np
65
import pandas_flavor as pf
76
import pandas as pd
87

98
from janitor.utils import deprecated_alias
10-
11-
ss = lazy.load("scipy.stats")
9+
from janitor.functions.utils import get_index_labels
10+
from itertools import product
1211

1312

1413
@pf.register_dataframe_method
1514
@deprecated_alias(column="column_name")
15+
@deprecated_alias(column_name="column_names")
1616
@deprecated_alias(statistic="statistic_column_name")
1717
def impute(
1818
df: pd.DataFrame,
19-
column_name: Hashable,
19+
column_names: Any,
2020
value: Optional[Any] = None,
2121
statistic_column_name: Optional[str] = None,
2222
) -> pd.DataFrame:
2323
"""
2424
Method-chainable imputation of values in a column.
2525
26-
This method mutates the original DataFrame.
26+
This method does not mutate the original DataFrame.
2727
2828
Underneath the hood, this function calls the `.fillna()` method available
2929
to every `pandas.Series` object.
@@ -34,8 +34,11 @@ def impute(
3434
take on the value provided.
3535
3636
If `statistic_column_name` is provided, then all null values in the
37-
selected column will take on the summary statistic value of other non-null
38-
values.
37+
selected column(s) will take on the summary statistic value
38+
of other non-null values.
39+
40+
Column selection in `column_names` is possible using the
41+
[`select_columns`][janitor.functions.select.select_columns] syntax.
3942
4043
Currently supported statistics include:
4144
@@ -63,7 +66,7 @@ def impute(
6366
6467
Imputing null values with 0 (using the `value` parameter):
6568
66-
>>> df.impute(column_name="sales", value=0.0)
69+
>>> df.impute(column_names="sales", value=0.0)
6770
a sales score
6871
0 1 0.0 NaN
6972
1 2 0.0 3.0
@@ -72,14 +75,14 @@ def impute(
7275
Imputing null values with median (using the `statistic_column_name`
7376
parameter):
7477
75-
>>> df.impute(column_name="score", statistic_column_name="median")
78+
>>> df.impute(column_names="score", statistic_column_name="median")
7679
a sales score
77-
0 1 0.0 2.5
78-
1 2 0.0 3.0
79-
2 3 0.0 2.0
80+
0 1 NaN 2.5
81+
1 2 NaN 3.0
82+
2 3 NaN 2.0
8083
8184
:param df: A pandas DataFrame.
82-
:param column_name: The name of the column on which to impute values.
85+
:param column_names: The name of the column(s) on which to impute values.
8386
:param value: The value used for imputation, passed into `.fillna` method
8487
of the underlying pandas Series.
8588
:param statistic_column_name: The column statistic to impute.
@@ -90,42 +93,46 @@ def impute(
9093
`average`, `median`, `mode`, `minimum`, `min`, `maximum`, or `max`.
9194
"""
9295
# Firstly, we check that only one of `value` or `statistic` are provided.
96+
if (value is None) and (statistic_column_name is None):
97+
raise ValueError("Kindly specify a value or a statistic_column_name")
98+
9399
if value is not None and statistic_column_name is not None:
94100
raise ValueError(
95101
"Only one of `value` or `statistic_column_name` should be "
96102
"provided."
97103
)
98104

99-
# If statistic is provided, then we compute the relevant summary statistic
100-
# from the other data.
101-
funcs = {
102-
"mean": np.mean,
103-
"average": np.mean, # aliased
104-
"median": np.median,
105-
"mode": ss.mode,
106-
"minimum": np.min,
107-
"min": np.min, # aliased
108-
"maximum": np.max,
109-
"max": np.max, # aliased
110-
}
111-
if statistic_column_name is not None:
105+
column_names = get_index_labels([column_names], df, axis="columns")
106+
107+
if value is not None:
108+
value = dict(product(column_names, [value]))
109+
110+
else:
111+
# If statistic is provided, then we compute
112+
# the relevant summary statistic
113+
# from the other data.
114+
funcs = {
115+
"mean": "mean",
116+
"average": "mean", # aliased
117+
"median": "median",
118+
"mode": "mode",
119+
"minimum": "min",
120+
"min": "min", # aliased
121+
"maximum": "max",
122+
"max": "max", # aliased
123+
}
112124
# Check that the statistic keyword argument is one of the approved.
113125
if statistic_column_name not in funcs:
114126
raise KeyError(
115127
f"`statistic_column_name` must be one of {funcs.keys()}."
116128
)
117129

118-
value = funcs[statistic_column_name](
119-
df[column_name].dropna().to_numpy()
120-
)
121-
# special treatment for mode, because scipy stats mode returns a
122-
# moderesult object.
130+
value = dict(product(column_names, [funcs[statistic_column_name]]))
131+
132+
value = df.agg(value)
133+
134+
# special treatment for mode
123135
if statistic_column_name == "mode":
124-
value = value.mode[0]
136+
value = {key: val.at[0] for key, val in value.items()}
125137

126-
# The code is architected this way - if `value` is not provided but
127-
# statistic is, we then overwrite the None value taken on by `value`, and
128-
# use it to set the imputation column.
129-
if value is not None:
130-
df[column_name] = df[column_name].fillna(value)
131-
return df
138+
return df.fillna(value=value)

0 commit comments

Comments (0)