Skip to content

Commit 7870879

Browse files
committed
Add MWE for encode_categorical; fixed formatting of error messages
1 parent 6c5c4d6 commit 7870879

File tree

1 file changed

+87
-109
lines changed

1 file changed

+87
-109
lines changed

janitor/functions/encode_categorical.py

+87-109
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
1+
import warnings
2+
from enum import Enum
13
from typing import Hashable, Iterable, Union
4+
25
import pandas_flavor as pf
36
import pandas as pd
47
from pandas.api.types import is_list_like
5-
import warnings
8+
69
from janitor.utils import check, check_column, deprecated_alias
7-
from enum import Enum
810

911

1012
@pf.register_dataframe_method
@@ -24,80 +26,79 @@ def encode_categorical(
2426
2527
Note: In versions < 0.20.11, this method mutates the original DataFrame.
2628
29+
If `column_names` is specified, these column(s) are cast to unordered categorical
30+
dtypes.
31+
2732
If `categories` is `None` in the `kwargs` tuple, then the
2833
values for `categories` are inferred from the column;
2934
if `order` is `None`, then the values for `categories` are applied unordered.
3035
3136
`column_names` and `kwargs` parameters cannot be used at the same time.
3237
33-
Functional usage syntax:
34-
35-
```python
36-
import pandas as pd
37-
import janitor as jn
38-
```
39-
40-
- With `column_names`
41-
42-
```python
43-
categorical_cols = ['col1', 'col2', 'col4']
44-
df = jn.encode_categorical(
45-
df,
46-
columns = categorical_cols) # one way
47-
```
48-
49-
- With `kwargs`
50-
51-
```python
52-
df = jn.encode_categorical(
53-
df,
54-
col1 = (categories, order),
55-
col2 = (categories = [values],
56-
order="sort" # or "appearance" or None
57-
58-
)
59-
```
60-
61-
Method chaining syntax:
62-
63-
- With `column_names`
64-
65-
```python
66-
categorical_cols = ['col1', 'col2', 'col4']
67-
df = (pd.DataFrame(...)
68-
.encode_categorical(columns=categorical_cols)
69-
)
70-
```
71-
72-
- With `kwargs`
73-
74-
```python
75-
df = (
76-
pd.DataFrame(...)
77-
.encode_categorical(
78-
col1 = (categories, order),
79-
col2 = (categories = [values]/None,
80-
order="sort" # or "appearance" or None
81-
)
82-
)
83-
```
38+
Example: Using `column_names`
39+
40+
>>> import pandas as pd
41+
>>> import janitor
42+
>>> df = pd.DataFrame({
43+
... "foo": ["b", "b", "a", "c", "b"],
44+
... "bar": range(4, 9),
45+
... })
46+
>>> df
47+
foo bar
48+
0 b 4
49+
1 b 5
50+
2 a 6
51+
3 c 7
52+
4 b 8
53+
>>> df.dtypes
54+
foo object
55+
bar int64
56+
dtype: object
57+
>>> enc_df = df.encode_categorical(column_names="foo")
58+
>>> enc_df.dtypes
59+
foo category
60+
bar int64
61+
dtype: object
62+
>>> enc_df["foo"].cat.categories
63+
Index(['a', 'b', 'c'], dtype='object')
64+
>>> enc_df["foo"].cat.ordered
65+
False
66+
67+
Example: Using `kwargs` to specify an ordered categorical.
68+
69+
>>> import pandas as pd
70+
>>> import janitor
71+
>>> df = pd.DataFrame({
72+
... "foo": ["b", "b", "a", "c", "b"],
73+
... "bar": range(4, 9),
74+
... })
75+
>>> df.dtypes
76+
foo object
77+
bar int64
78+
dtype: object
79+
>>> enc_df = df.encode_categorical(foo=(None, "appearance"))
80+
>>> enc_df.dtypes
81+
foo category
82+
bar int64
83+
dtype: object
84+
>>> enc_df["foo"].cat.categories
85+
Index(['b', 'a', 'c'], dtype='object')
86+
>>> enc_df["foo"].cat.ordered
87+
True
8488
8589
:param df: The pandas DataFrame object.
8690
:param column_names: A column name or an iterable (list or tuple)
8791
of column names.
88-
:param kwargs: A pairing of column name to a tuple of (`categories`, `order`).
89-
This is useful in creating categorical columns that are ordered, or
90-
if the user needs to explicitly specify the categories.
92+
:param kwargs: A mapping from column name to a tuple of (`categories`, `order`).
93+
`categories` should be list-like (specifying the categories explicitly) or `None`.
94+
`order` is only allowed to be `'sort'`, `'appearance'` or `None`.
9195
:returns: A pandas DataFrame.
92-
:raises ValueError: if both ``column_names`` and ``kwargs`` are provided.
96+
:raises ValueError: If both `column_names` and `kwargs` are provided.
9397
""" # noqa: E501
9498

9599
if all((column_names, kwargs)):
96100
raise ValueError(
97-
"""
98-
Only one of `column_names` or `kwargs`
99-
can be provided.
100-
"""
101+
"Only one of `column_names` or `kwargs` can be provided."
101102
)
102103
# column_names deals only with the (unordered) category dtype
103104
# kwargs takes care of scenarios where the user wants an ordered category
@@ -125,7 +126,7 @@ def _computations_as_categorical(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
125126
The defaults for the tuple are (None, None)
126127
and will return a categorical dtype
127128
with no order and categories inferred from the column.
128-
A DataFrame, with catetorical columns, is returned.
129+
A DataFrame, with categorical columns, is returned.
129130
"""
130131

131132
categories_dict = _as_categorical_checks(df, **kwargs)
@@ -136,10 +137,10 @@ def _computations_as_categorical(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
136137
cat,
137138
order,
138139
) in categories_dict.items():
139-
error_msg = f"""
140-
Kindly ensure there is at least
141-
one non-null value in {column_name}.
142-
"""
140+
error_msg = (
141+
"Kindly ensure there is at least one non-null value in "
142+
f"{column_name}."
143+
)
143144
if (cat is None) and (order is None):
144145
cat_dtype = pd.CategoricalDtype()
145146

@@ -182,30 +183,24 @@ def _as_categorical_checks(df: pd.DataFrame, **kwargs) -> dict:
182183
:param kwargs: A pairing of column name
183184
to a tuple of (`categories`, `order`).
184185
:returns: A dictionary.
185-
:raises TypeError: if the value in ``kwargs`` is not a tuple.
186-
:raises ValueError: if ``categories`` is not a 1-D array.
187-
:raises ValueError: if ``order`` is not one of
186+
:raises TypeError: If the value in `kwargs` is not a tuple.
187+
:raises ValueError: If `categories` is not a 1-D array.
188+
:raises ValueError: If `order` is not one of
188189
`sort`, `appearance`, or `None`.
189190
"""
190191

191-
# column checks
192192
check_column(df, kwargs)
193193

194194
categories_dict = {}
195195

196196
for column_name, value in kwargs.items():
197-
# type check
198197
check("Pair of `categories` and `order`", value, [tuple])
199198

200199
len_value = len(value)
201-
202200
if len_value != 2:
203201
raise ValueError(
204-
f"""
205-
The tuple of (categories, order) for {column_name}
206-
should be length 2; the tuple provided is
207-
length {len_value}.
208-
"""
202+
f"The tuple of (categories, order) for {column_name} should "
203+
f"be length 2; the tuple provided is length {len_value}."
209204
)
210205

211206
cat, order = value
@@ -221,10 +216,8 @@ def _as_categorical_checks(df: pd.DataFrame, **kwargs) -> dict:
221216
arr_ndim = checker.ndim
222217
if (arr_ndim != 1) or isinstance(checker, pd.MultiIndex):
223218
raise ValueError(
224-
f"""
225-
{cat} is not a 1-D array.
226-
Kindly provide a 1-D array-like object to `categories`.
227-
"""
219+
f"{cat} is not a 1-D array. Kindly provide a 1-D "
220+
"array-like object to `categories`."
228221
)
229222

230223
if not isinstance(checker, (pd.Series, pd.Index)):
@@ -237,51 +230,38 @@ def _as_categorical_checks(df: pd.DataFrame, **kwargs) -> dict:
237230

238231
if not checker.is_unique:
239232
raise ValueError(
240-
"""
241-
Kindly provide unique,
242-
non-null values for `categories`.
243-
"""
233+
"Kindly provide unique, non-null values for `categories`."
244234
)
245235

246236
if checker.empty:
247237
raise ValueError(
248-
"""
249-
Kindly ensure there is at least
250-
one non-null value in `categories`.
251-
"""
238+
"Kindly ensure there is at least one non-null value in "
239+
"`categories`."
252240
)
253241

254242
# uniques, without nulls
255243
uniques = df[column_name].factorize(sort=False)[-1]
256244
if uniques.empty:
257245
raise ValueError(
258-
f"""
259-
Kindly ensure there is at least
260-
one non-null value in {column_name}.
261-
"""
246+
"Kindly ensure there is at least one non-null value "
247+
f"in {column_name}."
262248
)
263249

264250
missing = uniques.difference(checker, sort=False)
265251
if not missing.empty and (uniques.size > missing.size):
266252
warnings.warn(
267-
f"""
268-
Values {tuple(missing)} are missing from
269-
the provided categories {cat}
270-
for {column_name}; this may create nulls
271-
in the new categorical column.
272-
""",
253+
f"Values {tuple(missing)} are missing from the provided "
254+
f"categories {cat} for {column_name}; this may create "
255+
"nulls in the new categorical column.",
273256
UserWarning,
274257
stacklevel=2,
275258
)
276259

277260
elif uniques.equals(missing):
278261
warnings.warn(
279-
f"""
280-
None of the values in {column_name} are in
281-
{cat};
282-
this might create nulls for all values
283-
in the new categorical column.
284-
""",
262+
f"None of the values in {column_name} are in {cat}; "
263+
"this might create nulls for all values in the new "
264+
"categorical column.",
285265
UserWarning,
286266
stacklevel=2,
287267
)
@@ -292,10 +272,8 @@ def _as_categorical_checks(df: pd.DataFrame, **kwargs) -> dict:
292272
category_order_types = [ent.value for ent in _CategoryOrder]
293273
if order.lower() not in category_order_types:
294274
raise ValueError(
295-
"""
296-
`order` argument should be one of
297-
"appearance", "sort" or `None`.
298-
"""
275+
"`order` argument should be one of 'appearance', "
276+
"'sort' or `None`."
299277
)
300278

301279
categories_dict[column_name] = value

0 commit comments

Comments
 (0)