Skip to content

Commit 92251b5

Browse files
[DOC] MWE for label_encode, factorize_columns, encode_categorical (#1028)
* Add MWE for label_encode and factorize_columns
* Fix some formatting for encode_categorical
* Simplify encode_categorical logic slightly
* Fix formatting
* Make factorize_columns non-mutating
1 parent 270498e commit 92251b5

File tree

3 files changed

+85
-68
lines changed

3 files changed

+85
-68
lines changed

janitor/functions/encode_categorical.py

+14-17
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
1+
import warnings
2+
from enum import Enum
13
from typing import Hashable, Iterable, Union
4+
25
import pandas_flavor as pf
36
import pandas as pd
47
from pandas.api.types import is_list_like
5-
import warnings
8+
69
from janitor.utils import check, check_column, deprecated_alias
7-
from enum import Enum
810

911

1012
@pf.register_dataframe_method
@@ -90,17 +92,15 @@ def encode_categorical(
9092
>>> enc_df["foo"].cat.ordered
9193
True
9294
93-
94-
9595
:param df: A pandas DataFrame object.
9696
:param column_names: A column name or an iterable (list or tuple)
9797
of column names.
98-
:param kwargs: A mapping from column name to either `None`,
99-
`sort` or `appearance`, or a 1-D array. This is useful
98+
:param **kwargs: A mapping from column name to either `None`,
99+
`'sort'` or `'appearance'`, or a 1-D array. This is useful
100100
in creating categorical columns that are ordered, or
101101
if the user needs to explicitly specify the categories.
102102
:returns: A pandas DataFrame.
103-
:raises ValueError: if both `column_names` and `kwargs` are provided.
103+
:raises ValueError: If both `column_names` and `kwargs` are provided.
104104
""" # noqa: E501
105105

106106
if all((column_names, kwargs)):
@@ -112,13 +112,11 @@ def encode_categorical(
112112
# or user supplies specific categories to create the categorical
113113
if column_names is not None:
114114
check("column_names", column_names, [list, tuple, Hashable])
115-
if isinstance(column_names, (list, tuple)):
116-
check_column(df, column_names)
117-
dtypes = {col: "category" for col in column_names}
118-
return df.astype(dtypes)
119115
if isinstance(column_names, Hashable):
120-
check_column(df, [column_names])
121-
return df.astype({column_names: "category"})
116+
column_names = [column_names]
117+
check_column(df, column_names)
118+
dtypes = {col: "category" for col in column_names}
119+
return df.astype(dtypes)
122120

123121
return _computations_as_categorical(df, **kwargs)
124122

@@ -167,21 +165,20 @@ def _as_categorical_checks(df: pd.DataFrame, **kwargs) -> dict:
167165
This function raises errors if columns in `kwargs` are
168166
absent from the dataframe's columns.
169167
It also raises errors if the value in `kwargs`
170-
is not a string (`appearance` or `sort`), or a 1D array.
168+
is not a string (`'appearance'` or `'sort'`), or a 1D array.
171169
172170
This function is executed before proceeding to the computation phase.
173171
174172
If all checks pass, a dictionary of column names and value is returned.
175173
176174
:param df: The pandas DataFrame object.
177-
:param kwargs: A pairing of column name and value.
175+
:param **kwargs: A pairing of column name and value.
178176
:returns: A dictionary.
179177
:raises TypeError: If `value` is not a 1-D array, or a string.
180178
:raises ValueError: If `value` is a 1-D array, and contains nulls,
181179
or is non-unique.
182180
"""
183181

184-
# column checks
185182
check_column(df, kwargs)
186183

187184
categories_dict = {}
@@ -255,7 +252,7 @@ def _as_categorical_checks(df: pd.DataFrame, **kwargs) -> dict:
255252
category_order_types = {ent.value for ent in _CategoryOrder}
256253
if value.lower() not in category_order_types:
257254
raise ValueError(
258-
"argument should be one of `appearance` or `sort`."
255+
"Argument should be one of 'appearance' or 'sort'."
259256
)
260257

261258
categories_dict[column_name] = value
janitor/functions/factorize_columns.py

+32-32
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
"""Implementation of the `factorize_columns` function"""
12
from typing import Hashable, Iterable, Union
23
import pandas_flavor as pf
34
import pandas as pd
@@ -13,52 +14,51 @@ def factorize_columns(
1314
**kwargs,
1415
) -> pd.DataFrame:
1516
"""
16-
Converts labels into numerical data
17+
Converts labels into numerical data.
1718
1819
This method will create a new column with the string `_enc` appended
1920
after the original column's name.
2021
This can be overridden with the suffix parameter.
2122
22-
Internally this method uses pandas `factorize` method.
23+
Internally, this method uses pandas `factorize` method.
2324
It also takes an optional suffix and keyword arguments.
2425
An empty string as suffix will override the existing column.
2526
26-
This method mutates the original DataFrame.
27+
This method does not mutate the original DataFrame.
2728
28-
Functional usage syntax:
29+
Example:
2930
30-
```python
31-
df = factorize_columns(
32-
df,
33-
column_names="my_categorical_column",
34-
suffix="_enc"
35-
) # one way
36-
```
37-
38-
Method chaining syntax:
39-
40-
```python
41-
import pandas as pd
42-
import janitor
43-
categorical_cols = ['col1', 'col2', 'col4']
44-
df = (
45-
pd.DataFrame(...)
46-
.factorize_columns(
47-
column_names=categorical_cols,
48-
suffix="_enc"
49-
)
50-
)
51-
```
31+
>>> import pandas as pd
32+
>>> import janitor
33+
>>> df = pd.DataFrame({
34+
... "foo": ["b", "b", "a", "c", "b"],
35+
... "bar": range(4, 9),
36+
... })
37+
>>> df
38+
foo bar
39+
0 b 4
40+
1 b 5
41+
2 a 6
42+
3 c 7
43+
4 b 8
44+
>>> df.factorize_columns(column_names="foo")
45+
foo bar foo_enc
46+
0 b 4 0
47+
1 b 5 0
48+
2 a 6 1
49+
3 c 7 2
50+
4 b 8 0
5251
5352
:param df: The pandas DataFrame object.
54-
:param column_names: A column name or an iterable (list
55-
or tuple) of column names.
56-
:param suffix: Suffix to be used for the new column. Default value is _enc.
57-
An empty string suffix means, it will override the existing column
53+
:param column_names: A column name or an iterable (list or tuple) of
54+
column names.
55+
:param suffix: Suffix to be used for the new column.
56+
An empty string suffix means it will override the existing column.
5857
:param **kwargs: Keyword arguments. It takes any of the keyword arguments,
59-
which the pandas factorize method takes like sort,na_sentinel,size_hint
58+
which the pandas factorize method takes like `sort`, `na_sentinel`,
59+
`size_hint`.
6060
6161
:returns: A pandas DataFrame.
6262
"""
63-
df = _factorize(df, column_names, suffix, **kwargs)
63+
df = _factorize(df.copy(), column_names, suffix, **kwargs)
6464
return df

janitor/functions/label_encode.py

+39-19
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
"""Implementation of `label_encode` function"""
12
from typing import Hashable, Iterable, Union
23
import warnings
34
import pandas_flavor as pf
@@ -10,43 +11,62 @@
1011
@pf.register_dataframe_method
1112
@deprecated_alias(columns="column_names")
1213
def label_encode(
13-
df: pd.DataFrame, column_names: Union[str, Iterable[str], Hashable]
14+
df: pd.DataFrame,
15+
column_names: Union[str, Iterable[str], Hashable],
1416
) -> pd.DataFrame:
1517
"""
1618
Convert labels into numerical data.
1719
1820
This method will create a new column with the string `_enc` appended
19-
after the original column's name. Consider this to be syntactic sugar.
21+
after the original column's name.
22+
Consider this to be syntactic sugar.
23+
This function uses the `factorize` pandas function under the hood.
2024
21-
This method behaves differently from `encode_categorical`. This method
22-
creates a new column of numeric data. `encode_categorical` replaces the
23-
dtype of the original column with a *categorical* dtype.
25+
This method behaves differently from
26+
[`encode_categorical`][janitor.functions.encode_categorical.encode_categorical].
27+
This method creates a new column of numeric data.
28+
[`encode_categorical`][janitor.functions.encode_categorical.encode_categorical]
29+
replaces the dtype of the original column with a *categorical* dtype.
2430
2531
This method mutates the original DataFrame.
2632
27-
Functional usage syntax:
33+
Example:
2834
29-
```python
30-
df = label_encode(df, column_names="my_categorical_column") # one way
31-
```
35+
>>> import pandas as pd
36+
>>> import janitor
37+
>>> df = pd.DataFrame({
38+
... "foo": ["b", "b", "a", "c", "b"],
39+
... "bar": range(4, 9),
40+
... })
41+
>>> df
42+
foo bar
43+
0 b 4
44+
1 b 5
45+
2 a 6
46+
3 c 7
47+
4 b 8
48+
>>> df.label_encode(column_names="foo")
49+
foo bar foo_enc
50+
0 b 4 0
51+
1 b 5 0
52+
2 a 6 1
53+
3 c 7 2
54+
4 b 8 0
3255
33-
Method chaining syntax:
56+
!!!note
3457
35-
```python
36-
import pandas as pd
37-
import janitor
38-
categorical_cols = ['col1', 'col2', 'col4']
39-
df = pd.DataFrame(...).label_encode(column_names=categorical_cols)
40-
```
58+
This function will be deprecated in a 1.x release.
59+
Please use [`factorize_columns`][janitor.functions.factorize_columns.factorize_columns]
60+
instead.
4161
4262
:param df: The pandas DataFrame object.
4363
:param column_names: A column name or an iterable (list
4464
or tuple) of column names.
4565
:returns: A pandas DataFrame.
46-
"""
66+
""" # noqa: E501
4767
warnings.warn(
48-
"label_encode will be deprecated in a 1.x release. \
49-
Please use factorize_columns instead"
68+
"`label_encode` will be deprecated in a 1.x release. "
69+
"Please use `factorize_columns` instead."
5070
)
5171
df = _factorize(df, column_names, "_enc")
5272
return df

0 commit comments

Comments
 (0)