1
+ import warnings
2
+ from enum import Enum
1
3
from typing import Hashable , Iterable , Union
4
+
2
5
import pandas_flavor as pf
3
6
import pandas as pd
4
7
from pandas .api .types import is_list_like
5
- import warnings
8
+
6
9
from janitor .utils import check , check_column , deprecated_alias
7
- from enum import Enum
8
10
9
11
10
12
@pf .register_dataframe_method
@@ -24,80 +26,79 @@ def encode_categorical(
24
26
25
27
Note: In versions < 0.20.11, this method mutates the original DataFrame.
26
28
29
+ If `column_names` is specified, these column(s) are cast to unordered categorical
30
+ dtypes.
31
+
27
32
If `categories` is `None` in the `kwargs` tuple, then the
28
33
values for `categories` are inferred from the column;
29
34
if `order` is `None`, then the values for `categories` are applied unordered.
30
35
31
36
`column_names` and `kwargs` parameters cannot be used at the same time.
32
37
33
- Functional usage syntax:
34
-
35
- ```python
36
- import pandas as pd
37
- import janitor as jn
38
- ```
39
-
40
- - With `column_names`
41
-
42
- ```python
43
- categorical_cols = ['col1', 'col2', 'col4']
44
- df = jn.encode_categorical(
45
- df,
46
- columns = categorical_cols) # one way
47
- ```
48
-
49
- - With `kwargs`
50
-
51
- ```python
52
- df = jn.encode_categorical(
53
- df,
54
- col1 = (categories, order),
55
- col2 = (categories = [values],
56
- order="sort" # or "appearance" or None
57
-
58
- )
59
- ```
60
-
61
- Method chaining syntax:
62
-
63
- - With `column_names`
64
-
65
- ```python
66
- categorical_cols = ['col1', 'col2', 'col4']
67
- df = (pd.DataFrame(...)
68
- .encode_categorical(columns=categorical_cols)
69
- )
70
- ```
71
-
72
- - With `kwargs`
73
-
74
- ```python
75
- df = (
76
- pd.DataFrame(...)
77
- .encode_categorical(
78
- col1 = (categories, order),
79
- col2 = (categories = [values]/None,
80
- order="sort" # or "appearance" or None
81
- )
82
- )
83
- ```
38
+ Example: Using `column_names`
39
+
40
+ >>> import pandas as pd
41
+ >>> import janitor
42
+ >>> df = pd.DataFrame({
43
+ ... "foo": ["b", "b", "a", "c", "b"],
44
+ ... "bar": range(4, 9),
45
+ ... })
46
+ >>> df
47
+ foo bar
48
+ 0 b 4
49
+ 1 b 5
50
+ 2 a 6
51
+ 3 c 7
52
+ 4 b 8
53
+ >>> df.dtypes
54
+ foo object
55
+ bar int64
56
+ dtype: object
57
+ >>> enc_df = df.encode_categorical(column_names="foo")
58
+ >>> enc_df.dtypes
59
+ foo category
60
+ bar int64
61
+ dtype: object
62
+ >>> enc_df["foo"].cat.categories
63
+ Index(['a', 'b', 'c'], dtype='object')
64
+ >>> enc_df["foo"].cat.ordered
65
+ False
66
+
67
+ Example: Using `kwargs` to specify an ordered categorical.
68
+
69
+ >>> import pandas as pd
70
+ >>> import janitor
71
+ >>> df = pd.DataFrame({
72
+ ... "foo": ["b", "b", "a", "c", "b"],
73
+ ... "bar": range(4, 9),
74
+ ... })
75
+ >>> df.dtypes
76
+ foo object
77
+ bar int64
78
+ dtype: object
79
+ >>> enc_df = df.encode_categorical(foo=(None, "appearance"))
80
+ >>> enc_df.dtypes
81
+ foo category
82
+ bar int64
83
+ dtype: object
84
+ >>> enc_df["foo"].cat.categories
85
+ Index(['b', 'a', 'c'], dtype='object')
86
+ >>> enc_df["foo"].cat.ordered
87
+ True
84
88
85
89
:param df: The pandas DataFrame object.
86
90
:param column_names: A column name or an iterable (list or tuple)
87
91
of column names.
88
- :param kwargs: A pairing of column name to a tuple of (`categories`, `order`).
89
- This is useful in creating categorical columns that are ordered, or
90
- if the user needs to explicitly specify the categories .
92
+ :param kwargs: A mapping from column name to a tuple of (`categories`, `order`).
93
+ `categories` should be list-like (specifying the categories explicitly) or `None`.
94
+ `order` is only allowed to be `'sort'`, `'appearance'` or `None` .
91
95
:returns: A pandas DataFrame.
92
- :raises ValueError: if both `` column_names`` and `` kwargs` ` are provided.
96
+ :raises ValueError: If both `column_names` and `kwargs` are provided.
93
97
""" # noqa: E501
94
98
95
99
if all ((column_names , kwargs )):
96
100
raise ValueError (
97
- """
98
- Only one of `column_names` or `kwargs`
99
- can be provided.
100
- """
101
+ "Only one of `column_names` or `kwargs` can be provided."
101
102
)
102
103
# column_names only handles the plain (unordered) category dtype
103
104
# kwargs takes care of scenarios where user wants an ordered category
@@ -125,7 +126,7 @@ def _computations_as_categorical(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
125
126
The defaults for the tuple are (None, None)
126
127
and will return a categorical dtype
127
128
with no order and categories inferred from the column.
128
- A DataFrame, with catetorical columns, is returned.
129
+ A DataFrame, with categorical columns, is returned.
129
130
"""
130
131
131
132
categories_dict = _as_categorical_checks (df , ** kwargs )
@@ -136,10 +137,10 @@ def _computations_as_categorical(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
136
137
cat ,
137
138
order ,
138
139
) in categories_dict .items ():
139
- error_msg = f"""
140
- Kindly ensure there is at least
141
- one non-null value in { column_name } .
142
- """
140
+ error_msg = (
141
+ " Kindly ensure there is at least one non-null value in "
142
+ f" { column_name } ."
143
+ )
143
144
if (cat is None ) and (order is None ):
144
145
cat_dtype = pd .CategoricalDtype ()
145
146
@@ -182,30 +183,24 @@ def _as_categorical_checks(df: pd.DataFrame, **kwargs) -> dict:
182
183
:param kwargs: A pairing of column name
183
184
to a tuple of (`categories`, `order`).
184
185
:returns: A dictionary.
185
- :raises TypeError: if the value in `` kwargs` ` is not a tuple.
186
- :raises ValueError: if `` categories` ` is not a 1-D array.
187
- :raises ValueError: if `` order` ` is not one of
186
+ :raises TypeError: If the value in `kwargs` is not a tuple.
187
+ :raises ValueError: If `categories` is not a 1-D array.
188
+ :raises ValueError: If `order` is not one of
188
189
`sort`, `appearance`, or `None`.
189
190
"""
190
191
191
- # column checks
192
192
check_column (df , kwargs )
193
193
194
194
categories_dict = {}
195
195
196
196
for column_name , value in kwargs .items ():
197
- # type check
198
197
check ("Pair of `categories` and `order`" , value , [tuple ])
199
198
200
199
len_value = len (value )
201
-
202
200
if len_value != 2 :
203
201
raise ValueError (
204
- f"""
205
- The tuple of (categories, order) for { column_name }
206
- should be length 2; the tuple provided is
207
- length { len_value } .
208
- """
202
+ f"The tuple of (categories, order) for { column_name } should "
203
+ f"be length 2; the tuple provided is length { len_value } ."
209
204
)
210
205
211
206
cat , order = value
@@ -221,10 +216,8 @@ def _as_categorical_checks(df: pd.DataFrame, **kwargs) -> dict:
221
216
arr_ndim = checker .ndim
222
217
if (arr_ndim != 1 ) or isinstance (checker , pd .MultiIndex ):
223
218
raise ValueError (
224
- f"""
225
- { cat } is not a 1-D array.
226
- Kindly provide a 1-D array-like object to `categories`.
227
- """
219
+ f"{ cat } is not a 1-D array. Kindly provide a 1-D "
220
+ "array-like object to `categories`."
228
221
)
229
222
230
223
if not isinstance (checker , (pd .Series , pd .Index )):
@@ -237,51 +230,38 @@ def _as_categorical_checks(df: pd.DataFrame, **kwargs) -> dict:
237
230
238
231
if not checker .is_unique :
239
232
raise ValueError (
240
- """
241
- Kindly provide unique,
242
- non-null values for `categories`.
243
- """
233
+ "Kindly provide unique, non-null values for `categories`."
244
234
)
245
235
246
236
if checker .empty :
247
237
raise ValueError (
248
- """
249
- Kindly ensure there is at least
250
- one non-null value in `categories`.
251
- """
238
+ "Kindly ensure there is at least one non-null value in "
239
+ "`categories`."
252
240
)
253
241
254
242
# uniques, without nulls
255
243
uniques = df [column_name ].factorize (sort = False )[- 1 ]
256
244
if uniques .empty :
257
245
raise ValueError (
258
- f"""
259
- Kindly ensure there is at least
260
- one non-null value in { column_name } .
261
- """
246
+ "Kindly ensure there is at least one non-null value "
247
+ f"in { column_name } ."
262
248
)
263
249
264
250
missing = uniques .difference (checker , sort = False )
265
251
if not missing .empty and (uniques .size > missing .size ):
266
252
warnings .warn (
267
- f"""
268
- Values { tuple (missing )} are missing from
269
- the provided categories { cat }
270
- for { column_name } ; this may create nulls
271
- in the new categorical column.
272
- """ ,
253
+ f"Values {tuple(missing)} are missing from the provided "
254
+ f"categories { cat } for { column_name } ; this may create "
255
+ "nulls in the new categorical column." ,
273
256
UserWarning ,
274
257
stacklevel = 2 ,
275
258
)
276
259
277
260
elif uniques .equals (missing ):
278
261
warnings .warn (
279
- f"""
280
- None of the values in { column_name } are in
281
- { cat } ;
282
- this might create nulls for all values
283
- in the new categorical column.
284
- """ ,
262
+ f"None of the values in { column_name } are in { cat } ; "
263
+ "this might create nulls for all values in the new "
264
+ "categorical column." ,
285
265
UserWarning ,
286
266
stacklevel = 2 ,
287
267
)
@@ -292,10 +272,8 @@ def _as_categorical_checks(df: pd.DataFrame, **kwargs) -> dict:
292
272
category_order_types = [ent .value for ent in _CategoryOrder ]
293
273
if order .lower () not in category_order_types :
294
274
raise ValueError (
295
- """
296
- `order` argument should be one of
297
- "appearance", "sort" or `None`.
298
- """
275
+ "`order` argument should be one of 'appearance', "
276
+ "'sort' or `None`."
299
277
)
300
278
301
279
categories_dict [column_name ] = value
0 commit comments