-
Notifications
You must be signed in to change notification settings - Fork 171
/
Copy pathclean_names.py
273 lines (243 loc) · 9.04 KB
/
clean_names.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
"""Functions for cleaning columns/index names and/or column values."""
from __future__ import annotations
import unicodedata
import pandas as pd
import pandas_flavor as pf
from pandas.api.types import is_scalar
from janitor.errors import JanitorError
from janitor.functions.utils import _is_str_or_cat, get_index_labels
from janitor.utils import deprecated_alias
@pf.register_dataframe_method
@deprecated_alias(preserve_original_columns="preserve_original_labels")
def clean_names(
    df: pd.DataFrame,
    axis: str = "columns",
    column_names: str | list = None,
    strip_underscores: str | bool = None,
    case_type: str = "lower",
    remove_special: bool = False,
    strip_accents: bool = True,
    preserve_original_labels: bool = True,
    enforce_string: bool = True,
    truncate_limit: int = None,
) -> pd.DataFrame:
    """Clean column/index names. It can also be applied to column values.

    Takes all column names, converts them to lowercase,
    then replaces all spaces with underscores.

    By default, column names are converted to string types.
    This can be switched off by passing in `enforce_string=False`.

    This method does not mutate the original DataFrame.

    Examples:
        >>> import pandas as pd
        >>> import janitor
        >>> df = pd.DataFrame(
        ...     {
        ...         "Aloha": range(3),
        ...         "Bell Chart": range(3),
        ...         "Animals@#$%^": range(3)
        ...     }
        ... )
        >>> df
           Aloha  Bell Chart  Animals@#$%^
        0      0           0             0
        1      1           1             1
        2      2           2             2
        >>> df.clean_names()
           aloha  bell_chart  animals@#$%^
        0      0           0             0
        1      1           1             1
        2      2           2             2
        >>> df.clean_names(remove_special=True)
           aloha  bell_chart  animals
        0      0           0        0
        1      1           1        1
        2      2           2        2

    !!! summary "Version Changed"

        - 0.26.0
            - Added `axis` and `column_names` parameters.

    Args:
        df: The pandas DataFrame object.
        axis: Whether to clean the labels on the index or columns.
            If `None`, applies to a defined column
            or columns in `column_names`.
        column_names: Clean the values in a column.
            `axis` should be `None`.
            Column selection is possible using the
            [`select`][janitor.functions.select.select] syntax.
            Falsy scalar labels such as `0` are accepted.
        strip_underscores: Removes the outer underscores from all
            column names/values. Default None keeps outer underscores.
            Values can be either 'left', 'right' or 'both'
            or the respective shorthand 'l',
            'r' and True.
        case_type: Whether to make columns lower or uppercase.
            Current case may be preserved with 'preserve',
            while snake case conversion (from CamelCase or camelCase only)
            can be turned on using "snake".
            Default 'lower' makes all characters lowercase.
        remove_special: Remove special characters from columns.
            Only letters, numbers and underscores are preserved.
        strip_accents: Whether or not to remove accents from
            columns names/values.
        preserve_original_labels: Preserve original names.
            This is later retrievable using `df.original_labels`.
            Applies if `axis` is not None.
        enforce_string: Whether or not to convert all
            column names/values to string type.
            Defaults to True, but can be turned off.
            For a MultiIndex, every level is cleaned separately
            with the same settings.
        truncate_limit: Truncates formatted column names/values
            to the specified length.
            Default None does not truncate.

    Raises:
        ValueError: If `axis=None` and `column_names=None`
            (or `column_names` is an empty selection).
        AssertionError: If `axis` is neither `None`, `"index"`
            nor `"columns"`.

    Returns:
        A pandas DataFrame.
    """
    # A falsy *scalar* such as 0, False or "" is a legitimate column label,
    # so only None or an empty non-scalar selection counts as "missing".
    # The check stays inside the condition so `column_names` is never
    # truth-tested when `axis` is set (it is ignored in that case).
    if not axis and (
        column_names is None
        or (not is_scalar(column_names) and not column_names)
    ):
        raise ValueError(
            "Kindly provide an argument to `column_names`, if axis is None."
        )

    # Every call into `_clean_names` shares the same cleaning settings.
    cleaning_options = {
        "enforce_string": enforce_string,
        "case_type": case_type,
        "remove_special": remove_special,
        "strip_accents": strip_accents,
        "strip_underscores": strip_underscores,
        "truncate_limit": truncate_limit,
    }

    if axis is None:
        # Clean the *values* of the selected column(s), not the labels.
        column_names = get_index_labels(
            arg=column_names, df=df, axis="columns"
        )
        if is_scalar(column_names):
            column_names = [column_names]
        df = df.copy()
        for column_name in column_names:
            df[column_name] = _clean_names(
                obj=df[column_name], **cleaning_options
            )
        return df

    # Kept as an assert so the exception type seen by callers is unchanged.
    assert axis in {"index", "columns"}
    df = df[:]
    original_axis = getattr(df, axis)
    if isinstance(original_axis, pd.MultiIndex):
        # Clean each level on its own; assigning the resulting list of
        # per-level labels back to the axis is left to pandas.
        levels = [
            original_axis.get_level_values(number)
            for number in range(original_axis.nlevels)
        ]
        target_axis = [
            _clean_names(obj=level, **cleaning_options) for level in levels
        ]
    else:
        target_axis = _clean_names(obj=original_axis, **cleaning_options)
    # Store the original column names, if enabled by user
    if preserve_original_labels:
        df.__dict__["original_labels"] = getattr(df, axis)
    setattr(df, axis, target_axis)
    return df
def _clean_names(
    obj: pd.Index | pd.Series,
    strip_underscores: str | bool,
    case_type: str,
    remove_special: bool,
    strip_accents: bool,
    enforce_string: bool,
    truncate_limit: int,
) -> pd.Index | pd.Series:
    """
    Generic function to clean labels in a pandas object.

    The steps run in this order: optional cast to string, case change,
    separator normalization, optional accent stripping, optional removal
    of special characters, collapsing of repeated underscores, optional
    stripping of outer underscores and optional truncation.

    Args:
        obj: The Index or Series whose labels/values are cleaned.
        strip_underscores: Which outer underscores to strip;
            forwarded to `_strip_underscores_func`.
        case_type: Case conversion to apply; forwarded to `_change_case`.
        remove_special: If True, drop every character that is not an
            ASCII letter, a digit or an underscore.
        strip_accents: If True, remove accents via `_strip_accents`.
        enforce_string: If True, cast `obj` to `str` unless
            `_is_str_or_cat` reports it as string/categorical already.
        truncate_limit: If truthy, keep only this many leading characters.

    Returns:
        The cleaned Index or Series.
    """
    if enforce_string and not _is_str_or_cat(obj):
        obj = obj.astype(str)
    obj = _change_case(obj=obj, case_type=case_type)
    obj = _normalize_1(obj=obj)
    # Accents must be stripped *before* special characters are removed:
    # the removal pattern only keeps ASCII letters, so an accented letter
    # would otherwise be deleted outright ("café" -> "caf") instead of
    # being reduced to its base letter ("café" -> "cafe").
    if strip_accents:
        obj = _strip_accents(obj=obj)
    if remove_special:
        obj = obj.str.replace(
            pat="[^A-Za-z_\\d]", repl="", regex=True
        ).str.strip()
    # Collapse runs of underscores produced by the replacements above.
    obj = obj.str.replace(pat="_+", repl="_", regex=True)
    obj = _strip_underscores_func(
        obj,
        strip_underscores=strip_underscores,
    )
    if truncate_limit:
        obj = obj.str[:truncate_limit]
    return obj
def _change_case(
    obj: pd.Index | pd.Series,
    case_type: str,
) -> pd.Index | pd.Series:
    """Convert the labels in `obj` to the requested case.

    Args:
        obj: The Index or Series of labels.
        case_type: One of "preserve", "upper", "lower" or "snake",
            matched case-insensitively.

    Raises:
        JanitorError: If `case_type` is not one of the supported options.

    Returns:
        `obj` itself for "preserve", otherwise the converted labels.
    """
    case_types = {"preserve", "upper", "lower", "snake"}
    requested = case_type.lower()
    if requested not in case_types:
        raise JanitorError(f"case_type must be one of: {case_types}")

    if requested == "snake":
        # Implementation taken from: https://gist.github.com/jaytaylor/3660565
        # by @jtaylor
        # First split before a capitalized word, then between a
        # lowercase letter/digit and a following capital.
        words_split = obj.str.replace(
            pat=r"(.)([A-Z][a-z]+)", repl=r"\1_\2", regex=True
        )
        boundaries_split = words_split.str.replace(
            pat=r"([a-z0-9])([A-Z])", repl=r"\1_\2", regex=True
        )
        return boundaries_split.str.lower()
    if requested == "preserve":
        return obj
    return obj.str.upper() if requested == "upper" else obj.str.lower()
def _normalize_1(obj: pd.Index | pd.Series) -> pd.Index | pd.Series:
    """Normalize separators and apostrophes in the labels of `obj`.

    Spaces and the characters ``/ : , ? ( ) . -`` become underscores,
    straight and curly apostrophes are removed, and non-breaking spaces
    become underscores.
    """
    separators_replaced = obj.str.replace(
        pat=r"[ /:,?()\.-]", repl="_", regex=True
    )
    apostrophes_removed = separators_replaced.str.replace(
        pat=r"['’]", repl="", regex=True
    )
    return apostrophes_removed.str.replace(
        pat=r"[\xa0]", repl="_", regex=True
    )
def _strip_accents(
    obj: pd.Index | pd.Series,
) -> pd.Index | pd.Series:
    """Remove accents from every label in `obj`.

    Each element is converted with `str`, decomposed with Unicode NFD
    normalization, and rebuilt without its combining characters.

    Inspired from [StackOverflow][so].

    [so]: https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-strin
    """  # noqa: E501

    def _drop_combining_marks(label):
        decomposed = unicodedata.normalize("NFD", str(label))
        return "".join(
            [char for char in decomposed if not unicodedata.combining(char)]
        )

    return obj.map(_drop_combining_marks)
def _strip_underscores_func(
    obj: pd.Index | pd.Series,
    strip_underscores: str | bool = None,
) -> pd.Index | pd.Series:
    """Strip leading and/or trailing underscores from the labels in `obj`.

    Args:
        obj: The Index or Series of labels.
        strip_underscores: "left"/"l" strips leading underscores,
            "right"/"r" strips trailing ones, "both"/True strips both
            sides, and None leaves the labels untouched.

    Raises:
        JanitorError: If `strip_underscores` is not a supported option.

    Returns:
        The labels with the requested underscores removed.
    """
    underscore_options = {None, "left", "right", "both", "l", "r", True}
    if strip_underscores not in underscore_options:
        raise JanitorError(
            f"strip_underscores must be one of: {underscore_options}"
        )

    stripped = obj
    if strip_underscores in {"left", "l"}:
        stripped = obj.str.lstrip("_")
    elif strip_underscores in {"right", "r"}:
        stripped = obj.str.rstrip("_")
    elif strip_underscores in {True, "both"}:
        stripped = obj.str.strip("_")
    return stripped