Skip to content

Commit 63c075e

Browse files
[EHN] Add jointly option for min_max_scale (#1112)
* [EHN] Add `entire_data` for `min_max_scale` to transform each column * Update the description of function * highlight the keywords * Update examples * Rename function * Update test suitcases * Ignore darglint error * Update test results * correct variable name * Miss data * Update example result * `entire_data` -> `jointly` * Update description * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add changelog section * Update CHANGELOG.md * lint codes * lint codes Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent a8324dd commit 63c075e

File tree

3 files changed

+144
-40
lines changed

3 files changed

+144
-40
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
- [ENH] Extend select_columns to support non-string columns. Also allow selection on MultiIndex columns via level parameter. #1105 @samukweku
99
- [ENH] Performance improvement for groupby_topk. #1093 @samukweku
1010
- [EHN] `min_max_scale` drop `old_min` and `old_max` to fit sklearn's method API. Issue #1068 @Zeroto521
11+
- [ENH] Add `jointly` option to `min_max_scale` to scale either each column separately or all selected values together. By default each column is scaled separately, similar to `sklearn.preprocessing.MinMaxScaler`. Issue #1067 @Zeroto521
1112

1213
## [v0.23.1] - 2022-05-03
1314

janitor/functions/min_max_scale.py

+101-37
Original file line numberDiff line numberDiff line change
@@ -23,23 +23,19 @@ def min_max_scale(
2323
df: pd.DataFrame,
2424
feature_range: tuple[int | float, int | float] = (0, 1),
2525
column_name: str | int | list[str | int] | pd.Index = None,
26+
jointly: bool = False,
2627
) -> pd.DataFrame:
2728
"""
28-
Scales data to between a minimum and maximum value.
29+
Scales DataFrame to between a minimum and maximum value.
2930
30-
This method mutates the original DataFrame.
31+
One can optionally set a new target **minimum** and **maximum** value
32+
using the `feature_range` keyword argument.
3133
32-
If `minimum` and `maximum` are provided, the true min/max of the
33-
`DataFrame` or column is ignored in the scaling process and replaced with
34-
these values, instead.
35-
36-
One can optionally set a new target minimum and maximum value using the
37-
`feature_range[0]` and `feature_range[1]` keyword arguments.
38-
This will result in the transformed data being bounded between
39-
`feature_range[0]` and `feature_range[1]`.
40-
41-
If a particular column name is specified, then only that column of data
42-
are scaled. Otherwise, the entire dataframe is scaled.
34+
If `column_name` is specified, then only those column(s) of data are scaled.
35+
Otherwise, the entire dataframe is scaled.
36+
If `jointly` is `True`, the columns given by `column_name` (or the entire
37+
dataframe) are scaled together as one block. Otherwise, each column of data
38+
will be scaled separately.
4339
4440
Example: Basic usage.
4541
@@ -48,6 +44,10 @@ def min_max_scale(
4844
>>> df = pd.DataFrame({'a':[1, 2], 'b':[0, 1]})
4945
>>> df.min_max_scale()
5046
a b
47+
0 0.0 0.0
48+
1 1.0 1.0
49+
>>> df.min_max_scale(jointly=True)
50+
a b
5151
0 0.5 0.0
5252
1 1.0 0.5
5353
@@ -57,6 +57,10 @@ def min_max_scale(
5757
>>> import janitor
5858
>>> df = pd.DataFrame({'a':[1, 2], 'b':[0, 1]})
5959
>>> df.min_max_scale(feature_range=(0, 100))
60+
a b
61+
0 0.0 0.0
62+
1 100.0 100.0
63+
>>> df.min_max_scale(feature_range=(0, 100), jointly=True)
6064
a b
6165
0 50.0 0.0
6266
1 100.0 50.0
@@ -65,15 +69,26 @@ def min_max_scale(
6569
6670
>>> import pandas as pd
6771
>>> import janitor
68-
>>> df = pd.DataFrame({'a':[1, 2], 'b':[0, 1]})
69-
>>> df.min_max_scale(feature_range=(0, 100), column_name=['a', 'b'])
70-
a b
71-
0 0.0 0.0
72-
1 100.0 100.0
72+
>>> df = pd.DataFrame({'a':[1, 2], 'b':[0, 1], 'c': [1, 0]})
73+
>>> df.min_max_scale(
74+
... feature_range=(0, 100),
75+
... column_name=["a", "c"],
76+
... )
77+
a b c
78+
0 0.0 0 100.0
79+
1 100.0 1 0.0
80+
>>> df.min_max_scale(
81+
... feature_range=(0, 100),
82+
... column_name=["a", "c"],
83+
... jointly=True,
84+
... )
85+
a b c
86+
0 50.0 0 50.0
87+
1 100.0 1 0.0
7388
>>> df.min_max_scale(feature_range=(0, 100), column_name='a')
74-
a b
75-
0 0.0 0
76-
1 100.0 1
89+
a b c
90+
0 0.0 0 1
91+
1 100.0 1 0
7792
7893
The aforementioned example might be applied to something like scaling the
7994
isoelectric points of amino acids. While technically they range from
@@ -84,11 +99,16 @@ def min_max_scale(
8499
:param df: A pandas DataFrame.
85100
:param feature_range: (optional) Desired range of transformed data.
86101
:param column_name: (optional) The column on which to perform scaling.
102+
:param jointly: (bool) Scale the selected data jointly if True; otherwise scale each column separately.
87103
:returns: A pandas DataFrame with scaled data.
88104
:raises ValueError: if `feature_range` isn't tuple type.
89105
:raises ValueError: if the length of `feature_range` isn't equal to two.
90106
:raises ValueError: if the element of `feature_range` isn't number type.
91107
:raises ValueError: if `feature_range[1]` <= `feature_range[0]`.
108+
109+
Changed in version 0.24.0: Deleted "old_min", "old_max", "new_min", and
110+
"new_max" options.
111+
Changed in version 0.24.0: Added "feature_range", and "jointly" options.
92112
"""
93113

94114
if not (
@@ -102,23 +122,67 @@ def min_max_scale(
102122
"the first element must be greater than the second one"
103123
)
104124

105-
new_min, new_max = feature_range
106-
new_range = new_max - new_min
107-
108125
if column_name is not None:
109-
old_min = df[column_name].min()
110-
old_max = df[column_name].max()
111-
old_range = old_max - old_min
112-
113-
df = df.copy()
114-
df[column_name] = (
115-
df[column_name] - old_min
116-
) * new_range / old_range + new_min
117-
else:
118-
old_min = df.min().min()
119-
old_max = df.max().max()
120-
old_range = old_max - old_min
126+
df = df.copy() # Avoid to change the original DataFrame.
121127

122-
df = (df - old_min) * new_range / old_range + new_min
128+
old_feature_range = df[column_name].pipe(min_max_value, jointly)
129+
df[column_name] = df[column_name].pipe(
130+
apply_min_max,
131+
*old_feature_range,
132+
*feature_range,
133+
)
134+
else:
135+
old_feature_range = df.pipe(min_max_value, jointly)
136+
df = df.pipe(
137+
apply_min_max,
138+
*old_feature_range,
139+
*feature_range,
140+
)
123141

124142
return df
143+
144+
145+
def min_max_value(df: pd.DataFrame, jointly: bool) -> tuple:
    """
    Return the minimum and maximum of a DataFrame (or Series).

    Use the `jointly` flag to control whether one overall pair is returned
    or one pair per column.

    :param df: The data to reduce. A DataFrame reduces to per-column
        values; a Series reduces directly to scalars.
    :param jointly: If True, collapse the per-column results into a single
        overall minimum and maximum. If False, return the result of the
        first reduction unchanged.
    :returns: A `(minimum, maximum)` tuple.
    """

    mmin = df.min()
    mmax = df.max()

    # Collapse a second time only when the first reduction still has a
    # column axis. A Series input is already reduced to scalars here, and
    # a plain Python scalar (e.g. from object dtype) has no `.min()`.
    if jointly and isinstance(mmin, pd.Series):
        mmin = mmin.min()
        mmax = mmax.max()

    return mmin, mmax
163+
164+
165+
def apply_min_max(
166+
df: pd.DataFrame,
167+
old_min: int | float | pd.Series,
168+
old_max: int | float | pd.Series,
169+
new_min: int | float | pd.Series,
170+
new_max: int | float | pd.Series,
171+
) -> pd.DataFrame:
172+
"""
173+
Apply minimax scaler to DataFrame.
174+
175+
Notes
176+
-----
177+
- Inputting minimum and maximum type
178+
- int or float : It will apply minimax to the entire DataFrame.
179+
- Series : It will apply minimax to each column.
180+
181+
.. # noqa: DAR101
182+
.. # noqa: DAR201
183+
"""
184+
185+
old_range = old_max - old_min
186+
new_range = new_max - new_min
187+
188+
return (df - old_min) * new_range / old_range + new_min

tests/functions/test_min_max_scale.py

+42-3
Original file line numberDiff line numberDiff line change
@@ -4,42 +4,81 @@
44

55
@pytest.mark.functions
66
@pytest.mark.parametrize(
7-
"df, column_name, excepted",
7+
"df, column_name, jointly, excepted",
88
[
99
# test default parameter
1010
(
1111
pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
1212
None,
13+
True,
1314
pd.DataFrame({"a": [0.5, 1], "b": [0, 0.5]}),
1415
),
16+
# test default parameter
17+
(
18+
pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
19+
None,
20+
False,
21+
pd.DataFrame({"a": [0, 1.0], "b": [0, 1.0]}),
22+
),
23+
# test list condition
24+
(
25+
pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
26+
["a", "b"],
27+
True,
28+
pd.DataFrame({"a": [0.5, 1.0], "b": [0, 0.5]}),
29+
),
1530
# test list condition
1631
(
1732
pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
1833
["a", "b"],
34+
False,
1935
pd.DataFrame({"a": [0, 1.0], "b": [0, 1.0]}),
2036
),
2137
# test Index condition
2238
(
2339
pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
2440
pd.Index(["a", "b"]),
41+
False,
2542
pd.DataFrame({"a": [0, 1.0], "b": [0, 1.0]}),
2643
),
44+
# test Index condition
45+
(
46+
pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
47+
pd.Index(["a", "b"]),
48+
True,
49+
pd.DataFrame({"a": [0.5, 1], "b": [0, 0.5]}),
50+
),
2751
# test str condition
2852
(
2953
pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
3054
"a",
55+
True,
56+
pd.DataFrame({"a": [0, 1.0], "b": [0, 5]}),
57+
),
58+
(
59+
pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
60+
"a",
61+
False,
3162
pd.DataFrame({"a": [0, 1.0], "b": [0, 5]}),
3263
),
3364
# test int condition
3465
(
3566
pd.DataFrame({1: [5, 10], "b": [0, 5]}),
3667
1,
68+
True,
69+
pd.DataFrame({1: [0, 1.0], "b": [0, 5]}),
70+
),
71+
# test int condition
72+
(
73+
pd.DataFrame({1: [5, 10], "b": [0, 5]}),
74+
1,
75+
False,
3776
pd.DataFrame({1: [0, 1.0], "b": [0, 5]}),
3877
),
3978
],
4079
)
41-
def test_min_max_scale_column_name(df, column_name, excepted):
42-
result = df.min_max_scale(column_name=column_name)
80+
def test_min_max_scale_column_name_type(df, column_name, jointly, excepted):
81+
result = df.min_max_scale(column_name=column_name, jointly=jointly)
4382

4483
assert result.equals(excepted)
4584

0 commit comments

Comments
 (0)