[EHN] Add jointly option for min_max_scale (#1112)

Zeroto521 · pre-commit-ci[bot] · web-flow · commit 63c075ec9354 · 2022-06-14T16:39:11.000-04:00
* [EHN] Add `entire_data` for `min_max_scale` to transform each column * Update the description of function * highlight the keywords * Update examples * Rename function * Update test suitcases * Ignore darglint error * Update test results * correct variable name * Miss data * Update example result * `entire_data` -> `jointly` * Update description * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add changelog section * Update CHANGELOG.md * lint codes * lint codes Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,7 @@
 -   [ENH] Extend select_columns to support non-string columns. Also allow selection on MultiIndex columns via level parameter. #1105 @samukweku
 -   [ENH] Performance improvement for groupby_topk. #1093 @samukweku
 -   [EHN] `min_max_scale` drop `old_min` and `old_max` to fit sklearn's method API. Issue #1068 @Zeroto521
+-   [EHN] Add `jointly` option for `min_max_scale` to scale either each column separately or all values together. By default each column is scaled separately, similar to `sklearn.preprocessing.MinMaxScaler`. Issue #1067 @Zeroto521
 
 ## [v0.23.1] - 2022-05-03
 
diff --git a/janitor/functions/min_max_scale.py b/janitor/functions/min_max_scale.py
@@ -23,23 +23,19 @@ def min_max_scale(
     df: pd.DataFrame,
     feature_range: tuple[int | float, int | float] = (0, 1),
     column_name: str | int | list[str | int] | pd.Index = None,
+    jointly: bool = False,
 ) -> pd.DataFrame:
     """
-    Scales data to between a minimum and maximum value.
+    Scales DataFrame to between a minimum and maximum value.
 
-    This method mutates the original DataFrame.
+    One can optionally set a new target **minimum** and **maximum** value
+    using the `feature_range` keyword argument.
 
-    If `minimum` and `maximum` are provided, the true min/max of the
-    `DataFrame` or column is ignored in the scaling process and replaced with
-    these values, instead.
-
-    One can optionally set a new target minimum and maximum value using the
-    `feature_range[0]` and `feature_range[1]` keyword arguments.
-    This will result in the transformed data being bounded between
-    `feature_range[0]` and `feature_range[1]`.
-
-    If a particular column name is specified, then only that column of data
-    are scaled. Otherwise, the entire dataframe is scaled.
+    If `column_name` is specified, then only that column(s) of data is scaled.
+    Otherwise, the entire dataframe is scaled.
+    If `jointly` is `True`, the columns given by `column_name` (or the entire
+    dataframe) are scaled jointly, using one shared minimum and maximum.
+    Otherwise, each column of data will be scaled separately.
 
     Example: Basic usage.
 
@@ -48,6 +44,10 @@ def min_max_scale(
         >>> df = pd.DataFrame({'a':[1, 2], 'b':[0, 1]})
         >>> df.min_max_scale()
              a    b
+        0  0.0  0.0
+        1  1.0  1.0
+        >>> df.min_max_scale(jointly=True)
+             a    b
         0  0.5  0.0
         1  1.0  0.5
 
@@ -57,6 +57,10 @@ def min_max_scale(
         >>> import janitor
         >>> df = pd.DataFrame({'a':[1, 2], 'b':[0, 1]})
         >>> df.min_max_scale(feature_range=(0, 100))
+               a      b
+        0    0.0    0.0
+        1  100.0  100.0
+        >>> df.min_max_scale(feature_range=(0, 100), jointly=True)
                a     b
         0   50.0   0.0
         1  100.0  50.0
@@ -65,15 +69,26 @@ def min_max_scale(
 
         >>> import pandas as pd
         >>> import janitor
-        >>> df = pd.DataFrame({'a':[1, 2], 'b':[0, 1]})
-        >>> df.min_max_scale(feature_range=(0, 100), column_name=['a', 'b'])
-               a      b
-        0    0.0    0.0
-        1  100.0  100.0
+        >>> df = pd.DataFrame({'a':[1, 2], 'b':[0, 1], 'c': [1, 0]})
+        >>> df.min_max_scale(
+        ...     feature_range=(0, 100),
+        ...     column_name=["a", "c"],
+        ... )
+               a  b      c
+        0    0.0  0  100.0
+        1  100.0  1    0.0
+        >>> df.min_max_scale(
+        ...     feature_range=(0, 100),
+        ...     column_name=["a", "c"],
+        ...     jointly=True,
+        ... )
+               a  b     c
+        0   50.0  0  50.0
+        1  100.0  1   0.0
         >>> df.min_max_scale(feature_range=(0, 100), column_name='a')
-               a  b
-        0    0.0  0
-        1  100.0  1
+               a  b  c
+        0    0.0  0  1
+        1  100.0  1  0
 
     The aforementioned example might be applied to something like scaling the
     isoelectric points of amino acids. While technically they range from
@@ -84,11 +99,16 @@ def min_max_scale(
     :param df: A pandas DataFrame.
     :param feature_range: (optional) Desired range of transformed data.
     :param column_name: (optional) The column on which to perform scaling.
+    :param jointly: (bool) Scale the entire data if True.
     :returns: A pandas DataFrame with scaled data.
     :raises ValueError: if `feature_range` isn't tuple type.
     :raises ValueError: if the length of `feature_range` isn't equal to two.
     :raises ValueError: if the element of `feature_range` isn't number type.
     :raises ValueError: if `feature_range[1]` <= `feature_range[0]`.
+
+    Changed in version 0.24.0: Deleted "old_min", "old_max", "new_min", and
+    "new_max" options.
+    Changed in version 0.24.0: Added "feature_range", and "jointly" options.
     """
 
     if not (
@@ -102,23 +122,67 @@ def min_max_scale(
             "the first element must be greater than the second one"
         )
 
-    new_min, new_max = feature_range
-    new_range = new_max - new_min
-
     if column_name is not None:
-        old_min = df[column_name].min()
-        old_max = df[column_name].max()
-        old_range = old_max - old_min
-
-        df = df.copy()
-        df[column_name] = (
-            df[column_name] - old_min
-        ) * new_range / old_range + new_min
-    else:
-        old_min = df.min().min()
-        old_max = df.max().max()
-        old_range = old_max - old_min
+        df = df.copy()  # Avoid to change the original DataFrame.
 
-        df = (df - old_min) * new_range / old_range + new_min
+        old_feature_range = df[column_name].pipe(min_max_value, jointly)
+        df[column_name] = df[column_name].pipe(
+            apply_min_max,
+            *old_feature_range,
+            *feature_range,
+        )
+    else:
+        old_feature_range = df.pipe(min_max_value, jointly)
+        df = df.pipe(
+            apply_min_max,
+            *old_feature_range,
+            *feature_range,
+        )
 
     return df
+
+
+def min_max_value(df: pd.DataFrame, jointly: bool) -> tuple:
+    """
+    Return the minimum and maximum of a DataFrame as a `(min, max)` tuple.
+
+    If `jointly`, both are single values over all data; else one per column.
+
+    .. # noqa: DAR101
+    .. # noqa: DAR201
+    """
+
+    if jointly:
+        mmin = df.min().min()  # per-column minima, then the minimum of those
+        mmax = df.max().max()  # per-column maxima, then the maximum of those
+    else:
+        mmin = df.min()  # one minimum per column
+        mmax = df.max()  # one maximum per column
+
+    return mmin, mmax
+
+
+def apply_min_max(
+    df: pd.DataFrame,
+    old_min: int | float | pd.Series,
+    old_max: int | float | pd.Series,
+    new_min: int | float | pd.Series,
+    new_max: int | float | pd.Series,
+) -> pd.DataFrame:
+    """
+    Linearly rescale `df` from [old_min, old_max] to [new_min, new_max].
+
+    Notes
+    -----
+    - Type of the `old_min` / `old_max` arguments
+        - int or float : one range is applied to the entire DataFrame.
+        - Series : a separate range is applied to each column.
+
+    .. # noqa: DAR101
+    .. # noqa: DAR201
+    """
+
+    old_range = old_max - old_min  # 0 for constant data; not guarded below
+    new_range = new_max - new_min
+
+    return (df - old_min) * new_range / old_range + new_min
diff --git a/tests/functions/test_min_max_scale.py b/tests/functions/test_min_max_scale.py
@@ -4,42 +4,81 @@
 
 @pytest.mark.functions
 @pytest.mark.parametrize(
-    "df, column_name, excepted",
+    "df, column_name, jointly, excepted",
     [
         # test default parameter
         (
             pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
             None,
+            True,
             pd.DataFrame({"a": [0.5, 1], "b": [0, 0.5]}),
         ),
+        # test default parameter
+        (
+            pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
+            None,
+            False,
+            pd.DataFrame({"a": [0, 1.0], "b": [0, 1.0]}),
+        ),
+        # test list condition
+        (
+            pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
+            ["a", "b"],
+            True,
+            pd.DataFrame({"a": [0.5, 1.0], "b": [0, 0.5]}),
+        ),
         # test list condition
         (
             pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
             ["a", "b"],
+            False,
             pd.DataFrame({"a": [0, 1.0], "b": [0, 1.0]}),
         ),
         # test Index condition
         (
             pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
             pd.Index(["a", "b"]),
+            False,
             pd.DataFrame({"a": [0, 1.0], "b": [0, 1.0]}),
         ),
+        # test Index condition
+        (
+            pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
+            pd.Index(["a", "b"]),
+            True,
+            pd.DataFrame({"a": [0.5, 1], "b": [0, 0.5]}),
+        ),
         # test str condition
         (
             pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
             "a",
+            True,
+            pd.DataFrame({"a": [0, 1.0], "b": [0, 5]}),
+        ),
+        (
+            pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
+            "a",
+            False,
             pd.DataFrame({"a": [0, 1.0], "b": [0, 5]}),
         ),
         # test int condition
         (
             pd.DataFrame({1: [5, 10], "b": [0, 5]}),
             1,
+            True,
+            pd.DataFrame({1: [0, 1.0], "b": [0, 5]}),
+        ),
+        # test int condition
+        (
+            pd.DataFrame({1: [5, 10], "b": [0, 5]}),
+            1,
+            False,
             pd.DataFrame({1: [0, 1.0], "b": [0, 5]}),
         ),
     ],
 )
-def test_min_max_scale_column_name(df, column_name, excepted):
-    result = df.min_max_scale(column_name=column_name)
+def test_min_max_scale_column_name_type(df, column_name, jointly, excepted):
+    result = df.min_max_scale(column_name=column_name, jointly=jointly)
 
     assert result.equals(excepted)