[ENH] Add support for extension arrays to expand_grid (pyjanitor-devs#1122)

samukweku · web-flow · commit 383fad1be403 · 2022-06-26T22:29:50.000+08:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,7 @@
 -   [ENH] `min_max_scale` drop `old_min` and `old_max` to fit sklearn's method API. Issue #1068 @Zeroto521
 -   [ENH] Add `jointly` option for `min_max_scale` support to transform each column values or entire values. Default transform each column, similar behavior to `sklearn.preprocessing.MinMaxScaler`. (Issue #1067, PR #1112, PR #1123) @Zeroto521
 -   [INF] Require pyspark minimal version is v3.2.0 to cut duplicates codes. Issue #1110 @Zeroto521
+-   [ENH] Added support for extension arrays in `expand_grid`. Issue #1121 @samukweku
 
 
 ## [v0.23.1] - 2022-05-03
diff --git a/examples/notebooks/expand_grid.ipynb b/examples/notebooks/expand_grid.ipynb
@@ -1108,11 +1108,9 @@
   }
  ],
  "metadata": {
-  "interpreter": {
-   "hash": "98b0a9b7b4eaaa670588a142fd0a9b87eaafe866f1db4228be72b4211d12040f"
-  },
   "kernelspec": {
-   "display_name": "Python 3.8.10 64-bit ('base': conda)",
+   "display_name": "Python 3.9.10 ('base')",
+   "language": "python",
    "name": "python3"
   },
   "language_info": {
@@ -1125,7 +1123,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.9"
+   "version": "3.9.10"
   },
   "orig_nbformat": 2
  },
diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py
@@ -188,7 +188,7 @@ def _computations_expand_grid(others: dict) -> pd.DataFrame:
     grid = ((*left, right) for left, right in grid)
     contents = {}
     for key, value, grid_index in grid:
-        contents = {**contents, **_expand_grid(value, grid_index, key)}
+        contents.update(_expand_grid(value, grid_index, key))
     return pd.DataFrame(contents, copy=False)
 
 
diff --git a/janitor/utils.py b/janitor/utils.py
@@ -9,7 +9,6 @@
 
 import numpy as np
 import pandas as pd
-from pandas.core.construction import extract_array
 
 
 def check(varname: str, value, expected_types: list):
@@ -57,7 +56,6 @@ def _expand_grid(value, grid_index, key):
 def _sub_expand_grid(value, grid_index, key):  # noqa: F811
     """
     Expands the numpy array based on `grid_index`.
-
     Returns a dictionary.
     """
 
@@ -73,43 +71,37 @@ def _sub_expand_grid(value, grid_index, key):  # noqa: F811
     if value.ndim == 1:
         return {(key, 0): value}
 
-    return {(key, num): arr for num, arr in enumerate(value.T)}
+    return {(key, num): value[:, num] for num in range(value.shape[-1])}
 
 
+@_expand_grid.register(pd.api.extensions.ExtensionArray)
 @_expand_grid.register(pd.arrays.PandasArray)
 def _sub_expand_grid(value, grid_index, key):  # noqa: F811
     """
     Expands the pandas array based on `grid_index`.
-
     Returns a dictionary.
     """
 
-    value = value[grid_index]
-
-    return {(key, 0): value}
+    return {(key, 0): value[grid_index]}
 
 
+@_expand_grid.register(pd.Index)
 @_expand_grid.register(pd.Series)
 def _sub_expand_grid(value, grid_index, key):  # noqa: F811
     """
-    Expands the Series based on `grid_index`.
-
+    Expands the pd.Series/pd.Index based on `grid_index`.
     Returns a dictionary.
     """
 
-    name = value.name
-    if not name:
-        name = 0
-    value = extract_array(value, extract_numpy=True)[grid_index]
+    name = value.name or 0
 
-    return {(key, name): value}
+    return {(key, name): value._values[grid_index]}
 
 
 @_expand_grid.register(pd.DataFrame)
 def _sub_expand_grid(value, grid_index, key):  # noqa: F811
     """
     Expands the DataFrame based on `grid_index`.
-
     Returns a dictionary.
     """
 
@@ -120,16 +112,14 @@ def _sub_expand_grid(value, grid_index, key):  # noqa: F811
         value = value.set_axis(columns, axis="columns")
 
     return {
-        (key, name): extract_array(val, extract_numpy=True)[grid_index]
-        for name, val in value.items()
+        (key, name): val._values[grid_index] for name, val in value.items()
     }
 
 
 @_expand_grid.register(pd.MultiIndex)
 def _sub_expand_grid(value, grid_index, key):  # noqa: F811
     """
     Expands the MultiIndex based on `grid_index`.
-
     Returns a dictionary.
     """
 
@@ -138,27 +128,14 @@ def _sub_expand_grid(value, grid_index, key):  # noqa: F811
     for n in range(value.nlevels):
         arr = value.get_level_values(n)
         name = arr.name
-        arr = extract_array(arr, extract_numpy=True)[grid_index]
+        arr = arr._values[grid_index]
         if not name:
             name = num
             num += 1
         contents[(key, name)] = arr
     return contents
 
 
-@_expand_grid.register(pd.Index)
-def _sub_expand_grid(value, grid_index, key):  # noqa: F811
-    """
-    Expands the Index based on `grid_index`.
-
-    Returns a dictionary.
-    """
-    name = value.name
-    if not name:
-        name = 0
-    return {(key, name): extract_array(value, extract_numpy=True)[grid_index]}
-
-
 def import_message(
     submodule: str,
     package: str,
diff --git a/tests/functions/test_expand_grid.py b/tests/functions/test_expand_grid.py
@@ -8,6 +8,7 @@
     categoricaldf_strategy,
 )
 from janitor.functions import expand_grid
+from functools import reduce
 
 
 @given(df=df_strategy())
@@ -324,3 +325,21 @@ def test_series_name(df):
         [["city", "A"], ["cities", 0]]
     )
     assert_frame_equal(result, expected)
+
+
+def test_extension_array():
+    """Test that Categorical inputs expand like a pandas cross merge."""
+    others = dict(
+        id=pd.Categorical(
+            values=(2, 1, 1, 2, 1), categories=(1, 2, 3), ordered=True
+        ),
+        year=(2018, 2018, 2019, 2020, 2020),
+        gender=pd.Categorical(("female", "male", "male", "female", "male")),
+    )
+
+    result = expand_grid(others=others).droplevel(axis=1, level=-1)
+    series = [pd.Series(val).rename(key) for key, val in others.items()]
+
+    func = lambda x, y: pd.merge(x, y, how="cross")  # noqa: E731
+    expected = reduce(func, series)
+    assert_frame_equal(result, expected)