1
- from typing import Optional , Union , List , Tuple , Dict
1
+ from typing import Optional , Union , List , Tuple , Dict , Any
2
2
from pandas .core .common import apply_if_callable
3
3
import pandas_flavor as pf
4
4
import pandas as pd
5
5
import functools
6
- from pandas .api .types import is_list_like
6
+ from pandas .api .types import is_list_like , is_scalar
7
7
8
8
from janitor .utils import check , check_column
9
9
@@ -16,22 +16,25 @@ def complete(
16
16
* columns ,
17
17
sort : bool = False ,
18
18
by : Optional [Union [list , str ]] = None ,
19
+ fill_value : Optional [Union [Dict , Any ]] = None ,
20
+ explicit : bool = True ,
19
21
) -> pd .DataFrame :
20
22
"""
21
23
It is modeled after tidyr's `complete` function, and is a wrapper around
22
- [`expand_grid`][janitor.functions.expand_grid.expand_grid] and `pd.merge`.
24
+ [`expand_grid`][janitor.functions.expand_grid.expand_grid], `pd.merge`
25
+ and `pd.fillna`. In a way, it is the inverse of `pd.dropna`, as it exposes
26
+ implicitly missing rows.
23
27
24
28
Combinations of column names or a list/tuple of column names, or even a
25
29
dictionary of column names and new values are possible.
26
30
27
- It can also handle duplicated data.
28
-
29
31
MultiIndex columns are not supported.
30
32
31
33
Example:
32
34
33
35
>>> import pandas as pd
34
36
>>> import janitor
37
+ >>> import numpy as np
35
38
>>> df = pd.DataFrame(
36
39
... {
37
40
... "Year": [1999, 2000, 2004, 1999, 2004],
@@ -55,7 +58,7 @@ def complete(
55
58
56
59
Expose missing pairings of `Year` and `Taxon`:
57
60
58
- >>> df.complete("Year", "Taxon", sort = True)
61
+ >>> df.complete("Year", "Taxon", sort=True)
59
62
Year Taxon Abundance
60
63
0 1999 Agarum 1.0
61
64
1 1999 Saccharina 4.0
@@ -69,7 +72,7 @@ def complete(
69
72
>>> df.complete(
70
73
... {"Year": range(df.Year.min(), df.Year.max() + 1)},
71
74
... "Taxon",
72
- ... sort=True,
75
+ ... sort=True
73
76
... )
74
77
Year Taxon Abundance
75
78
0 1999 Agarum 1.0
@@ -85,6 +88,60 @@ def complete(
85
88
10 2004 Agarum 8.0
86
89
11 2004 Saccharina 2.0
87
90
91
+ Fill missing values:
92
+
93
+ >>> df = pd.DataFrame(
94
+ ... dict(
95
+ ... group=(1, 2, 1, 2),
96
+ ... item_id=(1, 2, 2, 3),
97
+ ... item_name=("a", "a", "b", "b"),
98
+ ... value1=(1, np.nan, 3, 4),
99
+ ... value2=range(4, 8),
100
+ ... )
101
+ ... )
102
+ >>> df
103
+ group item_id item_name value1 value2
104
+ 0 1 1 a 1.0 4
105
+ 1 2 2 a NaN 5
106
+ 2 1 2 b 3.0 6
107
+ 3 2 3 b 4.0 7
108
+ >>> df.complete(
109
+ ... "group",
110
+ ... ("item_id", "item_name"),
111
+ ... fill_value={"value1": 0, "value2": 99},
112
+ ... sort=True
113
+ ... )
114
+ group item_id item_name value1 value2
115
+ 0 1 1 a 1 4
116
+ 1 1 2 a 0 99
117
+ 2 1 2 b 3 6
118
+ 3 1 3 b 0 99
119
+ 4 2 1 a 0 99
120
+ 5 2 2 a 0 5
121
+ 6 2 2 b 0 99
122
+ 7 2 3 b 4 7
123
+
124
+ Limit the fill to only implicit missing values
125
+ by setting explicit to `False`:
126
+
127
+ >>> df.complete(
128
+ ... "group",
129
+ ... ("item_id", "item_name"),
130
+ ... fill_value={"value1": 0, "value2": 99},
131
+ ... explicit=False,
132
+ ... sort=True
133
+ ... )
134
+ group item_id item_name value1 value2
135
+ 0 1 1 a 1.0 4
136
+ 1 1 2 a 0.0 99
137
+ 2 1 2 b 3.0 6
138
+ 3 1 3 b 0.0 99
139
+ 4 2 1 a 0.0 99
140
+ 5 2 2 a NaN 5
141
+ 6 2 2 b 0.0 99
142
+ 7 2 3 b 4.0 7
143
+
144
+
88
145
:param df: A pandas DataFrame.
89
146
:param *columns: This refers to the columns to be
90
147
completed. It could be column labels (string type),
@@ -93,6 +150,13 @@ def complete(
93
150
:param sort: Sort DataFrame based on *columns. Default is `False`.
94
151
:param by: label or list of labels to group by.
95
152
The explicit missing rows are returned per group.
153
+ :param fill_value: Scalar value to use instead of NaN
154
for missing combinations. A dictionary, mapping column names
154
to a scalar value is also accepted.
156
+ :param explicit: Determines if only implicitly missing values
157
+ should be filled (`False`), or all nulls existing in the dataframe
158
+ (`True`). Default is `True`. `explicit` is applicable only
159
+ if `fill_value` is not `None`.
96
160
:returns: A pandas DataFrame with explicit missing rows, if any.
97
161
"""
98
162
@@ -101,14 +165,16 @@ def complete(
101
165
102
166
df = df .copy ()
103
167
104
- return _computations_complete (df , columns , sort , by )
168
+ return _computations_complete (df , columns , sort , by , fill_value , explicit )
105
169
106
170
107
171
def _computations_complete (
108
172
df : pd .DataFrame ,
109
173
columns : List [Union [List , Tuple , Dict , str ]],
110
- sort : bool = False ,
111
- by : Optional [Union [list , str ]] = None ,
174
+ sort : bool ,
175
+ by : Optional [Union [list , str ]],
176
+ fill_value : Optional [Union [Dict , Any ]],
177
+ explicit : bool ,
112
178
) -> pd .DataFrame :
113
179
"""
114
180
This function computes the final output for the `complete` function.
@@ -117,10 +183,14 @@ def _computations_complete(
117
183
118
184
A DataFrame, with rows of missing values, if any, is returned.
119
185
"""
120
-
121
- columns , column_checker , sort , by = _data_checks_complete (
122
- df , columns , sort , by
123
- )
186
+ (
187
+ columns ,
188
+ column_checker ,
189
+ sort ,
190
+ by ,
191
+ fill_value ,
192
+ explicit ,
193
+ ) = _data_checks_complete (df , columns , sort , by , fill_value , explicit )
124
194
125
195
all_strings = True
126
196
for column in columns :
@@ -143,17 +213,65 @@ def _computations_complete(
143
213
# of course there could be a better way ...
144
214
if by is None :
145
215
uniques = _generic_complete (df , columns , all_strings )
146
- return df .merge (uniques , how = "outer" , on = column_checker , sort = sort )
147
-
148
- uniques = df .groupby (by )
149
- uniques = uniques .apply (_generic_complete , columns , all_strings )
150
- uniques = uniques .droplevel (- 1 )
151
- return df .merge (uniques , how = "outer" , on = by + column_checker , sort = sort )
216
+ else :
217
+ uniques = df .groupby (by )
218
+ uniques = uniques .apply (_generic_complete , columns , all_strings )
219
+ uniques = uniques .droplevel (- 1 )
220
+ column_checker = by + column_checker
221
+ if fill_value is None :
222
+ return df .merge (
223
+ uniques , on = column_checker , how = "outer" , sort = sort , copy = False
224
+ )
225
+ # if fill_value is present
226
+ if is_scalar (fill_value ):
227
+ # faster when fillna operates on a Series basis
228
+ fill_value = {col : fill_value for col in df }
229
+
230
+ if explicit :
231
+ return df .merge (
232
+ uniques , on = column_checker , how = "outer" , sort = sort , copy = False
233
+ ).fillna (fill_value , downcast = "infer" )
234
+ # keep only columns that are not part of column_checker
235
+ # IOW, we are excluding columns that were not used
236
+ # to generate the combinations
237
+ fill_value = {
238
+ col : value
239
+ for col , value in fill_value .items ()
240
+ if col not in column_checker
241
+ }
242
+ if not fill_value : # there is nothing to fill
243
+ return df .merge (
244
+ uniques , on = column_checker , how = "outer" , sort = sort , copy = False
245
+ )
152
246
247
+ # when explicit is False
248
+ # filter out rows from `unique` that already exist in the parent dataframe
249
+ # fill the null values in the trimmed `unique`
250
+ # and merge back to the main dataframe
153
251
154
- def _generic_complete (
155
- df : pd .DataFrame , columns : list , all_strings : bool = True
156
- ):
252
+ # to get a name that does not exist in the columns
253
+ indicator = "" .join (df .columns )
254
+ trimmed = df .loc (axis = 1 )[column_checker ]
255
+ uniques = uniques .merge (
256
+ trimmed , how = "left" , sort = False , copy = False , indicator = indicator
257
+ )
258
+ trimmed = uniques .iloc (axis = 1 )[- 1 ] == "left_only"
259
+ uniques = uniques .loc [trimmed , column_checker ]
260
+ trimmed = None
261
+ indicator = None
262
+ # iteration used here, instead of assign (which is also a for loop),
263
+ # to cater for scenarios where the column_name is not a string
264
+ # assign only works with keys that are strings
265
+ for column_name , value in fill_value .items ():
266
+ uniques [column_name ] = value
267
+ df = pd .concat ([df , uniques ], sort = False , copy = False , axis = "index" )
268
+ if sort :
269
+ df = df .sort_values (column_checker )
270
+ df .index = range (len (df ))
271
+ return df
272
+
273
+
274
+ def _generic_complete (df : pd .DataFrame , columns : list , all_strings : bool ):
157
275
"""
158
276
Generate cartesian product for `_computations_complete`.
159
277
@@ -182,7 +300,7 @@ def _generic_complete(
182
300
183
301
184
302
@functools .singledispatch
185
- def _complete_column (column , df ):
303
+ def _complete_column (column : str , df ):
186
304
"""
187
305
Args:
188
306
column : str/list/dict
@@ -191,22 +309,6 @@ def _complete_column(column, df):
191
309
A Pandas Series/DataFrame with no duplicates,
192
310
or a list of unique Pandas Series is returned.
193
311
"""
194
- raise TypeError (
195
- """This type is not supported in the `complete` function."""
196
- )
197
-
198
-
199
- @_complete_column .register (str ) # noqa: F811
200
- def _sub_complete_column (column , df ): # noqa: F811
201
- """
202
- Args:
203
- column : str
204
- df: Pandas DataFrame
205
-
206
- Returns:
207
- Pandas Series
208
- """
209
-
210
312
column = df [column ]
211
313
212
314
if not column .is_unique :
@@ -248,20 +350,14 @@ def _sub_complete_column(column, df): # noqa: F811
248
350
for key , value in column .items ():
249
351
arr = apply_if_callable (value , df [key ])
250
352
if not is_list_like (arr ):
251
- raise ValueError (
252
- f"""
253
- value for { key } should be a 1-D array.
254
- """
255
- )
353
+ raise ValueError (f"value for { key } should be a 1-D array." )
256
354
if not hasattr (arr , "shape" ):
257
355
arr = pd .Series ([* arr ], name = key )
258
356
259
357
if not arr .size > 0 :
260
358
raise ValueError (
261
- f"""
262
- Kindly ensure the provided array for { key }
263
- has at least one value.
264
- """
359
+ f"Kindly ensure the provided array for { key } "
360
+ "has at least one value."
265
361
)
266
362
267
363
if isinstance (arr , pd .Index ):
@@ -270,11 +366,7 @@ def _sub_complete_column(column, df): # noqa: F811
270
366
arr_ndim = arr .ndim
271
367
272
368
if arr_ndim != 1 :
273
- raise ValueError (
274
- f"""
275
- Kindly provide a 1-D array for { key } .
276
- """
277
- )
369
+ raise ValueError (f"Kindly provide a 1-D array for { key } ." )
278
370
279
371
if not isinstance (arr , pd .Series ):
280
372
arr = pd .Series (arr )
@@ -292,8 +384,10 @@ def _sub_complete_column(column, df): # noqa: F811
292
384
def _data_checks_complete (
293
385
df : pd .DataFrame ,
294
386
columns : List [Union [List , Tuple , Dict , str ]],
295
- sort : Optional [bool ] = False ,
296
- by : Optional [Union [list , str ]] = None ,
387
+ sort : Optional [bool ],
388
+ by : Optional [Union [list , str ]],
389
+ fill_value : Optional [Union [Dict , Any ]],
390
+ explicit : bool ,
297
391
):
298
392
"""
299
393
Function to check parameters in the `complete` function.
@@ -303,20 +397,16 @@ def _data_checks_complete(
303
397
Check is conducted to ensure that column names are not repeated.
304
398
Also checks that the names in `columns` actually exist in `df`.
305
399
306
- Returns `df`, `columns`, `column_checker`, and `by` if
307
- all checks pass.
400
+ Returns `df`, `columns`, `column_checker`, `by`, `fill_value`,
401
+ and `explicit` if all checks pass.
308
402
"""
309
403
# TODO: get `complete` to work on MultiIndex columns,
310
404
# if there is sufficient interest with use cases
311
405
if isinstance (df .columns , pd .MultiIndex ):
312
- raise ValueError (
313
- """
314
- `complete` does not support MultiIndex columns.
315
- """
316
- )
406
+ raise ValueError ("`complete` does not support MultiIndex columns." )
317
407
318
408
columns = [
319
- list ( grouping ) if isinstance (grouping , tuple ) else grouping
409
+ [ * grouping ] if isinstance (grouping , tuple ) else grouping
320
410
for grouping in columns
321
411
]
322
412
column_checker = []
@@ -333,9 +423,7 @@ def _data_checks_complete(
333
423
column_checker_no_duplicates = set ()
334
424
for column in column_checker :
335
425
if column in column_checker_no_duplicates :
336
- raise ValueError (
337
- f"""{ column } column should be in only one group."""
338
- )
426
+ raise ValueError (f"{ column } column should be in only one group." )
339
427
column_checker_no_duplicates .add (column ) # noqa: PD005
340
428
341
429
check_column (df , column_checker )
@@ -349,4 +437,19 @@ def _data_checks_complete(
349
437
check ("by" , by , [list ])
350
438
check_column (df , by )
351
439
352
- return columns , column_checker , sort , by
440
+ check ("explicit" , explicit , [bool ])
441
+
442
+ fill_value_check = is_scalar (fill_value ), isinstance (fill_value , dict )
443
+ if not any (fill_value_check ):
444
+ raise TypeError (
445
+ "`fill_value` should either be a dictionary or a scalar value."
446
+ )
447
+ if fill_value_check [- 1 ]:
448
+ check_column (df , fill_value )
449
+ for column_name , value in fill_value .items ():
450
+ if not is_scalar (value ):
451
+ raise ValueError (
452
+ f"The value for { column_name } should be a scalar."
453
+ )
454
+
455
+ return columns , column_checker , sort , by , fill_value , explicit
0 commit comments