Skip to content

Commit 270498e

Browse files
samukweku and sammychoco
authored
[ENH] Add fill_value and explicit to the complete function (#1030)
* changelog
* add fill_value and explicit parameters
* docs update
* examples for fill and explicit
* examples for fill and explicit
* examples for fill and explicit
* updates
* restore defaults
* docs update
* update to docs
* fix for singledispatch

Co-authored-by: samukweku <[email protected]>
Co-authored-by: sammychoco <[email protected]>
1 parent c1ae4ab commit 270498e

File tree

3 files changed

+461
-127
lines changed

3 files changed

+461
-127
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
- [BUG] Removed/updated error-inducing default arguments in `row_to_names` (#1004) and `round_to_fraction` (#1005). @thatlittleboy
1515
- [ENH] `patterns` deprecated in favour of importing `re.compile`. #1007 @samukweku
1616
- [ENH] Changes to kwargs in `encode_categorical`, where the values can either be a string or a 1D array. #1021 @samukweku
17+
- [ENH] Add `fill_value` and `explicit` parameters to the `complete` function. #1019 @samukweku
1718

1819
## [v0.22.0] - 2021-11-21
1920

janitor/functions/complete.py

+171-68
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
from typing import Optional, Union, List, Tuple, Dict
1+
from typing import Optional, Union, List, Tuple, Dict, Any
22
from pandas.core.common import apply_if_callable
33
import pandas_flavor as pf
44
import pandas as pd
55
import functools
6-
from pandas.api.types import is_list_like
6+
from pandas.api.types import is_list_like, is_scalar
77

88
from janitor.utils import check, check_column
99

@@ -16,22 +16,25 @@ def complete(
1616
*columns,
1717
sort: bool = False,
1818
by: Optional[Union[list, str]] = None,
19+
fill_value: Optional[Union[Dict, Any]] = None,
20+
explicit: bool = True,
1921
) -> pd.DataFrame:
2022
"""
2123
It is modeled after tidyr's `complete` function, and is a wrapper around
22-
[`expand_grid`][janitor.functions.expand_grid.expand_grid] and `pd.merge`.
24+
[`expand_grid`][janitor.functions.expand_grid.expand_grid], `pd.merge`
25+
and `DataFrame.fillna`. In a way, it is the inverse of `DataFrame.dropna`, as it exposes
26+
implicitly missing rows.
2327
2428
Combinations of column names or a list/tuple of column names, or even a
2529
dictionary of column names and new values are possible.
2630
27-
It can also handle duplicated data.
28-
2931
MultiIndex columns are not supported.
3032
3133
Example:
3234
3335
>>> import pandas as pd
3436
>>> import janitor
37+
>>> import numpy as np
3538
>>> df = pd.DataFrame(
3639
... {
3740
... "Year": [1999, 2000, 2004, 1999, 2004],
@@ -55,7 +58,7 @@ def complete(
5558
5659
Expose missing pairings of `Year` and `Taxon`:
5760
58-
>>> df.complete("Year", "Taxon", sort = True)
61+
>>> df.complete("Year", "Taxon", sort=True)
5962
Year Taxon Abundance
6063
0 1999 Agarum 1.0
6164
1 1999 Saccharina 4.0
@@ -69,7 +72,7 @@ def complete(
6972
>>> df.complete(
7073
... {"Year": range(df.Year.min(), df.Year.max() + 1)},
7174
... "Taxon",
72-
... sort=True,
75+
... sort=True
7376
... )
7477
Year Taxon Abundance
7578
0 1999 Agarum 1.0
@@ -85,6 +88,60 @@ def complete(
8588
10 2004 Agarum 8.0
8689
11 2004 Saccharina 2.0
8790
91+
Fill missing values:
92+
93+
>>> df = pd.DataFrame(
94+
... dict(
95+
... group=(1, 2, 1, 2),
96+
... item_id=(1, 2, 2, 3),
97+
... item_name=("a", "a", "b", "b"),
98+
... value1=(1, np.nan, 3, 4),
99+
... value2=range(4, 8),
100+
... )
101+
... )
102+
>>> df
103+
group item_id item_name value1 value2
104+
0 1 1 a 1.0 4
105+
1 2 2 a NaN 5
106+
2 1 2 b 3.0 6
107+
3 2 3 b 4.0 7
108+
>>> df.complete(
109+
... "group",
110+
... ("item_id", "item_name"),
111+
... fill_value={"value1": 0, "value2": 99},
112+
... sort=True
113+
... )
114+
group item_id item_name value1 value2
115+
0 1 1 a 1 4
116+
1 1 2 a 0 99
117+
2 1 2 b 3 6
118+
3 1 3 b 0 99
119+
4 2 1 a 0 99
120+
5 2 2 a 0 5
121+
6 2 2 b 0 99
122+
7 2 3 b 4 7
123+
124+
Limit the fill to only implicit missing values
125+
by setting explicit to `False`:
126+
127+
>>> df.complete(
128+
... "group",
129+
... ("item_id", "item_name"),
130+
... fill_value={"value1": 0, "value2": 99},
131+
... explicit=False,
132+
... sort=True
133+
... )
134+
group item_id item_name value1 value2
135+
0 1 1 a 1.0 4
136+
1 1 2 a 0.0 99
137+
2 1 2 b 3.0 6
138+
3 1 3 b 0.0 99
139+
4 2 1 a 0.0 99
140+
5 2 2 a NaN 5
141+
6 2 2 b 0.0 99
142+
7 2 3 b 4.0 7
143+
144+
88145
:param df: A pandas DataFrame.
89146
:param *columns: This refers to the columns to be
90147
completed. It could be column labels (string type),
@@ -93,6 +150,13 @@ def complete(
93150
:param sort: Sort DataFrame based on *columns. Default is `False`.
94151
:param by: label or list of labels to group by.
95152
The explicit missing rows are returned per group.
153+
:param fill_value: Scalar value to use instead of NaN
154+
for missing combinations. A dictionary mapping column names
155+
to a scalar value is also accepted.
156+
:param explicit: Determines if only implicitly missing values
157+
should be filled (`False`), or all nulls existing in the dataframe
158+
(`True`). Default is `True`. `explicit` is applicable only
159+
if `fill_value` is not `None`.
96160
:returns: A pandas DataFrame with explicit missing rows, if any.
97161
"""
98162

@@ -101,14 +165,16 @@ def complete(
101165

102166
df = df.copy()
103167

104-
return _computations_complete(df, columns, sort, by)
168+
return _computations_complete(df, columns, sort, by, fill_value, explicit)
105169

106170

107171
def _computations_complete(
108172
df: pd.DataFrame,
109173
columns: List[Union[List, Tuple, Dict, str]],
110-
sort: bool = False,
111-
by: Optional[Union[list, str]] = None,
174+
sort: bool,
175+
by: Optional[Union[list, str]],
176+
fill_value: Optional[Union[Dict, Any]],
177+
explicit: bool,
112178
) -> pd.DataFrame:
113179
"""
114180
This function computes the final output for the `complete` function.
@@ -117,10 +183,14 @@ def _computations_complete(
117183
118184
A DataFrame, with rows of missing values, if any, is returned.
119185
"""
120-
121-
columns, column_checker, sort, by = _data_checks_complete(
122-
df, columns, sort, by
123-
)
186+
(
187+
columns,
188+
column_checker,
189+
sort,
190+
by,
191+
fill_value,
192+
explicit,
193+
) = _data_checks_complete(df, columns, sort, by, fill_value, explicit)
124194

125195
all_strings = True
126196
for column in columns:
@@ -143,17 +213,65 @@ def _computations_complete(
143213
# of course there could be a better way ...
144214
if by is None:
145215
uniques = _generic_complete(df, columns, all_strings)
146-
return df.merge(uniques, how="outer", on=column_checker, sort=sort)
147-
148-
uniques = df.groupby(by)
149-
uniques = uniques.apply(_generic_complete, columns, all_strings)
150-
uniques = uniques.droplevel(-1)
151-
return df.merge(uniques, how="outer", on=by + column_checker, sort=sort)
216+
else:
217+
uniques = df.groupby(by)
218+
uniques = uniques.apply(_generic_complete, columns, all_strings)
219+
uniques = uniques.droplevel(-1)
220+
column_checker = by + column_checker
221+
if fill_value is None:
222+
return df.merge(
223+
uniques, on=column_checker, how="outer", sort=sort, copy=False
224+
)
225+
# if fill_value is present
226+
if is_scalar(fill_value):
227+
# faster when fillna operates on a Series basis
228+
fill_value = {col: fill_value for col in df}
229+
230+
if explicit:
231+
return df.merge(
232+
uniques, on=column_checker, how="outer", sort=sort, copy=False
233+
).fillna(fill_value, downcast="infer")
234+
# keep only columns that are not part of column_checker
235+
# IOW, we are excluding columns that were not used
236+
# to generate the combinations
237+
fill_value = {
238+
col: value
239+
for col, value in fill_value.items()
240+
if col not in column_checker
241+
}
242+
if not fill_value: # there is nothing to fill
243+
return df.merge(
244+
uniques, on=column_checker, how="outer", sort=sort, copy=False
245+
)
152246

247+
# when explicit is False
248+
# filter out rows from `unique` that already exist in the parent dataframe
249+
# fill the null values in the trimmed `unique`
250+
# and merge back to the main dataframe
153251

154-
def _generic_complete(
155-
df: pd.DataFrame, columns: list, all_strings: bool = True
156-
):
252+
# to get a name that does not exist in the columns
253+
indicator = "".join(df.columns)
254+
trimmed = df.loc(axis=1)[column_checker]
255+
uniques = uniques.merge(
256+
trimmed, how="left", sort=False, copy=False, indicator=indicator
257+
)
258+
trimmed = uniques.iloc(axis=1)[-1] == "left_only"
259+
uniques = uniques.loc[trimmed, column_checker]
260+
trimmed = None
261+
indicator = None
262+
# iteration used here, instead of assign (which is also a for loop),
263+
# to cater for scenarios where the column_name is not a string
264+
# assign only works with keys that are strings
265+
for column_name, value in fill_value.items():
266+
uniques[column_name] = value
267+
df = pd.concat([df, uniques], sort=False, copy=False, axis="index")
268+
if sort:
269+
df = df.sort_values(column_checker)
270+
df.index = range(len(df))
271+
return df
272+
273+
274+
def _generic_complete(df: pd.DataFrame, columns: list, all_strings: bool):
157275
"""
158276
Generate cartesian product for `_computations_complete`.
159277
@@ -182,7 +300,7 @@ def _generic_complete(
182300

183301

184302
@functools.singledispatch
185-
def _complete_column(column, df):
303+
def _complete_column(column: str, df):
186304
"""
187305
Args:
188306
column : str/list/dict
@@ -191,22 +309,6 @@ def _complete_column(column, df):
191309
A Pandas Series/DataFrame with no duplicates,
192310
or a list of unique Pandas Series is returned.
193311
"""
194-
raise TypeError(
195-
"""This type is not supported in the `complete` function."""
196-
)
197-
198-
199-
@_complete_column.register(str) # noqa: F811
200-
def _sub_complete_column(column, df): # noqa: F811
201-
"""
202-
Args:
203-
column : str
204-
df: Pandas DataFrame
205-
206-
Returns:
207-
Pandas Series
208-
"""
209-
210312
column = df[column]
211313

212314
if not column.is_unique:
@@ -248,20 +350,14 @@ def _sub_complete_column(column, df): # noqa: F811
248350
for key, value in column.items():
249351
arr = apply_if_callable(value, df[key])
250352
if not is_list_like(arr):
251-
raise ValueError(
252-
f"""
253-
value for {key} should be a 1-D array.
254-
"""
255-
)
353+
raise ValueError(f"value for {key} should be a 1-D array.")
256354
if not hasattr(arr, "shape"):
257355
arr = pd.Series([*arr], name=key)
258356

259357
if not arr.size > 0:
260358
raise ValueError(
261-
f"""
262-
Kindly ensure the provided array for {key}
263-
has at least one value.
264-
"""
359+
f"Kindly ensure the provided array for {key} "
360+
"has at least one value."
265361
)
266362

267363
if isinstance(arr, pd.Index):
@@ -270,11 +366,7 @@ def _sub_complete_column(column, df): # noqa: F811
270366
arr_ndim = arr.ndim
271367

272368
if arr_ndim != 1:
273-
raise ValueError(
274-
f"""
275-
Kindly provide a 1-D array for {key}.
276-
"""
277-
)
369+
raise ValueError(f"Kindly provide a 1-D array for {key}.")
278370

279371
if not isinstance(arr, pd.Series):
280372
arr = pd.Series(arr)
@@ -292,8 +384,10 @@ def _sub_complete_column(column, df): # noqa: F811
292384
def _data_checks_complete(
293385
df: pd.DataFrame,
294386
columns: List[Union[List, Tuple, Dict, str]],
295-
sort: Optional[bool] = False,
296-
by: Optional[Union[list, str]] = None,
387+
sort: Optional[bool],
388+
by: Optional[Union[list, str]],
389+
fill_value: Optional[Union[Dict, Any]],
390+
explicit: bool,
297391
):
298392
"""
299393
Function to check parameters in the `complete` function.
@@ -303,20 +397,16 @@ def _data_checks_complete(
303397
Check is conducted to ensure that column names are not repeated.
304398
Also checks that the names in `columns` actually exist in `df`.
305399
306-
Returns `df`, `columns`, `column_checker`, and `by` if
307-
all checks pass.
400+
Returns `df`, `columns`, `column_checker`, `by`, `fill_value`,
401+
and `explicit` if all checks pass.
308402
"""
309403
# TODO: get `complete` to work on MultiIndex columns,
310404
# if there is sufficient interest with use cases
311405
if isinstance(df.columns, pd.MultiIndex):
312-
raise ValueError(
313-
"""
314-
`complete` does not support MultiIndex columns.
315-
"""
316-
)
406+
raise ValueError("`complete` does not support MultiIndex columns.")
317407

318408
columns = [
319-
list(grouping) if isinstance(grouping, tuple) else grouping
409+
[*grouping] if isinstance(grouping, tuple) else grouping
320410
for grouping in columns
321411
]
322412
column_checker = []
@@ -333,9 +423,7 @@ def _data_checks_complete(
333423
column_checker_no_duplicates = set()
334424
for column in column_checker:
335425
if column in column_checker_no_duplicates:
336-
raise ValueError(
337-
f"""{column} column should be in only one group."""
338-
)
426+
raise ValueError(f"{column} column should be in only one group.")
339427
column_checker_no_duplicates.add(column) # noqa: PD005
340428

341429
check_column(df, column_checker)
@@ -349,4 +437,19 @@ def _data_checks_complete(
349437
check("by", by, [list])
350438
check_column(df, by)
351439

352-
return columns, column_checker, sort, by
440+
check("explicit", explicit, [bool])
441+
442+
fill_value_check = is_scalar(fill_value), isinstance(fill_value, dict)
443+
if not any(fill_value_check):
444+
raise TypeError(
445+
"`fill_value` should either be a dictionary or a scalar value."
446+
)
447+
if fill_value_check[-1]:
448+
check_column(df, fill_value)
449+
for column_name, value in fill_value.items():
450+
if not is_scalar(value):
451+
raise ValueError(
452+
f"The value for {column_name} should be a scalar."
453+
)
454+
455+
return columns, column_checker, sort, by, fill_value, explicit

0 commit comments

Comments
 (0)