1
- from typing import Optional , Union , List , Tuple , Dict
1
+ from typing import Optional , Union , List , Tuple , Dict , Any
2
2
from pandas .core .common import apply_if_callable
3
3
import pandas_flavor as pf
4
4
import pandas as pd
5
5
import functools
6
- from pandas .api .types import is_list_like
6
+ from pandas .api .types import is_list_like , is_scalar
7
7
8
8
from janitor .utils import check , check_column
9
9
@@ -16,22 +16,25 @@ def complete(
16
16
* columns ,
17
17
sort : bool = False ,
18
18
by : Optional [Union [list , str ]] = None ,
19
+ fill_value : Optional [Union [Dict , Any ]] = None ,
20
+ explicit : bool = True ,
19
21
) -> pd .DataFrame :
20
22
"""
21
23
It is modeled after tidyr's `complete` function, and is a wrapper around
22
- [`expand_grid`][janitor.functions.expand_grid.expand_grid] and `pd.merge`.
24
+ [`expand_grid`][janitor.functions.expand_grid.expand_grid], `pd.merge`
25
+ and `pd.fillna`. In a way, it is the inverse of `pd.dropna`, as it exposes
26
+ implicitly missing rows.
23
27
24
28
Combinations of column names or a list/tuple of column names, or even a
25
29
dictionary of column names and new values are possible.
26
30
27
- It can also handle duplicated data.
28
-
29
31
MultiIndex columns are not supported.
30
32
31
33
Example:
32
34
33
35
>>> import pandas as pd
34
36
>>> import janitor
37
+ >>> import numpy as np
35
38
>>> df = pd.DataFrame(
36
39
... {
37
40
... "Year": [1999, 2000, 2004, 1999, 2004],
@@ -55,7 +58,7 @@ def complete(
55
58
56
59
Expose missing pairings of `Year` and `Taxon`:
57
60
58
- >>> df.complete("Year", "Taxon", sort = True)
61
+ >>> df.complete("Year", "Taxon", sort=True)
59
62
Year Taxon Abundance
60
63
0 1999 Agarum 1.0
61
64
1 1999 Saccharina 4.0
@@ -69,7 +72,7 @@ def complete(
69
72
>>> df.complete(
70
73
... {"Year": range(df.Year.min(), df.Year.max() + 1)},
71
74
... "Taxon",
72
- ... sort=True,
75
+ ... sort=True
73
76
... )
74
77
Year Taxon Abundance
75
78
0 1999 Agarum 1.0
@@ -85,6 +88,60 @@ def complete(
85
88
10 2004 Agarum 8.0
86
89
11 2004 Saccharina 2.0
87
90
91
+ Fill missing values:
92
+
93
+ >>> df = pd.DataFrame(
94
+ ... dict(
95
+ ... group=(1, 2, 1, 2),
96
+ ... item_id=(1, 2, 2, 3),
97
+ ... item_name=("a", "a", "b", "b"),
98
+ ... value1=(1, np.nan, 3, 4),
99
+ ... value2=range(4, 8),
100
+ ... )
101
+ ... )
102
+ >>> df
103
+ group item_id item_name value1 value2
104
+ 0 1 1 a 1.0 4
105
+ 1 2 2 a NaN 5
106
+ 2 1 2 b 3.0 6
107
+ 3 2 3 b 4.0 7
108
+ >>> df.complete(
109
+ ... "group",
110
+ ... ("item_id", "item_name"),
111
+ ... fill_value={"value1": 0, "value2": 99},
112
+ ... sort=True
113
+ ... )
114
+ group item_id item_name value1 value2
115
+ 0 1 1 a 1 4
116
+ 1 1 2 a 0 99
117
+ 2 1 2 b 3 6
118
+ 3 1 3 b 0 99
119
+ 4 2 1 a 0 99
120
+ 5 2 2 a 0 5
121
+ 6 2 2 b 0 99
122
+ 7 2 3 b 4 7
123
+
124
+ Limit the fill to only implicit missing values
125
+ by setting explicit to `False`:
126
+
127
+ >>> df.complete(
128
+ ... "group",
129
+ ... ("item_id", "item_name"),
130
+ ... fill_value={"value1": 0, "value2": 99},
131
+ ... explicit=False,
132
+ ... sort=True
133
+ ... )
134
+ group item_id item_name value1 value2
135
+ 0 1 1 a 1.0 4
136
+ 1 1 2 a 0.0 99
137
+ 2 1 2 b 3.0 6
138
+ 3 1 3 b 0.0 99
139
+ 4 2 1 a 0.0 99
140
+ 5 2 2 a NaN 5
141
+ 6 2 2 b 0.0 99
142
+ 7 2 3 b 4.0 7
143
+
144
+
88
145
:param df: A pandas DataFrame.
89
146
:param *columns: This refers to the columns to be
90
147
completed. It could be column labels (string type),
@@ -93,6 +150,13 @@ def complete(
93
150
:param sort: Sort DataFrame based on *columns. Default is `False`.
94
151
:param by: label or list of labels to group by.
95
152
The explicit missing rows are returned per group.
153
+ :param fill_value: Scalar value to use instead of NaN
154
for missing combinations. A dictionary, mapping column names
154
to a scalar value is also accepted.
156
+ :param explicit: Determines if only implicitly missing values
157
+ should be filled (`False`), or all nulls existing in the dataframe
158
+ (`True`). Default is `True`. `explicit` is applicable only
159
+ if `fill_value` is not `None`.
96
160
:returns: A pandas DataFrame with explicit missing rows, if any.
97
161
"""
98
162
@@ -101,14 +165,16 @@ def complete(
101
165
102
166
df = df .copy ()
103
167
104
- return _computations_complete (df , columns , sort , by )
168
+ return _computations_complete (df , columns , sort , by , fill_value , explicit )
105
169
106
170
107
171
def _computations_complete (
108
172
df : pd .DataFrame ,
109
173
columns : List [Union [List , Tuple , Dict , str ]],
110
- sort : bool = False ,
111
- by : Optional [Union [list , str ]] = None ,
174
+ sort : bool ,
175
+ by : Optional [Union [list , str ]],
176
+ fill_value : Optional [Union [Dict , Any ]],
177
+ explicit : bool ,
112
178
) -> pd .DataFrame :
113
179
"""
114
180
This function computes the final output for the `complete` function.
@@ -117,10 +183,14 @@ def _computations_complete(
117
183
118
184
A DataFrame, with rows of missing values, if any, is returned.
119
185
"""
120
-
121
- columns , column_checker , sort , by = _data_checks_complete (
122
- df , columns , sort , by
123
- )
186
+ (
187
+ columns ,
188
+ column_checker ,
189
+ sort ,
190
+ by ,
191
+ fill_value ,
192
+ explicit ,
193
+ ) = _data_checks_complete (df , columns , sort , by , fill_value , explicit )
124
194
125
195
all_strings = True
126
196
for column in columns :
@@ -143,17 +213,65 @@ def _computations_complete(
143
213
# of course there could be a better way ...
144
214
if by is None :
145
215
uniques = _generic_complete (df , columns , all_strings )
146
- return df .merge (uniques , how = "outer" , on = column_checker , sort = sort )
147
-
148
- uniques = df .groupby (by )
149
- uniques = uniques .apply (_generic_complete , columns , all_strings )
150
- uniques = uniques .droplevel (- 1 )
151
- return df .merge (uniques , how = "outer" , on = by + column_checker , sort = sort )
216
+ else :
217
+ uniques = df .groupby (by )
218
+ uniques = uniques .apply (_generic_complete , columns , all_strings )
219
+ uniques = uniques .droplevel (- 1 )
220
+ column_checker = by + column_checker
221
+ if fill_value is None :
222
+ return df .merge (
223
+ uniques , on = column_checker , how = "outer" , sort = sort , copy = False
224
+ )
225
+ # if fill_value is present
226
+ if is_scalar (fill_value ):
227
+ # faster when fillna operates on a Series basis
228
+ fill_value = {col : fill_value for col in df }
229
+
230
+ if explicit :
231
+ return df .merge (
232
+ uniques , on = column_checker , how = "outer" , sort = sort , copy = False
233
+ ).fillna (fill_value , downcast = "infer" )
234
+ # keep only columns that are not part of column_checker
235
+ # IOW, we are excluding columns that were not used
236
+ # to generate the combinations
237
+ fill_value = {
238
+ col : value
239
+ for col , value in fill_value .items ()
240
+ if col not in column_checker
241
+ }
242
+ if not fill_value : # there is nothing to fill
243
+ return df .merge (
244
+ uniques , on = column_checker , how = "outer" , sort = sort , copy = False
245
+ )
152
246
247
+ # when explicit is False
248
+ # filter out rows from `unique` that already exist in the parent dataframe
249
+ # fill the null values in the trimmed `unique`
250
+ # and merge back to the main dataframe
153
251
154
- def _generic_complete (
155
- df : pd .DataFrame , columns : list , all_strings : bool = True
156
- ):
252
+ # to get a name that does not exist in the columns
253
+ indicator = "" .join (df .columns )
254
+ trimmed = df .loc (axis = 1 )[column_checker ]
255
+ uniques = uniques .merge (
256
+ trimmed , how = "left" , sort = False , copy = False , indicator = indicator
257
+ )
258
+ trimmed = uniques .iloc (axis = 1 )[- 1 ] == "left_only"
259
+ uniques = uniques .loc [trimmed , column_checker ]
260
+ trimmed = None
261
+ indicator = None
262
+ # iteration used here, instead of assign (which is also a for loop),
263
+ # to cater for scenarios where the column_name is not a string
264
+ # assign only works with keys that are strings
265
+ for column_name , value in fill_value .items ():
266
+ uniques [column_name ] = value
267
+ df = pd .concat ([df , uniques ], sort = False , copy = False , axis = "index" )
268
+ if sort :
269
+ df = df .sort_values (column_checker )
270
+ df .index = range (len (df ))
271
+ return df
272
+
273
+
274
+ def _generic_complete (df : pd .DataFrame , columns : list , all_strings : bool ):
157
275
"""
158
276
Generate cartesian product for `_computations_complete`.
159
277
@@ -182,7 +300,7 @@ def _generic_complete(
182
300
183
301
184
302
@functools .singledispatch
185
- def _complete_column (column , df ):
303
+ def _complete_column (column : str , df ):
186
304
"""
187
305
Args:
188
306
column : str/list/dict
@@ -191,22 +309,6 @@ def _complete_column(column, df):
191
309
A Pandas Series/DataFrame with no duplicates,
192
310
or a list of unique Pandas Series is returned.
193
311
"""
194
- raise TypeError (
195
- """This type is not supported in the `complete` function."""
196
- )
197
-
198
-
199
- @_complete_column .register (str ) # noqa: F811
200
- def _sub_complete_column (column , df ): # noqa: F811
201
- """
202
- Args:
203
- column : str
204
- df: Pandas DataFrame
205
-
206
- Returns:
207
- Pandas Series
208
- """
209
-
210
312
column = df [column ]
211
313
212
314
if not column .is_unique :
@@ -248,20 +350,14 @@ def _sub_complete_column(column, df): # noqa: F811
248
350
for key , value in column .items ():
249
351
arr = apply_if_callable (value , df [key ])
250
352
if not is_list_like (arr ):
251
- raise ValueError (
252
- f"""
253
- value for { key } should be a 1-D array.
254
- """
255
- )
353
+ raise ValueError (f"value for { key } should be a 1-D array." )
256
354
if not hasattr (arr , "shape" ):
257
355
arr = pd .Series ([* arr ], name = key )
258
356
259
357
if not arr .size > 0 :
260
358
raise ValueError (
261
- f"""
262
- Kindly ensure the provided array for { key }
263
- has at least one value.
264
- """
359
+ f"Kindly ensure the provided array for { key } "
360
+ "has at least one value."
265
361
)
266
362
267
363
if isinstance (arr , pd .Index ):
@@ -270,11 +366,7 @@ def _sub_complete_column(column, df): # noqa: F811
270
366
arr_ndim = arr .ndim
271
367
272
368
if arr_ndim != 1 :
273
- raise ValueError (
274
- f"""
275
- Kindly provide a 1-D array for { key } .
276
- """
277
- )
369
+ raise ValueError (f"Kindly provide a 1-D array for { key } ." )
278
370
279
371
if not isinstance (arr , pd .Series ):
280
372
arr = pd .Series (arr )
@@ -292,8 +384,10 @@ def _sub_complete_column(column, df): # noqa: F811
292
384
def _data_checks_complete (
293
385
df : pd .DataFrame ,
294
386
columns : List [Union [List , Tuple , Dict , str ]],
295
- sort : Optional [bool ] = False ,
296
- by : Optional [Union [list , str ]] = None ,
387
+ sort : Optional [bool ],
388
+ by : Optional [Union [list , str ]],
389
+ fill_value : Optional [Union [Dict , Any ]],
390
+ explicit : bool ,
297
391
):
298
392
"""
299
393
Function to check parameters in the `complete` function.
@@ -303,20 +397,16 @@ def _data_checks_complete(
303
397
Check is conducted to ensure that column names are not repeated.
304
398
Also checks that the names in `columns` actually exist in `df`.
305
399
306
- Returns `df`, `columns`, `column_checker`, and `by` if
307
- all checks pass.
400
+ Returns `df`, `columns`, `column_checker`, `by`, `fill_value`,
401
+ and `explicit` if all checks pass.
308
402
"""
309
403
# TODO: get `complete` to work on MultiIndex columns,
310
404
# if there is sufficient interest with use cases
311
405
if isinstance (df .columns , pd .MultiIndex ):
312
- raise ValueError (
313
- """
314
- `complete` does not support MultiIndex columns.
315
- """
316
- )
406
+ raise ValueError ("`complete` does not support MultiIndex columns." )
317
407
318
408
columns = [
319
- list ( grouping ) if isinstance (grouping , tuple ) else grouping
409
+ [ * grouping ] if isinstance (grouping , tuple ) else grouping
320
410
for grouping in columns
321
411
]
322
412
column_checker = []
@@ -333,9 +423,7 @@ def _data_checks_complete(
333
423
column_checker_no_duplicates = set ()
334
424
for column in column_checker :
335
425
if column in column_checker_no_duplicates :
336
- raise ValueError (
337
- f"""{ column } column should be in only one group."""
338
- )
426
+ raise ValueError (f"{ column } column should be in only one group." )
339
427
column_checker_no_duplicates .add (column ) # noqa: PD005
340
428
341
429
check_column (df , column_checker )
@@ -349,4 +437,19 @@ def _data_checks_complete(
349
437
check ("by" , by , [list ])
350
438
check_column (df , by )
351
439
352
- return columns , column_checker , sort , by
440
+ check ("explicit" , explicit , [bool ])
441
+
442
+ fill_value_check = is_scalar (fill_value ), isinstance (fill_value , dict )
443
+ if not any (fill_value_check ):
444
+ raise TypeError (
445
+ "`fill_value` should either be a dictionary or a scalar value."
446
+ )
447
+ if fill_value_check [- 1 ]:
448
+ check_column (df , fill_value )
449
+ for column_name , value in fill_value .items ():
450
+ if not is_scalar (value ):
451
+ raise ValueError (
452
+ f"The value for { column_name } should be a scalar."
453
+ )
454
+
455
+ return columns , column_checker , sort , by , fill_value , explicit
0 commit comments