1
1
"""Implementation of `impute` function"""
2
- from typing import Any , Hashable , Optional
2
+ from typing import Any , Optional
3
+
3
4
4
- import lazy_loader as lazy
5
- import numpy as np
6
5
import pandas_flavor as pf
7
6
import pandas as pd
8
7
9
8
from janitor .utils import deprecated_alias
10
-
11
- ss = lazy . load ( "scipy.stats" )
9
+ from janitor . functions . utils import get_index_labels
10
+ from itertools import product
12
11
13
12
14
13
@pf .register_dataframe_method
15
14
@deprecated_alias (column = "column_name" )
15
+ @deprecated_alias (column_name = "column_names" )
16
16
@deprecated_alias (statistic = "statistic_column_name" )
17
17
def impute (
18
18
df : pd .DataFrame ,
19
- column_name : Hashable ,
19
+ column_names : Any ,
20
20
value : Optional [Any ] = None ,
21
21
statistic_column_name : Optional [str ] = None ,
22
22
) -> pd .DataFrame :
23
23
"""
24
24
Method-chainable imputation of values in one or more columns.
25
25
26
- This method mutates the original DataFrame.
26
+ This method does not mutate the original DataFrame.
27
27
28
28
Underneath the hood, this function calls the `.fillna()` method available
29
29
to every `pandas.Series` object.
@@ -34,8 +34,11 @@ def impute(
34
34
take on the value provided.
35
35
36
36
If `statistic_column_name` is provided, then all null values in the
37
- selected column will take on the summary statistic value of other non-null
38
- values.
37
+ selected column(s) will take on the summary statistic value
38
+ of other non-null values.
39
+
40
+ Column selection in `column_names` is possible using the
41
+ [`select_columns`][janitor.functions.select.select_columns] syntax.
39
42
40
43
Currently supported statistics include:
41
44
@@ -63,7 +66,7 @@ def impute(
63
66
64
67
Imputing null values with 0 (using the `value` parameter):
65
68
66
- >>> df.impute(column_name ="sales", value=0.0)
69
+ >>> df.impute(column_names ="sales", value=0.0)
67
70
a sales score
68
71
0 1 0.0 NaN
69
72
1 2 0.0 3.0
@@ -72,14 +75,14 @@ def impute(
72
75
Imputing null values with median (using the `statistic_column_name`
73
76
parameter):
74
77
75
- >>> df.impute(column_name ="score", statistic_column_name="median")
78
+ >>> df.impute(column_names ="score", statistic_column_name="median")
76
79
a sales score
77
- 0 1 0.0 2.5
78
- 1 2 0.0 3.0
79
- 2 3 0.0 2.0
80
+ 0 1 NaN 2.5
81
+ 1 2 NaN 3.0
82
+ 2 3 NaN 2.0
80
83
81
84
:param df: A pandas DataFrame.
82
- :param column_name : The name of the column on which to impute values.
85
+ :param column_names : The name of the column(s) on which to impute values.
83
86
:param value: The value used for imputation, passed into `.fillna` method
84
87
of the underlying pandas Series.
85
88
:param statistic_column_name: The column statistic to impute.
@@ -90,42 +93,46 @@ def impute(
90
93
`average`, `median`, `mode`, `minimum`, `min`, `maximum`, or `max`.
91
94
"""
92
95
# First, check that only one of `value` or `statistic_column_name` is provided.
96
+ if (value is None ) and (statistic_column_name is None ):
97
+ raise ValueError ("Kindly specify a value or a statistic_column_name" )
98
+
93
99
if value is not None and statistic_column_name is not None :
94
100
raise ValueError (
95
101
"Only one of `value` or `statistic_column_name` should be "
96
102
"provided."
97
103
)
98
104
99
- # If statistic is provided, then we compute the relevant summary statistic
100
- # from the other data.
101
- funcs = {
102
- "mean" : np .mean ,
103
- "average" : np .mean , # aliased
104
- "median" : np .median ,
105
- "mode" : ss .mode ,
106
- "minimum" : np .min ,
107
- "min" : np .min , # aliased
108
- "maximum" : np .max ,
109
- "max" : np .max , # aliased
110
- }
111
- if statistic_column_name is not None :
105
+ column_names = get_index_labels ([column_names ], df , axis = "columns" )
106
+
107
+ if value is not None :
108
+ value = dict (product (column_names , [value ]))
109
+
110
+ else :
111
+ # If statistic is provided, then we compute
112
+ # the relevant summary statistic
113
+ # from the other data.
114
+ funcs = {
115
+ "mean" : "mean" ,
116
+ "average" : "mean" , # aliased
117
+ "median" : "median" ,
118
+ "mode" : "mode" ,
119
+ "minimum" : "min" ,
120
+ "min" : "min" , # aliased
121
+ "maximum" : "max" ,
122
+ "max" : "max" , # aliased
123
+ }
112
124
# Check that the statistic keyword argument is one of the approved.
113
125
if statistic_column_name not in funcs :
114
126
raise KeyError (
115
127
f"`statistic_column_name` must be one of { funcs .keys ()} ."
116
128
)
117
129
118
- value = funcs [statistic_column_name ](
119
- df [ column_name ]. dropna (). to_numpy ()
120
- )
121
- # special treatment for mode, because scipy stats mode returns a
122
- # moderesult object.
130
+ value = dict ( product ( column_names , [ funcs [statistic_column_name ]]))
131
+
132
+ value = df . agg ( value )
133
+
134
+ # special treatment for mode
123
135
if statistic_column_name == "mode" :
124
- value = value . mode [0 ]
136
+ value = { key : val . at [0 ] for key , val in value . items ()}
125
137
126
- # The code is architected this way - if `value` is not provided but
127
- # statistic is, we then overwrite the None value taken on by `value`, and
128
- # use it to set the imputation column.
129
- if value is not None :
130
- df [column_name ] = df [column_name ].fillna (value )
131
- return df
138
+ return df .fillna (value = value )
0 commit comments