@@ -58,33 +58,45 @@ def run(self, data):
58
58
clean_df = self ._format_datetime_col (clean_df )
59
59
clean_df = self ._set_multi_index (clean_df )
60
60
61
- if self .name == "historical_data" :
62
- try :
63
- clean_df = self ._missing_value_imputation_hist (clean_df )
64
- except Exception as e :
65
- logger .debug (f"Missing value imputation failed with { e .args } " )
66
- if self .preprocessing :
67
- try :
68
- clean_df = self ._outlier_treatment (clean_df )
69
- except Exception as e :
70
- logger .debug (f"Outlier Treatment failed with { e .args } " )
71
- else :
72
- logger .debug ("Skipping outlier treatment as preprocessing is disabled" )
73
- elif self .name == "additional_data" :
74
- clean_df = self ._missing_value_imputation_add (clean_df )
61
+ if self .preprocessing and self .preprocessing .enabled :
62
+ if self .name == "historical_data" :
63
+ if self .preprocessing .steps .missing_value_imputation :
64
+ try :
65
+ clean_df = self ._missing_value_imputation_hist (clean_df )
66
+ except Exception as e :
67
+ logger .debug (f"Missing value imputation failed with { e .args } " )
68
+ else :
69
+ logger .info ("Skipping missing value imputation because it is disabled" )
70
+ if self .preprocessing .steps .outlier_treatment :
71
+ try :
72
+ clean_df = self ._outlier_treatment (clean_df )
73
+ except Exception as e :
74
+ logger .debug (f"Outlier Treatment failed with { e .args } " )
75
+ else :
76
+ logger .info ("Skipping outlier treatment because it is disabled" )
77
+ elif self .name == "additional_data" :
78
+ clean_df = self ._missing_value_imputation_add (clean_df )
79
+ else :
80
+ logger .info ("Skipping all preprocessing steps because preprocessing is disabled" )
75
81
return clean_df
76
82
77
83
def _remove_trailing_whitespace (self , df ):
78
84
return df .apply (lambda x : x .str .strip () if x .dtype == "object" else x )
79
85
80
86
def _set_series_id_column(self, df):
    """Add the series-id column to *df* and record the category-column map.

    When no target category columns are configured, a single artificial
    series ("Series 1") is created and ``has_artificial_series`` is set.
    Otherwise the category columns are merged into one series column via
    ``merge_category_columns``, ``_target_category_columns_map`` is filled
    with {merged value -> {category column -> original value}}, and the raw
    category columns are dropped from the frame.

    Returns the modified DataFrame.
    """
    self._target_category_columns_map = dict()
    if not self.target_category_columns:
        df[DataColumns.Series] = "Series 1"
        self.has_artificial_series = True
    else:
        df[DataColumns.Series] = merge_category_columns(
            df, self.target_category_columns
        )
        # Build the map in one pass over de-duplicated rows instead of
        # re-filtering the entire frame once per distinct series value
        # (the previous inner `if self.target_category_columns` check was
        # redundant here: this branch already guarantees it is truthy).
        unique_rows = df[
            [DataColumns.Series] + self.target_category_columns
        ].drop_duplicates(subset=[DataColumns.Series])
        for _, row in unique_rows.iterrows():
            self._target_category_columns_map[row[DataColumns.Series]] = row[
                self.target_category_columns
            ].to_dict()
        df = df.drop(self.target_category_columns, axis=1)
    return df
90
102
@@ -189,3 +201,25 @@ def _check_historical_dataset(self, df):
189
201
raise DataMismatchError (
190
202
f"Expected { self .name } to have columns: { expected_names } , but instead found column names: { df .columns } . Is the { self .name } path correct?"
191
203
)
204
def get_target_category_columns_map(self):
    """Return the map from merged series values to their source category values.

    The map relates each merged target-category value to the individual
    target category columns and their original values. For example, if the
    target category columns are PPG_Code, Class and Num, and the merged
    values are "Product Category 1__A__1" and "Product Category 2__A__2",
    the returned map is::

        {
            "Product Category 1__A__1": {
                "PPG_Code": "Product Category 1",
                "Class": "A",
                "Num": 1,
            },
            "Product Category 2__A__2": {
                "PPG_Code": "Product Category 2",
                "Class": "A",
                "Num": 2,
            },
        }

    NOTE(review): this text previously sat as a bare string *before* the
    def, where it was a no-op expression and never bound to ``__doc__``;
    it is now a proper docstring.
    """
    return self._target_category_columns_map
0 commit comments