@@ -15,6 +15,9 @@ def __init__(self, train_df: DataFrame = None, test_df: DataFrame = None):
15
15
self .train_encoded_df = None
16
16
self .test_encoded_df = None
17
17
18
+ self .one_hot_suffix = '_vec'
19
+ self .indexed_suffix = "_cat"
20
+
18
21
19
22
def explore_factors (self ):
20
23
"""Generates a dictionary of one pandas dataframe per column
@@ -88,22 +91,33 @@ def assemble_features(self, *columns, out_name='features'):
88
91
def prepare_to_model(self, target_col: str, to_strip=' '):
    """Run all cleaning and encoding steps to produce model-ready dataframes.

    Steps, in order:

    1. ``strip_columns`` is applied to every factor column with ``to_strip``.
    2. ``string_index`` indexes every factor column, naming the outputs
       with ``self.indexed_suffix``.
    3. ``one_hot_encode`` encodes the indexed factors, except the target,
       naming the outputs with ``self.one_hot_suffix``.
    4. ``assemble_features`` combines the numeric and one-hot encoded
       columns (target excluded).

    The results are stored in ``self.train_encoded_df`` and
    ``self.test_encoded_df``; nothing is returned.

    Args:
        target_col: Name of the column to be predicted. If it is one of
            ``self.factors``, its indexed counterpart
            (``target_col + self.indexed_suffix``) is used when selecting
            the final dataframes.
        to_strip: Value forwarded to ``strip_columns`` (presumably the
            characters to strip from string values; confirm against
            ``strip_columns``).
    """
    self.strip_columns(*self.factors, to_strip=to_strip)
    self.string_index(*self.factors, suffix=self.indexed_suffix)

    # one-hot encode indexed factors, except target
    self.one_hot_encode(
        *self._one_hot_encode_columns(target_col),
        suffix=self.one_hot_suffix,
    )

    # assemble all together with numeric columns into features
    # (except target if it's numeric)
    self.assemble_features(*self._columns_to_assemble(target_col))

    # The suffix is appended only now: the helpers above compare columns
    # against the raw target name, so renaming earlier would break them.
    if target_col in self.factors:
        target_col += self.indexed_suffix

    self.train_encoded_df = self._select_to_model(self.train_df, target_col)
    self.test_encoded_df = self._select_to_model(self.test_df, target_col)
105
107
106
108
109
def _one_hot_encode_columns(self, target_col):
    """Return the indexed factor columns that should be one-hot encoded.

    Every name in ``self.factors`` other than ``target_col`` is included,
    with ``self.indexed_suffix`` appended.

    Args:
        target_col: Raw (un-suffixed) name of the target column; it is
            left out of the result.

    Returns:
        A list of column names of the form ``factor + self.indexed_suffix``.
    """
    suffix = self.indexed_suffix
    return [fac + suffix for fac in self.factors if fac != target_col]
111
+
112
+
113
def _columns_to_assemble(self, target_col):
    """Return the column names to combine into the features column.

    The result is the numeric columns followed by the one-hot encoded
    columns.

    Args:
        target_col: Raw (un-suffixed) name of the target column; it is
            excluded from the numeric columns.

    Returns:
        A list of column names: first the entries of
        ``self.numeric_columns`` that are neither the target nor end with
        ``self.indexed_suffix``, then every ``self.train_df`` column whose
        name contains ``self.indexed_suffix + self.one_hot_suffix``.
    """
    # Columns ending with the indexed suffix are skipped here; they are
    # represented by their one-hot encoded counterparts instead (or are
    # the indexed target itself).
    numeric = [
        col for col in self.numeric_columns
        if col != target_col and not col.endswith(self.indexed_suffix)
    ]

    # NOTE(review): assumes ``train_df.dtypes`` iterates as (name, type)
    # pairs, as the original unpacking implies -- confirm the dataframe type.
    encoded_marker = self.indexed_suffix + self.one_hot_suffix
    one_hot_encoded = [
        col for col, _ in self.train_df.dtypes
        if encoded_marker in col
    ]
    return numeric + one_hot_encoded
119
+
120
+
107
121
@property
def factors(self) -> List[str]:
    """Names of the columns selected by ``_get_cols_by_types(['string'])``.

    Presumably these are the string-typed (categorical) columns; confirm
    against ``_get_cols_by_types``.
    """
    return self._get_cols_by_types(types=['string'])
0 commit comments