# Stacking_blending.py
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import KFold, train_test_split


class StackingEnsemble:
"""
A robust ensemble class for building multi-layer stacking and blending models,
particularly designed for regression tasks.
This class supports both stacking (using K-fold out-of-fold predictions) and
blending (using a hold-out validation set). It includes input validation and
error handling to provide informative messages when issues arise.
Parameters:
-----------
layers : list of lists
Each element should be a list of scikit-learn compatible models for that layer.
Each model must implement fit() and predict() methods.
meta_model : estimator
A scikit-learn compatible model used to combine the outputs from the final layer.
n_folds : int, default=5
Number of folds for generating out-of-fold predictions (used in stacking mode).
blending : bool, default=False
If True, use blending (hold-out approach) instead of stacking.
blend_size : float, default=0.2
Proportion of the training data to hold out for blending (only used if blending=True).
random_state : int, default=None
Seed for reproducibility.
"""
def __init__(self, layers, meta_model, n_folds=5, blending=False, blend_size=0.2, random_state=None):
# Validate layers: should be a non-empty list of non-empty lists
if not isinstance(layers, list) or not layers or not all(isinstance(l, list) and l for l in layers):
raise ValueError("`layers` must be a non-empty list of non-empty lists of models.")
# Validate that each model in layers implements fit and predict
for layer in layers:
for model in layer:
if not (hasattr(model, "fit") and hasattr(model, "predict")):
raise ValueError("Each model in layers must implement fit() and predict().")
# Validate meta_model
if not (hasattr(meta_model, "fit") and hasattr(meta_model, "predict")):
raise ValueError("`meta_model` must implement fit() and predict().")
# Validate n_folds and blend_size
if not isinstance(n_folds, int) or n_folds < 2:
raise ValueError("`n_folds` must be an integer greater than or equal to 2.")
if blending:
if not (0.0 < blend_size < 1.0):
raise ValueError("`blend_size` must be a float between 0 and 1.")
self.layers = layers
self.meta_model = meta_model
self.n_folds = n_folds
self.blending = blending
self.blend_size = blend_size
self.random_state = random_state
self.layer_models_ = []

    def fit(self, X, y):
"""
Fit the ensemble using the training data X and target y.
Parameters:
-----------
        X : pandas.DataFrame or numpy.ndarray
            Feature matrix.
        y : pandas.Series or numpy.ndarray
            Target vector.
Returns:
--------
self : object
Fitted estimator.
"""
# Validate X and y types and dimensions
if not isinstance(X, (pd.DataFrame, np.ndarray)):
raise TypeError("X must be a pandas DataFrame or a numpy array.")
if not isinstance(y, (pd.Series, np.ndarray, list)):
raise TypeError("y must be a pandas Series, numpy array, or list.")
try:
if isinstance(X, np.ndarray):
X = pd.DataFrame(X)
if not hasattr(y, 'iloc'):
y = pd.Series(y)
except Exception as e:
raise ValueError(f"Error converting inputs to pandas objects: {e}")
if X.shape[0] != y.shape[0]:
raise ValueError("The number of samples in X and y must be the same.")
# For blending, split the data into training and hold-out sets.
if self.blending:
try:
X_train, X_hold, y_train, y_hold = train_test_split(
X, y, test_size=self.blend_size, random_state=self.random_state
)
except Exception as e:
raise RuntimeError(f"Error during train/hold split: {e}")
else:
X_current = X.copy()
        # Reset any previously fitted layer models so fit() can be called repeatedly.
        self.layer_models_ = []
        # Process each layer
for layer_idx, layer in enumerate(self.layers):
n_models = len(layer)
fitted_models = []
if self.blending:
# Prepare arrays to store predictions on training and hold-out sets.
train_preds = np.zeros((X_train.shape[0], n_models))
hold_preds = np.zeros((X_hold.shape[0], n_models))
                for model_idx, model in enumerate(layer):
                    # Train on the blend-training split; the same fitted model supplies
                    # both the hold-out and the training-set meta-features, so cloning
                    # and refitting on identical data is unnecessary.
                    try:
                        model.fit(X_train, y_train)
                    except Exception as e:
                        raise RuntimeError(f"Error training model at layer {layer_idx+1}, model {model_idx+1}: {e}")
                    try:
                        hold_preds[:, model_idx] = model.predict(X_hold)
                    except Exception as e:
                        raise RuntimeError(f"Error in blending at layer {layer_idx+1}, model {model_idx+1} (hold-out): {e}")
                    fitted_models.append(model)
# Get training set predictions.
try:
train_preds[:, model_idx] = model.predict(X_train)
except Exception as e:
raise RuntimeError(f"Error predicting on training set at layer {layer_idx+1}, model {model_idx+1}: {e}")
# Update X_train and X_hold to be the meta-features for the next layer.
X_train = pd.DataFrame(
train_preds,
columns=[f"Layer{layer_idx+1}_Model{m+1}" for m in range(n_models)]
)
X_hold = pd.DataFrame(
hold_preds,
columns=[f"Layer{layer_idx+1}_Model{m+1}" for m in range(n_models)]
)
else:
# Stacking mode using K-Fold out-of-fold predictions.
oof_preds = np.zeros((X_current.shape[0], n_models))
kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_state)
for model_idx, model in enumerate(layer):
oof_model_preds = np.zeros(X_current.shape[0])
for fold, (train_idx, valid_idx) in enumerate(kf.split(X_current)):
try:
X_train_fold = X_current.iloc[train_idx]
X_valid_fold = X_current.iloc[valid_idx]
y_train_fold = y.iloc[train_idx]
except Exception as e:
raise RuntimeError(f"Error splitting data in fold {fold+1} of layer {layer_idx+1}: {e}")
try:
cloned_model = clone(model)
cloned_model.fit(X_train_fold, y_train_fold)
oof_model_preds[valid_idx] = cloned_model.predict(X_valid_fold)
except Exception as e:
raise RuntimeError(f"Error in fold {fold+1} at layer {layer_idx+1}, model {model_idx+1}: {e}")
oof_preds[:, model_idx] = oof_model_preds
try:
model.fit(X_current, y)
except Exception as e:
raise RuntimeError(f"Error training full-data model at layer {layer_idx+1}, model {model_idx+1}: {e}")
fitted_models.append(model)
X_current = pd.DataFrame(
oof_preds,
columns=[f"Layer{layer_idx+1}_Model{m+1}" for m in range(n_models)]
)
self.layer_models_.append(fitted_models)
# Train the meta model using the final layer's predictions.
try:
if self.blending:
self.meta_model.fit(X_hold, y_hold)
else:
self.meta_model.fit(X_current, y)
except Exception as e:
raise RuntimeError(f"Error training meta model: {e}")
return self

    def predict(self, X):
"""
Make predictions using the fitted ensemble.
Parameters:
-----------
        X : pandas.DataFrame or numpy.ndarray
            Feature matrix.
        Returns:
        --------
        y_pred : numpy.ndarray
            Predicted values.
"""
if not isinstance(X, (pd.DataFrame, np.ndarray)):
raise TypeError("X must be a pandas DataFrame or a numpy array.")
try:
if isinstance(X, np.ndarray):
X = pd.DataFrame(X)
except Exception as e:
raise ValueError(f"Error converting X to DataFrame: {e}")
X_current = X.copy()
for layer_idx, fitted_models in enumerate(self.layer_models_):
try:
layer_preds = np.column_stack([model.predict(X_current) for model in fitted_models])
except Exception as e:
raise RuntimeError(f"Error during prediction at layer {layer_idx+1}: {e}")
X_current = pd.DataFrame(
layer_preds,
columns=[f"Layer{layer_idx+1}_Model{m+1}" for m in range(len(fitted_models))]
)
try:
return self.meta_model.predict(X_current)
except Exception as e:
raise RuntimeError(f"Error during meta model prediction: {e}")

    def print_structure(self):
"""
Prints the entire stacking model structure in a detailed tree format,
including model names and only explicitly changed parameters.
"""
print("\nStacking Model Structure:")
print("└── Meta Model: ", self.meta_model.__class__.__name__)
print(" │ Parameters:", self._get_changed_params(self.meta_model))
for layer_idx, layer_models in enumerate(self.layers):
print(f" ├── Layer {layer_idx + 1}:")
for model_idx, model in enumerate(layer_models):
print(f" │ ├── Model {model_idx + 1}: {model.__class__.__name__}")
print(f" │ │ Parameters: {self._get_changed_params(model)}")
print("Blending Enabled: ", self.blending)

    def _get_changed_params(self, model):
"""
Returns only the parameters that were explicitly changed by the user.
This function manually compares the default parameters with the user-set parameters.
"""
        # Get the default parameters by instantiating the model's class with no arguments.
        model_class = model.__class__
        try:
            default_params = model_class().get_params()
        except Exception:
            # Some estimators cannot be constructed without arguments;
            # fall back to reporting the current parameters as-is.
            return model.get_params()
# Get the user-defined parameters
current_params = model.get_params()
# Identify the changed parameters (non-default)
changed_params = {}
for param, value in current_params.items():
if param in default_params and value != default_params[param]:
changed_params[param] = value
# Return the changed parameters, or indicate if none have been changed
return changed_params if changed_params else "No changes (using defaults)"
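

if __name__ == "__main__":
    # Minimal usage sketch (added for illustration, not part of the original module).
    # It builds a two-layer stacking ensemble on synthetic regression data; the base
    # learners, meta model, and hyperparameters below are arbitrary illustrative choices.
    from sklearn.datasets import make_regression
    from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
    from sklearn.linear_model import LinearRegression, Ridge
    from sklearn.metrics import mean_squared_error

    X_demo, y_demo = make_regression(n_samples=500, n_features=20, noise=0.3, random_state=42)
    X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.25, random_state=42)

    ensemble = StackingEnsemble(
        layers=[
            [RandomForestRegressor(n_estimators=100, random_state=42),
             GradientBoostingRegressor(random_state=42)],
            [Ridge(alpha=1.0)],
        ],
        meta_model=LinearRegression(),
        n_folds=5,
        blending=False,  # set True to use the hold-out (blending) strategy instead
        random_state=42,
    )
    ensemble.fit(X_tr, y_tr)
    preds = ensemble.predict(X_te)
    print("Hold-out test MSE:", mean_squared_error(y_te, preds))
    ensemble.print_structure()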