from sklearn.model_selection import KFold, train_test_split
from sklearn.datasets import load_digits
from sklearn import metrics, preprocessing
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.base import clone
import numpy as np

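# Stacking ensemble demo: SubClassifier wraps individual base models and
# StackingClassifier fuses their out-of-fold predictions with a meta model.
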
class SubClassifier(object):
    def __init__(self):
        # import lightgbm as lgb
        # import xgboost as xgb
        # from sklearn.svm import SVC, LinearSVC
        # from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
        # from sklearn.linear_model import LogisticRegression
        # clfs = {
        #     'lr': LogisticRegression(penalty='l1', C=0.1, tol=0.0001),
        #     'svm': LinearSVC(C=0.05, penalty='l2', dual=True),
        #     'svm_linear': SVC(kernel='linear', probability=True),
        #     'svm_ploy': SVC(kernel='poly', probability=True),
        #     'bagging': BaggingClassifier(base_estimator=base_clf, n_estimators=60, max_samples=1.0, max_features=1.0,
        #                                  random_state=1, n_jobs=1, verbose=1),
        #     'rf': RandomForestClassifier(n_estimators=40, criterion='gini', max_depth=9),
        #     'adaboost': AdaBoostClassifier(base_estimator=base_clf, n_estimators=50, algorithm='SAMME'),
        #     'gbdt': GradientBoostingClassifier(),
        #     'xgb': xgb.XGBClassifier(learning_rate=0.1, max_depth=3, n_estimators=50),
        #     'lgb': lgb.LGBMClassifier(boosting_type='gbdt', learning_rate=0.01, max_depth=5, n_estimators=250, num_leaves=90)
        # }
        pass

    def SelectModel(self, modelname):
        if modelname == "SVM":
            from sklearn.svm import SVC
            clf = SVC(kernel='rbf', C=16, gamma=0.125, probability=True)

        elif modelname == "lr":
            from sklearn.linear_model import LogisticRegression
            clf = LogisticRegression()

        elif modelname == "GBDT":
            from sklearn.ensemble import GradientBoostingClassifier
            clf = GradientBoostingClassifier()

        elif modelname == "RF":
            from sklearn.ensemble import RandomForestClassifier
            clf = RandomForestClassifier(n_estimators=100)

        elif modelname == "xgboost":
            from xgboost import XGBClassifier
            clf = XGBClassifier(
                learning_rate=0.01,
                n_estimators=1000,
                max_depth=4,
                min_child_weight=3,
                gamma=0.1,
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=1,
                objective='binary:logistic',  # use 'multi:softmax' for multi-class
                nthread=8,
                scale_pos_weight=1,
                seed=27,
                random_state=27
            )
        elif modelname == "KNN":
            from sklearn.neighbors import KNeighborsClassifier as knn
            clf = knn()

        elif modelname == "MNB":
            from sklearn.naive_bayes import MultinomialNB
            clf = MultinomialNB()
        else:
            # previously fell through with `pass`, which raised a NameError on return
            raise ValueError("unknown model name: {}".format(modelname))
        return clf

    def performance(self, y_true, y_pred, modelname=""):
        accuracy = metrics.accuracy_score(y_true, y_pred) * 100
        confusion = confusion_matrix(y_true, y_pred)
        report = classification_report(y_true, y_pred)
        print("Model {} accuracy: {}".format(modelname, accuracy))
        print("Confusion matrix:\n{}".format(confusion))
        print("Classification report:\n{}".format(report))
        return confusion, report
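

# A minimal standalone sketch of SubClassifier by itself (X_train, y_train,
# X_test, y_test as produced by a split like the one in __main__ below):
#
#   sub = SubClassifier()
#   clf = sub.SelectModel(modelname="RF")
#   clf.fit(X_train, y_train)
#   sub.performance(y_test, clf.predict(X_test), modelname="RF")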


class StackingClassifier(object):
    """Two-layer stacking: out-of-fold predictions of the base classifiers
    become the training features of the meta classifier."""

    def __init__(self, classifiers, meta_classifier,
                 use_clones=True, n_folds=2,
                 n_classes=2, random_state=100,
                 sample_weight=None, use_probas=True):

        self.classifiers = classifiers
        self.meta_classifier = meta_classifier
        self.use_clones = use_clones
        self.n_folds = n_folds
        self.n_classes = n_classes
        self.random_state = random_state
        self.sample_weight = sample_weight
        self.use_probas = use_probas

    def cross_valid_oof(self, clf, X, y, n_folds):
        """Return out-of-fold (OOF) predictions from cross-validation."""
        ntrain = X.shape[0]
        n_classes = self.n_classes
        random_state = self.random_state
        # one column per class for probabilities, a single column for labels
        n_cols = n_classes if self.use_probas else 1
        oof_features = np.zeros((ntrain, n_cols))
        oof_pred = np.zeros(ntrain)
        # shuffle=True is required when passing random_state to KFold
        kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
        for i, (train_index, test_index) in enumerate(kf.split(X)):
            kf_X_train = X[train_index]  # training data of this fold
            kf_y_train = y[train_index]  # training labels of this fold

            kf_X_test = X[test_index]    # validation part of this fold

            clf.fit(kf_X_train, kf_y_train)
            oof_pred[test_index] = clf.predict(kf_X_test)
            if not self.use_probas:
                oof_features[test_index] = oof_pred[test_index].reshape(-1, 1)
            else:
                oof_features[test_index] = clf.predict_proba(kf_X_test)
            print("fold-{i}: oof_features: {a}, cv-oof accuracy: {c}".format(
                i=i,
                a=oof_features.shape,
                c=self.get_accuracy(y[test_index], oof_pred[test_index])))
        return oof_features

    def fit(self, X, y):
        # honor use_clones so the caller's estimators are left untouched
        if self.use_clones:
            self.clfs_ = {name: clone(clf) for name, clf in self.classifiers.items()}
            self.meta_clf_ = clone(self.meta_classifier)
        else:
            self.clfs_ = self.classifiers
            self.meta_clf_ = self.meta_classifier

        n_folds = self.n_folds
        sample_weight = self.sample_weight
        meta_features = None

        # feature layer: stack the OOF predictions of every base model
        for name, sub_clf in self.clfs_.items():
            print("feature layer, current model: {}".format(name))
            meta_prediction = self.cross_valid_oof(sub_clf, X, y, n_folds)
            if meta_features is None:
                meta_features = meta_prediction
            else:
                meta_features = np.column_stack((meta_features, meta_prediction))

        # refit every base model on the full training set
        for name, model in self.clfs_.items():
            print("fit base model using all train set: {}".format(name))
            if sample_weight is None:
                model.fit(X, y)
            else:
                model.fit(X, y, sample_weight=sample_weight)

        # meta layer
        if sample_weight is None:
            self.meta_clf_.fit(meta_features, y)
        else:
            self.meta_clf_.fit(meta_features, y, sample_weight=sample_weight)

        return self

    def predict_meta_features(self, X):
        """Get meta-features of test data.

        Parameters
        ----------
        X : numpy array, shape = [n_samples, n_features]

        Returns
        -------
        meta-features : numpy array, shape = [n_samples, n_classifiers * n_cols]
        """
        per_model_preds = []

        for name, model in self.clfs_.items():
            print("model {} predict_meta_features".format(name))
            if not self.use_probas:
                # reshape to a column so hstack matches the training layout
                pred_score = model.predict(X).reshape(-1, 1)
            else:
                pred_score = model.predict_proba(X)

            per_model_preds.append(pred_score)

        return np.hstack(per_model_preds)

    def predict(self, X):
        """Predict class labels for X."""
        meta_features = self.predict_meta_features(X)
        return self.meta_clf_.predict(meta_features)

    def predict_proba(self, X):
        """Predict class probabilities for X."""
        meta_features = self.predict_meta_features(X)
        return self.meta_clf_.predict_proba(meta_features)

    def get_accuracy(self, y_true, y_pred):
        accuracy = round(metrics.accuracy_score(y_true, y_pred) * 100, 3)
        return accuracy

    def performance(self, y_true, y_pred):
        accuracy = self.get_accuracy(y_true, y_pred)
        confusion = confusion_matrix(y_true, y_pred)
        report = classification_report(y_true, y_pred)
        print("Stacked ensemble accuracy: {}".format(accuracy))
        print("Confusion matrix:\n{}".format(confusion))
        print("Classification report:\n{}".format(report))
        return confusion, report


# Example: using the stacking approach
if __name__ == "__main__":
    # load the dataset and split it into train and test sets
    data = load_digits()
    data_D = preprocessing.StandardScaler().fit_transform(data.data)
    data_L = data.target
    X_train, X_test, y_train, y_test = train_test_split(data_D, data_L, random_state=100, test_size=0.7)
    print(set(y_train))

    # layer 1: ensemble of base models
    classifiers = {
        'KNN': SubClassifier().SelectModel(modelname="KNN"),
        'rf': SubClassifier().SelectModel(modelname="RF"),
        'svm': SubClassifier().SelectModel(modelname="SVM"),
        'GBDT': SubClassifier().SelectModel(modelname="GBDT")
    }

    meta_classifier = SubClassifier().SelectModel(modelname="RF")

    stacking_clf = StackingClassifier(classifiers, meta_classifier, n_classes=10, n_folds=5)

    stacking_clf.fit(X_train, y_train)
    pred = stacking_clf.predict(X_test)

    # evaluate the ensemble
    stacking_clf.performance(y_test, pred)
    # 96.4228934817
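
    # Optional follow-up sketch: inspect the fused model's class probabilities
    # via predict_proba as defined on StackingClassifier above.
    proba = stacking_clf.predict_proba(X_test)
    print("predicted probabilities shape: {}".format(proba.shape))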