Skip to content

Commit 5f1569a

Browse files
committed
add stacking
1 parent 9a5daa0 commit 5f1569a

File tree

2 files changed

+241
-0
lines changed

2 files changed

+241
-0
lines changed

.DS_Store

0 Bytes
Binary file not shown.

stacking/stacking.py

+241
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,241 @@
1+
from sklearn.model_selection import KFold
2+
from sklearn.model_selection import train_test_split
3+
from sklearn.datasets import load_digits
4+
import numpy as np
5+
from sklearn.svm import SVC
6+
from sklearn import metrics
7+
from sklearn.ensemble import RandomForestClassifier
8+
from sklearn import preprocessing
9+
import pandas as pd
10+
from functools import reduce
11+
from sklearn.metrics import confusion_matrix, classification_report
12+
from sklearn.linear_model import LogisticRegression
13+
from sklearn.base import clone
14+
import xgboost as xgb
15+
16+
class SubClassifier(object):
    """Factory for the individual classifiers used in this file, plus a
    small helper that prints evaluation metrics for a single model."""

    def SelectModel(self, modelname):
        """Return a new, unfitted classifier for ``modelname``.

        Parameters
        ----------
        modelname : str
            One of "SVM", "lr", "GBDT", "RF", "xgboost", "KNN" or "MNB"
            (case-sensitive).

        Returns
        -------
        A freshly constructed estimator. The backing library is imported
        lazily, so only the requested model's package has to be importable.

        Raises
        ------
        ValueError
            If ``modelname`` is not one of the supported names.
        """
        if modelname == "SVM":
            from sklearn.svm import SVC
            # probability=True is requested so the model can be asked for
            # class probabilities (the stacking layer calls predict_proba).
            return SVC(kernel='rbf', C=16, gamma=0.125, probability=True)

        if modelname == "lr":
            from sklearn.linear_model import LogisticRegression
            return LogisticRegression()

        if modelname == "GBDT":
            from sklearn.ensemble import GradientBoostingClassifier
            return GradientBoostingClassifier()

        if modelname == "RF":
            from sklearn.ensemble import RandomForestClassifier
            return RandomForestClassifier(n_estimators=100)

        if modelname == "xgboost":
            from xgboost import XGBClassifier
            return XGBClassifier(
                learning_rate=0.01,
                n_estimators=1000,
                max_depth=4,
                min_child_weight=3,
                gamma=0.1,
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=1,
                objective='binary:logistic',  # alternative: multi:softmax
                nthread=8,
                scale_pos_weight=1,
                seed=27,
                random_state=27
            )

        if modelname == "KNN":
            from sklearn.neighbors import KNeighborsClassifier
            return KNeighborsClassifier()

        if modelname == "MNB":
            from sklearn.naive_bayes import MultinomialNB
            return MultinomialNB()

        # Previously an unknown name fell through to ``return clf`` and
        # surfaced as an UnboundLocalError; fail with a clear message instead.
        raise ValueError(
            "Unknown modelname {!r}; expected one of: "
            "SVM, lr, GBDT, RF, xgboost, KNN, MNB".format(modelname))

    def performance(self, y_true, y_pred, modelname=""):
        """Print accuracy, confusion matrix and classification report.

        Parameters
        ----------
        y_true : array-like
            Ground-truth labels.
        y_pred : array-like
            Predicted labels.
        modelname : str, optional
            Name inserted into the printed accuracy line.

        Returns
        -------
        (confusion, report)
            The confusion matrix and the text classification report.
        """
        # Accuracy is reported as a percentage.
        accuracy = metrics.accuracy_score(y_true, y_pred)*100
        confusion = confusion_matrix(y_true, y_pred)
        report = classification_report(y_true, y_pred)
        print("模型{}预测accuracy:{}".format(modelname, accuracy))
        print("混淆矩阵:\n{}".format(confusion))
        print("预测结果:\n{}".format(report))
        return confusion, report
92+
93+
94+
class StackingClassifier(object):
    """Two-layer stacking ensemble.

    The first layer ("feature layer") turns every base classifier into a
    block of meta-features using out-of-fold predictions; the second layer
    ("meta layer") fits ``meta_classifier`` on the concatenated blocks.

    Parameters
    ----------
    classifiers : dict
        Mapping ``name -> estimator`` of base classifiers.
    meta_classifier : estimator
        Classifier trained on the meta-features.
    use_clones : bool, default True
        If True, ``fit`` works on clones of the supplied estimators and
        leaves the originals unfitted; if False the supplied objects are
        fitted in place.
    n_folds : int, default 2
        Number of K-fold splits used to build out-of-fold meta-features.
    n_classes : int, default 2
        Number of probability columns each base classifier produces when
        ``use_probas`` is True.
    random_state : int, default 100
        Seed for shuffling the K-fold splits.
    sample_weight : array-like or None, default None
        Optional per-sample weights forwarded to every ``fit`` call.
    use_probas : bool, default True
        If True the meta-features are class probabilities
        (``n_classes`` columns per base classifier); otherwise they are
        predicted labels (one column per base classifier).
    """

    def __init__(self, classifiers, meta_classifier,
                 use_clones=True, n_folds=2,
                 n_classes=2, random_state=100,
                 sample_weight=None, use_probas=True):

        self.classifiers = classifiers
        self.meta_classifier = meta_classifier
        self.use_clones = use_clones
        self.n_folds = n_folds
        self.n_classes = n_classes
        self.random_state = random_state
        self.sample_weight = sample_weight
        self.use_probas = use_probas

    @staticmethod
    def _fit_estimator(estimator, X, y, sample_weight=None):
        """Fit ``estimator``, passing ``sample_weight`` only when one is given."""
        if sample_weight is None:
            estimator.fit(X, y)
        else:
            estimator.fit(X, y, sample_weight=sample_weight)
        return estimator

    def cross_valid_oof(self, clf, X, y, n_folds):
        """Return the out-of-fold (cross-validated) predictions of ``clf``.

        ``clf`` is refitted in place once per fold. ``X`` and ``y`` must
        support integer-array indexing (e.g. numpy arrays).

        Returns
        -------
        numpy array of shape ``(n_samples, n_classes)`` when ``use_probas``
        is True, otherwise ``(n_samples, 1)`` holding the predicted labels.
        """
        ntrain = X.shape[0]
        # Probabilities need one column per class; labels need a single one.
        n_columns = self.n_classes if self.use_probas else 1
        oof_features = np.zeros((ntrain, n_columns))

        weights = self.sample_weight
        if weights is not None:
            weights = np.asarray(weights)

        # shuffle=True is required for random_state to take effect; recent
        # scikit-learn versions reject a seed on an unshuffled KFold.
        kf = KFold(n_splits=n_folds, shuffle=True,
                   random_state=self.random_state)
        for i, (train_index, test_index) in enumerate(kf.split(X)):
            kf_X_test = X[test_index]  # validation part of this fold
            fold_weights = None if weights is None else weights[train_index]
            self._fit_estimator(clf, X[train_index], y[train_index],
                                fold_weights)

            fold_pred = clf.predict(kf_X_test)
            if self.use_probas:
                # NOTE: assumes the fold's training split contains every
                # class, so predict_proba yields exactly n_classes columns.
                oof_features[test_index] = clf.predict_proba(kf_X_test)
            else:
                oof_features[test_index, 0] = fold_pred
            print("fold-{i}: oof_features: {a}, cv-oof accuracy:{c}".format(i=i,
                  a=oof_features.shape,
                  c=self.get_accuracy(y[test_index], fold_pred)))
        return oof_features

    def fit(self, X, y):
        """Fit the base classifiers and the meta classifier.

        Steps: (1) build out-of-fold meta-features for every base
        classifier, (2) refit every base classifier on the full training
        set, (3) fit the meta classifier on the meta-features.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``classifiers`` is empty.
        """
        if not self.classifiers:
            raise ValueError("at least one base classifier is required")

        if self.use_clones:
            self.clfs_ = {name: clone(estimator)
                          for name, estimator in self.classifiers.items()}
            self.meta_clf_ = clone(self.meta_classifier)
        else:
            self.clfs_ = self.classifiers
            self.meta_clf_ = self.meta_classifier

        sample_weight = self.sample_weight

        # feature layer
        oof_blocks = []
        for name, sub_clf in self.clfs_.items():
            print("feature layer, current model: {}".format(name))
            oof_blocks.append(
                self.cross_valid_oof(sub_clf, X, y, self.n_folds))
        # Column order follows the iteration order of self.clfs_, which is
        # the same order predict_meta_features uses.
        meta_features = np.hstack(oof_blocks)

        for name, model in self.clfs_.items():
            print("fit base model using all train set: {}".format(name))
            self._fit_estimator(model, X, y, sample_weight)

        # meta layer
        self._fit_estimator(self.meta_clf_, meta_features, y, sample_weight)

        return self

    def predict_meta_features(self, X):
        """Get meta-features of test data.

        Parameters
        ----------
        X : numpy array, shape = [n_samples, n_features]

        Returns
        -------
        meta-features : numpy array with one row per sample. With
        ``use_probas`` each base classifier contributes its
        ``predict_proba`` columns; otherwise each contributes one column of
        predicted labels, i.e. shape = [n_samples, n_classifiers].
        """
        per_model_preds = []

        for name, model in self.clfs_.items():
            print("model {} predict_meta_features".format(name))
            if self.use_probas:
                pred_score = model.predict_proba(X)
            else:
                # Make labels a column so hstack places classifiers side by
                # side instead of gluing the 1-D vectors end to end.
                pred_score = np.asarray(model.predict(X)).reshape(-1, 1)

            per_model_preds.append(pred_score)

        return np.hstack(per_model_preds)

    def predict(self, X):
        """Predict class labels for X."""
        meta_features = self.predict_meta_features(X)
        return self.meta_clf_.predict(meta_features)

    def predict_prob(self, X):
        """Predict class probabilities for X."""
        meta_features = self.predict_meta_features(X)
        return self.meta_clf_.predict_proba(meta_features)

    def get_accuracy(self, y_true, y_pred):
        """Return the accuracy as a percentage rounded to 3 decimals."""
        accuracy = round(metrics.accuracy_score(y_true, y_pred)*100, 3)
        return accuracy

    def performance(self, y_true, y_pred):
        """Print accuracy, confusion matrix and classification report of the
        ensemble and return ``(confusion, report)``."""
        accuracy = self.get_accuracy(y_true, y_pred)
        confusion = confusion_matrix(y_true, y_pred)
        report = classification_report(y_true, y_pred)
        print("多模型融合预测accuracy:{}".format(accuracy))
        print("混淆矩阵:\n{}".format(confusion))
        print("预测结果:\n{}".format(report))
        return confusion, report
214+
215+
# Example: using the stacking method
216+
if __name__ == "__main__":
    # Load the digits data set, standardise the features and split it;
    # test_size=0.7 keeps 70% of the samples for testing.
    digits = load_digits()
    scaled_features = preprocessing.StandardScaler().fit_transform(digits.data)
    labels = digits.target
    X_train, X_test, y_train, y_test = train_test_split(
        scaled_features, labels, test_size=0.7, random_state=100)
    print(set(y_train))

    # Layer 1: the base models to be fused, as (dict key, factory name).
    base_model_keys = (
        ('KNN', "KNN"),
        ('rf', "RF"),
        ('svm', "SVM"),
        ('GBDT', "GBDT"),
    )
    classifiers = {
        label: SubClassifier().SelectModel(modelname=key)
        for label, key in base_model_keys
    }

    # Layer 2: the meta model trained on the base models' outputs.
    meta_classifier = SubClassifier().SelectModel(modelname="RF")

    stacking_clf = StackingClassifier(
        classifiers, meta_classifier, n_classes=10, n_folds=5)

    stacking_clf.fit(X_train, y_train)
    pred = stacking_clf.predict(X_test)

    # Model evaluation
    stacking_clf.performance(y_test, pred)
    # Value recorded by the original author (presumably the printed
    # accuracy of one run): 96.4228934817

0 commit comments

Comments
 (0)