
Commit 9a5daa0

add stacking model draft
1 parent a93595a commit 9a5daa0

File tree

1 file changed: +181 -0 lines changed


draft/stacking.py

@@ -0,0 +1,181 @@
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
import numpy as np
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import pandas as pd
from functools import reduce
from sklearn.metrics import confusion_matrix, classification_report


class StackingClassifier(object):

    def __init__(self, modellist=None, meta_classifier=None):
        # Names of the base (level-1) models; see SelectModel for the mapping.
        self.modellist = modellist if modellist is not None else []
        # Default meta-classifier is a plain logistic regression.
        if meta_classifier is None:
            from sklearn.linear_model import LogisticRegression
            meta_classifier = LogisticRegression()
        self.meta_classifier = meta_classifier

    def SelectModel(self, modelname):
        """Return an unfitted base model selected by name."""
        if modelname == "SVM":
            from sklearn.svm import SVC
            model = SVC(kernel='rbf', C=16, gamma=0.125, probability=True)

        elif modelname == "lr":
            from sklearn.linear_model import LogisticRegression
            model = LogisticRegression()

        elif modelname == "GBDT":
            from sklearn.ensemble import GradientBoostingClassifier
            model = GradientBoostingClassifier()

        elif modelname == "RF":
            from sklearn.ensemble import RandomForestClassifier
            model = RandomForestClassifier()

        elif modelname == "xgboost":
            from xgboost import XGBClassifier
            model = XGBClassifier(
                learning_rate=0.01,
                n_estimators=1000,
                max_depth=4,
                min_child_weight=3,
                gamma=0.1,
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=1,
                objective='binary:logistic',  # use 'multi:softmax' for multi-class
                nthread=8,
                scale_pos_weight=1,
                seed=27,
                random_state=27
            )

        elif modelname == "KNN":
            from sklearn.neighbors import KNeighborsClassifier as knn
            model = knn()

        elif modelname == "MNB":
            from sklearn.naive_bayes import MultinomialNB
            model = MultinomialNB()

        else:
            raise ValueError("Unknown model name: {}".format(modelname))
        return model

    def get_oof(self, clf, n_folds, X_train, y_train, X_test):
        """Generate out-of-fold (OOF) class-probability features for one base model."""
        ntrain = X_train.shape[0]
        ntest = X_test.shape[0]
        print("kfolds: ", ntrain, ntest)
        classnum = len(np.unique(y_train))
        # random_state only takes effect together with shuffle=True (recent
        # scikit-learn versions raise an error otherwise).
        kf = KFold(n_splits=n_folds, shuffle=True, random_state=1)
        oof_train = np.zeros((ntrain, classnum))
        oof_test = np.zeros((ntest, classnum))

        for i, (train_index, test_index) in enumerate(kf.split(X_train)):
            kf_X_train = X_train[train_index]  # training data for this fold
            kf_y_train = y_train[train_index]  # training labels for this fold

            kf_X_test = X_train[test_index]    # held-out validation fold

            clf.fit(kf_X_train, kf_y_train)
            oof_train[test_index] = clf.predict_proba(kf_X_test)
            # print("shape of oof_train:", oof_train[test_index].shape)

            print("fold{i}: oof_train: {a}, oof_test: {b}".format(i=i, a=oof_train.shape, b=oof_test.shape))
            # Accumulate this fold's predictions on the full test set ...
            oof_test += clf.predict_proba(X_test)
        # ... and average them over the folds.
        oof_test = oof_test / float(n_folds)
        print("oof_train: {a}, oof_test: {b}".format(a=oof_train.shape, b=oof_test.shape))
        return oof_train, oof_test
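
    # Note (illustrative only, sizes assumed from the split in __main__ below):
    # with load_digits and test_size=0.7 there are roughly 539 training and
    # 1258 test samples over 10 classes, so each base model contributes an
    # oof_train block of shape (539, 10) and an oof_test block of shape
    # (1258, 10); with 4 base models the stacked matrices built in
    # first_layer end up with 40 columns.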

    def first_layer(self, X_train, y_train, X_test, modellist=None):
        """Build the level-2 feature matrices from each base model's OOF predictions.

        TODO: the modellist argument is currently ignored in favour of
        self.modellist and still needs to be reworked.
        """
        newfeature_list = []
        newtestdata_list = []
        for modelname in self.modellist:
            sub_clf = self.SelectModel(modelname)
            oof_train_, oof_test_ = self.get_oof(clf=sub_clf,
                                                 n_folds=5,
                                                 X_train=X_train,
                                                 y_train=y_train,
                                                 X_test=X_test)
            print("oof_train: ", oof_train_.shape)
            print("model-{}".format(modelname), len(oof_train_), len(oof_test_))
            newfeature_list.append(oof_train_)
            print("newfeature_list: ", len(newfeature_list))
            newtestdata_list.append(oof_test_)

        # Concatenate the per-model OOF features column-wise.
        X_train_stacking = reduce(lambda x, y: np.concatenate((x, y), axis=1), newfeature_list)
        X_test_stacking = reduce(lambda x, y: np.concatenate((x, y), axis=1), newtestdata_list)

        return X_train_stacking, X_test_stacking

    def fit(self, X_train, y_train, clf=None):
        """Fit the meta-classifier on the stacked (level-2) features."""
        if clf is not None:
            self.meta_classifier = clf
        self.meta_classifier.fit(X_train, y_train)
        return self.meta_classifier

    # The second layer simply fits the meta-classifier on the stacked features.
    def second_layer(self, X_train, y_train, clf=None):
        return self.fit(X_train, y_train, clf)

    def predict(self, X_test, clf=None, type="label"):
        if clf is None:
            clf = self.meta_classifier
        if type == "proba":
            return clf.predict_proba(X_test)
        elif type == "label":
            return clf.predict(X_test)

    def get_accuracy(self, y_true, y_pred):
        accuracy = metrics.accuracy_score(y_true, y_pred) * 100
        return accuracy

    def performance(self, y_true, y_pred):
        accuracy = self.get_accuracy(y_true, y_pred)
        confusion = confusion_matrix(y_true, y_pred)
        report = classification_report(y_true, y_pred)
        print("Stacked-model accuracy: {}".format(accuracy))
        print("Confusion matrix:\n{}".format(confusion))
        print("Classification report:\n{}".format(report))
        return confusion, report


# When using stacking, the first level rebuilds the features that serve as the
# training set for the second level.
if __name__ == "__main__":
    # Load the dataset and split it into training and test data.
    data = load_digits()
    data_D = preprocessing.StandardScaler().fit_transform(data.data)
    data_L = data.target
    X_train, X_test, y_train, y_test = train_test_split(data_D, data_L, random_state=100, test_size=0.7)
    print(set(y_train))

    # Baseline: a single classifier on its own.
    clf_meta = RandomForestClassifier()
    clf_meta.fit(X_train, y_train)
    pred = clf_meta.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, pred) * 100
    print("====================", accuracy)
    # 91.0969793323

    # Layer 1: multi-model fusion (out-of-fold features).
    modelist = ['SVM', 'GBDT', 'RF', 'KNN']
    stacking_clf = StackingClassifier(modelist)
    X_train_stacking, X_test_stacking = stacking_clf.first_layer(X_train, y_train, X_test)
    print("shape of X_train_stacking {}".format(X_train_stacking.shape))
    print("shape of X_test_stacking {}".format(X_test_stacking.shape))

    # Layer 2: train a single meta-model on the stacked features.
    RF = stacking_clf.SelectModel(modelname="RF")
    clf = stacking_clf.second_layer(X_train_stacking, y_train, clf=RF)
    pred = stacking_clf.predict(X_test_stacking)

    # Model evaluation.
    stacking_clf.performance(y_test, pred)
    # 96.4228934817
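
For comparison, the same digits pipeline can also be expressed with scikit-learn's built-in sklearn.ensemble.StackingClassifier. The following is only a minimal sketch, assuming scikit-learn >= 0.22; it is not part of the file above, and it mirrors the draft's base models, 5-fold out-of-fold probability features, random-forest meta-model, and random_state=100 split.

# Cross-check sketch (assumes scikit-learn >= 0.22; aliased to avoid clashing
# with the draft's own StackingClassifier class).
from sklearn import metrics, preprocessing
from sklearn.datasets import load_digits
from sklearn.ensemble import (GradientBoostingClassifier, RandomForestClassifier,
                              StackingClassifier as SKStackingClassifier)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

data = load_digits()
X = preprocessing.StandardScaler().fit_transform(data.data)
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100, test_size=0.7)

# Same four base models as the draft, stacked on out-of-fold class probabilities.
estimators = [
    ("svm", SVC(kernel="rbf", C=16, gamma=0.125, probability=True)),
    ("gbdt", GradientBoostingClassifier()),
    ("rf", RandomForestClassifier()),
    ("knn", KNeighborsClassifier()),
]
stack = SKStackingClassifier(
    estimators=estimators,
    final_estimator=RandomForestClassifier(),  # level-2 model, as in the draft
    cv=5,                                      # 5-fold OOF features, as in get_oof
    stack_method="predict_proba",              # stack class probabilities
    n_jobs=-1,
)
stack.fit(X_train, y_train)
pred = stack.predict(X_test)
print("accuracy:", metrics.accuracy_score(y_test, pred) * 100)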
