|
| 1 | +from sklearn.model_selection import KFold |
| 2 | +from sklearn.model_selection import train_test_split |
| 3 | +from sklearn.datasets import load_digits |
| 4 | +import numpy as np |
| 5 | +from sklearn.svm import SVC |
| 6 | +from sklearn import metrics |
| 7 | +from sklearn.ensemble import RandomForestClassifier |
| 8 | +from sklearn import preprocessing |
| 9 | +import pandas as pd |
| 10 | +from functools import reduce |
| 11 | +from sklearn.metrics import confusion_matrix, classification_report |
| 12 | + |
class StackingClassifier(object):
    """Two-layer stacking ensemble for scikit-learn style classifiers.

    The first layer converts each base model's out-of-fold class
    probabilities into new features; the second layer fits a meta
    classifier on those features.
    """

    def __init__(self, modellist=None, meta_classifier=None):
        """Store the base-model names and the meta classifier.

        Args:
            modellist: Names understood by ``SelectModel``. ``None`` means
                an empty list (a fresh one per instance, never shared).
            meta_classifier: Second-layer estimator. ``None`` selects a
                default ``LogisticRegression``.
        """
        self.modellist = [] if modellist is None else modellist
        if meta_classifier is None:
            from sklearn.linear_model import LogisticRegression
            meta_classifier = LogisticRegression()
        self.meta_classifier = meta_classifier

    def SelectModel(self, modelname):
        """Return a new, unfitted classifier for ``modelname``.

        Supported names: "SVM", "lr", "GBDT", "RF", "xgboost", "KNN", "MNB".
        Each estimator is imported lazily, so an optional package such as
        xgboost is only required when that model is requested.

        Raises:
            ValueError: If ``modelname`` is not one of the supported names.
        """
        if modelname == "SVM":
            from sklearn.svm import SVC
            # probability=True is required so the model exposes predict_proba.
            model = SVC(kernel='rbf', C=16, gamma=0.125, probability=True)

        elif modelname == "lr":
            from sklearn.linear_model import LogisticRegression
            model = LogisticRegression()

        elif modelname == "GBDT":
            from sklearn.ensemble import GradientBoostingClassifier
            model = GradientBoostingClassifier()

        elif modelname == "RF":
            from sklearn.ensemble import RandomForestClassifier
            model = RandomForestClassifier()

        elif modelname == "xgboost":
            from xgboost import XGBClassifier
            model = XGBClassifier(
                learning_rate=0.01,
                n_estimators=1000,
                max_depth=4,
                min_child_weight=3,
                gamma=0.1,
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=1,
                objective='binary:logistic',  # multi:softmax
                nthread=8,
                scale_pos_weight=1,
                seed=27,
                random_state=27
            )

        elif modelname == "KNN":
            from sklearn.neighbors import KNeighborsClassifier as knn
            model = knn()

        elif modelname == "MNB":
            from sklearn.naive_bayes import MultinomialNB
            model = MultinomialNB()

        else:
            # Previously this fell through to ``return model`` with ``model``
            # unbound, which surfaced as a confusing UnboundLocalError.
            raise ValueError("unsupported model name: {!r}".format(modelname))
        return model

    def _aligned_proba(self, clf, X, classes):
        """Return ``clf.predict_proba(X)`` with one column per entry of ``classes``.

        When a fold's training part lacks some class, the fitted model emits
        fewer probability columns. In that case the columns are placed at the
        positions given by ``clf.classes_`` and missing classes get 0.
        ``classes`` must be sorted (as produced by ``np.unique``).
        """
        proba = np.asarray(clf.predict_proba(X))
        fold_classes = getattr(clf, "classes_", None)
        if (fold_classes is None
                or proba.shape[1] == len(classes)
                or len(fold_classes) != proba.shape[1]):
            return proba
        aligned = np.zeros((proba.shape[0], len(classes)))
        aligned[:, np.searchsorted(classes, fold_classes)] = proba
        return aligned

    def get_oof(self, clf, n_folds, X_train, y_train, X_test):
        """Compute out-of-fold probability features for one base model.

        ``clf`` is refit on every training split. Its probabilities for the
        held-out split fill the matching rows of ``oof_train``; its
        probabilities for ``X_test`` are averaged over all folds.

        Args:
            clf: Classifier with ``fit`` and ``predict_proba``.
            n_folds: Number of K-fold splits.
            X_train: Training features; must support ``.shape`` and
                integer-array row indexing.
            y_train: Training labels, indexable by an integer array.
            X_test: Test features; must support ``.shape``.

        Returns:
            Tuple ``(oof_train, oof_test)`` with shapes
            ``(n_train, n_classes)`` and ``(n_test, n_classes)``.
        """
        ntrain = X_train.shape[0]
        ntest = X_test.shape[0]
        print("kfolds: ", ntrain, ntest)
        classes = np.unique(y_train)
        classnum = len(classes)
        # No random_state: without shuffle=True it has no effect, and recent
        # scikit-learn raises ValueError when it is passed.
        kf = KFold(n_splits=n_folds)
        oof_train = np.zeros((ntrain, classnum))
        oof_test = np.zeros((ntest, classnum))

        for i, (train_index, test_index) in enumerate(kf.split(X_train)):
            kf_X_train = X_train[train_index]  # fold training data
            kf_y_train = y_train[train_index]  # fold training labels

            kf_X_test = X_train[test_index]  # held-out part of this fold

            clf.fit(kf_X_train, kf_y_train)
            oof_train[test_index] = self._aligned_proba(clf, kf_X_test, classes)

            print("fold{i}: oof_train: {a}, oof_test:{b}".format(i=i, a=oof_train.shape, b=oof_test.shape))
            oof_test += self._aligned_proba(clf, X_test, classes)
        oof_test = oof_test / float(n_folds)
        print("oof_train: {a}, oof_test:{b}".format(a=oof_train.shape, b=oof_test.shape))
        return oof_train, oof_test

    def first_layer(self, X_train, y_train, X_test, modellist=None):
        """Build the stacked feature matrices from all base models.

        Args:
            X_train, y_train, X_test: Passed to ``get_oof`` for every model.
            modellist: Optional model names overriding ``self.modellist``
                for this call only.

        Returns:
            Tuple ``(X_train_stacking, X_test_stacking)``: the per-model
            probability blocks concatenated column-wise.

        Raises:
            ValueError: If there is no base model to run.
        """
        names = self.modellist if modellist is None else modellist
        newfeature_list = []
        newtestdata_list = []
        for modelname in names:
            sub_clf = self.SelectModel(modelname)
            oof_train_, oof_test_ = self.get_oof(clf=sub_clf,
                                                 n_folds=5,
                                                 X_train=X_train,
                                                 y_train=y_train,
                                                 X_test=X_test)
            print("oof_train: ", oof_train_.shape)
            print("model-{}".format(modelname), len(oof_train_), len(oof_test_))
            newfeature_list.append(oof_train_)
            print("newfeature_list: ", len(newfeature_list))
            newtestdata_list.append(oof_test_)

        if not newfeature_list:
            raise ValueError("first_layer needs at least one base model name")

        # Combine the per-model features side by side.
        X_train_stacking = np.concatenate(newfeature_list, axis=1)
        X_test_stacking = np.concatenate(newtestdata_list, axis=1)

        return X_train_stacking, X_test_stacking

    def fit(self, X_train, y_train, clf=None):
        """Fit the meta classifier and return it.

        If ``clf`` is given it replaces ``self.meta_classifier`` first.
        """
        if clf is not None:
            self.meta_classifier = clf
        self.meta_classifier.fit(X_train, y_train)
        return self.meta_classifier

    # second_layer
    def second_layer(self, X_train, y_train, clf=None):
        """Alias of ``fit`` named after the stacking stage it performs."""
        return self.fit(X_train, y_train, clf)

    def predict(self, X_test, clf=None, type="label"):
        """Predict with ``clf`` or, when it is None, the meta classifier.

        Args:
            X_test: Features to predict on.
            clf: Optional fitted classifier to use instead of the meta one.
            type: "label" for ``predict`` output, "proba" for
                ``predict_proba`` output. (The name shadows the builtin but
                is kept for caller compatibility.)

        Raises:
            ValueError: If ``type`` is neither "label" nor "proba".
        """
        if clf is None:
            clf = self.meta_classifier
        if type == "proba":
            return clf.predict_proba(X_test)
        if type == "label":
            return clf.predict(X_test)
        # Previously an unknown value silently returned None.
        raise ValueError(
            "type must be 'label' or 'proba', got {!r}".format(type))

    def get_accuracy(self, y_true, y_pred):
        """Return the accuracy score as a percentage (0-100)."""
        accuracy = metrics.accuracy_score(y_true, y_pred) * 100
        return accuracy

    def performance(self, y_true, y_pred):
        """Print accuracy, confusion matrix and classification report.

        Returns:
            Tuple ``(confusion, report)`` as produced by scikit-learn's
            ``confusion_matrix`` and ``classification_report``.
        """
        accuracy = self.get_accuracy(y_true, y_pred)
        confusion = confusion_matrix(y_true, y_pred)
        report = classification_report(y_true, y_pred)
        print("多模型融合预测accuracy:{}".format(accuracy))
        print("混淆矩阵:\n{}".format(confusion))
        print("预测结果:\n{}".format(report))
        return confusion, report
| 147 | + |
| 148 | + |
# Stacking demo: the first layer rebuilds the features, and those rebuilt
# features become the training set of the second layer.
def main():
    """Compare a single random forest with a stacked ensemble on the digits data."""
    # Load the dataset and split it into training and test parts.
    data = load_digits()
    X_train, X_test, y_train, y_test = train_test_split(
        data.data, data.target, random_state=100, test_size=0.7)
    # Fit the scaler on the training part only; fitting it on the full
    # dataset would leak test-set statistics into training.
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    print(set(y_train))

    # Baseline: a single classifier on its own.
    clf_meta = RandomForestClassifier()
    clf_meta.fit(X_train, y_train)
    pred = clf_meta.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, pred) * 100
    print ("====================", accuracy)
    # 91.0969793323 (figure noted in the original script; presumably from a
    # run with the scaler fit on the full dataset)

    # Layer 1: multi-model fusion.
    model_names = ['SVM', 'GBDT', 'RF', 'KNN']
    stacking_clf = StackingClassifier(model_names)
    X_train_stacking, X_test_stacking = stacking_clf.first_layer(X_train, y_train, X_test)
    print("shape of X_train_stacking {}".format(X_train_stacking.shape))
    print("shape of X_test_stacking {}".format(X_test_stacking.shape))

    # Layer 2: train a single model on the stacked features.
    RF = stacking_clf.SelectModel(modelname="RF")
    stacking_clf.second_layer(X_train_stacking, y_train, clf=RF)
    pred = stacking_clf.predict(X_test_stacking)

    # Model evaluation.
    stacking_clf.performance(y_test, pred)
    # 96.4228934817 (figure noted in the original script; presumably from a
    # run with the scaler fit on the full dataset)


if __name__ == "__main__":
    main()
0 commit comments