
Commit dde2755

feature: add XGBoost and LightGBM classifiers the same way; we see a slight improvement from using the autoencoder
1 parent b8ccbdb commit dde2755

1 file changed: +237 −8 lines
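
Every cell added in this commit follows the same two-step pattern: score a booster with repeated stratified k-fold cross-validation on the raw training features, then refit it on the autoencoder's compressed features and compare test accuracy. Below is a minimal sketch of that pattern in Python, using the notebook's variable names; the `encoder` model and the `X_train`/`X_test`/`y_train`/`y_test` split are assumed to come from earlier cells that this diff does not touch.

from numpy import mean, std
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

# Any of the three boosters added in this commit can stand in here.
cls = HistGradientBoostingClassifier(random_state=1)

# Step 1: baseline accuracy on the raw features via repeated stratified 10-fold CV.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(cls, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

# Step 2: refit on the autoencoder's bottleneck features and score the test set.
# `encoder` is assumed to be the trained bottleneck sub-model from earlier cells.
X_train_encode = encoder.predict(X_train)
X_test_encode = encoder.predict(X_test)
cls.fit(X_train_encode, y_train)
print('Encoded accuracy: %.4f' % accuracy_score(y_test, cls.predict(X_test_encode)))

The diff instantiates this skeleton three times, for HistGradientBoostingClassifier, XGBClassifier, and LGBMClassifier.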

Diff for: autoencoder-classification.ipynb

@@ -1126,8 +1126,6 @@
 "#rounded_yhat = np.rint(np.array(yhat))\n",
 "rounded_yhat = np.around(yhat)\n",
 "rounded_yhat = rounded_yhat.astype(int)\n",
-"#rounded_yhat=np.argmax(yhat, axis=0)\n",
-"#print(rounded_yhat)\n",
 "# calculate accuracy\n",
 "acc = accuracy_score(y_test, rounded_yhat)\n",
 "print(\"The accuracy score (acc) on test set: {:.4f}\".format(acc))\n",
@@ -1180,7 +1178,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 20,
+"execution_count": 31,
 "source": [
 "# gradient boosting for classification in scikit-learn\n",
 "from numpy import mean\n",
@@ -1195,14 +1193,11 @@
 "n_scores = cross_val_score(gbm_cls, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')\n",
 "print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))\n",
 "# fit the model on the whole dataset\n",
-"\n",
 "gbm_cls.fit(X_train, y_train)\n",
 "# make a single prediction\n",
-"yhat = model.predict(X_test)\n",
+"yhat = gbm_cls.predict(X_test)\n",
 "rounded_yhat = np.around(yhat)\n",
 "rounded_yhat = rounded_yhat.astype(int)\n",
-"#rounded_yhat=np.argmax(yhat, axis=0)\n",
-"#print(rounded_yhat)\n",
 "# calculate accuracy\n",
 "acc = accuracy_score(y_test, rounded_yhat)\n",
 "print(\"The accuracy score (acc) on test set: {:.4f}\".format(acc))"
@@ -1213,7 +1208,7 @@
 "name": "stdout",
 "text": [
 "Accuracy: 0.934 (0.023)\n",
-"The accuracy score (acc) on test set: 0.8939\n"
+"The accuracy score (acc) on test set: 0.9333\n"
 ]
 }
 ],
@@ -1281,6 +1276,240 @@
 "The autoencoder combined with the GBM regressor and classifier seems to reach the same results with or without GBM hyperparameter tuning. The best hyperparameter settings for the GBM regressor alone can do better than the combined autoencoder and GBM prediction model. "
 ],
 "metadata": {}
+},
+{
+"cell_type": "markdown",
+"source": [
+"Histogram-Based Gradient Boosting Machine for Classification\n",
+"\n",
+"The example below first evaluates a HistGradientBoostingClassifier on the training data using repeated stratified k-fold cross-validation and reports the mean accuracy. Then a single model is fit on the full training set and its accuracy is reported on the test set."
+],
+"metadata": {}
+},
+{
+"cell_type": "code",
+"execution_count": 30,
+"source": [
+"from sklearn.ensemble import HistGradientBoostingClassifier\n",
+"# evaluate the model\n",
+"hgb_cls = HistGradientBoostingClassifier(random_state=1)\n",
+"cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)\n",
+"n_scores = cross_val_score(hgb_cls, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')\n",
+"print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))\n",
+"# fit the model on the full training set\n",
+"hgb_cls.fit(X_train, y_train)\n",
+"# predict the test set\n",
+"yhat = hgb_cls.predict(X_test)\n",
+"rounded_yhat = np.around(yhat)\n",
+"rounded_yhat = rounded_yhat.astype(int)\n",
+"acc = accuracy_score(y_test, rounded_yhat)\n",
+"print(\"The accuracy score (acc) on test set: {:.4f}\".format(acc))"
+],
+"outputs": [
+{
+"output_type": "stream",
+"name": "stdout",
+"text": [
+"Accuracy: 0.941 (0.026)\n",
+"The accuracy score (acc) on test set: 0.9394\n"
+]
+}
+],
+"metadata": {}
+},
+{
+"cell_type": "code",
+"execution_count": 33,
+"source": [
+"# We can then use the encoded data to train and evaluate the HGB classifier model, as before.\n",
+"# fit model on the encoded training set\n",
+"hgb_cls.fit(X_train_encode, y_train)\n",
+"# make predictions on the encoded test set\n",
+"yhat = hgb_cls.predict(X_test_encode)\n",
+"#rounded_yhat = np.rint(np.array(yhat))\n",
+"rounded_yhat = np.around(yhat)\n",
+"rounded_yhat = rounded_yhat.astype(int)\n",
+"# calculate accuracy\n",
+"acc = accuracy_score(y_test, rounded_yhat)\n",
+"print(\"The accuracy score (acc) on test set: {:.4f}\".format(acc))"
+],
+"outputs": [
+{
+"output_type": "stream",
+"name": "stdout",
+"text": [
+"The accuracy score (acc) on test set: 0.9182\n"
+]
+}
+],
+"metadata": {}
+},
+{
+"cell_type": "markdown",
+"source": [
+"XGBoost for Classification\n",
+"\n",
+"The example below first evaluates an XGBClassifier on the training data using repeated stratified k-fold cross-validation and reports the mean accuracy. Then a single model is fit on the full training set and its accuracy is reported on the test set."
+],
+"metadata": {}
+},
+{
+"cell_type": "code",
+"execution_count": 37,
+"source": [
+"# xgboost for classification\n",
+"from xgboost import XGBClassifier\n",
+"# evaluate the model\n",
+"xgb_cls = XGBClassifier(random_state=1, use_label_encoder=False)\n",
+"cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)\n",
+"n_scores = cross_val_score(xgb_cls, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')\n",
+"print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))\n",
+"# fit the model on the full training set\n",
+"xgb_cls.fit(X_train, y_train)\n",
+"# predict the test set\n",
+"yhat = xgb_cls.predict(X_test)\n",
+"rounded_yhat = np.around(yhat)\n",
+"rounded_yhat = rounded_yhat.astype(int)\n",
+"acc = accuracy_score(y_test, rounded_yhat)\n",
+"print(\"The accuracy score (acc) on test set: {:.4f}\".format(acc))"
+],
+"outputs": [
+{
+"output_type": "stream",
+"name": "stdout",
+"text": [
+"[20:37:59] WARNING: /Users/ktietz/demo/mc3/conda-bld/xgboost-split_1628682908089/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n",
[... the same eval_metric warning repeats verbatim for each remaining cross-validation fit; elided ...]
+"Accuracy: 0.930 (0.027)\n",
+"[20:38:06] WARNING: /Users/ktietz/demo/mc3/conda-bld/xgboost-split_1628682908089/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n",
+"The accuracy score (acc) on test set: 0.9000\n"
+]
+}
+],
+"metadata": {}
+},
+{
+"cell_type": "code",
+"execution_count": 38,
+"source": [
+"# We can then use the encoded data to train and evaluate the XGB classifier model, as before.\n",
+"# fit model on the encoded training set\n",
+"xgb_cls.fit(X_train_encode, y_train)\n",
+"# make predictions on the encoded test set\n",
+"yhat = xgb_cls.predict(X_test_encode)\n",
+"#rounded_yhat = np.rint(np.array(yhat))\n",
+"rounded_yhat = np.around(yhat)\n",
+"rounded_yhat = rounded_yhat.astype(int)\n",
+"# calculate accuracy\n",
+"acc = accuracy_score(y_test, rounded_yhat)\n",
+"print(\"The accuracy score (acc) on test set: {:.4f}\".format(acc))"
+],
+"outputs": [
+{
+"output_type": "stream",
+"name": "stdout",
+"text": [
+"[20:38:29] WARNING: /Users/ktietz/demo/mc3/conda-bld/xgboost-split_1628682908089/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n",
+"The accuracy score (acc) on test set: 0.9182\n"
+]
+}
+],
+"metadata": {}
+},
+{
+"cell_type": "markdown",
+"source": [
+"LightGBM for Classification\n",
+"\n",
+"The example below first evaluates an LGBMClassifier on the training data using repeated stratified k-fold cross-validation and reports the mean accuracy. Then a single model is fit on the full training set and its accuracy is reported on the test set."
+],
+"metadata": {}
+},
+{
+"cell_type": "code",
+"execution_count": 39,
+"source": [
+"# lightgbm for classification\n",
+"from lightgbm import LGBMClassifier\n",
+"# evaluate the model\n",
+"lgbm_cls = LGBMClassifier(random_state=1)\n",
+"cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)\n",
+"n_scores = cross_val_score(lgbm_cls, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')\n",
+"print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))\n",
+"# fit the model on the full training set\n",
+"lgbm_cls.fit(X_train, y_train)\n",
+"# predict the test set\n",
+"yhat = lgbm_cls.predict(X_test)\n",
+"rounded_yhat = np.around(yhat)\n",
+"rounded_yhat = rounded_yhat.astype(int)\n",
+"acc = accuracy_score(y_test, rounded_yhat)\n",
+"print(\"The accuracy score (acc) on test set: {:.4f}\".format(acc))"
+],
+"outputs": [
+{
+"output_type": "stream",
+"name": "stdout",
+"text": [
+"Accuracy: 0.943 (0.021)\n",
+"The accuracy score (acc) on test set: 0.9273\n"
+]
+}
+],
+"metadata": {}
+},
+{
+"cell_type": "code",
+"execution_count": 40,
+"source": [
+"# We can then use the encoded data to train and evaluate the LGBM classifier model, as before.\n",
+"# fit model on the encoded training set\n",
+"lgbm_cls.fit(X_train_encode, y_train)\n",
+"# make predictions on the encoded test set\n",
+"yhat = lgbm_cls.predict(X_test_encode)\n",
+"#rounded_yhat = np.rint(np.array(yhat))\n",
+"rounded_yhat = np.around(yhat)\n",
+"rounded_yhat = rounded_yhat.astype(int)\n",
+"# calculate accuracy\n",
+"acc = accuracy_score(y_test, rounded_yhat)\n",
+"print(\"The accuracy score (acc) on test set: {:.4f}\".format(acc))"
+],
+"outputs": [
+{
+"output_type": "stream",
+"name": "stdout",
+"text": [
+"The accuracy score (acc) on test set: 0.9303\n"
+]
+}
+],
+"metadata": {}
 }
 ],
 "metadata": {
