
Commit dde2755

feature: add XGBoost and LightGBM classifiers the same way; we see a slight improvement from using the autoencoder
1 parent b8ccbdb commit dde2755

1 file changed: +237 −8 lines
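
Every cell added in this commit follows the same two-step pattern: score a booster with repeated stratified k-fold cross-validation on the raw training features, then refit it on the autoencoder's compressed features and compare test accuracy. Below is a minimal sketch of that pattern in Python, using the notebook's variable names; the `encoder` model and the `X_train`/`X_test`/`y_train`/`y_test` split are assumed to come from earlier cells that this diff does not touch.

from numpy import mean, std
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

# Any of the three boosters added in this commit can stand in here.
cls = HistGradientBoostingClassifier(random_state=1)

# Step 1: baseline accuracy on the raw features via repeated stratified 10-fold CV.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(cls, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

# Step 2: refit on the autoencoder's bottleneck features and score the test set.
# `encoder` is assumed to be the trained bottleneck sub-model from earlier cells.
X_train_encode = encoder.predict(X_train)
X_test_encode = encoder.predict(X_test)
cls.fit(X_train_encode, y_train)
print('Encoded accuracy: %.4f' % accuracy_score(y_test, cls.predict(X_test_encode)))

The diff instantiates this skeleton three times, for HistGradientBoostingClassifier, XGBClassifier, and LGBMClassifier.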

Diff for: autoencoder-classification.ipynb

@@ -1126,8 +1126,6 @@
 "#rounded_yhat = np.rint(np.array(yhat))\n",
 "rounded_yhat = np.around(yhat)\n",
 "rounded_yhat = rounded_yhat.astype(int)\n",
-"#rounded_yhat=np.argmax(yhat, axis=0)\n",
-"#print(rounded_yhat)\n",
 "# calculate accuracy\n",
 "acc = accuracy_score(y_test, rounded_yhat)\n",
 "print(\"The accuracy score (acc) on test set: {:.4f}\".format(acc))\n",
@@ -1180,7 +1178,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 20,
+"execution_count": 31,
 "source": [
 "# gradient boosting for classification in scikit-learn\n",
 "from numpy import mean\n",
@@ -1195,14 +1193,11 @@
 "n_scores = cross_val_score(gbm_cls, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')\n",
 "print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))\n",
 "# fit the model on the whole dataset\n",
-"\n",
 "gbm_cls.fit(X_train, y_train)\n",
 "# make a single prediction\n",
-"yhat = model.predict(X_test)\n",
+"yhat = gbm_cls.predict(X_test)\n",
 "rounded_yhat = np.around(yhat)\n",
 "rounded_yhat = rounded_yhat.astype(int)\n",
-"#rounded_yhat=np.argmax(yhat, axis=0)\n",
-"#print(rounded_yhat)\n",
 "# calculate accuracy\n",
 "acc = accuracy_score(y_test, rounded_yhat)\n",
 "print(\"The accuracy score (acc) on test set: {:.4f}\".format(acc))"
@@ -1213,7 +1208,7 @@
 "name": "stdout",
 "text": [
 "Accuracy: 0.934 (0.023)\n",
-"The accuracy score (acc) on test set: 0.8939\n"
+"The accuracy score (acc) on test set: 0.9333\n"
 ]
 }
 ],
@@ -1281,6 +1276,240 @@
 "The autoencoder combined with the GBM regressor and classifier seems to reach the same results with or without GBM hyperparameter tuning. The best hyperparameter settings for the GBM regressor alone can do better than the combined autoencoder and GBM prediction model. "
 ],
 "metadata": {}
+},
+{
+"cell_type": "markdown",
+"source": [
+"Histogram-Based Gradient Boosting Machine for Classification\n",
+"\n",
+"The example below first evaluates a HistGradientBoostingClassifier on the training data using repeated stratified k-fold cross-validation and reports the mean accuracy. Then a single model is fit on the full training set and its accuracy is reported on the test set."
+],
+"metadata": {}
+},
+{
+"cell_type": "code",
+"execution_count": 30,
+"source": [
+"from sklearn.ensemble import HistGradientBoostingClassifier\n",
+"# evaluate the model\n",
+"hgb_cls = HistGradientBoostingClassifier(random_state=1)\n",
+"cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)\n",
+"n_scores = cross_val_score(hgb_cls, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')\n",
+"print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))\n",
+"# fit the model on the full training set\n",
+"hgb_cls.fit(X_train, y_train)\n",
+"# predict the test set\n",
+"yhat = hgb_cls.predict(X_test)\n",
+"rounded_yhat = np.around(yhat)\n",
+"rounded_yhat = rounded_yhat.astype(int)\n",
+"acc = accuracy_score(y_test, rounded_yhat)\n",
+"print(\"The accuracy score (acc) on test set: {:.4f}\".format(acc))"
+],
+"outputs": [
+{
+"output_type": "stream",
+"name": "stdout",
+"text": [
+"Accuracy: 0.941 (0.026)\n",
+"The accuracy score (acc) on test set: 0.9394\n"
+]
+}
+],
+"metadata": {}
+},
+{
+"cell_type": "code",
+"execution_count": 33,
+"source": [
+"# We can then use the encoded data to train and evaluate the HGB classifier model, as before.\n",
+"# fit model on the encoded training set\n",
+"hgb_cls.fit(X_train_encode, y_train)\n",
+"# make predictions on the encoded test set\n",
+"yhat = hgb_cls.predict(X_test_encode)\n",
+"#rounded_yhat = np.rint(np.array(yhat))\n",
+"rounded_yhat = np.around(yhat)\n",
+"rounded_yhat = rounded_yhat.astype(int)\n",
+"# calculate accuracy\n",
+"acc = accuracy_score(y_test, rounded_yhat)\n",
+"print(\"The accuracy score (acc) on test set: {:.4f}\".format(acc))"
+],
+"outputs": [
+{
+"output_type": "stream",
+"name": "stdout",
+"text": [
+"The accuracy score (acc) on test set: 0.9182\n"
+]
+}
+],
+"metadata": {}
+},
+{
+"cell_type": "markdown",
+"source": [
+"XGBoost for Classification\n",
+"\n",
+"The example below first evaluates an XGBClassifier on the training data using repeated stratified k-fold cross-validation and reports the mean accuracy. Then a single model is fit on the full training set and its accuracy is reported on the test set."
+],
+"metadata": {}
+},
+{
+"cell_type": "code",
+"execution_count": 37,
+"source": [
+"# xgboost for classification\n",
+"from xgboost import XGBClassifier\n",
+"# evaluate the model\n",
+"xgb_cls = XGBClassifier(random_state=1, use_label_encoder=False)\n",
+"cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)\n",
+"n_scores = cross_val_score(xgb_cls, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')\n",
+"print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))\n",
+"# fit the model on the full training set\n",
+"xgb_cls.fit(X_train, y_train)\n",
+"# predict the test set\n",
+"yhat = xgb_cls.predict(X_test)\n",
+"rounded_yhat = np.around(yhat)\n",
+"rounded_yhat = rounded_yhat.astype(int)\n",
+"acc = accuracy_score(y_test, rounded_yhat)\n",
+"print(\"The accuracy score (acc) on test set: {:.4f}\".format(acc))"
+],
+"outputs": [
+{
+"output_type": "stream",
+"name": "stdout",
+"text": [
+"[20:37:59] WARNING: /Users/ktietz/demo/mc3/conda-bld/xgboost-split_1628682908089/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n",
[... the same eval_metric warning repeats verbatim for each remaining cross-validation fit; elided ...]
+"Accuracy: 0.930 (0.027)\n",
+"[20:38:06] WARNING: /Users/ktietz/demo/mc3/conda-bld/xgboost-split_1628682908089/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n",
+"The accuracy score (acc) on test set: 0.9000\n"
+]
+}
+],
+"metadata": {}
+},
+{
+"cell_type": "code",
+"execution_count": 38,
+"source": [
+"# We can then use the encoded data to train and evaluate the XGB classifier model, as before.\n",
+"# fit model on the encoded training set\n",
+"xgb_cls.fit(X_train_encode, y_train)\n",
+"# make predictions on the encoded test set\n",
+"yhat = xgb_cls.predict(X_test_encode)\n",
+"#rounded_yhat = np.rint(np.array(yhat))\n",
+"rounded_yhat = np.around(yhat)\n",
+"rounded_yhat = rounded_yhat.astype(int)\n",
+"# calculate accuracy\n",
+"acc = accuracy_score(y_test, rounded_yhat)\n",
+"print(\"The accuracy score (acc) on test set: {:.4f}\".format(acc))"
+],
+"outputs": [
+{
+"output_type": "stream",
+"name": "stdout",
+"text": [
+"[20:38:29] WARNING: /Users/ktietz/demo/mc3/conda-bld/xgboost-split_1628682908089/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n",
+"The accuracy score (acc) on test set: 0.9182\n"
+]
+}
+],
+"metadata": {}
+},
+{
+"cell_type": "markdown",
+"source": [
+"LightGBM for Classification\n",
+"\n",
+"The example below first evaluates an LGBMClassifier on the training data using repeated stratified k-fold cross-validation and reports the mean accuracy. Then a single model is fit on the full training set and its accuracy is reported on the test set."
+],
+"metadata": {}
+},
+{
+"cell_type": "code",
+"execution_count": 39,
+"source": [
+"# lightgbm for classification\n",
+"from lightgbm import LGBMClassifier\n",
+"# evaluate the model\n",
+"lgbm_cls = LGBMClassifier(random_state=1)\n",
+"cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)\n",
+"n_scores = cross_val_score(lgbm_cls, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')\n",
+"print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))\n",
+"# fit the model on the full training set\n",
+"lgbm_cls.fit(X_train, y_train)\n",
+"# predict the test set\n",
+"yhat = lgbm_cls.predict(X_test)\n",
+"rounded_yhat = np.around(yhat)\n",
+"rounded_yhat = rounded_yhat.astype(int)\n",
+"acc = accuracy_score(y_test, rounded_yhat)\n",
+"print(\"The accuracy score (acc) on test set: {:.4f}\".format(acc))"
+],
+"outputs": [
+{
+"output_type": "stream",
+"name": "stdout",
+"text": [
+"Accuracy: 0.943 (0.021)\n",
+"The accuracy score (acc) on test set: 0.9273\n"
+]
+}
+],
+"metadata": {}
+},
+{
+"cell_type": "code",
+"execution_count": 40,
+"source": [
+"# We can then use the encoded data to train and evaluate the LGBM classifier model, as before.\n",
+"# fit model on the encoded training set\n",
+"lgbm_cls.fit(X_train_encode, y_train)\n",
+"# make predictions on the encoded test set\n",
+"yhat = lgbm_cls.predict(X_test_encode)\n",
+"#rounded_yhat = np.rint(np.array(yhat))\n",
+"rounded_yhat = np.around(yhat)\n",
+"rounded_yhat = rounded_yhat.astype(int)\n",
+"# calculate accuracy\n",
+"acc = accuracy_score(y_test, rounded_yhat)\n",
+"print(\"The accuracy score (acc) on test set: {:.4f}\".format(acc))"
+],
+"outputs": [
+{
+"output_type": "stream",
+"name": "stdout",
+"text": [
+"The accuracy score (acc) on test set: 0.9303\n"
+]
+}
+],
+"metadata": {}
 }
 ],
 "metadata": {
