-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalgo_chooser.py
133 lines (106 loc) · 4.75 KB
/
algo_chooser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef
from utils import preprocess_dataset , current_ms
from classifiers import init_classifiers, suppress_stdout, SUPERVISED
# Suppress RuntimeWarnings raised by ABOD and LinearDiscriminantAnalysis (whether they appear depends on the dataset)
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)
def _display_name(clf):
    """Return the name used for ``clf`` in console output.

    Uses the classifier's ``_model_name`` attribute when it has one and
    falls back to its class name otherwise, so every message in this
    script refers to a given model by the same name.
    """
    return getattr(clf, '_model_name', clf.__class__.__name__)


# Main script: train and score every classifier on one dataset, then report
# the best one by accuracy, by MCC and by prediction time.
if __name__ == "__main__":
    # Load Dataset
    df = pd.read_csv('./datasets/delayed/merged.csv')
    print(f"Dataset loaded with {len(df)} records.")

    # Preprocess Dataset
    df = preprocess_dataset(df)
    print("Dataset preprocessed successfully.")
    print(df.head())

    # Separate features and labels
    X = df.drop(columns=['label'])
    y = df['label']

    # Train/Test Split (10% held out for testing). No random_state is passed,
    # so the split, and therefore the scores below, can differ between runs.
    train_data, test_data, train_label, test_label = train_test_split(
        X, y, test_size=0.1, shuffle=True
    )
    print("Dataset split into training and testing sets.")

    # Track the best classifier based on Accuracy and MCC and prediction time
    best_acc = float('-inf')
    best_acc_clf = None
    best_acc_n = None
    best_mcc = float('-inf')
    best_mcc_clf = None
    best_mcc_n = None
    best_predict_time = float('+inf')
    best_predict_time_clf = None

    classifiers = init_classifiers(n_features=len(X.columns))

    # Evaluate Classifiers
    for clf, clf_type in classifiers:
        # Resolved once so the result line, the error line and the summary agree.
        clf_name = _display_name(clf)
        n_estimators = getattr(clf, 'n_estimators', 'N/A')
        try:
            # Train Classifier (stdout is silenced while fitting)
            with suppress_stdout():
                start_train = current_ms()
                if clf_type == SUPERVISED:
                    clf.fit(train_data, train_label)  # Supervised training
                else:
                    clf.fit(train_data)  # Unsupervised training (no labels)
                end_train = current_ms()

            # Predict on Test Data
            start_predict = current_ms()
            predicted_labels = clf.predict(test_data)  # Supervised/unsupervised prediction
            end_predict = current_ms()

            # Calculate time taken
            train_time = end_train - start_train
            prediction_time = end_predict - start_predict

            # Evaluate Model
            acc_score = accuracy_score(test_label, predicted_labels)
            mcc = matthews_corrcoef(test_label, predicted_labels)
            # Unpacking four cells requires a 2x2 matrix, i.e. exactly two
            # distinct labels across truth and predictions; any other shape
            # raises ValueError, which the handler below reports.
            tn, fp, fn, tp = confusion_matrix(test_label, predicted_labels).ravel()

            # %-formatting is kept on purpose: %d accepts both int and float timings.
            print(
                "%s (n_estimators=%s): Accuracy: %.4f, Train time: %dms, Prediction time: %dms, MCC: %.6f, TP: %d, TN: %d, FN: %d, FP: %d"
                % (
                    clf_name,
                    n_estimators,
                    acc_score,
                    train_time,
                    prediction_time,
                    mcc,
                    tp,
                    tn,
                    fn,
                    fp,
                )
            )

            # Track the best classifier based on Accuracy (ties go to the later one)
            if acc_score >= best_acc:
                best_acc = acc_score
                best_acc_clf = clf
                best_acc_n = n_estimators

            # Track the best classifier based on MCC (ties go to the later one)
            if mcc >= best_mcc:
                best_mcc = mcc
                best_mcc_clf = clf
                best_mcc_n = n_estimators

            # Track the fastest predictor; a 0ms measurement is ignored.
            if 0 < prediction_time <= best_predict_time:
                best_predict_time = prediction_time
                best_predict_time_clf = clf
        except Exception as e:
            # Best effort: a failing classifier is reported and skipped so the
            # remaining ones are still evaluated.
            print(f"Error evaluating classifier {clf_name}: {e}")

    if best_acc_clf is None:
        # Every classifier raised (or none was provided): nothing to rank.
        print("\nNo classifier could be evaluated successfully.")
    else:
        # Print the best classifier based on Accuracy
        print("\nBest Classifier based on Accuracy")
        print(f"Classifier: {_display_name(best_acc_clf)}")
        print(f"n_estimators: {best_acc_n}")
        print(f"Accuracy Score: {best_acc:.4f}")

        # Print the best classifier based on MCC
        print("\nBest Classifier based on MCC")
        print(f"Classifier: {_display_name(best_mcc_clf)}")
        print(f"n_estimators: {best_mcc_n}")
        print(f"MCC Score: {best_mcc:.6f}")

        # Print the best classifier based on prediction time
        print("\nBest Classifier based on prediction time")
        if best_predict_time_clf is None:
            # Only happens when every measured prediction time was 0ms.
            print("No classifier had a measurable (> 0ms) prediction time.")
        else:
            print(f"Classifier: {_display_name(best_predict_time_clf)}")
            print(f"Time : {best_predict_time:.6f}ms")
# Export the model, preferring the best-accuracy model over the best-MCC one.
# The dump is commented out because XGBClassifier does not provide the best accuracy on every run.
# The reason it was chosen over the other models is explained in README.md.