# seed_hacking_sgd_classifier.py
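"""Seed hacking demo: re-fit SGDClassifier under many random seeds on fixed
cross-validation folds and keep the best score. The cherry-picked 'best seed'
inflates apparent performance; the statistics and histogram at the end show
how far it sits from a typical run."""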
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score, KFold
from statistics import mean, median, stdev
# Define the number of trials
num_trials = 100
# Define variables to track the best seed and best performance
best_seed = None
best_performance = -np.inf
performance_scores = [] # List to store performance scores
# Create a synthetic classification dataset
X, y = make_classification(n_samples=100, n_features=5, n_informative=4, n_redundant=1, random_state=42)
# Fix the cross-validation folds for all evaluations
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Iterate over multiple seeds for the model's randomness
for trial in range(num_trials):
    # Use the trial index as the seed for the SGD classifier
    seed = trial
    # Initialize the model with the current seed
    model = SGDClassifier(random_state=seed)
    # Evaluate the model on the fixed cross-validation folds
    scores = cross_val_score(model, X, y, cv=kf)
    # Record the mean cross-validation accuracy for this seed
    mean_performance = scores.mean()
    performance_scores.append(mean_performance)
    # Print the seed and performance whenever there is an improvement
    if mean_performance > best_performance:
        print(f"Seed: {seed}, Performance: {mean_performance:.4f}")
        best_performance = mean_performance
        best_seed = seed
# Report the best seed and its performance
print(f"\nBest Seed: {best_seed}, Best Performance: {best_performance:.4f}")
# Calculate statistics
min_score = min(performance_scores)
max_score = max(performance_scores)
median_score = median(performance_scores)
mean_score = mean(performance_scores)
std_dev_score = stdev(performance_scores)
print("\nPerformance Statistics:")
print(f"Minimum: {min_score:.4f}")
print(f"Median: {median_score:.4f}")
print(f"Maximum: {max_score:.4f}")
print(f"Mean: {mean_score:.4f}")
print(f"Standard Deviation: {std_dev_score:.4f}")
# Plot the distribution of performance scores
plt.hist(performance_scores, bins=10, edgecolor='black', alpha=0.7)
plt.title('Distribution of Performance Scores')
plt.xlabel('Performance Score')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
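# Mark the cherry-picked best score on the histogram (illustrative addition)
plt.axvline(best_performance, color='red', linestyle='--', label=f'Best seed ({best_seed})')
plt.legend()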
plt.show()
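# For contrast, a less gameable summary: average over many fold assignments
# with a single fixed model seed (a sketch; RepeatedStratifiedKFold is one
# standard choice here, and the model seed 0 is an arbitrary assumption).
from sklearn.model_selection import RepeatedStratifiedKFold

rkf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)
honest_scores = cross_val_score(SGDClassifier(random_state=0), X, y, cv=rkf)
print(f"\nRepeated-CV estimate: {honest_scores.mean():.4f} +/- {honest_scores.std():.4f}")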