forked from mvieth/extending-drift-detection-methods
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHDDDM.py
204 lines (183 loc) · 10.1 KB
/
HDDDM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
import numpy as np
from scipy.stats import t, ttest_ind, fisher_exact, barnard_exact, chi2_contingency
import matplotlib.pyplot as plt
def compute_histogram(X: np.ndarray, n_bins: int) -> np.ndarray:
"""
Compute a histogram from a collection of samples
:param X: collection of samples, all elements must be between 0 and 1
:param n_bins: number of bins for each dimension
:return: the histogram
"""
result = np.zeros((X.shape[1], n_bins))
for i in range(X.shape[1]):
result[i, :] = np.histogram(X[:, i], bins=n_bins, range=(0.0, 1.0), density=False)[0]
return result
def eval_histogram(x, hist) -> float: # assuming all elements of x are in [0; 1], and that the histogram is normalized (i.e. each row sums up to 1)
"""
Determine how well a sample fits a histogram
:param x: a sample
:param hist: the histogram
:return: a score specifying how well the sample fits the histogram
"""
assert x.shape[0] == hist.shape[0]
score = 0.0
for i in range(x.shape[0]):
score += hist[i, int(0.999*hist.shape[1]*x[i])]
return score/x.shape[0]
def compute_hellinger_dist(P, Q):
feature_distances = np.sqrt(np.sum(np.square(np.sqrt(np.divide(P, np.tile(np.sum(P, axis=1), (P.shape[1], 1)).transpose())) -
np.sqrt(np.divide(Q, np.tile(np.sum(Q, axis=1), (Q.shape[1], 1)).transpose()))), axis=1))
return np.mean(feature_distances), feature_distances
class HDDDM:
def __init__(self, gamma=None, alpha=None, batching_size=20, stride=None, visualize=False, verbose=False, localize_drifts=True):
"""
Hellinger Distance Drift Detection Method from "Hellinger distance based drift detection for nonstationary environments" by Ditzler and Polikar
:param gamma: how sensitive the drift detection is (higher value means fewer detections)
:param alpha: a different way to specify how sensitive the drift detection is (either gamma or alpha must be specified, but not both)
:param batching_size: the size of a batch (how many of the samples should make up the after-the-drift set)
:param stride: currently unused
:param visualize: whether to do visualizations when a drift is detected
:param verbose: whether to print additional information
:param localize_drifts: whether to localize the drifts in time, as described in "Extending Drift Detection Methods to Identify When Exactly the Change Happened" by Vieth et al.
"""
if gamma is None and alpha is None:
raise ValueError("Gamma and alpha can not be None at the same time! Please specify either gamma or alpha")
elif gamma is not None and alpha is not None:
raise ValueError("Specify either gamma or alpha, not both!")
elif gamma is None and alpha is not None:
self.gamma = None
self.alpha = max(0.0, min(0.5, alpha))
else:
self.gamma = max(0.0, gamma)
self.alpha = None
self.batching_size = max(1, int(batching_size))
if stride is None:
self.stride = self.batching_size
else:
self.stride = int(stride)
self.X_baseline = None
self.n_bins = None
self.hist_baseline = None
self.n_samples = 0
self.dist_old = np.nan
self.epsilons = []
self.accumulator = []
self.drift_delay = self.batching_size
self.localize_drifts = localize_drifts
self.visualize = visualize
self.verbose = verbose
self.most_important_feature = 0
def update(self, x):
self.accumulator.append(x)
if len(self.accumulator) >= self.batching_size:
X = np.zeros(shape=(len(self.accumulator), len(self.accumulator[0])))
for i in range(len(self.accumulator)):
X[i, :] = self.accumulator[i]
self.accumulator = []
return self.add_batch(X)
else:
return []
def add_batch(self, X):
if self.n_bins is None:
self.n_bins = int(np.floor(np.sqrt(X.shape[0])))
if self.hist_baseline is None:
self.X_baseline = X
self.hist_baseline = compute_histogram(X, self.n_bins)
self.n_samples = X.shape[0]
return []
hist = compute_histogram(X, self.n_bins)
dist, all_feature_distances = compute_hellinger_dist(self.hist_baseline, hist)
n_samples = X.shape[0]
if np.isnan(self.dist_old):
self.dist_old = dist
self.hist_baseline += hist
self.n_samples += n_samples
self.X_baseline = np.vstack((self.X_baseline, X))
return []
eps = dist - self.dist_old
self.dist_old = dist
if len(self.epsilons) < 2:
self.epsilons.append(eps)
self.hist_baseline += hist
self.n_samples += n_samples
self.X_baseline = np.vstack((self.X_baseline, X))
return []
epsilon_hat = np.sum(np.abs(self.epsilons))/len(self.epsilons)
sigma_hat = np.sqrt(np.sum(np.square(np.abs(self.epsilons) - epsilon_hat)) / len(self.epsilons))
if self.gamma is not None:
beta = epsilon_hat + self.gamma * sigma_hat
else:
beta = epsilon_hat + t.ppf(1.0 - self.alpha / 2, self.n_samples + n_samples - 2) * sigma_hat / np.sqrt(len(self.epsilons))
self.epsilons.append(eps)
# Test for drift
if self.verbose:
print("eps=", eps, "beta=", beta)
drift = np.abs(eps) > beta
if drift:
if self.verbose:
print("eps=", eps, "beta=", beta, "epsilon_hat=", epsilon_hat, "sigma_hat=", sigma_hat, "len(epsilons)=", len(self.epsilons), "epsilons=", self.epsilons)
if self.localize_drifts:
# determine drift location:
scores_binary = []
scores_cont = []
hist_old = self.hist_baseline
hist_new = hist
hist_baseline_normalized = np.divide(hist_old, np.tile(np.sum(hist_old, axis=1), (hist_old.shape[1], 1)).transpose())
hist_normalized = np.divide(hist_new, np.tile(np.sum(hist_new, axis=1), (hist_new.shape[1], 1)).transpose())
for i in range(self.X_baseline.shape[0]):
a = eval_histogram(self.X_baseline[i, :], hist_baseline_normalized)
b = eval_histogram(self.X_baseline[i, :], hist_normalized)
scores_cont.append(b/(a+b))
scores_binary.append(a < b)
for i in range(X.shape[0]):
a = eval_histogram(X[i, :], hist_baseline_normalized)
b = eval_histogram(X[i, :], hist_normalized)
scores_cont.append(b/(a+b))
scores_binary.append(a < b)
best_i = len(scores_binary) - self.batching_size
best_i_value = -99999999.9
for i_test in range(max(3, len(scores_binary) - 2 * X.shape[0]), len(scores_binary)-2):
mean_diff = -fisher_exact([[np.sum(scores_binary[i_test:]), np.sum(scores_binary[:i_test])],
[len(scores_binary[i_test:]) - np.sum(scores_binary[i_test:]), len(scores_binary[:i_test]) - np.sum(scores_binary[:i_test])]], alternative='greater').pvalue
if mean_diff > best_i_value:
best_i = i_test
best_i_value = mean_diff
self.drift_delay = len(scores_binary) - best_i - 0.5
if self.verbose:
print("best_i=", best_i, "best_i_value=", best_i_value, "len(scores_binary)=", len(scores_binary), "drift_delay=", self.drift_delay)
if self.visualize:
a = np.zeros(len(scores_binary))
b = np.zeros(len(scores_binary))
correction = np.zeros(len(scores_binary))
for i_test in range(3, len(scores_binary)-2): # At least 3 samples on each side
p2 = np.mean(scores_binary[:i_test])
p1 = np.mean(scores_binary[i_test:])
n2 = len(scores_binary[:i_test])
n1 = len(scores_binary[i_test:])
tmp = p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2
correction[i_test] = (p1 - p2) / np.sqrt(tmp)
a[i_test] = fisher_exact([[np.sum(scores_binary[:i_test]), len(scores_binary[:i_test])-np.sum(scores_binary[:i_test])], [np.sum(scores_binary[i_test:]), len(scores_binary[i_test:])-np.sum(scores_binary[i_test:])]], alternative='less').pvalue
b[i_test] = (np.mean(scores_cont[i_test:])-np.mean(scores_cont[:i_test]))
plt.axvline(x=len(scores_binary)-self.batching_size, color="tab:red", ls="--", label="initial histogram split")
plt.plot(scores_binary, label="binary scores")
plt.plot((scores_cont-np.nanmin(scores_cont))/(np.nanmax(scores_cont-np.nanmin(scores_cont))), label="continuous scores")
plt.plot(a, label="fisher_exact")
plt.plot(b/np.nanmax(b), label="uncorrected diff of means")
plt.plot(correction/np.nanmax(correction), label="correction")
plt.legend()
plt.show()
self.epsilons = []
self.dist_old = np.nan
if self.drift_delay <= X.shape[0]:
self.X_baseline = X[max(0, int(X.shape[0]-self.drift_delay)):, :]
else:
self.X_baseline = np.vstack((self.X_baseline[max(0, int(self.X_baseline.shape[0]-(self.drift_delay-X.shape[0]))):, :], X))
self.n_samples = self.X_baseline.shape[0]
self.n_bins = int(np.floor(np.sqrt(self.batching_size)))
self.hist_baseline = compute_histogram(self.X_baseline, self.n_bins)
return [self.drift_delay]
else:
self.hist_baseline += hist
self.n_samples += n_samples
self.X_baseline = np.vstack((self.X_baseline, X))
return []