Skip to content

Commit 241649f

Browse files
committed
Upload code
1 parent 424148a commit 241649f

File tree

7 files changed

+1473
-2
lines changed

7 files changed

+1473
-2
lines changed

HDDDM.py

+204
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
import numpy as np
2+
from scipy.stats import t, ttest_ind, fisher_exact, barnard_exact, chi2_contingency
3+
import matplotlib.pyplot as plt
4+
5+
6+
def compute_histogram(X: np.ndarray, n_bins: int) -> np.ndarray:
7+
"""
8+
Compute a histogram from a collection of samples
9+
:param X: collection of samples, all elements must be between 0 and 1
10+
:param n_bins: number of bins for each dimension
11+
:return: the histogram
12+
"""
13+
result = np.zeros((X.shape[1], n_bins))
14+
for i in range(X.shape[1]):
15+
result[i, :] = np.histogram(X[:, i], bins=n_bins, range=(0.0, 1.0), density=False)[0]
16+
return result
17+
18+
19+
def eval_histogram(x, hist) -> float: # assuming all elements of x are in [0; 1], and that the histogram is normalized (i.e. each row sums up to 1)
20+
"""
21+
Determine how well a sample fits a histogram
22+
:param x: a sample
23+
:param hist: the histogram
24+
:return: a score specifying how well the sample fits the histogram
25+
"""
26+
assert x.shape[0] == hist.shape[0]
27+
score = 0.0
28+
for i in range(x.shape[0]):
29+
score += hist[i, int(0.999*hist.shape[1]*x[i])]
30+
return score/x.shape[0]
31+
32+
33+
def compute_hellinger_dist(P, Q):
34+
feature_distances = np.sqrt(np.sum(np.square(np.sqrt(np.divide(P, np.tile(np.sum(P, axis=1), (P.shape[1], 1)).transpose())) -
35+
np.sqrt(np.divide(Q, np.tile(np.sum(Q, axis=1), (Q.shape[1], 1)).transpose()))), axis=1))
36+
return np.mean(feature_distances), feature_distances
37+
38+
39+
class HDDDM:
40+
def __init__(self, gamma=None, alpha=None, batching_size=20, stride=None, visualize=False, verbose=False, localize_drifts=True):
41+
"""
42+
Hellinger Distance Drift Detection Method from "Hellinger distance based drift detection for nonstationary environments" by Ditzler and Polikar
43+
:param gamma: how sensitive the drift detection is (higher value means fewer detections)
44+
:param alpha: a different way to specify how sensitive the drift detection is (either gamma or alpha must be specified, but not both)
45+
:param batching_size: the size of a batch (how many of the samples should make up the after-the-drift set)
46+
:param stride: currently unused
47+
:param visualize: whether to do visualizations when a drift is detected
48+
:param verbose: whether to print additional information
49+
:param localize_drifts: whether to localize the drifts in time, as described in "Extending Drift Detection Methods to Identify When Exactly the Change Happened" by Vieth et al.
50+
"""
51+
if gamma is None and alpha is None:
52+
raise ValueError("Gamma and alpha can not be None at the same time! Please specify either gamma or alpha")
53+
elif gamma is not None and alpha is not None:
54+
raise ValueError("Specify either gamma or alpha, not both!")
55+
elif gamma is None and alpha is not None:
56+
self.gamma = None
57+
self.alpha = max(0.0, min(0.5, alpha))
58+
else:
59+
self.gamma = max(0.0, gamma)
60+
self.alpha = None
61+
self.batching_size = max(1, int(batching_size))
62+
if stride is None:
63+
self.stride = self.batching_size
64+
else:
65+
self.stride = int(stride)
66+
67+
self.X_baseline = None
68+
self.n_bins = None
69+
self.hist_baseline = None
70+
self.n_samples = 0
71+
self.dist_old = np.nan
72+
self.epsilons = []
73+
self.accumulator = []
74+
self.drift_delay = self.batching_size
75+
self.localize_drifts = localize_drifts
76+
self.visualize = visualize
77+
self.verbose = verbose
78+
79+
self.most_important_feature = 0
80+
81+
def update(self, x):
82+
self.accumulator.append(x)
83+
if len(self.accumulator) >= self.batching_size:
84+
X = np.zeros(shape=(len(self.accumulator), len(self.accumulator[0])))
85+
for i in range(len(self.accumulator)):
86+
X[i, :] = self.accumulator[i]
87+
self.accumulator = []
88+
return self.add_batch(X)
89+
else:
90+
return []
91+
92+
def add_batch(self, X):
93+
if self.n_bins is None:
94+
self.n_bins = int(np.floor(np.sqrt(X.shape[0])))
95+
if self.hist_baseline is None:
96+
self.X_baseline = X
97+
self.hist_baseline = compute_histogram(X, self.n_bins)
98+
self.n_samples = X.shape[0]
99+
return []
100+
101+
hist = compute_histogram(X, self.n_bins)
102+
dist, all_feature_distances = compute_hellinger_dist(self.hist_baseline, hist)
103+
n_samples = X.shape[0]
104+
105+
if np.isnan(self.dist_old):
106+
self.dist_old = dist
107+
self.hist_baseline += hist
108+
self.n_samples += n_samples
109+
self.X_baseline = np.vstack((self.X_baseline, X))
110+
return []
111+
eps = dist - self.dist_old
112+
self.dist_old = dist
113+
114+
if len(self.epsilons) < 2:
115+
self.epsilons.append(eps)
116+
self.hist_baseline += hist
117+
self.n_samples += n_samples
118+
self.X_baseline = np.vstack((self.X_baseline, X))
119+
return []
120+
epsilon_hat = np.sum(np.abs(self.epsilons))/len(self.epsilons)
121+
sigma_hat = np.sqrt(np.sum(np.square(np.abs(self.epsilons) - epsilon_hat)) / len(self.epsilons))
122+
123+
if self.gamma is not None:
124+
beta = epsilon_hat + self.gamma * sigma_hat
125+
else:
126+
beta = epsilon_hat + t.ppf(1.0 - self.alpha / 2, self.n_samples + n_samples - 2) * sigma_hat / np.sqrt(len(self.epsilons))
127+
self.epsilons.append(eps)
128+
129+
# Test for drift
130+
if self.verbose:
131+
print("eps=", eps, "beta=", beta)
132+
drift = np.abs(eps) > beta
133+
134+
if drift:
135+
if self.verbose:
136+
print("eps=", eps, "beta=", beta, "epsilon_hat=", epsilon_hat, "sigma_hat=", sigma_hat, "len(epsilons)=", len(self.epsilons), "epsilons=", self.epsilons)
137+
if self.localize_drifts:
138+
# determine drift location:
139+
scores_binary = []
140+
scores_cont = []
141+
hist_old = self.hist_baseline
142+
hist_new = hist
143+
hist_baseline_normalized = np.divide(hist_old, np.tile(np.sum(hist_old, axis=1), (hist_old.shape[1], 1)).transpose())
144+
hist_normalized = np.divide(hist_new, np.tile(np.sum(hist_new, axis=1), (hist_new.shape[1], 1)).transpose())
145+
for i in range(self.X_baseline.shape[0]):
146+
a = eval_histogram(self.X_baseline[i, :], hist_baseline_normalized)
147+
b = eval_histogram(self.X_baseline[i, :], hist_normalized)
148+
scores_cont.append(b/(a+b))
149+
scores_binary.append(a < b)
150+
for i in range(X.shape[0]):
151+
a = eval_histogram(X[i, :], hist_baseline_normalized)
152+
b = eval_histogram(X[i, :], hist_normalized)
153+
scores_cont.append(b/(a+b))
154+
scores_binary.append(a < b)
155+
best_i = len(scores_binary) - self.batching_size
156+
best_i_value = -99999999.9
157+
for i_test in range(max(3, len(scores_binary) - 2 * X.shape[0]), len(scores_binary)-2):
158+
mean_diff = -fisher_exact([[np.sum(scores_binary[i_test:]), np.sum(scores_binary[:i_test])],
159+
[len(scores_binary[i_test:]) - np.sum(scores_binary[i_test:]), len(scores_binary[:i_test]) - np.sum(scores_binary[:i_test])]], alternative='greater').pvalue
160+
if mean_diff > best_i_value:
161+
best_i = i_test
162+
best_i_value = mean_diff
163+
self.drift_delay = len(scores_binary) - best_i - 0.5
164+
if self.verbose:
165+
print("best_i=", best_i, "best_i_value=", best_i_value, "len(scores_binary)=", len(scores_binary), "drift_delay=", self.drift_delay)
166+
167+
if self.visualize:
168+
a = np.zeros(len(scores_binary))
169+
b = np.zeros(len(scores_binary))
170+
correction = np.zeros(len(scores_binary))
171+
for i_test in range(3, len(scores_binary)-2): # At least 3 samples on each side
172+
p2 = np.mean(scores_binary[:i_test])
173+
p1 = np.mean(scores_binary[i_test:])
174+
n2 = len(scores_binary[:i_test])
175+
n1 = len(scores_binary[i_test:])
176+
tmp = p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2
177+
correction[i_test] = (p1 - p2) / np.sqrt(tmp)
178+
a[i_test] = fisher_exact([[np.sum(scores_binary[:i_test]), len(scores_binary[:i_test])-np.sum(scores_binary[:i_test])], [np.sum(scores_binary[i_test:]), len(scores_binary[i_test:])-np.sum(scores_binary[i_test:])]], alternative='less').pvalue
179+
b[i_test] = (np.mean(scores_cont[i_test:])-np.mean(scores_cont[:i_test]))
180+
181+
plt.axvline(x=len(scores_binary)-self.batching_size, color="tab:red", ls="--", label="initial histogram split")
182+
plt.plot(scores_binary, label="binary scores")
183+
plt.plot((scores_cont-np.nanmin(scores_cont))/(np.nanmax(scores_cont-np.nanmin(scores_cont))), label="continuous scores")
184+
plt.plot(a, label="fisher_exact")
185+
plt.plot(b/np.nanmax(b), label="uncorrected diff of means")
186+
plt.plot(correction/np.nanmax(correction), label="correction")
187+
plt.legend()
188+
plt.show()
189+
190+
self.epsilons = []
191+
self.dist_old = np.nan
192+
if self.drift_delay <= X.shape[0]:
193+
self.X_baseline = X[max(0, int(X.shape[0]-self.drift_delay)):, :]
194+
else:
195+
self.X_baseline = np.vstack((self.X_baseline[max(0, int(self.X_baseline.shape[0]-(self.drift_delay-X.shape[0]))):, :], X))
196+
self.n_samples = self.X_baseline.shape[0]
197+
self.n_bins = int(np.floor(np.sqrt(self.batching_size)))
198+
self.hist_baseline = compute_histogram(self.X_baseline, self.n_bins)
199+
return [self.drift_delay]
200+
else:
201+
self.hist_baseline += hist
202+
self.n_samples += n_samples
203+
self.X_baseline = np.vstack((self.X_baseline, X))
204+
return []

0 commit comments

Comments
 (0)