HammerLabML
diff --git a/‎HDDDM.py
+204 b/‎HDDDM.py
+204
@@ -0,0 +1,204 @@
+import numpy as np
+from scipy.stats import t, ttest_ind, fisher_exact, barnard_exact, chi2_contingency
+import matplotlib.pyplot as plt
+
+
+def compute_histogram(X: np.ndarray, n_bins: int) -> np.ndarray:
+    """
+    Compute a histogram from a collection of samples
+    :param X: collection of samples, all elements must be between 0 and 1
+    :param n_bins: number of bins for each dimension
+    :return: the histogram
+    """
+    result = np.zeros((X.shape[1], n_bins))
+    for i in range(X.shape[1]):
+        result[i, :] = np.histogram(X[:, i], bins=n_bins, range=(0.0, 1.0), density=False)[0]
+    return result
+
+
+def eval_histogram(x, hist) -> float:  # assuming all elements of x are in [0; 1], and that the histogram is normalized (i.e. each row sums up to 1)
+    """
+    Determine how well a sample fits a histogram
+    :param x: a sample
+    :param hist: the histogram
+    :return: a score specifying how well the sample fits the histogram
+    """
+    assert x.shape[0] == hist.shape[0]
+    score = 0.0
+    for i in range(x.shape[0]):
+        score += hist[i, int(0.999*hist.shape[1]*x[i])]
+    return score/x.shape[0]
+
+
+def compute_hellinger_dist(P, Q):
+    feature_distances = np.sqrt(np.sum(np.square(np.sqrt(np.divide(P, np.tile(np.sum(P, axis=1), (P.shape[1], 1)).transpose())) -
+                                                 np.sqrt(np.divide(Q, np.tile(np.sum(Q, axis=1), (Q.shape[1], 1)).transpose()))), axis=1))
+    return np.mean(feature_distances), feature_distances
+
+
+class HDDDM:
+    """
+    Batch-wise drift detector that compares the histogram of each new batch with the histogram of
+    a growing baseline using the Hellinger distance, and flags a drift when the change of that
+    distance between consecutive batches exceeds an adaptive threshold.
+    Samples can be fed one at a time via update() or as complete batches via add_batch().
+    """
+    def __init__(self, gamma=None, alpha=None, batching_size=20, stride=None, visualize=False, verbose=False, localize_drifts=True):
+        """
+        Hellinger Distance Drift Detection Method from "Hellinger distance based drift detection for nonstationary environments" by Ditzler and Polikar
+        :param gamma: how sensitive the drift detection is (higher value means fewer detections); negative values are clamped to 0
+        :param alpha: a different way to specify how sensitive the drift detection is (either gamma or alpha must be specified, but not both); clamped to [0, 0.5]
+        :param batching_size: the size of a batch (how many of the samples should make up the after-the-drift set); values below 1 are raised to 1
+        :param stride: currently unused (stored as an int, defaults to batching_size)
+        :param visualize: whether to do visualizations when a drift is detected
+        :param verbose: whether to print additional information
+        :param localize_drifts: whether to localize the drifts in time, as described in "Extending Drift Detection Methods to Identify When Exactly the Change Happened" by Vieth et al.
+        :raises ValueError: if gamma and alpha are both None or both given
+        """
+        if gamma is None and alpha is None:
+            raise ValueError("Gamma and alpha can not be None at the same time! Please specify either gamma or alpha")
+        elif gamma is not None and alpha is not None:
+            raise ValueError("Specify either gamma or alpha, not both!")
+        elif gamma is None and alpha is not None:
+            self.gamma = None
+            self.alpha = max(0.0, min(0.5, alpha))
+        else:
+            self.gamma = max(0.0, gamma)
+            self.alpha = None
+        self.batching_size = max(1, int(batching_size))
+        if stride is None:
+            self.stride = self.batching_size
+        else:
+            self.stride = int(stride)
+
+        # Detector state
+        self.X_baseline = None  # all samples currently belonging to the baseline
+        self.n_bins = None  # bins per feature, fixed when the first batch arrives
+        self.hist_baseline = None  # accumulated (unnormalized) histogram of the baseline
+        self.n_samples = 0  # number of samples in the baseline
+        self.dist_old = np.nan  # Hellinger distance of the previous batch; NaN means "none yet"
+        self.epsilons = []  # distance differences observed since the baseline was last reset
+        self.accumulator = []  # single samples buffered by update()
+        self.drift_delay = self.batching_size  # reported delay; only overwritten when drifts are localized
+        self.localize_drifts = localize_drifts
+        self.visualize = visualize
+        self.verbose = verbose
+
+        # NOTE(review): assigned here but not read anywhere else in this class
+        self.most_important_feature = 0
+    
+    def update(self, x):
+        """
+        Buffer a single sample and process the buffer as a batch once batching_size samples are available
+        :param x: one sample; it must support len() and be assignable to one row of a float array
+        :return: an empty list while samples are still being buffered, otherwise the result of add_batch
+        """
+        self.accumulator.append(x)
+        if len(self.accumulator) >= self.batching_size:
+            # copy the buffered samples into a (number of samples, number of features) float matrix
+            X = np.zeros(shape=(len(self.accumulator), len(self.accumulator[0])))
+            for i in range(len(self.accumulator)):
+                X[i, :] = self.accumulator[i]
+            self.accumulator = []
+            return self.add_batch(X)
+        else:
+            return []
+
+    def add_batch(self, X):
+        """
+        Process one batch of samples and test it for drift against the current baseline
+        :param X: 2-D array with one row per sample and one column per feature; the histograms only cover values in [0, 1]
+        :return: an empty list if no drift was detected (the batch is then merged into the baseline),
+                 otherwise a one-element list containing the drift delay (self.drift_delay)
+        """
+        if self.n_bins is None:
+            # number of bins = floor(sqrt(size of the very first batch))
+            self.n_bins = int(np.floor(np.sqrt(X.shape[0])))
+        if self.hist_baseline is None:
+            # very first batch: it simply becomes the baseline
+            self.X_baseline = X
+            self.hist_baseline = compute_histogram(X, self.n_bins)
+            self.n_samples = X.shape[0]
+            return []
+
+        hist = compute_histogram(X, self.n_bins)
+        dist, all_feature_distances = compute_hellinger_dist(self.hist_baseline, hist)
+        n_samples = X.shape[0]
+
+        if np.isnan(self.dist_old):
+            # first distance since the baseline was (re)started: there is no previous distance
+            # to compare against yet, so only remember it and absorb the batch
+            self.dist_old = dist
+            self.hist_baseline += hist
+            self.n_samples += n_samples
+            self.X_baseline = np.vstack((self.X_baseline, X))
+            return []
+        # change of the distance relative to the previous batch
+        eps = dist - self.dist_old
+        self.dist_old = dist
+
+        if len(self.epsilons) < 2:
+            # at least two recorded differences are collected before any threshold is computed
+            self.epsilons.append(eps)
+            self.hist_baseline += hist
+            self.n_samples += n_samples
+            self.X_baseline = np.vstack((self.X_baseline, X))
+            return []
+        # mean and (population) standard deviation of the absolute differences seen so far
+        epsilon_hat = np.sum(np.abs(self.epsilons))/len(self.epsilons)
+        sigma_hat = np.sqrt(np.sum(np.square(np.abs(self.epsilons) - epsilon_hat)) / len(self.epsilons))
+
+        if self.gamma is not None:
+            # threshold = mean + gamma standard deviations
+            beta = epsilon_hat + self.gamma * sigma_hat
+        else:
+            # threshold from the Student-t quantile at 1 - alpha/2, with
+            # (baseline samples + batch samples - 2) degrees of freedom
+            beta = epsilon_hat + t.ppf(1.0 - self.alpha / 2, self.n_samples + n_samples - 2) * sigma_hat / np.sqrt(len(self.epsilons))
+        # appended only after beta was computed, so the current difference does not influence its own threshold
+        self.epsilons.append(eps)
+
+        # Test for drift
+        if self.verbose:
+            print("eps=", eps, "beta=", beta)
+        drift = np.abs(eps) > beta
+
+        if drift:
+            if self.verbose:
+                print("eps=", eps, "beta=", beta, "epsilon_hat=", epsilon_hat, "sigma_hat=", sigma_hat, "len(epsilons)=", len(self.epsilons), "epsilons=", self.epsilons)
+            if self.localize_drifts:
+                # determine drift location:
+                # every sample (baseline first, then the new batch) is scored by how well it fits
+                # the baseline histogram (a) versus the histogram of the new batch (b)
+                scores_binary = []
+                scores_cont = []
+                hist_old = self.hist_baseline
+                hist_new = hist
+                # normalize each histogram row so that it sums to 1
+                hist_baseline_normalized = np.divide(hist_old, np.tile(np.sum(hist_old, axis=1), (hist_old.shape[1], 1)).transpose())
+                hist_normalized = np.divide(hist_new, np.tile(np.sum(hist_new, axis=1), (hist_new.shape[1], 1)).transpose())
+                for i in range(self.X_baseline.shape[0]):
+                    a = eval_histogram(self.X_baseline[i, :], hist_baseline_normalized)
+                    b = eval_histogram(self.X_baseline[i, :], hist_normalized)
+                    # continuous score: share of the new histogram in the total fit (NaN if a + b == 0)
+                    scores_cont.append(b/(a+b))
+                    # binary score: True if the sample fits the new histogram better
+                    scores_binary.append(a < b)
+                for i in range(X.shape[0]):
+                    a = eval_histogram(X[i, :], hist_baseline_normalized)
+                    b = eval_histogram(X[i, :], hist_normalized)
+                    scores_cont.append(b/(a+b))
+                    scores_binary.append(a < b)
+                # fallback split point: the drift started batching_size samples before the end
+                best_i = len(scores_binary) - self.batching_size
+                best_i_value = -99999999.9
+                # candidate split points: only within the last two batch lengths, keeping at least 3 samples on each side
+                for i_test in range(max(3, len(scores_binary) - 2 * X.shape[0]), len(scores_binary)-2):
+                    # 2x2 table: columns = (after split, before split), rows = (True scores, False scores).
+                    # The p-value is negated so that the smallest p-value gives the largest mean_diff.
+                    mean_diff = -fisher_exact([[np.sum(scores_binary[i_test:]), np.sum(scores_binary[:i_test])],
+                                               [len(scores_binary[i_test:]) - np.sum(scores_binary[i_test:]), len(scores_binary[:i_test]) - np.sum(scores_binary[:i_test])]], alternative='greater').pvalue
+                    if mean_diff > best_i_value:
+                        best_i = i_test
+                        best_i_value = mean_diff
+                # number of samples after the chosen split point, minus 0.5
+                self.drift_delay = len(scores_binary) - best_i - 0.5
+                if self.verbose:
+                    print("best_i=", best_i, "best_i_value=", best_i_value, "len(scores_binary)=", len(scores_binary), "drift_delay=", self.drift_delay)
+
+                if self.visualize:
+                    # diagnostic curves over all possible split points (for plotting only,
+                    # they do not influence the detection result)
+                    a = np.zeros(len(scores_binary))
+                    b = np.zeros(len(scores_binary))
+                    correction = np.zeros(len(scores_binary))
+                    for i_test in range(3, len(scores_binary)-2):  # At least 3 samples on each side
+                        # p2/n2: share of True scores and sample count before the split; p1/n1: after the split
+                        p2 = np.mean(scores_binary[:i_test])
+                        p1 = np.mean(scores_binary[i_test:])
+                        n2 = len(scores_binary[:i_test])
+                        n1 = len(scores_binary[i_test:])
+                        tmp = p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2
+                        # difference of the two proportions divided by the square root of tmp
+                        correction[i_test] = (p1 - p2) / np.sqrt(tmp)
+                        a[i_test] = fisher_exact([[np.sum(scores_binary[:i_test]), len(scores_binary[:i_test])-np.sum(scores_binary[:i_test])], [np.sum(scores_binary[i_test:]), len(scores_binary[i_test:])-np.sum(scores_binary[i_test:])]], alternative='less').pvalue
+                        b[i_test] = (np.mean(scores_cont[i_test:])-np.mean(scores_cont[:i_test]))
+
+                    plt.axvline(x=len(scores_binary)-self.batching_size, color="tab:red", ls="--", label="initial histogram split")
+                    plt.plot(scores_binary, label="binary scores")
+                    # continuous scores rescaled to [0, 1] for display
+                    plt.plot((scores_cont-np.nanmin(scores_cont))/(np.nanmax(scores_cont-np.nanmin(scores_cont))), label="continuous scores")
+                    plt.plot(a, label="fisher_exact")
+                    plt.plot(b/np.nanmax(b), label="uncorrected diff of means")
+                    plt.plot(correction/np.nanmax(correction), label="correction")
+                    plt.legend()
+                    plt.show()
+
+            # Restart the detector: forget the history and rebuild the baseline from the
+            # most recent drift_delay samples only
+            self.epsilons = []
+            self.dist_old = np.nan
+            if self.drift_delay <= X.shape[0]:
+                # the post-drift samples all lie inside the current batch: keep its tail
+                self.X_baseline = X[max(0, int(X.shape[0]-self.drift_delay)):, :]
+            else:
+                # the drift started before this batch: keep the tail of the old baseline plus the whole batch
+                self.X_baseline = np.vstack((self.X_baseline[max(0, int(self.X_baseline.shape[0]-(self.drift_delay-X.shape[0]))):, :], X))
+            self.n_samples = self.X_baseline.shape[0]
+            # the bin count is re-derived from batching_size here (not from the size of X as for the first batch)
+            self.n_bins = int(np.floor(np.sqrt(self.batching_size)))
+            self.hist_baseline = compute_histogram(self.X_baseline, self.n_bins)
+            return [self.drift_delay]
+        else:
+            # no drift: merge the batch into the baseline
+            self.hist_baseline += hist
+            self.n_samples += n_samples
+            self.X_baseline = np.vstack((self.X_baseline, X))
+            return []