|
| 1 | +import typing as t |
| 2 | + |
| 3 | +import matplotlib.pyplot as plt |
| 4 | +import numpy as np |
| 5 | +import zntrack |
| 6 | + |
| 7 | +from ipsuite import base |
| 8 | + |
| 9 | + |
class FilterOutlier(base.ProcessAtoms):
    """Remove outliers from the data based on a given property.

    A structure counts as an outlier when its value for ``key`` lies more
    than ``threshold`` standard deviations away from the mean of all values,
    on the side(s) selected by ``direction``.

    Attributes
    ----------
    key : str, default="energy"
        The property to filter on. It is read from ``calc.results`` of every
        entry in ``data``.
    threshold : float, default=3
        The threshold for filtering in units of standard deviations.
    direction : {"above", "below", "both"}, default="both"
        The direction to filter in. Any value other than "above" or "below"
        is treated like "both".
    filtered_indices : list
        Indices into ``data`` of the structures identified as outliers,
        i.e. the ones that are *excluded* from ``atoms``.
    histogram : str
        Path of the PNG showing histograms of all, kept and excluded values.
    """

    key: str = zntrack.params("energy")
    threshold: float = zntrack.params(3)
    direction: t.Literal["above", "below", "both"] = zntrack.params("both")

    filtered_indices: list = zntrack.outs()
    histogram: str = zntrack.outs_path(zntrack.nwd / "histogram.png")

    def run(self):
        """Determine the outlier indices and write the histogram figure."""
        # NOTE(review): the comparisons below assume one scalar value per
        # structure (e.g. energy) - confirm before using array-valued keys.
        values = [x.calc.results[self.key] for x in self.data]
        mean = np.mean(values)
        std = np.std(values)

        # The bounds do not depend on the individual value; compute them once.
        upper = mean + self.threshold * std
        lower = mean - self.threshold * std

        # "above" only checks the upper bound, "below" only the lower bound;
        # everything else (including "both") checks both sides.
        check_upper = self.direction != "below"
        check_lower = self.direction != "above"

        self.filtered_indices = [
            i
            for i, x in enumerate(values)
            if (check_upper and x > upper) or (check_lower and x < lower)
        ]

        # Set lookup keeps the kept/excluded split linear in the data size.
        excluded = set(self.filtered_indices)
        kept_values = [x for i, x in enumerate(values) if i not in excluded]
        excluded_values = [values[i] for i in self.filtered_indices]

        fig, ax = plt.subplots(3, figsize=(10, 10))
        ax[0].hist(values, bins=100)
        ax[0].set_title("All")
        ax[1].hist(kept_values, bins=100)
        ax[1].set_title("Filtered")
        ax[2].hist(excluded_values, bins=100)
        ax[2].set_title("Excluded")
        fig.savefig(self.histogram, bbox_inches="tight")
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)

    @property
    def atoms(self):
        """Return the entries of ``data`` that were not flagged as outliers."""
        excluded = set(self.filtered_indices)
        return [x for i, x in enumerate(self.data) if i not in excluded]

    @property
    def excluded_atoms(self):
        """Return the entries of ``data`` that were flagged as outliers."""
        return [self.data[i] for i in self.filtered_indices]
0 commit comments