snad-space
diff --git a/‎docs/notebooks/devnet.ipynb‎
Lines changed: 228 additions & 0 deletions b/‎docs/notebooks/devnet.ipynb‎
Lines changed: 228 additions & 0 deletions
diff --git a/‎docs/notebooks/devnet_datasets.ipynb‎
Lines changed: 200 additions & 0 deletions b/‎docs/notebooks/devnet_datasets.ipynb‎
Lines changed: 200 additions & 0 deletions
@@ -0,0 +1,200 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "ea4ae65a-d555-4b54-96f9-11eed006adc2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %pip uninstall -y coniferest\n",
+    "# %pip install 'git+https://github.com/snad-space/coniferest@fix-devent-celeba'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "3d9577061e9494ed",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-03-13T15:41:49.204695Z",
+     "start_time": "2024-03-13T15:41:49.201344Z"
+    },
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from collections import defaultdict\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "from coniferest.aadforest import AADForest\n",
+    "from coniferest.datasets import Dataset, DevNetDataset\n",
+    "from coniferest.isoforest import IsolationForest\n",
+    "from coniferest.label import Label\n",
+    "from coniferest.pineforest import PineForest\n",
+    "from coniferest.session.oracle import OracleSession, create_oracle_session"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "initial_id",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-03-13T15:41:49.210919Z",
+     "start_time": "2024-03-13T15:41:49.206277Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "class Compare:\n",
+    "    models = {\n",
+    "        'Isolation Forest': IsolationForest,\n",
+    "        'AAD': AADForest,\n",
+    "        'Pine Forest': PineForest,\n",
+    "    }\n",
+    "    \n",
+    "    def __init__(self, dataset: Dataset, *, iterations=100, n_jobs=-1):\n",
+    "        self.model_kwargs = {\n",
+    "            'n_trees': 128,\n",
+    "            'n_jobs': n_jobs,\n",
+    "        }\n",
+    "        self.session_kwargs = {\n",
+    "            'data': dataset.data,\n",
+    "            'labels': dataset.labels,\n",
+    "            'max_iterations': iterations,\n",
+    "        }\n",
+    "        self.results = {}\n",
+    "        self.steps = np.arange(1, iterations + 1)\n",
+    "        self.total_anomaly_fraction = np.mean(dataset.labels == Label.A)\n",
+    "\n",
+    "    def get_sessions(self, random_seed):\n",
+    "        model_kwargs = self.model_kwargs | {'random_seed': random_seed}\n",
+    "\n",
+    "        return {\n",
+    "            name: create_oracle_session(model=model(**model_kwargs), **self.session_kwargs)\n",
+    "            for name, model in self.models.items()\n",
+    "        }\n",
+    "\n",
+    "    def run(self, random_seeds):\n",
+    "        results = defaultdict(dict)\n",
+    "        \n",
+    "        for random_seed in tqdm(random_seeds):\n",
+    "            sessions = self.get_sessions(random_seed)\n",
+    "            for name, session in sessions.items():\n",
+    "                session.run()\n",
+    "                anomalies = np.cumsum(np.array(list(session.known_labels.values())) == Label.A)\n",
+    "                results[name][random_seed] = anomalies\n",
+    "\n",
+    "        self.results |= results\n",
+    "        return self\n",
+    "    \n",
+    "    def plot(self, dataset_name: str, savefig=False):\n",
+    "        plt.figure(figsize=(8, 6))\n",
+    "        plt.title(f'Dataset: {dataset_name}')\n",
+    "\n",
+    "        for name, anomalies_dict in self.results.items():\n",
+    "            anomalies = np.stack(list(anomalies_dict.values()))\n",
+    "            q10, median, q90 = np.quantile(anomalies, [0.1, 0.5, 0.9], axis = 0)\n",
+    "\n",
+    "            plt.plot(self.steps, median, alpha=0.75, label=name)\n",
+    "            plt.fill_between(self.steps, q10, q90, alpha=0.5)\n",
+    "\n",
+    "        plt.plot(self.steps, self.steps * self.total_anomaly_fraction, ls='--', color='grey', label='Theoretical random')\n",
+    "\n",
+    "        plt.xlabel('Iteration')\n",
+    "        plt.ylabel('Number of anomalies')\n",
+    "        plt.grid()\n",
+    "        plt.legend()\n",
+    "        if savefig:\n",
+    "            plt.savefig(f'{dataset_name}.pdf')\n",
+    "        \n",
+    "        return self"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "71c337b3577915d5",
+   "metadata": {
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['donors', 'census', 'fraud', 'celeba', 'backdoor', 'campaign', 'thyroid']\n",
+      "donors\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " 60%|██████████████████████████████████▏                      | 12/20 [1:56:30<1:25:02, 637.84s/it]"
+     ]
+    }
+   ],
+   "source": [
+    "print(DevNetDataset.avialble_datasets)\n",
+    "\n",
+    "seeds = range(20)\n",
+    "\n",
+    "for dataset in DevNetDataset.avialble_datasets:\n",
+    "    print(dataset)\n",
+    "    %time compare = Compare(DevNetDataset(dataset), iterations=100, n_jobs=10).run(seeds).plot(dataset, savefig=True)\n",
+    "    plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "603f9b12-b5ca-470e-95ba-34e4c6571687",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%time compare = Compare(DevNetDataset(\"thyroid\"), iterations=7200, n_jobs=15).run([0]).plot('thyroid_full', savefig=True)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7e7fb96f-b3a4-4f33-8389-466ad23b9da6",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}