Created using Colaboratory

thegunner157 · Nov 5, 2019 · ed6a561 · ed6a561
1 parent bab0ddf
commit ed6a561
Showing 1 changed file with 56 additions and 5 deletions.
diff --git a/assignment2/Assignment2.ipynb b/assignment2/Assignment2.ipynb
@@ -798,7 +798,9 @@
         "colab_type": "text"
       },
       "source": [
-        "TODO: copy the above cell 3 times and run simulations with different parameters. Add a 1-2 sentence note explaining how is the Kalman filter behaivng in each case."
+        "TODO: copy the above cell 3 times and run simulations with different parameters. Add a 1-2 sentence note explaining how the Kalman filter is behaving in each case.\n",
+        "\n",
+        "Try to see what happens when the parameters of the simulator (such as the motion sigma) don't match the ones used by the filter."
       ]
     },
     {
@@ -909,13 +911,13 @@
       },
       "source": [
         "def entropy(counts):\n",
-        "    raise NotImplementedError\n",
+        "    TODO\n",
         "\n",
         "def gini(counts):\n",
-        "    raise NotImplementedError\n",
+        "    TODO\n",
         "\n",
         "def mean_err_rate(counts):\n",
-        "    raise NotImplementedError\n",
+        "    TODO\n",
         "\n",
         "\n",
         "# Make a plot of the purity functions\n"
@@ -1311,7 +1313,56 @@
         "\n",
         "    def __call__(self, x):\n",
         "        TODO\n",
-        "    return best_split, best_purity_gain"
+        "    \n",
+        "    def __str__(self):\n",
+        "        return f\"NumericalSplit: {self.attr} <= {self.th}\"\n",
+        "\n",
+        "    def iter_subtrees(self):\n",
+        "        return self.subtrees\n",
+        "    \n",
+        "    def add_to_graphviz(self, dot, parent, print_info):\n",
+        "        self.subtrees[0].add_to_graphviz(dot, print_info)\n",
+        "        dot.edge(f'{id(parent)}', f'{id(self.subtrees[0])}',\n",
+        "                 label=f'<= {self.th:.2f}')\n",
+        "        self.subtrees[1].add_to_graphviz(dot, print_info)\n",
+        "        dot.edge(f'{id(parent)}', f'{id(self.subtrees[1])}',\n",
+        "                 label=f'> {self.th:.2f}')\n",
+        "\n",
+        "\n",
+        "def get_numrical_split_and_purity(df, parent_purity, purity_fun, attr,\n",
+        "                                  normalize_by_split_entropy=False):\n",
+        "    \"\"\"Find the best split threshold and compute the average purity after a split.\n",
+        "    Args:\n",
+        "        df: a dataframe\n",
+        "        parent_purity: purity of the parent node\n",
+        "        purity_fun: function to compute the purity\n",
+        "        attr: attribute over which to split the dataframe\n",
+        "        normalize_by_split_entropy: if True, divide the purity gain by the split\n",
+        "            entropy (to compute https://en.wikipedia.org/wiki/Information_gain_ratio)\n",
+        "    \n",
+        "    Returns:\n",
+        "        pair of (split, purity_gain)\n",
+        "    \"\"\"\n",
+        "    attr_df = df[[attr, 'target']].sort_values(attr)\n",
+        "    targets = attr_df['target']\n",
+        "    values = attr_df[attr]\n",
+        "    # Start with a split that puts all the samples into the right subtree\n",
+        "    right_counts = targets.value_counts()\n",
+        "    left_counts = right_counts * 0\n",
+        "\n",
+        "    best_split = None\n",
+        "    best_purity_gain = -1\n",
+        "    N = len(attr_df)\n",
+        "    for row_i in range(N - 1):\n",
+        "        # Update the counts of targets in the left and right subtree and compute\n",
+        "        # the purity of the split for all possible thresholds!\n",
+        "        # Return the best split found.\n",
+        "\n",
+        "        # Remember that the attribute may have duplicate values and all samples\n",
+        "        # with the same attribute value must end in the same subtree!\n",
+        "        row_target = targets.iloc[row_i]\n",
+        "\n",
+        "            return best_split, best_purity_gain"
       ],
       "execution_count": 0,
       "outputs": []