diff --git a/assignment2/Assignment2.ipynb b/assignment2/Assignment2.ipynb index 44be2d9..69655b3 100644 --- a/assignment2/Assignment2.ipynb +++ b/assignment2/Assignment2.ipynb @@ -798,7 +798,9 @@ "colab_type": "text" }, "source": [ - "TODO: copy the above cell 3 times and run simulations with different parameters. Add a 1-2 sentence note explaining how is the Kalman filter behaivng in each case." + "TODO: copy the above cell 3 times and run simulations with different parameters. Add a 1-2 sentence note explaining how the Kalman filter is behaving in each case.\n", + "\n", + "Try to see what happens when the parameters of the simulator (such as the motion sigma) don't match the ones used by the filter." ] }, { @@ -909,13 +911,13 @@ }, "source": [ "def entropy(counts):\n", - " raise NotImplementedError\n", + " TODO\n", "\n", "def gini(counts):\n", - " raise NotImplementedError\n", + " TODO\n", "\n", "def mean_err_rate(counts):\n", - " raise NotImplementedError\n", + " TODO\n", "\n", "\n", "# Make a plot of the purity functions\n" @@ -1311,7 +1313,56 @@ "\n", " def __call__(self, x):\n", " TODO\n", - " return best_split, best_purity_gain" + " \n", + " def __str__(self):\n", + " return f\"NumericalSplit: {self.attr} <= {self.th}\"\n", + "\n", + " def iter_subtrees(self):\n", + " return self.subtrees\n", + " \n", + " def add_to_graphviz(self, dot, parent, print_info):\n", + " self.subtrees[0].add_to_graphviz(dot, print_info)\n", + " dot.edge(f'{id(parent)}', f'{id(self.subtrees[0])}',\n", + " label=f'<= {self.th:.2f}')\n", + " self.subtrees[1].add_to_graphviz(dot, print_info)\n", + " dot.edge(f'{id(parent)}', f'{id(self.subtrees[1])}',\n", + " label=f'> {self.th:.2f}')\n", + "\n", + "\n", + "def get_numrical_split_and_purity(df, parent_purity, purity_fun, attr,\n", + " normalize_by_split_entropy=False):\n", + " \"\"\"Find best split threshold and compute the average purity after a split.\n", + " Args:\n", + " df: a dataframe\n", + " parent_purity: purity of the parent 
node\n", + " purity_fun: function to compute the purity\n", + " attr: attribute over which to split the dataframe\n", + " normalize_by_split_entropy: if True, divide the purity gain by the split\n", + " entropy (to compute https://en.wikipedia.org/wiki/Information_gain_ratio)\n", + " \n", + " Returns:\n", + " pair of (split, purity_gain)\n", + " \"\"\"\n", + " attr_df = df[[attr, 'target']].sort_values(attr)\n", + " targets = attr_df['target']\n", + " values = attr_df[attr]\n", + " # Start with a split that puts all the samples into the right subtree\n", + " right_counts = targets.value_counts()\n", + " left_counts = right_counts * 0\n", + "\n", + " best_split = None\n", + " best_purity_gain = -1\n", + " N = len(attr_df)\n", + " for row_i in range(N - 1):\n", + " # Update the counts of targets in the left and right subtree and compute\n", + " # the purity of the split for all possible thresholds!\n", + " # Return the best split found.\n", + "\n", + " # Remember that the attribute may have duplicate values and all samples\n", + " # with the same attribute value must end in the same subtree!\n", + " row_target = targets.iloc[row_i]\n", + "\n", + " return best_split, best_purity_gain" ], "execution_count": 0, "outputs": []