Ensemble methods: Optimize split search, update comments

rushter · rushter · commit 33d266253063 · 2016-11-19T22:03:10.000+03:00
diff --git a/mla/ensemble/gbm.py b/mla/ensemble/gbm.py
@@ -38,7 +38,7 @@ def transform(self, pred):
         return pred
 
     def gain(self, actual, predicted):
-        """Gain for split finding."""
+        """Calculate gain for split search."""
         nominator = self.grad(actual, predicted).sum() ** 2
         denominator = (self.hess(actual, predicted).sum() + self.regularization)
         return 0.5 * (nominator / denominator)
@@ -70,7 +70,7 @@ def transform(self, output):
 
 
 class GradientBoosting(BaseEstimator):
-    """Gradient boosting trees with taylor expansion approximation (as in xgboost)."""
+    """Gradient boosting trees with Taylor's expansion approximation (as in xgboost)."""
 
     def __init__(self, n_estimators, learning_rate=0.1, max_features=10, max_depth=2, min_samples_split=10):
         self.min_samples_split = min_samples_split
diff --git a/mla/ensemble/tree.py b/mla/ensemble/tree.py
@@ -27,18 +27,17 @@ def __init__(self, regression=False, criterion=None):
     def is_terminal(self):
         return not bool(self.left_child and self.right_child)
 
-    def _find_splits(self, X, y):
+    def _find_splits(self, X):
         """Find all possible split values."""
+        split_values = set()
 
-        # Sort feature set
-        df = np.rec.fromarrays([X, y], names='x,y')
-        df.sort(order='x')
+        # Get unique values in sorted order
+        x_unique = list(np.unique(X))
+        for i in range(1, len(x_unique)):
+            # Find a point between two values
+            average = (x_unique[i - 1] + x_unique[i]) / 2.0
+            split_values.add(average)
 
-        split_values = set()
-        for i in range(1, X.shape[0]):
-            if df.y[i - 1] != df.y[i]:
-                average = (df.x[i - 1] + df.x[i]) / 2.0
-                split_values.add(average)
         return list(split_values)
 
     def _find_best_split(self, X, target, n_features):
@@ -49,7 +48,7 @@ def _find_best_split(self, X, target, n_features):
         max_gain, max_col, max_val = None, None, None
 
         for column in subset:
-            split_values = self._find_splits(X[:, column], target['y'])
+            split_values = self._find_splits(X[:, column])
             for value in split_values:
                 if self.loss is None:
                     # Random forest
@@ -112,6 +111,7 @@ def train(self, X, target, max_features=None, min_samples_split=10, max_depth=No
             # Split dataset
             left_X, right_X, left_target, right_target = split_dataset(X, target, column, value)
 
+            # Grow left and right children
             self.left_child = Tree(self.regression, self.criterion)
             self.left_child.train(left_X, left_target, max_features, min_samples_split, max_depth - 1,
                                   minimum_gain, loss)
@@ -137,6 +137,7 @@ def _calculate_leaf_value(self, targets):
                 self.outcome = stats.itemfreq(targets['y'])[:, 1] / float(targets['y'].shape[0])
 
     def predict_row(self, row):
+        """Predict a single row."""
         if not self.is_terminal:
             if row[self.column_index] < self.threshold:
                 return self.left_child.predict_row(row)