xgboost.py
import numpy as np
from sklearn.datasets import fetch_california_housing

from decision_tree import DecisionTree

# TODO: classification


def squared_loss(y, pred):
    # Halved mean squared error, so the gradient is simply (pred - y).
    return np.square(pred - y).mean() / 2


def squared_loss_gradient(y, pred):
    return pred - y
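
# Quick numeric check (explanatory note, not in the original file): for
# y = np.array([1, 2]) and pred = np.array([0, 0]),
#   squared_loss(y, pred)          -> ((0-1)**2 + (0-2)**2) / 2 / 2 = 1.25
#   squared_loss_gradient(y, pred) -> array([-1, -2])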

class XGBoostRegressionTree(DecisionTree):
    def __init__(self, max_depth):
        self.lambd = 0.01  # L2 regularization on leaf weights
        self.gamma = 0.1   # complexity penalty per leaf
        super(XGBoostRegressionTree, self).__init__(
            metric_type="Gini impurity", depth=max_depth, regression=True)
        # Replace the base-class metric with the XGBoost structure score.
        self.metric = self.score

    def gen_leaf(self, y, w):
        # Leaf weight G / (H + lambda): y holds the gradients and the
        # sample weights w play the role of the hessians.
        return {'label': y.dot(w) / (sum(w) + self.lambd)}

    def score(self, y, w):
        # Structure score G^2 / (H + lambda) of a node.
        return np.square(y.dot(w)) / (sum(w) + self.lambd)

    def split_gain(self, p_score, l_y, r_y, l_w, r_w):
        return (self.metric(l_y, l_w) + self.metric(r_y, r_w) - p_score) / 2 - self.gamma
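
# For reference (explanatory note, not in the original file): in the
# second-order XGBoost objective, with G = sum of gradients and H = sum of
# hessians in a node, the optimal leaf weight is w* = -G / (H + lambda)
# and the objective reduction of a node is G^2 / (2 * (H + lambda)).
# A split is therefore worth
#   gain = [score(left) + score(right) - score(parent)] / 2 - gamma,
# which is exactly what split_gain computes; the sign of w* is absorbed
# by the "pred -=" update in XGBoost.fit below.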

class XGBoost(object):
    def __init__(self, regression=True, max_depth=4, tree_num=20):
        self.regression = regression
        self.max_depth = max_depth
        self.tree_num = tree_num
        self.forest = []
        self.shrinkage = 0.5  # learning rate applied to every tree

    def get_importance(self):
        # Importance for each feature, averaged over the forest.
        return sum(tree.get_importance() for tree in self.forest) / self.tree_num

    def fit(self, x, y):
        pred = 0
        for i in range(self.tree_num):
            # Fit each new tree to the gradient of the loss at the
            # current prediction, then step against that gradient.
            grad = squared_loss_gradient(y, pred)
            self.forest.append(XGBoostRegressionTree(max_depth=self.max_depth))
            self.forest[i].fit(x, grad)
            pred -= self.forest[i].predict(x) * self.shrinkage
            print("tree {} constructed, loss {}".format(
                i, squared_loss(y, pred)))

    def predict(self, x):
        # Recompute the accumulated prediction -shrinkage * sum_k f_k(x).
        return -np.array([tree.predict(x) * self.shrinkage for tree in self.forest]).sum(axis=0)
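
# Why the training loss shrinks (explanatory note, not in the original
# file): if a tree fit the gradient exactly, f_m(x) = pred - y, the update
# pred -= shrinkage * f_m(x) would give
#   (pred_new - y) = (1 - shrinkage) * (pred_old - y),
# i.e. residuals decay geometrically; the depth-limited, regularized trees
# only approximate the gradient, so the decay is slower in practice.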

def main():
    data = fetch_california_housing(data_home='data')

    # Random train/test split: each row lands in the test set with
    # probability test_ratio.
    test_ratio = 0.2
    test_split = np.random.uniform(0, 1, len(data.data))
    train_x = data.data[test_split >= test_ratio]
    test_x = data.data[test_split < test_ratio]
    train_y = data.target[test_split >= test_ratio]
    test_y = data.target[test_split < test_ratio]

    xgboost = XGBoost()
    xgboost.fit(train_x, train_y)
    print(xgboost.get_importance())
    print(squared_loss(train_y, xgboost.predict(train_x)))
    print(squared_loss(test_y, xgboost.predict(test_x)))


if __name__ == "__main__":
    main()
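
# A possible starting point for the classification TODO above (a sketch,
# not part of the original file): for binary labels y in {0, 1} and raw
# scores pred, the log loss has
#   gradient = sigmoid(pred) - y
#   hessian  = sigmoid(pred) * (1 - sigmoid(pred)),
# so the tree would need per-sample hessian weights w rather than the
# constant weights that suffice for squared loss.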