-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathGradientBoostingClassifier.py
182 lines (141 loc) · 6.53 KB
/
GradientBoostingClassifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import numpy as np
import logging
from typing import List, Optional
from DecisionTreeClassifier import DecisionTreeClassifier
class GradientBoostingClassifier:
"""
A simple implementation of a gradient boosting classifier.
This class represents a gradient boosting model for binary classification tasks.
It supports basic functionality such as fitting to a dataset and predicting labels for new data.
Parameters
----------
- n_estimators (int): The number of trees to build. Default is 100.
- learning_rate (float): The learning rate controls the contribution of each tree. Default is 0.1.
- max_depth (int, optional): The maximum depth of each tree. If None, the trees will grow until all leaves are pure.
- min_samples_split (int): The minimum number of samples required to split an internal node. Default is 2.
- max_features (int, optional): The number of features to consider when looking for the best split. If None, all features are considered.
- min_impurity_decrease (float): A node will be split if this split induces a decrease of the impurity greater than or equal to this value. Default is 0.0.
- random_state (int): Controls the randomness of the bootstrapping of the samples used when building trees. Default is None.
- debug (bool): If True, the logging level will be set to DEBUG, providing more detailed logging information. Default is False.
Attributes
----------
- trees (List[DecisionTreeClassifier]): The list of fitted trees.
- f0 (float): The initial prediction of the model.
Methods
-------
- fit(X, y): Fits the gradient boosting model to the given dataset.
- predict(X): Predicts the class labels for the given dataset.
"""
def __init__(
self,
n_estimators: int = 100,
learning_rate: float = 0.1,
max_depth: Optional[int] = None,
min_samples_split: int = 2,
max_features: Optional[int] = None,
min_impurity_decrease: float = 0.0,
random_state: Optional[int] = None,
debug: bool = False,
) -> None:
"""
Initializes the GradientBoostingClassifier with the specified parameters.
"""
self.n_estimators = n_estimators
self.learning_rate = learning_rate
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.max_features = max_features
self.min_impurity_decrease = min_impurity_decrease
self.random_state = random_state
self.debug = debug
self.trees: List[DecisionTreeClassifier] = []
self.f0: Optional[float] = None
self.random = np.random.RandomState(self.random_state)
self._logger: logging.Logger = logging.getLogger("GradientBoostingClassifier")
self._logger.setLevel(logging.DEBUG if self.debug else logging.INFO)
def __repr__(self) -> str:
return (
"GradientBoostingClassifier("
f"n_estimators={self.n_estimators}, "
f"learning_rate={self.learning_rate}, "
f"max_depth={self.max_depth}, "
f"min_samples_split={self.min_samples_split}, "
f"max_features={self.max_features}, "
f"min_impurity_decrease={self.min_impurity_decrease}, "
f"random_state={self.random_state}"
")"
)
def _sigmoid(self, x: np.ndarray) -> np.ndarray:
"""
The sigmoid function applied element-wise to the input array.
Parameters:
x (np.ndarray): The input array.
Returns:
np.ndarray: The transformed array with the sigmoid function applied.
"""
return 1 / (1 + np.exp(-x))
def _log_loss(self, y: np.ndarray, predictions: np.ndarray) -> float:
"""
Calculates the logistic loss between true labels and predictions.
Parameters:
y (np.ndarray): The true labels.
predictions (np.ndarray): The predicted probabilities.
Returns:
float: The calculated logistic loss.
"""
predictions = np.clip(predictions, -1e15, 1e15)
probs = self._sigmoid(predictions)
return -np.mean(y * np.log(probs) + (1 - y) * np.log(1 - probs))
def _gradient(self, y: np.ndarray, predictions: np.ndarray) -> np.ndarray:
"""
Calculates the gradient of the logistic loss with respect to the predictions.
Parameters:
y (np.ndarray): The true labels.
predictions (np.ndarray): The current predictions.
Returns:
np.ndarray: The gradient of the loss.
"""
predictions = np.clip(predictions, -1e15, 1e15)
probs = self._sigmoid(predictions)
return -(y - probs)
def fit(self, X: np.ndarray, y: np.ndarray) -> None:
"""
Fits the gradient boosting model to the given dataset.
Parameters:
X (np.ndarray): The input feature array.
y (np.ndarray): The target labels array.
"""
self._logger.debug(f"Training tree {len(self.trees) + 1}...")
p: float = np.mean(y)
self.f0 = np.log(p / (1 - p)) if p > 0 and p < 1 else 0
predictions: np.ndarray = np.full(y.shape, self.f0, dtype=np.float64)
for _ in range(self.n_estimators):
self._logger.debug(f"Fitting tree {_ + 1}...")
residuals: np.ndarray = -self._gradient(y, predictions)
tree = DecisionTreeClassifier(
max_depth=self.max_depth,
min_samples_split=self.min_samples_split,
max_features=self.max_features,
min_impurity_decrease=self.min_impurity_decrease,
random_state=self.random_state,
debug=self.debug,
is_regression=True,
)
tree.fit(X, residuals)
self.trees.append(tree)
predictions += self.learning_rate * tree.predict(X)
loss: float = self._log_loss(y, predictions)
self._logger.debug(f"Loss after fitting tree {_ + 1}: {loss:.4f}")
def predict(self, X: np.ndarray) -> np.ndarray:
"""
Predicts the class labels for the given dataset.
Parameters:
X (np.ndarray): The input feature array.
Returns:
np.ndarray: The predicted class labels.
"""
predictions: np.ndarray = np.full(X.shape[0], self.f0, dtype=np.float64)
for tree in self.trees:
predictions += self.learning_rate * tree.predict(X)
probs: np.ndarray = self._sigmoid(predictions)
return np.where(probs >= 0.5, 1, 0)