Skip to content

Commit 2a3ba4f

Browse files
committed
PEP 8 formatting
1 parent 299867e commit 2a3ba4f

28 files changed

+2662
-2320
lines changed

README.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,11 @@ Also see decision boundary visualization for implemented classifiers in decision
1010
Implemented algorithms:
1111

1212
* Regression Models
13-
* Ridge Regression
13+
* Linear Regression
1414
* Matrix solver
1515
* SGD/Adam solver
16+
* L1 regularization Lasso
17+
* L2 regularization Ridge
1618
* Logistic Regression
1719
* Multi-class prediction
1820
* Factorization Machines

adaboost.py

+42-36
Original file line numberDiff line numberDiff line change
@@ -4,45 +4,51 @@
44

55

66
class AdaBoost(object):
    """AdaBoost ensemble of depth-2 decision trees.

    Each round fits a ``DecisionTree`` on the current sample weights,
    gives it a vote ``alpha`` derived from its weighted error, and then
    re-weights the samples so that misclassified ones count more in the
    next round.

    NOTE(review): the sign-based accuracy report in ``fit`` presumes the
    labels (and the tree predictions) are -1 / +1 -- confirm against the
    ``DecisionTree`` implementation.
    """

    def __init__(self, esti_num=10):
        """Create an untrained ensemble.

        Args:
            esti_num: number of boosting rounds (trees) built by ``fit``.
        """
        self.esti_num = esti_num
        self.estimators = []
        self.alphas = []

    def fit(self, x, y):
        """Train ``esti_num`` weighted trees on ``x`` / ``y``.

        Prints the running training accuracy after every round.

        Args:
            x: 2-D array of samples (rows) by features.
            y: 1-D array of labels, one per row of ``x``.
        """
        n_data = x.shape[0]
        # Start every fit from an empty ensemble.  Without this a second
        # call would keep the old trees and read stale entries of
        # ``self.alphas`` while re-weighting the samples.
        self.estimators = []
        self.alphas = []
        w = np.ones(n_data) / n_data
        eps = 1e-16  # keeps the log argument finite when the error is 0
        prediction = np.zeros(n_data)
        for i in range(self.esti_num):
            tree = DecisionTree(metric_type='Gini impurity', depth=2)
            tree.fit(x, y, w)
            pred_i = tree.predict(x)
            missed = pred_i != y
            error_i = missed.dot(w.T)  # weighted misclassification rate
            alpha = np.log((1.0 - error_i) / (error_i + eps)) / 2
            self.estimators.append(tree)
            self.alphas.append(alpha)

            # Misclassified samples are scaled by exp(+alpha), correctly
            # classified ones by exp(-alpha); then renormalise to sum 1.
            w = w * np.exp(alpha * (2 * missed - 1))
            w = w / w.sum()

            prediction += pred_i * alpha
            print("Tree {} constructed, acc {}".format(
                i, (np.sign(prediction) == y).sum() / n_data))

    def predict(self, x):
        """Return the alpha-weighted sum of every tree's prediction.

        The result is a raw score, not a label; callers in this file take
        ``np.sign`` of it.  With no fitted trees the sum is ``0``.
        """
        return sum(esti.predict(x) * alpha
                   for esti, alpha in zip(self.estimators, self.alphas))
3234

33-
def main():
    """Train AdaBoost on the breast-cancer data and print two accuracies.

    The 0/1 targets are mapped to -1/+1, the rows are split at random
    (about 20% held out), and the accuracy on the training rows is
    printed first, followed by the accuracy on the held-out rows.
    """
    dataset = load_breast_cancer()
    labels = dataset.target * 2 - 1
    test_ratio = 0.2

    # One uniform draw per row decides which side of the split it lands on.
    draw = np.random.uniform(0, 1, len(dataset.data))
    in_train = draw >= test_ratio
    in_test = draw < test_ratio
    train_x, test_x = dataset.data[in_train], dataset.data[in_test]
    train_y, test_y = labels[in_train], labels[in_test]

    adaboost = AdaBoost()
    adaboost.fit(train_x, train_y)
    for features, targets in ((train_x, train_y), (test_x, test_y)):
        hits = (np.sign(adaboost.predict(features)) == targets).sum()
        print(hits / features.shape[0])


if __name__ == "__main__":
    main()

bayesian_net.py

+66-56
Original file line numberDiff line numberDiff line change
@@ -5,71 +5,81 @@
55

66

77
class BayesianNet(object):
    """Bayesian network over binary (0/1) variables.

    Every node stores a 1-D ``table`` whose entry ``j`` is the probability
    that the node is 1 given parent configuration ``j``.  The parents are
    taken in node-index order and read as a binary number, first parent
    being the most significant bit.  A node without parents has a single
    entry.

    Evidence is written as a list of literals: ``'+Name'`` fixes a node to
    1 and ``'~Name'`` fixes it to 0.
    """

    def __init__(self, names, edges, tables=None):
        """Build the graph structure.

        Args:
            names: node names, in column order of the data passed to ``fit``.
            edges: ``(parent, child)`` pairs; a two-character string such
                as ``'AT'`` works for single-character names.
            tables: optional per-node conditional tables.  When omitted,
                each node gets the placeholder ``[0]``; call ``fit``
                before querying in that case.
        """
        self.n_nodes = len(names)
        if tables is None:
            tables = [[0]] * self.n_nodes
        self.nodes = [{'name': name, 'table': np.array(table)}
                      for name, table in zip(names, tables)]
        self.name2idx = {name: idx for idx, name in enumerate(names)}
        # graph[child, parent] == 1 marks a directed edge parent -> child.
        self.graph = np.zeros((self.n_nodes, self.n_nodes))
        for edge in edges:
            self.graph[self.name2idx[edge[1]], self.name2idx[edge[0]]] = 1
        # Bit weights [2**(n-1), ..., 2, 1]; the last k of them encode a
        # configuration of k parents.
        self.binary = np.array(
            [1 << (self.n_nodes - 1 - i) for i in range(self.n_nodes)])

    def fit(self, data):
        """Estimate every conditional table from 0/1 samples.

        Args:
            data: 2-D array, one row per sample and one column per node
                (same order as ``names``).

        A parent configuration that never occurs in ``data`` gets the
        uninformative estimate 0.5 instead of the NaN produced by 0/0,
        which would otherwise turn every later probability into NaN.
        """
        data = np.asarray(data)
        for i, node in enumerate(self.nodes):
            parents = self.graph[i] == 1
            n_parents = int(parents.sum())
            index = np.zeros(data.shape[0])
            if n_parents > 0:
                # Row-wise parent configuration as an integer.
                index = (data[:, parents]
                         * self.binary[-n_parents:]).sum(axis=1)
            table = []
            for j in range(2 ** n_parents):
                rows = index == j
                count = rows.sum()
                if count > 0:
                    table.append(data[rows, i].sum() / count)
                else:
                    table.append(0.5)
            node['table'] = np.array(table)

    def joint_p(self, values):
        """Return the joint probability of one complete 0/1 assignment.

        Args:
            values: sequence with one 0/1 entry per node.
        """
        values = np.asarray(values)
        p = 1
        for i in range(self.n_nodes):
            parents = self.graph[i] == 1
            n_parents = int(parents.sum())
            index = 0
            if n_parents > 0:
                index = np.dot(values[parents], self.binary[-n_parents:])
            p_one = self.nodes[i]['table'][int(index)]
            # Equals p_one when values[i] == 1 and 1 - p_one when it is 0.
            p *= (1 - values[i]) + (2 * values[i] - 1) * p_one
        return p

    def marginal_p(self, condition):
        """Return the probability of the evidence in ``condition``.

        All nodes not mentioned are summed out by enumerating every 0/1
        combination.  Repeating a literal is harmless; literals that
        contradict each other yield probability 0.

        Args:
            condition: list of literals such as ``['+A', '~S']``.
        """
        values = -np.ones(self.n_nodes)  # -1 marks "not fixed by evidence"
        for v in condition:
            idx = self.name2idx[v[1:]]
            val = int(v[0] != '~')
            if values[idx] != -1 and values[idx] != val:
                return 0
            values[idx] = val
        mask = np.arange(self.n_nodes)[values == -1]
        # Count the unknowns from the mask itself so that duplicated
        # literals cannot throw the enumeration off.
        n_unknowns = len(mask)
        p = 0
        for i in range(2 ** n_unknowns):
            values[mask] = [(i >> (n_unknowns - 1 - k)) & 1
                            for k in range(n_unknowns)]
            p += self.joint_p(values)
        return p

    def query(self, v, condition):
        """Return ``[P(v=0 | condition), P(v=1 | condition)]``.

        Args:
            v: name of the queried node.
            condition: list of evidence literals (may be empty).

        The evidence must have non-zero probability, otherwise the
        division below is undefined.
        """
        evidence = list(condition)
        p_pos = self.marginal_p([f'+{v}'] + evidence) / self.marginal_p(evidence)
        return [1 - p_pos, p_pos]
5664

5765
def get_asia_data(url):
    """Load a CSV and return it as a 0/1 integer array.

    Every cell equal to the string ``'yes'`` becomes 1; anything else
    becomes 0.  ``url`` is handed to ``read_csv`` unchanged.
    """
    frame = read_csv(url)
    is_yes = frame.apply(lambda column: column == 'yes')
    return is_yes.astype(int).values
5967

6068

6169
def main():
    """Fit the eight-node network on the ASIA10k CSV and print queries.

    Prints the learned node tables, then the distribution of T, L, B and
    E under three sets of evidence (none, two literals, four literals).
    """
    names = 'ATSLBEXD'
    edges = ['AT', 'SL', 'SB', 'TE', 'LE', 'BD', 'EX', 'ED']
    # Predefined conditional tables can be passed instead of fitting, e.g.
    # tables = [[0.01], [0.01, 0.05], [0.5], [0.01, 0.1], [0.3, 0.6], [0, 1, 1, 1], [0.05, 0.98], [0.1, 0.7, 0.8, 0.9]]
    bn = BayesianNet(list(names), edges)

    asia_url = 'http://www.ccd.pitt.edu/wiki/images/ASIA10k.csv'
    samples = get_asia_data(asia_url)
    bn.fit(samples)
    print(bn.nodes)

    evidence_sets = [[], ['+A', '~S'], ['+A', '~S', '~D', '+X']]
    for condition in evidence_sets:
        given = ','.join(condition)
        for c in 'TLBE':
            print('p({}|{})={}'.format(c, given, bn.query(c, condition)))


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)