Commit 2889764 ("adaboost")

1 parent 09853c2

12 files changed: +626, -62 lines

Adaboost/README.md (+21)
## Adaboost

Reference post: [Machine Learning Algorithms: Adaboost](http://www.csuldw.com/2015/07/05/2015-07-12-Adaboost/)

- Python + Numpy

Adaboost is a classic ensemble algorithm. Each sample in the training set is assigned a weight, and together these weights form the vector D; initially they are all set to the same value. A weak classifier is first trained on the data and its error rate is computed, then a second weak classifier is trained on the same data set. For this second round the sample weights are re-adjusted: the weight of every sample the first classifier got right is lowered, and the weight of every sample it got wrong is raised. To combine all the classifiers into a final result, AdaBoost assigns each classifier a weight alpha, computed from that classifier's error rate. Once alpha is known, the weight vector D is updated so that correctly classified samples carry less weight and misclassified samples carry more, and AdaBoost then starts the next iteration. This train-and-reweight cycle repeats until the training error rate reaches 0 or the number of weak classifiers reaches the user-specified limit. For the full write-up, [click here](http://www.csuldw.com/2015/07/05/2015-07-12-Adaboost/).
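Concretely, the reweighting just described uses the standard AdaBoost update rules (these match what `fit()` in adaboost.py computes). With ε_t the weighted error of the t-th weak classifier h_t:

```latex
\alpha_t = \frac{1}{2}\ln\frac{1-\varepsilon_t}{\varepsilon_t}, \qquad
D_i^{(t+1)} = \frac{D_i^{(t)}\,\exp\!\left(-\alpha_t\, y_i\, h_t(x_i)\right)}{Z_t}
```

Since h_t(x_i) ∈ {-1, +1}, the exponent is negative (the weight shrinks) for correctly classified samples and positive (the weight grows) for misclassified ones; Z_t renormalizes D so it sums to 1. The final prediction is sign(Σ_t α_t h_t(x)).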
## Directory overview

- data: holds the data sets
- adaboost.py: the algorithm implementation, wrapped in a class
- HandWriting.py: tests the adaboost algorithm on handwritten-digit recognition, for comparison against KNN
- test.py: uses the same data set as logistics_regression

## Results

test.py prints an error_rate of [[ 0.29850746]] (about a 29.85% error rate; the [[ ]] indicates a 1x1 numpy matrix).

Adaboost/adaboost.py (+103)
```python
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 24 08:57:10 2016
Adaboost algorithm
@author: liudiwei
"""
import numpy as np


class AdaboostClassifier(object):

    def __init__(self, max_iter=100, numSteps=10, num_iter=50):
        self._max_iter = max_iter    # currently unused
        self._numSteps = numSteps    # number of threshold steps tried per feature
        self._num_iter = num_iter    # maximum number of weak classifiers

    def _stumpClassify(self, X, axis, threshVal, threshIneq):
        """A one-level decision tree (stump): classify the data set by
        comparing one feature against a threshold.
        Returns an m*1 array of class labels in {-1.0, +1.0}."""
        retArray = np.ones((np.shape(X)[0], 1))  # m*1 array, all ones
        if threshIneq == 'lt':  # 'lt' rule: values at or below the threshold get -1.0
            retArray[X[:, axis] <= threshVal] = -1.0
        else:                   # 'gt' rule: values above the threshold get -1.0
            retArray[X[:, axis] > threshVal] = -1.0
        return retArray

    def _buildStump(self, X, y, D):
        """Input: X, y, and the current sample-weight vector D.
        Purpose: find the best decision stump under the weights D.
        Returns a dict (dim, ineq, thresh), the weighted error rate, and the
        class estimates for that feature/threshold pair."""
        dataMat = np.mat(X)
        labelMat = np.mat(y).T               # class labels as a column vector
        m, n = np.shape(dataMat)             # dimensions of the data matrix
        bestStump = {}
        bestClasEst = np.mat(np.zeros((m, 1)))
        minError = np.inf                    # initialize the minimum error to infinity
        for i in range(n):                   # iterate over all features
            minVal = dataMat[:, i].min()
            maxVal = dataMat[:, i].max()
            stepSize = (maxVal - minVal) / self._numSteps
            for j in range(-1, int(self._numSteps) + 1):
                for inequal in ['lt', 'gt']:
                    threshVal = minVal + float(j) * stepSize
                    predVals = self._stumpClassify(dataMat, i, threshVal, inequal)
                    errArr = np.mat(np.ones((m, 1)))
                    errArr[predVals == labelMat] = 0  # count the misclassifications
                    weightedError = D.T * errArr      # compute the weighted error rate
                    if weightedError < minError:
                        minError = weightedError
                        bestClasEst = predVals.copy()
                        bestStump['dim'] = i
                        bestStump['thresh'] = threshVal
                        bestStump['ineq'] = inequal
        return bestStump, minError, bestClasEst

    def fit(self, X, y):
        """Input: X and y (the iteration count comes from self._num_iter).
        Output: the weak classifiers weakClassArr (a list of dicts)."""
        weakClassArr = []
        m = np.shape(X)[0]
        D = np.mat(np.ones((m, 1)) / m)         # initialize all weights equally, to 1/m
        aggClassEst = np.mat(np.zeros((m, 1)))  # accumulated class estimate per sample
        for i in range(self._num_iter):
            bestStump, error, classEst = self._buildStump(X, y, D)
            alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16)))
            bestStump['alpha'] = alpha
            weakClassArr.append(bestStump)
            expon = np.multiply(-1 * alpha * np.mat(y).T, classEst)
            D = np.multiply(D, np.exp(expon))
            D = D / D.sum()
            # compute the training error of all classifiers combined;
            # stop early if it reaches 0
            aggClassEst += alpha * classEst
            aggErrors = np.multiply(
                np.sign(aggClassEst) != np.mat(y).T, np.ones((m, 1)))
            errorRate = aggErrors.sum() / m
            #print("total error is: ", errorRate)
            if errorRate == 0.0:
                break
        return weakClassArr

    def predict(self, test_X, classifierArr):
        """Input: the test set and the trained weak classifiers.
        Output: the (binary) classification results."""
        dataMat = np.mat(test_X)
        m = np.shape(dataMat)[0]
        aggClassEst = np.mat(np.zeros((m, 1)))
        for i in range(len(classifierArr)):
            classEst = self._stumpClassify(dataMat,
                                           classifierArr[i]['dim'],
                                           classifierArr[i]['thresh'],
                                           classifierArr[i]['ineq'])
            aggClassEst += classifierArr[i]['alpha'] * classEst
            #print(aggClassEst)
        return np.sign(aggClassEst)
```
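The class above can be exercised with a short driver. This is a minimal sketch, not part of the commit (and distinct from the test.py the README mentions); it assumes data/test.txt holds whitespace-separated numeric rows whose last column is a 0/1 label, and maps those labels to the {-1, +1} convention the stumps expect:

```python
import numpy as np
from adaboost import AdaboostClassifier

# Load the data set; assumption: the last column is a 0/1 class label.
data = np.loadtxt('data/test.txt')
X, raw_labels = data[:, :-1], data[:, -1]
y = np.where(raw_labels == 0, -1.0, 1.0)   # map 0/1 labels to -1/+1

clf = AdaboostClassifier(num_iter=50)
classifiers = clf.fit(X, y)         # list of weak-stump dicts, each with its alpha
pred = clf.predict(X, classifiers)  # m*1 matrix of -1/+1 predictions

# Fraction of samples whose predicted sign disagrees with the true label.
error_rate = (np.asarray(pred).ravel() != y).sum() / float(len(y))
print("error_rate:", error_rate)
```

Note that fit and predict are decoupled here: fit returns the classifier list instead of storing it on the instance, so predict must be handed that list back.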

Adaboost/adaboost.pyc (3.69 KB, binary file not shown)

Adaboost/data/test.txt (+67)

```
2 1 38.50 54 20 0 1 2 2 3 4 1 2 2 5.90 0 2 42.00 6.30 0 0 1
2 1 37.60 48 36 0 0 1 1 0 3 0 0 0 0 0 0 44.00 6.30 1 5.00 1
1 1 37.7 44 28 0 4 3 2 5 4 4 1 1 0 3 5 45 70 3 2 1
1 1 37 56 24 3 1 4 2 4 4 3 1 1 0 0 0 35 61 3 2 0
2 1 38.00 42 12 3 0 3 1 1 0 1 0 0 0 0 2 37.00 5.80 0 0 1
1 1 0 60 40 3 0 1 1 0 4 0 3 2 0 0 5 42 72 0 0 1
2 1 38.40 80 60 3 2 2 1 3 2 1 2 2 0 1 1 54.00 6.90 0 0 1
2 1 37.80 48 12 2 1 2 1 3 0 1 2 0 0 2 0 48.00 7.30 1 0 1
2 1 37.90 45 36 3 3 3 2 2 3 1 2 1 0 3 0 33.00 5.70 3 0 1
2 1 39.00 84 12 3 1 5 1 2 4 2 1 2 7.00 0 4 62.00 5.90 2 2.20 0
2 1 38.20 60 24 3 1 3 2 3 3 2 3 3 0 4 4 53.00 7.50 2 1.40 1
1 1 0 140 0 0 0 4 2 5 4 4 1 1 0 0 5 30 69 0 0 0
1 1 37.90 120 60 3 3 3 1 5 4 4 2 2 7.50 4 5 52.00 6.60 3 1.80 0
2 1 38.00 72 36 1 1 3 1 3 0 2 2 1 0 3 5 38.00 6.80 2 2.00 1
2 9 38.00 92 28 1 1 2 1 1 3 2 3 0 7.20 0 0 37.00 6.10 1 1.10 1
1 1 38.30 66 30 2 3 1 1 2 4 3 3 2 8.50 4 5 37.00 6.00 0 0 1
2 1 37.50 48 24 3 1 1 1 2 1 0 1 1 0 3 2 43.00 6.00 1 2.80 1
1 1 37.50 88 20 2 3 3 1 4 3 3 0 0 0 0 0 35.00 6.40 1 0 0
2 9 0 150 60 4 4 4 2 5 4 4 0 0 0 0 0 0 0 0 0 0
1 1 39.7 100 30 0 0 6 2 4 4 3 1 0 0 4 5 65 75 0 0 0
1 1 38.30 80 0 3 3 4 2 5 4 3 2 1 0 4 4 45.00 7.50 2 4.60 1
2 1 37.50 40 32 3 1 3 1 3 2 3 2 1 0 0 5 32.00 6.40 1 1.10 1
1 1 38.40 84 30 3 1 5 2 4 3 3 2 3 6.50 4 4 47.00 7.50 3 0 0
1 1 38.10 84 44 4 0 4 2 5 3 1 1 3 5.00 0 4 60.00 6.80 0 5.70 0
2 1 38.70 52 0 1 1 1 1 1 3 1 0 0 0 1 3 4.00 74.00 0 0 1
2 1 38.10 44 40 2 1 3 1 3 3 1 0 0 0 1 3 35.00 6.80 0 0 1
2 1 38.4 52 20 2 1 3 1 1 3 2 2 1 0 3 5 41 63 1 1 1
1 1 38.20 60 0 1 0 3 1 2 1 1 1 1 0 4 4 43.00 6.20 2 3.90 1
2 1 37.70 40 18 1 1 1 0 3 2 1 1 1 0 3 3 36.00 3.50 0 0 1
1 1 39.1 60 10 0 1 1 0 2 3 0 0 0 0 4 4 0 0 0 0 1
2 1 37.80 48 16 1 1 1 1 0 1 1 2 1 0 4 3 43.00 7.50 0 0 1
1 1 39.00 120 0 4 3 5 2 2 4 3 2 3 8.00 0 0 65.00 8.20 3 4.60 1
1 1 38.20 76 0 2 3 2 1 5 3 3 1 2 6.00 1 5 35.00 6.50 2 0.90 1
2 1 38.30 88 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 1 38.00 80 30 3 3 3 1 0 0 0 0 0 6.00 0 0 48.00 8.30 0 4.30 1
1 1 0 0 0 3 1 1 1 2 3 3 1 3 6.00 4 4 0 0 2 0 0
1 1 37.60 40 0 1 1 1 1 1 1 1 0 0 0 1 1 0 0 2 2.10 1
2 1 37.50 44 0 1 1 1 1 3 3 2 0 0 0 0 0 45.00 5.80 2 1.40 1
2 1 38.2 42 16 1 1 3 1 1 3 1 0 0 0 1 0 35 60 1 1 1
2 1 38 56 44 3 3 3 0 0 1 1 2 1 0 4 0 47 70 2 1 1
2 1 38.30 45 20 3 3 2 2 2 4 1 2 0 0 4 0 0 0 0 0 1
1 1 0 48 96 1 1 3 1 0 4 1 2 1 0 1 4 42.00 8.00 1 0 1
1 1 37.70 55 28 2 1 2 1 2 3 3 0 3 5.00 4 5 0 0 0 0 1
2 1 36.00 100 20 4 3 6 2 2 4 3 1 1 0 4 5 74.00 5.70 2 2.50 0
1 1 37.10 60 20 2 0 4 1 3 0 3 0 2 5.00 3 4 64.00 8.50 2 0 1
2 1 37.10 114 40 3 0 3 2 2 2 1 0 0 0 0 3 32.00 0 3 6.50 1
1 1 38.1 72 30 3 3 3 1 4 4 3 2 1 0 3 5 37 56 3 1 1
1 1 37.00 44 12 3 1 1 2 1 1 1 0 0 0 4 2 40.00 6.70 3 8.00 1
1 1 38.6 48 20 3 1 1 1 4 3 1 0 0 0 3 0 37 75 0 0 1
1 1 0 82 72 3 1 4 1 2 3 3 0 3 0 4 4 53 65 3 2 0
1 9 38.20 78 60 4 4 6 0 3 3 3 0 0 0 1 0 59.00 5.80 3 3.10 0
2 1 37.8 60 16 1 1 3 1 2 3 2 1 2 0 3 0 41 73 0 0 0
1 1 38.7 34 30 2 0 3 1 2 3 0 0 0 0 0 0 33 69 0 2 0
1 1 0 36 12 1 1 1 1 1 2 1 1 1 0 1 5 44.00 0 0 0 1
2 1 38.30 44 60 0 0 1 1 0 0 0 0 0 0 0 0 6.40 36.00 0 0 1
2 1 37.40 54 18 3 0 1 1 3 4 3 2 2 0 4 5 30.00 7.10 2 0 1
1 1 0 0 0 4 3 0 2 2 4 1 0 0 0 0 0 54 76 3 2 1
1 1 36.6 48 16 3 1 3 1 4 1 1 1 1 0 0 0 27 56 0 0 0
1 1 38.5 90 0 1 1 3 1 3 3 3 2 3 2 4 5 47 79 0 0 1
1 1 0 75 12 1 1 4 1 5 3 3 0 3 5.80 0 0 58.00 8.50 1 0 1
2 1 38.20 42 0 3 1 1 1 1 1 2 2 1 0 3 2 35.00 5.90 2 0 1
1 9 38.20 78 60 4 4 6 0 3 3 3 0 0 0 1 0 59.00 5.80 3 3.10 0
2 1 38.60 60 30 1 1 3 1 4 2 2 1 1 0 0 0 40.00 6.00 1 0 1
2 1 37.80 42 40 1 1 1 1 1 3 1 0 0 0 3 3 36.00 6.20 0 0 1
1 1 38 60 12 1 1 2 1 2 1 1 1 1 0 1 4 44 65 3 2 0
2 1 38.00 42 12 3 0 3 1 1 1 1 0 0 0 0 1 37.00 5.80 0 0 1
2 1 37.60 88 36 3 1 1 1 3 3 2 1 3 1.50 0 0 44.00 6.00 0 0 0
```
