# -*- coding: utf-8 -*-
"""
Created on Wed Feb 24 08:57:10 2016
Adaboost algorithm
@author: liudiwei
"""
import numpy as np


class AdaboostClassifier(object):

    def __init__(self, max_iter=100, numSteps=10, num_iter=50):
        self._max_iter = max_iter    # reserved iteration cap (not used by the current implementation)
        self._numSteps = numSteps    # number of threshold steps to scan per feature
        self._num_iter = num_iter    # number of boosting rounds (weak classifiers to train)

    def _stumpClassify(self, X, axis, threshVal, threshIneq):
        """
        Function: a decision stump that classifies the data set by comparing
                  one feature against a threshold.
        Returns:  an m*1 array of class estimates (+1.0 / -1.0)
        """
        retArray = np.ones((np.shape(X)[0], 1))       # m*1 array, all ones
        if threshIneq == 'lt':                        # rule 'lt': values <= threshold get -1.0
            retArray[X[:, axis] <= threshVal] = -1.0
        else:                                         # rule 'gt': values >  threshold get -1.0
            retArray[X[:, axis] > threshVal] = -1.0
        return retArray

    def _buildStump(self, X, y, D):
        """
        Input:    X, y and the current sample weight vector D
        Function: find the best decision stump under the weights D
        Returns:  a dict (dim, ineq, thresh), the weighted error rate, and the
                  class estimates produced by that feature/threshold pair
        """
        dataMat = np.mat(X)
        labelMat = np.mat(y).T                        # class labels as a column vector
        m, n = np.shape(dataMat)                      # number of samples and features
        bestStump = {}
        bestClasEst = np.mat(np.zeros((m, 1)))
        minError = np.inf                             # initialize the minimum error to infinity
        for i in range(n):                            # iterate over all features
            minVal = dataMat[:, i].min()
            maxVal = dataMat[:, i].max()
            stepSize = (maxVal - minVal) / self._numSteps
            for j in range(-1, int(self._numSteps) + 1):
                for inequal in ['lt', 'gt']:
                    threshVal = (minVal + float(j) * stepSize)
                    predVals = self._stumpClassify(dataMat, i, threshVal, inequal)
                    errArr = np.mat(np.ones((m, 1)))
                    errArr[predVals == labelMat] = 0  # zero out the correctly classified samples
                    weightedError = D.T * errArr      # weighted error rate
                    if weightedError < minError:
                        minError = weightedError
                        bestClasEst = predVals.copy()
                        bestStump['dim'] = i
                        bestStump['thresh'] = threshVal
                        bestStump['ineq'] = inequal
        return bestStump, minError, bestClasEst

    def fit(self, X, y):
        """
        Input:  training data X and labels y (the number of boosting rounds is
                taken from self._num_iter)
        Output: the list of weak classifiers weakClassArr (one dict per stump)
        """
        weakClassArr = []
        m = np.shape(X)[0]
        D = np.mat(np.ones((m, 1)) / m)         # initialize all sample weights to 1/m
        aggClassEst = np.mat(np.zeros((m, 1)))  # running aggregated class estimate per sample
        for i in range(self._num_iter):
            bestStump, error, classEst = self._buildStump(X, y, D)
            # alpha = 0.5 * ln((1 - error) / error); the max() guards against division by zero
            alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16)))
            bestStump['alpha'] = alpha
            weakClassArr.append(bestStump)
            # update the sample weights: D_i <- D_i * exp(-alpha * y_i * h(x_i)), then normalize
            expon = np.multiply(-1 * alpha * np.mat(y).T, classEst)
            D = np.multiply(D, np.exp(expon))
            D = D / D.sum()
            # compute the training error of the combined classifier; stop early if it reaches 0
            aggClassEst += alpha * classEst
            aggErrors = np.multiply(
                np.sign(aggClassEst) != np.mat(y).T, np.ones((m, 1)))
            errorRate = aggErrors.sum() / m
            # print("total error is:", errorRate)
            if errorRate == 0.0:
                break
        return weakClassArr

    def predict(self, test_X, classifierArr):
        """
        Input:  the test set and the list of weak classifiers returned by fit()
        Output: the predicted class for each sample (+1 / -1, binary classification)
        """
        dataMat = np.mat(test_X)
        m = np.shape(dataMat)[0]
        aggClassEst = np.mat(np.zeros((m, 1)))
        for i in range(len(classifierArr)):
            classEst = self._stumpClassify(dataMat,
                                           classifierArr[i]['dim'],
                                           classifierArr[i]['thresh'],
                                           classifierArr[i]['ineq'])
            # weight each stump's vote by its alpha and accumulate
            aggClassEst += classifierArr[i]['alpha'] * classEst
            # print(aggClassEst)
        return np.sign(aggClassEst)
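

# --- Minimal usage sketch (illustrative only, not part of the original file). ---
# The toy feature matrix and labels below are made up for demonstration; any
# small numeric data set with +1/-1 labels would be used the same way.
if __name__ == "__main__":
    train_X = np.array([[1.0, 2.1],
                        [2.0, 1.1],
                        [1.3, 1.0],
                        [1.0, 1.0],
                        [2.0, 1.0]])
    train_y = [1.0, 1.0, -1.0, -1.0, 1.0]

    clf = AdaboostClassifier(num_iter=30)
    classifiers = clf.fit(train_X, train_y)      # list of weak-classifier dicts
    preds = clf.predict(np.array([[5.0, 5.0],
                                  [0.0, 0.0]]), classifiers)
    print(preds)                                 # column vector of +1/-1 predictions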