csuldw
diff --git a/‎DecisionTree/DT.py
+49-26 b/‎DecisionTree/DT.py
+49-26
diff --git a/‎DecisionTree/readme.md
+55 b/‎DecisionTree/readme.md
+55
diff --git a/‎DecisionTree/tree.pdf
67.5 KB b/‎DecisionTree/tree.pdf
67.5 KB
diff --git a/‎DecisionTree/treePlotter.py
+3-1 b/‎DecisionTree/treePlotter.py
+3-1
diff --git a/‎DecisionTree/treePlotter.pyc
71 Bytes b/‎DecisionTree/treePlotter.pyc
71 Bytes
@@ -8,9 +8,9 @@
 import numpy as np
 
 class DecitionTree():
-    """this is a decision tree classifier. """
+    """This is a decision tree classifier. """
 
-    def __init__(self, criteria='C4.5'):
+    def __init__(self, criteria='ID3'):
         self._tree = None
         if criteria == 'ID3' or criteria == 'C4.5':
             self._criteria = criteria
@@ -19,9 +19,9 @@ def __init__(self, criteria='C4.5'):
 
     def _calEntropy(slef, y):
         '''
-        _calEntropy用于计算香农熵 e=-sum(pi*log pi)
-        其中y为数组array
-        输出entropy
+        功能：_calEntropy用于计算香农熵 e=-sum(pi*log pi)
+        参数：其中y为数组array
+        输出：信息熵entropy
         '''
         n = y.shape[0]  
         labelCounts = {}
@@ -38,7 +38,8 @@ def _calEntropy(slef, y):
 
     def _splitData(self, X, y, axis, cutoff):
         """
-        该函数返回数据集中特征下标为axis，特征值等于cutoff的子数据集
+        参数：X为特征,y为label,axis为某个特征的下标,cutoff是下标为axis的特征的取值
+        输出：返回数据集中特征下标为axis，特征值等于cutoff的子数据集
         """
         ret = []
         featVec = X[:,axis]
@@ -51,7 +52,9 @@ def _splitData(self, X, y, axis, cutoff):
 
     def _chooseBestSplit(self, X, y):
         """ID3 & C4.5
-        根据信息增益或者信息增益率来获取最好的划分特征
+        参数：X为特征，y为label
+        功能：根据信息增益或者信息增益率来获取最好的划分特征
+        输出：返回最好划分特征的下标
         """
         numFeat = X.shape[1]
         baseEntropy = self._calEntropy(y)
@@ -83,17 +86,23 @@ def _chooseBestSplit(self, X, y):
 
     def _majorityCnt(self, labellist):
         """
-        返回labellist中出现次数最多的label
+        参数:labellist是类标签，序列类型为list
+        输出：返回labellist中出现次数最多的label
         """
         labelCount={}
         for vote in labellist:
-            if vote not in labelCount.keys(): labelCount[vote] = 0
+            if vote not in labelCount.keys(): 
+                labelCount[vote] = 0
             labelCount[vote] += 1
         sortedClassCount = sorted(labelCount.iteritems(), key=lambda x:x[1], \
                                      reverse=True)
         return sortedClassCount[0][0]
 
     def _createTree(self, X, y, featureIndex):
+        """
+        参数:X为特征,y为label,featureIndex类型是元组，记录X特征在原始数据中的下标
+        输出:根据当前的featureIndex创建一棵完整的树
+        """
         labelList = list(y)
         if labelList.count(labelList[0]) == len(labelList): 
             return labelList[0]
@@ -110,12 +119,16 @@ def _createTree(self, X, y, featureIndex):
         for value in uniqueVals:
             #对每个value递归地创建树
             sub_X, sub_y = self._splitData(X,y, bestFeatIndex, value)
-            myTree[bestFeatAxis][value] = self._createTree(sub_X,sub_y,\
+            myTree[bestFeatAxis][value] = self._createTree(sub_X, sub_y, \
                                             featureIndex)
         return myTree  
 
     def fit(self, X, y):
-        #对数据X和y进行类型检测，保证其为array
+        """
+        参数：X是特征，y是类标签
+        注意事项：对数据X和y进行类型检测，保证其为array
+        输出：self本身
+        """
         if isinstance(X, np.ndarray) and isinstance(y, np.ndarray):
             pass
         else:
@@ -129,41 +142,49 @@ def fit(self, X, y):
         return self  #allow using: clf.fit().predict()
 
     def _classify(self, tree, sample):
-        featIndex = tree.keys()[0]
-        secondDict = tree[featIndex]
-        key = sample[int(featIndex[1:])]
-        valueOfkey = secondDict[key]
-        if type(valueOfkey).__name__=='dict':
-            return self._classify(valueOfkey, sample)
+        """
+        用训练好的模型对输入数据进行分类 
+        注意：决策树的构建是一个递归的过程，用决策树分类也是一个递归的过程
+        _classify()一次只能对一个样本（sample）分类
+        """
+        featIndex = tree.keys()[0] #得到树的根节点值
+        secondDict = tree[featIndex] #得到以featIndex为划分特征的结果
+        axis=featIndex[1:] #得到根节点特征在原始数据中的下标
+        key = sample[int(axis)] #获取待分类样本中下标为axis的值
+        valueOfKey = secondDict[key] #获取secondDict中keys为key的value值
+        if type(valueOfKey).__name__=='dict': #如果value为dict，则继续递归分类
+            return self._classify(valueOfKey, sample)
         else: 
-            return valueOfkey
+            return valueOfKey
 
     def predict(self, X):
         if self._tree==None:
             raise NotImplementedError("Estimator not fitted, call `fit` first")
-        if isinstance(X,np.ndarray): 
+        #对X的类型进行检测，判断其是否是数组
+        if isinstance(X, np.ndarray): 
             pass
         else: 
             try:
                 X = np.array(X)
             except:
                 raise TypeError("numpy.ndarray required for X")
 
-        if len(X.shape)==1:
+        if len(X.shape) == 1:
             return self._classify(self._tree, X)
         else:
             result = []
             for i in range(X.shape[0]):
-                result.append(self._classify(self._tree, X[i]))
+                value = self._classify(self._tree, X[i])
+                print str(i+1)+"-th sample is classfied as:", value 
+                result.append(value)
             return np.array(result)
 
-    def show(self):
+    def show(self, outpdf):
         if self._tree==None:
             pass
         #plot the tree using matplotlib
         import treePlotter
-        treePlotter.createPlot(self._tree)
-        
+        treePlotter.createPlot(self._tree, outpdf)
 
 if __name__=="__main__":
     trainfile=r"data\train.txt"
@@ -173,8 +194,10 @@ def show(self):
     import dataload as dload
     train_x, train_y = dload.loadData(trainfile)
     test_x, test_y = dload.loadData(testfile)
-    clf = DecitionTree()
+    
+    clf = DecitionTree(criteria="C4.5")
     clf.fit(train_x, train_y)
     result = clf.predict(test_x)    
-    clf.show()
+    outpdf = r"tree.pdf"
+    clf.show(outpdf)
 
@@ -0,0 +1,55 @@
+## Decision Tree
+
+决策树理论详解：http://www.csuldw.com/2015/05/08/2015-05-08-decision%20tree/
+
+- data：存放数据集
+- calIG.py：计算信息增益的实例代码
+- DT.py：决策树实现
+- treePlotter.py：决策树的可视化绘制
+
+## 相关知识
+
+- python
+- numpy
+- matplotlib
+
+## dataset
+
+- 训练集：./data/train.txt
+- 测试集：./data/test.txt
+
+## Run
+
+```
+if __name__=="__main__":
+    trainfile=r"data\train.txt"
+    testfile=r"data\test.txt"
+    import sys
+    sys.path.append(r"F:\CSU\Github\MachineLearning\lib")  
+    import dataload as dload
+    train_x, train_y = dload.loadData(trainfile)
+    test_x, test_y = dload.loadData(testfile)
+    
+    clf = DecitionTree(criteria="C4.5")
+    clf.fit(train_x, train_y)
+    result = clf.predict(test_x)    
+    outpdf = r"tree.pdf"
+    #clf.show(outpdf)
+```
+
+## Result
+
+训练得到的树：https://github.com/csuldw/MachineLearning/tree/master/DecisionTree/tree.pdf
+
+对test分类的结果：
+
+```
+1-th sample is classfied as: 1
+2-th sample is classfied as: 0
+3-th sample is classfied as: 0
+```
+
+## 参考资料
+
+- 机器学习实战
+- Andrew Ng 机器学习公开课
@@ -4,6 +4,7 @@
 @author: Peter Harrington
 '''
 import matplotlib.pyplot as plt
+from matplotlib.pyplot import savefig
 
 decisionNode = dict(boxstyle="sawtooth", fc="0.8")
 leafNode = dict(boxstyle="round4", fc="0.8")
@@ -59,7 +60,7 @@ def plotTree(myTree, parentPt, nodeTxt):#if the first key tells you what feat wa
     plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD
 #if you do get a dictonary you know it's a tree, and the first element will be another dict
 
-def createPlot(inTree):
+def createPlot(inTree, outpdf):
     fig = plt.figure(1, facecolor='white')
     fig.clf()
     axprops = dict(xticks=[], yticks=[])
@@ -69,6 +70,7 @@ def createPlot(inTree):
     plotTree.totalD = float(getTreeDepth(inTree))
     plotTree.xOff = -0.5/plotTree.totalW; plotTree.yOff = 1.0;
     plotTree(inTree, (0.5,1.0), '')
+    plt.savefig(outpdf)
     plt.show()
 
 #def createPlot():