Skip to content

Commit 0881e21

Browse files
committed
Unsupervised and Supervised Learning Techniques
1 parent 2893f8a commit 0881e21

36 files changed

+13575
-0
lines changed
Lines changed: 228 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,228 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
"""
4+
Created on Wed Oct 17 20:59:28 2018
5+
6+
@author: sahith
7+
"""
8+
9+
import pandas as pd
import numpy as np
import math
import time

# Load the heart datasets; header=None so columns are integer-indexed.
# Column 0 is the class label, the remaining columns are binary features.
Train_data = pd.read_csv('heart_train.data',header = None)
Test_data = pd.read_csv('heart_test.data',header = None)

# Remap the 0/1 class label in column 0 to -1/+1, as AdaBoost expects.
Train_data.loc[Train_data[0] == 0, 0] = -1
Test_data.loc[Test_data[0] == 0, 0] = -1


attrs = len(Train_data.columns) #No of Attributes

rows = len(Train_data) #No of Training Data Points

# Convert to NumPy arrays for positional indexing (data[i, :]) below.
Train_data = np.array(Train_data.iloc[:,:])
Test_data = np.array(Test_data.iloc[:,:])
def output(tree, lst):
    """Evaluate a decision tree on one sample and return its +1/-1 leaf.

    The tree is a nested dict: each internal node is {attr: {0: left, 1: right}}
    where `attr` indexes into `lst` (the sample's feature vector) and a child is
    either another such node or a leaf value of 1 / -1.
    """
    node = tree
    while True:
        # Each node dict holds exactly one attribute key.
        attr = next(iter(node))
        child = node[attr]
        if child == 1 or child == -1:
            # Degenerate single-leaf tree.
            return child
        # Follow the branch selected by the sample's value for this attribute.
        node = child[lst[attr]]
        if node == 1 or node == -1:
            return node
def AdaBoost(rounds, model, data):
    """Run AdaBoost for `rounds` rounds over the fixed hypothesis list `model`.

    `data` is a NumPy array whose column 0 holds the +1/-1 label and whose
    remaining columns are the binary features consumed by output().
    Returns (eps, alpha, HSpace, predictions): per-round weighted error,
    per-round classifier weight, the tree chosen each round, and that tree's
    per-sample predictions.  Relies on module globals `rows` (training-set
    size) and the `output` helper.
    """
    weightedError = 1
    weight = [1/rows]*rows #Initial weight matrix
    alpha = []
    HSpace = [] #Trees selected so far (one per round)
    predictions = []
    labels = []  # labels[it][i] caches hypothesis it's prediction on sample i (filled in round 0)
    eps = [] #Weighted errors
    for r in range(rounds):
        print('round',r+1)
        weightedError = 1
        it = 0
        for mod in model:
            w = 0
            if r == 0:
                labels.append([])
            lst = []
            for i in range(rows):
                row = data[i, :]
                if r == 0:
                    # First round: actually evaluate the tree and cache its output.
                    k = output(mod, row)
                    lst.append(k)
                    if row[0] != k:
                        w = w + weight[i]
                    labels[it].append(k)
                else:
                    # Later rounds: reuse the cached prediction instead of re-walking the tree.
                    k = labels[it][i]
                    lst.append(k)
                    if row[0] != k:
                        w = w + weight[i]
                if w > weightedError or w > 0.5:#Optimization
                    # NOTE(review): breaking here during round 0 leaves labels[it]
                    # shorter than `rows`, so labels[it][i] can IndexError in a
                    # later round — confirm against the data before relying on it.
                    break
            if w < weightedError:
                weightedError = w
                bTree = mod #bTree indicates it is a best tree
                bestP = lst
            it += 1
        HSpace.append(bTree)
        eps.append(weightedError)
        # Classifier weight alpha_t = 1/2 ln((1-eps)/eps).
        # NOTE(review): weightedError == 0 would raise ZeroDivisionError here.
        t = 1/2 * math.log((1-weightedError)/weightedError)
        alpha.append(t)
        #Weight Updation, normalized so the weights keep summing to ~1
        sum1 = 0
        predictions.append(bestP)
        for i in range(rows):
            prediction = bestP[i]
            actual = data[i, 0]
            weight[i] = (weight[i] * (np.exp(-1 * prediction * actual * t)))/(2*np.sqrt(weightedError * (1-weightedError)))
            sum1 += weight[i]
        print("sum", sum1)
        print(weightedError)
    return eps, alpha, HSpace, predictions
99+
100+
101+
102+
103+
104+
def Hypothesis(n_attrs=None):
    """Enumerate every depth-2 decision tree over binary attributes.

    Generalized from the original, which hard-coded the module global `attrs`:
    n_attrs may now be passed explicitly (default None keeps the old behavior
    of using the global).  Attributes are indexed 1..n_attrs-1 (column 0 is
    the label).  For each choice of three attributes (i, j, k) and four leaf
    labels (a, b, c, d) in {1, -1}, five tree shapes are produced — the five
    distinct ways to place two internal nodes under the root:

      1. leaf on the 1-branch of i and of j (LLL chain down the 0 side)
      2. leaf on the 1-branch of i, leaf on the 0-branch of j (LLR)
      3. both children of i are internal nodes (balanced)
      4. leaf on the 0-branch of i, leaf on the 1-branch of j
      5. leaf on the 0-branch of i and of j (chain down the 1 side)

    Trees are nested dicts of the shape consumed by output().  The emission
    order is identical to the original nested-loop version.  Duplicate trees
    (e.g. when i == j) are emitted exactly as before.
    """
    if n_attrs is None:
        n_attrs = attrs  # fall back to the module global, as before
    hypo = []
    leaf_values = [1, -1]
    for i in range(1, n_attrs):
        for j in range(1, n_attrs):
            for k in range(1, n_attrs):
                for a in leaf_values:
                    for b in leaf_values:
                        for c in leaf_values:
                            for d in leaf_values:
                                # case 1: leaves on the 1-branches, chain on the 0 side
                                hypo.append({i: {1: a, 0: {j: {1: b, 0: {k: {0: c, 1: d}}}}}})
                                # case 2: leaf on i's 1-branch, j's 0-branch
                                hypo.append({i: {1: a, 0: {j: {0: b, 1: {k: {0: c, 1: d}}}}}})
                                # case 3: both children of the root are splits
                                hypo.append({i: {0: {j: {0: a, 1: b}}, 1: {k: {0: c, 1: d}}}})
                                # case 4: leaf on i's 0-branch, j's 1-branch
                                hypo.append({i: {0: a, 1: {j: {1: b, 0: {k: {0: c, 1: d}}}}}})
                                # case 5: leaves on the 0-branches, chain on the 1 side
                                hypo.append({i: {0: a, 1: {j: {0: b, 1: {k: {0: c, 1: d}}}}}})
    return hypo
184+
185+
186+
rounds = 5  # number of boosting rounds
print('starting time')
t1 = time.time()
Thypo = Hypothesis() #Thypo mean Total hypothesis space
print('end')
t2 = time.time()
print(t2-t1, 'secs')
print('Generated Hypothesis spaces')

# Boost over the enumerated trees on the training data.
epsilon, alpha, hypo, predictions = AdaBoost(rounds, Thypo, Train_data) #method for getting Hypothesis space and Alpha values
t2 = time.time()
print(t2-t1, 'secs')

print(hypo)

#accuracy calculation: sign of the alpha-weighted vote of the selected trees
length = len(Test_data)

accuracy = 0
for i in range(length):
    p = 0
    row = Test_data[i, :]
    y = row[0]
    for r in range(rounds):
        k = output(hypo[r], row)
        p = p + alpha[r] * k
    # p >= 0 predicts +1 (ties go to +1), otherwise -1.
    if p >= 0:
        if y == 1:
            accuracy += 1
    else:
        if y == -1:
            accuracy += 1

accuracy = accuracy/length * 100
print("Accuracy on the test data set is", accuracy)

# NOTE(review): the message names epsilon first but alpha is printed first.
for r in range(rounds):
    print('The value of epsilon and alpha for round,',r+1,'is',alpha[r],epsilon[r])

t2 = time.time()
print(t2-t1, 'secs')
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
"""
4+
Created on Wed Oct 17 19:29:14 2018
5+
6+
@author: sahith
7+
"""
8+
9+
import pandas as pd
import math

# Load the heart datasets; header=None so columns are integer-indexed.
# Column 0 is the binary class label (0/1); the rest are binary features.
Train_data = pd.read_csv('heart_train.data',header = None)
Test_data = pd.read_csv('heart_test.data',header = None)


attrs = len(Train_data.columns) #No of Attributes

rows = len(Train_data) #No of Training Data Points
def splitData(data, attribute):
    """Partition a DataFrame on a binary attribute column.

    Returns (rows where data[attribute] == 0, rows where data[attribute] == 1).
    """
    parts = [data.loc[data[attribute] == v, :] for v in (0, 1)]
    return parts[0], parts[1]
def label(data):
    """Return the majority class (0 or 1) of column 0; ties go to 1."""
    positives = len(data.loc[data[0] == 1, :])
    negatives = len(data.loc[data[0] == 0, :])
    return 1 if positives >= negatives else 0
def Train(data, attrList):
    """Fit a one-level decision stump on `data` over the candidate columns in `attrList`.

    Picks the attribute whose binary split minimizes the weighted conditional
    entropy of the label (column 0), then labels each branch with the majority
    class of its partition (via label()/splitData()).

    Fix over the original: the per-branch weight divided by the module global
    `rows` instead of len(data).  Attribute selection was unaffected as long as
    callers passed samples of exactly `rows` points (a constant divisor does
    not change the argmin), but the stump now works for any sample size and no
    longer depends on a hidden global.

    Returns (tree, bestAttr) where tree is {bestAttr: {0: label0, 1: label1}}.
    """
    n = len(data)
    # A binary target's entropy is at most 1 bit, so every attribute's score
    # satisfies score <= 1; `<=` below keeps the original tie-breaking (the
    # last attribute with the minimal score wins).
    bestEntropy = 1
    bestAttr = -1
    for i in attrList:
        entropy = 0  # H(label | attribute i), accumulated over both branches
        for j in range(2):  # binary attribute values 0 and 1
            temp = data.loc[data[i] == j, [0, i]]
            tempLen = len(temp)
            if tempLen == 0:
                continue  # empty branch contributes nothing
            p = len(temp.loc[temp[0] == 1])
            e = len(temp.loc[temp[0] == 0])
            # -(p/m)log2(p/m) - (e/m)log2(e/m), with 0*log(0) treated as 0
            branch = 0
            if p > 0:
                branch += (p/tempLen) * math.log2(p/tempLen)
            if e > 0:
                branch += (e/tempLen) * math.log2(e/tempLen)
            entropy += -1 * tempLen/n * branch
        if entropy <= bestEntropy:
            bestEntropy = entropy
            bestAttr = i
    # Build the stump: one split on bestAttr, majority-class leaves.
    tree = {bestAttr: {}}
    data0, data1 = splitData(data, bestAttr)
    tree[bestAttr][0] = label(data0)
    tree[bestAttr][1] = label(data1)
    return tree, bestAttr
65+
66+
67+
def output(tree, lst):
    """Evaluate a decision tree on one sample and return its 0/1 leaf.

    The tree is a nested dict: each internal node is {attr: {0: left, 1: right}}
    where `attr` indexes into `lst` (the sample's feature row) and a child is
    either another such node or a leaf value of 0 / 1.
    """
    node = tree
    while True:
        # Each node dict holds exactly one attribute key.
        attr = next(iter(node))
        child = node[attr]
        if child == 1 or child == 0:
            # Degenerate single-leaf tree.
            return child
        # Descend along the branch chosen by this sample's attribute value.
        node = child[lst[attr]]
        if node == 1 or node == 0:
            return node
81+
82+
def acc(model, data):
    """Return the percentage accuracy of majority voting over `model`.

    `model` is a list of decision trees (dicts consumed by output()); `data`
    is a DataFrame whose column 0 is the true 0/1 label, indexed 0..len-1.
    Vote ties are broken in favor of class 1, as before.
    """
    total = len(data)
    correct = 0
    for idx in range(total):
        sample = data.loc[idx, :]
        votes = [0, 0]
        for classifier in model:
            votes[output(classifier, sample)] += 1
        predicted = 0 if votes[0] > votes[1] else 1
        if sample[0] == predicted:
            correct += 1
    return correct/total * 100
100+
101+
102+
103+
104+
105+
# Bagging driver: repeat the whole experiment 25 times and keep the best
# test accuracy observed across repetitions.
it = 25 #Running it for for 25 Iteration to select the best Test accuracy out of 25 Iterations
Test_accuracy = 0
while(it != 0):
    n = 20 #20 samplings  -- NOTE(review): `n` is unused; the literal 20 is repeated below
    classifiers = []
    attributeList = []

    # Candidate attributes are columns 1..attrs-1 (column 0 is the label).
    for i in range(1, attrs):
        attributeList.append(i)

    # Draw 20 bootstrap samples of the training set; each stump consumes the
    # attribute it split on, so every later stump uses a different attribute.
    for i in range(20):
        data = Train_data.sample(n = rows, replace = True)
        Hspace, attr = Train(data, attributeList)
        attributeList.remove(attr)
        classifiers.append(Hspace)
    accuracy = acc(classifiers, Test_data)
    if accuracy > Test_accuracy:
        Test_accuracy = accuracy
    print('Iteration', it, 'Accuracy', Test_accuracy)
    it -= 1


print("Accuracy on test data set is", Test_accuracy)

# NOTE(review): `classifiers` here is the LAST repetition's ensemble, which is
# not necessarily the one that achieved Test_accuracy above.
print('Classifiers are', classifiers)
print('length', len(classifiers))

0 commit comments

Comments
 (0)