# Naive Bayes
# Predict bug-covering questions based on various values of
# the parameters in the aggregation methods.
install.packages("class");
library(class);
install.packages("gmodels");
library(gmodels);
install.packages("e1071", dependencies = TRUE);
library(e1071);
install.packages("klaR");
library(klaR);
# Import the aggregated voting data
source("C://Users//chris//OneDrive//Documentos//GitHub//ML_VotingAggregation//aggregateAnswerOptionsPerQuestion.R");
summaryTable <- runMain();
#summaryTable <- data.frame(summaryTable);
# Guarantee that some examples (i.e., failing methods) do not dominate the
# training or testing sets: both sets should hold close-to-equal proportions
# of examples. Scramble the dataset before extracting the training set.
set.seed(8850);
g <- runif(nrow(summaryTable)); # random values used only to shuffle the rows
summaryTable <- summaryTable[order(g), ];
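# Optional sanity check: overall class balance of bugCovering, so the 70/30
# split below keeps similar class proportions in both sets.
prop.table(table(summaryTable$bugCovering));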
###########################################################################################
# Naive Bayes (klaR) with a 70/30 train/test split
totalData <- length(summaryTable$Question.ID);
trainingSize <- trunc(totalData * 0.7);
startTestIndex <- trainingSize + 1;
endTestIndex <- totalData;
# Convert the predictor to numeric and the class label to a factor
summaryTable <- data.frame(summaryTable);
summaryTable[, "rankingVote"] <- as.numeric(unlist(summaryTable[, "rankingVote"]));
summaryTable$bugCovering <- as.factor(summaryTable$bugCovering);
trainingData <- summaryTable[1:trainingSize, ];
testingData <- summaryTable[startTestIndex:endTestIndex, ];
nb.model <- NaiveBayes(bugCovering ~ rankingVote, data = trainingData);
# predict() needs newdata = ; with data = it would silently re-predict the training set
nb.pred <- predict(nb.model, newdata = testingData);
prediction.df <- data.frame(nb.pred);
CrossTable(x = testingData$bugCovering, y = prediction.df$class, prop.chisq = FALSE);
# Both vectors should have the same length (the size of the test set)
length(testingData$bugCovering);
length(prediction.df$class);
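# Optional summary metrics for the test-set predictions above, assuming the
# factor levels are FALSE/TRUE with TRUE marking bug-covering questions.
confCounts <- table(actual = testingData$bugCovering, predicted = prediction.df$class);
tp <- confCounts["TRUE", "TRUE"]; fp <- confCounts["FALSE", "TRUE"]; fn <- confCounts["TRUE", "FALSE"];
c(precision = tp / (tp + fp), recall = tp / (tp + fn));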
############################################################################################
# Naive Bayes using CARET (5-fold cross-validation)
install.packages("ElemStatLearn");
library(ElemStatLearn);
library(klaR);
library(caret);
# 70/30 random split; the test set holds the remaining 30% of the rows
sub <- sample(nrow(summaryTable), floor(nrow(summaryTable) * 0.7));
trainSet <- summaryTable[sub, ];
testSet <- summaryTable[-sub, ];
# Keep predictors as one-column data frames so the "rankingVote" column name is preserved
xTrain <- trainSet[, "rankingVote", drop = FALSE];
yTrain <- as.factor(trainSet$bugCovering);
xTest <- testSet[, "rankingVote", drop = FALSE];
yTest <- as.factor(testSet$bugCovering);
xS <- summaryTable[, "rankingVote", drop = FALSE];
yS <- as.factor(summaryTable$bugCovering);
nb.fit <- train(xTrain, yTrain, 'nb', trControl = trainControl(method = 'cv', number = 5));
# Predict over the full dataset with the fitted model
predicted <- predict(nb.fit$finalModel, xS);
summary(predicted);
predicted.df <- data.frame(predicted);
CrossTable(predicted.df$class, yS);
confusionMatrix(predicted.df$class, yS);
# Predicted class (rows) vs. actual bugCovering (columns) on the full dataset:
#          FALSE  TRUE
#  FALSE      98    10
#  TRUE        6    15
# TP = 15 (of 25 actual bug-covering questions), FP = 6, FN = 10
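# Precision and recall implied by the counts recorded above (quick worked check):
tp.full <- 15; fp.full <- 6; fn.full <- 10;
c(precision = tp.full / (tp.full + fp.full),  # 15/21 ~ 0.71
  recall = tp.full / (tp.full + fn.full));    # 15/25 = 0.60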
plot(yTest);
prop.table(table(predict(nb.fit$finalModel, xTest)$class, yTest));
# Proportions: predicted class (rows) vs. yTest (columns)
#              FALSE       TRUE
#  FALSE  0.71794872 0.10256410
#  TRUE   0.05128205 0.12820513