-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathknn-crossvalidated_votingAggregation.R
120 lines (89 loc) · 4.28 KB
/
knn-crossvalidated_votingAggregation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#K-Nearest Neighbor KNN
#Predict bug covering questions based on various values of
#the parameters in the aggregation methods
# Dependencies ------------------------------------------------------------
# NOTE(review): unconditional install.packages() on every run is slow and
# requires network access; consider guarding each with requireNamespace().
install.packages("tibble")
install.packages("class");
install.packages("gmodels");
install.packages("caret")
install.packages('e1071', dependencies=TRUE)
install.packages("ggplot2")
library(tibble)
library(class);
library(gmodels)
library(caret)
library(e1071)
library(ggplot2)   # needed by the ggplot() histogram section below
#Obtain the data ---------------------------------------------------------
# Import data: runMain() (defined in the sourced script) builds the
# per-question summary table with the aggregated answers and the
# bugCovering label.
# NOTE(review): hard-coded absolute Windows path — breaks on any other
# machine; consider a relative path or here::here(). TODO confirm with owner.
source("C://Users//chris//OneDrive//Documentos//GitHub//ML_VotingAggregation//aggregateAnswerOptionsPerQuestion.R");
summaryTable <- runMain();
#I need to guarantee that some examples (i.e., failing methods)
#do not dominate the training or testing sets. To do that, I need to get a
#close to equal proportion of examples in both sets
#Scramble the dataset before extracting the training set.
# The seed is fixed so the shuffle (and every downstream result) is
# reproducible; do not reorder these three statements — the RNG draw
# order determines the row permutation.
set.seed(9850);
g<- runif((nrow(summaryTable))); # one uniform draw per row, used as a random sort key
summaryTable <- summaryTable[order(g),]; # reorder rows by the random key
#head(summaryTable)
##################################################################
#Build the KNN model
#Select the target (bugCovering) plus the three candidate aggregation
#features used to predict it.
trainingData <- summaryTable[,c("bugCovering","rankingVote","majorityVote","Yes.Count")];
#Prepare explanatory variable (rankingVote) and target (bugCovering)
#trainingData <-data.frame(summaryTable);
#Coerce the explanatory variables to numeric so knn() can compute
#Euclidean distances on them.
trainingData$rankingVote <- as.numeric(trainingData$rankingVote);
trainingData$majorityVote <- as.numeric(trainingData$majorityVote);
#Bug fix: the original assigned to "Yes.count" (lower-case c), which
#silently created a duplicate column and left "Yes.Count" unconverted.
trainingData$Yes.Count <- as.numeric(trainingData$Yes.Count);
######################################################################################
#Using KNN from the CLASS package: leave-one-out cross-validated KNN (k = 3).
#Bug fix: the original passed the whole trainingData — including the target
#column bugCovering — as the feature matrix, so the held-out point's own
#label leaked into the distance computation and inflated the CV accuracy.
#Train on the three aggregation features only.
fitModel.cv <- knn.cv (train =trainingData[,c("rankingVote","majorityVote","Yes.Count")],
                       cl=trainingData$bugCovering, k=3, l=0, prob = FALSE, use.all=TRUE);
#Evaluate model: cross-tabulate actual labels against the leave-one-out predictions
fitModel.cv.df<-data.frame(fitModel.cv)
CrossTable(x = trainingData$bugCovering, y=fitModel.cv.df[,1], prop.chisq = FALSE)
plot(fitModel.cv)
trainingData$bugCovering <- as.factor(trainingData$bugCovering);
#Keep only the rows the model predicted as bug covering (factor level "TRUE")
predictedBugCoveringList<-trainingData[fitModel.cv.df[,1]==TRUE,];
#Column 2 is rankingVote; summarize its distribution among predicted positives
predictedList <- as.numeric(unlist(predictedBugCoveringList[,2]));
mean(predictedList)
min(predictedList)
max(predictedList)
#Plot the distribution of rankingVote values among the questions the model
#predicted as bug covering, with a dashed red line at the mean.
predictedList.df <- data.frame(predictedList);
colnames(predictedList.df)<- c("votes");
#Bug fix: reference the column by name inside aes() rather than via
#predictedList.df$votes — the $ form bypasses ggplot2's data masking and
#breaks under faceting/layering. Also TRUE instead of the reassignable T.
ggplot(data=predictedList.df, aes(x=votes)) +
geom_histogram(binwidth = .5,alpha=.5, position="identity")+
geom_vline(aes(xintercept=mean(votes, na.rm=TRUE)), # Ignore NA values for mean
color="red", linetype="dashed", size=1) +
ggtitle("Distribution of votes for the questions categorized as bug covering")+
labs(x="Threshold vote values of questions categorized as bug-covering. Mininal vote=6, mean=9.41",
y="Frequency");
######################################################################################
### Using KNN from CARET package
## https://cran.r-project.org/web/packages/caret/vignettes/caret.pdf
## https://dataaspirant.com/2017/01/09/knn-implementation-r-using-caret-package/
# I will do 5 repeats of 10-Fold CV. I will fit
# a KNN model that evaluates 10 values of k
# Fixed seed so the repeated-CV fold assignment (and the reported
# confusion-matrix counts below) are reproducible.
set.seed(1234)
trctrl <- trainControl(method = "repeatedcv", number=10, p=0.9, repeats = 5)
trainingData$rankingVote <- as.numeric(trainingData$rankingVote);
trainingData$bugCovering <- as.factor(trainingData$bugCovering);
mean(trainingData$rankingVote);
# Tune k over 10 candidate values; features are centered and scaled
# before the distance computation. Only rankingVote is used as a predictor.
knn_fit <- train(bugCovering ~ rankingVote, data = trainingData, method = "knn",
trControl=trctrl,
preProcess = c("center", "scale"),
tuneLength = 10);
# NOTE(review): predictions are made on the same data the model was
# trained on (newdata = trainingData), so the confusion matrix below is
# an optimistic, in-sample estimate.
bugCoveringPredicted <- predict(knn_fit,newdata = trainingData);
confusionMatrix(data=bugCoveringPredicted,trainingData$bugCovering)
#False Positives = 10
#False Negatives = 6
#True Positives = 15
#True Negatives = 98
df<-data.frame(bugCoveringPredicted);
# Factor-vs-logical comparison: TRUE is coerced to the string "TRUE" and
# matched against the factor levels, selecting predicted-positive rows.
predictedBugCoveringList<-trainingData[df[,1]==TRUE,];
# Column 2 is rankingVote; summarize its distribution among predicted positives
rankingList <- as.numeric(unlist(predictedBugCoveringList[,2]));
predictedBugCoveringList[,1]
mean(rankingList)
max(rankingList)
min(rankingList)
hist(rankingList,main="Bug-covering ranking dist., knn caret repeatedcv, mean=1.52, max=2",xlab="ranking");
#Caret produced more false positives than knn.cv from class package. I tried to fine tune it more,
#but was not enough. The k value selected by CARET was 23.
###########################################################################################