-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCarPricePrediction_Mustafa_Ozturk.R
125 lines (91 loc) · 4.2 KB
/
CarPricePrediction_Mustafa_Ozturk.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#----Data Importing----------------
# Load the raw car data from CSV into `Dataset`. read_csv() returns a tibble,
# so later code must use `[[`/`$` (not `[`) to obtain plain column vectors.
# NOTE(review): the original carried a "##Delete" marker here - presumably a
# reminder to remove or replace this machine-specific import; confirm.
library(readr)
# NOTE(review): absolute, user-specific path - adjust before running elsewhere.
Dataset <- read_csv("C:/Users/mustafa.ozturk/Downloads/dataset-DS-TestData.csv")
# View() opens an interactive data viewer; only call it in an interactive
# session so the script can also be run in batch mode (Rscript / source()).
if (interactive()) {
  View(Dataset)
}
#-------Data Cleaning---------------
# Converts the columns that hold numbers as text to numeric, drops the first
# two columns, removes rows without a price and imputes the remaining missing
# values. Missing entries are kept as NA (as.numeric() turns unparseable text
# into NA, with a warning) instead of a -1 sentinel value.

# Fill every missing `target` value with the mean of the known `target`
# values of the rows whose `reference` value lies strictly within
# +/- `half_width` of the row's own `reference` value.
#   data       data.frame or tibble to work on
#   target     name of the column to impute
#   reference  name of the (correlated) column that defines the neighbourhood
#   half_width half-width of the neighbourhood window on `reference`
# Returns `data` with `target` filled in. Rows are processed in order, so a
# value imputed for an earlier row counts as known for later rows. Columns are
# read with `[[` so plain vectors are used for tibbles and data.frames alike.
# The result is NaN for a row without any qualifying neighbour.
impute_from_neighbours <- function(data, target, reference, half_width) {
  for (i in which(is.na(data[[target]]))) {
    centre <- data[[reference]][i]
    is_neighbour <- !is.na(data[[target]]) &
      data[[reference]] > (centre - half_width) &
      data[[reference]] < (centre + half_width)
    # which() drops NA comparisons (e.g. a missing reference value)
    data[[target]][i] <- mean(data[[target]][which(is_neighbour)])
  }
  data
}

Dataset[["normalized-losses"]] <-
  as.numeric(as.character(Dataset[["normalized-losses"]]))
# use = "complete.obs": without it cor() returns NA as soon as one value is
# missing, which would make the check below meaningless
cor(Dataset[["normalized-losses"]], Dataset$symboling, use = "complete.obs")
# correlation of symboling and normalized losses is very low and they contain
# many unknown values, therefore remove them (the first two columns)
Dataset <- Dataset[, -c(1, 2)]

# columns that contain numbers stored as text
for (col in c("bore", "stroke", "price", "horsepower", "peak-rpm")) {
  Dataset[[col]] <- as.numeric(as.character(Dataset[[col]]))
}

# delete rows from dataset where price is unknown; a logical mask is used
# because Dataset[-which(cond), ] would drop ALL rows when nothing matches
Dataset <- Dataset[!is.na(Dataset$price), ]

#cor(Dataset$bore,Dataset$`engine-size`)
# Correlation between bore and engine-size is high, so impute missing bore
# values with the mean bore of the rows whose engine-size is within +/- 30
Dataset <- impute_from_neighbours(Dataset, "bore", "engine-size", 30)

#cor(Dataset$bore,Dataset$stroke)
# bore and stroke are also correlated, so impute missing stroke values with
# the mean stroke of the rows whose bore is within +/- 0.5
Dataset <- impute_from_neighbours(Dataset, "stroke", "bore", 0.5)

#cor(Dataset$`engine-size`,Dataset$horsepower)
# fill the missing horsepower values by using the engine-size column, since
# the correlation between these two columns is high
Dataset <- impute_from_neighbours(Dataset, "horsepower", "engine-size", 30)

# peak-rpm does not seem to be strongly correlated with the other columns,
# therefore fill its missing values with the (truncated) mean of the column
rpm_missing <- is.na(Dataset[["peak-rpm"]])
Dataset[["peak-rpm"]][rpm_missing] <-
  as.integer(mean(Dataset[["peak-rpm"]][!rpm_missing]))
#-------Data Splitting----------------
# Data splitting rule: 70% training, 30% test
sample_size <- floor(0.70 * nrow(Dataset))
# Set the seed to make the partition reproducible. The original comment asked
# for this but never called set.seed(), so every run produced a different
# split. The value itself is arbitrary.
set.seed(123)
# row numbers drawn (without replacement) for the training set
train_indexes <- sample(seq_len(nrow(Dataset)), size = sample_size)
train <- Dataset[train_indexes, ]
# all remaining rows form the test set
test <- Dataset[-train_indexes, ]
#----------Random Forest Modelling------------
# Fits two random forests for `price` - one on all columns, one without five
# columns - and scores each on the held-out test rows with R squared.
library(randomForest)

# R squared of `predicted` against the observed values `actual`
# (both plain numeric vectors): 1 - residual SS / total SS.
r_squared <- function(actual, predicted) {
  1 - sum((actual - predicted)^2) / sum((actual - mean(actual))^2)
}

# Model 1: every other column is used as a predictor
rf <- randomForest(price ~ ., data = train, importance = TRUE,
                   na.action = na.exclude)
# `[[` returns a numeric vector; `test[, 24]` on a tibble is a one-column
# tibble, which made the later as.numeric()/mean() calls produce NA. The
# column is also addressed by name instead of the hard-coded position 24.
y <- test[["price"]]
predicted <- predict(rf, test[, names(test) != "price"])
rSquare <- r_squared(y, predicted)

#feature importance list
varImpPlot(rf)

# Model 2: eliminate the least important five features (by position):
# fuel.type, aspiration, num.of.doors, body.style, engine.location
# (the original referenced `dataset`, which does not exist - R is
# case-sensitive and the data lives in `Dataset`)
dataset2 <- Dataset[, -c(2, 3, 4, 5, 7)]
train2 <- dataset2[train_indexes, ]
test2 <- dataset2[-train_indexes, ]
rf2 <- randomForest(price ~ ., data = train2, importance = TRUE,
                    na.action = na.exclude)
y2 <- test2[["price"]]
predicted2 <- predict(rf2, test2[, names(test2) != "price"])
rSquare2 <- r_squared(y2, predicted2)
#-------------Selecting Final Model------------
# Keep the full model only when its test R squared is strictly higher than
# that of the reduced model; on a tie the reduced model is chosen.
finalModel <- if (rSquare > rSquare2) rf else rf2