# LinearRegression.R
# Load data (wine datasets from MITx 15.071x)
library(readr)  # read_csv() comes from the readr package
wine <- read_csv("https://prod-edxapp.edx-cdn.org/assets/courseware/v1/834e8a91ad31bfcdf317bf6356b808f2/asset-v1:MITx+15.071x+2T2017+type@asset+block/wine.csv")
wine_test <- read_csv("https://prod-edxapp.edx-cdn.org/assets/courseware/v1/11433f68d6c74e205f3dfa73dc4711c3/asset-v1:MITx+15.071x+2T2017+type@asset+block/wine_test.csv")
View(wine_test) # spreadsheet-style viewer (interactive sessions only)
str(wine)       # structure: type and first values of each variable
summary(wine)   # summary statistics for each column
# regression model with 1 predictor
model1 <- lm(Price ~ AGST, data = wine)
summary(model1)
# Sum of squared errors (SSE)
model1$residuals
SSE <- sum(model1$residuals^2)
SSE
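# A quick self-contained check on synthetic data (not the wine set): for a
# Gaussian lm fit, the SSE computed by hand from the residuals is the same
# number that deviance() reports. All variable names below are illustrative.
set.seed(1)
x_demo <- 1:20
y_demo <- 3 + 2 * x_demo + rnorm(20)
fit_demo <- lm(y_demo ~ x_demo)
sse_manual <- sum(fit_demo$residuals^2)
all.equal(sse_manual, deviance(fit_demo)) # TRUE: deviance() of an lm is the RSS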
model2 <- lm(Price ~ AGST + HarvestRain, data = wine)
summary(model2)
# Multiple R^2 and adjusted R^2 both increased.
SSE <- sum(model2$residuals^2)
SSE
model3 <- lm(Price ~ AGST + HarvestRain + WinterRain + Age + FrancePop, data = wine)
summary(model3)
SSE <- sum(model3$residuals^2)
SSE # decreased again
# Breakdown of the summary() output for our model:
# - If a coefficient is not significantly different from 0, we should
#   consider removing that variable from the model.
# - The standard error column measures how much the coefficient estimate
#   is likely to vary from its true value.
# - The t value is the estimate divided by its standard error; the larger
#   its absolute value, the more likely the coefficient is significant.
# - The last column is the p-value: a small p-value means it is unlikely
#   that the true coefficient is 0. If |t| is small, the p-value is large.
# - The stars code significance: three stars is the highest level,
#   meaning a p-value below 0.001.
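# Sketch of that breakdown on synthetic data (names here are illustrative):
# the "t value" column of summary()'s coefficient table is literally
# Estimate / Std. Error, so we can reproduce it by hand.
set.seed(2)
d_demo <- data.frame(x = rnorm(30))
d_demo$y <- 1 + 0.5 * d_demo$x + rnorm(30)
ct <- summary(lm(y ~ x, data = d_demo))$coefficients
ct[, "t value"]                        # as reported by summary()
ct[, "Estimate"] / ct[, "Std. Error"]  # recomputed by hand: same numbers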
#
# Make a new model without FrancePop; we saw above it wasn't significant.
model4 <- lm(Price ~ AGST + HarvestRain + WinterRain + Age, data = wine)
summary(model4)
# All of a sudden Age is significant, where before it wasn't: this is
# multicollinearity at work. Adjusted R^2 increased. Age and FrancePop
# were highly correlated, so each masked the other's effect.
cor(wine$WinterRain, wine$Price) # correlation between two columns
cor(wine$Age, wine$FrancePop)    # highly correlated
cor(wine)                        # full correlation matrix, similar to a heatmap
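# The masking effect is easy to reproduce on synthetic data (illustrative
# names, not the wine set): when two predictors are near-copies of each
# other, both get inflated standard errors and weak t values, even though
# either one alone is clearly significant.
set.seed(3)
x1 <- rnorm(50)
x2 <- x1 + rnorm(50, sd = 0.01)          # near-perfect copy of x1
y_mc <- 2 * x1 + rnorm(50)
summary(lm(y_mc ~ x1 + x2))$coefficients # large std. errors, small |t|
summary(lm(y_mc ~ x1))$coefficients      # x1 clearly significant on its own
cor(x1, x2)                              # close to 1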
# Let's see what happens if we take Age and FrancePop out at the same time.
model5 <- lm(Price ~ AGST + HarvestRain + WinterRain, data = wine)
summary(model5)
# R^2 dropped; the model with Age has a higher R^2.
# Age also makes intuitive sense, since older wine tends to be more expensive.
# Typically a correlation greater than 0.7 or less than -0.7 is cause for concern.
# ---
# Accuracy of the model on test data: out-of-sample accuracy.
str(wine_test)
PredictTest <- predict(model4, newdata = wine_test)
PredictTest
# Actual prices were 6.95 and 6.5 (see str(wine_test)).
SSE <- sum((wine_test$Price - PredictTest)^2)
SST <- sum((wine_test$Price - mean(wine$Price))^2)
1 - SSE/SST # test R^2, about 0.79
# The test R^2 matters: adding variables never decreases the training R^2,
# but the test R^2 can fluctuate, or even go down.
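# Self-contained sketch of the out-of-sample R^2 above, on synthetic data
# (illustrative names): note that SST uses the mean of the TRAINING
# response as the baseline prediction, exactly as in the wine example.
set.seed(4)
train_demo <- data.frame(x = rnorm(40)); train_demo$y <- 1 + 2 * train_demo$x + rnorm(40)
test_demo  <- data.frame(x = rnorm(10)); test_demo$y  <- 1 + 2 * test_demo$x  + rnorm(10)
fit_oos  <- lm(y ~ x, data = train_demo)
pred_oos <- predict(fit_oos, newdata = test_demo)
sse_oos <- sum((test_demo$y - pred_oos)^2)
sst_oos <- sum((test_demo$y - mean(train_demo$y))^2)
1 - sse_oos / sst_oos # unlike training R^2, this can be negative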