###########################################
# Father and son heights model estimation
###########################################
# assume that this is the entire population:
library(UsingR)   # provides the father.son dataset
library(rafalib)  # provides mypar(), used below for plotting panels
x <- father.son$fheight
y <- father.son$sheight
n <- length(y)
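# Because we treat father.son as the entire population, the least-squares
# fit on all the data gives the "true" beta that the samples below estimate.
# (A sketch; the name beta_true is ours, not part of the original script.)
beta_true <- lm(y ~ x)$coef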
# Run a Monte Carlo simulation in which we repeatedly
# take a sample of size 50 and compute the LSE.
betahat <- replicate(1000, {
  index <- sample(n, 50)
  sampledat <- father.son[index, ]
  x <- sampledat$fheight
  y <- sampledat$sheight
  lm(y ~ x)$coef  # the last expression is the value of each replicate
})
# replicate() returns a 2 x 1000 matrix; transpose so that column 1
# holds the intercept estimates and column 2 the slope estimates
betahat <- t(betahat)
mypar(1, 2)  # two plotting panels side by side
qqnorm(betahat[, 1])  # intercept estimates
qqline(betahat[, 1])
qqnorm(betahat[, 2])  # slope estimates
qqline(betahat[, 2])
# Correlation of the two estimates, betahat[,1] and betahat[,2]:
cor(betahat[,1],betahat[,2])
## [1] -0.9992293
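# Why so strongly negative? The intercept is the fitted value at x = 0,
# far outside the range of observed heights, so a steeper slope forces a
# lower intercept. A quick sketch (centering x is our addition, not part
# of the original script): with a centered predictor the two estimates
# become essentially uncorrelated.
betahat_c <- replicate(1000, {
  index <- sample(n, 50)
  sampledat <- father.son[index, ]
  xc <- sampledat$fheight - mean(sampledat$fheight)  # centered predictor
  lm(sampledat$sheight ~ xc)$coef
})
cor(betahat_c[1, ], betahat_c[2, ])  # close to 0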
# This pair of estimates has a variance-covariance matrix.
# The covariance of two random variables is the expected product of their
# deviations from their means, estimated here with the sample mean:
mean((betahat[, 1] - mean(betahat[, 1])) * (betahat[, 2] - mean(betahat[, 2])))
## [1] -1.035291
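# As a sanity check (a sketch), the built-in estimators should agree with
# the manual calculation above, up to the n versus n-1 denominator:
cov(betahat[, 1], betahat[, 2])  # sample covariance
cov(betahat)                     # full 2 x 2 variance-covariance matrix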
## Estimate the standard errors
n <- nrow(father.son)
N <- 50
index <- sample(n, N)  # no set.seed(), so the numbers below vary run to run
sampledat <- father.son[index, ]
x <- sampledat$fheight
y <- sampledat$sheight
X <- model.matrix(~x)            # design matrix: intercept column plus x
N <- nrow(X)
p <- ncol(X)
XtXinv <- solve(crossprod(X))    # (X'X)^-1
resid <- y - X %*% XtXinv %*% crossprod(X, y)  # y minus fitted values X(X'X)^-1 X'y
s <- sqrt(sum(resid^2) / (N - p))  # estimate of the error standard deviation
ses <- sqrt(diag(XtXinv)) * s      # standard errors of the LSE
print(ses)
## (Intercept) x
## 8.3899781 0.1240767
# Compare with the output of lm
summary(lm(y~x))$coef[,2]
## (Intercept) x
## 8.3899781 0.1240767
# They are identical because both compute the same quantity.
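# The same algebra gives the full estimated variance-covariance matrix of
# the LSE, s^2 (X'X)^-1. As a check (a sketch), it should match vcov()
# applied to the fitted lm object:
s^2 * XtXinv
vcov(lm(y ~ x))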
# Compare with the Monte Carlo results:
apply(betahat,2,sd)
## (Intercept) x
## 8.3817556 0.1237362
# CLT and t-distribution
# We have shown how to obtain standard errors for our estimates.
# However, to perform inference we need to know the distribution of these
# random variables. The reason we went through the effort of computing the
# standard errors is that the CLT applies in linear models:
# if N is large enough, the LSE will be normally distributed with mean beta
# and the standard errors described above.
# For small samples, if the error terms epsilon are normally distributed,
# then (betahat - beta), divided by its estimated standard error, follows
# a t-distribution. We do not derive this here, but the results are
# extremely useful, since this is how we construct p-values and
# confidence intervals in the context of linear models.
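# A minimal sketch of that construction, reusing the sample drawn above
# (the names betahat_ols, tstat, pval, and ci are ours): t-statistics are
# the estimates divided by their standard errors, and p-values come from a
# t-distribution with N - p degrees of freedom; both should match summary(lm).
betahat_ols <- XtXinv %*% crossprod(X, y)  # LSE: (X'X)^-1 X'y
tstat <- betahat_ols / ses
pval <- 2 * pt(-abs(tstat), df = N - p)
cbind(tstat, pval)
summary(lm(y ~ x))$coef[, 3:4]  # compare: "t value" and "Pr(>|t|)" columns
# 95% confidence intervals: estimate +/- t quantile times standard error
ci <- cbind(betahat_ols - qt(0.975, N - p) * ses,
            betahat_ols + qt(0.975, N - p) * ses)
ci
confint(lm(y ~ x))  # should agree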