Skip to content

Commit 97dd41f

Browse files
Initial commit
0 parents  commit 97dd41f

11 files changed

+1440
-0
lines changed

.gitattributes

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Auto detect text files and perform LF normalization
2+
* text=auto

.gitignore

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# History files
2+
.Rhistory
3+
.Rapp.history
4+
5+
# Session Data files
6+
.RData
7+
8+
# Example code in package build process
9+
*-Ex.R
10+
11+
# Output files from R CMD build
12+
/*.tar.gz
13+
14+
# Output files from R CMD check
15+
/*.Rcheck/
16+
17+
# RStudio files
18+
.Rproj.user/
19+
20+
# produced vignettes
21+
vignettes/*.html
22+
vignettes/*.pdf
23+
24+
# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
25+
.httr-oauth
26+
27+
# knitr and R markdown default cache directories
28+
/*_cache/
29+
/cache/
30+
31+
# Temporary files created by R markdown
32+
*.utf8.md
33+
*.knit.md
34+
35+
# Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
36+
rsconnect/

README.md

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
2+
# How to conduct a simulation study
3+
4+
This is an example/template of a simple simulation study in `R`.
5+
6+
In this example, we want to estimate the performance
7+
of a kernel-based estimator of the conditional mean.
8+
9+
10+
It is composed of the following files:
11+
12+
- `functions.R`: contains the functions necessary to do the estimation.
13+
14+
- `simulation.R`: the R script to run to do the simulations and save them as a csv file.
15+
16+
- `loadingData.R`: the R script that read the csv file and process it.
17+
18+
- `analysingSimulations.Rmd`: the R Markdown script that produces the figures and tables.
19+

Simulation study.Rproj

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
Version: 1.0
2+
3+
RestoreWorkspace: Default
4+
SaveWorkspace: Default
5+
AlwaysSaveHistory: Default
6+
7+
EnableCodeIndexing: Yes
8+
UseSpacesForTab: Yes
9+
NumSpacesForTab: 2
10+
Encoding: UTF-8
11+
12+
RnwWeave: Sweave
13+
LaTeX: pdfLaTeX
14+
15+
AutoAppendNewline: Yes

analysingSimulations.Rmd

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
---
2+
title: "Analysing data"
3+
author: "Alexis Derumigny"
4+
date: "03/03/2022"
5+
output:
6+
pdf_document: default
7+
html_document: default
8+
---
9+
10+
```{r setup, include=TRUE}
11+
library(tidyverse)
12+
```
13+
14+
We first load the data of the simulations that we did.
15+
16+
```{r loading data}
17+
source("loadingData.R")
18+
```
19+
20+
We can now print the summary statistics.
21+
22+
```{r summary}
23+
summarisedData %>%
24+
select(all_of(c("n", "h", "MSE", "meanComputationTime")))
25+
```
26+
27+
28+
We plot now the mean-squared error as a function of the bandwidth $h$.
29+
30+
31+
```{r}
32+
summarisedData %>%
33+
ggplot(aes(x = h, y = MSE)) +
34+
geom_line() +
35+
scale_x_log10() +
36+
scale_y_log10()
37+
```
38+
39+
40+
We can also plot the distribution of the computation time as a function of $h$.
41+
42+
```{r}
43+
totalData %>%
44+
mutate( h_ = factor(h, levels = sort(unique(h))) ) %>%
45+
ggplot(aes(x = h_, y = computationTime)) +
46+
geom_boxplot() +
47+
ylab("Computation time (s)")
48+
```
49+
50+

analysingSimulations.html

+262
Large diffs are not rendered by default.

analysingSimulations.pdf

123 KB
Binary file not shown.

functions.R

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
2+
3+
# Estimator =====================================
4+
5+
6+
7+
#' Estimate the conditional mean of Y given X = x0
8+
#'
9+
#' @param vec_x observed data of X
10+
#' @param vec_y observed data of Y
11+
#' @param h the bandwidth used for kernel smoothing
12+
#' @param x0 the point at which the estimation is done
13+
#'
14+
#' @return an estimator of the conditional mean
15+
#'
16+
EstimCondMean <- function(vec_x, vec_y, h, x0)
17+
{
18+
diff_x = (vec_x - x0)
19+
weights = (1/h) * exp( - (diff_x / h)^2 / 2)
20+
21+
estimator = sum(vec_y * weights) / sum(weights)
22+
return(estimator)
23+
}
24+

loadingData.R

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
2+
3+
# Loading of the libraries
4+
library(tidyverse)
5+
6+
# Specifying the files to be load
7+
filenames = c("simus_1.csv")
8+
9+
# Reading the data
10+
totalData = filenames %>%
11+
map( function(x){return(if(file.exists(x)){x} else {NULL})} ) %>%
12+
unlist() %>%
13+
map_dfr( read.csv, sep = ";", dec = ".", header = TRUE)
14+
15+
16+
# Summarizing the simulation
17+
# to create the mean squared error (MSE)
18+
# and mean computation time
19+
summarisedData = totalData %>%
20+
group_by(n, modelName, mean_x, sd_x, sd_epsilon, x0, h) %>%
21+
summarise(MSE = mean(estimError^2),
22+
Bias = mean(estimError),
23+
Sd_MSE = sd(estimError^2, na.rm = TRUE),
24+
25+
nReplications = n(),
26+
27+
MSE_q05 = MSE - 1.96 * Sd_MSE / nReplications,
28+
MSE_q95 = MSE + 1.96 * Sd_MSE / nReplications,
29+
30+
meanComputationTime = mean(computationTime),
31+
sdComputationTime = sd(computationTime),
32+
33+
.groups = "keep") %>%
34+
ungroup()
35+

simulation.R

+96
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
2+
# 1- Loading the libraries ==================================
3+
library(pbapply)
4+
source("functions.R")
5+
6+
7+
# 2- Creating the file if it doesn't exists =================
8+
nameFile = "simus_1.csv"
9+
10+
if (!file.exists(nameFile)){
11+
12+
caracDGP = c("n", "modelName", "mean_x", "sd_x", "sd_epsilon")
13+
caracEstimation = c("x0", "trueCondMean")
14+
caracEstimator = c("h")
15+
caracResult = c("estimError", "computationTime")
16+
17+
write.table(
18+
x = paste0(c(caracDGP, caracEstimation,
19+
caracEstimator, caracResult), collapse=";") ,
20+
21+
file = nameFile,
22+
append = F, sep = ";", col.names = FALSE, row.names = FALSE,
23+
quote = FALSE)
24+
}
25+
26+
27+
# 3- Characteristics of the simulation =====================
28+
29+
n = 50000
30+
31+
# Data-generating process:
32+
modelName = "Y = X^2 + epsilon"
33+
mean_x = 5
34+
sd_x = 2
35+
sd_epsilon = 0.1
36+
37+
38+
# 4- Characteristics of the estimation ======================
39+
40+
grid_h = c(0.001, 0.002, 0.005,
41+
0.01, 0.02, 0.05,
42+
0.1, 0.2, 0.5)
43+
x0 = 1
44+
45+
46+
# 5- Simulations ===========================================
47+
48+
Nreplications = 100
49+
number_steps = Nreplications * length(grid_h)
50+
51+
pb = pbapply::startpb(min = 0, max = number_steps)
52+
i_step = 0
53+
for (iReplication in 1:Nreplications)
54+
{
55+
for (i_h in 1:length(grid_h))
56+
{
57+
h = grid_h[i_h]
58+
59+
## Generating data ---------------------------------------
60+
61+
vec_x = rnorm(n = n, mean = mean_x, sd = sd_x)
62+
epsilon = rnorm(n = n, mean = 0, sd = sd_epsilon)
63+
vec_y = vec_x^2 + epsilon
64+
65+
trueCondMean = x0^2
66+
67+
## Estimation --------------------------------------------
68+
69+
time1 = proc.time()
70+
estimatedCondMean = EstimCondMean(vec_x = vec_x, vec_y = vec_y,
71+
h = h, x0 = x0)
72+
time2 = proc.time()
73+
computationTime = (time2 - time1)[3]
74+
75+
## Storing the result ------------------------------------
76+
77+
estimError = estimatedCondMean - trueCondMean
78+
79+
toWrite1 = c(n, modelName, mean_x, sd_x, sd_epsilon,
80+
x0, trueCondMean,
81+
h,
82+
estimError, computationTime)
83+
84+
write.table(
85+
x = matrix(toWrite1, nrow = 1) ,
86+
file = nameFile ,
87+
append = T, sep = ";", col.names = FALSE, row.names = FALSE
88+
)
89+
90+
i_step = i_step + 1
91+
pbapply::setpb(pb, i_step)
92+
}
93+
}
94+
95+
pbapply::closepb(pb)
96+

0 commit comments

Comments
 (0)