Initial commit

AlexisDerumigny · AlexisDerumigny · commit 97dd41f1ae27 · 2022-03-11T14:09:23.000+01:00
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1,2 @@
+# Auto detect text files and perform LF normalization
+* text=auto
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,36 @@
+# History files
+.Rhistory
+.Rapp.history
+
+# Session Data files
+.RData
+
+# Example code in package build process
+*-Ex.R
+
+# Output files from R CMD build
+/*.tar.gz
+
+# Output files from R CMD check
+/*.Rcheck/
+
+# RStudio files
+.Rproj.user/
+
+# produced vignettes
+vignettes/*.html
+vignettes/*.pdf
+
+# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
+.httr-oauth
+
+# knitr and R markdown default cache directories
+/*_cache/
+/cache/
+
+# Temporary files created by R markdown
+*.utf8.md
+*.knit.md
+
+# Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
+rsconnect/
diff --git a/README.md b/README.md
@@ -0,0 +1,19 @@
+
+# How to conduct a simulation study
+
+This is an example/template of a simple simulation study in `R`.
+
+In this example, we want to estimate the performance
+of a kernel-based estimator of the conditional mean.
+
+
+It is composed of the following files:
+
+- `functions.R`: contains the functions necessary to do the estimation.
+
+- `simulation.R`: the R script to run to do the simulations and save them as a csv file.
+
+- `loadingData.R`: the R script that reads the csv file and processes it.
+
+- `analysingSimulations.Rmd`: the R Markdown script that produces the figures and tables.
+
diff --git a/Simulation study.Rproj b/Simulation study.Rproj
@@ -0,0 +1,15 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+AutoAppendNewline: Yes
diff --git a/analysingSimulations.Rmd b/analysingSimulations.Rmd
@@ -0,0 +1,50 @@
+---
+title: "Analysing data"
+author: "Alexis Derumigny"
+date: "03/03/2022"
+output:
+  pdf_document: default
+  html_document: default
+---
+
+```{r setup, include=TRUE}
+library(tidyverse)
+```
+
+We first load the data of the simulations that we did.
+
+```{r loading data}
+source("loadingData.R")
+```
+
+We can now print the summary statistics.
+
+```{r summary}
+# Keep only the columns of interest; the result is printed by the chunk.
+select(summarisedData,
+       all_of(c("n", "h", "MSE", "meanComputationTime")))
+```
+
+
+We now plot the mean-squared error as a function of the bandwidth $h$.
+
+
+```{r}
+# MSE against the bandwidth h, with both axes on a log10 scale.
+ggplot(summarisedData, aes(x = h, y = MSE)) +
+  geom_line() +
+  scale_x_log10() +
+  scale_y_log10()
+```
+
+
+We can also plot the distribution of the computation time as a function of $h$.
+
+```{r}
+# One boxplot per bandwidth: h is turned into a factor (levels in increasing
+# order) so that each value of h gets its own box on the x axis.
+ggplot(
+  mutate(totalData, h_ = factor(h, levels = sort(unique(h)))),
+  aes(x = h_, y = computationTime)
+) +
+  geom_boxplot() +
+  ylab("Computation time (s)")
+```
+
+
diff --git a/analysingSimulations.html b/analysingSimulations.html
diff --git a/analysingSimulations.pdf b/analysingSimulations.pdf
diff --git a/functions.R b/functions.R
@@ -0,0 +1,24 @@
+
+
+# Estimator =====================================
+
+
+
+#' Estimate the conditional mean of Y given X = x0
+#'
+#' Kernel-weighted average of `vec_y` (Nadaraya-Watson form) with a Gaussian
+#' kernel: each observation is weighted by `exp(-((vec_x - x0) / h)^2 / 2)`.
+#'
+#' @param vec_x observed data of X (numeric vector)
+#' @param vec_y observed data of Y (numeric vector, same length as `vec_x`)
+#' @param h the bandwidth used for kernel smoothing (one positive number)
+#' @param x0 the point at which the estimation is done
+#'
+#' @return an estimator of the conditional mean
+#'  (`NaN` if `vec_x` and `vec_y` are empty)
+#'
+EstimCondMean <- function(vec_x, vec_y, h, x0)
+{
+  # Mismatched lengths would otherwise be silently recycled,
+  # and a bandwidth that is not strictly positive is meaningless.
+  stopifnot(length(vec_x) == length(vec_y),
+            is.numeric(h), length(h) == 1, h > 0)
+
+  # No observation: the weighted average is 0 / 0.
+  if (length(vec_x) == 0) {
+    return(NaN)
+  }
+
+  # Gaussian kernel on the log scale. The usual (1/h) normalising factor is
+  # omitted because it cancels between the numerator and the denominator.
+  log_weights = - ((vec_x - x0) / h)^2 / 2
+
+  # Shifting by the largest log-weight leaves the ratio below unchanged but
+  # guarantees that at least one weight equals 1. Without it, a small h can
+  # make every weight underflow to 0 and the estimator becomes 0 / 0 = NaN.
+  weights = exp(log_weights - max(log_weights))
+
+  estimator = sum(vec_y * weights) / sum(weights)
+  return(estimator)
+}
+
diff --git a/loadingData.R b/loadingData.R
@@ -0,0 +1,35 @@
+
+
+# Loading of the libraries
+library(tidyverse)
+
+# Specifying the files to be loaded
+filenames = c("simus_1.csv")
+
+# Reading the data:
+# files that do not exist are skipped, and the rows of all the remaining
+# files are stacked into a single data frame.
+totalData = map_dfr(filenames[file.exists(filenames)],
+                    read.csv, sep = ";", dec = ".", header = TRUE)
+
+
+# Summarizing the simulation
+# to create the mean squared error (MSE)
+# and mean computation time.
+#
+# One row is produced per simulation setting, i.e. per combination of
+# data-generating process (n, modelName, mean_x, sd_x, sd_epsilon),
+# estimation point (x0) and bandwidth (h).
+summarisedData = totalData %>%
+  group_by(n, modelName, mean_x, sd_x, sd_epsilon, x0, h) %>%
+  summarise(MSE = mean(estimError^2),
+            Bias = mean(estimError),
+            Sd_MSE = sd(estimError^2, na.rm = TRUE),
+            
+            nReplications = n(),
+            
+            # Normal-approximation confidence bounds for the MSE:
+            # MSE -/+ 1.96 * standard error, where the standard error of a
+            # mean over nReplications draws is Sd_MSE / sqrt(nReplications).
+            # Note: 1.96 gives a two-sided 95% interval (2.5% and 97.5%
+            # bounds); the column names are kept as they are for
+            # compatibility with code using them.
+            MSE_q05 = MSE - 1.96 * Sd_MSE / sqrt(nReplications),
+            MSE_q95 = MSE + 1.96 * Sd_MSE / sqrt(nReplications),
+            
+            meanComputationTime = mean(computationTime),
+            sdComputationTime = sd(computationTime),
+            
+            .groups = "keep") %>%
+  ungroup()
+
diff --git a/simulation.R b/simulation.R
@@ -0,0 +1,96 @@
+
+# 1- Loading the libraries ==================================
+library(pbapply)
+source("functions.R")
+
+
+# 2- Creating the file if it doesn't exist ==================
+nameFile = "simus_1.csv"
+
+# The header line is written only once: if the file is already there,
+# it is left untouched and the simulations below are appended to it.
+if (!file.exists(nameFile)){
+  
+  # Column names, grouped by role. Their order must match the order of the
+  # values written for each simulation (see `toWrite1` below).
+  caracDGP = c("n", "modelName", "mean_x", "sd_x", "sd_epsilon")
+  caracEstimation = c("x0", "trueCondMean")
+  caracEstimator = c("h")
+  caracResult = c("estimError", "computationTime")
+  
+  write.table(
+    x = paste0(c(caracDGP, caracEstimation,
+                 caracEstimator, caracResult), collapse=";") ,
+    
+    file = nameFile,
+    # `FALSE` rather than `F`: `F` is an ordinary variable that can be
+    # reassigned, `FALSE` is a reserved word.
+    append = FALSE, sep = ";", col.names = FALSE, row.names = FALSE,
+    quote = FALSE)
+}
+
+
+# 3- Characteristics of the simulation  =====================
+
+# Number of observations drawn for each simulated dataset
+n = 50000
+
+# Data-generating process:
+# `modelName` is only a label stored in the csv file; the data themselves
+# are generated in section 5 as vec_x^2 + epsilon.
+modelName = "Y = X^2 + epsilon"
+# Mean and standard deviation of the normal distribution used to draw X
+mean_x = 5
+sd_x = 2
+# Standard deviation of the centered normal noise epsilon
+sd_epsilon = 0.1
+
+
+# 4- Characteristics of the estimation ======================
+
+# Bandwidths for which the estimator is computed
+grid_h = c(0.001, 0.002, 0.005,
+           0.01, 0.02, 0.05,
+           0.1, 0.2, 0.5)
+# Point at which the conditional mean is estimated
+x0 = 1
+
+
+# 5-  Simulations ===========================================
+
+# For each replication and each bandwidth of `grid_h`, a new dataset is
+# generated, the conditional mean at `x0` is estimated, and one line
+# (setting, estimation error, computation time) is appended to `nameFile`.
+
+Nreplications = 100
+number_steps = Nreplications * length(grid_h)
+
+# True conditional mean at x0 under the model Y = X^2 + epsilon
+# (epsilon is centered). It is the same for every replication and every
+# bandwidth, so it is computed once, outside of the loops.
+trueCondMean = x0^2
+
+pb = pbapply::startpb(min = 0, max = number_steps)
+i_step = 0
+# seq_len() / seq_along() give empty loops when there is nothing to do,
+# whereas 1:0 would iterate over c(1, 0).
+for (iReplication in seq_len(Nreplications))
+{
+  for (i_h in seq_along(grid_h))
+  {
+    h = grid_h[i_h]
+    
+    ## Generating data ---------------------------------------
+    
+    # A fresh sample is drawn for each (replication, bandwidth) pair.
+    vec_x = rnorm(n = n, mean = mean_x, sd = sd_x)
+    epsilon = rnorm(n = n, mean = 0, sd = sd_epsilon)
+    vec_y = vec_x^2 + epsilon
+    
+    ## Estimation --------------------------------------------
+    
+    # Only the call to the estimator is timed.
+    time1 = proc.time()
+    estimatedCondMean = EstimCondMean(vec_x = vec_x, vec_y = vec_y,
+                                      h = h, x0 = x0)
+    time2 = proc.time()
+    # Wall-clock time in seconds, selected by name rather than by position.
+    computationTime = (time2 - time1)[["elapsed"]]
+    
+    ## Storing the result ------------------------------------
+    
+    estimError = estimatedCondMean - trueCondMean
+    
+    # Same order as the header written in section 2.
+    # Because `modelName` is a string, c() turns every value into a string.
+    toWrite1 = c(n, modelName, mean_x,  sd_x, sd_epsilon,
+                 x0, trueCondMean,
+                 h,
+                 estimError, computationTime)
+    
+    # The result is appended immediately, so the simulations already done
+    # are kept on disk if the script is interrupted.
+    write.table(
+      x = matrix(toWrite1, nrow = 1) ,
+      file = nameFile ,
+      append = TRUE, sep = ";", col.names = FALSE, row.names = FALSE
+    )
+    
+    i_step = i_step + 1
+    pbapply::setpb(pb, i_step)
+  }
+}
+
+pbapply::closepb(pb)
+
diff --git a/simus_1.csv b/simus_1.csv

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# Auto detect text files and perform LF normalization`
	`2`	`+* text=auto`