fix check notes/errors, update vignette title

statdivlab · Jan 12, 2021 · b111946 · b111946
1 parent 3168db4
commit b111946
Show file tree

Hide file tree

Showing 38 changed files with 281 additions and 200 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -25,7 +25,8 @@ Imports:
     rstantools (>= 1.5.1),
     tibble,
     dplyr,
-    magrittr
+    magrittr,
+    rlang
 LinkingTo:
     BH (>= 1.69.0-1),
     Rcpp (>= 1.0.0),

diff --git a/R/extract_posterior_summaries.R b/R/extract_posterior_summaries.R
@@ -20,32 +20,32 @@
 #' }
 #'
 #' @examples
-#' ## load the package, read in example data
+#' # load the package, read in example data
 #' library("paramedic")
 #' data(example_16S_data)
 #' data(example_qPCR_data)
 #'
-#' ## run paramedic (with an extremely small number of iterations, for illustration only)
-#' ## on only the first 10 taxa
+#' # run paramedic (with an extremely small number of iterations, for illustration only)
+#' # on only the first 10 taxa
 #' mod <- run_paramedic(W = example_16S_data[, 1:10], V = example_qPCR_data,
 #' n_iter = 30, n_burnin = 25, 
 #' n_chains = 1, stan_seed = 4747)
 #'
-#' ## get summary, samples
+#' # get summary, samples
 #' mod_summ <- rstan::summary(mod, probs = c(0.025, 0.975))$summary
 #' mod_samps <- rstan::extract(mod)
 #'
-#' ## extract relevant summaries
+#' # extract relevant summaries
 #' summs <- extract_posterior_summaries(stan_mod = mod_summ, stan_samps = mod_samps, 
 #' taxa_of_interest = 1:3,
 #' mult_num = 1, level = 0.95, interval_type = "wald")
 #'
 #' @export
 extract_posterior_summaries <- function(stan_mod, stan_samps, taxa_of_interest, mult_num = 1, level = 0.95, interval_type = "wald") {
-  ## check to make sure all taxa of interest are actual taxa
+  # check to make sure all taxa of interest are actual taxa
   check_taxa <- lapply(as.list(taxa_of_interest), function(x) any(grepl(paste0(",", x, "]"), rownames(stan_mod), fixed = TRUE)))
   if (!any(unlist(check_taxa))) stop("One or more of your taxa of interest are not present in the sampling output. Please specify only taxa for which you have samples.")
-  ## get the posterior estimates
+  # get the posterior estimates
   mu_summ_lst <- lapply(as.list(taxa_of_interest), function(x) stan_mod[grepl("mu", rownames(stan_mod)) & !grepl("log", rownames(stan_mod)) & grepl(paste0(",", x, "]"), rownames(stan_mod), fixed = TRUE), c(1, 3, 4, 5)]*mult_num)
   est_lst <- lapply(mu_summ_lst, function(x) x[, 1])
   sd_lst <- lapply(mu_summ_lst, function(x) x[, 2])
@@ -55,20 +55,20 @@ extract_posterior_summaries <- function(stan_mod, stan_samps, taxa_of_interest,
   rownames(estimates) <- as.character(1:dim(estimates)[1])
   rownames(sd) <- as.character(1:dim(sd)[1])
 
-  ## get cis
+  # get cis
   credible_intervals <- sapply(1:length(mu_summ_lst), function(x) mu_summ_lst[[x]][, c(3, 4)], simplify = "array")
   rownames(credible_intervals) <- as.character(1:dim(estimates)[1])
 
-  ## get prediction intervals
+  # get prediction intervals
   if (interval_type == "wald") {
     intervals <- sapply(1:length(taxa_of_interest), function(x) gen_wald_interval(estimates[, x], sd[, x], alpha = 1 - level), simplify = "array")
   } else if (interval_type == "quantile") {
     intervals <- sapply(1:length(taxa_of_interest), function(x) gen_quantile_interval(mu_quantiles = credible_intervals[, , x], mu_samps = stan_samps$mu[, , taxa_of_interest[x]], div_num = mult_num, alpha = 1 - level, type = "credible_quantiles"), simplify = "array")
-  } else { ## next, add quantile
+  } else { # next, add quantile
     stop("Unsupported prediction interval type. Please enter one of 'quantile' or 'wald'.")
   }
 
-  ## extract summaries of varying efficiency, if they exist
+  # extract summaries of varying efficiency, if they exist
   if (any(grepl("e", rownames(stan_mod)) & !grepl("beta", rownames(stan_mod)))) {
     e <- stan_mod[grepl("e", rownames(stan_mod)) & !grepl("beta", rownames(stan_mod)), 1]
     e_intervals <- stan_mod[grepl("e", rownames(stan_mod)) &!grepl("beta", rownames(stan_mod)), c(4, 5)]

diff --git a/R/gen_quantile_interval.R b/R/gen_quantile_interval.R
@@ -11,21 +11,21 @@
 #' @return A (1 - \eqn{\alpha})x100\% Poisson-quantile-based prediction interval for each qPCR
 #'
 #' @examples
-#' ## load the package, read in example data
+#' # load the package, read in example data
 #' library("paramedic")
 #' data(example_16S_data)
 #' data(example_qPCR_data)
 #'
-#' ## run paramedic (with an extremely small number of iterations, for illustration only)
-#' ## on only the first 10 taxa
+#' # run paramedic (with an extremely small number of iterations, for illustration only)
+#' # on only the first 10 taxa
 #' mod <- run_paramedic(W = example_16S_data[, 1:10], V = example_qPCR_data,
 #' n_iter = 30, n_burnin = 25, 
 #' n_chains = 1, stan_seed = 4747)
-#' ## get model summary
+#' # get model summary
 #' mod_summ <- rstan::summary(mod, probs = c(0.025, 0.975))$summary
-#' ## get samples
+#' # get samples
 #' mod_samps <- rstan::extract(mod)
-#' ## extract relevant summaries
+#' # extract relevant summaries
 #' summs <- extract_posterior_summaries(stan_mod = mod_summ, stan_samps = mod_samps, 
 #' taxa_of_interest = 1:3,
 #' mult_num = 1, level = 0.95, interval_type = "quantile")
@@ -39,13 +39,13 @@ gen_quantile_interval <- function(mu_quantiles, mu_samps, alpha = 0.05, type = "
     upper_limits <- stats::qpois(p = 1 - alpha/2, lambda = mu_quantiles[, 2])
     pred_intervals <- cbind(lower_limits, upper_limits)
   } else if (type == "sample_quantiles") {
-    ## check to make sure it's an array of dimension 3
+    # check to make sure it's an array of dimension 3
     if (length(dim(mu_samps)) != 3) mu_samps <- array(mu_samps, dim = c(dim(mu_samps), 1))
 
-    ## sample from Poisson
+    # sample from Poisson
     v_samps <- apply(mu_samps, c(1, 2, 3), function(x) stats::rpois(1, x))
 
-    ## compute quantiles for each (i,j) pair
+    # compute quantiles for each (i,j) pair
     quantiles <- apply(v_samps, c(2, 3), function(x) stats::quantile(x, probs = c(alpha/2, 1 - alpha/2)))
 
     pred_intervals <- matrix(quantiles, nrow = dim(quantiles)[2]*dim(quantiles)[3], ncol = 2, byrow = TRUE)

diff --git a/R/gen_wald_interval.R b/R/gen_wald_interval.R
@@ -10,37 +10,37 @@
 #' @return A (1 - \eqn{\alpha})x100\% Wald-type prediction interval for each qPCR
 #'
 #' @examples
-#' ## load the package, read in example data
+#' # load the package, read in example data
 #' library("paramedic")
 #' data(example_16S_data)
 #' data(example_qPCR_data)
 #'
-#' ## run paramedic (with an extremely small number of iterations, for illustration only)
-#' ## on only first 10 taxa
+#' # run paramedic (with an extremely small number of iterations, for illustration only)
+#' # on only first 10 taxa
 #' mod <- run_paramedic(W = example_16S_data[, 1:10], V = example_qPCR_data,
 #' n_iter = 30, n_burnin = 25, 
 #' n_chains = 1, stan_seed = 4747)
-#' ## get model summary
+#' # get model summary
 #' mod_summ <- rstan::summary(mod, probs = c(0.025, 0.975))$summary
-#' ## get samples
+#' # get samples
 #' mod_samps <- rstan::extract(mod)
-#' ## extract relevant summaries
+#' # extract relevant summaries
 #' summs <- extract_posterior_summaries(stan_mod = mod_summ, stan_samps = mod_samps, 
 #' taxa_of_interest = 1:3,
 #' mult_num = 1, level = 0.95, interval_type = "wald")
 #'
 #' @export
 gen_wald_interval <- function(mu, sd, alpha = 0.05, truncate = TRUE) {
-  ## check if it is a matrix first; if so, make it a vector
+  # check if it is a matrix first; if so, make it a vector
   if (is.matrix(mu)) mu <- as.vector(t(mu))
   if (is.matrix(sd)) sd <- as.vector(t(sd))
-  ## estimated variance
+  # estimated variance
   est_var <- mu + sd^2
 
-  ## get intervals
+  # get intervals
   pred_intervals <- mu + sqrt(est_var) %o% stats::qnorm(c(alpha/2, 1 - alpha/2))
 
-  ## truncate
+  # truncate
   if (truncate) {
     pred_intervals[, 1] <- ifelse(pred_intervals[, 1] < 0, 0, pred_intervals[, 1])
   }

diff --git a/R/predict_paramedic.R b/R/predict_paramedic.R
@@ -1,6 +1,6 @@
 #' Generate predictions based on a paramedic model fit
 #'
-#' Predict concentrations (and efficiencies, if applicable), absolute abundances, and relative abundances based on the posterior distributions from a previously fitted model resulting from a call to \code{run_paramedic.
+#' Predict concentrations (and efficiencies, if applicable), absolute abundances, and relative abundances based on the posterior distributions from a previously fitted model resulting from a call to \code{run_paramedic}.
 #'
 #' @param W The new relative abundance data, e.g., from broad range 16S sequencing with "universal" primers. Expects data (e.g., matrix, data.frame, tibble) with sample identifiers in the first column. Sample identifiers must be the same between W and V, and the column must have the same name in W and V.
 #' @param V The new absolute abundance data, e.g., from taxon-specific absolute primers. Expects data (e.g., matrix, data.frame, tibble) with sample identifiers in the first column. Sample identifiers must be the same between W and V, and the column must have the same name in W and V.
@@ -27,16 +27,16 @@
 #' @details Using the posterior distributions from a call to \code{run_paramedic}, generate predicted values.
 #'
 #' @examples
-#' ## load the package, read in example data
+#' # load the package, read in example data
 #' library("paramedic")
 #' data(example_16S_data)
 #' data(example_qPCR_data)
-#' ## generate a train/test split
+#' # generate a train/test split
 #' set.seed(1234)
 #' folds <- sample(1:2, size = nrow(example_16S_data), replace = TRUE)
 #'
-#' ## run paramedic (with an extremely small number of iterations, for illustration only)
-#' ## on only the first 10 taxa, and on a subset of the data
+#' # run paramedic (with an extremely small number of iterations, for illustration only)
+#' # on only the first 10 taxa, and on a subset of the data
 #' mod <- run_paramedic(W = example_16S_data[folds == 1, 1:10], 
 #' V = example_qPCR_data[folds == 1, ], n_iter = 30, n_burnin = 25, 
 #' n_chains = 1, stan_seed = 4747)
@@ -46,8 +46,10 @@
 #' beta0_post <- ext_mod$beta_0
 #' sigma_post <- ext_mod$Sigma
 #' sigmae_post <- ext_mod$sigma_e
-#' pred_mod <- predict_paramedic(beta_0 = beta0_post, Sigma = sigma_post,
-#' sigma_e = sigmae_post, ) 
+#' pred_mod <- predict_paramedic(W = example_16S_data[folds == 2, 1:10], 
+#' V = example_qPCR_data[folds == 2, ], n_iter = 75, n_burnin = 25,
+#'  beta_0 = beta0_post, Sigma = sigma_post, sigma_e = sigmae_post, 
+#'  n_chains = 1, stan_seed = 1234) 
 #'
 #' @seealso \code{\link[rstan]{stan}} and \code{\link[rstan]{sampling}} for specific usage of the \code{stan} and \code{sampling} functions; \code{\link[paramedic]{run_paramedic}} for usage of the \code{run_paramedic} function.
 #'
@@ -62,23 +64,23 @@ predict_paramedic <- function(W, V, X = V[, 1, drop = FALSE], beta_0,
                           stan_seed = 4747, alpha_sigma = 2, kappa_sigma = 1, 
                           alpha_phi = 0, beta_phi = 0, sigma_xi = 1,
                           ...) {
-    ## --------------
-    ## error messages
-    ## --------------
+    # --------------
+    # error messages
+    # --------------
     check_entered_data(W, V, X, k, alpha_sigma = alpha_sigma, 
                        kappa_sigma = kappa_sigma)
-    ## ---------------------------
-    ## pre-processing and warnings
-    ## ---------------------------
+    # ---------------------------
+    # pre-processing and warnings
+    # ---------------------------
     pre_processed_lst <- make_paramedic_tibbles(W, V, X, k, 
                                                 alpha_sigma = alpha_sigma, 
                                                 kappa_sigma = kappa_sigma)
     W_mat <- pre_processed_lst$w_mat
     V_mat <- pre_processed_lst$v_mat
     X_mat <- pre_processed_lst$x_mat
-    ## ----------------------------------------
-    ## set up the data list
-    ## ----------------------------------------
+    # ----------------------------------------
+    # set up the data list
+    # ----------------------------------------
     N <- ifelse(k > 0, dim(W_mat)[2], dim(W_mat)[1])
     N_samples <- nrow(Sigma)
     q <- ifelse(k > 0, dim(W_mat)[3], dim(W_mat)[2])
@@ -91,9 +93,9 @@ predict_paramedic <- function(W, V, X = V[, 1, drop = FALSE], beta_0,
                      phi = phi, alpha_sigma = alpha_sigma, 
                      kappa_sigma = kappa_sigma, alpha_phi = alpha_phi,
                      beta_phi = beta_phi)
-    ## ----------------------
-    ## run the Stan algorithm
-    ## ----------------------
+    # ----------------------
+    # run the Stan algorithm
+    # ----------------------
     pars <- c("mu",
               switch((alpha_sigma > 0 & kappa_sigma > 0) + 1, NULL, "e"),
               "V", "W")

diff --git a/R/prep_data.R b/R/prep_data.R
@@ -130,6 +130,27 @@ make_paramedic_tibbles <- function(W, V, X, k, inits_lst,
 #' @param W_mat the pre-processed matrix W
 #' @param V_mat the pre-processed matrix V
 #' @param X_mat the pre-processed matrix X
+#' @param inits_lst An optional list of initial values of the parameters. 
+#' Must be a named list; see \code{\link[rstan]{stan}}.
+#' @param sigma_beta Hyperparameter specifying the prior variance on 
+#' \eqn{\beta_0}. Defaults to \eqn{\sqrt{50}}.
+#' @param sigma_Sigma Hyperparameter specifying the prior variance on 
+#' \eqn{\Sigma}. Defaults to \eqn{\sqrt{50}}.
+#' @param alpha_sigma Hyperparameter specifying the shape parameter of the 
+#' prior distribution on \eqn{\sigma_e}. Defaults to 2.
+#' @param kappa_sigma Hyperparameter specifying the scale parameter of the
+#'  prior distribution on \eqn{\sigma_e}. Defaults to 1.
+#' @param alpha_phi Hyperparameter specifying the shape parameter of the 
+#' prior distribution on \eqn{phi} (the optional Negative Binomial 
+#' dispersion parameter). Defaults to 0 (in which case a Poisson distribution
+#' on V is used).
+#' @param beta_phi Hyperparameter specifying the scale parameter of the 
+#' prior distribution on \eqn{phi} (the optional Negative Binomial 
+#' dispersion parameter). Defaults to 0 (in which case a Poisson distribution
+#' on V is used).
+#' @param n_chains the number of chains to run
+#' @param centered whether or not to use the centered parameterization of the 
+#' stan algorithm. Defaults to \code{FALSE}.
 #' @describeIn check_entered_data Make data and initial values lists to
 #'  pass to stan
 #' @importFrom stats var

diff --git a/R/process_data.R b/R/process_data.R
@@ -15,11 +15,11 @@
 #' @return a list containing two matrices: one with the observed absolute abundances, and the other with the observed relative abundances.
 #'
 #' @examples
-#' ## load the package, read in example data
+#' # load the package, read in example data
 #' library("paramedic")
 #' data(simple_example_data)
 #'
-#' ## process the example data
+#' # process the example data
 #' q <- 3
 #' q_obs <- 2
 #' processed_data <- paramedic::process_data(full_data = simple_example_data, rel_inds = 1:q,
@@ -30,40 +30,40 @@
 #'
 #' @export
 process_data <- function(full_data, rel_inds, abs_inds, abs_plus_rel_inds, regex_thr = "_t", regex_abs = "_cps", llod = 0, m_min = 1000, div_num = 100) {
-  ## helper function to process absolute abundance data
+  # helper function to process absolute abundance data
   process_absolute <- function(absolute, llod_inds, abs_inds, llod = 0, div_num = 100) {
-    ## subset with absolute abundance values
+    # subset with absolute abundance values
     sub_absolute <- absolute[, abs_inds]/div_num
     sub_llod <- absolute[, llod_inds]/div_num
-    ## clean up with llod
+    # clean up with llod
     clean_absolute <- mapply(function(x, y) ifelse(x < y, llod, x), sub_absolute, sub_llod)
     return(clean_absolute)
   }
-  ## if full_data is a matrix, make it a data.frame
+  # if full_data is a matrix, make it a data.frame
   if (is.matrix(full_data)) {
     tmp <- as.data.frame(full_data)
     full_data <- tmp
   }
 
-  ## subset the absolute abundance data; set counts at lower limit of detection to llod
+  # subset the absolute abundance data; set counts at lower limit of detection to llod
   sub_absolute <- full_data[, abs_inds]
   clean_absolute <- process_absolute(sub_absolute, grepl(regex_thr, names(sub_absolute), fixed = TRUE), grepl(regex_abs, names(sub_absolute), fixed = TRUE), llod, div_num)
   tmp_absolute <- data.frame(clean_absolute)
 
-  ## subset the relative abundance data
+  # subset the relative abundance data
   subset_relative <- full_data[, rel_inds]
-  ## re-order so that the bugs with absolute abundance are first
+  # re-order so that the bugs with absolute abundance are first
   all_relative <- cbind(subset_relative[, abs_plus_rel_inds], subset_relative[, -c(abs_plus_rel_inds)])
-  ## check whether or not any have a nonzero count
+  # check whether or not any have a nonzero count
   nonzero_counts <- apply(all_relative > 0, 2, sum) > 0
   tmp_relative <- data.frame(all_relative[, nonzero_counts])
   names(tmp_relative) <- names(full_data)[rel_inds]
 
-  ## remove any rows that have less than m_min reads
+  # remove any rows that have less than m_min reads
   m <- apply(tmp_relative, 1, sum)
   absolute_m <- tmp_absolute[m >= m_min, ]
   relative_m <- tmp_relative[m >= m_min, ]
-  ## remove any rows that have missing absolute abundances
+  # remove any rows that have missing absolute abundances
   na_absolute <- apply(absolute_m, 1, function(x) any(is.na(x)))
   absolute <- absolute_m[!na_absolute, ]
   relative <- relative_m[!na_absolute, ]