Skip to content

Commit

Permalink
fix check notes/errors, update vignette title
Browse files Browse the repository at this point in the history
  • Loading branch information
bdwilliamson committed Jan 12, 2021
1 parent 3168db4 commit b111946
Show file tree
Hide file tree
Showing 38 changed files with 281 additions and 200 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ Imports:
rstantools (>= 1.5.1),
tibble,
dplyr,
magrittr
magrittr,
rlang
LinkingTo:
BH (>= 1.69.0-1),
Rcpp (>= 1.0.0),
Expand Down
22 changes: 11 additions & 11 deletions R/extract_posterior_summaries.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,32 +20,32 @@
#' }
#'
#' @examples
#' ## load the package, read in example data
#' # load the package, read in example data
#' library("paramedic")
#' data(example_16S_data)
#' data(example_qPCR_data)
#'
#' ## run paramedic (with an extremely small number of iterations, for illustration only)
#' ## on only the first 10 taxa
#' # run paramedic (with an extremely small number of iterations, for illustration only)
#' # on only the first 10 taxa
#' mod <- run_paramedic(W = example_16S_data[, 1:10], V = example_qPCR_data,
#' n_iter = 30, n_burnin = 25,
#' n_chains = 1, stan_seed = 4747)
#'
#' ## get summary, samples
#' # get summary, samples
#' mod_summ <- rstan::summary(mod, probs = c(0.025, 0.975))$summary
#' mod_samps <- rstan::extract(mod)
#'
#' ## extract relevant summaries
#' # extract relevant summaries
#' summs <- extract_posterior_summaries(stan_mod = mod_summ, stan_samps = mod_samps,
#' taxa_of_interest = 1:3,
#' mult_num = 1, level = 0.95, interval_type = "wald")
#'
#' @export
extract_posterior_summaries <- function(stan_mod, stan_samps, taxa_of_interest, mult_num = 1, level = 0.95, interval_type = "wald") {
## check to make sure all taxa of interest are actual taxa
# check to make sure all taxa of interest are actual taxa
check_taxa <- lapply(as.list(taxa_of_interest), function(x) any(grepl(paste0(",", x, "]"), rownames(stan_mod), fixed = TRUE)))
if (!any(unlist(check_taxa))) stop("One or more of your taxa of interest are not present in the sampling output. Please specify only taxa for which you have samples.")
## get the posterior estimates
# get the posterior estimates
mu_summ_lst <- lapply(as.list(taxa_of_interest), function(x) stan_mod[grepl("mu", rownames(stan_mod)) & !grepl("log", rownames(stan_mod)) & grepl(paste0(",", x, "]"), rownames(stan_mod), fixed = TRUE), c(1, 3, 4, 5)]*mult_num)
est_lst <- lapply(mu_summ_lst, function(x) x[, 1])
sd_lst <- lapply(mu_summ_lst, function(x) x[, 2])
Expand All @@ -55,20 +55,20 @@ extract_posterior_summaries <- function(stan_mod, stan_samps, taxa_of_interest,
rownames(estimates) <- as.character(1:dim(estimates)[1])
rownames(sd) <- as.character(1:dim(sd)[1])

## get cis
# get cis
credible_intervals <- sapply(1:length(mu_summ_lst), function(x) mu_summ_lst[[x]][, c(3, 4)], simplify = "array")
rownames(credible_intervals) <- as.character(1:dim(estimates)[1])

## get prediction intervals
# get prediction intervals
if (interval_type == "wald") {
intervals <- sapply(1:length(taxa_of_interest), function(x) gen_wald_interval(estimates[, x], sd[, x], alpha = 1 - level), simplify = "array")
} else if (interval_type == "quantile") {
intervals <- sapply(1:length(taxa_of_interest), function(x) gen_quantile_interval(mu_quantiles = credible_intervals[, , x], mu_samps = stan_samps$mu[, , taxa_of_interest[x]], div_num = mult_num, alpha = 1 - level, type = "credible_quantiles"), simplify = "array")
} else { ## next, add quantile
} else { # next, add quantile
stop("Unsupported prediction interval type. Please enter one of 'quantile' or 'wald'.")
}

## extract summaries of varying efficiency, if they exist
# extract summaries of varying efficiency, if they exist
if (any(grepl("e", rownames(stan_mod)) & !grepl("beta", rownames(stan_mod)))) {
e <- stan_mod[grepl("e", rownames(stan_mod)) & !grepl("beta", rownames(stan_mod)), 1]
e_intervals <- stan_mod[grepl("e", rownames(stan_mod)) &!grepl("beta", rownames(stan_mod)), c(4, 5)]
Expand Down
18 changes: 9 additions & 9 deletions R/gen_quantile_interval.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,21 @@
#' @return A (1 - \eqn{\alpha})x100\% Poisson-quantile-based prediction interval for each qPCR
#'
#' @examples
#' ## load the package, read in example data
#' # load the package, read in example data
#' library("paramedic")
#' data(example_16S_data)
#' data(example_qPCR_data)
#'
#' ## run paramedic (with an extremely small number of iterations, for illustration only)
#' ## on only the first 10 taxa
#' # run paramedic (with an extremely small number of iterations, for illustration only)
#' # on only the first 10 taxa
#' mod <- run_paramedic(W = example_16S_data[, 1:10], V = example_qPCR_data,
#' n_iter = 30, n_burnin = 25,
#' n_chains = 1, stan_seed = 4747)
#' ## get model summary
#' # get model summary
#' mod_summ <- rstan::summary(mod, probs = c(0.025, 0.975))$summary
#' ## get samples
#' # get samples
#' mod_samps <- rstan::extract(mod)
#' ## extract relevant summaries
#' # extract relevant summaries
#' summs <- extract_posterior_summaries(stan_mod = mod_summ, stan_samps = mod_samps,
#' taxa_of_interest = 1:3,
#' mult_num = 1, level = 0.95, interval_type = "quantile")
Expand All @@ -39,13 +39,13 @@ gen_quantile_interval <- function(mu_quantiles, mu_samps, alpha = 0.05, type = "
upper_limits <- stats::qpois(p = 1 - alpha/2, lambda = mu_quantiles[, 2])
pred_intervals <- cbind(lower_limits, upper_limits)
} else if (type == "sample_quantiles") {
## check to make sure it's an array of dimension 3
# check to make sure it's an array of dimension 3
if (length(dim(mu_samps)) != 3) mu_samps <- array(mu_samps, dim = c(dim(mu_samps), 1))

## sample from Poisson
# sample from Poisson
v_samps <- apply(mu_samps, c(1, 2, 3), function(x) stats::rpois(1, x))

## compute quantiles for each (i,j) pair
# compute quantiles for each (i,j) pair
quantiles <- apply(v_samps, c(2, 3), function(x) stats::quantile(x, probs = c(alpha/2, 1 - alpha/2)))

pred_intervals <- matrix(quantiles, nrow = dim(quantiles)[2]*dim(quantiles)[3], ncol = 2, byrow = TRUE)
Expand Down
20 changes: 10 additions & 10 deletions R/gen_wald_interval.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,37 +10,37 @@
#' @return A (1 - \eqn{\alpha})x100\% Wald-type prediction interval for each qPCR
#'
#' @examples
#' ## load the package, read in example data
#' # load the package, read in example data
#' library("paramedic")
#' data(example_16S_data)
#' data(example_qPCR_data)
#'
#' ## run paramedic (with an extremely small number of iterations, for illustration only)
#' ## on only first 10 taxa
#' # run paramedic (with an extremely small number of iterations, for illustration only)
#' # on only first 10 taxa
#' mod <- run_paramedic(W = example_16S_data[, 1:10], V = example_qPCR_data,
#' n_iter = 30, n_burnin = 25,
#' n_chains = 1, stan_seed = 4747)
#' ## get model summary
#' # get model summary
#' mod_summ <- rstan::summary(mod, probs = c(0.025, 0.975))$summary
#' ## get samples
#' # get samples
#' mod_samps <- rstan::extract(mod)
#' ## extract relevant summaries
#' # extract relevant summaries
#' summs <- extract_posterior_summaries(stan_mod = mod_summ, stan_samps = mod_samps,
#' taxa_of_interest = 1:3,
#' mult_num = 1, level = 0.95, interval_type = "wald")
#'
#' @export
gen_wald_interval <- function(mu, sd, alpha = 0.05, truncate = TRUE) {
## check if it is a matrix first; if so, make it a vector
# check if it is a matrix first; if so, make it a vector
if (is.matrix(mu)) mu <- as.vector(t(mu))
if (is.matrix(sd)) sd <- as.vector(t(sd))
## estimated variance
# estimated variance
est_var <- mu + sd^2

## get intervals
# get intervals
pred_intervals <- mu + sqrt(est_var) %o% stats::qnorm(c(alpha/2, 1 - alpha/2))

## truncate
# truncate
if (truncate) {
pred_intervals[, 1] <- ifelse(pred_intervals[, 1] < 0, 0, pred_intervals[, 1])
}
Expand Down
40 changes: 21 additions & 19 deletions R/predict_paramedic.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#' Generate predictions based on a paramedic model fit
#'
#' Predict concentrations (and efficiencies, if applicable), absolute abundances, and relative abundances based on the posterior distributions from a previously fitted model resulting from a call to \code{run_paramedic.
#' Predict concentrations (and efficiencies, if applicable), absolute abundances, and relative abundances based on the posterior distributions from a previously fitted model resulting from a call to \code{run_paramedic}.
#'
#' @param W The new relative abundance data, e.g., from broad range 16S sequencing with "universal" primers. Expects data (e.g., matrix, data.frame, tibble) with sample identifiers in the first column. Sample identifiers must be the same between W and V, and the column must have the same name in W and V.
#' @param V The new absolute abundance data, e.g., from taxon-specific absolute primers. Expects data (e.g., matrix, data.frame, tibble) with sample identifiers in the first column. Sample identifiers must be the same between W and V, and the column must have the same name in W and V.
Expand All @@ -27,16 +27,16 @@
#' @details Using the posterior distributions from a call to \code{run_paramedic}, generate predicted values.
#'
#' @examples
#' ## load the package, read in example data
#' # load the package, read in example data
#' library("paramedic")
#' data(example_16S_data)
#' data(example_qPCR_data)
#' ## generate a train/test split
#' # generate a train/test split
#' set.seed(1234)
#' folds <- sample(1:2, size = nrow(example_16S_data), replace = TRUE)
#'
#' ## run paramedic (with an extremely small number of iterations, for illustration only)
#' ## on only the first 10 taxa, and on a subset of the data
#' # run paramedic (with an extremely small number of iterations, for illustration only)
#' # on only the first 10 taxa, and on a subset of the data
#' mod <- run_paramedic(W = example_16S_data[folds == 1, 1:10],
#' V = example_qPCR_data[folds == 1, ], n_iter = 30, n_burnin = 25,
#' n_chains = 1, stan_seed = 4747)
Expand All @@ -46,8 +46,10 @@
#' beta0_post <- ext_mod$beta_0
#' sigma_post <- ext_mod$Sigma
#' sigmae_post <- ext_mod$sigma_e
#' pred_mod <- predict_paramedic(beta_0 = beta0_post, Sigma = sigma_post,
#' sigma_e = sigmae_post, )
#' pred_mod <- predict_paramedic(W = example_16S_data[folds == 2, 1:10],
#' V = example_qPCR_data[folds == 2, ], n_iter = 75, n_burnin = 25,
#' beta_0 = beta0_post, Sigma = sigma_post, sigma_e = sigmae_post,
#' n_chains = 1, stan_seed = 1234)
#'
#' @seealso \code{\link[rstan]{stan}} and \code{\link[rstan]{sampling}} for specific usage of the \code{stan} and \code{sampling} functions; \code{\link[paramedic]{run_paramedic}} for usage of the \code{run_paramedic} function.
#'
Expand All @@ -62,23 +64,23 @@ predict_paramedic <- function(W, V, X = V[, 1, drop = FALSE], beta_0,
stan_seed = 4747, alpha_sigma = 2, kappa_sigma = 1,
alpha_phi = 0, beta_phi = 0, sigma_xi = 1,
...) {
## --------------
## error messages
## --------------
# --------------
# error messages
# --------------
check_entered_data(W, V, X, k, alpha_sigma = alpha_sigma,
kappa_sigma = kappa_sigma)
## ---------------------------
## pre-processing and warnings
## ---------------------------
# ---------------------------
# pre-processing and warnings
# ---------------------------
pre_processed_lst <- make_paramedic_tibbles(W, V, X, k,
alpha_sigma = alpha_sigma,
kappa_sigma = kappa_sigma)
W_mat <- pre_processed_lst$w_mat
V_mat <- pre_processed_lst$v_mat
X_mat <- pre_processed_lst$x_mat
## ----------------------------------------
## set up the data list
## ----------------------------------------
# ----------------------------------------
# set up the data list
# ----------------------------------------
N <- ifelse(k > 0, dim(W_mat)[2], dim(W_mat)[1])
N_samples <- nrow(Sigma)
q <- ifelse(k > 0, dim(W_mat)[3], dim(W_mat)[2])
Expand All @@ -91,9 +93,9 @@ predict_paramedic <- function(W, V, X = V[, 1, drop = FALSE], beta_0,
phi = phi, alpha_sigma = alpha_sigma,
kappa_sigma = kappa_sigma, alpha_phi = alpha_phi,
beta_phi = beta_phi)
## ----------------------
## run the Stan algorithm
## ----------------------
# ----------------------
# run the Stan algorithm
# ----------------------
pars <- c("mu",
switch((alpha_sigma > 0 & kappa_sigma > 0) + 1, NULL, "e"),
"V", "W")
Expand Down
21 changes: 21 additions & 0 deletions R/prep_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,27 @@ make_paramedic_tibbles <- function(W, V, X, k, inits_lst,
#' @param W_mat the pre-processed matrix W
#' @param V_mat the pre-processed matrix V
#' @param X_mat the pre-processed matrix X
#' @param inits_lst An optional list of initial values of the parameters.
#' Must be a named list; see \code{\link[rstan]{stan}}.
#' @param sigma_beta Hyperparameter specifying the prior variance on
#' \eqn{\beta_0}. Defaults to \eqn{\sqrt{50}}.
#' @param sigma_Sigma Hyperparameter specifying the prior variance on
#' \eqn{\Sigma}. Defaults to \eqn{\sqrt{50}}.
#' @param alpha_sigma Hyperparameter specifying the shape parameter of the
#' prior distribution on \eqn{\sigma_e}. Defaults to 2.
#' @param kappa_sigma Hyperparameter specifying the scale parameter of the
#' prior distribution on \eqn{\sigma_e}. Defaults to 1.
#' @param alpha_phi Hyperparameter specifying the shape parameter of the
#' prior distribution on \eqn{phi} (the optional Negative Binomial
#' dispersion parameter). Defaults to 0 (in which case a Poisson distribution
#' on V is used).
#' @param beta_phi Hyperparameter specifying the scale parameter of the
#' prior distribution on \eqn{phi} (the optional Negative Binomial
#' dispersion parameter). Defaults to 0 (in which case a Poisson distribution
#' on V is used).
#' @param n_chains the number of chains to run
#' @param centered whether or not to use the centered parameterization of the
#' stan algorithm. Defaults to \code{FALSE}.
#' @describeIn check_entered_data Make data and initial values lists to
#' pass to stan
#' @importFrom stats var
Expand Down
24 changes: 12 additions & 12 deletions R/process_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@
#' @return a list containing two matrices: one with the observed absolute abundances, and the other with the observed relative abundances.
#'
#' @examples
#' ## load the package, read in example data
#' # load the package, read in example data
#' library("paramedic")
#' data(simple_example_data)
#'
#' ## process the example data
#' # process the example data
#' q <- 3
#' q_obs <- 2
#' processed_data <- paramedic::process_data(full_data = simple_example_data, rel_inds = 1:q,
Expand All @@ -30,40 +30,40 @@
#'
#' @export
process_data <- function(full_data, rel_inds, abs_inds, abs_plus_rel_inds, regex_thr = "_t", regex_abs = "_cps", llod = 0, m_min = 1000, div_num = 100) {
## helper function to process absolute abundance data
# helper function to process absolute abundance data
process_absolute <- function(absolute, llod_inds, abs_inds, llod = 0, div_num = 100) {
## subset with absolute abundance values
# subset with absolute abundance values
sub_absolute <- absolute[, abs_inds]/div_num
sub_llod <- absolute[, llod_inds]/div_num
## clean up with llod
# clean up with llod
clean_absolute <- mapply(function(x, y) ifelse(x < y, llod, x), sub_absolute, sub_llod)
return(clean_absolute)
}
## if full_data is a matrix, make it a data.frame
# if full_data is a matrix, make it a data.frame
if (is.matrix(full_data)) {
tmp <- as.data.frame(full_data)
full_data <- tmp
}

## subset the absolute abundance data; set counts at lower limit of detection to llod
# subset the absolute abundance data; set counts at lower limit of detection to llod
sub_absolute <- full_data[, abs_inds]
clean_absolute <- process_absolute(sub_absolute, grepl(regex_thr, names(sub_absolute), fixed = TRUE), grepl(regex_abs, names(sub_absolute), fixed = TRUE), llod, div_num)
tmp_absolute <- data.frame(clean_absolute)

## subset the relative abundance data
# subset the relative abundance data
subset_relative <- full_data[, rel_inds]
## re-order so that the bugs with absolute abundance are first
# re-order so that the bugs with absolute abundance are first
all_relative <- cbind(subset_relative[, abs_plus_rel_inds], subset_relative[, -c(abs_plus_rel_inds)])
## check whether or not any have a nonzero count
# check whether or not any have a nonzero count
nonzero_counts <- apply(all_relative > 0, 2, sum) > 0
tmp_relative <- data.frame(all_relative[, nonzero_counts])
names(tmp_relative) <- names(full_data)[rel_inds]

## remove any rows that have less than m_min reads
# remove any rows that have less than m_min reads
m <- apply(tmp_relative, 1, sum)
absolute_m <- tmp_absolute[m >= m_min, ]
relative_m <- tmp_relative[m >= m_min, ]
## remove any rows that have missing absolute abundances
# remove any rows that have missing absolute abundances
na_absolute <- apply(absolute_m, 1, function(x) any(is.na(x)))
absolute <- absolute_m[!na_absolute, ]
relative <- relative_m[!na_absolute, ]
Expand Down
Loading

0 comments on commit b111946

Please sign in to comment.