diff --git a/DESCRIPTION b/DESCRIPTION
index 44bccd9..66e7023 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,17 +1,18 @@
 Package: TCA
 Type: Package
 Title: Tensor Composition Analysis
-Version: 1.1.0
-Authors@R: person("Elior", "Rahmani", email = "elior.rahmani@gmail.com", role = c("aut", "cre"))
-Author: Elior Rahmani [aut, cre]
-Maintainer: Elior Rahmani <elior.rahmani@gmail.com>
-Description: Tensor Composition Analysis (TCA) allows the deconvolution of two-dimensional data (features by observations) coming from a mixture of sources into a three-dimensional matrix of signals (features by observations by sources). TCA further allows to test the features in the data for different statistical relations with an outcome of interest while modeling source-specific effects (TCA regression); particularly, it allows to look for statistical relations between source-specific signals and an outcome. For example, TCA can deconvolve bulk tissue-level DNA methylation data (methylation sites by individuals) into a tensor of cell-type-specific methylation levels for each individual (methylation sites by individuals by cell types) and it allows to detect cell-type-specific relations (associations) with an outcome of interest. For more details see Rahmani et al. (2018).
+Version: 1.2.1
+Authors@R: c(
+    person("Elior", "Rahmani", role=c("aut", "cre"), email = "elior.rahmani@gmail.com"),
+    person("Brandon", "Jew", role=c("ctb"))
+    )
+Description: Tensor Composition Analysis (TCA) allows the deconvolution of two-dimensional data (features by observations) coming from a mixture of heterogeneous sources into a three-dimensional matrix of signals (features by observations by sources). The TCA framework further allows testing the features in the data for different statistical relations with an outcome of interest while modeling source-specific effects; in particular, it allows testing for statistical relations between source-specific signals and an outcome. For example, TCA can deconvolve bulk tissue-level DNA methylation data (methylation sites by individuals) into a three-dimensional tensor of cell-type-specific methylation levels for each individual (i.e. methylation sites by individuals by cell types), and it allows detecting cell-type-specific statistical relations (associations) with phenotypes. For more details see Rahmani et al. (2019).
 License: GPL-3
 Encoding: UTF-8
 LazyData: true
 Imports: config, data.table, futile.logger, gmodels, Matrix, matrixcalc, matrixStats, nloptr, parallel, pbapply, pracma, rsvd, stats, quadprog, glmnet
-RoxygenNote: 6.1.1
-Depends: R (>= 3.4.0)
+RoxygenNote: 7.1.1
+Depends: R (>= 3.5.0)
 Suggests: testthat,
     knitr,
diff --git a/NAMESPACE b/NAMESPACE
index 40c282c..4b73fe2 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -30,7 +30,9 @@ importFrom(pbapply,pboptions)
 importFrom(pracma,Reshape)
 importFrom(pracma,lsqlincon)
 importFrom(pracma,repmat)
+importFrom(pracma,sqrtm)
 importFrom(rsvd,rpca)
+importFrom(stats,anova)
 importFrom(stats,lm)
 importFrom(stats,logLik)
 importFrom(stats,p.adjust)
diff --git a/NEWS.md b/NEWS.md
index c4ab15e..a18feab 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,23 @@
+TCA v1.2.1 (Release date: 02-13-2021)
+==============
+
+*NOTE* v1.2.1 is not backward compatible with earlier versions
+
+Changes:
+
+* A new argument 'vars.mle' in 'tca' controls the method used for optimization; setting it to FALSE allows a much faster optimization.
+
+* A new argument 'fast_mode' in 'tcareg' allows a much faster optimization.
+
+* A new argument 'constrain_mu' in 'tca' allows determining whether to constrain the mean parameters in the optimization; setting it to FALSE now allows obtaining p-values for the covariates in 'C1' and 'C2'.
+
+* A new 'scale' argument in 'tensor' for dividing estimates by estimated standard deviations.
+
+* Default values of several arguments have been changed.
+
+* Switched the default FDR method used in 'tcareg' back from "BY" to "BH" (can be changed by editing the config file).
+
+
 TCA v1.1.0 (Release date: 11-14-2019)
 ==============
diff --git a/R/TCA.R b/R/TCA.R
index d6418b5..cdcb0da 100644
--- a/R/TCA.R
+++ b/R/TCA.R
@@ -1,55 +1,60 @@
 #' @title Fitting the TCA model
 #'
-#' @description Fits the TCA model for an input matrix of observations coming from a mixture of \code{k} sources, under the assumption that each observation is a mixture of unique source-specific values (in each feature in the data). For example, in the context of tissue-level bulk DNA methylation data coming from a mixture of cell types (i.e. the input is methylation sites by individuals), \code{tca} allows to model the methylation of each individual as a mixture of cell-type-specific methylation levels that are unique to the individual.
+#' @description Fits the TCA model for an input matrix of features by observations coming from a mixture of \code{k} sources, under the assumption that each observation is a mixture of unique (unobserved) source-specific values (in each feature in the data). This function further allows statistically testing the effect of covariates on source-specific values. For example, in the context of tissue-level bulk DNA methylation data coming from a mixture of cell types (i.e. the input is methylation sites by individuals), \code{tca} allows modeling the methylation of each individual as a mixture of cell-type-specific methylation levels that are unique to the individual. In addition, it allows statistically testing the effects of covariates and phenotypes on methylation at the cell-type level.
 #'
-#' @param X An \code{m} by \code{n} matrix of measurements of \code{m} features for \code{n} observations. Each column in \code{X} is assumed to be a mixture of \code{k} different sources. Note that \code{X} must include row names and column names and that NA values are currently not supported. \code{X} should not include features that are constant across all observations.
-#' @param W An \code{n} by \code{k} matrix of weights - the weights of \code{k} sources for each of the \code{n} mixtures (observations). All the weights must be positive and each row, corresponding to the weights of a single observation, must sum up to 1. Note that \code{W} must include row names and column names and that NA values are currently not supported. In case where only initial estimates of \code{W} are available, \code{tca} can be set to re-estimate \code{W} (see \code{refit_W}).
-#' @param C1 An \code{n} by \code{p1} design matrix of covariates that may affect the hidden source-specific values (possibly a different effect on each source). Note that \code{C1} must include row names and column names and should not include an intercept term. NA values are currently not supported.
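Before moving through the updated roxygen documentation below, here is a minimal hedged sketch of a v1.2.1-style call that exercises the new arguments listed in the NEWS entries above (simulated data via the package's own test_data(), as used in the examples; all argument values are illustrative):

    library(TCA)
    # simulate n=100 observations, m=20 features, k=3 sources, p1=p2=1 covariates:
    data <- test_data(100, 20, 3, 1, 1, 0.01)
    # vars.mle = FALSE uses the faster non-negative least-squares variance
    # estimation; constrain_mu = FALSE leaves the mean parameters unconstrained,
    # so covariate p-values are returned:
    tca.mdl <- tca(X = data$X, W = data$W, C1 = data$C1, C2 = data$C2,
                   vars.mle = FALSE, constrain_mu = FALSE)
    head(tca.mdl$gammas_hat_pvals)  # would be NULL had constrain_mu been TRUE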
-#' @param C1.map An \code{p1} by \code{k} matrix of 0/1 values, indicating for each of the \code{p1} covariates in \code{C1} whether to consider its potential effects on the values of each of the \code{k} sources (e.g., if position \code{i,j} in \code{C1.map} is 1 then the potential effect of the \code{i}-th covariate in \code{C1} on the {j}-th source will be considered). If \code{C1.map == NULL} then effects for all covariates in \code{C1} in each of the sources will be considered.
+#' @param X An \code{m} by \code{n} matrix of measurements of \code{m} features for \code{n} observations. Each column in \code{X} is assumed to be a mixture of \code{k} sources. Note that \code{X} must include row names and column names and that NA values are currently not supported. \code{X} should not include features that are constant across all observations.
+#' @param W An \code{n} by \code{k} matrix of weights: the weights of the \code{k} sources for each of the \code{n} mixtures (observations). All the weights must be positive and each row, corresponding to the weights of a single observation, must sum to 1. Note that \code{W} must include row names and column names and that NA values are currently not supported. In cases where only noisy estimates of \code{W} are available, \code{tca} can be set to re-estimate \code{W} (see \code{refit_W}).
+#' @param C1 An \code{n} by \code{p1} design matrix of covariates that may affect the hidden source-specific values (possibly a different effect size in each source). Note that \code{C1} must include row names and column names and should not include an intercept term. NA values are currently not supported. Note that each covariate in \code{C1} results in \code{k} additional parameters in the model of each feature; therefore, in order to alleviate the possibility of model overfitting, it is advised to keep the size of \code{C1} small relative to the sample size in \code{X}.
+#' @param C1.map A \code{p1} by \code{k} matrix of 0/1 values, indicating for each of the \code{p1} covariates in \code{C1} whether to consider its potential effects on the values of each of the \code{k} sources (e.g., if position \code{i,j} in \code{C1.map} is 1 then the potential effect of the \code{i}-th covariate in \code{C1} on the \eqn{j}-th source will be considered). If \code{C1.map == NULL} then effects for all covariates in \code{C1} will be considered in each of the sources. Note that \code{C1.map} is available only if \code{constrain_mu == TRUE}.
 #' @param C2 An \code{n} by \code{p2} design matrix of covariates that may affect the mixture (i.e. rather than directly the sources of the mixture; for example, variables that capture biases in the collection of the measurements). Note that \code{C2} must include row names and column names and should not include an intercept term. NA values are currently not supported.
 #' @param refit_W A logical value indicating whether to re-estimate the input \code{W} under the TCA model.
-#' @param refit_W.features A vector with the names of the features in \code{X} to consider when re-estimating \code{W} (i.e. when \code{refit_W == TRUE}). This is useful since oftentimes just a subset of the features in \code{X} will be informative for estimating \code{W}. If \code{refit_W.features == NULL} then the ReFACTor algorithm will be used for performing feature selection (see also \code{refit_W.sparsity, refit_W.sd_threshold}).
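As a hedged illustration of the \code{C1.map} encoding documented above, a hypothetical matrix for \code{p1 = 2} covariates and \code{k = 3} sources (values made up for illustration):

    # Consider covariate 1 in sources 1 and 2 only, covariate 2 in all sources;
    # per the documentation above, C1.map is considered only when constrain_mu == TRUE.
    C1.map <- matrix(c(1, 1, 0,
                       1, 1, 1),
                     nrow = 2, byrow = TRUE)  # p1 = 2 rows, k = 3 columns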
+#' @param refit_W.features A vector with the names of the features in \code{X} to consider when re-estimating \code{W} (i.e. when \code{refit_W == TRUE}). This is useful since in general only a subset of the features in \code{X} are expected to be highly informative for estimating \code{W}. If \code{refit_W.features == NULL} then the ReFACTor algorithm will be used for performing feature selection (see also \code{refit_W.sparsity, refit_W.sd_threshold}).
 #' @param refit_W.sparsity A numeric value indicating the number of features to select using the ReFACTor algorithm when re-estimating \code{W} (activated only if \code{refit_W == TRUE} and \code{refit_W.features == NULL}). Note that \code{refit_W.sparsity} must be lower than or equal to the number of features in \code{X}. For more information, see the argument \code{sparsity} in \link{refactor}.
 #' @param refit_W.sd_threshold A numeric value indicating a standard deviation threshold to be used for excluding low-variance features in \code{X} (activated only if \code{refit_W == TRUE} and \code{refit_W.features == NULL}). For more information, see the argument \code{sd_threshold} in \link{refactor}.
 #' @param tau A non-negative numeric value of the standard deviation of the measurement noise (i.e. the i.i.d. component of variation in the model). If \code{tau == NULL} then \code{tca} will estimate \code{tau}.
+#' @param vars.mle A logical value indicating whether to use maximum likelihood estimation when learning the variances in the model. If \code{vars.mle == FALSE} then \code{tca} will use a non-negative least-squares optimization for learning the variances. In practice, both approaches appear to provide highly correlated models; however, setting \code{vars.mle == FALSE} allows a substantial speedup.
+#' @param constrain_mu A logical value indicating whether to constrain the estimates of the mean parameters (i.e. \eqn{\{\mu_{hj}\}}; see details below), in which case they will be constrained to the range of the values in \code{X}. Note that if \code{constrain_mu == TRUE} then \code{tca} will not output p-values for the effect sizes of the covariates in \code{C1} and in \code{C2}.
 #' @param parallel A logical value indicating whether to use parallel computing (possible when using a multi-core machine).
 #' @param num_cores A numeric value indicating the number of cores to use (activated only if \code{parallel == TRUE}). If \code{num_cores == NULL} then all available cores except for one will be used.
-#' @param max_iters A numeric value indicating the maximal number of iterations to use in the optimization of the TCA model (\code{max_iters} iterations will be used as long as the optimization does not converge in earlier iterations).
-#' @param log_file A path to an output log file. Note that if the file \code{log_file} already exists then logs will be appended to the end of the file. Set \code{log_file} to \code{NULL} to prevent output from being saved into a file; note that if \code{verbose == TRUE} then no output file will be generated regardless of the value of \code{log_file}.
-#' @param debug A logical value indicating whether to set the logger to a more detailed debug level; please set \code{debug} to \code{TRUE} before reporting issues.
+#' @param max_iters A numeric value indicating the maximal number of iterations to use in the optimization of the TCA model (\code{max_iters} iterations will be used as long as the optimization does not converge earlier).
+#' @param log_file A path to an output log file. Note that if the file \code{log_file} already exists then logs will be appended to the end of the file. Set \code{log_file} to \code{NULL} to prevent output from being saved into a file; note that if \code{verbose == FALSE} then no output file will be generated regardless of the value of \code{log_file}.
+#' @param debug A logical value indicating whether to set the logger to a more detailed debug level; set \code{debug} to \code{TRUE} before reporting issues.
 #' @param verbose A logical value indicating whether to print logs.
 #'
 #' @importFrom futile.logger flog.info
 #' @importFrom pbapply pboptions
 #'
-#' @details The TCA model assumes that the hidden source-specific values are random variables. Formally, denote by \eqn{Z_{hj}^i} the source-specific value of observation \eqn{i} in feature \eqn{j} source \eqn{h}, the TCA model assumes: \deqn{Z_{hj}^i \sim N(\mu_{hj},\sigma_{hj}^2)} where \eqn{\mu_{hj},\sigma_{hj}} represent the mean and standard deviation that are specific to feature \eqn{j} source \eqn{h}. The model further assumes that the observed value of observation \eqn{i} in feature \eqn{j} is a mixture of \eqn{k} different sources: \deqn{X_{ji} = \sum_{h=1}^k W_{ih}Z_{hj}^i + \epsilon_{ji}} where \eqn{W_{ih}} is the non-negative proportion of source \eqn{h} in the mixture of observation \eqn{i} such that \eqn{\sum_{h=1}^kW_{ih} = 1}, and \eqn{\epsilon_{ji} \sim N(0,\tau^2)} is an i.i.d. component of variation that models measurement noise. Note that the mixture proportions in \eqn{W} are, in general, unique for each individual, therefore each entry in the data matrix \eqn{X} is coming from a unique distribution (i.e. a different mean and a different variance).
+#' @details The TCA model assumes that the hidden source-specific values are random variables. Formally, denoting by \eqn{Z_{hj}^i} the source-specific value of observation \eqn{i} in feature \eqn{j}, source \eqn{h}, the TCA model assumes: \deqn{Z_{hj}^i \sim N(\mu_{hj},\sigma_{hj}^2)} where \eqn{\mu_{hj},\sigma_{hj}} represent the mean and standard deviation that are specific to feature \eqn{j}, source \eqn{h}. The model further assumes that the observed value of observation \eqn{i} in feature \eqn{j} is a mixture of \eqn{k} different sources: \deqn{X_{ji} = \sum_{h=1}^k W_{ih}Z_{hj}^i + \epsilon_{ji}} where \eqn{W_{ih}} is the non-negative proportion of source \eqn{h} in the mixture of observation \eqn{i} such that \eqn{\sum_{h=1}^kW_{ih} = 1}, and \eqn{\epsilon_{ji} \sim N(0,\tau^2)} is an i.i.d. component of variation that models measurement noise. Note that the mixture proportions in \eqn{W} are, in general, unique for each individual; therefore, each entry in \eqn{X} comes from a unique distribution (i.e. a different mean and a different variance).
-#' In cases where the true \code{W} is unknown, \code{tca} can be provided with initial estimates of \code{W} and then re-estimate \code{W} as part of the optimization procedure (see argument \code{refit_W}). These initial estimates should not be random but rather capture the information in \code{W} to some extent. When the argument \code{refit_W} is used, it is typically the case that only a subset of the features should be used for re-estimating \code{W}. Therefore, when re-estimating \code{W}, \code{tca} performs feature selection using the ReFACTor algorithm; alternatively, it can also be provided with a user-specified list of features to be used in the re-estimation (see argument \code{refit_W.features}).
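The generative model spelled out in the details above is easy to simulate directly, which can help build intuition; a small sketch for a single feature, with all parameter values made up:

    set.seed(1)
    n <- 100; k <- 3
    W <- matrix(rexp(n * k), n, k)
    W <- W / rowSums(W)                                   # proportions; each row sums to 1
    mu <- c(0.2, 0.5, 0.8); sigma <- c(0.05, 0.10, 0.07)  # source-specific means and sds
    Z <- sapply(1:k, function(h) rnorm(n, mu[h], sigma[h]))  # Z_hj^i ~ N(mu_hj, sigma_hj^2)
    tau <- 0.01
    x_j <- rowSums(W * Z) + rnorm(n, 0, tau)              # X_ji = sum_h W_ih Z_hj^i + eps_ji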
+#' In cases where the true \code{W} is unknown, \code{tca} can be provided with noisy estimates of \code{W} and then re-estimate \code{W} as part of the optimization procedure (see argument \code{refit_W}). These initial estimates should not be random but rather capture the information in \code{W} to some extent. When the argument \code{refit_W} is used, it is typically the case that only a subset of the features should be used for re-estimating \code{W}. Therefore, when re-estimating \code{W}, \code{tca} performs feature selection using the ReFACTor algorithm; alternatively, it can also be provided with a user-specified list of features to be used in the re-estimation, assuming that such a list of features that are most informative for estimating \eqn{W} exists (see argument \code{refit_W.features}).
 #'
-#' Factors that systematically affect the source-specific values \eqn{Z_{hj}^i} can be further considered (see argument \code{C1}). In that case, we assume: \deqn{Z_{hj}^i \sim N(\mu_{hj}+c^{(1)}_i \gamma_j^h,\sigma_{hj}^2)} where \eqn{c^{(1)}_i} is a row vector from \code{C1}, corresponding to the values of the \eqn{p_1} factors for observation \eqn{i}, and \eqn{\gamma_j^h} is a vector of \eqn{p_1} corresponding effect sizes.
+#' Covariates that systematically affect the source-specific values \eqn{Z_{hj}^i} can be further considered (see argument \code{C1}). In that case, we assume: \deqn{Z_{hj}^i \sim N(\mu_{hj}+c^{(1)}_i \gamma_j^h,\sigma_{hj}^2)} where \eqn{c^{(1)}_i} and \eqn{\gamma_j^h} correspond to the \eqn{p_1} covariate values of observation \eqn{i} (i.e. a row vector from \code{C1}) and their effect sizes, respectively.
 #'
-#' Factors that systematically affect the mixture values \eqn{X_{ji}}, such as variables that capture biases in the collection of the measurements, can also be considered (see argument \code{C2}). In that case, we assume: \deqn{X_{ji} \sim \sum_{h=1}^k W_{ih}Z_{hj}^i + c^{(2)}_i \delta_j + \epsilon_{ij}} where \eqn{c^{(2)}_i} is a row vector from \code{C2}, corresponding to the values of the \eqn{p_2} factors for observation \eqn{i}, and \eqn{\delta_j} is a vector of \eqn{p_2} corresponding effect sizes.
+#' Covariates that systematically affect the mixture values \eqn{X_{ji}}, such as variables that capture technical biases in the collection of the measurements, can also be considered (see argument \code{C2}). In that case, we assume: \deqn{X_{ji} = \sum_{h=1}^k W_{ih}Z_{hj}^i + c^{(2)}_i \delta_j + \epsilon_{ij}} where \eqn{c^{(2)}_i} and \eqn{\delta_j} correspond to the \eqn{p_2} covariate values of observation \eqn{i} (i.e. a row vector from \code{C2}) and their effect sizes, respectively.
 #'
+#' Since the standard deviation of \eqn{X_{ji}} is specific to observation \eqn{i} and feature \eqn{j}, we can obtain p-values for the estimates of \eqn{\gamma_j^h} and \eqn{\delta_j} by dividing each observed data point \eqn{x_{ji}} by its estimated standard deviation and calculating T-statistics under a standard linear regression framework.
 #'
-#' @return A list with the estimated parameters of the model. This list can be then used as the input to other functions such as \code{tcareg}.
-#' \item{W}{An \code{n} by \code{k} matrix of weights. If \code{refit_W == TRUE} then this is the re-estimated \code{W}; otherwise this is the input \code{W}}
+#'
+#' @return A list with the estimated parameters of the model. This list can then be used as the input to other functions such as \link{tcareg}.
+#' \item{W}{An \code{n} by \code{k} matrix of weights. If \code{refit_W == TRUE} then this is the re-estimated \code{W}; otherwise this is the input \code{W}}
 #' \item{mus_hat}{An \code{m} by \code{k} matrix of estimates for the mean of each source in each feature.}
 #' \item{sigmas_hat}{An \code{m} by \code{k} matrix of estimates for the standard deviation of each source in each feature.}
 #' \item{tau_hat}{An estimate of the standard deviation of the i.i.d. component of variation in \code{X}. If an input value was provided for \code{tau} (i.e. \code{tau != NULL}) then \code{tau_hat == tau}.}
-#' \item{gammas_hat}{An \code{m} by \code{k*p1} matrix of the estimated effects of the \code{p1} factors in \code{C1} on each of the \code{m} features in \code{X}, where the first \code{p1} columns are the source-specific effects of the \code{p1} factors on the first source, the following \code{p1} columns are the source-specific effects on the second source and so on.}
-#' \item{deltas_hat}{An \code{m} by \code{p2} matrix of the estimated effects of the \code{p2} factors in \code{C2} on the mixture values of each of the \code{m} features in \code{X}.}
+#' \item{gammas_hat}{An \code{m} by \code{k*p1} matrix of the estimated effects of the \code{p1} covariates in \code{C1} on each of the \code{m} features in \code{X}, where the first \code{p1} columns are the source-specific effects of the \code{p1} covariates on the first source, the following \code{p1} columns are the source-specific effects on the second source and so on.}
+#' \item{deltas_hat}{An \code{m} by \code{p2} matrix of the estimated effects of the \code{p2} covariates in \code{C2} on the mixture values of each of the \code{m} features in \code{X}.}
+#' \item{gammas_hat_pvals}{An \code{m} by \code{k*p1} matrix of p-values for the estimates in \code{gammas_hat} (based on a T-test). Note that \code{gammas_hat_pvals} is set to NULL if \code{constrain_mu == TRUE}.}
+#' \item{gammas_hat_pvals.joint}{An \code{m} by \code{p1} matrix of p-values for the joint effects (i.e. across all \code{k} sources) of each of the \code{p1} covariates in \code{C1} on each of the \code{m} features in \code{X} (based on a partial F-test). In other words, these are p-values for the combined statistical effects of each one of the \code{p1} covariates on each of the \code{m} features under the TCA model. Note that \code{gammas_hat_pvals.joint} is set to NULL if \code{constrain_mu == TRUE}.}
+#' \item{deltas_hat_pvals}{An \code{m} by \code{p2} matrix of p-values for the estimates in \code{deltas_hat} (based on a T-test). Note that \code{deltas_hat_pvals} is set to NULL if \code{constrain_mu == TRUE}.}
 #'
 #' @examples
 #' data <- test_data(100, 20, 3, 1, 1, 0.01)
 #' tca.mdl <- tca(X = data$X, W = data$W, C1 = data$C1, C2 = data$C2)
 #'
-#' @section Note: The function \code{tca} may require a long running time when the input matrix \code{X} is very large; to alleviate this, it is strongly advised to use the \code{parallel} argument, given that a multi-core machine is available.
-#'
-#' @references Rahmani E, Schweiger R, Rhead B, Criswell LA, Barcellos LF, Eskin E, Rosset S, Sankararaman S, Halperin E. Cell-type-specific resolution epigenetics without the need for cell sorting or single-cell biology. Nature Communications 2018.
+#' @references Rahmani E, Schweiger R, Rhead B, Criswell LA, Barcellos LF, Eskin E, Rosset S, Sankararaman S, Halperin E. Cell-type-specific resolution epigenetics without the need for cell sorting or single-cell biology. Nature Communications 2019.
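To illustrate the scaling idea behind the covariate p-values described in the details above: under the model, the variance of \eqn{X_{ji}} is \eqn{\sum_h W_{ih}^2\sigma_{hj}^2 + \tau^2}, so dividing each data point by its standard deviation yields a homoskedastic regression to which standard T-tests apply. A hedged toy sketch of this idea follows (a single shared covariate effect is used as a simplification; this is not the package's internal implementation):

    set.seed(1)
    n <- 200; k <- 3
    W <- matrix(rexp(n * k), n, k); W <- W / rowSums(W)
    mu <- c(0.2, 0.5, 0.8); sigma <- c(0.05, 0.10, 0.07); tau <- 0.01
    c1 <- rnorm(n)                                 # one covariate (shared effect; a simplification)
    sd_j <- sqrt((W^2 %*% sigma^2) + tau^2)[, 1]   # observation-specific sd of x_j
    x_j <- (W %*% mu)[, 1] + 0.05 * c1 + rnorm(n, 0, sd_j)
    # divide the response and the design by the sd, then use ordinary lm():
    fit <- lm(I(x_j / sd_j) ~ 0 + I(W / sd_j) + I(c1 / sd_j))
    summary(fit)                                   # T-statistic of the scaled c1 term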
#' @references Rahmani E, Zaitlen N, Baran Y, Eng C, Hu D, Galanter J, Oh S, Burchard EG, Eskin E, Zou J, Halperin E. Sparse PCA corrects for cell type heterogeneity in epigenome-wide association studies. Nature Methods 2016. #' #' @export tca -tca <- function(X, W, C1 = NULL, C1.map = NULL, C2 = NULL, refit_W = FALSE, refit_W.features = NULL, refit_W.sparsity = 500, refit_W.sd_threshold = 0.02, tau = NULL, parallel = FALSE, num_cores = NULL, max_iters = 10, log_file = "TCA.log", debug = FALSE, verbose = TRUE){ +tca <- function(X, W, C1 = NULL, C1.map = NULL, C2 = NULL, refit_W = FALSE, refit_W.features = NULL, refit_W.sparsity = 500, refit_W.sd_threshold = 0.02, tau = NULL, vars.mle = FALSE, constrain_mu = FALSE, parallel = FALSE, num_cores = NULL, max_iters = 10, log_file = "TCA.log", debug = FALSE, verbose = TRUE){ start_logger(log_file, debug, verbose) @@ -66,7 +71,7 @@ tca <- function(X, W, C1 = NULL, C1.map = NULL, C2 = NULL, refit_W = FALSE, refi C2 <- if (is.matrix(C2) | is.null(C2)) C2 else as.matrix(C2) flog.info("Validating input...") - tca.validate_input(X, W, C1, C1.map, C2, refit_W, refit_W.features, refit_W.sparsity, refit_W.sd_threshold, tau, parallel, num_cores, max_iters, log_file, debug) + tca.validate_input(X, W, C1, C1.map, C2, refit_W, refit_W.features, refit_W.sparsity, refit_W.sd_threshold, tau, constrain_mu, parallel, num_cores, max_iters, log_file, debug) if (is.null(C1)) C1 <- matrix(0, nrow=ncol(X), ncol=0) if (is.null(C2)) C2 <- matrix(0, nrow=ncol(X), ncol=0) @@ -77,27 +82,29 @@ tca <- function(X, W, C1 = NULL, C1.map = NULL, C2 = NULL, refit_W = FALSE, refi flog.info("Starting re-estimation of W...") if (is.null(refit_W.features)){ flog.info("Performing feature selection using refactor...") - #ref <- refactor(X, ncol(W), sparsity = refit_W.sparsity, C = if (ncol(cbind(C1,C2))) cbind(C1,C2) else NULL, sd_threshold = refit_W.sd_threshold, rand_svd = config$rand_svd, log_file = NULL) ref <- refactor.run(X = X, k = ncol(W), sparsity = refit_W.sparsity, C = if (ncol(cbind(C1,C2))) cbind(C1,C2) else NULL, C.remove = FALSE, sd_threshold = refit_W.sd_threshold, num_comp = NULL, rand_svd = config$rand_svd, log_file = NULL, logger_on = FALSE, verbose = verbose) refit_W.features <- ref$ranked_list[1:refit_W.sparsity] } X_sub <- subset(X, subset = rownames(X) %in% refit_W.features) flog.info("Fitting the TCA model using the selected features for re-estimating W...") - mdl0 <- tca.fit(X = t(X_sub), W = W, C1 = C1, C1.map = C1.map, C2 = C2, refit_W = TRUE, tau = tau, parallel = parallel, num_cores = num_cores, max_iters = max_iters) + mdl0 <- tca.fit(X = t(X_sub), W = W, C1 = C1, C1.map = C1.map, C2 = C2, refit_W = TRUE, tau = tau, vars.mle = vars.mle, constrain_mu = constrain_mu, parallel = parallel, num_cores = num_cores, max_iters = max_iters) W <- mdl0[["W"]] msg <- "Fitting the TCA model given the updated W..." 
  }
 
   X <- t(X)
   flog.info(msg)
-  mdl <- tca.fit(X = X, W = W, C1 = C1, C1.map = C1.map, C2 = C2, refit_W = FALSE, tau = tau, parallel = parallel, num_cores = num_cores, max_iters = max_iters)
+  mdl <- tca.fit(X = X, W = W, C1 = C1, C1.map = C1.map, C2 = C2, refit_W = FALSE, tau = tau, vars.mle = vars.mle, constrain_mu = constrain_mu, parallel = parallel, num_cores = num_cores, max_iters = max_iters)
   W <- mdl[["W"]]
   mus_hat <- mdl[["mus_hat"]]
   sigmas_hat <- mdl[["sigmas_hat"]]
   tau_hat <- mdl[["tau_hat"]]
   deltas_hat <- mdl[["deltas_hat"]]
   gammas_hat <- mdl[["gammas_hat"]]
+  deltas_hat_pvals <- mdl[["deltas_hat_pvals"]]
+  gammas_hat_pvals <- mdl[["gammas_hat_pvals"]]
+  gammas_hat_pvals.joint <- mdl[["gammas_hat_pvals.joint"]]
   flog.info("Finished tca.")
-  return( list("W" = W, "mus_hat" = mus_hat, "sigmas_hat" = sigmas_hat, "tau_hat" = tau_hat, "deltas_hat" = deltas_hat, "gammas_hat" = gammas_hat, "C1" = C1, "C2" = C2) )
+  return (list("W" = W, "mus_hat" = mus_hat, "sigmas_hat" = sigmas_hat, "tau_hat" = tau_hat, "deltas_hat" = deltas_hat, "gammas_hat" = gammas_hat, "deltas_hat_pvals" = deltas_hat_pvals, "gammas_hat_pvals" = gammas_hat_pvals, "gammas_hat_pvals.joint" = gammas_hat_pvals.joint, "C1" = C1, "C2" = C2) )
 
 }
 
@@ -107,8 +114,8 @@ tca <- function(X, W, C1 = NULL, C1.map = NULL, C2 = NULL, refit_W = FALSE, refi
 #'
 #' @param tca.mdl The value returned by applying the function \code{tca} to some data matrix \code{X}.
 #' @param features A vector with the identifiers of the features to extract (as they appear in the rows of \code{X}).
-#' @param log_file A path to an output log file. Note that if the file \code{log_file} already exists then logs will be appended to the end of the file. Set \code{log_file} to \code{NULL} to prevent output from being saved into a file; note that if \code{verbose == TRUE} then no output file will be generated regardless of the value of \code{log_file}.
-#' @param debug A logical value indicating whether to set the logger to a more detailed debug level; please set \code{debug} to \code{TRUE} before reporting issues.
+#' @param log_file A path to an output log file. Note that if the file \code{log_file} already exists then logs will be appended to the end of the file. Set \code{log_file} to \code{NULL} to prevent output from being saved into a file; note that if \code{verbose == FALSE} then no output file will be generated regardless of the value of \code{log_file}.
+#' @param debug A logical value indicating whether to set the logger to a more detailed debug level; set \code{debug} to \code{TRUE} before reporting issues.
 #' @param verbose A logical value indicating whether to print logs.
 #'
 #' @importFrom futile.logger flog.info
@@ -122,14 +129,18 @@ tca <- function(X, W, C1 = NULL, C1.map = NULL, C2 = NULL, refit_W = FALSE, refi
 #' \item{tau_hat}{Equals \code{tca.mdl$tau_hat}}
 #' \item{gammas_hat}{A \code{q} by \code{k*p1} matrix which is a subset of the matrix \code{tca.mdl$gammas_hat}, where \code{q} is the number of features in the argument \code{features}.}
 #' \item{deltas_hat}{A \code{q} by \code{p2} matrix which is a subset of the matrix \code{tca.mdl$deltas_hat}, where \code{q} is the number of features in the argument \code{features}.}
+#' \item{gammas_hat_pvals}{A \code{q} by \code{k*p1} matrix which is a subset of the matrix \code{tca.mdl$gammas_hat_pvals}, where \code{q} is the number of features in the argument \code{features}. Note that if \code{tca.mdl$gammas_hat_pvals == NULL} then \code{gammas_hat_pvals} is also set to NULL.}
+#' \item{gammas_hat_pvals.joint}{A \code{q} by \code{p1} matrix which is a subset of the matrix \code{tca.mdl$gammas_hat_pvals.joint}, where \code{q} is the number of features in the argument \code{features}. Note that if \code{tca.mdl$gammas_hat_pvals.joint == NULL} then \code{gammas_hat_pvals.joint} is also set to NULL.}
+#' \item{deltas_hat_pvals}{A \code{q} by \code{p2} matrix which is a subset of the matrix \code{tca.mdl$deltas_hat_pvals}, where \code{q} is the number of features in the argument \code{features}. Note that if \code{tca.mdl$deltas_hat_pvals == NULL} then \code{deltas_hat_pvals} is also set to NULL.}
 #'
 #' @examples
 #' data <- test_data(50, 20, 3, 0, 0, 0.01)
 #' tca.mdl <- tca(X = data$X, W = data$W)
 #' tca.mdl.subset <- tcasub(tca.mdl, rownames(data$X)[1:10])
 #' y <- matrix(rexp(50, rate=.1), ncol=1)
+#' rownames(y) <- rownames(data$W)
 #' # run tcareg test with an outcome y:
-#' res <- tcareg(data$X[1:10,], tca.mdl.subset, y, test = "joint", save_results = FALSE)
+#' res <- tcareg(data$X[1:10,], tca.mdl.subset, y, test = "joint")
 #'
 #' @export tcasub
 tcasub <- function(tca.mdl, features, log_file = "TCA.log", debug = FALSE, verbose = TRUE){
@@ -138,18 +149,16 @@ tcasub <- function(tca.mdl, features, log_file = "TCA.log", debug = FALSE, verbo
   flog.info("Starting tcasub...")
   tcasub.validate_input(tca.mdl, features, log_file, debug)
   flog.info("Finished tcasub.")
-  if (length(features) == 1){
-    mus_hat <- t(as.matrix(tca.mdl[["mus_hat"]][features,]))
-    sigmas_hat <- t(as.matrix(tca.mdl[["sigmas_hat"]][features,]))
-    deltas_hat <- t(as.matrix(tca.mdl[["deltas_hat"]][features,]))
-    gammas_hat <- t(as.matrix(tca.mdl[["gammas_hat"]][features,]))
-  }else{
-    mus_hat <- as.matrix(tca.mdl[["mus_hat"]][features,])
-    sigmas_hat <- as.matrix(tca.mdl[["sigmas_hat"]][features,])
-    deltas_hat <- as.matrix(tca.mdl[["deltas_hat"]][features,])
-    gammas_hat <- as.matrix(tca.mdl[["gammas_hat"]][features,])
-  }
-  return( list("W" = tca.mdl[["W"]], "mus_hat" = mus_hat, "sigmas_hat" = sigmas_hat, "tau_hat" = tca.mdl[["tau_hat"]], "deltas_hat" = deltas_hat, "gammas_hat" = gammas_hat, "C1" = as.matrix(tca.mdl[["C1"]]), "C2" = as.matrix(tca.mdl[["C2"]])) )
+
+  mus_hat <- as.matrix(tca.mdl[["mus_hat"]][features,,drop=F])
+  sigmas_hat <- as.matrix(tca.mdl[["sigmas_hat"]][features,,drop=F])
+  deltas_hat <- as.matrix(tca.mdl[["deltas_hat"]][features,,drop=F])
+  gammas_hat <- as.matrix(tca.mdl[["gammas_hat"]][features,,drop=F])
+  deltas_hat_pvals <- if(is.null(tca.mdl[["deltas_hat_pvals"]])) NULL else as.matrix(tca.mdl[["deltas_hat_pvals"]][features,,drop=F])
+  gammas_hat_pvals <- if(is.null(tca.mdl[["gammas_hat_pvals"]])) NULL else as.matrix(tca.mdl[["gammas_hat_pvals"]][features,,drop=F])
+  gammas_hat_pvals.joint <- if (is.null(tca.mdl[["gammas_hat_pvals.joint"]])) NULL else as.matrix(tca.mdl[["gammas_hat_pvals.joint"]][features,,drop=F])
+
+  return( list("W" = tca.mdl[["W"]], "mus_hat" = mus_hat, "sigmas_hat" = sigmas_hat, "tau_hat" = tca.mdl[["tau_hat"]], "deltas_hat" = deltas_hat, "gammas_hat" = gammas_hat, "deltas_hat_pvals" = deltas_hat_pvals, "gammas_hat_pvals" = gammas_hat_pvals, "gammas_hat_pvals.joint" = gammas_hat_pvals.joint, "C1" = as.matrix(tca.mdl[["C1"]]), "C2" = as.matrix(tca.mdl[["C2"]])) )
 }
 
@@ -158,15 +167,16 @@ tcasub <- function(tca.mdl, features, log_file = "TCA.log", debug = FALSE, verbo
 #'
 #' @description Estimates 3-dimensional signals (features by observations by sources) from input of mixtures (features by observations), under the assumption of the TCA model that each observation is a mixture of unique source-specific values (in each feature in the data). For example, in the context of tissue-level bulk DNA methylation data coming from a mixture of cell types (i.e. the input is methylation sites by individuals), \code{tensor} allows estimating a tensor of cell-type-specific levels for each individual in each methylation site (i.e. a tensor of methylation sites by individuals by cell types).
 #'
-#' @param X An \code{m} by \code{n} matrix of measurements of \code{m} features for \code{n} observations. Each column in \code{X} is assumed to be a mixture of \code{k} different sources. Note that \code{X} must include row names and column names and that NA values are currently not supported. \code{X} should not include features that are constant across all observations.
+#' @param X An \code{m} by \code{n} matrix of measurements of \code{m} features for \code{n} observations. Each column in \code{X} is assumed to be a mixture of \code{k} sources. Note that \code{X} must include row names and column names and that NA values are currently not supported. \code{X} should not include features that are constant across all observations.
 #' @param tca.mdl The value returned by applying the function \code{tca} to \code{X}.
+#' @param scale A logical value indicating whether to divide the estimate of each entry in the tensor by its estimated standard deviation.
 #' @param parallel A logical value indicating whether to use parallel computing (possible when using a multi-core machine).
 #' @param num_cores A numeric value indicating the number of cores to use (activated only if \code{parallel == TRUE}). If \code{num_cores == NULL} then all available cores except for one will be used.
-#' @param log_file A path to an output log file. Note that if the file \code{log_file} already exists then logs will be appended to the end of the file. Set \code{log_file} to \code{NULL} to prevent output from being saved into a file; note that if \code{verbose == TRUE} then no output file will be generated regardless of the value of \code{log_file}.
-#' @param debug A logical value indicating whether to set the logger to a more detailed debug level; please set \code{debug} to \code{TRUE} before reporting issues.
+#' @param log_file A path to an output log file. Note that if the file \code{log_file} already exists then logs will be appended to the end of the file. Set \code{log_file} to \code{NULL} to prevent output from being saved into a file; note that if \code{verbose == FALSE} then no output file will be generated regardless of the value of \code{log_file}.
+#' @param debug A logical value indicating whether to set the logger to a more detailed debug level; set \code{debug} to \code{TRUE} before reporting issues.
 #' @param verbose A logical value indicating whether to print logs.
 #'
-#' @details See \link{tca} for notations and details about the TCA model. Given estimates of the parameters of the model (given by \link{tca}), \code{tensor} uses the conditional distribution \eqn{Z_{hj}^i|X_{ji}} for estimating the \eqn{k} source-specific levels of each observation \eqn{i} in each feature \eqn{j}.
+#' @details See \link{tca} for notations and details about the TCA model. Given estimates of the parameters of the model (given by \link{tca}), \code{tensor} uses the conditional distribution \eqn{Z_{hj}^i|X_{ji}=x_{ji}} for estimating the \eqn{k} source-specific levels of each observation \eqn{i} in each feature \eqn{j}.
 #'
 #' @return A list with the estimated source-specific values. The first element in the list is an \code{m} by \code{n} matrix (features by observations) corresponding to the estimated values coming from the first source, the second element in the list is another \code{m} by \code{n} matrix (features by observations) corresponding to the estimated values coming from the second source and so on.
 #'
@@ -175,15 +185,18 @@ tcasub <- function(tca.mdl, features, log_file = "TCA.log", debug = FALSE, verbo
 #' tca.mdl <- tca(X = data$X, W = data$W, C1 = data$C1, C2 = data$C2)
 #' Z_hat <- tensor(data$X, tca.mdl)
 #'
-#' @references Rahmani E, Schweiger R, Rhead B, Criswell LA, Barcellos LF, Eskin E, Rosset S, Sankararaman S, Halperin E. Cell-type-specific resolution epigenetics without the need for cell sorting or single-cell biology. Nature Communications 2018.
+#' @references Rahmani E, Schweiger R, Rhead B, Criswell LA, Barcellos LF, Eskin E, Rosset S, Sankararaman S, Halperin E. Cell-type-specific resolution epigenetics without the need for cell sorting or single-cell biology. Nature Communications 2019.
 #'
 #' @export tensor
-tensor <- function(X, tca.mdl, parallel = FALSE, num_cores = NULL, log_file = "TCA.log", debug = FALSE, verbose = TRUE){
+tensor <- function(X, tca.mdl, scale = FALSE, parallel = FALSE, num_cores = NULL, log_file = "TCA.log", debug = FALSE, verbose = TRUE){
 
   start_logger(log_file, debug, verbose)
 
   config <- config::get(file = system.file("extdata", "config.yml", package = "TCA"), use_parent = FALSE)
 
+  flog.info("Validating input...")
+  tensor.validate_input(X, scale, parallel, num_cores, log_file, debug)
+
   # progress bar options
   op <- if (verbose) pboptions(nout = config$nout, type = config[["type"]]) else pboptions(nout = config$nout, type = "none")
 
@@ -200,39 +213,43 @@ tensor <- function(X, tca.mdl, parallel = FALSE, num_cores = NULL, log_file = "T
   C1 <- tca.mdl[["C1"]]
   C2 <- tca.mdl[["C2"]]
 
-  return( estimate_Z(t(X), W, mus_hat, sigmas_hat, tau_hat, C2, deltas_hat, C1, gammas_hat, parallel, num_cores) )
+  return( estimate_Z(t(X), W, mus_hat, sigmas_hat, tau_hat, C2, deltas_hat, C1, gammas_hat, scale, parallel, num_cores) )
 
 }
 
 #' @title Fitting a TCA regression model
 #'
-#' @description TCA regression allows to test for several types of statistical relations between source-specific values and an outcome of interest. For example, in the context of tissue-level bulk DNA methylation data coming from a mixture of cell types (i.e. the input is methylation sites by individuals), \code{tcareg} allows to test for cell-type-specific effects of methylation on an outcome of interest.
+#' @description TCA regression allows testing, under several types of statistical tests, the effects of source-specific values on an outcome of interest (or on mediating components thereof). For example, in the context of tissue-level bulk DNA methylation data coming from a mixture of cell types (i.e. the input is methylation sites by individuals), \code{tcareg} allows testing for cell-type-specific effects of methylation on outcomes of interest (or on mediating components thereof).
 #'
-#' @param X An \code{m} by \code{n} matrix of measurements of \code{m} features for \code{n} observations. 
Each column in \code{X} is assumed to be a mixture of \code{k} different sources. Note that \code{X} must include row names and column names and that NA values are currently not supported. \code{X} should not include features that are constant across all observations. -#' @param tca.mdl The value returned by applying the function \code{tca} to \code{X}. +#' @param X An \code{m} by \code{n} matrix of measurements of \code{m} features for \code{n} observations. Each column in \code{X} is assumed to be a mixture of \code{k} sources. Note that \code{X} must include row names and column names and that NA values are currently not supported. \code{X} should not include features that are constant across all observations. +#' @param tca.mdl The value returned by applying \link{tca} to \code{X}. #' @param y An \code{n} by 1 matrix of an outcome of interest for each of the \code{n} observations in \code{X}. Note that \code{y} must include row names and column names and that NA values are currently not supported. #' @param C3 An \code{n} by \code{p3} design matrix of covariates that may affect \code{y}. Note that \code{C3} must include row names and column names and should not include an intercept term. NA values are currently not supported. #' @param test A character vector with the type of test to perform on each of the features in \code{X}; one of the following options: \code{'marginal'}, \code{'marginal_conditional'}, \code{'joint'}, \code{'single_effect'}, or \code{'custom'}. Setting \code{'marginal'} or \code{'marginal_conditional'} corresponds to testing each feature in \code{X} for a statistical relation between \code{y} and each of the \code{k} sources separately; for any particular source under test, the \code{marginal_conditional} option further accounts for possible effects of the rest of the \code{k-1} sources (\code{'marginal'} will therefore tend to be more powerful in discovering truly related features, but at the same time more prone to falsely tagging the correct related sources if sources are highly correlated). Setting \code{'joint'} or \code{'single_effect'} corresponds to testing each feature for an overall statistical relation with \code{y}, while modeling source-specific effects; the latter option further assumes that the source-specific effects are the same within each feature (\code{'single_effect'} means only one degree of freedom and will therefore be more powerful when the assumption of a single effect within a feature holds). Finally, \code{'custom'} corresponds to testing each feature in \code{X} for a statistical relation with \code{y} under a user-specified model (alternative model) with respect to a null model (null model); for example, for testing for relation of the combined (potentially different) effects of features 1 and 2 while accounting for the (potentially different) effects of 3 and 4, set the null model to be sources 3, 4 and the alternative model to be sources 1, 2, 3, 4. Indicating that \code{null_model} assumes no effect for any of the sources can be done by setting it to \code{NULL}. #' @param null_model A vector with a subset of the names of the sources in \code{tca.mdl$W} to be used as a null model (activated only if \code{test == 'custom'}). Note that the null model must be nested within the alternative model; set \code{null_model} to be \code{NULL} for indicating no effect for any of the sources under the null model. 
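To illustrate how \code{null_model} above and \code{alternative_model} (next) interact, a sketch of a custom test mirroring the examples further below (source names "1", "2", "3" as generated by the package's test_data(); illustrative values only):

    data <- test_data(50, 20, 3, 1, 1, 0.01)
    tca.mdl <- tca(X = data$X, W = data$W, C1 = data$C1, C2 = data$C2)
    y <- matrix(rexp(50, rate = .1), ncol = 1)
    rownames(y) <- rownames(data$W)
    # null model ("3") nested within the alternative model ("1","2","3"):
    res <- tcareg(data$X, tca.mdl, y, test = "custom",
                  null_model = c("3"),
                  alternative_model = c("1", "2", "3"))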
 #' @param alternative_model A vector with a subset (or all) of the names of the sources in \code{tca.mdl$W} to be used as an alternative model (activated only if \code{test == 'custom'}).
-#' @param save_results A logical value indicating whether to save the returned results in a file. If \code{TRUE} and \code{test == 'marginal'} or \code{test == 'marginal_conditional'} then \code{k} files will be saved (one for the results of each source).
+#' @param save_results A logical value indicating whether to save the returned results in a file. If \code{test == 'marginal'} or (\code{fast_mode == TRUE} and \code{test == 'marginal_conditional'}) then \code{k} files will be saved (one for the results of each source).
+#' @param fast_mode A logical value indicating whether to use a fast version of TCA regression, in which source-specific values are first estimated using the \code{tensor} function and then tested under a standard regression framework (see more details below).
 #' @param output Prefix for output files (activated only if \code{save_results == TRUE}).
-#' @param sort_results A logical value indicating whether to sort the results by their p-value (i.e. features with lower p-value will appear first in the results).
+#' @param sort_results A logical value indicating whether to sort the results by their p-value (i.e. features with lower p-value will appear first in the results). This option is not available if \code{fast_mode == TRUE} and \code{test == "marginal_conditional"}.
 #' @param parallel A logical value indicating whether to use parallel computing (possible when using a multi-core machine).
 #' @param num_cores A numeric value indicating the number of cores to use (activated only if \code{parallel == TRUE}). If \code{num_cores == NULL} then all available cores except for one will be used.
-#' @param log_file A path to an output log file. Note that if the file \code{log_file} already exists then logs will be appended to the end of the file. Set \code{log_file} to \code{NULL} to prevent output from being saved into a file; note that if \code{verbose == TRUE} then no output file will be generated regardless of the value of \code{log_file}.
-#' @param features_metadata A path to a csv file containing metadata about the features in \code{X} that will be added to the output files (activated only if \code{save_results == TRUE}). Each row in the metadata file should correspond to one feature (with the row name being the feature identifier, as it appears in the rows of \code{X}) and each column should correspond to one metadata descriptor (with an appropriate column name). Features that do not exist in \code{X} will be ignored and features in \code{X} with missing metadata information will show missing values.
-#' @param debug A logical value indicating whether to set the logger to a more detailed debug level; please set \code{debug} to \code{TRUE} before reporting issues.
+#' @param log_file A path to an output log file. Note that if the file \code{log_file} already exists then logs will be appended to the end of the file. Set \code{log_file} to \code{NULL} to prevent output from being saved into a file; note that if \code{verbose == FALSE} then no output file will be generated regardless of the value of \code{log_file}.
+#' @param features_metadata A path to a csv file containing metadata about the features in \code{X} that will be added to the output files (activated only if \code{save_results == TRUE}). Each row in the metadata file should correspond to one feature in \code{X} (with the row name being the feature identifier, as it appears in the rows of \code{X}) and each column should correspond to one metadata descriptor (with an appropriate column name). Features that do not exist in \code{X} will be ignored and features in \code{X} with missing metadata information will show missing values.
+#' @param debug A logical value indicating whether to set the logger to a more detailed debug level; set \code{debug} to \code{TRUE} before reporting issues.
 #' @param verbose A logical value indicating whether to print logs.
 #'
-#' @details TCA models \eqn{Z_{hj}^i} as the source-specific value of observation \eqn{i} in feature \eqn{j} coming from source \eqn{h} (see \link{tca} for more details). A TCA regression model tests an outcome \eqn{Y} for a linear statistical relation with the source-specific values of a feature \eqn{j} by assuming: \deqn{Y_i = \sum_{h=1}^k \beta_{hj} Z_{hj}^i + e_i} where \eqn{e_i \sim N(0,\phi^2)}. In practice, \code{tcareg} fits this model using the conditional distribution \eqn{Y|X}, which, effectively, integrates over the latent \eqn{Z_{hj}^i} parameters. Statistical significance is then calculated using a likelihood ratio test (LRT). Note that the null and alternative models will be set automatically, except when \code{test == 'custom'}, in which case they will be set according to the user-specified null and alternative hypotheses.
+#' @details TCA models \eqn{Z_{hj}^i} as the source-specific value of observation \eqn{i} in feature \eqn{j} coming from source \eqn{h} (see \link{tca} for more details). A TCA regression model tests an outcome \eqn{Y} for a linear statistical relation with the source-specific values of a feature \eqn{j} by assuming: \deqn{Y_i = \alpha_{j,0} + \sum_{h=1}^k\beta_{hj} Z_{hj}^i + c_i^{(3)}\alpha_{j} + e_i} where \eqn{\alpha_{j,0}} is an intercept term, \eqn{\beta_{hj}} is the effect of source \eqn{h}, \eqn{c_i^{(3)}} and \eqn{\alpha_j} correspond to the \eqn{p_3} covariate values of observation \eqn{i} (i.e. a row vector from \code{C3}) and their effect sizes, respectively, and \eqn{e_i \sim N(0,\phi^2)}. In practice, if \code{fast_mode == FALSE} then \code{tcareg} fits this model using the conditional distribution \eqn{Y|X}, which, effectively, integrates over the random \eqn{Z_{hj}^i}. Statistical significance is then calculated using a likelihood ratio test (LRT).
+#' Alternatively, if \code{fast_mode == TRUE} then the above model is fitted by first learning point estimates for \eqn{Z_{hj}^i} using the \link{tensor} function and then assessing statistical significance using T-tests and partial F-tests under a standard regression framework. This alternative provides a substantial boost in speed.
 #'
-#' Under the TCA regression model, several statistical tests can be performed by setting the argument \code{test} according to one of the following options.
+#' Note that the null and alternative models will be set automatically, except when \code{test == 'custom'}, in which case they will be set according to the user-specified null and alternative hypotheses.
+#'
+#' Under the TCA regression model, several statistical tests can be performed by setting the argument \code{test} according to one of the following options:
 #'
 #' 1. If \code{test == 'marginal'}, \code{tcareg} will perform the following for each source \eqn{l}. For each feature \eqn{j}, \eqn{\beta_{lj}} will be estimated and tested for a non-zero effect, while assuming \eqn{\beta_{hj}=0} for all other sources \eqn{h\neq l}.
 #'
-#' 2. If \code{test == 'marginal_conditional'}, \code{tcareg} will perform the following for each source \eqn{l}. For each feature \eqn{j}, \eqn{\beta_{lj}} will be estimated and tested for a non-zero effect, while also estimating the effect sizes \eqn{\beta_{hj}} for all other sources \eqn{h\neq l}.
+#' 2. If \code{test == 'marginal_conditional'}, \code{tcareg} will perform the following for each source \eqn{l}. For each feature \eqn{j}, \eqn{\beta_{lj}} will be estimated and tested for a non-zero effect, while also estimating the effect sizes \eqn{\beta_{hj}} for all other sources \eqn{h\neq l} (thus accounting for covariances between the estimated effects of different sources).
 #'
 #' 3. If \code{test == 'joint'}, \code{tcareg} will estimate for each feature \eqn{j} the effect sizes of all \eqn{k} sources \eqn{\beta_{1j},\ldots,\beta_{kj}} and then test the set of \eqn{k} estimates of each feature \eqn{j} for a joint effect.
 #'
@@ -240,19 +257,17 @@ tensor <- function(X, tca.mdl, parallel = FALSE, num_cores = NULL, log_file = "T
 #'
 #' 5. If \code{test == 'custom'}, \code{tcareg} will estimate for each feature \eqn{j} the effect sizes of a predefined set of sources (defined by a user-specified alternative model) and then test their estimates for a joint effect, while accounting for a nested predefined set of sources (defined by a user-specified null model).
 #'
-#' @return A list with the results of applying the TCA regression model to each of the features in the data. If \code{test == 'marginal'} or \code{test == 'marginal_conditional'} then a list of \code{k} such lists of results are returned, one for the results of each source.
+#' @return A list with the results of applying the TCA regression model to each of the features in \code{X}. If \code{test == 'marginal'} or (\code{test == 'marginal_conditional'} and \code{fast_mode == FALSE}) then a list of \code{k} such lists of results is returned, one for the results of each source.
 #' \item{phi}{An estimate of the standard deviation of the i.i.d. component of variation in the TCA regression model.}
-#' \item{beta}{A matrix of effect size estimates for the source-specific effects, such that each row corresponds to the estimated effect sizes of one feature. The number of columns corresponds to the number of estimated effects (e.g., if \code{test} is set to \code{marginal} then \code{beta} will include a single column, if \code{test} is set to \code{joint} then \code{beta} will include \code{k} columns and so on).}
-#' \item{intercept}{An \code{m} by \code{1} matrix of estimates for the intercept of each feature.}
-#' \item{alpha}{An \code{m} by \code{p3} matrix of effect size estimates for the \code{p3} covariates in \code{C3}, such that each row corresponds to the estimated effect sizes of one feature.}
-#' \item{null_ll}{An \code{m} by \code{1} matrix of the log-likelihood of the model under the null hypothesis.}
+#' \item{beta}{A matrix of effect size estimates for the source-specific effects, such that each row corresponds to the estimated effect sizes of one feature in \code{X}. The number of columns corresponds to the number of estimated effects (e.g., if \code{test} is set to \code{marginal} then \code{beta} will include a single column, if \code{test} is set to \code{joint} then \code{beta} will include \code{k} columns etc.).}
+#' \item{intercept}{An \code{m} by \code{1} matrix of estimates for the intercept term of each feature.}
+#' \item{alpha}{An \code{m} by \code{p3} matrix of effect size estimates for the \code{p3} covariates in \code{C3}, such that each row corresponds to the estimated effect sizes of one feature in \code{X}.}
+#' \item{null_ll}{An \code{m} by \code{1} matrix of the log-likelihood of the model under the null hypothesis. Returned only if \code{fast_mode == FALSE}.}
 #' \item{alternative_ll}{An \code{m} by \code{1} matrix of the log-likelihood of the model under the alternative hypothesis.}
-#' \item{stats}{An \code{m} by \code{1} matrix of the LRT statistic for each feature in the data.}
-#' \item{df}{The degrees of freedom for deriving p-values using LRT.}
-#' \item{pvals}{An \code{m} by \code{1} matrix of the p-value for each feature in the data.}
-#' \item{qvals}{An \code{m} by \code{1} matrix of the q-value (FDR-adjusted p-values) for each feature in the data.}
-#'
-#' @section Note: The function \code{tcareg} may require a long running time when the input matrix \code{X} is very large; to alleviate this, it is strongly advised to use the \code{parallel} argument, given that a multi-core machine is available.
+#' \item{stats}{An \code{m} by \code{k} matrix of T statistics for each source in each feature in \code{X} if \code{test == "marginal_conditional"} and \code{fast_mode == TRUE}; otherwise, an \code{m} by \code{1} matrix of a (partial) F statistic (if \code{fast_mode == TRUE}) or a likelihood-ratio test statistic (if \code{fast_mode == FALSE}) for each feature in \code{X}.}
+#' \item{df}{The degrees of freedom for deriving p-values.}
+#' \item{pvals}{An \code{m} by \code{k} matrix of p-values for each source in each feature in \code{X} if \code{test == "marginal_conditional"} and \code{fast_mode == TRUE}; otherwise, an \code{m} by \code{1} matrix of the p-value for each feature in \code{X}.}
+#' \item{qvals}{An \code{m} by \code{k} matrix of q-values (FDR-adjusted p-values) for each source in each feature in \code{X} if \code{test == "marginal_conditional"} and \code{fast_mode == TRUE}; otherwise, an \code{m} by \code{1} matrix of the q-value for each feature in \code{X}. 
Note that if \code{test == "marginal_conditional"} and \code{fast_mode == TRUE} then q-values are calculated for each source separately.} #' #' @examples #' n <- 50 @@ -263,20 +278,23 @@ tensor <- function(X, tca.mdl, parallel = FALSE, num_cores = NULL, log_file = "T #' data <- test_data(n, m, k, p1, p2, 0.01) #' tca.mdl <- tca(X = data$X, W = data$W, C1 = data$C1, C2 = data$C2) #' y <- matrix(rexp(n, rate=.1), ncol=1) +#' rownames(y) <- rownames(data$W) +#' # marginal conditional test: +#' res0 <- tcareg(data$X, tca.mdl, y) #' # joint test: -#' res1 <- tcareg(data$X, tca.mdl, y, test = "joint", save_results = FALSE) +#' res1 <- tcareg(data$X, tca.mdl, y, test = "joint") #' # custom test, testing for a joint effect of sources 1,2 while accounting for source 3 #' res2 <- tcareg(data$X, tca.mdl, y, test = "custom", null_model = c("3"), -#' alternative_model = c("1","2","3"), save_results = FALSE) +#' alternative_model = c("1","2","3")) #' # custom test, testing for a joint effect of sources 1,2 assuming no effects under the null #' res3 <- tcareg(data$X, tca.mdl, y, test = "custom", null_model = NULL, -#' alternative_model = c("1","2"), save_results = FALSE) +#' alternative_model = c("1","2")) #' -#' @references Rahmani E, Schweiger R, Rhead B, Criswell LA, Barcellos LF, Eskin E, Rosset S, Sankararaman S, Halperin E. Cell-type-specific resolution epigenetics without the need for cell sorting or single-cell biology. Nature Communications 2018. +#' @references Rahmani E, Schweiger R, Rhead B, Criswell LA, Barcellos LF, Eskin E, Rosset S, Sankararaman S, Halperin E. Cell-type-specific resolution epigenetics without the need for cell sorting or single-cell biology. Nature Communications 2019. #' #' @export tcareg #' -tcareg <- function(X, tca.mdl, y, C3 = NULL, test = "marginal", null_model = NULL, alternative_model = NULL, save_results = TRUE, output = "TCA", sort_results = TRUE, parallel = FALSE, num_cores = NULL, log_file = "TCA.log", features_metadata = NULL, debug = FALSE, verbose = TRUE){ +tcareg <- function(X, tca.mdl, y, C3 = NULL, test = "marginal_conditional", null_model = NULL, alternative_model = NULL, save_results = FALSE, fast_mode = TRUE, output = "TCA", sort_results = FALSE, parallel = FALSE, num_cores = NULL, log_file = "TCA.log", features_metadata = NULL, debug = FALSE, verbose = TRUE){ start_logger(log_file, debug, verbose) @@ -293,7 +311,7 @@ tcareg <- function(X, tca.mdl, y, C3 = NULL, test = "marginal", null_model = NUL W <- tca.mdl[['W']] flog.info("Validating input...") - tcareg.validate_input(X, W, y, C3, test, null_model, alternative_model, save_results, output, sort_results, parallel, num_cores, log_file, features_metadata, debug) + tcareg.validate_input(X, W, y, C3, test, null_model, alternative_model, save_results, fast_mode, output, sort_results, parallel, num_cores, log_file, features_metadata, debug) flog.info("Preparing data...") X <- t(X) @@ -305,17 +323,16 @@ tcareg <- function(X, tca.mdl, y, C3 = NULL, test = "marginal", null_model = NUL C1 <- tca.mdl[["C1"]] gammas_hat <- tca.mdl[["gammas_hat"]] C3 <- if(!is.null(C3)) C3 else matrix(0, nrow=nrow(X), ncol=0) - #null_model <- match(null_model,colnames(W)) - null_model <- if (is.null(null_model)) NULL else match(null_model,colnames(W)) - alternative_model <- match(alternative_model,colnames(W)) + null_model <- if (is.null(null_model)) NULL else sort(match(null_model,colnames(W))) + alternative_model <- sort(match(alternative_model,colnames(W))) k <- ncol(W) flog.info("Running TCA regression...") - res <- 
tcareg.fit(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, test, null_model, alternative_model, parallel, num_cores) + res <- tcareg.fit(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, test, null_model, alternative_model, fast_mode, parallel, num_cores) if (save_results){ flog.info("Saving results...") - save_association_results(res, output, test, alternative_model, colnames(X), colnames(W), colnames(C3), sort_results, features_metadata) + save_association_results(res, output, test, fast_mode, alternative_model, colnames(X), colnames(W), colnames(C3), sort_results, features_metadata) } flog.info("Finished tcareg.") @@ -328,16 +345,16 @@ tcareg <- function(X, tca.mdl, y, C3 = NULL, test = "marginal", null_model = NUL #' #' @description Performs unsupervised feature selection followed by principal component analysis (PCA) under a row-sparse model using the ReFACTor algorithm. For example, in the context of tissue-level bulk DNA methylation data coming from a mixture of cell types (i.e. the input is methylation sites by individuals), \code{refactor} allows to capture the variation in cell-type composition, which was shown to be a dominant sparse signal in methylation data. #' -#' @param X An \code{m} by \code{n} matrix of measurements of \code{m} features for \code{n} observations. Each column in \code{X} is assumed to be a mixture of \code{k} different sources. Note that \code{X} must include row names and column names and that NA values are currently not supported. \code{X} should not include features that are constant across all observations. -#' @param k A numeric value indicating the dimension of the signal in the data (i.e. the number of sources). -#' @param sparsity A numeric value indicating the sparsity of the signal in the data (the number of signal rows). -#' @param C An \code{n} by \code{p} design matrix of covariates that will be accounted for in the feature selection step. Note that \code{C} must include row names and column names and that NA values are currently not supported; ; set \code{C} to be \code{NULL} if there are no such covariates. -#' @param C.remove A logical value indicating whether the covariates in X should be accounted for not only in the feature selection step, but also in the final calculation of the principal components (i.e. if \code{C.remove == TRUE} then the selected features will be adjusted for the covariates in \code{C} prior to calculating principal components). Note that setting \code{C.remove} to be \code{TRUE} is desired when ReFACTor is intended to be used for correction in downstream analysis, whereas setting \code{C.remove} to be \code{FALSE} is desired when ReFACTor is merely used for capturing the sparse signals in the data (i.e. regardless of correction). +#' @param X An \code{m} by \code{n} matrix of measurements of \code{m} features for \code{n} observations. Each column in \code{X} is assumed to be a mixture of \code{k} sources. Note that \code{X} must include row names and column names and that NA values are currently not supported. \code{X} should not include features that are constant across all observations. +#' @param k A numeric value indicating the dimension of the signal in \code{X} (i.e. the number of sources). +#' @param sparsity A numeric value indicating the sparsity of the signal in \code{X} (the number of signal rows). +#' @param C An \code{n} by \code{p} design matrix of covariates that will be accounted for in the feature selection step. 
An intercept term will be included automatically. Note that \code{C} must include row names and column names and that NA values are currently not supported; set \code{C} to be \code{NULL} if there are no such covariates. +#' @param C.remove A logical value indicating whether the covariates in \code{C} should be accounted for not only in the feature selection step, but also in the final calculation of the principal components (i.e. if \code{C.remove == TRUE} then the selected features will be adjusted for the covariates in \code{C} prior to calculating principal components). Note that setting \code{C.remove} to be \code{TRUE} is desired when ReFACTor is intended to be used for correction in downstream analysis, whereas setting \code{C.remove} to be \code{FALSE} is desired when ReFACTor is merely used for capturing the sparse signals in \code{X} (i.e. regardless of correction). #' @param sd_threshold A numeric value indicating a standard deviation threshold to be used for excluding low-variance features in \code{X} (i.e. features with standard deviation lower than \code{sd_threshold} will be excluded). Set \code{sd_threshold} to be \code{NULL} for turning off this filter. Note that removing features with very low variability tends to improve speed and performance. #' @param num_comp A numeric value indicating the number of ReFACTor components to return. #' @param rand_svd A logical value indicating whether to use random svd for estimating the low-rank structure of the data in the first step of the algorithm; random svd can result in a substantial speedup for large data. -#' @param log_file A path to an output log file. Note that if the file \code{log_file} already exists then logs will be appended to the end of the file. Set \code{log_file} to \code{NULL} to prevent output from being saved into a file; note that if \code{verbose == TRUE} then no output file will be generated regardless of the value of \code{log_file}. -#' @param debug A logical value indicating whether to set the logger to a more detailed debug level; please set \code{debug} to \code{TRUE} before reporting issues. +#' @param log_file A path to an output log file. Note that if the file \code{log_file} already exists then logs will be appended to the end of the file. Set \code{log_file} to \code{NULL} to prevent output from being saved into a file; note that if \code{verbose == FALSE} then no output file will be generated regardless of the value of \code{log_file}. +#' @param debug A logical value indicating whether to set the logger to a more detailed debug level; set \code{debug} to \code{TRUE} before reporting issues. #' @param verbose A logical value indicating whether to print logs. #' #' @importFrom rsvd rpca @@ -350,7 +367,7 @@ tcareg <- function(X, tca.mdl, y, C3 = NULL, test = "marginal", null_model = NUL #' @return A list with the estimated components of the ReFACTor model. #' \item{scores}{An \code{n} by \code{num_comp} matrix of the ReFACTor components (the projection scores).} #' \item{coeffs}{A \code{sparsity} by \code{num_comp} matrix of the coefficients of the ReFACTor components (the projection loadings).} -#' \item{ranked_list}{A vector with the features in the data, ranked by their scores in the feature selection step of the algorithm; the top scoring features (set according to the argument \code{sparsity}) are used for calculating the ReFACTor components. 
Note that features that were excluded according to \code{sd_threshold} will not appear in this \code{ranked_list}.} +#' \item{ranked_list}{A vector with the features in \code{X}, ranked by their scores in the feature selection step of the algorithm; the top scoring features (set according to the argument \code{sparsity}) are used for calculating the ReFACTor components. Note that features that were excluded according to \code{sd_threshold} will not appear in this \code{ranked_list}.} #' #' @section Note: For very large input matrices it is advised to use random svd for speeding up the feature selection step (see argument \code{rand_svd}). #' @@ -380,7 +397,7 @@ refactor <- function(X, k, sparsity = 500, C = NULL, C.remove = FALSE, sd_thresh #' @param p1 The number of covariates that affect the source-specific values to simulate. #' @param p2 The number of covariates that affect the mixture values to simulate. #' @param tau The variance of the i.i.d. component of variation to add on top of the simulated mixture values. -#' @param log_file A path to an output log file. Note that if the file \code{log_file} already exists then logs will be appended to the end of the file. Set \code{log_file} to \code{NULL} to prevent output from being saved into a file; note that if \code{verbose == TRUE} then no output file will be generated regardless of the value of \code{log_file}. +#' @param log_file A path to an output log file. Note that if the file \code{log_file} already exists then logs will be appended to the end of the file. Set \code{log_file} to \code{NULL} to prevent output from being saved into a file; note that if \code{verbose == FALSE} then no output file will be generated regardless of the value of \code{log_file}. #' @param verbose A logical value indicating whether to print logs. 
#' #' @importFrom futile.logger flog.info @@ -396,8 +413,8 @@ refactor <- function(X, k, sparsity = 500, C = NULL, C.remove = FALSE, sd_thresh #' \item{sigmas}{An \code{m} by \code{k} matrix of the standard variation of each of the \code{m} features for each of the \code{k} sources.} #' \item{C1}{ An \code{n} by \code{p1} design matrix of simulated covariates that affect the hidden source-specific values.} #' \item{C2}{ An \code{n} by \code{p2} design matrix of simulated covariates that affect the mixture.} -#' \item{gammas}{An \code{m} by \code{k*p1} matrix of the effects of the \code{p1} factors in \code{C1} on each of the \code{m} features in \code{X}, where the first \code{p1} columns are the source-specific effects of the \code{p1} factors on the first source, the following \code{p1} columns are the source-specific effects on the second source and so on.} -#' \item{deltas}{An \code{m} by \code{p2} matrix of the effects of the \code{p2} factors in \code{C2} on the mixture values of each of the \code{m} features in \code{X}.} +#' \item{gammas}{An \code{m} by \code{k*p1} matrix of the effects of the \code{p1} covariates in \code{C1} on each of the \code{m} features in \code{X}, where the first \code{p1} columns are the source-specific effects of the \code{p1} covariates on the first source, the following \code{p1} columns are the source-specific effects on the second source and so on.} +#' \item{deltas}{An \code{m} by \code{p2} matrix of the effects of the \code{p2} covariates in \code{C2} on the mixture values of each of the \code{m} features in \code{X}.} #' #' @examples #' data <- test_data(100, 50, 3, 2, 2, 0.01) diff --git a/R/model_fit.R b/R/model_fit.R index cbcbc21..d11b5de 100644 --- a/R/model_fit.R +++ b/R/model_fit.R @@ -1,6 +1,9 @@ # Fits the TCA model -tca.fit <- function(X, W, C1, C1.map, C2, refit_W, tau, parallel, num_cores, max_iters){ +#' @importFrom parallel clusterExport +#' @importFrom pbapply pblapply +#' @importFrom stats anova +tca.fit <- function(X, W, C1, C1.map, C2, refit_W, tau, vars.mle, constrain_mu, parallel, num_cores, max_iters){ flog.debug("Starting function 'tca.fit'...") @@ -19,6 +22,9 @@ tca.fit <- function(X, W, C1, C1.map, C2, refit_W, tau, parallel, num_cores, max deltas_hat <- init[["deltas_hat"]] gammas_hat <- init[["gammas_hat"]] tau_hat <- init[["tau_hat"]] + deltas_hat_pvals <- if(constrain_mu) NULL else init[["deltas_hat_pvals"]] + gammas_hat_pvals <- if(constrain_mu) NULL else init[["gammas_hat_pvals"]] + gammas_hat_pvals.joint <- if(constrain_mu) NULL else init[["gammas_hat_pvals.joint"]] const <- -n*log(2*pi) ll_prev <- -Inf @@ -26,7 +32,7 @@ tca.fit <- function(X, W, C1, C1.map, C2, refit_W, tau, parallel, num_cores, max if (refit_W) flog.info("Iteration %s out of %s external iterations (fitting all parameters including W)...", iter, max_iters) flog.info("Fitting means and variances...") - res <- tca.fit_means_vars(X, W, mus_hat, sigmas_hat, tau_hat, C2, deltas_hat, C1, gammas_hat, C1.map, tau, max_iters, parallel, num_cores) + res <- tca.fit_means_vars(X, W, mus_hat, sigmas_hat, tau_hat, C2, deltas_hat, C1, gammas_hat, C1.map, tau, vars.mle, constrain_mu, max_iters, parallel, num_cores) mus_hat <- res[["mus_hat"]] sigmas_hat <- res[["sigmas_hat"]] deltas_hat <- res[["deltas_hat"]] @@ -55,13 +61,64 @@ tca.fit <- function(X, W, C1, C1.map, C2, refit_W, tau, parallel, num_cores, max } - return (list("W" = W, "mus_hat" = mus_hat, "sigmas_hat" = sigmas_hat, "tau_hat" = tau_hat, "deltas_hat" = deltas_hat, "gammas_hat" = gammas_hat)) - + 
if(!constrain_mu){ + flog.info("Calculate p-values for deltas and gammas.") + C1_ <- calc_C1_W_interactions(W,C1) + W_norms <- (tcrossprod(W**2,sigmas_hat**2)+tau_hat**2 )**0.5 + X_tilde <- X/W_norms + cl <- NULL + if (parallel){ + cl <- init_cluster(num_cores) + clusterExport(cl, varlist = c("C1_","W_norms","X_tilde","W","k","p2","C2","p1","p1","lm"), envir=environment()) + } + res <- pblapply(1:m,function(j) { + df = data.frame(y = X_tilde[,j], cbind(W/t(repmat(W_norms[,j],k,1)),if (p2>0) C2/t(repmat(W_norms[,j],p2,1)) else C2,if (p1>0) C1_/t(repmat(W_norms[,j],k*p1,1)) else C1_)); + mdl1.fit <- lm(y~., data = df) + mdl1.coef <- summary(mdl1.fit)$coefficients + mdl1.cov.names <- colnames(df)[colnames(df) != 'y'] + deltas_gammas_hat_pvals <- sapply(mdl1.cov.names, function(x){ + if (x %in% rownames(mdl1.coef)){ + return(mdl1.coef[x,'Pr(>|t|)']) + } + else { + return(NA) + } + }) + #deltas_gammas_hat_pvals <- summary(mdl1.fit)$coefficients[2:(1+k+p1*k+p2),4]; + gammas_hat_pvals.joint <- numeric(p1)+1 + if (p1){ + C1_alt <- C1_/t(repmat(W_norms[,j],k*p1,1)) + for (d in 1:p1){ + C1_null <- C1_alt[,setdiff(1:(p1*k),seq(d,k*p1,p1))] + df = data.frame(y = X_tilde[,j], cbind(W/t(repmat(W_norms[,j],k,1)),if (p2>0) C2/t(repmat(W_norms[,j],p2,1)) else C2,C1_null)); + mdl0.fit <- lm(y~., data = df) + anova.fit <- anova(mdl0.fit,mdl1.fit); + gammas_hat_pvals.joint[d] <- anova.fit$`Pr(>F)`[2]; + } + } + return(c(deltas_gammas_hat_pvals,gammas_hat_pvals.joint)); + }, cl = cl ) + if (parallel) stop_cluster(cl) + for (j in 1:m){ + deltas_hat_pvals[j,] <- res[[j]][(k+1):(k+p2)] + gammas_hat_pvals[j,] <- res[[j]][(k+p2+1):(k+p2+p1*k)] + gammas_hat_pvals.joint[j,] <- res[[j]][(k+p2+p1*k+1):(k+p2+p1*k+p1)] + } + # res <- pblapply(1:m,function(j) { + # df = data.frame(y = X_tilde[,j], cbind(W/t(repmat(W_norms[,j],k,1)),if (p2>0) C2/t(repmat(W_norms[,j],p2,1)) else C2,if (p1>0) C1_/t(repmat(W_norms[,j],k*p1,1)) else C1_)); + # return(summary(lm(y~., data = df))$coefficients[2:(1+k+p1*k+p2),4]); + # }, cl = cl ) + # if (parallel) stop_cluster(cl) + # for (j in 1:m){ + # deltas_hat_pvals[j,] <- res[[j]][(k+1):(k+p2)] + # gammas_hat_pvals[j,] <- res[[j]][(k+p2+1):(k+p2+p1*k)] + # } + } + return (list("W" = W, "mus_hat" = mus_hat, "sigmas_hat" = sigmas_hat, "tau_hat" = tau_hat, "deltas_hat" = deltas_hat, "gammas_hat" = gammas_hat, "deltas_hat_pvals" = deltas_hat_pvals, "gammas_hat_pvals"= gammas_hat_pvals, "gammas_hat_pvals.joint" = gammas_hat_pvals.joint)) } init_means_vars <- function(C1_names, C2_names, feature_ids, source_ids, tau){ - #config <- config::get() config <- config::get(file = system.file("extdata", "config.yml", package = "TCA"), use_parent = FALSE) k <- length(source_ids) m <- length(feature_ids) @@ -78,6 +135,9 @@ init_means_vars <- function(C1_names, C2_names, feature_ids, source_ids, tau){ sigmas_hat <- matrix(0, nrow=m, ncol=k) deltas_hat <- matrix(0, nrow=m, ncol=p2) gammas_hat <- matrix(0, nrow=m, ncol=p1*k) + deltas_hat_pvals <- matrix(1, nrow=m, ncol=p2) + gammas_hat_pvals <- matrix(1, nrow=m, ncol=p1*k) + gammas_hat_pvals.joint <- matrix(1, nrow=m, ncol=p1) if (is.null(tau)){ tau_hat <- config[["tau_hat_init"]] }else{ @@ -92,22 +152,25 @@ init_means_vars <- function(C1_names, C2_names, feature_ids, source_ids, tau){ colnames(deltas_hat) <- C2_names rownames(gammas_hat) <- feature_ids colnames(gammas_hat) <- C1_names_ - - return( list("mus_hat" = mus_hat, "sigmas_hat" = sigmas_hat, "gammas_hat" = gammas_hat, "deltas_hat"= deltas_hat, "tau_hat" = tau_hat) ) + rownames(deltas_hat_pvals) <- 
feature_ids + colnames(deltas_hat_pvals) <- C2_names + rownames(gammas_hat_pvals) <- feature_ids + colnames(gammas_hat_pvals) <- C1_names_ + rownames(gammas_hat_pvals.joint) <- feature_ids + colnames(gammas_hat_pvals.joint) <- C1_names + + return( list("mus_hat" = mus_hat, "sigmas_hat" = sigmas_hat, "gammas_hat" = gammas_hat, "deltas_hat"= deltas_hat, "deltas_hat_pvals" = deltas_hat_pvals, "gammas_hat_pvals" = gammas_hat_pvals, "gammas_hat_pvals.joint" = gammas_hat_pvals.joint, "tau_hat" = tau_hat) ) } -#' @importFrom parallel clusterExport -#' @importFrom pbapply pblapply #' @importFrom pracma repmat #' @importFrom pracma lsqlincon #' @importFrom nloptr nloptr #' @importFrom matrixStats colVars -tca.fit_means_vars <- function(X, W, mus_hat, sigmas_hat, tau_hat, C2, deltas_hat, C1, gammas_hat, C1.map, tau, max_iters, parallel, num_cores){ +tca.fit_means_vars <- function(X, W, mus_hat, sigmas_hat, tau_hat, C2, deltas_hat, C1, gammas_hat, C1.map, tau, vars.mle, constrain_mu, max_iters, parallel, num_cores){ flog.debug("Starting function 'tca.fit_means_vars'...") - #config <- config::get() config <- config::get(file = system.file("extdata", "config.yml", package = "TCA"), use_parent = FALSE) nloptr_opts = list("algorithm"=config[["nloptr_opts_algorithm"]], "xtol_rel"=config[["nloptr_opts_xtol_rel"]], "print_level" = config[["nloptr_opts_print_level"]], "check_derivatives" = config[["nloptr_opts_check_derivatives"]]) @@ -123,6 +186,7 @@ tca.fit_means_vars <- function(X, W, mus_hat, sigmas_hat, tau_hat, C2, deltas_ha const <- -n*log(2*pi) W_squared <- W**2 + W_squared_ <- cbind(W_squared,numeric(n) + 1) ll_prev <- -Inf # Perform an alternative optimization of the means (mus, deltas, gammas) and variances (sigmas and tau) @@ -133,11 +197,13 @@ tca.fit_means_vars <- function(X, W, mus_hat, sigmas_hat, tau_hat, C2, deltas_ha # (1) Estimate the means (mus, deltas, gammas) ub <- numeric(k+p2+p1*k)+config[["lsqlincon_inf"]] - ub[1:k] <- if(is.null(config[["mu_max"]])) max(X) else config[["mu_max"]] - ub[1:k] <- ub[1:k] - config[["mu_epsilon"]] lb <- numeric(k+p2+p1*k)-config[["lsqlincon_inf"]] - lb[1:k] <- if(is.null(config[["mu_min"]])) min(X) else config[["mu_min"]] - lb[1:k] <- lb[1:k] + config[["mu_epsilon"]] + if (constrain_mu){ + ub[1:k] <- max(X) + ub[1:k] <- ub[1:k] - config[["mu_epsilon"]] + lb[1:k] <- min(X) + lb[1:k] <- lb[1:k] + config[["mu_epsilon"]] + } if (p1){ # All zeros will get ub and lb set to 0 (i.e. effect sizes will not be estimated); use a small value (config[["nloptr_opts_xtol_rel"]]) for stability; otherwise nloptr may return an error in some cases. 
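To make the new weighted-regression logic above concrete: under `constrain_mu == FALSE` (and similarly in the GMM variance step below), each observation of a feature is scaled by its model-implied standard deviation so that ordinary `lm` machinery (coefficient t-tests, `anova` F-tests on nested fits) applies. The following is a minimal sketch of that idea on hypothetical toy data; it is not the package's internal code, and for brevity the design omits the C2 covariates and W-by-C1 interaction terms that the package includes.
```
# Minimal sketch (hypothetical toy data): the GLS-style weighting behind the
# covariate p-values computed when constrain_mu == FALSE.
set.seed(1)
n <- 100; k <- 3
W <- matrix(runif(n * k), n, k)
W <- W / rowSums(W)                       # mixture proportions; rows sum to 1
sigmas <- c(0.3, 0.2, 0.1); tau <- 0.05   # per-source and i.i.d. sds
x_j <- W %*% rnorm(k, mean = 0.5) + rnorm(n, sd = 0.1)   # one feature
# model-implied sd of observation i: sqrt(sum_h W[i,h]^2 * sigmas[h]^2 + tau^2)
w_norms <- sqrt((W^2) %*% (sigmas^2) + tau^2)
# divide the response and the design by the per-observation sd, then fit;
# homoskedastic t-tests (and anova() on nested fits) are then applicable
dat <- data.frame(y = x_j / w_norms, W / as.vector(w_norms))
fit <- lm(y ~ ., data = dat)
summary(fit)$coefficients[, "Pr(>|t|)"]   # per-coefficient p-values
```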
@@ -185,94 +251,78 @@ tca.fit_means_vars <- function(X, W, mus_hat, sigmas_hat, tau_hat, C2, deltas_ha # Calculate some quantities that will be repeatedly used throughout the optimization in this step U <- (tcrossprod(W,mus_hat) + tcrossprod(C2,deltas_hat) + tcrossprod(C1_,gammas_hat) - X)**2 - if (sum(colSums(sigmas_hat)) == 0){ - # Set a starting point for the optimization - flog.debug("Get initial estimates of sigmas") - row_names <- rownames(sigmas_hat) - col_names <- colnames(sigmas_hat) - sigmas_hat <- t(repmat((colVars(X)/k)**0.5,k,1)) - rownames(sigmas_hat) <- row_names - colnames(sigmas_hat) <- col_names - } - - # # (2.1) Get initial estimate of sigmas, tau under the assumption that tau is feature-specific - # - # flog.debug("Estimate sigmas, tau under the assumption that tau is feature-specific.") - # lb <- numeric(k+1)+config$min_var - # ub <- numeric(k+1)+Inf - # if (parallel){ - # clusterExport(cl, c("W_tilde","C2_tilde","C1_tilde","X_tilde","lb","ub","n","k","p1","p2","U","const","W_squared","sigmas_hat","tau_hat","nloptr_opts")) - # res <- parLapply(cl, 1:m,function(j) nloptr( x0=c(sigmas_hat[j,],tau_hat), - # eval_f=function(x,U_j,W_squared,const) - # {k<-length(x)-1; - # V <- tcrossprod(W_squared,t(x[1:k]**2))+x[k+1]**2; - # V_squared <- V**2; - # g<-numeric(k+1) - # g[1:k] <- -(colSums(W_squared*repmat(x[1:k],n,1)*t(repmat(U_j,k,1))/repmat(V_squared,1,k)) - colSums(W_squared*repmat(x[1:k],n,1)/repmat(V,1,k))) - # g[k+1] <- -(x[k+1]*(sum(U_j/V_squared)-sum(1./V))) - # return(list("objective"= -0.5*(const-sum(log(V))-sum(U_j/V)), - # "gradient" = g))}, - # lb = lb, - # ub = ub, - # opts = nloptr_opts, - # U_j = U[,j], - # W_squared = W_squared, - # const = const)$solution[1:k] ) - # }else{ - # res <- lapply(1:m,function(j) nloptr( x0=c(sigmas_hat[j,],tau_hat), - # eval_f=function(x,U_j,W_squared,const) - # {k<-length(x)-1; - # V <- tcrossprod(W_squared,t(x[1:k]**2))+x[k+1]**2; - # V_squared <- V**2; - # g<-numeric(k+1) - # g[1:k] <- -(colSums(W_squared*repmat(x[1:k],n,1)*t(repmat(U_j,k,1))/repmat(V_squared,1,k)) - colSums(W_squared*repmat(x[1:k],n,1)/repmat(V,1,k))) - # g[k+1] <- -(x[k+1]*(sum(U_j/V_squared)-sum(1./V))) - # return(list("objective"= -0.5*(const-sum(log(V))-sum(U_j/V)), - # "gradient" = g))}, - # lb = lb, - # ub = ub, - # opts = nloptr_opts, - # U_j = U[,j], - # W_squared = W_squared, - # const = const)$solution[1:k] ) - # } - # for (j in 1:m){ - # sigmas_hat[j,] = res[[j]] - # } + if (vars.mle){ + + if (sum(colSums(sigmas_hat)) == 0){ + # Set a starting point for the optimization + flog.debug("Get initial estimates of sigmas") + row_names <- rownames(sigmas_hat) + col_names <- colnames(sigmas_hat) + sigmas_hat <- t(repmat((colVars(X)/k)**0.5,k,1)) + rownames(sigmas_hat) <- row_names + colnames(sigmas_hat) <- col_names + } + + # (2.2) Estimate sigmas + flog.debug("Estimate sigmas.") + lb <- numeric(k)+config[["min_sd"]] + ub <- numeric(k)+Inf + if (parallel) clusterExport(cl, c("lb","ub","n","k","U","const","W_squared","sigmas_hat","tau_hat","nloptr_opts","minus_log_likelihood_sigmas"), envir=environment()) + res <- pblapply(1:m,function(j) nloptr( x0=sigmas_hat[j,], + eval_f = function(x,U_j,W_squared,const,tau_hat) minus_log_likelihood_sigmas(x,U_j,W_squared,const,tau_hat), + lb = lb, + ub = ub, + opts = nloptr_opts, + U_j = U[,j], + W_squared = W_squared, + const = const, + tau_hat = tau_hat)$solution, cl = cl ) + + for (j in 1:m){ + sigmas_hat[j,] = res[[j]] + } + + # (2.3) Estimate tau + if (is.null(tau)){ + flog.debug("Estimate tau.") + lb <- 
config[["min_sd"]] + ub <- Inf + tau_hat = nloptr(x0=tau_hat, + eval_f = function(x,U,W_squared,sigmas_hat,const) minus_log_likelihood_tau(U,W_squared,sigmas_hat,const,x), + lb = lb, + ub = ub, + opts = nloptr_opts, + U = U, + W_squared = W_squared, + sigmas_hat = sigmas_hat, + const = const)$solution + } - # (2.2) Estimate sigmas - flog.debug("Estimate sigmas.") - lb <- numeric(k)+config[["min_sd"]] - ub <- numeric(k)+Inf - if (parallel) clusterExport(cl, c("lb","ub","n","k","U","const","W_squared","sigmas_hat","tau_hat","nloptr_opts","minus_log_likelihood_sigmas"), envir=environment()) - res <- pblapply(1:m,function(j) nloptr( x0=sigmas_hat[j,], - eval_f = function(x,U_j,W_squared,const,tau_hat) minus_log_likelihood_sigmas(x,U_j,W_squared,const,tau_hat), - lb = lb, - ub = ub, - opts = nloptr_opts, - U_j = U[,j], - W_squared = W_squared, - const = const, - tau_hat = tau_hat)$solution, cl = cl ) - - for (j in 1:m){ - sigmas_hat[j,] = res[[j]] - } - - # (2.3) Estimate tau - if (is.null(tau)){ - flog.debug("Estimate tau.") - lb <- config[["min_sd"]] - ub <- Inf - tau_hat = nloptr(x0=tau_hat, - eval_f = function(x,U,W_squared,sigmas_hat,const) minus_log_likelihood_tau(U,W_squared,sigmas_hat,const,x), - lb = lb, - ub = ub, - opts = nloptr_opts, - U = U, - W_squared = W_squared, - sigmas_hat = sigmas_hat, - const = const)$solution + }else{ + # Learn the variances using gmm + flog.debug("Estimate sigmas, tau") + lb <- numeric(k+1)+config[["min_sd"]] + # calculate weights matrix V + if (sum(colSums(sigmas_hat)) == 0){ + V <- matrix(1,n,m) + }else{ + V <- abs(U - tcrossprod(W_squared_, cbind(sigmas_hat**2, matrix(tau_hat**2,m,1)))) + V[V < config[["V_weight_limit"]]] <- config[["V_weight_limit"]] + } + if (parallel) clusterExport(cl, c("lb","U","W_squared_","lsqlincon","V","n"), envir=environment()) + res <- pblapply(1:m,function(j) { + x <- W_squared_/t(repmat(V[,j],ncol(W_squared_),1)); + # For numeric stability, normalize the design matrix and adjust the final estimates accordingly + norms <- (colSums(x**2))**0.5; + x <- x/repmat(norms,n,1); + lsqlincon(x, U[,j]/V[,j], lb = lb*norms)/norms + }, cl = cl) + tau_squared_hat <- 0 + for (j in 1:m){ + sigmas_hat[j,] <- sqrt(res[[j]][1:k]) + tau_squared_hat <- tau_squared_hat + res[[j]][k+1] + } + tau_hat <- sqrt(tau_squared_hat/m) } flog.debug("Test for convergence.") @@ -300,7 +350,6 @@ tca.fit_W <- function(X, W, mus_hat, sigmas_hat, tau_hat, C2, deltas_hat, C1, ga flog.debug("Starting function 'tca.fit_W'...") - #config <- config::get() config <- config::get(file = system.file("extdata", "config.yml", package = "TCA"), use_parent = FALSE) nloptr_opts = list("algorithm"=config[["nloptr_opts_fit_W_algorithm"]], "xtol_rel"=config[["nloptr_opts_xtol_rel"]], "print_level" = config[["nloptr_opts_print_level"]], "check_derivatives" = config[["nloptr_opts_check_derivatives"]]) @@ -417,7 +466,7 @@ minus_log_likelihood_w <- function(w_i, x_i, c1_i, mus, tau, gammas, const, C_ti # X is n by m -estimate_Z <- function(X, W, mus_hat, sigmas_hat, tau_hat, C2, deltas_hat, C1, gammas_hat, parallel, num_cores){ +estimate_Z <- function(X, W, mus_hat, sigmas_hat, tau_hat, C2, deltas_hat, C1, gammas_hat, scale, parallel, num_cores){ flog.info("Estimate tensor...") # init n <- dim(X)[1] @@ -439,7 +488,7 @@ estimate_Z <- function(X, W, mus_hat, sigmas_hat, tau_hat, C2, deltas_hat, C1, g cl <- if (parallel) init_cluster(num_cores) else NULL if (parallel) clusterExport(cl, c("W","mus_hat","sigmas_hat","tau_hat","C1","gammas_hat","W_prime","C2_prime","estimate_Z_j"), 
envir=environment()) # Estimate Z - res <- pblapply(1:m, function(j) estimate_Z_j(W, mus_hat[j,], sigmas_hat[j,], tau_hat, C1, gammas_hat[j,], W_prime, C2_prime[,j]), cl = cl ) + res <- pblapply(1:m, function(j) estimate_Z_j(W, mus_hat[j,], sigmas_hat[j,], tau_hat, C1, gammas_hat[j,], W_prime, C2_prime[,j], scale), cl = cl ) if (parallel) stop_cluster(cl) for (j in 1:m){ for (h in 1:k){ @@ -459,53 +508,234 @@ estimate_Z <- function(X, W, mus_hat, sigmas_hat, tau_hat, C2, deltas_hat, C1, g # Estimate Z for one feature j #' @importFrom matrixcalc matrix.inverse -estimate_Z_j <- function(W, mus_hat_j, sigmas_hat_j, tau_hat, C1, gammas_hat_j, W_prime, C2_prime_j){ +#' @importFrom pracma sqrtm +estimate_Z_j <- function(W, mus_hat_j, sigmas_hat_j, tau_hat, C1, gammas_hat_j, W_prime, C2_prime_j, scale){ n <- nrow(W) k <- ncol(W) p1 <- ncol(C1) Z_j_hat <- matrix(0,n,k) - Sig_j <- matrix.inverse(diag(sigmas_hat_j**2)) + Sig_j_orig <- diag(sigmas_hat_j**2) + Sig_j <- matrix.inverse(Sig_j_orig) C1_prime <- tcrossprod(C1, t(Reshape(gammas_hat_j,p1,k))) for (i in 1:n){ - Z_j_hat[i,] = crossprod(matrix.inverse(W_prime[[i]]+Sig_j), ( tcrossprod(Sig_j,t(mus_hat_j+C1_prime[i,])) + W[i,]*C2_prime_j[i] ) ) + #S_ij_inv <- W_prime[[i]]+Sig_j + #S_ij <- matrix.inverse(S_ij_inv) + ## the above two lines are the straightforward (slower) way to calculate S_ij; calculate 'matrix.inverse(W_prime[[i]]+Sig_j)' using the lemma from Miller 1981: + BA_inv <- W_prime[[i]] %*% Sig_j_orig + g <- sum(diag(BA_inv)) + S_ij <- Sig_j_orig - ((1/(1+g))*(Sig_j_orig %*% BA_inv)) + Z_j_hat[i,] = crossprod(S_ij, ( tcrossprod(Sig_j,t(mus_hat_j+C1_prime[i,])) + W[i,]*C2_prime_j[i] ) ) + if (scale) Z_j_hat[i,] <- Z_j_hat[i,] / diag(S_ij)**0.5 } return(Z_j_hat) } -tcareg.fit <- function(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, test, null_model, alternative_model, parallel, num_cores){ +tcareg.fit <- function(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, test, null_model, alternative_model, fast_mode, parallel, num_cores){ flog.debug("Starting function 'tcareg.fit'...") - single_effect <- if (test == "single_effect") TRUE else FALSE - - if (test == "joint" | test == "single_effect") results <- tcareg.fit_joint(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, parallel, num_cores, single_effect) - if (test == "marginal") results <- tcareg.fit_marginal(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, parallel, num_cores) - if (test == "marginal_conditional") results <- tcareg.fit_marginal_conditional(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, parallel, num_cores) - if (test == "custom") results <- tcareg.fit_custom(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, parallel, num_cores, null_model, alternative_model) + k <- ncol(W) + if (fast_mode){ + if (test == "joint" | test == "single_effect" | test == "marginal_conditional") alternative_model <- 1:k + if (test == "joint" | test == "single_effect") null_model <- c() + if (test == "marginal"){ + results <- vector(mode="list", k) + for (h in 1:k) results[[h]] <- tcareg.fit.fast(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, test, null_model, alternative_model = h, parallel, num_cores) + }else{ + results <- tcareg.fit.fast(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, test, null_model, alternative_model, parallel, num_cores) + } + }else{ + single_effect <- if(test == 
"single_effect") TRUE else FALSE + if (test == "joint" | test == "single_effect") results <- tcareg.fit_joint(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, parallel, num_cores, single_effect) + if (test == "marginal") results <- tcareg.fit_marginal(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, parallel, num_cores) + if (test == "marginal_conditional") results <- tcareg.fit_marginal_conditional(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, parallel, num_cores) + if (test == "custom") results <- tcareg.fit_custom(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, parallel, num_cores, null_model, alternative_model) + } return(results) +} + + +#' @importFrom stats anova +tcareg.fit.fast <- function(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, test, null_model, alternative_model, parallel, num_cores){ + + flog.debug("Starting function 'tcareg.fit.fast'...") + + config <- config::get(file = system.file("extdata", "config.yml", package = "TCA"), use_parent = FALSE) + + m <- ncol(X) + n <- nrow(X) + k <- ncol(W) + + p3 <- ncol(C3) + num_betas <- if (test == "custom" | test == "joint" | test == "marginal_conditional") length(alternative_model) else 1 + num_pvals <- if (test == "marginal_conditional") length(alternative_model) else 1 + Z_hat <- estimate_Z(X, W, mus_hat, sigmas_hat, tau_hat, C2, deltas_hat, C1, gammas_hat, scale = FALSE, parallel, num_cores) + lm0.joint.fit <- if (p3) lm(y~.,data = data.frame(C3)) else lm(y ~ 1) + cl <- if (parallel) init_cluster(num_cores) else NULL + if (parallel) clusterExport(cl, c("Z_hat","n","k","y","test","null_model","alternative_model","C3","p3","num_betas","lm0.joint.fit"), envir=environment()) + res <- pblapply(1:m, function(j){ + r <- list(); + D <- matrix(0,n,k); + for (h in 1:k) D[,h] <- Z_hat[[h]][j,]; + if (test == "single_effect") D <- as.matrix(rowSums(D)); + if (test == "custom"){ + D <- D[,alternative_model,drop=F]; + } + if (test == "marginal"){ + D <- D[,alternative_model,drop=F]; + } + colnames(D) <- paste("X",1:ncol(D),sep="") + D <- cbind(C3,D) + lm.fit <- lm(y~.,data = data.frame(D)); + coefficients <- summary(lm.fit)$coefficients; + r[["intercept"]] <- coefficients[1,1]; + if (p3) r[["alpha"]] <- coefficients[2:(1+p3),1]; + r[["phi"]] <- summary(lm.fit)$sigma; + r[["alternative_ll"]] <- as.numeric(logLik(lm.fit)); + if (test == "custom"){ + lm0.fit <- if (is.null(null_model)) lm0.joint.fit else lm(y~.,data = data.frame(cbind(C3,D[,null_model]))); + anova.fit <- anova(lm0.fit,lm.fit); + stats <- anova.fit$F[2]; + pvals <- anova.fit$`Pr(>F)`[2]; + } + if (test == "joint"){ + anova.fit <- anova(lm0.joint.fit, lm.fit); + stats <- anova.fit$F[2]; + pvals <- anova.fit$`Pr(>F)`[2]; + #stats <- summary(lm.fit)$fstatistic[1]; + #pvals <- pf(summary(lm.fit)$fstatistic[1],summary(lm.fit)$fstatistic[2],summary(lm.fit)$fstatistic[3],lower.tail=FALSE); + } + l <- nrow(coefficients); + beta <- matrix(NA,1,num_betas); + v <- if (max(0,nrow(coefficients)-1-p3)) match(rownames(coefficients)[(p3+2):nrow(coefficients)], colnames(D)[(p3+1):ncol(D)]) else NULL + + if(!is.null(v)) beta[v[!is.na(v)]] <- coefficients[(p3+2):l,1]; + if (test == "single_effect" | test == "marginal_conditional" | test == "marginal"){ + stats <- matrix(NA,1,num_betas); + pvals <- matrix(1,1,num_betas); + if(!is.null(v)){ + stats[v[!is.na(v)]] <- coefficients[(p3+2):l,3]; + pvals[v[!is.na(v)]] <- coefficients[(p3+2):l,4]; + } + } + r[["beta"]] <- 
beta; + r[["stats"]] <- stats; + r[["pvals"]] <- pvals; + return(r) }, cl = cl) + if (parallel) stop_cluster(cl) + + lst <- init_fast_tcareg_output(colnames(X), rownames(X), colnames(W), colnames(C3), num_pvals, num_betas, alternative_model, test) + phi <- lst[["phi"]] + beta <- lst[["beta"]] + intercept <- lst[["intercept"]] + alpha <- lst[["alpha"]] + alternative_ll <- lst[["alternative_ll"]] + stats <- lst[["stats"]] + pvals <- lst[["pvals"]] + qvals <- lst[["qvals"]] + for (j in 1:m){ + phi[j,1] <- res[[j]][["phi"]] + beta[j,] <- res[[j]][["beta"]] + intercept[j,1] <- res[[j]][["intercept"]] + alpha[j,] <- res[[j]][["alpha"]] + alternative_ll[j,1] <- res[[j]][["alternative_ll"]] + stats[j,] <- res[[j]][["stats"]] + pvals[j,] <- res[[j]][["pvals"]] + } + for (i in 1:num_pvals) qvals[,i] <- cbind(p.adjust(pvals[,i], method = config[["fdr_method"]])) + + deg_freedom <- if (test == "marginal" | test == "marginal_conditional" | test == "single_effect") 1 else (length(alternative_model)-length(null_model)) + + return( list("phi" = phi, "beta" = beta, "intercept" = intercept, "alpha" = alpha, "null_ll" = NA, "alternative_ll" = alternative_ll, "pvals" = pvals, "qvals" = qvals, "stats" = stats, "df" = deg_freedom) ) + + +} + +init_fast_tcareg_output <- function(feature_ids, sample_ids, source_ids, C3_names, num_pvals, num_betas, alternative_model, test){ + + m <- length(feature_ids) + p3 <- length(C3_names) + + phi <- matrix(0, m, 1) + beta <- matrix(0, m, num_betas) + intercept <- matrix(0, m, 1) + alpha <- matrix(0, m, p3) + alternative_ll <- matrix(0, m, 1) + stats <- matrix(0, m, num_pvals) + pvals <- matrix(1, m, num_pvals) + qvals <- matrix(1, m, num_pvals) + + rownames(phi) <- feature_ids + rownames(beta) <- feature_ids + rownames(intercept) <- feature_ids + rownames(alpha) <- feature_ids + rownames(alternative_ll) <- feature_ids + rownames(stats) <- feature_ids + rownames(qvals) <- feature_ids + rownames(pvals) <- feature_ids + + if(p3) colnames(alpha) <- C3_names + colnames(beta) <- if (test != "single_effect") source_ids[alternative_model] else "beta" + colnames(phi) <- "phi" + colnames(intercept) <- "intercept" + colnames(alternative_ll) <- "alternative_ll" + + if (test == "marginal_conditional"){ + colnames(stats) <- source_ids + colnames(pvals) <- source_ids + colnames(qvals) <- source_ids + }else{ + colnames(stats) <- "stats" + colnames(pvals) <- "pvals" + colnames(qvals) <- "qvals" + } + + return( list("phi" = phi, "beta" = beta, "intercept" = intercept, "alpha"= alpha, "alternative_ll" = alternative_ll, "stats" = stats, "pvals" = pvals, "qvals" = qvals) ) } tcareg.fit_joint <- function(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, parallel, num_cores, single_effect){ + + flog.debug("Starting function 'tcareg.fit_joint'...") + config <- config::get(file = system.file("extdata", "config.yml", package = "TCA"), use_parent = FALSE) k <- ncol(W) + + lst <- init_tcareg.fit_model(colnames(X), colnames(W), colnames(C3), 1:k, single_effect) + phi <- lst[["phi"]] + beta <- lst[["beta"]] + intercept <- lst[["intercept"]] + alpha <- lst[["alpha"]] + null_ll <- lst[["null_ll"]] + alternative_ll <- lst[["alternative_ll"]] + pvals <- lst[["pvals"]] + qvals <- lst[["qvals"]] + stats <- lst[["stats"]] + mdl1 <- tcareg.optimize(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, 1:k, parallel, num_cores, single_effect) - ll1 <- mdl1[["ll"]] - #d <- data.frame(y,C3) - #mdl0 <- lm(y~.,data = d) # the null log-likelihood under the model y ~ C3 
(i.e. the null under the case of no source-specific effects) + phi[,1] <- mdl1[["phi"]] + beta[,] <- mdl1[["beta"]] + intercept[,1] <- mdl1[["intercept"]] + alpha[,] <- mdl1[["alpha"]] + alternative_ll[,] <- mdl1[["ll"]] + if (ncol(C3) == 0){ mdl0 <- lm(y ~ 1) }else{ mdl0 <- lm(y~.,data = data.frame(C3)) } - ll0 <- numeric(length(mdl1[["ll"]])) + as.numeric(logLik(mdl0)) + null_ll[,1] <- numeric(length(mdl1[["ll"]])) + as.numeric(logLik(mdl0)) + df <- if(single_effect) 1 else ncol(W) - lrt <- lrt.test(ll0, mdl1[["ll"]], df = df) - qvals <- p.adjust(lrt[["pvals"]], method = config[["fdr_method"]]) - return( list("phi" = mdl1[["phi"]], "beta" = mdl1[["beta"]], "intercept" = mdl1[["intercept"]], "alpha" = mdl1[["alpha"]], "null_ll" = ll0, "alternative_ll" = ll1, "pvals" = lrt[["pvals"]], "qvals" = qvals, "stats" = lrt[["stats"]], "df" = df) ) + lrt <- lrt.test(null_ll[,1], alternative_ll[,1], df = df) + pvals[,1] <- lrt[["pvals"]] + qvals[,1] <- p.adjust(lrt[["pvals"]], method = config[["fdr_method"]]) + stats[,1] <- lrt[["stats"]] + + return( list("phi" = phi, "beta" = beta, "intercept" = intercept, "alpha" = alpha, "null_ll" = null_ll, "alternative_ll" = alternative_ll, "pvals" = pvals, "qvals" = qvals, "stats" = stats, "df" = df) ) } @@ -514,21 +744,40 @@ tcareg.fit_marginal <- function(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1 m <- ncol(X) k <- ncol(W) results <- list() - #d <- data.frame(y,C3) - #mdl0 <- lm(y~.,data = d) # the null log-likelihood under the model y ~ C3 (i.e. the null under the case of no source-specific effects) + if (ncol(C3) == 0){ mdl0 <- lm(y ~ 1) }else{ mdl0 <- lm(y~.,data = data.frame(C3)) } for (h in 1:k){ + lst <- init_tcareg.fit_model(colnames(X), colnames(W), colnames(C3), h, FALSE) + phi <- lst[["phi"]] + beta <- lst[["beta"]] + intercept <- lst[["intercept"]] + alpha <- lst[["alpha"]] + null_ll <- lst[["null_ll"]] + alternative_ll <- lst[["alternative_ll"]] + pvals <- lst[["pvals"]] + qvals <- lst[["qvals"]] + stats <- lst[["stats"]] + mdl1 <- tcareg.optimize(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, h, parallel, num_cores, FALSE) - ll1 <- mdl1[["ll"]] - ll0 <- numeric(length(mdl1[["ll"]])) + as.numeric(logLik(mdl0)) + alternative_ll[,1] <- mdl1[["ll"]] + null_ll[,1] <- numeric(length(mdl1[["ll"]])) + as.numeric(logLik(mdl0)) + phi[,1] <- mdl1[["phi"]] + beta[,] <- mdl1[["beta"]] + intercept[,1] <- mdl1[["intercept"]] + alpha[,] <- mdl1[["alpha"]] + df <- 1 - lrt <- lrt.test(ll0, ll1, df = df) - qvals <- p.adjust(lrt[["pvals"]], method = config[["fdr_method"]]) - results[[length(results)+1]] = list("phi" = mdl1[["phi"]], "beta" = mdl1[["beta"]], "intercept" = mdl1[["intercept"]], "alpha" = mdl1[["alpha"]], "null_ll" = ll0, "alternative_ll" = ll1, "pvals" = lrt[["pvals"]], "qvals" = qvals, "stats" = lrt[["stats"]], "df" = df) + lrt <- lrt.test(null_ll[,1], alternative_ll[,1], df = df) + + pvals[,1] <- lrt[["pvals"]] + qvals[,1] <- p.adjust(lrt[["pvals"]], method = config[["fdr_method"]]) + stats[,1] <- lrt[["stats"]] + + results[[length(results)+1]] <- list("phi" = phi, "beta" = beta, "intercept" = intercept, "alpha" = alpha, "null_ll" = null_ll, "alternative_ll" = alternative_ll, "pvals" = pvals, "qvals" = qvals, "stats" = stats, "df" = df) } return(results) } @@ -539,49 +788,135 @@ tcareg.fit_marginal_conditional <- function(X, y, W, mus_hat, sigmas_hat, C2, de m <- ncol(X) k <- ncol(W) results <- list() + # Calculate the likelihood of the alternative model mdl1 <- tcareg.optimize(X, y, W, mus_hat, sigmas_hat, C2, 
deltas_hat, C1, gammas_hat, tau_hat, C3, 1:k, parallel, num_cores, FALSE) - ll1 <- mdl1[["ll"]] + # Calculate the likelihood of each of the null models + df <- 1 for (h in 1:k){ + lst <- init_tcareg.fit_model(colnames(X), colnames(W), colnames(C3), 1:k, FALSE) + phi <- lst[["phi"]] + beta <- lst[["beta"]] + intercept <- lst[["intercept"]] + alpha <- lst[["alpha"]] + null_ll <- lst[["null_ll"]] + alternative_ll <- lst[["alternative_ll"]] + pvals <- lst[["pvals"]] + qvals <- lst[["qvals"]] + stats <- lst[["stats"]] + phi[,1] <- mdl1[["phi"]] + beta[,] <- mdl1[["beta"]] + + intercept[,1] <- mdl1[["intercept"]] + alpha[,] <- mdl1[["alpha"]] + mdl0 <- tcareg.optimize(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, setdiff(1:k,h), parallel, num_cores, FALSE) - ll0 <- mdl0[["ll"]] + null_ll[,1] <- mdl0[["ll"]] + alternative_ll[,1] <- mdl1[["ll"]] + df <- 1 - lrt <- lrt.test(ll0, ll1, df = df) - qvals <- p.adjust(lrt[["pvals"]], method = config[["fdr_method"]]) - results[[length(results)+1]] = list("phi" = mdl1[["phi"]], "beta" = mdl1[["beta"]], "intercept" = mdl1[["intercept"]], "alpha" = mdl1[["alpha"]], "null_ll" = ll0, "alternative_ll" = ll1, "pvals" = lrt[["pvals"]], "qvals" = qvals, "stats" = lrt[["stats"]], "df" = df) + lrt <- lrt.test(null_ll[,1], alternative_ll[,1], df = df) + pvals[,1] <- lrt[["pvals"]] + qvals[,1] <- p.adjust(lrt[["pvals"]], method = config[["fdr_method"]]) + stats[,1] <- lrt[["stats"]] + + results[[length(results)+1]] <- list("phi" = phi, "beta" = beta, "intercept" = intercept, "alpha" = alpha, "null_ll" = null_ll, "alternative_ll" = alternative_ll, "pvals" = pvals, "qvals" = qvals, "stats" = stats, "df" = df) } return(results) } tcareg.fit_custom <- function(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, parallel, num_cores, null_model, alternative_model){ + config <- config::get(file = system.file("extdata", "config.yml", package = "TCA"), use_parent = FALSE) + + lst <- init_tcareg.fit_model(colnames(X), colnames(W), colnames(C3), alternative_model, FALSE) + phi <- lst[["phi"]] + beta <- lst[["beta"]] + intercept <- lst[["intercept"]] + alpha <- lst[["alpha"]] + null_ll <- lst[["null_ll"]] + alternative_ll <- lst[["alternative_ll"]] + pvals <- lst[["pvals"]] + qvals <- lst[["qvals"]] + stats <- lst[["stats"]] + + mdl1 <- tcareg.optimize(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, alternative_model, parallel, num_cores, FALSE) + alternative_ll[,1] <- mdl1[["ll"]] + phi[,1] <- mdl1[["phi"]] + beta[,] <- mdl1[["beta"]] + intercept[,1] <- mdl1[["intercept"]] + alpha[,] <- mdl1[["alpha"]] + if (is.null(null_model)){ - #d <- data.frame(y,C3) - #mdl0 <- lm(y~.,data = d) # the null log-likelihood under the model y ~ C3 (i.e. 
the null under the case of no source-specific effects) if (ncol(C3) == 0){ mdl0 <- lm(y ~ 1) }else{ mdl0 <- lm(y~.,data = data.frame(C3)) } - ll0 <- numeric(ncol(X)) + as.numeric(logLik(mdl0)) + null_ll[,1] <- numeric(ncol(X)) + as.numeric(logLik(mdl0)) }else{ + # mdl0.ll <- conditional_model_log_likelihood.all(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, mdl1$phi, mdl1$beta[,match(null_model,alternative_model),drop=F], mdl1$alpha, mdl1$intercept, null_model, parallel, num_cores) mdl0 <- tcareg.optimize(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, null_model, parallel, num_cores, FALSE) - ll0 <- mdl0[["ll"]] + null_ll[,1] <- mdl0[["ll"]] } - mdl1 <- tcareg.optimize(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, alternative_model, parallel, num_cores, FALSE) - ll1 <- mdl1[["ll"]] + df <- length(alternative_model) - length(null_model) - lrt <- lrt.test(ll0, ll1, df = df) - qvals <- p.adjust(lrt[["pvals"]], method = config[["fdr_method"]]) - return( list("phi" = mdl1[["phi"]], "beta" = mdl1[["beta"]], "intercept" = mdl1[["intercept"]], "alpha" = mdl1[["alpha"]], "null_ll" = ll0, "alternative_ll" = ll1, "pvals" = lrt[["pvals"]], "qvals" = qvals, "stats" = lrt[["stats"]], "df" = df) ) + lrt <- lrt.test(null_ll[,1], alternative_ll[,1], df = df) + + pvals[,1] <- lrt[["pvals"]] + qvals[,1] <- p.adjust(lrt[["pvals"]], method = config[["fdr_method"]]) + stats[,1] <- lrt[["stats"]] + + return( list("phi" = phi, "beta" = beta, "intercept" = intercept, "alpha" = alpha, "null_ll" = null_ll, "alternative_ll" = alternative_ll, "pvals" = pvals, "qvals" = qvals, "stats" = stats, "df" = df) ) } +init_tcareg.fit_model <- function(feature_ids, source_ids, C3_names, mdl, single_effect){ + + flog.debug("Starting function init_tcareg.fit_model...") + + m <- length(feature_ids) + p3 <- length(C3_names) + num_beta <- length(mdl) + + phi <- matrix(0,m,1) + beta <- if (single_effect) matrix(0,m,1) else matrix(0,m,num_beta) + intercept <- matrix(0,m,1) + alpha <- matrix(0,m,p3) + null_ll <- matrix(0,m,1) + alternative_ll <- matrix(0,m,1) + + pvals <- matrix(1,m,1) + qvals <- matrix(1,m,1) + stats <- matrix(1,m,1) + + rownames(phi) <- feature_ids + rownames(beta) <- feature_ids + rownames(intercept) <- feature_ids + rownames(alpha) <- feature_ids + rownames(null_ll) <- feature_ids + rownames(alternative_ll) <- feature_ids + rownames(pvals) <- feature_ids + rownames(qvals) <- feature_ids + rownames(stats) <- feature_ids + + colnames(beta) <- if(single_effect) "beta" else source_ids[mdl] + colnames(phi) <- "phi" + colnames(intercept) <- "intercept" + colnames(alpha) <- C3_names + colnames(null_ll) <- "null_ll" + colnames(alternative_ll) <- "alternative_ll" + colnames(pvals) <- "pvals" + colnames(qvals) <- "qvals" + colnames(stats) <- "stats" + + return(list("phi" = phi, "beta" = beta, "intercept" = intercept, "alpha" = alpha, "null_ll" = null_ll, "alternative_ll" = alternative_ll,"pvals" = pvals, "qvals" = qvals, "stats" = stats)) + +} tcareg.optimize <- function(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, mdl, parallel, num_cores, single_effect){ flog.debug("...") - #config <- config::get() config <- config::get(file = system.file("extdata", "config.yml", package = "TCA"), use_parent = FALSE) nloptr_opts = list("algorithm"=config[["nloptr_opts_fit_conditional_algorithm"]], "xtol_rel"=config[["nloptr_opts_xtol_rel"]], "print_level" = config[["nloptr_opts_print_level"]], "check_derivatives" = 
config[["nloptr_opts_check_derivatives"]]) @@ -604,8 +939,6 @@ tcareg.optimize <- function(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, ga # X0 <- get_initial_estimates(Z_hat, y, C3, mdl, lambda, single_effect, parallel, num_cores) # Set initial estimates for phi and alpha for each feature j using the model y~C3; initialize all betas with 0 - #d <- data.frame(y,C3) - #mdl0 <- lm(y~.,data = d) if (p3 == 0){ mdl0 <- lm(y ~ 1) }else{ @@ -630,11 +963,13 @@ tcareg.optimize <- function(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, ga if (parallel) clusterExport(cl, c("lb","ub","p1","k","W","y","C1","C3","n","l","X0","const","sigmas_squared","X_tilde","V","mdl","gammas_hat","sigmas_squared","mus_hat","lambda","single_effect","nloptr_opts","conditional_model_minus_log_likelihood","tcareg.optimize_j"), envir=environment()) res <- pblapply(1:m, function(j) tcareg.optimize_j(j, y, W, C1, C3, mus_hat, gammas_hat, const, sigmas_squared, V, X_tilde, X0, mdl, lb, ub, nloptr_opts, single_effect, lambda), cl = cl) if (parallel) stop_cluster(cl) + intercept <- matrix(0, m, 1) alpha <- matrix(0, m, p3) beta <- matrix(0, m, num_betas) phi <- matrix(0, m, 1) ll <- matrix(0, m, 1) + for (j in 1:m){ phi[j,] <- res[[j]][["solution"]][1] beta[j,] <- res[[j]][["solution"]][2:(1+num_betas)] @@ -681,6 +1016,55 @@ tcareg.optimize_j <- function(j, y, W, C1, C3, mus_hat, gammas_hat, const, sigma return( list("solution" = nloptr_res[["solution"]], "objective" = nloptr_res[["objective"]]) ) } +# +# conditional_model_log_likelihood.all <- function(X, y, W, mus_hat, sigmas_hat, C2, deltas_hat, C1, gammas_hat, tau_hat, C3, phis, betas, alphas, intercepts, mdl, parallel, num_cores){ +# +# flog.debug("Starting function 'conditional_model_log_likelihood.all'...") +# +# config <- config::get(file = system.file("extdata", "config.yml", package = "TCA"), use_parent = FALSE) +# lambda <- config[["lambda"]] +# +# l <- length(mdl) +# k <- ncol(W) +# n <- nrow(W) +# m <- ncol(X) +# p1 <- ncol(C1) +# C1_ <- calc_C1_W_interactions(W,C1) +# X_tilde = X - (tcrossprod(C2,deltas_hat) + tcrossprod(C1_,gammas_hat) + tcrossprod(W,mus_hat)) +# const <- -n*log(2*pi) +# sigmas_squared <- sigmas_hat**2 +# V <- tau_hat**2 + tcrossprod(W**2,sigmas_squared) +# C3 <- cbind(as.matrix(numeric(n)+1), C3) +# cl <- NULL +# if (parallel){ +# cl <- init_cluster(num_cores) +# clusterExport(cl, varlist = c("l","k","n","m","p1","C1_","X_tilde","const","V","X","y","W","mus_hat","sigmas_squared","C1","deltas_hat","C1","gammas_hat","tau_hat", "C3", "mdl", "phis", "alphas", "betas", "intercepts"), envir=environment()) +# } +# flog.debug("Calculate the likelihood for every feature...") +# res <- pblapply(1:m,function(j){ +# gammas_hat_j_tilde <- t(Reshape(gammas_hat[j,],p1,k)[,mdl]); +# gammas_hat_j_tilde <- if (p1 == 1) t(gammas_hat_j_tilde) else gammas_hat_j_tilde; +# e1 <- V[,j]; +# e2 <- W[,mdl]*repmat(sigmas_squared[j,mdl],n,1); +# e3 <- t(repmat(e1,l,1)); +# e4 <- repmat(mus_hat[j,mdl],n,1) + tcrossprod(C1, gammas_hat_j_tilde) + (e2 * t(repmat(X_tilde[,j],l,1)) )/e1; +# e5 <- repmat(sigmas_squared[j,mdl],n,1); +# e6 <- e2/e3; +# sigmas_squared_j <- sigmas_squared[j,]; +# phi_j <- phis[j]; +# beta_j <- betas[j,]; +# alpha_j <- alphas[j,]; +# intercept_j <- intercepts[j,]; +# x <- c(phi_j,beta_j,alpha_j,intercept_j) +# neg_ll <- conditional_model_minus_log_likelihood(x, y, C3, const, sigmas_squared_j, e1, e2, e3, e4, e5, e6, mdl, lambda); +# return(-neg_ll$objective); +# }, cl = cl ) +# if (parallel) stop_cluster(cl) +# ll0 <- numeric(m) +# for (j in 
1:m) ll0[j] <- res[[j]] +# return(ll0) +# } + conditional_model_minus_log_likelihood <- function(x, y, C3, const, sigmas_squared_j, e1, e2, e3, e4, e5, e6, mdl, lambda){ l <- length(mdl) @@ -690,7 +1074,6 @@ conditional_model_minus_log_likelihood <- function(x, y, C3, const, sigmas_squar alpha_j = x[-1:-(1+l)] s1 <- beta_j**2 s2 <- phi**2 + sum((s1)*(sigmas_squared_j[mdl])) - (tcrossprod(e2,t(beta_j))**2)/e1 # sigma_ij_tilde_squared - #s3 <- tcrossprod(C3,t(alpha_j)) + tcrossprod(e4,t(beta_j)) - y # mu_ij_tilde - y s3 <- C3 %*% as.matrix(alpha_j) + tcrossprod(e4,t(beta_j)) - y # mu_ij_tilde - y s4 <- s3/s2 s5 <- s3*s4 diff --git a/R/refactor.R b/R/refactor.R index 7a2f26f..7e4e75b 100644 --- a/R/refactor.R +++ b/R/refactor.R @@ -27,7 +27,8 @@ refactor.run <- function(X, k, sparsity, C, C.remove, sd_threshold, num_comp, ra X_adj <- X if (!is.null(C)){ flog.info("Adjusting the data for covariates...") - X_adj <- X - tcrossprod(tcrossprod(X,tcrossprod(matrix.inverse(crossprod(C,C)),C)),C) + C_ <- cbind(numeric(nrow(C)) + 1, C) + X_adj <- X - tcrossprod(tcrossprod(X,tcrossprod(matrix.inverse(crossprod(C_,C_)),C_)),C_) } flog.info("Running PCA on X using rand_svd == %s...", rand_svd) diff --git a/R/utils.R b/R/utils.R index 82442cb..bae0f04 100644 --- a/R/utils.R +++ b/R/utils.R @@ -61,8 +61,21 @@ assert <- function (expr, error) { if (! expr) stop(error, call. = FALSE) } +tensor.validate_input <- function(X, scale, parallel, num_cores, log_file, debug){ -tca.validate_input <- function(X, W, C1, C1.map, C2, refit_W, refit_W.features, refit_W.sparsity, refit_W.sd_threshold, tau, parallel, num_cores, max_iters, log_file, debug){ + flog.debug("Validating input types...") + + assert(is.matrix(X), "X must be of class 'matrix'") + assert(!is.null(rownames(X)) & !is.null(colnames(X)), "X must have row names and column names") + assert(is.logical(scale), "scale must be of class 'logical'") + assert(is.logical(debug), "debug must be of class 'logical'") + assert(is.logical(parallel), "parallel must be of class 'logical'") + assert(is.null(num_cores) | is.numeric(num_cores), "argument num_cores must take a numeric value or NULL") + assert(is.character(log_file) | is.null(log_file), "log_file must be of class 'character' or NULL") + +} + +tca.validate_input <- function(X, W, C1, C1.map, C2, refit_W, refit_W.features, refit_W.sparsity, refit_W.sd_threshold, tau, constrain_mu, parallel, num_cores, max_iters, log_file, debug){ flog.debug("Validating input types...") @@ -85,14 +98,15 @@ tca.validate_input <- function(X, W, C1, C1.map, C2, refit_W, refit_W.features, assert(is.character(log_file) | is.null(log_file), "log_file must be of class 'character' or NULL") flog.debug("Validating input stucture and values...") - assert(!is.null(rownames(X)) | !is.null(colnames(X)), "X must have row names and column names") - assert(!is.null(rownames(W)) | !is.null(colnames(W)), "W must have row names and column names") - if (!is.null(C1)) assert(!is.null(rownames(C1)) | !is.null(colnames(C1)), "C1 must have row names and column names") - if (!is.null(C2)) assert(!is.null(rownames(C2)) | !is.null(colnames(C2)), "C2 must have row names and column names") + assert(!is.null(rownames(X)) & !is.null(colnames(X)), "X must have row names and column names") + assert(!is.null(rownames(W)) & !is.null(colnames(W)), "W must have row names and column names") + if (!is.null(C1)) assert(!is.null(rownames(C1)) & !is.null(colnames(C1)), "C1 must have row names and column names") + if (!is.null(C2)) assert(!is.null(rownames(C2)) & 
!is.null(colnames(C2)), "C2 must have row names and column names") flog.debug("Validating input conditions...") if (refit_W) assert(refit_W.sparsity <= nrow(X) , "argument refit_W.sparsity must satisfy refit_W.sparsity <= nrow(X)") if (refit_W) assert(refit_W.sd_threshold >= 0 , "argument refit_W.sd_threshold must satisfy refit_W.sd_threshold >= 0") + if (!(is.null(C1.map))) assert(constrain_mu , "argument C1.map cannot be used with constrain_mu set to FALSE") flog.debug("Validating matrix dimensions...") assert(dim(X)[2] == dim(W)[1] , "The number of columns in X is inconsistent with the number of rows in W") @@ -134,7 +148,7 @@ tcasub.validate_input <- function(tca.mdl, features, log_file, debug){ } -tcareg.validate_input <- function(X, W, y, C3, test, null_model, alternative_model, save_results, output, sort_results, parallel, num_cores, log_file, features_metadata ,debug){ +tcareg.validate_input <- function(X, W, y, C3, test, null_model, alternative_model, save_results, fast_mode, output, sort_results, parallel, num_cores, log_file, features_metadata ,debug){ flog.debug("Validating input type for tcareg...") @@ -155,14 +169,17 @@ tcareg.validate_input <- function(X, W, y, C3, test, null_model, alternative_mod assert(is.character(output), "argument output must take a character value") assert(is.logical(save_results), "argument save_results must take a logical value") + assert(is.logical(fast_mode), "argument fast_mode must take a logical value") assert(is.logical(sort_results), "argument sort_results must take a logical value") assert(is.logical(parallel), "argument parallel must take a logical value") assert(is.logical(debug), "argument debug must take a logical value") assert(is.character(log_file) | is.null(log_file), "argument log_file must take a character value or NULL") flog.debug("Validating input stucture and values...") - assert(!is.null(rownames(X)) | !is.null(colnames(X)), "X must have row names and column names") - if (!is.null(C3)) assert(!is.null(rownames(C3)) | !is.null(colnames(C3)), "C3 must have row names and column names") + assert(!is.null(rownames(X)) & !is.null(colnames(X)), "X must have row names and column names") + assert(!is.null(rownames(W)) & !is.null(colnames(W)), "W must have row names and column names") + assert(!is.null(rownames(y)), "y must have row names") + if (!is.null(C3)) assert(!is.null(rownames(C3)) & !is.null(colnames(C3)), "C3 must have row names and column names") if (test == "custom") assert( (!is.null(alternative_model)), "argument alternative_model cannot be NULL when test=custom") if ( (!is.null(alternative_model))){ assert(test == "custom", "argument test must be set to 'custom' if argument alternative_model is not NULL") @@ -173,6 +190,7 @@ tcareg.validate_input <- function(X, W, y, C3, test, null_model, alternative_mod assert(length(setdiff(alternative_model, null_model)) > 0 & length(setdiff(null_model, alternative_model)) == 0, "null_model must be nested within alternative_model") } } + assert(!( (sort_results & test == "marginal_conditional") & fast_mode), "sort_results cannot be set to TRUE when fast_mode == TRUE and test == 'marginal_conditional'") flog.debug("Validating matrix dimensions...") if (!is.null(C3)) assert(dim(X)[2] == dim(C3)[1] , "the number of columns in X is inconsistent with the number of rows in C3") @@ -206,36 +224,44 @@ lrt.test <- function(ll0, ll1, df){ } -save_association_results <- function(res, output, test, alternative_model, feature_ids, W_names, C3_names, sort_results, features_metadata){ 
+save_association_results <- function(res, output, test, fast_mode, alternative_model, feature_ids, W_names, C3_names, sort_results, features_metadata){ flog.debug("Running save_association_results...") m <- length(feature_ids) metadata <- if (is.null(features_metadata)) matrix(0,m,0) else parse_features_metadata(feature_ids, features_metadata) - if (test == "marginal" | test == "marginal_conditional"){ - # marginal or marginal_conditional + if (test == "marginal" | (!fast_mode & test == "marginal_conditional") ){ for (i in 1:length(res)){ filename <- paste(output, ".", test, ".", W_names[i], ".txt", sep ="") wnames <- if (test == "marginal") W_names[i] else W_names - save_association_results.save(res[[i]], filename, feature_ids, sort_results, wnames, C3_names, metadata) + save_association_results.save(res[[i]], test, fast_mode, filename, feature_ids, sort_results, wnames, C3_names, metadata) } }else{ - # joint, single_effect, or custom filename <- paste(output, ".", test, ".txt", sep ="") + keep <- 1:length(W_names) if (test == "custom") keep <- alternative_model - if (test == "joint") keep <- 1:length(W_names) if (test == "single_effect") keep <- 1 - save_association_results.save(res, filename, feature_ids, sort_results, W_names[keep], C3_names, metadata) + save_association_results.save(res, test, fast_mode, filename, feature_ids, sort_results, W_names[keep], C3_names, metadata) } } -save_association_results.save <- function(res, filename, feature_ids, sort_results, W_names, C3_names, metadata){ +save_association_results.save <- function(res, test, fast_mode, filename, feature_ids, sort_results, W_names, C3_names, metadata){ flog.debug("save_association_results.save...") - #config <- config::get() config <- config::get(file = system.file("extdata", "config.yml", package = "TCA"), use_parent = FALSE) m <- length(feature_ids) - data <- data.frame(feature_ids, metadata, res$pvals, res$qvals, res$beta, res$null_ll, res$alternative_ll, res$stats, res$df, res$intercept, res$alpha) - betas <- if(length(W_names) == 1) "beta" else lapply(1:length(W_names), function(i) paste("beta.",W_names[i],sep="")) - colnames(data) <- c("ID", colnames(metadata), "pval", "qval", betas, "null_ll", "alternative_ll", "chi_squared", "df", "intercept", C3_names) + betas <- if(length(W_names) == 1) "beta" else unlist(lapply(1:length(W_names), function(i) paste("beta.",W_names[i],sep=""))) + pvals <- if(test == "marginal_conditional" & fast_mode) unlist(lapply(1:length(W_names), function(i) paste("pval.",W_names[i],sep=""))) else "pval" + qvals <- if(test == "marginal_conditional" & fast_mode) unlist(lapply(1:length(W_names), function(i) paste("qval.",W_names[i],sep=""))) else "qval" + alpha <- if (length(C3_names)) unlist(lapply(1:length(C3_names), function(i) paste("alpha.",C3_names[i],sep=""))) else c() + df <- if(is.null(res$df)) c() else "df" + df.values <- if(is.null(res$df)) matrix(0,m,0) else matrix(0,m,1)+res$df + + if (fast_mode){ + data <- data.frame(feature_ids, metadata, res$pval, res$qval, res$beta, res$alternative_ll, res$stat, df.values, res$intercept, res$alpha, res$phi) + colnames(data) <- c("ID", colnames(metadata), pvals, qvals, betas, "alternative_ll", "stat", df, "intercept", alpha, "phi") + }else{ + data <- data.frame(feature_ids, metadata, res$pval, res$qval, res$beta, res$null_ll, res$alternative_ll, res$stat, res$df, res$intercept, res$alpha, res$phi) + colnames(data) <- c("ID", colnames(metadata), "pval", "qval", betas, "null_ll", "alternative_ll", "chi_squared", "df", "intercept", alpha, "phi") + } if (sort_results){ # sort by p-value data <- data[order(data$pval),]
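Note that under fast_mode with test == "marginal_conditional", the table written above carries one p-value and one q-value column per cell type instead of a single pval/qval pair, which is also why sort_results is rejected for that combination in tcareg.validate_input. A small sketch of the resulting column names, assuming hypothetical cell types CD4T and CD8T:

```r
W_names <- c("CD4T", "CD8T")  # hypothetical cell-type names
unlist(lapply(1:length(W_names), function(i) paste("pval.", W_names[i], sep = "")))
# [1] "pval.CD4T" "pval.CD8T"
unlist(lapply(1:length(W_names), function(i) paste("qval.", W_names[i], sep = "")))
# [1] "qval.CD4T" "qval.CD8T"
```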
"phi") + } if (sort_results){ # sort by p-value data <- data[order(data$pval),] diff --git a/README.md b/README.md index f678cbd..28c3642 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ The R package of TCA is available on CRAN. Run the following in R to install TCA install.packages("TCA") ``` -The full documentation of the TCA R package can be found here. +The manual of the TCA R package can be found here. You can also find a full working example of TCA in this vignette about cell-type-specific resolution epigenetics using TCA in R. @@ -23,7 +23,7 @@ You can also find a full working example of TCA in 1: Rahmani et al. "Cell-type-specific resolution epigenetics without the need for cell sorting or single-cell biology." Nature Communications 2019. +1: Rahmani et al. "Cell-type-specific resolution epigenetics without the need for cell sorting or single-cell biology." Nature communications 10.1 (2019): 1-11. + +2: Houseman et al. "DNA methylation arrays as surrogate measures of cell mixture distribution." BMC bioinformatics 13.1 (2012): 86. + +3: Rahmani et al. "BayesCCE: a Bayesian framework for estimating cell-type composition from DNA methylation without the need for methylation reference." Genome biology 19.1 (2018): 1-18. + +4: Hannum et al. "Genome-wide methylation profiles reveal quantitative views of human aging rates." Molecular cell 49.2 (2013): 359-367. + +5: Liu et al. "Blood monocyte transcriptome and epigenome analyses reveal loci associated with human atherosclerosis." Nature communications 8.1 (2017): 1-12. + +6: Teschendorff et al. "A comparison of reference-based algorithms for correcting cell-type heterogeneity in Epigenome-Wide Association Studies." BMC bioinformatics 18.1 (2017): 105. + +7: Lehne et al. "A coherent approach for analysis of the Illumina HumanMethylation450 BeadChip improves data quality and performance in epigenome-wide association studies." Genome biology 16.1 (2015): 37. + +8: Bell et al. "Epigenome-wide scans identify differentially methylated regions for age and age-related phenotypes in a healthy ageing population." PLoS genetics 8.4 (2012). + +9: Jaffe and Irizarry. "Accounting for cellular heterogeneity is critical in epigenome-wide association studies." Genome biology 15.2 (2014): R31. + +10: Rahmani et al. "Sparse PCA corrects for cell type heterogeneity in epigenome-wide association studies." Nature methods 13.5 (2016): 443. -2: Liu et al. "Epigenome-wide association data implicate DNA methylation as an intermediary of genetic risk in rheumatoid arthritis." Nature biotechnology. 2013 Feb;31(2):142. +11: Rahmani et al. "Correcting for cell-type heterogeneity in DNA methylation: a comprehensive evaluation." Nature methods 14.3 (2017): 218-219. -3: Rahmani et al. "GLINT: a user-friendly toolset for the analysis of high-throughput DNA-methylation array data." Bioinformatics. 2017 Feb 8;33(12):1870-2. +12: Rahmani et al. "GLINT: a user-friendly toolset for the analysis of high-throughput DNA-methylation array data." Bioinformatics 33.12 (2017): 1870-1872. -4: Houseman et al. "DNA methylation arrays as surrogate measures of cell mixture distribution." BMC bioinformatics. 2012 Dec;13(1):86. +13: Chen et al. "Discovery of cross-reactive probes and polymorphic CpGs in the Illumina Infinium HumanMethylation450 microarray." Epigenetics 8.2 (2013): 203-209. -5: Rahmani et al. "BayesCCE: a Bayesian framework for estimating cell-type composition from DNA methylation without the need for methylation reference." Genome biology. 2018 Dec;19(1):141. 
+14: McCartney et al. "Identification of polymorphic and off-target probe binding sites on the Illumina Infinium MethylationEPIC BeadChip." Genomics data 9 (2016): 22-24. -6: Horvath "DNA methylation age of human tissues and cell types. Genome biology." 2013 Oct;14(10):3156. +15: Paquette et al. "Regions of variable DNA methylation in human placenta associated with newborn neurobehavior." Epigenetics 11.8 (2016): 603-613. -7: Rahmani et al. "Sparse PCA corrects for cell type heterogeneity in epigenome-wide association studies." Nature methods. 2016 May;13(5):443. +16: Xu et al. "ENmix: a novel background correction method for Illumina HumanMethylation450 BeadChip." Nucleic acids research 44.3 (2016): e20-e20. -8: Rahmani et al. "Correcting for cell-type heterogeneity in DNA methylation: a comprehensive evaluation." Nature methods. 2017 Mar;14(3):218. +17: Bakulski et al. "DNA methylation of cord blood cell types: applications for mixed cell birth studies." Epigenetics 11.5 (2016): 354-362. diff --git a/vignettes/vignette_analysis.R new file mode 100644 index 0000000..7c14583 --- /dev/null +++ b/vignettes/vignette_analysis.R @@ -0,0 +1,427 @@ +library("TCA") +library("ggplot2") +library("ggpubr") +library("pracma") +library("matrixStats") + +prep_data <- function(data_path){ + + file_name1 <- paste(data_path,.Platform$file.sep,"hannum.chr22.RData",sep="") + file_name2 <- paste(data_path,.Platform$file.sep,"liu.cd4.chr22.RData",sep="") + file_name3 <- paste(data_path,.Platform$file.sep,"paquette.chr22.RData",sep="") + + file_names <- c(file_name1, file_name2, file_name3) + + if(any(!file.exists(file_name1, file_name2, file_name3))){ + library("GEOquery") + library("data.table") + library("EpiDISH") + } + + if (!file.exists(file_name1)){ + + # Download the Hannum et al. data + gse <- GEOquery::getGEO("GSE40279", destdir = data_path, GSEMatrix = TRUE) + + # Extract methylation data + X.hannum <- Biobase::exprs(gse[[1]]) + + # Covariates; note that there is also an 'ethnicity' covariate in the data; however, it is perfectly captured by the plate information + plate.hannum <- as.numeric(as.factor((Biobase::pData(gse[[1]])[["plate:ch1"]]))) + cov.hannum <- cbind(as.numeric((Biobase::pData(gse[[1]])[["age (y):ch1"]])), as.numeric(as.factor((Biobase::pData(gse[[1]])[["gender:ch1"]]))), indicator_vars(plate.hannum)) + rownames(cov.hannum) <- colnames(X.hannum) + colnames(cov.hannum) <- c("age", "gender", "plate1", "plate2", "plate3", "plate4", "plate5", "plate6", "plate7", "plate8")
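+  # (Illustration: indicator_vars(), defined at the bottom of this file, encodes a
+  # categorical vector as 0/1 columns, dropping its last unique value. For a
+  # hypothetical plate vector c(1,2,1,3) it returns:
+  #      [,1] [,2]
+  # [1,]    1    0
+  # [2,]    0    1
+  # [3,]    1    0
+  # [4,]    0    0
+  # hence the 8 plate indicator columns above encode 9 distinct plates.)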
+ + # Calculate principal components from control probes that reflect technical variability; since we are not working with IDAT files we don't have "real" control probes - instead, we use low variance sites that are not expected to capture any true biological variability. + low_var_pcs.hannum <- low_var_pcs(X.hannum, rank = 10, p = 1000) + cov.hannum <- cbind(cov.hannum, low_var_pcs.hannum) + + # estimate cell-type proportions using a reference-based approach + ref <- as.matrix(EpiDISH::centDHSbloodDMC.m[,c("Neutro","Eosino","CD4T","CD8T","Mono","B","NK")]) + W.hannum <- EpiDISH::epidish(X.hannum, ref)$estF + # merge Neutro and Eosino + W.hannum.gran <- W.hannum[,"Neutro",drop=F] + W.hannum[,"Eosino",drop=F] + colnames(W.hannum.gran) <- "Gran" + W.hannum <- cbind(W.hannum.gran,W.hannum[,c("CD4T","CD8T","Mono","B","NK")]) + + # remove polymorphic sites, cross-reactive sites, and non-autosomal sites according to Chen et al.; in addition, remove low variance sites, as those are unlikely to demonstrate biological signal. + X.hannum.processed <- filter_data(X.hannum) + + # keep only sites on chromosome 22 (so that tutorial can run quickly) + map <- read.table("https://github.com/cozygene/glint/blob/master/utils/assets/HumanMethylationSites?raw=true",sep=",",row.names = 1) + chrs <- map[rownames(X.hannum.processed),1] + chr22sites <- which(chrs == 22) + X.hannum.processed.22 <- X.hannum.processed[chr22sites,] + + hannum <- list(X = X.hannum.processed.22, cov = cov.hannum, W = W.hannum) + save(hannum, file = file_name1, compress = "bzip2") + + rm(hannum, gse, X.hannum, X.hannum.processed) + file.remove(paste(data_path,.Platform$file.sep,"GPL13534.soft",sep="")) + file.remove(paste(data_path,.Platform$file.sep,"GSE40279_series_matrix.txt.gz",sep="")) + + } + + if (!file.exists(file_name2)){ + + # Download the Liu et al. CD4 data + + # covariates + gse <- GEOquery::getGEO("GSE56581", destdir = data_path, GSEMatrix = TRUE) + liu.cd4.age <- as.matrix(as.numeric(Biobase::pData(gse[[1]])[["age (yrs):ch1"]])) + colnames(liu.cd4.age) <- "age" + + # methylation data + liu.cd4.methfile <- paste(data_path,.Platform$file.sep,"GSE56581_methylome_normalized.txt",sep="") + download.file("ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE56nnn/GSE56581/suppl/GSE56581_methylome_normalized.txt.gz", paste(liu.cd4.methfile,".gz",sep="")) + gunzip(paste(liu.cd4.methfile,".gz",sep="")) + X.liu.cd4 <- fread(liu.cd4.methfile) + + X.liu.cd4.rownames <- X.liu.cd4$ID_REF + X.liu.cd4 <- as.matrix(X.liu.cd4[,2:ncol(X.liu.cd4)]) + rownames(X.liu.cd4) <- X.liu.cd4.rownames + X.liu.cd4 <- X.liu.cd4[,seq(1,ncol(X.liu.cd4),2)] + + # Calculate principal components from control probes that reflect technical variability; since we are not working with IDAT files we don't have "real" control probes - instead, we use low variance sites that are not expected to capture any true biological variability. + low_var_pcs.liu.cd4 <- low_var_pcs(X.liu.cd4, rank = 10) + cov.liu.cd4 <- cbind(liu.cd4.age, low_var_pcs.liu.cd4) + + # keep only sites on chromosome 22 (so that tutorial can run quickly) + map <- read.table("https://github.com/cozygene/glint/blob/master/utils/assets/HumanMethylationSites?raw=true",sep=",",row.names = 1) + chrs <- map[rownames(X.liu.cd4),1] + chr22sites <- which(chrs == 22) + X.liu.cd4.22 <- X.liu.cd4[chr22sites,] + + liu.cd4 <- list(X = X.liu.cd4.22, cov = cov.liu.cd4) + save(liu.cd4, file = file_name2) + + rm(liu.cd4, gse, X.liu.cd4.22, X.liu.cd4) + file.remove(paste(data_path,.Platform$file.sep,"GSE56581_methylome_normalized.txt",sep="")) + file.remove(paste(data_path,.Platform$file.sep,"GSE56581_series_matrix.txt.gz",sep="")) + + } + + if (!file.exists(file_name3)){ + # Paquette et al.
(Epigenetics 2016) + gse <- GEOquery::getGEO("GSE75248", destdir = data_path, GSEMatrix = TRUE) + + # methylation data + X.paquette <- Biobase::exprs(gse[[1]]) + + # Calculate principal components from control probes that reflect technical variability; since we are not working with IDAT files we don't have "real" control probes - instead, we use low variance sites that are not expected to capture any true biological variability. + low_var_pcs.paquette <- low_var_pcs(X.paquette, rank = 10, p = 1000) + + # remove polymorphic sites, cross-reactive sites, and non-autosomal sites according to Chen et al.; in addition, remove low variance sites, as those are unlikely to demonstrate biological signal. + X.paquette.processed <- filter_data(X.paquette) + + # keep only sites on chromosome 22 (so that tutorial can run quickly); Paquette et al. reported some signal in chromosome 16 + map <- read.table("https://github.com/cozygene/glint/blob/master/utils/assets/HumanMethylationSites?raw=true",sep=",",row.names = 1) + chrs <- as.character(map[rownames(X.paquette.processed),1]) + chr22sites <- which(chrs == "22") + X.paquette.processed.22 <- X.paquette.processed[chr22sites,] + + # Extract covariates + gestational_age <- as.numeric(unlist(lapply(Biobase::pData(gse[[1]])[["gestational age:ch1"]], function(x) strsplit(x,";")[[1]][1]))) + arousal <- as.numeric(Biobase::pData(gse[[1]])[["arousal:ch1"]]) + batch <- as.numeric(as.factor((Biobase::pData(gse[[1]])[["batch:ch1"]]))) + batch[is.na(batch)] <- 3 + batch <- indicator_vars(batch) + # since gender information is not available on the GEO record, get gender information that was inferred from the IDAT files based on X chromosome methylation + gender <- read.table("https://raw.githubusercontent.com/cozygene/TCA/master/vignettes/gender.paquette.txt", row.names = 1) + cov.paquette <- data.frame(gender,gestational_age,arousal,batch) + colnames(cov.paquette) <- c("gender","gestational_age","arousal","batch1","batch2") + rownames(cov.paquette) <- colnames(X.paquette.processed.22) + # remove NA values + keep <- setdiff(1:nrow(cov.paquette) ,which(rowSums(is.na(cov.paquette)) > 0)) + cov.paquette <- cbind(cov.paquette[keep,], low_var_pcs.paquette[keep,]) + + X.paquette.processed.22 <- X.paquette.processed.22[,keep] + + # Load cell-type proportions estimated using a reference-based approach; these were estimated using the raw IDAT files of the data (available on GEO) with the package ENmix. + W.paquette <- read.table("https://raw.githubusercontent.com/cozygene/TCA/master/vignettes/W.paquette.txt", row.names = 1) + W.paquette <- W.paquette[,2:ncol(W.paquette)] + W.paquette[W.paquette < 0] <- 0 + # remove lowly abundant cell types + W.paquette <- W.paquette[keep, colMeans(W.paquette)>=0.01] + rownames(W.paquette) <- rownames(cov.paquette) + # normalize cell type proportions (output from the ENmix package didn't require the proportions of an individual to sum up to 1) + W.paquette <- W.paquette/t(repmat(rowSums(W.paquette),ncol(W.paquette),1))
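+  # (An equivalent base-R form of the row normalization above, without pracma::repmat:
+  # W.paquette <- sweep(W.paquette, 1, rowSums(W.paquette), "/")
+  # i.e., divide each row by its sum so that every individual's proportions sum to 1.)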
+ + # get the set of cord blood reference CpGs that were used for estimating cell-type proportions; we will use these CpGs for re-estimating W + library("FlowSorted.CordBlood.450k") + ref.cord <- get("FlowSorted.CordBlood.450k") + ref.cord <- preprocessRaw(ref.cord) + ref.cord.cpgs <- rownames(FlowSorted.CordBlood.450k.ModelPars) + ref.cord.cpgs.intersect <- intersect(ref.cord.cpgs, rownames(X.paquette)) + + paquette <- list(X = X.paquette.processed.22, cov = cov.paquette, W = W.paquette, X.ref_cpgs = X.paquette[ref.cord.cpgs.intersect,keep]) + save(paquette, file = file_name3) + + rm(paquette, gse, X.paquette, X.paquette.processed, X.paquette.processed.22) + file.remove(paste(data_path,.Platform$file.sep,"GSE75248_series_matrix.txt.gz",sep="")) + + ## note - the files W.paquette.txt and gender.paquette.txt were generated using the original IDAT files of the Paquette data. + # Gender information was inferred from methylation levels from the X chromosome as it is not available on the GEO record. + # Code attached below; in order to run it, download the IDAT files from GEO accession number GSE75248 and set 'path' in 'readidat' below to the location of the files. + # require("ENmix") + # rgdat <- readidat(path = ".", manifestfile=NULL, recursive = FALSE, verbose = TRUE) + # meth <- getmeth(rgdat); rm(rgdat) + # W <- estimateCellProp(meth, refdata="FlowSorted.CordBlood.450k", cellTypes=NULL, nonnegative = TRUE, nProbes=50, normalize=TRUE, refplot=FALSE) + # write.table(W, file = "W.paquette.txt", quote = FALSE, sep = " ") + # beta <- getB(meth) + # map <- read.table("https://github.com/cozygene/glint/blob/master/utils/assets/HumanMethylationSites?raw=true",sep=",",row.names = 1) + # chrs <- as.character(map[rownames(beta),1]) + # sites.X <- which(chrs == "X") + # sex_cpgs.pca.paquette <- prcomp(t(beta[sites.X,]), center=TRUE, scale=TRUE, rank = 2) + # write.table(sex_cpgs.pca.paquette$x[,1:2], file = "sex_cpgs.pca.paquette.txt", quote = FALSE, sep = " ") + # plot(sex_cpgs.pca.paquette$x[,1],sex_cpgs.pca.paquette$x[,2]) # shows a separation between males and females; note that the males/females ratio here perfectly matches the one in the Paquette et al. paper.
+ # df <- data.frame(as.numeric(sex_cpgs.pca.paquette$x[,1] > 0)) + # colnames(df) <- "gender"; rownames(df) <- rownames(sex_cpgs.pca.paquette$x) + # write.table(df,file = "gender.paquette.txt", quote = FALSE, sep = " ") + + } + + return(file_names) + +} + + +indicator_vars <- function(x){ + x.indicators <- matrix(0,length(x),length(unique(x))-1) + counter <- 1 + u <- unique(x) + for (i in setdiff(u,u[length(u)])){ + x.indicators[,counter] <- as.numeric(x == i) + counter <- counter + 1 + } + return(x.indicators) +} + +low_var_pcs <- function(X, rank = 10, p = 1000){ + site.variances <- matrixStats::rowVars(X) + names(site.variances) <- rownames(X) + low.var.sites <- names(head(sort(site.variances), p)) + low.var.pca <- prcomp(t(X[low.var.sites,]), center=TRUE, scale=TRUE, rank = rank) + return(low.var.pca$x) +} + +filter_data <- function(X, var_th = 0.0001){ + nonspecific_probes <- read.table("https://raw.githubusercontent.com/cozygene/glint/master/parsers/assets/nonspecific_probes.txt")[,1] + XY_probes <- read.table("https://raw.githubusercontent.com/cozygene/glint/master/parsers/assets/HumanMethylationSites_X_Y.txt")[,1] + polymorphic_probes <- read.table("https://raw.githubusercontent.com/cozygene/glint/master/parsers/assets/polymorphic_cpgs.txt")[,1] + # remove sites with very low variance that are unlikely to exhibit biological signal + site.variances <- matrixStats::rowVars(X) + names(site.variances) <- rownames(X) + low_var_sites <- names(which(site.variances < var_th)) + exclude <- union(nonspecific_probes,union(XY_probes,union(low_var_sites,polymorphic_probes))) + return(X[setdiff(rownames(X),exclude),]) +} + +plot_qq <- function(pvals, labels, ggarrange.nrow = 1, ggarrange.ncol = 1, alpha = 0.05, experiment_wide_line = TRUE){ + significance_th <- list(alpha/length(pvals[[1]])) + if(length(pvals)-1) significance_th[[2]] <- alpha/(length(pvals)*length(pvals[[1]])) + qqplots <- lapply(1:length(pvals), function(p){ + df <- data.frame(pvals.obs = -log10(sort(pvals[[p]])), pvals.exp = -log10(sort((1:length(pvals[[1]]))/length(pvals[[1]])))); + qqplot <- ggplot(df, aes(x = pvals.exp, y = pvals.obs)) + + stat_binhex(geom = "point", bins=1000, size=1) + + geom_abline() + + ggtitle(labels[p]) + + xlab(expression(Expected~-log[10](P))) + ylab(expression(Observed~-log[10](P))) + + theme_bw() + + guides(fill="none") + + geom_hline(yintercept=-log10(significance_th[[1]]), linetype="dashed", color = "red", size=1) + if (length(significance_th)-1 & experiment_wide_line) qqplot <- qqplot + geom_hline(yintercept=-log10(significance_th[[2]]), linetype="dashed", color = "red", size=0.5) + return(qqplot) + }) + ggarrange(plotlist = qqplots, ncol = ggarrange.ncol, nrow = ggarrange.nrow) +} + +plot_scatter <- function(dfs, ggarrange.ncol, ggarrange.nrow, xlab, ylab, titles){ + plots <- vector("list", length = length(dfs)) + for (i in 1:length(dfs)){ + df <- data.frame(y = dfs[[i]]$y, x = dfs[[i]]$x) + plots[[i]] <- ggplot(df, aes(x = x, y = y)) + + geom_point(alpha = 0.6) + + geom_smooth(method=lm) + + stat_cor(method = "pearson", colour = "blue") + + xlab(xlab) + + ylab(ylab) + + ggtitle(titles[i]) + } + ggarrange(plotlist = plots, ncol = ggarrange.ncol, nrow = ggarrange.nrow) +} + + +if(FALSE){ + + # Set a path for storing data + data_path <- "./" + + # Load the data + filenames <- prep_data(data_path) + for (filename in filenames) load(filename) + + ## Experiment #1: detecting CD4 differential methylation with age; working under the assumption X|Y (i.e. age affects methylation levels)
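+  # (Here, under X|Y, age enters the TCA model itself through C1, and cell-type-level
+  # p-values are read off the fitted model via gammas_hat_pvals; in Experiment #2 below,
+  # which works under Y|X, the phenotype is instead regressed on the cell-type-specific
+  # signals with tcareg().)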
+ + # Apply the TCA model to the Hannum whole-blood data + tca.mdl.hannum <- tca(X = hannum$X, + W = hannum$W, + C1 = hannum$cov[,c("gender","age")], + C2 = hannum$cov[,3:ncol(hannum$cov)]) + + # Extract p-values of a joint test + tca.mdl.hannum.pvals.joint <- tca.mdl.hannum$gammas_hat_pvals.joint[,"age"] + + # Extract p-values for each cell type, under a marginal conditional test + tca.mdl.hannum.pvals.marg_cond <- tca.mdl.hannum$gammas_hat_pvals[,paste(colnames(hannum$W),".age",sep="")] + + # qq-plots - for the p-values of the joint test, and for the p-values in CD4, under a marginal conditional test + plot_qq(list(tca.mdl.hannum.pvals.joint, tca.mdl.hannum.pvals.marg_cond[,"CD4T.age"]), + labels = c("Joint test with age", "CD4 marginal conditional test with age"), + ggarrange.nrow = 1, + ggarrange.ncol = 2, + experiment_wide_line = FALSE) + + # Run ReFACTor to capture more of the variation in cell-type composition + refactor.mdl.hannum <- refactor(X = hannum$X, + k = 6, + C = hannum$cov[,3:ncol(hannum$cov)]) + + # Rerun TCA, this time including the ReFACTor components as additional covariates + tca.mdl.hannum.2 <- tca(X = hannum$X, + W = hannum$W, + C1 = hannum$cov[,c("gender","age")], + C2 = cbind(hannum$cov[,3:ncol(hannum$cov)],refactor.mdl.hannum$scores)) + + # Extract the updated p-values of a joint test + tca.mdl.hannum.2.pvals.joint <- tca.mdl.hannum.2$gammas_hat_pvals.joint[,"age"] + + # Extract the updated marginal conditional p-values + tca.mdl.hannum.2.pvals.marg_cond <- tca.mdl.hannum.2$gammas_hat_pvals[,paste(colnames(hannum$W),".age",sep="")] + + # qq-plots - for the p-values of the joint test, and for the p-values in CD4, under a marginal conditional test + plot_qq(list(tca.mdl.hannum.2.pvals.joint, tca.mdl.hannum.2.pvals.marg_cond[,"CD4T.age"]), + labels = c("Joint test with age", "CD4 marginal conditional test with age"), + ggarrange.nrow = 1, + ggarrange.ncol = 2, + experiment_wide_line = FALSE) + + # Extract the hits based on the joint test + hits.joint <- names(which(tca.mdl.hannum.2.pvals.joint < 0.05/nrow(hannum$X))) + + # Extract the hits from hits.joint where CD4 cells demonstrate the lowest p-value across all cell types + cd4.hits <- names(which(tca.mdl.hannum.2.pvals.marg_cond[hits.joint,"CD4T.age"] == rowMins(tca.mdl.hannum.2.pvals.marg_cond[hits.joint,]))) + + sprintf("Detected %d associations using a joint test, %d associations using a marginal conditional test, and %d associations in CD4 cells using a marginal conditional test.", + sum(tca.mdl.hannum.2.pvals.joint <= 0.05/nrow(hannum$X)), + sum(tca.mdl.hannum.2.pvals.marg_cond <= 0.05/(nrow(hannum$X)*ncol(hannum$W))), + sum(tca.mdl.hannum.2.pvals.marg_cond[,"CD4T.age"] <= 0.05/(nrow(hannum$X)*ncol(hannum$W)))) + + # Replicate the CD4 hits in the Liu purified CD4 data + cd4.hits.liu.pvals <- unlist(lapply(1:length(cd4.hits), + function(x) summary(lm(y~., data.frame(y=liu.cd4$X[cd4.hits[x],], liu.cd4$cov)))$coefficients["age","Pr(>|t|)"]))
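+  # (One reasonable replication criterion, assuming Bonferroni correction over the
+  # candidate CD4 hits rather than the full array, would be:
+  # cd4.hits.replicated <- cd4.hits[cd4.hits.liu.pvals < 0.05/length(cd4.hits)]
+  # The plots below assume that four of the CD4 hits replicate.)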
+ + # Plot adjusted methylation (i.e. adjusted for covariates) as a function of age in all four replicated CD4 associations + dfs <- vector("list", length = 4) + for (i in 1:4){ + r <- scale(residuals(lm(y~., data.frame(y=liu.cd4$X[cd4.hits[i],], liu.cd4$cov[,2:ncol(liu.cd4$cov)])))) + dfs[[i]] <- data.frame(x = liu.cd4$cov[,"age"], y = r) + } + plot_scatter(dfs = dfs, + ggarrange.ncol = 2, + ggarrange.nrow = 2, + xlab = "Age", + ylab = "Adjusted methylation level", + titles = paste("CD4 methylation in ",cd4.hits,sep="")) + + # Verify that p-values are calibrated in the Liu data + liu.cd4.regression.pvals <- unlist(lapply(1:nrow(liu.cd4$X), + function(x) summary(lm(y~.,data.frame(y = liu.cd4$X[x,],liu.cd4$cov)))$coefficients["age","Pr(>|t|)"])) + plot_qq(list(liu.cd4.regression.pvals), labels = "Linear regression (sorted CD4)") + + + + + + ## Experiment #2: detecting differential methylation with infant arousal in granulocytes; working under the assumption Y|X (i.e. methylation levels affect or mediate components affecting infant arousal). + + # Apply the TCA model to the Paquette data + tca.mdl.paquette <- tca(X = paquette$X, + W = paquette$W, + C1 = paquette$cov[,c("gender","gestational_age")], + C2 = paquette$cov[,4:ncol(paquette$cov)], + constrain_mu = TRUE) + + # Run tcareg with a joint test and extract p-values; generate a qq-plot + C3_names <- c("gender","gestational_age","batch1","batch2") + tcareg.mdl.paquette.joint <- tcareg(X = paquette$X, + tca.mdl = tca.mdl.paquette, + y = paquette$cov[,"arousal",drop=F], + C3 = paquette$cov[,C3_names], + test = "joint") + plot_qq(list(tcareg.mdl.paquette.joint$pvals), labels = "Joint test with infant arousal") + + # Since there is inflation in the qq-plot, we try to re-estimate the cell-type proportions + # First, run TCA using only the reference sites to get a new estimate of W + tca.mdl.paquette.refit_W <- tca(X = paquette$X.ref_cpgs, + W = paquette$W, + C1 = paquette$cov[,c("gender","gestational_age")], + C2 = paquette$cov[,4:ncol(paquette$cov)], + constrain_mu = TRUE, + refit_W = TRUE, + refit_W.features = rownames(paquette$X.ref_cpgs)) + # Use the updated estimate of W in a new execution of TCA on the data + tca.mdl.paquette.2 <- tca(X = paquette$X, + W = tca.mdl.paquette.refit_W$W, + C1 = paquette$cov[,c("gender","gestational_age")], + C2 = paquette$cov[,4:ncol(paquette$cov)], + constrain_mu = TRUE) + + # Run tcareg with a joint test and extract p-values; generate a qq-plot + tcareg.mdl.paquette.joint.2 <- tcareg(X = paquette$X, + tca.mdl = tca.mdl.paquette.2, + y = paquette$cov[,"arousal",drop=F], + C3 = paquette$cov[,C3_names], + test = "joint") + plot_qq(list(tcareg.mdl.paquette.joint.2$pvals), labels = "Joint test with infant arousal") + + # Run tcareg with a marginal conditional test; generate qq plots + tcareg.mdl.paquette.2.marg_cond <- tcareg(X = paquette$X, + tca.mdl = tca.mdl.paquette.2, + y = paquette$cov[,"arousal",drop=F], + C3 = paquette$cov[,C3_names], + test = "marginal_conditional") + plot_qq(split(tcareg.mdl.paquette.2.marg_cond$pvals,rep(1:ncol(paquette$W), each = nrow(paquette$X))), + labels = paste(colnames(paquette$W)," marginal conditional with arousal", sep=""), + ggarrange.nrow = 2, + ggarrange.ncol = 2) + + # Extract the hit we got from the joint test + hit.joint <- rownames(paquette$X)[which(tcareg.mdl.paquette.joint.2$pvals < 0.05/nrow(paquette$X))] + + # The p-values of the marginal conditional test in the hit we found with the joint test suggest that the association is in granulocytes + tcareg.mdl.paquette.2.marg_cond$pvals[hit.joint,]
+ + # Extract from the tca model the part that is relevant for the hit found + tcasub.mdl.paquette.2 <- tcasub(tca.mdl = tca.mdl.paquette.2, + features = hit.joint) + + # Calculate cell-type-specific methylation for the samples at the detected CpG + tensor.mdl.hit <- tensor(tca.mdl = tcasub.mdl.paquette.2, + X = paquette$X[hit.joint,,drop=F]) + + # Plot the estimated granulocyte-specific methylation with arousal; adjust methylation levels for covariates + # Compare with using the bulk data + r.gran <- scale(residuals(lm(y~., data.frame(y=tensor.mdl.hit[[2]][1,], paquette$cov[,setdiff(1:ncol(paquette$cov),3)])))) + df.gran <- data.frame(y = r.gran, x = paquette$cov[,"arousal"]) + # The bulk data should be further adjusted for cell-type composition (i.e. tca.mdl.paquette.2$W) + r.bulk <- scale(residuals(lm(y~., data.frame(y=paquette$X[hit.joint,], tca.mdl.paquette.2$W, paquette$cov[,setdiff(1:ncol(paquette$cov),3)])))) + df.bulk <- data.frame(y = r.bulk, x = paquette$cov[,"arousal"]) + plot_scatter(dfs = list(df.bulk, df.gran), + ggarrange.ncol = 2, + ggarrange.nrow = 1, + xlab = "Infant arousal", + ylab = "Adjusted methylation level", + titles = paste(c("Whole-blood methylation in ", "Granulocyte methylation in "),hit.joint,sep="")) + +}
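+
+# (Note that the analysis above sits inside if(FALSE) and therefore does not run when this
+# file is sourced; to reproduce it, source the helper functions, set data_path, and execute
+# the block manually.)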