TCA v1.2.1

cozygene · Feb 14, 2021 · 8b824f1 · 8b824f1
1 parent 34bfb65
commit 8b824f1
Show file tree

Hide file tree

Showing 31 changed files with 1,992 additions and 544 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,17 +1,18 @@
 Package: TCA
 Type: Package
 Title: Tensor Composition Analysis
-Version: 1.1.0
-Authors@R: person("Elior", "Rahmani", email = "[email protected]", role = c("aut", "cre"))
-Author: Elior Rahmani [aut, cre]
-Maintainer: Elior Rahmani <[email protected]>
-Description: Tensor Composition Analysis (TCA) allows the deconvolution of two-dimensional data (features by observations) coming from a mixture of sources into a three-dimensional matrix of signals (features by observations by sources). TCA further allows to test the features in the data for different statistical relations with an outcome of interest while modeling source-specific effects (TCA regression); particularly, it allows to look for statistical relations between source-specific signals and an outcome. For example, TCA can deconvolve bulk tissue-level DNA methylation data (methylation sites by individuals) into a tensor of cell-type-specific methylation levels for each individual (methylation sites by individuals by cell types) and it allows to detect cell-type-specific relations (associations) with an outcome of interest. For more details see Rahmani et al. (2018) <DOI:10.1101/437368>.
+Version: 1.2.1
+Authors@R: c(
+    person("Elior", "Rahmani", role=c("aut", "cre"), email = "[email protected]"),
+    person("Brandon", "Jew", role=c("ctb"))
+    )
+Description: Tensor Composition Analysis (TCA) allows the deconvolution of two-dimensional data (features by observations) coming from a mixture of heterogeneous sources into a three-dimensional matrix of signals (features by observations by sources). The TCA framework further allows to test the features in the data for different statistical relations with an outcome of interest while modeling source-specific effects; particularly, it allows to look for statistical relations between source-specific signals and an outcome. For example, TCA can deconvolve bulk tissue-level DNA methylation data (methylation sites by individuals) into a three-dimensional tensor of cell-type-specific methylation levels for each individual (i.e. methylation sites by individuals by cell types) and it allows to detect cell-type-specific statistical relations (associations) with phenotypes. For more details see Rahmani et al. (2019) <DOI:10.1038/s41467-019-11052-9>.
 License: GPL-3
 Encoding: UTF-8
 LazyData: true
 Imports: config, data.table, futile.logger, gmodels, Matrix, matrixcalc, matrixStats, nloptr, parallel, pbapply, pracma, rsvd, stats, quadprog, glmnet
-RoxygenNote: 6.1.1
-Depends: R (>= 3.4.0)
+RoxygenNote: 7.1.1
+Depends: R (>= 3.5.0)
 Suggests: 
     testthat,
     knitr,

diff --git a/NAMESPACE b/NAMESPACE
@@ -30,7 +30,9 @@ importFrom(pbapply,pboptions)
 importFrom(pracma,Reshape)
 importFrom(pracma,lsqlincon)
 importFrom(pracma,repmat)
+importFrom(pracma,sqrtm)
 importFrom(rsvd,rpca)
+importFrom(stats,anova)
 importFrom(stats,lm)
 importFrom(stats,logLik)
 importFrom(stats,p.adjust)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,23 @@
+TCA v1.2.1 (Release date: 02-13-2021)
+==============
+
+*NOTE* v1.2.1 is not backward compatible with earlier versions
+
+Changes:
+
+* A new argument 'vars.mle' in 'tca' which controls the method used for optimization; setting to FALSE allows a much faster optimization.
+
+* A new argument 'fast_mode' in 'tcareg' allows a much faster optimization.
+
+* A new argument 'constrain_mu' in 'tca' allows to determine whether to constrain the mean parameters in the optimization; setting to false now allows to obtain p-values for the covariates in 'C1' and 'C2'.
+
+* A new 'scale' argument in 'tensor' for dividing estimates by estimated standard deviations.
+
+* Default values of several arguments have been changed.
+
+* Switched the default FDR method used in ‘tcareg’ back from “BY” to “BH” (can be changed by editing the config file).
+
+
 TCA v1.1.0 (Release date: 11-14-2019)
 ==============
 

diff --git a/R/TCA.R b/R/TCA.R
diff --git a/R/model_fit.R b/R/model_fit.R
diff --git a/R/refactor.R b/R/refactor.R
@@ -27,7 +27,8 @@ refactor.run <- function(X, k, sparsity, C, C.remove, sd_threshold, num_comp, ra
   X_adj <- X
   if (!is.null(C)){
     flog.info("Adjusting the data for covariates...")
-    X_adj <- X - tcrossprod(tcrossprod(X,tcrossprod(matrix.inverse(crossprod(C,C)),C)),C)
+    C_ <- cbind(numeric(nrow(C)) + 1, C)
+    X_adj <- X - tcrossprod(tcrossprod(X,tcrossprod(matrix.inverse(crossprod(C_,C_)),C_)),C_)
   }
 
   flog.info("Running PCA on X using rand_svd == %s...", rand_svd)

diff --git a/R/utils.R b/R/utils.R
@@ -61,8 +61,21 @@ assert <- function (expr, error) {
   if (! expr) stop(error, call. = FALSE)
 }
 
+tensor.validate_input <- function(X, scale, parallel, num_cores, log_file, debug){
 
-tca.validate_input <- function(X, W, C1, C1.map, C2, refit_W, refit_W.features, refit_W.sparsity, refit_W.sd_threshold, tau, parallel, num_cores, max_iters, log_file, debug){
+  flog.debug("Validating input types...")
+
+  assert(is.matrix(X), "X must be of class 'matrix'")
+  assert(!is.null(rownames(X)) & !is.null(colnames(X)), "X must have row names and column names")
+  assert(is.logical(scale), "scale must be of class 'logical'")
+  assert(is.logical(debug), "debug must be of class 'logical'")
+  assert(is.logical(parallel), "parallel must be of class 'logical'")
+  assert(is.null(num_cores) | is.numeric(num_cores), "argument num_cores must take a numric value or NULL")
+  assert(is.character(log_file) | is.null(log_file), "log_file must be of class 'character' or NULL")
+
+}
+
+tca.validate_input <- function(X, W, C1, C1.map, C2, refit_W, refit_W.features, refit_W.sparsity, refit_W.sd_threshold, tau, constrain_mu, parallel, num_cores, max_iters, log_file, debug){
 
   flog.debug("Validating input types...")
 
@@ -85,14 +98,15 @@ tca.validate_input <- function(X, W, C1, C1.map, C2, refit_W, refit_W.features,
   assert(is.character(log_file) | is.null(log_file), "log_file must be of class 'character' or NULL")
 
   flog.debug("Validating input stucture and values...")
-  assert(!is.null(rownames(X)) | !is.null(colnames(X)), "X must have row names and column names")
-  assert(!is.null(rownames(W)) | !is.null(colnames(W)), "W must have row names and column names")
-  if (!is.null(C1)) assert(!is.null(rownames(C1)) | !is.null(colnames(C1)), "C1 must have row names and column names")
-  if (!is.null(C2)) assert(!is.null(rownames(C2)) | !is.null(colnames(C2)), "C2 must have row names and column names")
+  assert(!is.null(rownames(X)) & !is.null(colnames(X)), "X must have row names and column names")
+  assert(!is.null(rownames(W)) & !is.null(colnames(W)), "W must have row names and column names")
+  if (!is.null(C1)) assert(!is.null(rownames(C1)) & !is.null(colnames(C1)), "C1 must have row names and column names")
+  if (!is.null(C2)) assert(!is.null(rownames(C2)) & !is.null(colnames(C2)), "C2 must have row names and column names")
 
   flog.debug("Validating input conditions...")
   if (refit_W) assert(refit_W.sparsity <= nrow(X) , "argument refit_W.sparsity must satisfy refit_W.sparsity <= nrow(X)")
   if (refit_W) assert(refit_W.sd_threshold >= 0 , "argument refit_W.sd_threshold must satisfy refit_W.sd_threshold >= 0")
+  if (!(is.null(C1.map))) assert(constrain_mu , "argument C1.map cannot be useed with constrain_mu set to FALSE")
 
   flog.debug("Validating matrix dimensions...")
   assert(dim(X)[2] == dim(W)[1] , "The number of columns in X is inconsistent with the number of rows in W")
@@ -134,7 +148,7 @@ tcasub.validate_input <- function(tca.mdl, features, log_file, debug){
 }
 
 
-tcareg.validate_input <- function(X, W, y, C3, test, null_model, alternative_model, save_results, output, sort_results, parallel, num_cores, log_file, features_metadata ,debug){
+tcareg.validate_input <- function(X, W, y, C3, test, null_model, alternative_model, save_results, fast_mode, output, sort_results, parallel, num_cores, log_file, features_metadata ,debug){
 
   flog.debug("Validating input type for tcareg...")
 
@@ -155,14 +169,17 @@ tcareg.validate_input <- function(X, W, y, C3, test, null_model, alternative_mod
   assert(is.character(output), "argument output must take a character value")
 
   assert(is.logical(save_results), "argument save_results must take a logical value")
+  assert(is.logical(fast_mode), "argument fast_mode must take a logical value")
   assert(is.logical(sort_results), "argument sort_results must take a logical value")
   assert(is.logical(parallel), "argument parallel must take a logical value")
   assert(is.logical(debug), "argument debug must take a logical value")
   assert(is.character(log_file) | is.null(log_file), "argument log_file must take a character value or NULL")
 
   flog.debug("Validating input stucture and values...")
-  assert(!is.null(rownames(X)) | !is.null(colnames(X)), "X must have row names and column names")
-  if (!is.null(C3)) assert(!is.null(rownames(C3)) | !is.null(colnames(C3)), "C3 must have row names and column names")
+  assert(!is.null(rownames(X)) & !is.null(colnames(X)), "X must have row names and column names")
+  assert(!is.null(rownames(W)) & !is.null(colnames(W)), "W must have row names and column names")
+  assert(!is.null(rownames(y)), "y must have row names")
+  if (!is.null(C3)) assert(!is.null(rownames(C3)) & !is.null(colnames(C3)), "C3 must have row names and column names")
   if (test == "custom") assert( (!is.null(alternative_model)), "argument alternative_model cannot be NULL when test=custom")
   if ( (!is.null(alternative_model))){
     assert(test == "custom", "argument test must be set to 'custom' if argument alternative_model is not NULL")
@@ -173,6 +190,7 @@ tcareg.validate_input <- function(X, W, y, C3, test, null_model, alternative_mod
       assert(length(setdiff(alternative_model, null_model)) > 0 & length(setdiff(null_model, alternative_model)) == 0, "null_model must be nested within alternative_model")
     }
   }
+  assert(!( (sort_results & test == "marginal_conditional") & fast_mode), "sort_results cannot be set to true under fast_mode == TRUE and test = 'marginal_conditional'")
 
   flog.debug("Validating matrix dimensions...")
   if (!is.null(C3)) assert(dim(X)[2] == dim(C3)[1] , "the number of columns in X is inconsistent with the number of rows in C3")
@@ -206,36 +224,44 @@ lrt.test <- function(ll0, ll1, df){
 }
 
 
-save_association_results <- function(res, output, test, alternative_model, feature_ids, W_names, C3_names, sort_results, features_metadata){
+save_association_results <- function(res, output, test, fast_mode, alternative_model, feature_ids, W_names, C3_names, sort_results, features_metadata){
   flog.debug("Running save_association_results...")
   m <- length(feature_ids)
   metadata <- if (is.null(features_metadata)) matrix(0,m,0) else parse_features_metadata(feature_ids, features_metadata)
-  if (test == "marginal" | test == "marginal_conditional"){
-    # marginal or marginal_conditional
+  if (test == "marginal" | (!fast_mode & test == "marginal_conditional") ){
     for (i in 1:length(res)){
       filename <- paste(output, ".", test, ".", W_names[i], ".txt", sep ="")
       wnames <- if (test == "marginal") W_names[i] else W_names
-      save_association_results.save(res[[i]], filename, feature_ids, sort_results, wnames, C3_names, metadata)
+      save_association_results.save(res[[i]], test, fast_mode, filename, feature_ids, sort_results, wnames, C3_names, metadata)
     }
   }else{
-    # joint, single_effect, or custom
     filename <- paste(output, ".", test, ".txt", sep ="")
+    keep <- 1:length(W_names)
     if (test == "custom") keep <- alternative_model
-    if (test == "joint") keep <- 1:length(W_names)
     if (test == "single_effect") keep <- 1
-    save_association_results.save(res, filename, feature_ids, sort_results, W_names[keep], C3_names, metadata)
+    save_association_results.save(res, test, fast_mode, filename, feature_ids, sort_results, W_names[keep], C3_names, metadata)
   }
 }
 
 
-save_association_results.save <- function(res, filename, feature_ids, sort_results, W_names, C3_names, metadata){
+save_association_results.save <- function(res, test, fast_mode, filename, feature_ids, sort_results, W_names, C3_names, metadata){
   flog.debug("save_association_results.save...")
-  #config <- config::get()
   config <- config::get(file = system.file("extdata", "config.yml", package = "TCA"), use_parent = FALSE)
   m <- length(feature_ids)
-  data <- data.frame(feature_ids, metadata, res$pvals, res$qvals, res$beta, res$null_ll, res$alternative_ll, res$stats, res$df, res$intercept, res$alpha)
-  betas <- if(length(W_names) == 1) "beta" else lapply(1:length(W_names), function(i) paste("beta.",W_names[i],sep=""))
-  colnames(data) <- c("ID", colnames(metadata), "pval", "qval", betas, "null_ll", "alternative_ll", "chi_squared", "df", "intercept", C3_names)
+  betas <- if(length(W_names) == 1) "beta" else unlist(lapply(1:length(W_names), function(i) paste("beta.",W_names[i],sep="")))
+  pvals <- if(test == "marginal_conditional" & fast_mode) unlist(lapply(1:length(W_names), function(i) paste("pval.",W_names[i],sep=""))) else "pval"
+  qvals <- if(test == "marginal_conditional" & fast_mode) unlist(lapply(1:length(W_names), function(i) paste("qval.",W_names[i],sep=""))) else "qval"
+  alpha <- if (length(C3_names)) unlist(lapply(1:length(C3_names), function(i) paste("alpha.",C3_names[i],sep=""))) else c()
+  df <- if(is.null(res$df)) c() else "df"
+  df.values <- if(is.null(res$df)) matrix(0,m,0) else matrix(0,m,1)+res$df
+
+  if (fast_mode){
+    data <- data.frame(feature_ids, metadata, res$pval, res$qval, res$beta, res$alternative_ll, res$stat, df.values, res$intercept, res$alpha, res$phi)
+    colnames(data) <- c("ID", colnames(metadata), pvals, qvals, betas, "alternative_ll", "stat", df, "intercept", alpha, "phi")
+  }else{
+    data <- data.frame(feature_ids, metadata, res$pval, res$qval, res$beta, res$null_ll, res$alternative_ll, res$stat, res$df, res$intercept, res$alpha, res$phi)
+    colnames(data) <- c("ID", colnames(metadata), "pval", "qval", betas, "null_ll", "alternative_ll", "chi_squared", "df", "intercept", alpha, "phi")
+  }
   if (sort_results){
     # sort by p-value
     data <- data[order(data$pval),]

diff --git a/README.md b/README.md
@@ -15,15 +15,15 @@ The R package of TCA is available on CRAN. Run the following in R to install TCA
 install.packages("TCA")
 ```
 
-The full documentation of the TCA R package can be found <a href="https://github.com/cozygene/TCA/blob/master/manual.pdf" target="_blank">here</a>.
+The manual of the TCA R package can be found <a href="https://cran.r-project.org/web/packages/TCA/TCA.pdf" target="_blank">here</a>.
 
 You can also find a full working example of TCA in <a href="https://cran.r-project.org/package=TCA/vignettes/tca-vignette.html" target="_blank">this vignette</a> about cell-type-specific resolution epigenetics using TCA in R.
 
 <!--describe the config file.-->
 
 ## Matlab version
 
-Tne Matlab version of TCA was implemented and tested using Matlab R2015b.
+The Matlab version of TCA was implemented and tested using Matlab R2015b.
 
 ### Usage
 

diff --git a/inst/extdata/config.yml b/inst/extdata/config.yml
@@ -4,11 +4,12 @@ default:
   # optimization configs:
   lsqlincon_inf: 1000000 # lsqlincon fails if given Inf
   min_sd: 0.0001 # minimal value for an estimated variance; required for stability
-  mu_min: NULL # minimal value for each estimated mean; if NULL then this will be set to min(X)
-  mu_max: NULL # maximal value for each estimated mean; if NULL then this will be set to max(X)
+  #mu_min: NULL # minimal value for each estimated mean; if NULL then this will be set to min(X)
+  #mu_max: NULL # maximal value for each estimated mean; if NULL then this will be set to max(X)
   mu_epsilon: 0.00001 # truncates the range [mu_min,mu_max] by this value so that mu is required to be in [mu_mean+mu_epsilon,mu_max-mu_epsilon]; required for numerical stability
   epsilon: 0.001 # percent of difference for convergence
   tau_hat_init: 0.01  # initial estimate of the iid component added on top of the mixture signal
+  V_weight_limit: 1.0e-6  # don't allow extremely small weights in the gmm if vars.mle == TRUE; otherwise estimation may be sensitive to extreme outliers
   # pbapply options:
   nout: 10  # the resolution of the progress bars (higher values -> highr resolution but increased overhead, which in turns results in longer running time)
   type: "txt" # change to "none" to supress the progress bars (this will result is some speed up of the running time)
@@ -23,7 +24,7 @@ default:
   sep: "," # separator for output files
   show_progress: FALSE
   verbose: FALSE
-  fdr_method: "BY"  # can use any method that is provided by p.adjust
+  fdr_method: "BH"  # can use any method that is provided by p.adjust
   # more configs:
   lambda: 0.1 # weight for L2 regularization to handle co-linearily in the association testing