start

daniellyz · Apr 24, 2018 · ab24520 · ab24520
1 parent 3759da7
commit ab24520
Show file tree

Hide file tree

Showing 19 changed files with 771 additions and 0 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -0,0 +1,4 @@
+^.*\.Rproj$
+^\.Rproj\.user$
+^packrat/
+^\.Rprofile$
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,14 @@
+Package: MetICA
+Type: Package
+Version: 0.2.0
+Title: Independent component analysis for high-resolution mass-spectrometry based metabolomics
+Author: Youzhong Liu <[email protected]>
+Maintainer: Youzhong Liu <[email protected]>
+Depends: R (>= 3.4.1)
+Imports: fastICA, MASS, e1071, propagate, mixOmics, factoextra, lsa, proxy
+Suggests: ade4, testthat
+Description: ICA is an important alternative to classical statistical approaches for non-targeted metabolomics data. It extends the concept of regular correlation (e.g. in PCA, ASCA and PLS-DA) to statistical dependance by capturing higher order dependencies. However, its algorithm instability (output variations between different algorithm runs) and the biological validity of components have been overlooked when applied to complex metabolomics data. MetICA adresses these problems by gathering ICs estimated from multiple algorithm runs and from bootstrapped datasets, clustering them so as to find the most representative components. While evaluating the algorithmic stability, MetICA also suggests multiple criteria to select the correct number of components and to rank the extracted components.
+License: GPL3
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 6.0.1
diff --git a/MetICA.Rproj b/MetICA.Rproj
@@ -0,0 +1,22 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+AutoAppendNewline: Yes
+StripTrailingWhitespace: Yes
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source
+PackageCheckArgs: –as-cran
+PackageRoxygenize: rd,collate,namespace
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,5 @@
+# Generated by roxygen2: do not edit by hand
+
+export(MetICA)
+export(MetICA_extract_model)
+export(validationPlot)
diff --git a/R/MetICA.R b/R/MetICA.R
diff --git a/R/MetICA_extract_model.R b/R/MetICA_extract_model.R
@@ -0,0 +1,81 @@
+#' Extracting MetICA model
+#'
+#' This function extracts a MetICA model with user-decided component numbers.
+#'
+#' @param M1 The entire list object generated from the function MetICA(X, pcs = 15...)
+#' @param ics Number of clusters (MetICA components) decided by users according to validationPlot
+#' @param tops Integer or NULL object. If integer, the tops "best" components based on each evaluation criterion will be plotted.
+#'
+#' @return MetICA model (a list object) whose number of components is decided by users.
+#' \itemize{
+#'  \item{"S"}{Scores of MetICA components. Data matrix contains n rows (samples) and ics columns (components)}
+#'  \item{"A"}{Loadings of MetICA components. Data matrix contains p rows (metabolic features) and ics columns (components)}
+#'  \item{"eval$kurtosis"}{Vector of length ics containing Kurtosis measure of each MetICA component.}
+#'  \item{"eval$cluster_size"}{Number of IPCA estimates in each corresponding cluster.}
+#'  \item{"eval$divergence_index"}{Divergence indexes of clusters.}
+#'  \item{"eval$boot_prop"}{Proportion of bootstrap estimates in each cluster.}
+#'  \item{"eval$boot_stability"}{Stability of each cluster against bootstrapping.}
+#' }
+#'
+#' @export
+#'
+MetICA_extract_model<-function(M1,ics,tops=ics){
+
+  t=as.numeric(Sys.time())
+  set.seed((t - floor(t))*1e8->seed)
+  memory.size(max=2000000000)
+  options(warn=-1)
+
+  # Check function inpus:
+  S_sum=M1$Stage1$S
+  trends=M1$Stage2$trends
+  max.cluster=M1$Stage3$max.cluster
+  boot_id=M1$Stage1$boot_id
+
+  if (is.null(S_sum)){
+    stop("Please enter a valid MetICA object !")}
+
+  if (missing(ics)){
+    stop("Please enter the selected number of clusters according to validationPlot !")}
+
+  if (ics>max.cluster || ics<2){
+    stop("Number of clusters should between 2 and ", M1$Stage3$max.cluster)}
+
+  if (is.numeric(tops)){
+    if (tops>ics){tops=ics}}
+
+  # Extract the selected model:
+
+  M2=list()
+  M2$S=M1$Stage3$S_history[[ics]]
+  M2$A=M1$Stage3$A_history[[ics]]
+  colnames(M2$S)=colnames(M2$A)=paste0("IC",1:ics)
+
+  kurtosis=M1$Stage3$Kurt_history[[ics]]
+  total_number=M1$Stage3$tn_history[[ics]]
+  boot_prop=M1$Stage3$bn_history[[ics]]
+  divergence=M1$Stage3$dispersion_history[[ics]]
+  dis_boot_no_boot=M1$Stage3$boot_eval[[ics]]
+
+  names(kurtosis)=names(total_number)=names(boot_prop)=names(divergence)=names(dis_boot_no_boot)=paste0("IC",1:ics)
+  M2$eval=list(kurtosis=kurtosis,cluster_size=total_number,divergence_index=divergence,boot_prop=boot_prop,boot_stability=dis_boot_no_boot)
+
+  # Visualize and rank each cluster:
+
+  if (!is.null(tops)){
+    par(mfrow=c(1,1))
+
+    kurtosis2=kurtosis[order(kurtosis,decreasing = T)[1:tops]]
+    barplot(kurtosis2,font = 2,ylab="Component kurtosis", cex.lab = 1.3)
+
+    divergence2=divergence[order(divergence)[1:tops]]
+    barplot(divergence2,font = 2,ylab="Cluster dispersion index", cex.lab = 1.3)
+
+    boot_prop2=boot_prop[order(abs(boot_prop-0.5))[1:tops]]
+    barplot(boot_prop2,font = 2,ylab="Proportion of bootstrap estimates", cex.lab = 1.3)
+
+    boot_stability2=dis_boot_no_boot[order(dis_boot_no_boot)[1:tops]]
+    barplot(boot_stability2,font = 2,ylab="Distance bootstrap-non-bootstrap", cex.lab = 1.3)}
+
+  return(M2)
+}
diff --git a/R/bacteria_peptides.R b/R/bacteria_peptides.R
@@ -0,0 +1,18 @@
+#' Bacteria exo-metabolome example data
+#'
+#' Exo-metabolomic profiles of two wine bacteria strains BETA and VP41. The experiment included 6 different growth media and 3 biological replicates. Samples were taken at different day points throughout bacterial fermentation and measured on maXis ESI–QTOF-MS coupled with reversed-phase LC.
+#'
+#' @docType data
+#'
+#' @usage data(bacteria_peptides)
+#'
+#' @format A two-element list object:
+#' \itemize{
+#'   \item features: metabolic features detected and extracted. Each feature is represented by a unique ID, a mass (m/z) and a retention time in minute (RT)
+#'   \item X: data matrix with 159 rows (samples) and 8332 columns (metabolic features). This is the example data matrix for MetICA simulations.
+#' }
+#'
+#' @references Youzhong Liu, Sara Forcisi, Marianna Lucio, Mourad Harir, Florian Bahut, Magali Deleris-Bou, Sibylle Krieger-Weber, Regis D. Gougeon, Herve Alexandre and Philippe Schmitt-Kopplin, Digging into the low molecular weight peptidome with the OligoNet web server, Scientific reports (2017) Vol. 7 no. 11692
+#'
+"bacteria_peptides"
+
diff --git a/R/validationPlot.R b/R/validationPlot.R
@@ -0,0 +1,101 @@
+#' Plots to decide number of clusters
+#'
+#' The function generates multiple plots that help users to decide number of clusters or MetICA components.
+#'
+#' @param M1 The entire list object generated from the function MetICA(X, pcs = 15...)
+#' @param cluster_index Boolean object. TRUE if geometric indices for clusterinig quality needs to be calculted. The calculation provides an additional criterion for cluster number selection but can be time-consuming.
+#'
+#' @return Return an overall graph with multiple plots. If cluster_index = TRUE, also return a list object containing clustering quality score.
+#'
+#' @export
+
+validationPlot<-function(M1,cluster_index=T){
+
+  memory.size(max=2000000000)
+  options(warn=-1)
+
+  # This function helps to find the best number of clusters by visualizing
+
+  new_component_size=sapply(M1$Stage3$Kurt_history,length) # How many new components each time
+  cluster_list=list()
+  for (i in 1:M1$Stage3$max.cluster){cluster_list[[i]]=rep(i,new_component_size[i])}
+  #par(mfrow=c(3,2))
+  par(mfrow=c(1,1))
+
+  # Evaluate newly added clusters when increasing cluster number
+
+  plot(unlist(cluster_list),unlist(M1$Stage3$tn_history),col="blue",pch=19,xlab="Number of clusters",ylab="Number of estimates",font.lab=2,font.axis=2,bty="n")
+  box(lwd=2)
+  title("Total number of estimates in each cluster")
+
+  plot(unlist(cluster_list),unlist(M1$Stage3$Kurt_history),col="blue",pch=19,xlab="Number of clusters",ylab="Kurtosis",font.lab=2,font.axis=2,bty="n")
+  box(lwd=2)
+  title("Kurtosis of cluster centers (components)")
+
+  plot(unlist(cluster_list),unlist(M1$Stage3$bn_history),col="blue",pch=19,xlab="Number of clusters",ylab="Proportion",font.lab=2,font.axis=2,bty="n")
+  box(lwd=2)
+  polygon(c(-1,100,100,-1),c(0.45,0.45,1,1),col=rgb(0.8, 0.9, 0.8,0.5), border = "grey",lty="dashed")
+  title("Proportion of bootstrap estimates in each cluster")
+
+  plot(unlist(cluster_list),unlist(M1$Stage3$boot_eval),col="blue",pch=19,xlab="Number of clusters",ylab="Bootstrap distance",font.lab=2,font.axis=2,bty="n")
+  box(lwd=2)
+  polygon(c(-1,100,100,-1),c(0,0,0.8,0.8),col=rgb(0.8, 0.9, 0.8,0.5), border = "grey",lty="dashed")
+  title("Cluster stability against bootstrapping")
+
+  plot(unlist(cluster_list),unlist(M1$Stage3$dispersion_history),col="blue",pch=19,xlab="Number of clusters",ylab="Dispersion index",font.lab=2,font.axis=2,bty="n")
+  box(lwd=2)
+  polygon(c(-1,100,100,-1),c(0,0,0.8,0.8),col=rgb(0.8, 0.9, 0.8,0.5), border = "grey",lty="dashed")
+  title("Dispersion index")
+
+  if (cluster_index){
+    message("Generating geometric index... please be patient!")
+    results=R_index(M1)
+    plot(results$x,results$y,col="blue",pch=19,xlab="Number of clusters",ylab="R index",font.lab=2,font.axis=2,bty="n")
+    box(lwd=2)
+    title("Geometric index of clusters")
+    suggested=which.min(results$y)
+    message("Suggested cluster number according to the geometric index is:",suggested)
+    return(results)
+  }
+  else{return("Please set cluster_index=T if you cannont make a decision!")}
+}
+
+R_index<-function(M1){
+
+  S_sum=M1$Stage1$S
+  trends=M1$Stage2$trends
+  max.cluster=M1$Stage3$max.cluster
+
+  # Recalculate distance object:
+
+  if (ncol(S_sum)>4000){ # Faster correlation matrix for big data
+    R=bigcor(S_sum,size= 2000,method="spearman",verbose = F) # Big correlation matrix
+    R=R[1:nrow(R), 1:ncol(R)]}
+  else{ # Smaller data size
+    R=cor(S_sum,method="spearman")}
+
+  if (trends){D=(1-R)/2} # Disimilariy Matrix for time dependent data [0,1]
+  else {D=1-abs(R)} # Disimilariy Matrix for no-time dependent data [0,1]
+
+  # Calculate score
+  score_list=c()
+  for (nbcluster in 2:max.cluster){
+    cluster=cutree(M1$Stage2$clusterObj,nbcluster) # CUtting tree
+    cluster_list=1:nbcluster
+    cluster_scores=c()
+    for (i in cluster_list){
+      index_in=which(cluster==i) # Index of elenments in cluster i
+      Cm_in=length(index_in) # Nb of elements in the cluster
+      Var_in=1/(Cm_in^2)*sum(D[index_in,index_in]) # Variance inside the cluster
+      Var_out=c()
+      for (j in cluster_list[-i]){
+        index_out=which(cluster==j)
+        Cm_out=length(index_out) # Nb of elements in cluster j
+        Var_out=c(Var_out,1/(Cm_in*Cm_out)*sum(D[index_in,index_out]))} # Variance between cluster i and j
+      Var_out=min(Var_out) # Take the minimal variance
+      cluster_scores=c(cluster_scores,Var_in/Var_out) # Score of the evaluated cluster
+    }
+    score_list=c(score_list,mean(cluster_scores))}
+  return(list(x=2:max.cluster,y=score_list))
+}
+
diff --git a/R/yeast_metabolome.R b/R/yeast_metabolome.R
@@ -0,0 +1,17 @@
+#' Yeast exo-metabolome example data
+#'
+#' Exo-metabolomic profiles of 15 yeast commercial strains for winemaking. The experiment included 3 biological replicates. Samples were taken at the end of alcoholic fermentationand measured on direct-infusion FT-ICR-MS.
+#'
+#' @docType data
+#'
+#' @usage data(yeast_metabolome)
+#'
+#' @format A two-element list object:
+#' \itemize{
+#'   \item features: metabolic features detected and extracted. Each feature is represented by a unique ID and a mass (m/z).
+#'   \item X: data matrix with 45 rows (samples) and 2700 columns (metabolic features). This is the example data matrix for MetICA simulations.
+#' }
+#'
+#' @references Youzhong Liu, Sara Forcisi, Mourad Harir, Magali Deleris-Bou, Sibylle Krieger-Weber, Marianna Lucio, Cedric Longin, Claudine Degueurce, Regis D. Gougeon, Philippe Schmitt-Kopplin and Herve Alexandre, New molecular evidence of wine yeast-bacteria interaction unraveled by non-targeted exometabolomic profiling, Metabolomics (2016) Vol. 12 no. 69
+
+"yeast_metabolome"
diff --git a/data/bacteria_peptides.rda b/data/bacteria_peptides.rda
diff --git a/data/yeast_metabolome.rda b/data/yeast_metabolome.rda
diff --git a/man/MetICA.Rd b/man/MetICA.Rd
diff --git a/man/MetICA_extract_model.Rd b/man/MetICA_extract_model.Rd