From 1330b7a32eab84e47b3fc698f90c5ec045f9dbba Mon Sep 17 00:00:00 2001 From: showteeth Date: Wed, 22 Nov 2023 09:44:47 +0800 Subject: [PATCH] Fix bugs --- DESCRIPTION | 29 +- NAMESPACE | 8 +- NEWS.md | 3 + R/DataIO.R | 124 ++++-- R/GEO.R | 54 +-- R/PanglaoDB.R | 30 +- R/UCSCCellBrowser.R | 96 +++-- R/Zenodo.R | 40 +- R/bam.R | 32 +- R/cellxgene.R | 70 ++-- R/fastq.R | 45 +- R/hca.R | 49 ++- R/summary.R | 28 +- R/utils.R | 4 +- README.md | 638 ++++++++++++++++++++++++++++- man/Bam2Fastq.Rd | 19 +- man/DownloadBam.Rd | 13 +- man/DownloadSRA.Rd | 11 +- man/ExportSeurat.Rd | 23 +- man/ExtractCBComposition.Rd | 16 +- man/ExtractCBDatasets.Rd | 14 +- man/ExtractCELLxGENEMeta.Rd | 16 +- man/ExtractGEOExp.Rd | 6 - man/ExtractGEOExpSupp.Rd | 9 - man/ExtractGEOExpSupp10x.Rd | 4 - man/ExtractGEOExpSuppAll.Rd | 4 - man/ExtractGEOInfo.Rd | 4 - man/ExtractGEOMeta.Rd | 10 +- man/ExtractGEOSubMeta.Rd | 4 - man/ExtractHCAMeta.Rd | 20 +- man/ExtractPanglaoDBComposition.Rd | 8 +- man/ExtractPanglaoDBMeta.Rd | 10 +- man/ExtractRun.Rd | 4 +- man/ExtractZenodoMeta.Rd | 14 +- man/ImportSeurat.Rd | 24 +- man/ParseCBDatasets.Rd | 22 +- man/ParseCELLxGENE.Rd | 20 +- man/ParseGEO.Rd | 15 +- man/ParseHCA.Rd | 20 +- man/ParsePanglaoDB.Rd | 12 +- man/ParseZenodo.Rd | 15 +- man/SCEAnnData.Rd | 17 +- man/SCELoom.Rd | 18 +- man/ShowCBDatasets.Rd | 10 +- man/ShowCELLxGENEDatasets.Rd | 6 +- man/ShowHCAProjects.Rd | 6 +- man/SplitSRA.Rd | 21 +- man/StatDBAttribute.Rd | 28 +- vignettes/DownloadMatrices.Rmd | 2 +- vignettes/scfetch.Rmd | 129 +++--- 50 files changed, 1358 insertions(+), 466 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5e0dd8e..e07a1a6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,9 +8,12 @@ Authors@R: role = c("aut", "cre"), email = "songyb0519@gmail.com") Maintainer: Yabing Song -Description: The goal of 'scfetch' is to access and format scRNA-seq datasets. 
It can be used to download scRNA-seq - datasets from widely used public resources, including GEO, Zenodo, CELLxGENE, Human Cell Atlas, PanglaoDB and UCSC Cell Browser. - And, it can also be used to perform object conversion between SeuratObject, loom, h5ad, SingleCellExperiment and CellDataSet/cell_data_set. +Description: The goal of 'scfetch' is to access and format single-cell RNA-seq datasets. It can be used to download single-cell RNA-seq + datasets from widely used public resources, including GEO , Zenodo , + CELLxGENE , Human Cell Atlas , PanglaoDB + and UCSC Cell Browser . And, it can also be used to perform object conversion between SeuratObject , + loom , h5ad , SingleCellExperiment , + CellDataSet and cell_data_set . License: GPL (>= 3) Encoding: UTF-8 LazyData: true @@ -21,7 +24,6 @@ Imports: curl, data.table, dplyr, - GEOfastq, GEOquery, jsonlite, magrittr, @@ -36,10 +38,6 @@ Imports: tibble, tools, utils, - sceasy, - SeuratDisk, - SeuratWrappers, - zellkonverter, SingleCellExperiment, SummarizedExperiment, scater, @@ -48,17 +46,20 @@ Imports: rlang, tidyr, methods -Remotes: - cellgeni/sceasy, - mojaveazure/seurat-disk, - satijalab/seurat-wrappers, - alexvpickering/GEOfastq Suggests: knitr, rmarkdown, scRNAseq, BiocStyle, - htmltools + htmltools, + sceasy, + SeuratDisk, + SeuratWrappers, + zellkonverter, + GEOfastq VignetteBuilder: knitr +Additional_repositories: https://showteeth.github.io/drat/ +URL: https://github.com/showteeth/scfetch +BugReports: https://github.com/showteeth/scfetch/issues Depends: R (>= 2.10) diff --git a/NAMESPACE b/NAMESPACE index 87f71f6..57c6b4a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -34,7 +34,6 @@ importFrom(Biobase,notes) importFrom(Biobase,pData) importFrom(Biobase,phenoData) importFrom(Biobase,sampleNames) -importFrom(GEOfastq,crawl_gsms) importFrom(GEOquery,getGEO) importFrom(GEOquery,getGEOSuppFiles) importFrom(GEOquery,gunzip) @@ -51,9 +50,6 @@ importFrom(Seurat,as.CellDataSet) importFrom(Seurat,as.Seurat) 
importFrom(Seurat,as.SingleCellExperiment) importFrom(Seurat,as.sparse) -importFrom(SeuratDisk,Connect) -importFrom(SeuratDisk,SaveLoom) -importFrom(SeuratWrappers,as.cell_data_set) importFrom(SingleCellExperiment,"reducedDimNames<-") importFrom(SingleCellExperiment,reducedDimNames) importFrom(SummarizedExperiment,assayNames) @@ -73,6 +69,7 @@ importFrom(jsonlite,flatten) importFrom(jsonlite,fromJSON) importFrom(magrittr,"%>%") importFrom(methods,as) +importFrom(methods,is) importFrom(methods,new) importFrom(openxlsx,read.xlsx) importFrom(parallel,detectCores) @@ -85,7 +82,6 @@ importFrom(rPanglaoDB,getSamples) importFrom(reticulate,use_condaenv) importFrom(rlang,.data) importFrom(scater,logNormCounts) -importFrom(sceasy,convertFormat) importFrom(tibble,column_to_rownames) importFrom(tidyr,spread) importFrom(tools,file_ext) @@ -93,5 +89,3 @@ importFrom(tools,md5sum) importFrom(utils,download.file) importFrom(utils,read.table) importFrom(utils,untar) -importFrom(zellkonverter,readH5AD) -importFrom(zellkonverter,writeH5AD) diff --git a/NEWS.md b/NEWS.md index e4fc280..ab92932 100644 --- a/NEWS.md +++ b/NEWS.md @@ -20,6 +20,9 @@ * Fixed bug in `ImportSeurat`. * Fixed bug in `ExportSeurat`. * Resolved installation. +* Fixed bug in `ExtractZenodoMeta` (API changed). +* Fixed bug in `ShowCBDatasets` (added `--no-check-certificate` when downloading json files). +* Fixed bug in `ShowCELLxGENEDatasets`. ------------------- diff --git a/R/DataIO.R b/R/DataIO.R index 46d1276..dec7492 100644 --- a/R/DataIO.R +++ b/R/DataIO.R @@ -9,24 +9,41 @@ #' @param loom.file File used to save loom results. Default: NULL. #' @param conda.path Conda environment path, used when \code{to} is "AnnData". Default: NULL. #' @param ... Parameter for \code{\link{as.SingleCellExperiment}}, \code{sceasy::convertFormat}, \code{\link{as.CellDataSet}}, -#' \code{\link{as.cell_data_set}}, \code{\link{SaveLoom}}, corresponding to \code{to}. 
+#' \code{as.cell_data_set}, \code{SaveLoom}, corresponding to \code{to}. #' #' @return Object corresponding to \code{to}. #' @export #' @importFrom Seurat DefaultAssay as.SingleCellExperiment as.CellDataSet -#' @importFrom sceasy convertFormat -#' @importFrom SeuratDisk SaveLoom -#' @importFrom SeuratWrappers as.cell_data_set #' @importFrom reticulate use_condaenv +#' @importFrom methods is #' +#' @examples +#' \dontrun{ +#' library(Seurat) +#' # export to SingleCellExperiment +#' sce.obj <- ExportSeurat(seu.obj = pbmc_small, assay = "RNA", to = "SCE") +#' # export to CellDataSet +#' cds.obj <- ExportSeurat(seu.obj = pbmc_small, assay = "RNA", reduction = "tsne", to = "CellDataSet") +#' # export to cell_data_set +#' cds3.obj <- ExportSeurat(seu.obj = pbmc_small, assay = "RNA", to = "cell_data_set") +#' # export to AnnData, need users to provide the conda path and the output file +#' ExportSeurat( +#' seu.obj = pbmc_small, assay = "RNA", to = "AnnData", conda.path = "/path/to/anaconda3", +#' anndata.file = "/path/to/pbmc_small.h5ad" +#' ) +#' # export to loom, need users to provide the output file +#' ExportSeurat( +#' seu.obj = pbmc_small, assay = "RNA", to = "loom", +#' loom.file = "/path/to/pbmc_small.loom" +#' ) +#' } ExportSeurat <- function(seu.obj, assay = NULL, reduction = NULL, to = c("SCE", "AnnData", "CellDataSet", "cell_data_set", "loom"), anndata.file = NULL, loom.file = NULL, conda.path = NULL, ...) 
{ # check parameters to <- match.arg(arg = to) - # check object - if (class(seu.obj) != "Seurat") { + if (!methods::is(seu.obj, "Seurat")) { stop("Please provide valid Seurat object!") } @@ -58,6 +75,9 @@ ExportSeurat <- function(seu.obj, assay = NULL, reduction = NULL, if (!is.null(conda.path)) { reticulate::use_condaenv(conda.path, required = TRUE) } + if (!requireNamespace("sceasy", quietly = TRUE)) { + stop("Can not find sceasy package, install with devtools::install_github('cellgeni/sceasy')!") + } sceasy::convertFormat(seu.obj, from = "seurat", to = "anndata", drop_single_values = F, assay = assay, outFile = anndata.file, ... @@ -74,6 +94,9 @@ ExportSeurat <- function(seu.obj, assay = NULL, reduction = NULL, return(result.obj) } else if (to == "cell_data_set") { message("Convert SeuratObject to cell_data_set (suitable for Monocle3)!") + if (!requireNamespace("SeuratWrappers", quietly = TRUE)) { + stop("Can not find SeuratWrappers package, install with devtools::install_github('satijalab/seurat-wrappers')!") + } if (is.null(reduction)) { result.obj <- SeuratWrappers::as.cell_data_set(x = seu.obj, assay = assay, ...) } else { @@ -89,6 +112,9 @@ ExportSeurat <- function(seu.obj, assay = NULL, reduction = NULL, loom.file <- file.path(getwd(), paste0(seu.name, ".loom")) } # Convert SingleCellExperiment to loom + if (!requireNamespace("SeuratDisk", quietly = TRUE)) { + stop("Can not find SeuratDisk package, install with devtools::install_github('mojaveazure/seurat-disk')!") + } SeuratDisk::SaveLoom(object = seu.obj, filename = loom.file, overwrite = TRUE, ...) } } @@ -112,26 +138,31 @@ ExportSeurat <- function(seu.obj, assay = NULL, reduction = NULL, #' #' @return A Seurat object. 
#' @importFrom Seurat as.Seurat -#' @importFrom sceasy convertFormat -#' @importFrom SeuratDisk Connect #' @importFrom reticulate use_condaenv #' @importFrom SummarizedExperiment assayNames #' @importFrom scater logNormCounts +#' @importFrom methods is #' @export #' #' @examples +#' \dontrun{ #' # import data from SingleCellExperiment -#' # seu.obj = ImportSeurat(obj=sce.obj, from="SCE", count.assay="counts", -#' # data.assay="logcounts", assay="RNA") +#' seu.obj <- ImportSeurat( +#' obj = sce.obj, from = "SCE", count.assay = "counts", +#' data.assay = "logcounts", assay = "RNA" +#' ) #' # import data from CellDataSet -#' # seu.obj = ImportSeurat(obj=cds.obj, from="CellDataSet", count.assay="counts", assay = "RNA") +#' seu.obj <- ImportSeurat(obj = cds.obj, from = "CellDataSet", count.assay = "counts", assay = "RNA") #' # import data from cell_data_set -#' # seu.obj = ImportSeurat(obj=sce.obj, from="cell_data_set", count.assay="counts", -#' # data.assay="logcounts", assay="RNA") -#' # import data from AnnData -#' # seu.obj = ImportSeurat(anndata.file = 'path/to/h5ad', from="AnnData", assay = "RNA") -#' # import data from loom -#' # seu.obj = ImportSeurat(loom.file = 'path/to/loom', from="loom") +#' seu.obj <- ImportSeurat( +#' obj = sce.obj, from = "cell_data_set", count.assay = "counts", +#' data.assay = "logcounts", assay = "RNA" +#' ) +#' # import data from AnnData, need users to provide the file for conversion +#' seu.obj <- ImportSeurat(anndata.file = "path/to/h5ad", from = "AnnData", assay = "RNA") +#' # import data from loom, need users to provide the file for conversion +#' seu.obj <- ImportSeurat(loom.file = "path/to/loom", from = "loom") +#' } ImportSeurat <- function(obj = NULL, assay = "RNA", from = c("SCE", "AnnData", "CellDataSet", "cell_data_set", "loom"), count.assay = "counts", data.assay = "logcounts", slot = "counts", anndata.file = NULL, loom.file = NULL, conda.path = NULL, ...) 
{ @@ -144,7 +175,7 @@ ImportSeurat <- function(obj = NULL, assay = "RNA", from = c("SCE", "AnnData", " # check object if (is.null(obj)) { stop("Please provide SingleCellExperiment with obj!") - } else if (class(obj) != "SingleCellExperiment") { + } else if (!methods::is(obj, "SingleCellExperiment")) { stop("Please provide valid SingleCellExperiment object!") } # check assays @@ -175,6 +206,9 @@ ImportSeurat <- function(obj = NULL, assay = "RNA", from = c("SCE", "AnnData", " if (!is.null(conda.path)) { reticulate::use_condaenv(conda.path, required = TRUE) } + if (!requireNamespace("sceasy", quietly = TRUE)) { + stop("Can not find sceasy package, install with devtools::install_github('cellgeni/sceasy')!") + } seu.obj <- sceasy::convertFormat(anndata.file, from = "anndata", to = "seurat", assay = assay, outFile = NULL, ... @@ -185,7 +219,7 @@ ImportSeurat <- function(obj = NULL, assay = "RNA", from = c("SCE", "AnnData", " # check object if (is.null(obj)) { stop("Please provide CellDataSet with obj!") - } else if (class(obj) != "CellDataSet") { + } else if (!methods::is(obj, "CellDataSet")) { stop("Please provide valid CellDataSet object!") } # convert @@ -197,7 +231,7 @@ ImportSeurat <- function(obj = NULL, assay = "RNA", from = c("SCE", "AnnData", " # check object if (is.null(obj)) { stop("Please provide cell_data_set with obj!") - } else if (class(obj) != "cell_data_set") { + } else if (!methods::is(obj, "cell_data_set")) { stop("Please provide valid cell_data_set object!") } # convert @@ -212,6 +246,9 @@ ImportSeurat <- function(obj = NULL, assay = "RNA", from = c("SCE", "AnnData", " if (is.null(loom.file)) { stop("Please provide a file a to loom results.") } else { + if (!requireNamespace("SeuratDisk", quietly = TRUE)) { + stop("Can not find SeuratDisk package, install with devtools::install_github('mojaveazure/seurat-disk')!") + } loom.info <- SeuratDisk::Connect(filename = loom.file, mode = "r") seu.obj <- Seurat::as.Seurat(loom.info, ...) 
} @@ -229,33 +266,40 @@ ImportSeurat <- function(obj = NULL, assay = "RNA", from = c("SCE", "AnnData", " #' @param anndata.file File used to save or contains AnnData results. Default: NULL. #' @param slot.name Slot name used to save count matrix, used when converting from AnnData to SingleCellExperiment. #' Default: counts. -#' @param ... Parameters for \code{\link{writeH5AD}} and \code{\link{readH5AD}}. +#' @param ... Parameters for \code{writeH5AD} and \code{readH5AD}. #' #' @return NULL or SingleCellExperiment. -#' @importFrom zellkonverter readH5AD writeH5AD #' @importFrom SingleCellExperiment reducedDimNames reducedDimNames<- +#' @importFrom methods is #' @export #' #' @examples -#' # library(scRNAseq) -#' # seger <- SegerstolpePancreasData() -#' # SCEAnnData(from = "SingleCellExperiment", to = "AnnData", sce = seger, X_name = "counts") -#' # sce = SCEAnnData(from = "AnnData", to = "SingleCellExperiment", -#' # anndata.file = "path/to/seger.h5ad") +#' \dontrun{ +#' library(scRNAseq) +#' seger <- SegerstolpePancreasData() +#' SCEAnnData(from = "SingleCellExperiment", to = "AnnData", sce = seger, X_name = "counts") +#' # need users to provide the output file +#' sce <- SCEAnnData( +#' from = "AnnData", to = "SingleCellExperiment", +#' anndata.file = "path/to/seger.h5ad" +#' ) +#' } SCEAnnData <- function(from = c("SingleCellExperiment", "AnnData"), to = c("AnnData", "SingleCellExperiment"), sce = NULL, anndata.file = NULL, slot.name = "counts", ...) 
{ # check parameters from <- match.arg(arg = from) to <- match.arg(arg = to) - + if (!requireNamespace("zellkonverter", quietly = TRUE)) { + stop("Can not find zellkonverter package, install with BiocManager::install('zellkonverter')!") + } # conversion if (from == "SingleCellExperiment" & to == "AnnData") { message("Convert SingleCellExperiment to AnnData.") # check SingleCellExperiment if (is.null(sce)) { stop("Please provide SingleCellExperiment with sce!") - } else if (class(sce) != "SingleCellExperiment") { + } else if (!methods::is(sce, "SingleCellExperiment")) { stop("Please provide valid SingleCellExperiment object!") } # check h5ad file @@ -296,16 +340,22 @@ SCEAnnData <- function(from = c("SingleCellExperiment", "AnnData"), #' @return NULL or SingleCellExperiment. #' @importFrom LoomExperiment SingleCellLoomExperiment export import #' @importFrom SummarizedExperiment assayNames -#' @importFrom methods as +#' @importFrom methods as is #' @export #' #' @examples -#' # convert from loom to SingleCellExperiment -#' # sce.obj = SCELoom(from = "loom", to = "SingleCellExperiment", -#' # loom.file = "path/to/loom") -#' # convert from SingleCellExperiment to loom -#' # SCELoom(from = "SingleCellExperiment", to = "loom",sce = sce.obj, -#' # loom.file = "path/to/loom") +#' \dontrun{ +#' # convert from loom to SingleCellExperiment, need users to provide the loom file +#' sce.obj <- SCELoom( +#' from = "loom", to = "SingleCellExperiment", +#' loom.file = "path/to/loom" +#' ) +#' # convert from SingleCellExperiment to loom, need users to provide the loom file +#' SCELoom( +#' from = "SingleCellExperiment", to = "loom", sce = sce.obj, +#' loom.file = "path/to/loom" +#' ) +#' } SCELoom <- function(from = c("SingleCellExperiment", "loom"), to = c("loom", "SingleCellExperiment"), sce = NULL, loom.file = NULL, ...) 
{ @@ -318,7 +368,7 @@ SCELoom <- function(from = c("SingleCellExperiment", "loom"), # check SingleCellExperiment if (is.null(sce)) { stop("Please provide SingleCellExperiment with sce!") - } else if (class(sce) != "SingleCellExperiment") { + } else if (!methods::is(sce, "SingleCellExperiment")) { stop("Please provide valid SingleCellExperiment object!") } # check loom file diff --git a/R/GEO.R b/R/GEO.R index 551b600..6c0fb36 100644 --- a/R/GEO.R +++ b/R/GEO.R @@ -29,11 +29,16 @@ #' @export #' #' @examples -#' # # the supp files are count matrix -#' # GSE94820.seu = ParseGEO(acce = "GSE94820", down.supp = TRUE, supp.idx = 1, supp.type = "count") -#' # # the supp files are cellranger output files: barcodes, genes/features and matrix -#' # GSE200257.seu = ParseGEO(acce = "GSE200257", down.supp = TRUE, supp.idx = 1, supp.type = "10x", -#' # out.folder = "/path/to/output/folder") +#' \dontrun{ +#' # the supp files are count matrix +#' GSE94820.seu <- ParseGEO(acce = "GSE94820", down.supp = TRUE, supp.idx = 1, supp.type = "count") +#' # the supp files are cellranger output files: barcodes, genes/features and matrix +#' # need users to provide the output folder +#' GSE200257.seu <- ParseGEO( +#' acce = "GSE200257", down.supp = TRUE, supp.idx = 1, supp.type = "10x", +#' out.folder = "/path/to/output/folder" +#' ) +#' } ParseGEO <- function(acce, platform = NULL, down.supp = FALSE, supp.idx = 1, timeout = 3600, data.type = c("sc", "bulk"), supp.type = c("count", "10x"), out.folder = NULL, gene2feature = TRUE, merge = TRUE, ...) 
{ # check parameters @@ -103,10 +108,12 @@ ParseGEO <- function(acce, platform = NULL, down.supp = FALSE, supp.idx = 1, tim #' @export #' #' @examples -#' # # extract metadata of specified platform -#' # GSE200257.meta = ExtractGEOMeta(acce = "GSE200257", platform = "GPL24676") -#' # # extract metadata of all platforms -#' # GSE94820.meta = ExtractGEOMeta(acce = "GSE94820", platform = NULL) +#' \donttest{ +#' # users may need to set the size of the connection buffer +#' # Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 60) +#' # extract metadata of specified platform +#' GSE200257.meta <- ExtractGEOMeta(acce = "GSE200257", platform = "GPL24676") +#' } ExtractGEOMeta <- function(acce, platform = NULL, ...) { # get GEO object if (is.null(platform)) { @@ -174,9 +181,6 @@ SimpleCol <- function(df, col) { #' #' @return A dataframe. #' -#' @examples -#' # pf.obj = GEOobj(acce = "GSE94820", platform = "GPL16791") -#' # pf.info = ExtractGEOInfo(pf.obj) ExtractGEOInfo <- function(pf.obj, sample.wise = FALSE) { # platform information pf.info <- Biobase::experimentData(pf.obj) @@ -227,9 +231,6 @@ ExtractGEOInfo <- function(pf.obj, sample.wise = FALSE) { #' #' @return A dataframe. #' -#' @examples -#' # pf.obj = GEOobj(acce = "GSE94820", platform = "GPL16791") -#' # pf.info = ExtractGEOSubMeta(pf.obj) ExtractGEOSubMeta <- function(pf.obj) { # extract sample detail information pf.info <- as.data.frame(Biobase::pData(Biobase::phenoData(pf.obj))) @@ -263,20 +264,14 @@ ExtractGEOSubMeta <- function(pf.obj) { #' #' @return A dataframe. 
#' -#' @examples -#' # for bulk rna-seq -#' # count.mat = ExtractGEOExpSupp(acce = "GSE149838") -#' # count.mat = ExtractGEOExpSupp(acce = "GSE147507") -#' # count.mat = ExtractGEOExpSupp(acce = "GSE147507", supp.idx = 2) -#' # count.mat = ExtractGEOExpSupp(acce = "GSE122774") -#' # # for single cell matrix -#' # count.mat = ExtractGEOExpSupp(acce = "GSE94820") ExtractGEOExpSupp <- function(acce, timeout = 3600, supp.idx = 1) { # create tmp folder tmp.folder <- tempdir() # get current timeout if (!is.null(timeout)) { message("Change Timeout to: ", timeout) + env.timeout <- getOption("timeout") + on.exit(options(timeout = env.timeout)) # restore timeout options(timeout = timeout) } # download supplementary file @@ -357,9 +352,6 @@ ExtractGEOExpSupp <- function(acce, timeout = 3600, supp.idx = 1) { #' #' @return NULL #' -#' @examples -#' # ExtractGEOExpSupp10x(acce = "GSE200257", out.folder = '/path/to/output') -#' # ExtractGEOExpSupp10x(acce = "GSE226160", out.folder = '/path/to/output') ExtractGEOExpSupp10x <- function(acce, supp.idx = 1, timeout = 3600, out.folder = NULL, gene2feature = TRUE) { # create tmp folder @@ -367,6 +359,8 @@ ExtractGEOExpSupp10x <- function(acce, supp.idx = 1, timeout = 3600, # get current timeout if (!is.null(timeout)) { message("Change Timeout to: ", timeout) + env.timeout <- getOption("timeout") + on.exit(options(timeout = env.timeout)) # restore timeout options(timeout = timeout) } # download supp file @@ -463,9 +457,6 @@ ExtractGEOExpSupp10x <- function(acce, supp.idx = 1, timeout = 3600, #' #' @return Count matrix (\code{supp.type} is count) or NULL (\code{supp.type} is 10x). 
#' -#' @examples -#' # exp.data = ExtractGEOExpSuppAll(acce = "GSE200257", supp.idx = 1, supp.type = "10x", -#' # out.folder = "/path/to/output/folder") ExtractGEOExpSuppAll <- function(acce, supp.idx = 1, timeout = 3600, supp.type = c("count", "10x"), out.folder = NULL, gene2feature = TRUE) { if (supp.type == "count") { @@ -493,11 +484,6 @@ ExtractGEOExpSuppAll <- function(acce, supp.idx = 1, timeout = 3600, #' #' @return Count matrix (\code{supp.type} is count) or NULL (\code{supp.type} is 10x). #' -#' @examples -#' # pf.obj = GEOobj(acce = "GSE200257", platform = "GPL24676") -#' # count.mat = ExtractGEOExp(pf.obj, acce = "GSE200257", supp.idx = 1, -#' # down.supp = TRUE, supp.type = "10x", -#' # out.folder = "/path/to/output/folder") ExtractGEOExp <- function(pf.obj, acce, supp.idx = 1, down.supp = FALSE, timeout = 3600, supp.type = c("count", "10x"), out.folder = NULL, gene2feature = TRUE) { # check parameters diff --git a/R/PanglaoDB.R b/R/PanglaoDB.R index d59d8dc..d90aa34 100644 --- a/R/PanglaoDB.R +++ b/R/PanglaoDB.R @@ -16,9 +16,13 @@ #' @export #' #' @examples -#' # human.meta = ExtractPanglaoDBMeta(species = "Homo sapiens", -#' # protocol = c("Smart-seq2", "10x chromium"), -#' # cell.num = c(1000,2000)) +#' \donttest{ +#' human.meta <- ExtractPanglaoDBMeta( +#' species = "Homo sapiens", +#' protocol = c("Smart-seq2", "10x chromium"), +#' cell.num = c(1000, 2000) +#' ) +#' } ExtractPanglaoDBMeta <- function(species = NULL, protocol = NULL, tissue = NULL, cell.num = NULL, show.cell.type = TRUE, local.data = TRUE) { # get all sample metadata if (local.data) { @@ -121,8 +125,12 @@ ExtractPanglaoDBMeta <- function(species = NULL, protocol = NULL, tissue = NULL, #' @export #' #' @examples -#' # human.composition = ExtractPanglaoDBComposition(species = "Homo sapiens", -#' # protocol = c("Smart-seq2", "10x chromium")) +#' \donttest{ +#' human.composition <- ExtractPanglaoDBComposition( +#' species = "Homo sapiens", +#' protocol = c("Smart-seq2", "10x chromium") 
+#' ) +#' } ExtractPanglaoDBComposition <- function(sra = NULL, srs = NULL, species = NULL, protocol = NULL, tissue = NULL, local.data = TRUE) { if (local.data) { select.compos <- PanglaoDBComposition @@ -184,10 +192,14 @@ ExtractPanglaoDBComposition <- function(sra = NULL, srs = NULL, species = NULL, #' @export #' #' @examples -#' # hsa.meta = ExtractPanglaoDBMeta(species = "Homo sapiens", -#' # protocol = c("Smart-seq2", "10x chromium"), -#' # show.cell.type = TRUE, cell.num = c(1000,2000)) -#' # hsa.seu = ParsePanglaoDB(hsa.meta, merge = TRUE) +#' \dontrun{ +#' hsa.meta <- ExtractPanglaoDBMeta( +#' species = "Homo sapiens", +#' protocol = c("Smart-seq2", "10x chromium"), +#' show.cell.type = TRUE, cell.num = c(1000, 2000) +#' ) +#' hsa.seu <- ParsePanglaoDB(hsa.meta, merge = TRUE) +#' } ParsePanglaoDB <- function(meta, cell.type = "All", include.gene = NA, exclude.gene = NA, merge = FALSE) { # check columns CheckColumns(df = meta, columns = c("SRA", "SRS", "Tissue", "Protocol", "Species")) diff --git a/R/UCSCCellBrowser.R b/R/UCSCCellBrowser.R index a758073..5e4ae9d 100644 --- a/R/UCSCCellBrowser.R +++ b/R/UCSCCellBrowser.R @@ -16,10 +16,12 @@ #' @export #' #' @examples -#' # # first time run (lazy mode) -#' # ucsc.cb.samples = ShowCBDatasets(lazy = TRUE, update = TRUE) -#' # # second time run (lazy mode) -#' # ucsc.cb.samples = ShowCBDatasets(lazy = TRUE, update = FALSE) +#' \dontrun{ +#' # first time run (lazy mode), need users to provide json folder +#' ucsc.cb.samples <- ShowCBDatasets(lazy = TRUE, update = TRUE) +#' # second time run (lazy mode), need users to provide json folder +#' ucsc.cb.samples <- ShowCBDatasets(lazy = TRUE, update = FALSE) +#' } ShowCBDatasets <- function(lazy = TRUE, json.folder = NULL, update = FALSE, quiet = FALSE) { # parse all datasets json if (lazy) { @@ -72,6 +74,37 @@ ShowCBDatasets <- function(lazy = TRUE, json.folder = NULL, update = FALSE, quie desc.folder <- "https://cells.ucsc.edu" datasets.folder <- 
"https://cells.ucsc.edu" } + # split columns + all.sample.dup.index <- duplicated(colnames(all.samples.df)) + # all.sample.dup.cols = colnames(all.samples.df)[all.sample.dup.index] + # all.samples.df.unique = all.samples.df[!all.sample.dup.index] + # all.samples.df.dup = all.samples.df[all.sample.dup.index] + # # modify columns + # attr.cols = c( + # "tags", "diseases", "organisms", "body_parts", + # "projects", "life_stages", "domains", "sources", "assays" + # ) + # unique.valid = intersect(attr.cols, colnames(all.samples.df.unique)) + # all.samples.df.unique <- PasteAttr(df = all.samples.df.unique, attr = unique.valid) + # dup.valid = intersect(attr.cols, colnames(all.samples.df.dup)) + # all.samples.df.dup <- PasteAttr(df = all.samples.df.dup, attr = dup.valid) + # # deal with duplicated columns + # for (col in all.sample.dup.cols){ + # col.df = data.frame(col1=all.samples.df.unique[, col], col2 = all.samples.df.dup[, col]) + # col.value = apply(col.df, 1, function(x){ + # if(x["col1"] == ""){ + # if(x["col2"] == ""){ + # x["col1"] + # }else{ + # x["col2"] + # } + # }else{ + # x["col1"] + # } + # }) + # all.samples.df.unique[, col] = col.value + # } + all.samples.df <- all.samples.df[!all.sample.dup.index] # modify columns all.samples.df <- PasteAttr(df = all.samples.df, attr = c( "tags", "diseases", "organisms", "body_parts", @@ -161,11 +194,15 @@ ShowCBDatasets <- function(lazy = TRUE, json.folder = NULL, update = FALSE, quie #' @export #' #' @examples -#' # # lazy mode, load datasets json files locally -#' # ucsc.cb.samples = ShowCBDatasets(lazy = TRUE, json.folder = NULL, update = FALSE) -#' # # cell number is between 1000 and 2000 -#' # hbb.sample.df = ExtractCBDatasets(all.samples.df = ucsc.cb.samples, organ = c("brain", "blood"), -#' # organism = "Human (H. 
sapiens)", cell.num = c(1000,2000)) +#' \dontrun{ +#' # lazy mode, load datasets json files locally, need users to provide json folder +#' ucsc.cb.samples <- ShowCBDatasets(lazy = TRUE, json.folder = NULL, update = FALSE) +#' # cell number is between 1000 and 2000 +#' hbb.sample.df <- ExtractCBDatasets( +#' all.samples.df = ucsc.cb.samples, organ = c("brain", "blood"), +#' organism = "Human (H. sapiens)", cell.num = c(1000, 2000) +#' ) +#' } ExtractCBDatasets <- function(all.samples.df, collection = NULL, sub.collection = NULL, organ = NULL, disease = NULL, organism = NULL, project = NULL, fuzzy.match = TRUE, cell.num = NULL) { # extract row index under different filter @@ -219,12 +256,16 @@ ExtractCBDatasets <- function(all.samples.df, collection = NULL, sub.collection #' @export #' #' @examples -#' # # lazy mode, load datasets json files locally -#' # ucsc.cb.samples = ShowCBDatasets(lazy = TRUE, json.folder = NULL, update = FALSE) -#' # # cell number is between 1000 and 2000 -#' # hbb.sample.df = ExtractCBDatasets(all.samples.df = ucsc.cb.samples, organ = c("brain", "blood"), -#' # organism = "Human (H. sapiens)", cell.num = c(1000,2000)) -#' # hbb.sample.ct = ExtractCBComposition(json.folder = NULL, sample.df = hbb.sample.df) +#' \dontrun{ +#' # lazy mode, load datasets json files locally, need users to provide json folder +#' ucsc.cb.samples <- ShowCBDatasets(lazy = TRUE, json.folder = NULL, update = FALSE) +#' # cell number is between 1000 and 2000 +#' hbb.sample.df <- ExtractCBDatasets( +#' all.samples.df = ucsc.cb.samples, organ = c("brain", "blood"), +#' organism = "Human (H. 
sapiens)", cell.num = c(1000, 2000) +#' ) +#' hbb.sample.ct <- ExtractCBComposition(json.folder = NULL, sample.df = hbb.sample.df) +#' } ExtractCBComposition <- function(json.folder = NULL, sample.df = NULL, all.samples.df = NULL, collection = NULL, sub.collection = NULL, organ = NULL, disease = NULL, organism = NULL, project = NULL, fuzzy.match = TRUE, cell.num = NULL) { # prepare samples for download @@ -316,15 +357,19 @@ ExtractCBComposition <- function(json.folder = NULL, sample.df = NULL, all.sampl #' @export #' #' @examples -#' # # lazy mode, load datasets json files locally -#' # ucsc.cb.samples = ShowCBDatasets(lazy = TRUE, json.folder = NULL, update = FALSE) -#' # # cell number is between 1000 and 2000 -#' # hbb.sample.df = ExtractCBDatasets(all.samples.df = ucsc.cb.samples, organ = c("brain", "blood"), -#' # organism = "Human (H. sapiens)", cell.num = c(1000,2000)) -#' # hbb.sample.seu = ParseCBDatasets(sample.df = hbb.sample.df) -#' # # test 10x and matrix load -#' # complex.df = ucsc.cb.samples[c(1, 927, 379), ] # two 10x and one matrix -#' # complex.seu.list = ParseCBDatasets(sample.df = test.df, merge = F) +#' \dontrun{ +#' # lazy mode, load datasets json files locally, need users to provide json folder +#' ucsc.cb.samples <- ShowCBDatasets(lazy = TRUE, json.folder = NULL, update = FALSE) +#' # cell number is between 1000 and 2000 +#' hbb.sample.df <- ExtractCBDatasets( +#' all.samples.df = ucsc.cb.samples, organ = c("brain", "blood"), +#' organism = "Human (H. 
sapiens)", cell.num = c(1000, 2000) +#' ) +#' hbb.sample.seu <- ParseCBDatasets(sample.df = hbb.sample.df) +#' # test 10x and matrix load +#' complex.df <- ucsc.cb.samples[c(1, 927, 379), ] # two 10x and one matrix +#' complex.seu.list <- ParseCBDatasets(sample.df = complex.df, merge = FALSE) +#' } ParseCBDatasets <- function(sample.df = NULL, all.samples.df = NULL, collection = NULL, sub.collection = NULL, organ = NULL, disease = NULL, organism = NULL, project = NULL, fuzzy.match = TRUE, cell.num = NULL, timeout = 1000, merge = TRUE) { @@ -343,6 +388,7 @@ ParseCBDatasets <- function(sample.df = NULL, all.samples.df = NULL, collection } # set timeout env.timeout <- getOption("timeout") + on.exit(options(timeout = env.timeout)) # restore timeout options(timeout = timeout) # base url base.url <- "https://cells.ucsc.edu/" @@ -369,8 +415,6 @@ ParseCBDatasets <- function(sample.df = NULL, all.samples.df = NULL, collection meta.file = meta.url, coord.file = coord.file, name = sample.name ) } - # restore timeout - options(timeout = env.timeout) # merge or not if (isTRUE(merge)) { seu.obj <- mergeExperiments(seu.obj.list) diff --git a/R/Zenodo.R b/R/Zenodo.R index ea9ac36..739be77 100644 --- a/R/Zenodo.R +++ b/R/Zenodo.R @@ -12,11 +12,15 @@ #' @export #' #' @examples -#' # zebrafish.df = ExtractZenodoMeta(doi = "10.5281/zenodo.7243603") -#' # ExtractZenodoMeta(doi = "10.5281/zenodo.48065") # Restricted Access -#' # # vector of dois -#' # multi.dois = ExtractZenodoMeta(doi = c("1111", "10.5281/zenodo.7243603", -#' # "10.5281/zenodo.7244441")) +#' \donttest{ +#' zebrafish.df <- ExtractZenodoMeta(doi = "10.5281/zenodo.7243603") +#' ExtractZenodoMeta(doi = "10.5281/zenodo.48065") # Restricted Access +#' # vector of dois +#' multi.dois <- ExtractZenodoMeta(doi = c( +#' "1111", "10.5281/zenodo.7243603", +#' "10.5281/zenodo.7244441" +#' )) +#' } ExtractZenodoMeta <- function(doi, file.ext = c("rdata", "h5ad")) { # check doi doi.status <- startsWith(x = doi, prefix = "10.5281/zenodo.") 
@@ -53,6 +57,7 @@ ExtractZenodoMetaSingle <- function(doi, file.ext = c("rdata", "h5ad")) { if (is.null(file.ext)) { record.files.used <- record.files } else { + record.files$type <- tolower(tools::file_ext(record.files$key)) record.files.used <- record.files %>% dplyr::filter(.data[["type"]] %in% file.ext) } # check the data @@ -61,9 +66,14 @@ ExtractZenodoMetaSingle <- function(doi, file.ext = c("rdata", "h5ad")) { } # prepare md5sum record.files.used$checksum <- gsub(pattern = "md5:", replacement = "", record.files.used$checksum) + # record.files.used.final <- data.frame( + # title = record.content$metadata$title, description = record.content$metadata$description, + # url = record.files.used$links$self, filename = basename(record.files.used$links$self), + # md5 = record.files.used$checksum, license = record.content$metadata$license$id + # ) record.files.used.final <- data.frame( title = record.content$metadata$title, description = record.content$metadata$description, - url = record.files.used$links$self, filename = basename(record.files.used$links$self), + url = record.files.used$links$self, filename = basename(record.files.used$key), md5 = record.files.used$checksum, license = record.content$metadata$license$id ) } else { @@ -97,10 +107,17 @@ ExtractZenodoMetaSingle <- function(doi, file.ext = c("rdata", "h5ad")) { #' @export #' #' @examples -#' # multi.dois.parse = ParseZenodo(doi = c("1111", "10.5281/zenodo.7243603", -#' # "10.5281/zenodo.7244441"), -#' # file.ext = c("rdata", "rds"), -#' # out.folder = "/path/to/outfoder") +#' \dontrun{ +#' # need users to provide the output folder +#' multi.dois.parse <- ParseZenodo( +#' doi = c( +#' "1111", "10.5281/zenodo.7243603", +#' "10.5281/zenodo.7244441" +#' ), +#' file.ext = c("rdata", "rds"), +#' out.folder = "/path/to/outfoder" +#' ) +#' } ParseZenodo <- function(doi = NULL, file.ext = c("rdata", "rds", "h5ad"), doi.df = NULL, out.folder = NULL, timeout = 1000, quiet = FALSE, parallel = TRUE) { if 
(!is.null(doi.df)) { @@ -117,6 +134,7 @@ ParseZenodo <- function(doi = NULL, file.ext = c("rdata", "rds", "h5ad"), doi.df doi.df$filename <- file.path(out.folder, doi.df$filename) # set timeout env.timeout <- getOption("timeout") + on.exit(options(timeout = env.timeout)) # restore timeout options(timeout = timeout) message("Start downloading!") if (isTRUE(parallel)) { @@ -129,8 +147,6 @@ ParseZenodo <- function(doi = NULL, file.ext = c("rdata", "rds", "h5ad"), doi.df down.status <- utils::download.file(url = doi.df$url, destfile = doi.df$filename, quiet = quiet, mode = "wb") } message("Finish downloading!") - # restore timeout - options(timeout = env.timeout) # check the md5sum down.md5 <- tools::md5sum(doi.df$filename) raw.md5 <- doi.df$md5 diff --git a/R/bam.R b/R/bam.R index 781c7ae..c81a8f8 100644 --- a/R/bam.R +++ b/R/bam.R @@ -12,10 +12,15 @@ #' @export #' #' @examples -#' # GSE138266.runs = ExtractRun(acce = "GSE138266", platform = "GPL18573") -#' # GSE138266.down = DownloadBam(gsm.df = GSE138266.runs, bam.type = "10x", -#' # prefetch.path = "/path/to/prefetch", -#' # out.folder = "/path/to/output") +#' \dontrun{ +#' # need users to provide prefetch.path +#' GSE138266.runs <- ExtractRun(acce = "GSE138266", platform = "GPL18573") +#' GSE138266.down <- DownloadBam( +#' gsm.df = GSE138266.runs, bam.type = "10x", +#' prefetch.path = "/path/to/prefetch", +#' out.folder = "/path/to/output" +#' ) +#' } DownloadBam <- function(gsm.df, bam.type = c("10x", "other"), prefetch.path = NULL, samdump.path = NULL, out.folder = NULL, prefetch.paras = "-X 100G", samdump.paras = "") { # check parameters @@ -121,12 +126,19 @@ RunSamdump <- function(sra, samdump.path, samdump.paras) { #' @export #' #' @examples -#' # GSE138266.runs = ExtractRun(acce = "GSE138266", platform = "GPL18573") -#' # GSE138266.down = DownloadBam(gsm.df = GSE138266.runs, prefetch.path = "/path/to/prefetch", -#' # out.folder = "/path/to/output") -#' # GSE138266.convert = Bam2Fastq(bam.folder = 
"/path/to/output", -#' # bamtofastq.path = "/path/to/bamtofastq_linux or samtools", -#' # bamtofastq.paras = "--nthreads 4") +#' \dontrun{ +#' # need users to provide prefetch.path and bamtofastq.path +#' GSE138266.runs <- ExtractRun(acce = "GSE138266", platform = "GPL18573") +#' GSE138266.down <- DownloadBam( +#' gsm.df = GSE138266.runs, prefetch.path = "/path/to/prefetch", +#' out.folder = "/path/to/output" +#' ) +#' GSE138266.convert <- Bam2Fastq( +#' bam.folder = "/path/to/output", +#' bamtofastq.path = "/path/to/bamtofastq_linux or samtools", +#' bamtofastq.paras = "--nthreads 4" +#' ) +#' } Bam2Fastq <- function(bam.folder = NULL, bam.path = NULL, bam.type = c("10x", "other"), pair.end = NULL, bamtofastq.path = NULL, bamtofastq.paras = "--nthreads 4", sort.name = FALSE, sort.thread = 4) { # check parameters diff --git a/R/cellxgene.R b/R/cellxgene.R index fb8b3e8..312a905 100644 --- a/R/cellxgene.R +++ b/R/cellxgene.R @@ -9,8 +9,10 @@ #' @references https://gist.github.com/ivirshup/f1a1603db69de3888eacb4bdb6a9317a #' #' @examples -#' # # all available datasets -#' # all.cellxgene.datasets = ShowCELLxGENEDatasets() +#' \donttest{ +#' # all available datasets +#' all.cellxgene.datasets <- ShowCELLxGENEDatasets() +#' } ShowCELLxGENEDatasets <- function() { # urls cellxgene.base.url <- "https://api.cellxgene.cziscience.com/dp/v1/" @@ -64,22 +66,25 @@ ShowCELLxGENEDatasets <- function() { # add h5ad and rds information cellxgene.collections.datasets.list <- lapply(1:nrow(cellxgene.collections.datasets.df), function(x) { x.df <- cellxgene.collections.datasets.df[x, ] - x.df$dataset_id <- unique(x.df$dataset_assets[[1]]$dataset_id) - if ("RDS" %in% unique(x.df$dataset_assets[[1]]$filetype)) { - x.rds.idx <- which(x.df$dataset_assets[[1]]$filetype == "RDS") - x.df$rds_id <- x.df$dataset_assets[[1]]$id[x.rds.idx] - x.df$rds_s3_uri <- x.df$dataset_assets[[1]]$s3_uri[x.rds.idx] - x.df$rds_user_submitted <- x.df$dataset_assets[[1]]$user_submitted[x.rds.idx] + 
x.df.dataset <- x.df$dataset_assets[[1]] + # remove duplicated urls + x.df.dataset <- x.df.dataset[!duplicated(x.df.dataset$s3_uri), ] + x.df$dataset_id <- unique(x.df.dataset$dataset_id) + if ("RDS" %in% unique(x.df.dataset$filetype)) { + x.rds.idx <- which(x.df.dataset$filetype == "RDS") + x.df$rds_id <- x.df.dataset$id[x.rds.idx] + x.df$rds_s3_uri <- x.df.dataset$s3_uri[x.rds.idx] + x.df$rds_user_submitted <- x.df.dataset$user_submitted[x.rds.idx] } else { x.df$rds_id <- NA x.df$rds_s3_uri <- NA x.df$rds_user_submitted <- NA } - if ("H5AD" %in% unique(x.df$dataset_assets[[1]]$filetype)) { - x.h5ad.idx <- which(x.df$dataset_assets[[1]]$filetype == "H5AD") - x.df$h5ad_id <- x.df$dataset_assets[[1]]$id[x.h5ad.idx] - x.df$h5ad_s3_uri <- x.df$dataset_assets[[1]]$s3_uri[x.h5ad.idx] - x.df$h5ad_user_submitted <- x.df$dataset_assets[[1]]$user_submitted[x.h5ad.idx] + if ("H5AD" %in% unique(x.df.dataset$filetype)) { + x.h5ad.idx <- which(x.df.dataset$filetype == "H5AD") + x.df$h5ad_id <- x.df.dataset$id[x.h5ad.idx] + x.df$h5ad_s3_uri <- x.df.dataset$s3_uri[x.h5ad.idx] + x.df$h5ad_user_submitted <- x.df.dataset$user_submitted[x.h5ad.idx] } else { x.df$h5ad_id <- NA x.df$h5ad_s3_uri <- NA @@ -125,12 +130,16 @@ ShowCELLxGENEDatasets <- function() { #' @references https://gist.github.com/ivirshup/f1a1603db69de3888eacb4bdb6a9317a #' #' @examples -#' # # all available datasets -#' # all.cellxgene.datasets = ShowCELLxGENEDatasets() -#' # # human 10x v2 and v3 datasets -#' # human.10x.cellxgene.meta = ExtractCELLxGENEMeta(all.samples.df = all.cellxgene.datasets, -#' # assay = c("10x 3' v2", "10x 3' v3"), -#' # organism = "Homo sapiens") +#' \donttest{ +#' # all available datasets +#' all.cellxgene.datasets <- ShowCELLxGENEDatasets() +#' # human 10x v2 and v3 datasets +#' human.10x.cellxgene.meta <- ExtractCELLxGENEMeta( +#' all.samples.df = all.cellxgene.datasets, +#' assay = c("10x 3' v2", "10x 3' v3"), +#' organism = "Homo sapiens" +#' ) +#' } ExtractCELLxGENEMeta <- 
function(all.samples.df, organism = NULL, ethnicity = NULL, sex = NULL, tissue = NULL, disease = NULL, assay = NULL, suspension.type = NULL, cell.type = NULL, cell.num = NULL) { # all datasets information @@ -182,14 +191,18 @@ ExtractCELLxGENEMeta <- function(all.samples.df, organism = NULL, ethnicity = NU #' @references https://gist.github.com/ivirshup/f1a1603db69de3888eacb4bdb6a9317a #' #' @examples -#' # # all available datasets -#' # all.cellxgene.datasets = ShowCELLxGENEDatasets() -#' # # human 10x v2 and v3 datasets -#' # human.10x.cellxgene.meta = ExtractCELLxGENEMeta(all.samples.df = all.cellxgene.datasets, -#' # assay = c("10x 3' v2", "10x 3' v3"), -#' # organism = "Homo sapiens") -#' # # download -#' # ParseCELLxGENE(meta = human.10x.cellxgene.meta, out.folder = "/path/to/output") +#' \dontrun{ +#' # all available datasets +#' all.cellxgene.datasets <- ShowCELLxGENEDatasets() +#' # human 10x v2 and v3 datasets +#' human.10x.cellxgene.meta <- ExtractCELLxGENEMeta( +#' all.samples.df = all.cellxgene.datasets, +#' assay = c("10x 3' v2", "10x 3' v3"), +#' organism = "Homo sapiens" +#' ) +#' # download, need to provide the output folder +#' ParseCELLxGENE(meta = human.10x.cellxgene.meta, out.folder = "/path/to/output") +#' } ParseCELLxGENE <- function(meta, file.ext = c("rds", "h5ad"), out.folder = NULL, timeout = 3600, quiet = FALSE, parallel = TRUE) { # check file extension @@ -227,6 +240,7 @@ ParseCELLxGENE <- function(meta, file.ext = c("rds", "h5ad"), out.folder = NULL, # download urls # set timeout env.timeout <- getOption("timeout") + on.exit(options(timeout = env.timeout)) # restore timeout options(timeout = timeout) message("Start downloading!") if (isTRUE(parallel)) { @@ -238,8 +252,6 @@ ParseCELLxGENE <- function(meta, file.ext = c("rds", "h5ad"), out.folder = NULL, } else { down.status <- utils::download.file(url = download.urls, destfile = names(download.urls), quiet = quiet, mode = "wb") } - # restore timeout - options(timeout = env.timeout) # 
process failed datasets down.status <- unlist(down.status) fail.status <- which(down.status != 0) diff --git a/R/fastq.R b/R/fastq.R index 1fe4366..7533d93 100644 --- a/R/fastq.R +++ b/R/fastq.R @@ -12,11 +12,12 @@ #' @importFrom GEOquery getGEO #' @importFrom Biobase annotation experimentData pData phenoData notes sampleNames exprs #' @importFrom parallel detectCores -#' @importFrom GEOfastq crawl_gsms #' @export #' #' @examples -#' # GSE186003.runs = ExtractRun(acce = "GSE186003", platform = "GPL24247") +#' \dontrun{ +#' GSE186003.runs <- ExtractRun(acce = "GSE186003", platform = "GPL24247", parallel = FALSE) +#' } ExtractRun <- function(gsm = NULL, acce = NULL, platform = NULL, parallel = TRUE, ...) { # get GSM if (is.null(gsm)) { @@ -31,6 +32,10 @@ ExtractRun <- function(gsm = NULL, acce = NULL, platform = NULL, parallel = TRUE cores.used <- 1 } # extract srr + rnsgeofastq <- requireNamespace("GEOfastq", quietly = TRUE) + if (!rnsgeofastq) { + stop("Can not find GEOfastq package, install with devtools::install_github('alexvpickering/GEOfastq')!") + } gsm.run.df <- GEOfastq::crawl_gsms(gsm, max.workers = cores.used) if (is.null(gsm.run.df)) { stop("There is no valid srr numbers available, please check the raw data available under ", paste0(gsm, collapse = ", ")) @@ -51,9 +56,14 @@ ExtractRun <- function(gsm = NULL, acce = NULL, platform = NULL, parallel = TRUE #' @export #' #' @examples -#' # GSE186003.runs = ExtractRun(acce = "GSE186003", platform = "GPL24247") -#' # GSE186003.down = DownloadSRA(gsm.df = GSE186003.runs, prefetch.path = "/path/to/prefetch", -#' # out.folder = "/path/to/output") +#' \dontrun{ +#' # need users to provide the prefetch.path and out.folder +#' GSE186003.runs <- ExtractRun(acce = "GSE186003", platform = "GPL24247") +#' GSE186003.down <- DownloadSRA( +#' gsm.df = GSE186003.runs, prefetch.path = "/path/to/prefetch", +#' out.folder = "/path/to/output" +#' ) +#' } DownloadSRA <- function(gsm.df, prefetch.path = NULL, out.folder = NULL, 
prefetch.paras = "-X 100G") { # check dataframe if (nrow(gsm.df) == 0) { @@ -76,7 +86,6 @@ DownloadSRA <- function(gsm.df, prefetch.path = NULL, out.folder = NULL, prefetc } else { prefetch.path <- prefetch.path } - cwd <- getwd() # prepare sample output folder samples.folder <- file.path(out.folder, gsm.df$gsm_name) names(samples.folder) <- gsm.df$run @@ -84,7 +93,6 @@ DownloadSRA <- function(gsm.df, prefetch.path = NULL, out.folder = NULL, prefetc sf <- samples.folder[x] RunPrefetch(sra = x, prefetch.path = prefetch.path, out.folder = sf, prefetch.paras = prefetch.paras) }) - setwd(cwd) # select fail samples fail.flag <- sapply(names(all.runs.down), function(x) { !is.null(all.runs.down[[x]]) @@ -109,6 +117,8 @@ RunPrefetch <- function(sra, prefetch.path, out.folder, prefetch.paras) { if (!dir.exists(out.folder)) { dir.create(path = out.folder, recursive = TRUE) } + cwd <- getwd() + on.exit(setwd(cwd)) # change directory to avoid bam download bug setwd(out.folder) # prefetch command @@ -148,13 +158,20 @@ RunPrefetch <- function(sra, prefetch.path, out.folder, prefetch.paras) { #' @export #' #' @examples -#' # GSE186003.runs = ExtractRun(acce = "GSE186003", platform = "GPL24247") -#' # GSE186003.down = DownloadSRA(gsm.df = GSE186003.runs, prefetch.path = "/path/to/prefetch", -#' # out.folder = "/path/to/output") -#' # GSE186003.split = SplitSRA(sra.folder = "/path/to/output", -#' # split.cmd.path = "/path/to/parallel-fastq-dump", -#' # sratools.path = "/path/to/sra/bin", fastq.type = "10x", -#' # split.cmd.threads = 4) +#' \dontrun{ +#' # need users to provide the prefetch.path, sra.folder, split.cmd.path, sratools.path and out.folder +#' GSE186003.runs <- ExtractRun(acce = "GSE186003", platform = "GPL24247") +#' GSE186003.down <- DownloadSRA( +#' gsm.df = GSE186003.runs, prefetch.path = "/path/to/prefetch", +#' out.folder = "/path/to/output" +#' ) +#' GSE186003.split <- SplitSRA( +#' sra.folder = "/path/to/output", +#' split.cmd.path = 
"/path/to/parallel-fastq-dump", +#' sratools.path = "/path/to/sra/bin", fastq.type = "10x", +#' split.cmd.threads = 4 +#' ) +#' } SplitSRA <- function(sra.folder = NULL, sra.path = NULL, fastq.type = c("10x", "other"), split.cmd.path = NULL, sratools.path = NULL, split.cmd.paras = NULL, split.cmd.threads = NULL, format.10x = TRUE, remove.raw = FALSE) { # check parameters diff --git a/R/hca.R b/R/hca.R index d65803d..c7dc896 100644 --- a/R/hca.R +++ b/R/hca.R @@ -43,8 +43,10 @@ ExtractHCAProjects <- function(catalog = NULL) { #' @export #' #' @examples -#' # # all available projects -#' # all.hca.projects = ShowHCAProjects() +#' \donttest{ +#' # all available projects +#' all.hca.projects <- ShowHCAProjects() +#' } ShowHCAProjects <- function(catalog = NULL) { # get all projects information hca.projects.df <- ExtractHCAProjects(catalog = catalog) @@ -174,14 +176,18 @@ ShowHCAProjects <- function(catalog = NULL) { #' @references https://bioconductor.org/packages/release/bioc/html/hca.html #' #' @examples -#' # # all available projects -#' # all.hca.projects = ShowHCAProjects() -#' # # all human projects -#' # all.human.projects = ExtractHCAMeta(all.projects.df = all.hca.projects, organism = "Homo sapiens") -#' # # all human and 10x 3' v2 -#' # all.human.10x.projects = ExtractHCAMeta(all.projects.df = all.hca.projects, -#' # organism = "Homo sapiens", -#' # protocol = c("10x 3' v2", "10x 3' v3")) +#' \donttest{ +#' # all available projects +#' all.hca.projects <- ShowHCAProjects() +#' # all human projects +#' all.human.projects <- ExtractHCAMeta(all.projects.df = all.hca.projects, organism = "Homo sapiens") +#' # all human and 10x 3' v2 +#' all.human.10x.projects <- ExtractHCAMeta( +#' all.projects.df = all.hca.projects, +#' organism = "Homo sapiens", +#' protocol = c("10x 3' v2", "10x 3' v3") +#' ) +#' } ExtractHCAMeta <- function(all.projects.df, organism = NULL, sex = NULL, organ = NULL, organ.part = NULL, disease = NULL, sample.type = NULL, preservation.method = 
NULL, protocol = NULL, suspension.type = NULL, cell.type = NULL, cell.num = NULL, sequencing.type = NULL) { @@ -242,14 +248,18 @@ ExtractHCAMeta <- function(all.projects.df, organism = NULL, sex = NULL, organ = #' @export #' #' @examples -#' # # all available projects -#' # all.hca.projects = ShowHCAProjects() -#' # # all human and 10x 3' v2 -#' # all.human.10x.projects = ExtractHCAMeta(all.projects.df = all.hca.projects, -#' # organism = "Homo sapiens", -#' # protocol = c("10x 3' v2", "10x 3' v3")) -#' # # download -#' # ParseHCA(meta = all.human.10x.projects, out.folder = "/path/to/output") +#' \dontrun{ +#' # all available projects +#' all.hca.projects <- ShowHCAProjects() +#' # all human and 10x 3' v2 +#' all.human.10x.projects <- ExtractHCAMeta( +#' all.projects.df = all.hca.projects, +#' organism = "Homo sapiens", +#' protocol = c("10x 3' v2", "10x 3' v3") +#' ) +#' # download, need users to provide the output folder +#' ParseHCA(meta = all.human.10x.projects, out.folder = "/path/to/output") +#' } ParseHCA <- function(meta, file.ext = c("rds", "rdata", "h5", "h5ad", "loom"), out.folder = NULL, timeout = 3600, quiet = FALSE, parallel = TRUE) { # file.ext: ignore case, tar.gz, gz @@ -324,6 +334,7 @@ ParseHCA <- function(meta, file.ext = c("rds", "rdata", "h5", "h5ad", "loom"), o # download urls # set timeout env.timeout <- getOption("timeout") + on.exit(options(timeout = env.timeout)) # restore timeout options(timeout = timeout) message("Start downloading!") if (isTRUE(parallel)) { @@ -335,8 +346,6 @@ ParseHCA <- function(meta, file.ext = c("rds", "rdata", "h5", "h5ad", "loom"), o } else { down.status <- utils::download.file(url = download.urls, destfile = names(download.urls), quiet = quiet, mode = "wb") } - # restore timeout - options(timeout = env.timeout) # process failed datasets down.status <- unlist(down.status) fail.status <- which(down.status != 0) diff --git a/R/summary.R b/R/summary.R index 604120f..4db96fc 100644 --- a/R/summary.R +++ b/R/summary.R 
@@ -10,18 +10,22 @@ #' @export #' #' @examples -#' # # PanglaoDB -#' # StatDBAttribute(df = PanglaoDBMeta, filter = c("species", "protocol"), database = "PanglaoDB") -#' # # UCSC Cell Browser -#' # ucsc.cb.samples = ShowCBDatasets(lazy = TRUE, json.folder = NULL, update = FALSE) -#' # StatDBAttribute(df = ucsc.cb.samples, filter = c("organism", "organ"), database = "UCSC") -#' # # CELLxGENE -#' # all.cellxgene.datasets = ShowCELLxGENEDatasets() -#' # StatDBAttribute(df = all.cellxgene.datasets, filter = c("organism", "sex"), -#' # database = "CELLxGENE") -#' # # HCA -#' # all.hca.projects = ShowHCAProjects() -#' # StatDBAttribute(df = all.hca.projects, filter = c("organism", "sex"), database = "HCA") +#' \dontrun{ +#' # PanglaoDB +#' StatDBAttribute(df = PanglaoDBMeta, filter = c("species", "protocol"), database = "PanglaoDB") +#' # UCSC Cell Browser, need users to provide the json folder +#' ucsc.cb.samples <- ShowCBDatasets(lazy = TRUE, json.folder = NULL, update = FALSE) +#' StatDBAttribute(df = ucsc.cb.samples, filter = c("organism", "organ"), database = "UCSC") +#' # CELLxGENE +#' all.cellxgene.datasets <- ShowCELLxGENEDatasets() +#' StatDBAttribute( +#' df = all.cellxgene.datasets, filter = c("organism", "sex"), +#' database = "CELLxGENE" +#' ) +#' # HCA +#' all.hca.projects <- ShowHCAProjects() +#' StatDBAttribute(df = all.hca.projects, filter = c("organism", "sex"), database = "HCA") +#' } StatDBAttribute <- function(df, filter, database = c("PanglaoDB", "UCSC", "CELLxGENE", "HCA")) { # check parameters database <- match.arg(arg = database) diff --git a/R/utils.R b/R/utils.R index 1909e35..7621132 100644 --- a/R/utils.R +++ b/R/utils.R @@ -40,8 +40,8 @@ ExtractSample <- function(df, base.url, json.folder, quiet) { dir.create(x, showWarnings = FALSE, recursive = TRUE) }) down.status <- lapply(df$name, function(x) { - utils::download.file(url = df.json[x], destfile = file.path(df.json.folder[x], "dataset.json"), quiet = quiet, mode = "wb", method = "wget") - 
utils::download.file(url = df.desc[x], destfile = file.path(df.json.folder[x], "desc.json"), quiet = quiet, mode = "wb", method = "wget") + utils::download.file(url = df.json[x], destfile = file.path(df.json.folder[x], "dataset.json"), quiet = quiet, mode = "wb", method = "wget", extra = "--no-check-certificate") + utils::download.file(url = df.desc[x], destfile = file.path(df.json.folder[x], "desc.json"), quiet = quiet, mode = "wb", method = "wget", extra = "--no-check-certificate") }) } # process diff --git a/README.md b/README.md index da9c52a..933c0fd 100644 --- a/README.md +++ b/README.md @@ -25,16 +25,29 @@ ## Installation -You can install the development version of `scfetch` from [GitHub](https://github.com/showteeth/scfetch) with: +`scfetch` is an R package distributed as part of the [CRAN](https://cran.r-project.org/web/packages/scfetch/index.html). To install the package, start R and enter: +```R +# install via CRAN +install.packages("scfetch") -``` r +# you can also install the development version from GitHub # install.packages("devtools") devtools::install_github("showteeth/scfetch") ``` -**For issues about installation, please refer [Installation guide](./INSTALL.md).** +There are some conditionally used packages: -For data structures conversion, `scfetch` requires several python pcakages, you can install with: +```R +# install.packages("devtools") #In case you have not installed it. 
+devtools::install_github("alexvpickering/GEOfastq") # download fastq +devtools::install_github("cellgeni/sceasy") # format conversion +devtools::install_github("mojaveazure/seurat-disk") # format conversion +devtools::install_github("satijalab/seurat-wrappers") # format conversion +``` + +**For issues about installation, please refer `INSTALL.md`.** + +For data structures conversion, `scfetch` requires several python packages, you can install with: ``` bash # install python packages @@ -181,8 +194,623 @@ Detailed usage is available in [website](https://showteeth.github.io/scfetch/).
+### Download fastq and bam + +Since the downloading process is time-consuming, we provide the commands used to illustrate the usage. + +#### Download fastq + +##### Prepare run number + +For fastq files stored in SRA, `scfetch` can extract sample information and run number with GEO accession number or users can also provide a dataframe containing the run number of interested samples. + +Extract all samples under `GSE130636` and the platform is `GPL20301` (use `platform = NULL` for all platforms): +```r +GSE130636.runs <- ExtractRun(acce = "GSE130636", platform = "GPL20301") +``` + +
+ +##### Download sra + +With the dataframe contains gsm and run number, `scfetch` will download sra files using `prefetch`. The returned result is a dataframe contains failed runs. If not `NULL`, users can re-run `DownloadSRA` by setting `gsm.df` to the returned result. + +```r +# a small test +GSE130636.runs <- GSE130636.runs[GSE130636.runs$run %in% c("SRR9004346", "SRR9004351"), ] +# download, you may need to set prefetch.path +out.folder <- tempdir() +GSE130636.down <- DownloadSRA( + gsm.df = GSE130636.runs, + out.folder = out.folder +) +# GSE130636.down is null or dataframe contains failed runs +``` + +The `out.folder` structure will be: `gsm_number/run_number`. + +
+ +##### Split fastq + +After obtaining the sra files, `scfetch` provides function `SplitSRA` to split sra files to fastq files using `parallel-fastq-dump` (parallel, fastest and gzip output), `fasterq-dump` (parallel, fast but unzipped output) and `fastq-dump` (slowest and gzip output). + +For fastqs generated with 10x Genomics, `SplitSRA` can identify read1, read2 and index files and format the read1 and read2 to 10x required format (`sample1_S1_L001_R1_001.fastq.gz` and `sample1_S1_L001_R2_001.fastq.gz`). In detail, the file with read length 26 or 28 is considered as read1, the files with read length 8 or 10 are considered as index files and the remaining file is considered as read2. The read length rules are from [Sequencing Requirements for Single Cell 3'](https://www.10xgenomics.com/cn/support/single-cell-gene-expression/documentation/steps/sequencing/sequencing-requirements-for-single-cell-3) and [Sequencing Requirements for Single Cell V(D)J](https://www.10xgenomics.com/cn/support/single-cell-immune-profiling/documentation/steps/sequencing/sequencing-requirements-for-single-cell-v-d-j). + +The returned result is a vector of failed sra files. If not `NULL`, users can re-run `SplitSRA` by setting `sra.path` to the returned result. + +```r +# parallel-fastq-dump requires sratools.path +# you may need to set split.cmd.path and sratools.path +sra.folder <- tempdir() +GSE130636.split <- SplitSRA( + sra.folder = sra.folder, + fastq.type = "10x", split.cmd.threads = 4 +) +``` + +
+ +#### Download bam + +##### Prepare run number + +`scfetch` can extract sample information and run number with GEO accession number or users can also provide a dataframe contains the run number of interested samples. + +```r +GSE138266.runs <- ExtractRun(acce = "GSE138266", platform = "GPL18573") +``` + +
+ +##### Download bam + +With the dataframe containing gsm and run number, `scfetch` provides `DownloadBam` to download bam files using `prefetch`. It supports 10x generated bam files and normal bam files. + +* 10x generated bam: While bam files generated from 10x softwares (e.g. CellRanger) contain custom tags which are not kept when using default parameters of `prefetch`, `scfetch` adds `--type TenX` to make sure the downloaded bam files contain these tags. +* normal bam: For normal bam files, `DownloadBam` will download sra files first and then convert sra files to bam files with `sam-dump`. After testing the efficiency of `prefetch` + `sam-dump` and `sam-dump`, the former is much faster than the latter (52G sra and 72G bam files): +```bash +# # use prefetch to download sra file +# prefetch -X 60G SRR1976036 +# # real 117m26.334s +# # user 16m42.062s +# # sys 3m28.295s + +# # use sam-dump to convert sra to bam +# time (sam-dump SRR1976036.sra | samtools view -bS - -o SRR1976036.bam) +# # real 536m2.721s +# # user 749m41.421s +# # sys 20m49.069s + + +# use sam-dump to download bam directly +# time (sam-dump SRR1976036 | samtools view -bS - -o SRR1976036.bam) +# # more than 36hrs only get ~3G bam files, too slow +``` + +The returned result is a dataframe containing failed runs (either failed to download sra files or failed to convert to bam files for normal bam; failed to download bam files for 10x generated bam). If not `NULL`, users can re-run `DownloadBam` by setting `gsm.df` to the returned result. The following is an example to download 10x generated bam file: + +```r +# a small test +GSE138266.runs <- GSE138266.runs[GSE138266.runs$run %in% c("SRR10211566"), ] +# download, you may need to set prefetch.path +out.folder <- tempdir() +GSE138266.down <- DownloadBam( + gsm.df = GSE138266.runs, + out.folder = out.folder +) +# GSE138266.down is null or dataframe contains failed runs +``` + +The `out.folder` structure will be: `gsm_number/run_number`. + +
+ +##### Convert bam to fastq + +With downloaded bam files, `scfetch` provides function `Bam2Fastq` to convert bam files to fastq files. For bam files generated from 10x softwares, `Bam2Fastq` utilizes `bamtofastq` tool developed by 10x Genomics. + +The returned result is a vector of bam files failed to convert to fastq files. If not `NULL`, users can re-run `Bam2Fastq` by setting `bam.path` to the returned result. + +```r +bam.folder <- tempdir() +# you may need to set bamtofastq.path and bamtofastq.paras +GSE138266.convert <- Bam2Fastq( + bam.folder = bam.folder +) +``` + +
+ +### Download count matrix + +`scfetch` provides functions for users to download **count matrices** and **annotations** (e.g. cell type annotation and composition) from GEO and some single-cell databases (e.g. [PanglaoDB](https://panglaodb.se/index.html) and [UCSC Cell Browser](https://cells.ucsc.edu/?#)). `scfetch` also supports loading the downloaded data to `Seurat`. + +Until now, the public resources supported and the returned results: + +| Resources | URL | Download Type | Returned results | +|-------------------|-----------------------------------|---------------|-----------------------------------------------| +| GEO | https://www.ncbi.nlm.nih.gov/geo/ | count matrix | SeuratObject or count matrix for bulk RNA-seq | +| PanglaoDB | https://panglaodb.se/index.html | count matrix | SeuratObject | +| UCSC Cell Browser | https://cells.ucsc.edu/ | count matrix | SeuratObject | + +
+ +#### GEO + +[GEO is an international public repository that archives and freely distributes microarray, next-generation sequencing, and other forms of high-throughput functional genomics data submitted by the research community.](https://www.ncbi.nlm.nih.gov/geo/info/overview.html) It provides a very convenient way for users to explore and select interested scRNA-seq datasets. + +##### Extract metadata + +`scfetch` provides `ExtractGEOMeta` to extract sample metadata, including sample title, source name/tissue, description, cell type, treatment, paper title, paper abstract, organism, protocol, data processing methods, et al. + +```r +# extract metadata of specified platform +GSE200257.meta <- ExtractGEOMeta(acce = "GSE200257", platform = "GPL24676") +# set VROOM_CONNECTION_SIZE to avoid error: Error: The size of the connection buffer (786432) was not large enough +Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 60) +# extract metadata of all platforms +GSE94820.meta <- ExtractGEOMeta(acce = "GSE94820", platform = NULL) +``` + +
+ +##### Download matrix and load to Seurat + +After manually checking the extracted metadata, users can **download count matrix** and **load the count matrix** to Seurat with `ParseGEO`. + +For count matrix, `ParseGEO` supports downloading the matrix from **supplementary files** and extracting from `ExpressionSet`, users can control the source by specifying `down.supp` or detecting automatically (`ParseGEO` will extract the count matrix from `ExpressionSet` first, if the count matrix is NULL or contains non-integer values, `ParseGEO` will download supplementary files). While the supplementary files have two main types: single count matrix file containing all cells and CellRanger-style outputs (barcode, matrix, feature/gene), users are required to choose the type of supplementary files with `supp.type`. + +With the count matrix, `ParseGEO` will load the matrix to Seurat automatically. If multiple samples are available, users can choose to merge the SeuratObject with `merge`. + +```r +# for cellranger output +out.folder <- tempdir() +GSE200257.seu <- ParseGEO( + acce = "GSE200257", platform = NULL, supp.idx = 1, down.supp = TRUE, supp.type = "10x", + out.folder = out.folder +) +# for count matrix, no need to specify out.folder, download count matrix to tmp folder +GSE94820.seu <- ParseGEO(acce = "GSE94820", platform = NULL, supp.idx = 1, down.supp = TRUE, supp.type = "count") +``` + +**For bulk RNA-seq**, set `data.type = "bulk"` in `ParseGEO`, this will return count matrix. + +
+ +#### PanglaoDB + +[PanglaoDB](https://panglaodb.se/index.html) is a database which contains scRNA-seq datasets from mouse and human. Up to now, it contains **5,586,348 cells** from **1368 datasets (1063 from Mus musculus and 305 from Homo sapiens)**. It has well organized metadata for every dataset, including tissue, protocol, species, number of cells and cell type annotation (computationally identified). Daniel Osorio has developed [rPanglaoDB](https://github.com/dosorio/rPanglaoDB/) to access [PanglaoDB](https://panglaodb.se/index.html) data, the functions of `scfetch` here are based on [rPanglaoDB](https://github.com/dosorio/rPanglaoDB/). + +Since [PanglaoDB](https://panglaodb.se/about.html) is no longer maintained, `scfetch` has cached all metadata and cell type composition and use these cached data by default to accelerate, users can access the cached data with `PanglaoDBMeta` (all metadata) and `PanglaoDBComposition` (all cell type composition). + +##### Summarise attributes + +`scfetch` provides `StatDBAttribute` to summarize attributes of [PanglaoDB](https://panglaodb.se/index.html): + +```r +# use cached metadata +StatDBAttribute(df = PanglaoDBMeta, filter = c("species", "protocol"), database = "PanglaoDB") +``` + +
+ +##### Extract metadata + +`scfetch` provides `ExtractPanglaoDBMeta` to select datasets of interest with specified **species**, **protocol**, **tissue** and **cell number** (The available values of these attributes can be obtained with `StatDBAttribute`). Users can also choose whether to add cell type annotation to every dataset with `show.cell.type`. + +`scfetch` uses cached metadata and cell type composition by default; users can change this by setting `local.data = FALSE`. + +```r +hsa.meta <- ExtractPanglaoDBMeta( + species = "Homo sapiens", protocol = c("Smart-seq2", "10x chromium"), + show.cell.type = TRUE, cell.num = c(1000, 2000) +) +``` + +
+ +##### Extract cell type composition + +`scfetch` provides `ExtractPanglaoDBComposition` to extract cell type annotation and composition (use cached data by default to accelerate, users can change this by setting `local.data = FALSE`). + +```r +hsa.composition <- ExtractPanglaoDBComposition(species = "Homo sapiens", protocol = c("Smart-seq2", "10x chromium")) +``` + +
+ +##### Download matrix and load to Seurat + +After manually checking the extracted metadata, `scfetch` provides `ParsePanglaoDB` to **download count matrix** and **load the count matrix** to Seurat. With available cell type annotation, users can filter datasets without specified cell type with `cell.type`. Users can also include/exclude cells expressing specified genes with `include.gene`/`exclude.gene`. + +With the count matrix, `ParsePanglaoDB` will load the matrix to Seurat automatically. If multiple datasets are available, users can choose to merge the SeuratObject with `merge`. + +```r +# small test +hsa.seu <- ParsePanglaoDB(hsa.meta[1:3, ], merge = TRUE) +``` + +
+ +#### UCSC Cell Browser + +The [UCSC Cell Browser](https://cells.ucsc.edu/?#) is a web-based tool that allows scientists to interactively visualize scRNA-seq datasets. It contains **1040 single cell datasets** from **17 different species**. And, it is **organized with the hierarchical structure**, which can help users quickly locate the datasets they are interested in. + +##### Show available datasets + +`scfetch` provides `ShowCBDatasets` to show all available datasets. Due to the large number of datasets, `ShowCBDatasets` enables users to perform *lazy load* of dataset json files instead of downloading the json files online (time-consuming!!!). This *lazy load* requires users to provide `json.folder` to save json files and set `lazy = TRUE` (for the first time of run, `ShowCBDatasets` will download current json files to `json.folder`, for next time of run, with `lazy = TRUE`, `ShowCBDatasets` will load the downloaded json files from `json.folder`.). And, `ShowCBDatasets` supports updating the local datasets with `update = TRUE`. + +```r +json.folder <- tempdir() +# first time run, the json files are stored under json.folder +# ucsc.cb.samples = ShowCBDatasets(lazy = TRUE, json.folder = json.folder, update = TRUE) + +# second time run, load the downloaded json files +ucsc.cb.samples <- ShowCBDatasets(lazy = TRUE, json.folder = json.folder, update = FALSE) + +# always read online +# ucsc.cb.samples = ShowCBDatasets(lazy = FALSE) +``` + +The number of datasets and all available species: + +```r +# the number of datasets +nrow(ucsc.cb.samples) + +# available species +unique(unlist(sapply(unique(gsub(pattern = "\\|parent", replacement = "", x = ucsc.cb.samples$organisms)), function(x) { + unlist(strsplit(x = x, split = ", ")) +}))) +``` + +
+ +##### Summarise attributes + +`scfetch` provides `StatDBAttribute` to summarize attributes of [UCSC Cell Browser](https://cells.ucsc.edu/?#): + +```r +StatDBAttribute(df = ucsc.cb.samples, filter = c("organism", "organ"), database = "UCSC") +``` + +
+ +##### Extract metadata + +`scfetch` provides `ExtractCBDatasets` to filter metadata with **collection**, **sub-collection**, **organ**, **disease status**, **organism**, **project** and **cell number** (The available values of these attributes can be obtained with `StatDBAttribute` except **cell number**). All attributes except cell number support fuzzy matching with `fuzzy.match`, which is useful when selecting datasets. + +```r +hbb.sample.df <- ExtractCBDatasets(all.samples.df = ucsc.cb.samples, organ = c("brain", "blood"), organism = "Human (H. sapiens)", cell.num = c(1000, 2000)) +``` + +
+ +##### Extract cell type composition + +`scfetch` provides `ExtractCBComposition` to extract cell type annotation and composition. + +```r +hbb.sample.ct <- ExtractCBComposition(json.folder = json.folder, sample.df = hbb.sample.df) +``` + +
+ +##### Load the online datasets to Seurat + +After manually checking the extracted metadata, `scfetch` provides `ParseCBDatasets` to **load the online count matrix** to Seurat. All the attributes available in `ExtractCBDatasets` are also available here. Please note that the loading process provided by `ParseCBDatasets` will load the online count matrix instead of downloading it locally. If multiple datasets are available, users can choose to merge the SeuratObject with `merge`. + +```r +hbb.sample.seu <- ParseCBDatasets(sample.df = hbb.sample.df) +``` + +
+ +### Download object + +`scfetch` provides functions for users to download processed single-cell RNA-seq data from [Zenodo](https://zenodo.org/), [CELLxGENE](https://cellxgene.cziscience.com/) and [Human Cell Atlas](https://www.humancellatlas.org/), including `RDS`, `RData`, `h5ad`, `h5`, `loom` objects. + +Until now, the public resources supported and the returned results: + +| Resources | URL | Download Type | Returned results | +|------------------|-----------------------------------|----------------------------------------|-------------------------| +| Zenodo | https://zenodo.org/ | count matrix, rds, rdata, h5ad, et al. | NULL or failed datasets | +| CELLxGENE | https://cellxgene.cziscience.com/ | rds, h5ad | NULL or failed datasets | +| Human Cell Atlas | https://www.humancellatlas.org/ | rds, rdata, h5, h5ad, loom | NULL or failed projects | + +
+ +#### Zenodo + +[Zenodo](https://zenodo.org/) contains various types of processed objects, such as `SeuratObject` which has been clustered and annotated, `AnnData` which contains processed results generated by `scanpy`. + +##### Extract metadata + +`scfetch` provides `ExtractZenodoMeta` to extract dataset metadata, including dataset title, description, available files and corresponding md5. Please note that when the dataset is restricted access, the returned dataframe will be empty. + +```r +# single doi +zebrafish.df <- ExtractZenodoMeta(doi = "10.5281/zenodo.7243603") + +# vector dois +multi.dois <- ExtractZenodoMeta(doi = c("1111", "10.5281/zenodo.7243603", "10.5281/zenodo.7244441")) +``` + +
+ +##### Download object + +After manually check the extracted metadata, users can **download the specified objects** with `ParseZenodo`. The downloaded objects are controlled by `file.ext` and **the provided object formats should be in lower case (e.g. rds/rdata/h5ad).** + +The returned result is a dataframe containing failed objects. If not `NULL`, users can re-run `ParseZenodo` by setting `doi.df` to the returned result. + +```r +out.folder <- tempdir() +multi.dois.parse <- ParseZenodo( + doi = c("1111", "10.5281/zenodo.7243603", "10.5281/zenodo.7244441"), + file.ext = c("rdata", "rds"), out.folder = out.folder +) +``` + +
+ +#### CELLxGENE + +The [CELLxGENE](https://cellxgene.cziscience.com/) is a web server that contains **910** single-cell datasets; users can explore, download and upload their own datasets. The downloaded datasets provided by [CELLxGENE](https://cellxgene.cziscience.com/) have two formats: `h5ad (AnnData v0.8)` and `rds (Seurat v4)`. + +##### Show available datasets + +`scfetch` provides `ShowCELLxGENEDatasets` to extract dataset metadata, including dataset title, description, contact, organism, ethnicity, sex, tissue, disease, assay, suspension type, cell type, etc. + +```r +# all available datasets +all.cellxgene.datasets <- ShowCELLxGENEDatasets() +``` + +
+ +##### Summarise attributes + +`scfetch` provides `StatDBAttribute` to summarize attributes of [CELLxGENE](https://cellxgene.cziscience.com/): + +```r +StatDBAttribute(df = all.cellxgene.datasets, filter = c("organism", "sex"), database = "CELLxGENE") +``` + +
+ +##### Extract metadata + +`scfetch` provides `ExtractCELLxGENEMeta` to filter dataset metadata, the available values of attributes can be obtained with `StatDBAttribute` except **cell number**: + +```r +# human 10x v2 and v3 datasets +human.10x.cellxgene.meta <- ExtractCELLxGENEMeta( + all.samples.df = all.cellxgene.datasets, + assay = c("10x 3' v2", "10x 3' v3"), organism = "Homo sapiens" +) +``` + +
+ +##### Download object + +After manually check the extracted metadata, users can **download the specified objects** with `ParseCELLxGENE`. The downloaded objects are controlled by `file.ext` (choose from `"rds"` and `"h5ad"`). + +The returned result is a dataframe containing failed datasets. If not `NULL`, users can re-run `ParseCELLxGENE` by setting `meta` to the returned result. + +```r +out.folder <- tempdir() +ParseCELLxGENE( + meta = human.10x.cellxgene.meta[1:5, ], file.ext = "rds", + out.folder = out.folder +) +``` + +
+ +### Format conversion + +Many tools have been developed to process scRNA-seq data, such as [Scanpy](https://scanpy.readthedocs.io/en/stable/), [Seurat](https://satijalab.org/seurat/), [scran](https://bioconductor.org/packages/release/bioc/html/scran.html) and [Monocle](http://cole-trapnell-lab.github.io/monocle-release/). These tools have their own objects, such as `Anndata` of `Scanpy`, `SeuratObject` of `Seurat`, `SingleCellExperiment` of `scran` and `CellDataSet`/`cell_data_set` of `Monocle2`/`Monocle3`. There are also some file formats designed for large omics datasets, such as [loom](http://loompy.org/). To perform a comprehensive scRNA-seq data analysis, we usually need to combine multiple tools, which means we need to perform object conversion frequently. To facilitate user analysis of scRNA-seq data, `scfetch` provides multiple functions to perform object conversion between widely used tools and formats. The object conversion implemented in `scfetch` has two main advantages: + +* **one-step conversion between different objects**. There will be no conversion to intermediate objects, thus preventing unnecessary information loss. +* **tools used for object conversion are developed by the team of the source/destination object as far as possible**. For example, we use `SeuratDisk` to convert SeuratObject to loom, use `zellkonverter` to perform conversion between `SingleCellExperiment` and `Anndata`. When there are no such tools, we use `sceasy` to perform conversion. + +
+ +#### Test data + +```r +# library +library(Seurat) # pbmc_small +library(scRNAseq) # seger +``` + +`SeuratObject`: + +```r +# object +pbmc_small +``` + +`SingleCellExperiment`: +```r +seger <- scRNAseq::SegerstolpePancreasData() +``` + +
+ +#### Convert SeuratObject to other objects + +Here, we will convert SeuratObject to `SingleCellExperiment`, `CellDataSet`/`cell_data_set`, `Anndata`, `loom`. + +##### SeuratObject to SingleCellExperiment + +The conversion is performed with functions implemented in `Seurat`: +```r +sce.obj <- ExportSeurat(seu.obj = pbmc_small, assay = "RNA", to = "SCE") +``` + +
+ +##### SeuratObject to CellDataSet/cell_data_set + +To `CellDataSet` (The conversion is performed with functions implemented in `Seurat`): + +```r +# BiocManager::install("monocle") # require monocle +cds.obj <- ExportSeurat(seu.obj = pbmc_small, assay = "RNA", reduction = "tsne", to = "CellDataSet") +``` + +To `cell_data_set` (The conversion is performed with functions implemented in `SeuratWrappers`): + +```r +# remotes::install_github('cole-trapnell-lab/monocle3') # require monocle3 +cds3.obj <- ExportSeurat(seu.obj = pbmc_small, assay = "RNA", to = "cell_data_set") +``` + +
+ +##### SeuratObject to AnnData + +`AnnData` is a Python object, `reticulate` is used to communicate between Python and R. User should create a Python environment which contains `anndata` package and specify the environment path with `conda.path` to ensure the exact usage of this environment. + +The conversion is performed with functions implemented in `sceasy`: +```r +# remove pbmc_small.h5ad first +anndata.file <- tempfile(pattern = "pbmc_small_", fileext = ".h5ad") +# you may need to set conda.path +ExportSeurat( + seu.obj = pbmc_small, assay = "RNA", to = "AnnData", + anndata.file = anndata.file +) +``` + +
+ +##### SeuratObject to loom + +The conversion is performed with functions implemented in `SeuratDisk`: +```r +loom.file <- tempfile(pattern = "pbmc_small_", fileext = ".loom") +ExportSeurat( + seu.obj = pbmc_small, assay = "RNA", to = "loom", + loom.file = loom.file +) +``` + +
+ +#### Convert other objects to SeuratObject + +##### SingleCellExperiment to SeuratObject + +The conversion is performed with functions implemented in `Seurat`: +```r +seu.obj.sce <- ImportSeurat(obj = sce.obj, from = "SCE", count.assay = "counts", data.assay = "logcounts", assay = "RNA") +``` + +
+ +##### CellDataSet/cell_data_set to SeuratObject + +`CellDataSet` to `SeuratObject` (The conversion is performed with functions implemented in `Seurat`): +```r +seu.obj.cds <- ImportSeurat(obj = cds.obj, from = "CellDataSet", count.assay = "counts", assay = "RNA") +``` + +`cell_data_set` to `SeuratObject` (The conversion is performed with functions implemented in `Seurat`): +```r +seu.obj.cds3 <- ImportSeurat(obj = cds3.obj, from = "cell_data_set", count.assay = "counts", data.assay = "logcounts", assay = "RNA") +``` + +
+ +##### AnnData to SeuratObject + +`AnnData` is a Python object, `reticulate` is used to communicate between Python and R. User should create a Python environment which contains `anndata` package and specify the environment path with `conda.path` to ensure the exact usage of this environment. + +The conversion is performed with functions implemented in `sceasy`: +```r +# you may need to set conda.path +seu.obj.h5ad <- ImportSeurat( + anndata.file = anndata.file, from = "AnnData", assay = "RNA" +) +``` + +
+ +##### loom to SeuratObject + +The conversion is performed with functions implemented in `SeuratDisk` and `Seurat`: + +```r +# loom will lose reduction +seu.obj.loom <- ImportSeurat(loom.file = loom.file, from = "loom") +``` + +
+ +#### Conversion between SingleCellExperiment and AnnData + +The conversion is performed with functions implemented in `zellkonverter`. + +##### SingleCellExperiment to AnnData + +```r +# remove seger.h5ad first +seger.anndata.file <- tempfile(pattern = "seger_", fileext = ".h5ad") +SCEAnnData( + from = "SingleCellExperiment", to = "AnnData", sce = seger, X_name = "counts", + anndata.file = seger.anndata.file +) +``` + +
+ +##### AnnData to SingleCellExperiment + +```r +seger.anndata <- SCEAnnData( + from = "AnnData", to = "SingleCellExperiment", + anndata.file = seger.anndata.file +) +``` + +
+ +#### Conversion between SingleCellExperiment and loom + +The conversion is performed with functions implemented in `LoomExperiment`. + +##### SingleCellExperiment to loom + +```r +# remove seger.loom first +seger.loom.file <- tempfile(pattern = "seger_", fileext = ".loom") +SCELoom( + from = "SingleCellExperiment", to = "loom", sce = seger, + loom.file = seger.loom.file +) +``` + +
+ +##### loom to SingleCellExperiment + +```r +seger.loom <- SCELoom( + from = "loom", to = "SingleCellExperiment", + loom.file = seger.loom.file +) +``` + +
+ + ## Contact -For any question, feature request or bug report please write an email to [songyb0519@gmail.com](songyb0519@gmail.com). +For any question, feature request or bug report please write an email to `songyb0519@gmail.com`.
diff --git a/man/Bam2Fastq.Rd b/man/Bam2Fastq.Rd index e79a925..5b0622b 100644 --- a/man/Bam2Fastq.Rd +++ b/man/Bam2Fastq.Rd @@ -40,10 +40,17 @@ NULL or paths of failed bams. Convert bam files to fastq files. } \examples{ -# GSE138266.runs = ExtractRun(acce = "GSE138266", platform = "GPL18573") -# GSE138266.down = DownloadBam(gsm.df = GSE138266.runs, prefetch.path = "/path/to/prefetch", -# out.folder = "/path/to/output") -# GSE138266.convert = Bam2Fastq(bam.folder = "/path/to/output", -# bamtofastq.path = "/path/to/bamtofastq_linux or samtools", -# bamtofastq.paras = "--nthreads 4") +\dontrun{ +# need users to provide prefetch.path and bamtofastq.path +GSE138266.runs <- ExtractRun(acce = "GSE138266", platform = "GPL18573") +GSE138266.down <- DownloadBam( + gsm.df = GSE138266.runs, prefetch.path = "/path/to/prefetch", + out.folder = "/path/to/output" +) +GSE138266.convert <- Bam2Fastq( + bam.folder = "/path/to/output", + bamtofastq.path = "/path/to/bamtofastq_linux or samtools", + bamtofastq.paras = "--nthreads 4" +) +} } diff --git a/man/DownloadBam.Rd b/man/DownloadBam.Rd index abed023..1e1e4a8 100644 --- a/man/DownloadBam.Rd +++ b/man/DownloadBam.Rd @@ -36,8 +36,13 @@ Dataframe contains failed runs or NULL. Download bam. } \examples{ -# GSE138266.runs = ExtractRun(acce = "GSE138266", platform = "GPL18573") -# GSE138266.down = DownloadBam(gsm.df = GSE138266.runs, bam.type = "10x", -# prefetch.path = "/path/to/prefetch", -# out.folder = "/path/to/output") +\dontrun{ +# need users to provide prefetch.path +GSE138266.runs <- ExtractRun(acce = "GSE138266", platform = "GPL18573") +GSE138266.down <- DownloadBam( + gsm.df = GSE138266.runs, bam.type = "10x", + prefetch.path = "/path/to/prefetch", + out.folder = "/path/to/output" +) +} } diff --git a/man/DownloadSRA.Rd b/man/DownloadSRA.Rd index 609b56e..fe1eeca 100644 --- a/man/DownloadSRA.Rd +++ b/man/DownloadSRA.Rd @@ -27,7 +27,12 @@ Dataframe contains failed runs or NULL. Download SRA. 
} \examples{ -# GSE186003.runs = ExtractRun(acce = "GSE186003", platform = "GPL24247") -# GSE186003.down = DownloadSRA(gsm.df = GSE186003.runs, prefetch.path = "/path/to/prefetch", -# out.folder = "/path/to/output") +\dontrun{ +# need users to provide the prefetch.path and out.folder +GSE186003.runs <- ExtractRun(acce = "GSE186003", platform = "GPL24247") +GSE186003.down <- DownloadSRA( + gsm.df = GSE186003.runs, prefetch.path = "/path/to/prefetch", + out.folder = "/path/to/output" +) +} } diff --git a/man/ExportSeurat.Rd b/man/ExportSeurat.Rd index 8d8b200..92e0ae3 100644 --- a/man/ExportSeurat.Rd +++ b/man/ExportSeurat.Rd @@ -32,7 +32,7 @@ Default: "SCE".} \item{conda.path}{Conda environment path, used when \code{to} is "AnnData". Default: NULL.} \item{...}{Parameter for \code{\link{as.SingleCellExperiment}}, \code{sceasy::convertFormat}, \code{\link{as.CellDataSet}}, -\code{\link{as.cell_data_set}}, \code{\link{SaveLoom}}, corresponding to \code{to}.} +\code{as.cell_data_set}, \code{SaveLoom}, corresponding to \code{to}.} } \value{ Object corresponding to \code{to}. @@ -40,3 +40,24 @@ Object corresponding to \code{to}. \description{ Export SeuratObject to Other Formats. 
} +\examples{ +\dontrun{ +library(Seurat) +# export to SingleCellExperiment +sce.obj <- ExportSeurat(seu.obj = pbmc_small, assay = "RNA", to = "SCE") +# export to CellDataSet +cds.obj <- ExportSeurat(seu.obj = pbmc_small, assay = "RNA", reduction = "tsne", to = "CellDataSet") +# export to cell_data_set +cds3.obj <- ExportSeurat(seu.obj = pbmc_small, assay = "RNA", to = "cell_data_set") +# export to AnnData, need users to provide the conda path and the output file +ExportSeurat( + seu.obj = pbmc_small, assay = "RNA", to = "AnnData", conda.path = "/path/to/anaconda3", + anndata.file = "/path/to/pbmc_small.h5ad" +) +# export to loom, need users to provide the output file +ExportSeurat( + seu.obj = pbmc_small, assay = "RNA", to = "loom", + loom.file = "/path/to/pbmc_small.loom" +) +} +} diff --git a/man/ExtractCBComposition.Rd b/man/ExtractCBComposition.Rd index 662d1a2..be5a369 100644 --- a/man/ExtractCBComposition.Rd +++ b/man/ExtractCBComposition.Rd @@ -56,10 +56,14 @@ Dataframe contains sample information and cell type composition. Extract Cell Type Composition of UCSC Cell Browser Datasets. } \examples{ -# # lazy mode, load datasets json files locally -# ucsc.cb.samples = ShowCBDatasets(lazy = TRUE, json.folder = NULL, update = FALSE) -# # cell number is between 1000 and 2000 -# hbb.sample.df = ExtractCBDatasets(all.samples.df = ucsc.cb.samples, organ = c("brain", "blood"), -# organism = "Human (H. sapiens)", cell.num = c(1000,2000)) -# hbb.sample.ct = ExtractCBComposition(json.folder = NULL, sample.df = hbb.sample.df) +\dontrun{ +# lazy mode, load datasets json files locally, need users to provide json folder +ucsc.cb.samples <- ShowCBDatasets(lazy = TRUE, json.folder = NULL, update = FALSE) +# cell number is between 1000 and 2000 +hbb.sample.df <- ExtractCBDatasets( + all.samples.df = ucsc.cb.samples, organ = c("brain", "blood"), + organism = "Human (H. 
sapiens)", cell.num = c(1000, 2000) +) +hbb.sample.ct <- ExtractCBComposition(json.folder = NULL, sample.df = hbb.sample.df) +} } diff --git a/man/ExtractCBDatasets.Rd b/man/ExtractCBDatasets.Rd index 3e23a28..d2c3ba2 100644 --- a/man/ExtractCBDatasets.Rd +++ b/man/ExtractCBDatasets.Rd @@ -49,9 +49,13 @@ Dataframe contains filtered datasets. Extract UCSC Cell Browser Datasets with Attributes. } \examples{ -# # lazy mode, load datasets json files locally -# ucsc.cb.samples = ShowCBDatasets(lazy = TRUE, json.folder = NULL, update = FALSE) -# # cell number is between 1000 and 2000 -# hbb.sample.df = ExtractCBDatasets(all.samples.df = ucsc.cb.samples, organ = c("brain", "blood"), -# organism = "Human (H. sapiens)", cell.num = c(1000,2000)) +\dontrun{ +# lazy mode, load datasets json files locally, need users to provide json folder +ucsc.cb.samples <- ShowCBDatasets(lazy = TRUE, json.folder = NULL, update = FALSE) +# cell number is between 1000 and 2000 +hbb.sample.df <- ExtractCBDatasets( + all.samples.df = ucsc.cb.samples, organ = c("brain", "blood"), + organism = "Human (H. sapiens)", cell.num = c(1000, 2000) +) +} } diff --git a/man/ExtractCELLxGENEMeta.Rd b/man/ExtractCELLxGENEMeta.Rd index ca8b10d..bf738a1 100644 --- a/man/ExtractCELLxGENEMeta.Rd +++ b/man/ExtractCELLxGENEMeta.Rd @@ -55,12 +55,16 @@ Dataframe contains filtered datasets. Extract Metadata of CELLxGENE Datasets with Attributes. 
} \examples{ -# # all available datasets -# all.cellxgene.datasets = ShowCELLxGENEDatasets() -# # human 10x v2 and v3 datasets -# human.10x.cellxgene.meta = ExtractCELLxGENEMeta(all.samples.df = all.cellxgene.datasets, -# assay = c("10x 3' v2", "10x 3' v3"), -# organism = "Homo sapiens") +\donttest{ +# all available datasets +all.cellxgene.datasets <- ShowCELLxGENEDatasets() +# human 10x v2 and v3 datasets +human.10x.cellxgene.meta <- ExtractCELLxGENEMeta( + all.samples.df = all.cellxgene.datasets, + assay = c("10x 3' v2", "10x 3' v3"), + organism = "Homo sapiens" +) +} } \references{ https://gist.github.com/ivirshup/f1a1603db69de3888eacb4bdb6a9317a diff --git a/man/ExtractGEOExp.Rd b/man/ExtractGEOExp.Rd index d89d59e..f2fcba1 100644 --- a/man/ExtractGEOExp.Rd +++ b/man/ExtractGEOExp.Rd @@ -41,9 +41,3 @@ Count matrix (\code{supp.type} is count) or NULL (\code{supp.type} is 10x). \description{ Extract Raw Count Matrix or Fortmat Supplementary Files to 10x. } -\examples{ -# pf.obj = GEOobj(acce = "GSE200257", platform = "GPL24676") -# count.mat = ExtractGEOExp(pf.obj, acce = "GSE200257", supp.idx = 1, -# down.supp = TRUE, supp.type = "10x", -# out.folder = "/path/to/output/folder") -} diff --git a/man/ExtractGEOExpSupp.Rd b/man/ExtractGEOExpSupp.Rd index 0e7dd97..67d27bc 100644 --- a/man/ExtractGEOExpSupp.Rd +++ b/man/ExtractGEOExpSupp.Rd @@ -19,12 +19,3 @@ A dataframe. \description{ Extract Raw Count Matrix from Supplementary Files. 
} -\examples{ -# for bulk rna-seq -# count.mat = ExtractGEOExpSupp(acce = "GSE149838") -# count.mat = ExtractGEOExpSupp(acce = "GSE147507") -# count.mat = ExtractGEOExpSupp(acce = "GSE147507", supp.idx = 2) -# count.mat = ExtractGEOExpSupp(acce = "GSE122774") -# # for single cell matrix -# count.mat = ExtractGEOExpSupp(acce = "GSE94820") -} diff --git a/man/ExtractGEOExpSupp10x.Rd b/man/ExtractGEOExpSupp10x.Rd index a6ae912..e07af86 100644 --- a/man/ExtractGEOExpSupp10x.Rd +++ b/man/ExtractGEOExpSupp10x.Rd @@ -27,7 +27,3 @@ Default: TURE.} \description{ Fortmat Supplementary Files to 10x. } -\examples{ -# ExtractGEOExpSupp10x(acce = "GSE200257", out.folder = '/path/to/output') -# ExtractGEOExpSupp10x(acce = "GSE226160", out.folder = '/path/to/output') -} diff --git a/man/ExtractGEOExpSuppAll.Rd b/man/ExtractGEOExpSuppAll.Rd index d686b7e..e2a137b 100644 --- a/man/ExtractGEOExpSuppAll.Rd +++ b/man/ExtractGEOExpSuppAll.Rd @@ -34,7 +34,3 @@ Count matrix (\code{supp.type} is count) or NULL (\code{supp.type} is 10x). \description{ Extract Raw Count Matrix from Supplementary Files or Fortmat Supplementary Files to 10x. } -\examples{ -# exp.data = ExtractGEOExpSuppAll(acce = "GSE200257", supp.idx = 1, supp.type = "10x", -# out.folder = "/path/to/output/folder") -} diff --git a/man/ExtractGEOInfo.Rd b/man/ExtractGEOInfo.Rd index be0e6e7..525751f 100644 --- a/man/ExtractGEOInfo.Rd +++ b/man/ExtractGEOInfo.Rd @@ -17,7 +17,3 @@ A dataframe. \description{ Extract GEO Study Information. } -\examples{ -# pf.obj = GEOobj(acce = "GSE94820", platform = "GPL16791") -# pf.info = ExtractGEOInfo(pf.obj) -} diff --git a/man/ExtractGEOMeta.Rd b/man/ExtractGEOMeta.Rd index 2679564..e0552d7 100644 --- a/man/ExtractGEOMeta.Rd +++ b/man/ExtractGEOMeta.Rd @@ -20,8 +20,10 @@ Dataframe contains all metadata of provided GEO accession number. Extract Sample Metadata from GEO. 
} \examples{ -# # extract metadata of specified platform -# GSE200257.meta = ExtractGEOMeta(acce = "GSE200257", platform = "GPL24676") -# # extract metadata of all platforms -# GSE94820.meta = ExtractGEOMeta(acce = "GSE94820", platform = NULL) +\donttest{ +# users may need to set the size of the connection buffer +# Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 60) +# extract metadata of specified platform +GSE200257.meta <- ExtractGEOMeta(acce = "GSE200257", platform = "GPL24676") +} } diff --git a/man/ExtractGEOSubMeta.Rd b/man/ExtractGEOSubMeta.Rd index 134480a..a073d03 100644 --- a/man/ExtractGEOSubMeta.Rd +++ b/man/ExtractGEOSubMeta.Rd @@ -15,7 +15,3 @@ A dataframe. \description{ Extract Sample Metadata. } -\examples{ -# pf.obj = GEOobj(acce = "GSE94820", platform = "GPL16791") -# pf.info = ExtractGEOSubMeta(pf.obj) -} diff --git a/man/ExtractHCAMeta.Rd b/man/ExtractHCAMeta.Rd index b16d590..0482b1e 100644 --- a/man/ExtractHCAMeta.Rd +++ b/man/ExtractHCAMeta.Rd @@ -66,14 +66,18 @@ Dataframe contains filtered projects. Extract Metadata of Human Cell Atlas Projects with Attributes. 
} \examples{ -# # all available projects -# all.hca.projects = ShowHCAProjects() -# # all human projects -# all.human.projects = ExtractHCAMeta(all.projects.df = all.hca.projects, organism = "Homo sapiens") -# # all human and 10x 3' v2 -# all.human.10x.projects = ExtractHCAMeta(all.projects.df = all.hca.projects, -# organism = "Homo sapiens", -# protocol = c("10x 3' v2", "10x 3' v3")) +\donttest{ +# all available projects +all.hca.projects <- ShowHCAProjects() +# all human projects +all.human.projects <- ExtractHCAMeta(all.projects.df = all.hca.projects, organism = "Homo sapiens") +# all human and 10x 3' v2 +all.human.10x.projects <- ExtractHCAMeta( + all.projects.df = all.hca.projects, + organism = "Homo sapiens", + protocol = c("10x 3' v2", "10x 3' v3") +) +} } \references{ https://bioconductor.org/packages/release/bioc/html/hca.html diff --git a/man/ExtractPanglaoDBComposition.Rd b/man/ExtractPanglaoDBComposition.Rd index 83dbb91..aa7423c 100644 --- a/man/ExtractPanglaoDBComposition.Rd +++ b/man/ExtractPanglaoDBComposition.Rd @@ -37,6 +37,10 @@ Dataframe contains sample metadata, cluster, cell number and cell type informati Extract Cell Type Composition of PanglaoDB Datasets. } \examples{ -# human.composition = ExtractPanglaoDBComposition(species = "Homo sapiens", -# protocol = c("Smart-seq2", "10x chromium")) +\donttest{ +human.composition <- ExtractPanglaoDBComposition( + species = "Homo sapiens", + protocol = c("Smart-seq2", "10x chromium") +) +} } diff --git a/man/ExtractPanglaoDBMeta.Rd b/man/ExtractPanglaoDBMeta.Rd index c63531c..e789c4f 100644 --- a/man/ExtractPanglaoDBMeta.Rd +++ b/man/ExtractPanglaoDBMeta.Rd @@ -34,7 +34,11 @@ Dataframe contains SRA, SRS, Tissue, Protocol, Species, Cells, CellType (inferre Extract Metadata of scRNA-seq Datasets in PanglaoDB. 
} \examples{ -# human.meta = ExtractPanglaoDBMeta(species = "Homo sapiens", -# protocol = c("Smart-seq2", "10x chromium"), -# cell.num = c(1000,2000)) +\donttest{ +human.meta <- ExtractPanglaoDBMeta( + species = "Homo sapiens", + protocol = c("Smart-seq2", "10x chromium"), + cell.num = c(1000, 2000) +) +} } diff --git a/man/ExtractRun.Rd b/man/ExtractRun.Rd index 3db3c20..8be675f 100644 --- a/man/ExtractRun.Rd +++ b/man/ExtractRun.Rd @@ -25,5 +25,7 @@ Dataframe contains GSM and Runs. Extract Runs with GEO Accession Number or GSM Number. } \examples{ -# GSE186003.runs = ExtractRun(acce = "GSE186003", platform = "GPL24247") +\dontrun{ +GSE186003.runs <- ExtractRun(acce = "GSE186003", platform = "GPL24247", parallel = FALSE) +} } diff --git a/man/ExtractZenodoMeta.Rd b/man/ExtractZenodoMeta.Rd index 3adfe61..df40d5b 100644 --- a/man/ExtractZenodoMeta.Rd +++ b/man/ExtractZenodoMeta.Rd @@ -18,9 +18,13 @@ Dataframe contains files with valid extension in given Zenodo DOI. Prepare Dataframe with Zenodo DOIs. } \examples{ -# zebrafish.df = ExtractZenodoMeta(doi = "10.5281/zenodo.7243603") -# ExtractZenodoMeta(doi = "10.5281/zenodo.48065") # Restricted Access -# # vector of dois -# multi.dois = ExtractZenodoMeta(doi = c("1111", "10.5281/zenodo.7243603", -# "10.5281/zenodo.7244441")) +\donttest{ +zebrafish.df <- ExtractZenodoMeta(doi = "10.5281/zenodo.7243603") +ExtractZenodoMeta(doi = "10.5281/zenodo.48065") # Restricted Access +# vector of dois +multi.dois <- ExtractZenodoMeta(doi = c( + "1111", "10.5281/zenodo.7243603", + "10.5281/zenodo.7244441" +)) +} } diff --git a/man/ImportSeurat.Rd b/man/ImportSeurat.Rd index 8604e23..ce71e49 100644 --- a/man/ImportSeurat.Rd +++ b/man/ImportSeurat.Rd @@ -49,16 +49,22 @@ A Seurat object. Convert Other Formats to SeuratObject. 
} \examples{ +\dontrun{ # import data from SingleCellExperiment -# seu.obj = ImportSeurat(obj=sce.obj, from="SCE", count.assay="counts", -# data.assay="logcounts", assay="RNA") +seu.obj <- ImportSeurat( + obj = sce.obj, from = "SCE", count.assay = "counts", + data.assay = "logcounts", assay = "RNA" +) # import data from CellDataSet -# seu.obj = ImportSeurat(obj=cds.obj, from="CellDataSet", count.assay="counts", assay = "RNA") +seu.obj <- ImportSeurat(obj = cds.obj, from = "CellDataSet", count.assay = "counts", assay = "RNA") # import data from cell_data_set -# seu.obj = ImportSeurat(obj=sce.obj, from="cell_data_set", count.assay="counts", -# data.assay="logcounts", assay="RNA") -# import data from AnnData -# seu.obj = ImportSeurat(anndata.file = 'path/to/h5ad', from="AnnData", assay = "RNA") -# import data from loom -# seu.obj = ImportSeurat(loom.file = 'path/to/loom', from="loom") +seu.obj <- ImportSeurat( + obj = sce.obj, from = "cell_data_set", count.assay = "counts", + data.assay = "logcounts", assay = "RNA" +) +# import data from AnnData, need users to provide the file for conversion +seu.obj <- ImportSeurat(anndata.file = "path/to/h5ad", from = "AnnData", assay = "RNA") +# import data from loom, need users to provide the file for conversion +seu.obj <- ImportSeurat(loom.file = "path/to/loom", from = "loom") +} } diff --git a/man/ParseCBDatasets.Rd b/man/ParseCBDatasets.Rd index d1f0ae2..98ebfc6 100644 --- a/man/ParseCBDatasets.Rd +++ b/man/ParseCBDatasets.Rd @@ -58,13 +58,17 @@ Seurat object (if \code{merge} is TRUE) or list of Seurat objects (if \code{merg Download UCSC Cell Browser Datasets. } \examples{ -# # lazy mode, load datasets json files locally -# ucsc.cb.samples = ShowCBDatasets(lazy = TRUE, json.folder = NULL, update = FALSE) -# # cell number is between 1000 and 2000 -# hbb.sample.df = ExtractCBDatasets(all.samples.df = ucsc.cb.samples, organ = c("brain", "blood"), -# organism = "Human (H. 
sapiens)", cell.num = c(1000,2000)) -# hbb.sample.seu = ParseCBDatasets(sample.df = hbb.sample.df) -# # test 10x and matrix load -# complex.df = ucsc.cb.samples[c(1, 927, 379), ] # two 10x and one matrix -# complex.seu.list = ParseCBDatasets(sample.df = test.df, merge = F) +\dontrun{ +# lazy mode, load datasets json files locally, need users to provide json folder +ucsc.cb.samples <- ShowCBDatasets(lazy = TRUE, json.folder = NULL, update = FALSE) +# cell number is between 1000 and 2000 +hbb.sample.df <- ExtractCBDatasets( + all.samples.df = ucsc.cb.samples, organ = c("brain", "blood"), + organism = "Human (H. sapiens)", cell.num = c(1000, 2000) +) +hbb.sample.seu <- ParseCBDatasets(sample.df = hbb.sample.df) +# test 10x and matrix load +complex.df <- ucsc.cb.samples[c(1, 927, 379), ] # two 10x and one matrix +complex.seu.list <- ParseCBDatasets(sample.df = complex.df, merge = FALSE) +} } diff --git a/man/ParseCELLxGENE.Rd b/man/ParseCELLxGENE.Rd index 076afe2..f2bd0e6 100644 --- a/man/ParseCELLxGENE.Rd +++ b/man/ParseCELLxGENE.Rd @@ -35,14 +35,18 @@ Dataframe contains failed datasets or NULL. Download CELLxGENE Datasets.
} \examples{ -# # all available datasets -# all.cellxgene.datasets = ShowCELLxGENEDatasets() -# # human 10x v2 and v3 datasets -# human.10x.cellxgene.meta = ExtractCELLxGENEMeta(all.samples.df = all.cellxgene.datasets, -# assay = c("10x 3' v2", "10x 3' v3"), -# organism = "Homo sapiens") -# # download -# ParseCELLxGENE(meta = human.10x.cellxgene.meta, out.folder = "/path/to/output") +\dontrun{ +# all available datasets +all.cellxgene.datasets <- ShowCELLxGENEDatasets() +# human 10x v2 and v3 datasets +human.10x.cellxgene.meta <- ExtractCELLxGENEMeta( + all.samples.df = all.cellxgene.datasets, + assay = c("10x 3' v2", "10x 3' v3"), + organism = "Homo sapiens" +) +# download, need to provide the output folder +ParseCELLxGENE(meta = human.10x.cellxgene.meta, out.folder = "/path/to/output") +} } \references{ https://gist.github.com/ivirshup/f1a1603db69de3888eacb4bdb6a9317a diff --git a/man/ParseGEO.Rd b/man/ParseGEO.Rd index 05491c2..2b4cdcc 100644 --- a/man/ParseGEO.Rd +++ b/man/ParseGEO.Rd @@ -52,9 +52,14 @@ If \code{data.type} is "bulk", return count matrix. Download Matrix from GEO and Load to Seurat. 
} \examples{ -# # the supp files are count matrix -# GSE94820.seu = ParseGEO(acce = "GSE94820", down.supp = TRUE, supp.idx = 1, supp.type = "count") -# # the supp files are cellranger output files: barcodes, genes/features and matrix -# GSE200257.seu = ParseGEO(acce = "GSE200257", down.supp = TRUE, supp.idx = 1, supp.type = "10x", -# out.folder = "/path/to/output/folder") +\dontrun{ +# the supp files are count matrix +GSE94820.seu <- ParseGEO(acce = "GSE94820", down.supp = TRUE, supp.idx = 1, supp.type = "count") +# the supp files are cellranger output files: barcodes, genes/features and matrix +# need users to provide the output folder +GSE200257.seu <- ParseGEO( + acce = "GSE200257", down.supp = TRUE, supp.idx = 1, supp.type = "10x", + out.folder = "/path/to/output/folder" +) +} } diff --git a/man/ParseHCA.Rd b/man/ParseHCA.Rd index bd8b6c7..2222ffa 100644 --- a/man/ParseHCA.Rd +++ b/man/ParseHCA.Rd @@ -36,12 +36,16 @@ Dataframe contains failed projects or NULL. Download Human Cell Atlas Datasets. 
} \examples{ -# # all available projects -# all.hca.projects = ShowHCAProjects() -# # all human and 10x 3' v2 -# all.human.10x.projects = ExtractHCAMeta(all.projects.df = all.hca.projects, -# organism = "Homo sapiens", -# protocol = c("10x 3' v2", "10x 3' v3")) -# # download -# ParseHCA(meta = all.human.10x.projects, out.folder = "/path/to/output") +\dontrun{ +# all available projects +all.hca.projects <- ShowHCAProjects() +# all human and 10x 3' v2 +all.human.10x.projects <- ExtractHCAMeta( + all.projects.df = all.hca.projects, + organism = "Homo sapiens", + protocol = c("10x 3' v2", "10x 3' v3") +) +# download, need users to provide the output folder +ParseHCA(meta = all.human.10x.projects, out.folder = "/path/to/output") +} } diff --git a/man/ParsePanglaoDB.Rd b/man/ParsePanglaoDB.Rd index 73512e7..a37bb32 100644 --- a/man/ParsePanglaoDB.Rd +++ b/man/ParsePanglaoDB.Rd @@ -31,8 +31,12 @@ Seurat object (if \code{merge} is TRUE) or list of Seurat objects (if \code{merg Parse PanglaoDB Data. } \examples{ -# hsa.meta = ExtractPanglaoDBMeta(species = "Homo sapiens", -# protocol = c("Smart-seq2", "10x chromium"), -# show.cell.type = TRUE, cell.num = c(1000,2000)) -# hsa.seu = ParsePanglaoDB(hsa.meta, merge = TRUE) +\dontrun{ +hsa.meta <- ExtractPanglaoDBMeta( + species = "Homo sapiens", + protocol = c("Smart-seq2", "10x chromium"), + show.cell.type = TRUE, cell.num = c(1000, 2000) +) +hsa.seu <- ParsePanglaoDB(hsa.meta, merge = TRUE) +} } diff --git a/man/ParseZenodo.Rd b/man/ParseZenodo.Rd index 9b320df..d088ec8 100644 --- a/man/ParseZenodo.Rd +++ b/man/ParseZenodo.Rd @@ -39,8 +39,15 @@ When successful, NULL. When MD5 verification failure, a dataframe contains failu Download Data with Zenodo DOI. 
} \examples{ -# multi.dois.parse = ParseZenodo(doi = c("1111", "10.5281/zenodo.7243603", -# "10.5281/zenodo.7244441"), -# file.ext = c("rdata", "rds"), -# out.folder = "/path/to/outfoder") +\dontrun{ +# need users to provide the output folder +multi.dois.parse <- ParseZenodo( + doi = c( + "1111", "10.5281/zenodo.7243603", + "10.5281/zenodo.7244441" + ), + file.ext = c("rdata", "rds"), + out.folder = "/path/to/outfoder" +) +} } diff --git a/man/SCEAnnData.Rd b/man/SCEAnnData.Rd index b3d430a..533d9b0 100644 --- a/man/SCEAnnData.Rd +++ b/man/SCEAnnData.Rd @@ -27,7 +27,7 @@ Default: AnnData.} \item{slot.name}{Slot name used to save count matrix, used when converting from AnnData to SingleCellExperiment. Default: counts.} -\item{...}{Parameters for \code{\link{writeH5AD}} and \code{\link{readH5AD}}.} +\item{...}{Parameters for \code{writeH5AD} and \code{readH5AD}.} } \value{ NULL or SingleCellExperiment. @@ -36,9 +36,14 @@ NULL or SingleCellExperiment. Data Format Conversion between SingleCellExperiment and AnnData. } \examples{ -# library(scRNAseq) -# seger <- SegerstolpePancreasData() -# SCEAnnData(from = "SingleCellExperiment", to = "AnnData", sce = seger, X_name = "counts") -# sce = SCEAnnData(from = "AnnData", to = "SingleCellExperiment", -# anndata.file = "path/to/seger.h5ad") +\dontrun{ +library(scRNAseq) +seger <- SegerstolpePancreasData() +SCEAnnData(from = "SingleCellExperiment", to = "AnnData", sce = seger, X_name = "counts") +# need users to provide the output file +sce <- SCEAnnData( + from = "AnnData", to = "SingleCellExperiment", + anndata.file = "path/to/seger.h5ad" +) +} } diff --git a/man/SCELoom.Rd b/man/SCELoom.Rd index fff8437..00f2209 100644 --- a/man/SCELoom.Rd +++ b/man/SCELoom.Rd @@ -32,10 +32,16 @@ NULL or SingleCellExperiment. Data Format Conversion between SingleCellExperiment and loom. 
} \examples{ -# convert from loom to SingleCellExperiment -# sce.obj = SCELoom(from = "loom", to = "SingleCellExperiment", -# loom.file = "path/to/loom") -# convert from SingleCellExperiment to loom -# SCELoom(from = "SingleCellExperiment", to = "loom",sce = sce.obj, -# loom.file = "path/to/loom") +\dontrun{ +# convert from loom to SingleCellExperiment, need users to provide the loom file +sce.obj <- SCELoom( + from = "loom", to = "SingleCellExperiment", + loom.file = "path/to/loom" +) +# convert from SingleCellExperiment to loom, need users to provide the loom file +SCELoom( + from = "SingleCellExperiment", to = "loom", sce = sce.obj, + loom.file = "path/to/loom" +) +} } diff --git a/man/ShowCBDatasets.Rd b/man/ShowCBDatasets.Rd index 8b79717..cfb67fa 100644 --- a/man/ShowCBDatasets.Rd +++ b/man/ShowCBDatasets.Rd @@ -23,8 +23,10 @@ Dataframe contains all available datasets. Show All Available Datasets in UCSC Cell Browser. } \examples{ -# # first time run (lazy mode) -# ucsc.cb.samples = ShowCBDatasets(lazy = TRUE, update = TRUE) -# # second time run (lazy mode) -# ucsc.cb.samples = ShowCBDatasets(lazy = TRUE, update = FALSE) +\dontrun{ +# first time run (lazy mode), need users to provide json folder +ucsc.cb.samples <- ShowCBDatasets(lazy = TRUE, update = TRUE) +# second time run (lazy mode), need users to provide json folder +ucsc.cb.samples <- ShowCBDatasets(lazy = TRUE, update = FALSE) +} } diff --git a/man/ShowCELLxGENEDatasets.Rd b/man/ShowCELLxGENEDatasets.Rd index 6ac42fb..c2cd7d7 100644 --- a/man/ShowCELLxGENEDatasets.Rd +++ b/man/ShowCELLxGENEDatasets.Rd @@ -13,8 +13,10 @@ Dataframe contains all available datasets. Show All Available Datasets in CELLxGENE. 
} \examples{ -# # all available datasets -# all.cellxgene.datasets = ShowCELLxGENEDatasets() +\donttest{ +# all available datasets +all.cellxgene.datasets <- ShowCELLxGENEDatasets() +} } \references{ https://gist.github.com/ivirshup/f1a1603db69de3888eacb4bdb6a9317a diff --git a/man/ShowHCAProjects.Rd b/man/ShowHCAProjects.Rd index 493de75..cf68c54 100644 --- a/man/ShowHCAProjects.Rd +++ b/man/ShowHCAProjects.Rd @@ -17,6 +17,8 @@ Dataframe contains all available projects. Show All Available Projects in Human Cell Atlas. } \examples{ -# # all available projects -# all.hca.projects = ShowHCAProjects() +\donttest{ +# all available projects +all.hca.projects <- ShowHCAProjects() +} } diff --git a/man/SplitSRA.Rd b/man/SplitSRA.Rd index dedce8b..0199d62 100644 --- a/man/SplitSRA.Rd +++ b/man/SplitSRA.Rd @@ -46,11 +46,18 @@ NULL or paths of failed sras. Split SRA to fastq Files and Format to 10x Standard Style. } \examples{ -# GSE186003.runs = ExtractRun(acce = "GSE186003", platform = "GPL24247") -# GSE186003.down = DownloadSRA(gsm.df = GSE186003.runs, prefetch.path = "/path/to/prefetch", -# out.folder = "/path/to/output") -# GSE186003.split = SplitSRA(sra.folder = "/path/to/output", -# split.cmd.path = "/path/to/parallel-fastq-dump", -# sratools.path = "/path/to/sra/bin", fastq.type = "10x", -# split.cmd.threads = 4) +\dontrun{ +# need users to provide the prefetch.path, sra.folder, split.cmd.path, sratools.path and out.folder +GSE186003.runs <- ExtractRun(acce = "GSE186003", platform = "GPL24247") +GSE186003.down <- DownloadSRA( + gsm.df = GSE186003.runs, prefetch.path = "/path/to/prefetch", + out.folder = "/path/to/output" +) +GSE186003.split <- SplitSRA( + sra.folder = "/path/to/output", + split.cmd.path = "/path/to/parallel-fastq-dump", + sratools.path = "/path/to/sra/bin", fastq.type = "10x", + split.cmd.threads = 4 +) +} } diff --git a/man/StatDBAttribute.Rd b/man/StatDBAttribute.Rd index dc2887a..78fe893 100644 --- a/man/StatDBAttribute.Rd +++ 
b/man/StatDBAttribute.Rd @@ -25,16 +25,20 @@ List of attributes information, including attribute, value and number. Stat Database Attributes. } \examples{ -# # PanglaoDB -# StatDBAttribute(df = PanglaoDBMeta, filter = c("species", "protocol"), database = "PanglaoDB") -# # UCSC Cell Browser -# ucsc.cb.samples = ShowCBDatasets(lazy = TRUE, json.folder = NULL, update = FALSE) -# StatDBAttribute(df = ucsc.cb.samples, filter = c("organism", "organ"), database = "UCSC") -# # CELLxGENE -# all.cellxgene.datasets = ShowCELLxGENEDatasets() -# StatDBAttribute(df = all.cellxgene.datasets, filter = c("organism", "sex"), -# database = "CELLxGENE") -# # HCA -# all.hca.projects = ShowHCAProjects() -# StatDBAttribute(df = all.hca.projects, filter = c("organism", "sex"), database = "HCA") +\dontrun{ +# PanglaoDB +StatDBAttribute(df = PanglaoDBMeta, filter = c("species", "protocol"), database = "PanglaoDB") +# UCSC Cell Browser, need users to provide the json folder +ucsc.cb.samples <- ShowCBDatasets(lazy = TRUE, json.folder = NULL, update = FALSE) +StatDBAttribute(df = ucsc.cb.samples, filter = c("organism", "organ"), database = "UCSC") +# CELLxGENE +all.cellxgene.datasets <- ShowCELLxGENEDatasets() +StatDBAttribute( + df = all.cellxgene.datasets, filter = c("organism", "sex"), + database = "CELLxGENE" +) +# HCA +all.hca.projects <- ShowHCAProjects() +StatDBAttribute(df = all.hca.projects, filter = c("organism", "sex"), database = "HCA") +} } diff --git a/vignettes/DownloadMatrices.Rmd b/vignettes/DownloadMatrices.Rmd index 4a56915..c0c8d43 100644 --- a/vignettes/DownloadMatrices.Rmd +++ b/vignettes/DownloadMatrices.Rmd @@ -145,7 +145,7 @@ After manually check the extracted metadata, `scfetch` provides `ParsePanglaoDB` With the count matrix, `ParsePanglaoDB` will load the matrix to Seurat automatically. If multiple datasets available, users can choose to merge the SeuratObject with `merge`. 
```{r panglaodb_parse, eval=FALSE} -hsa.seu <- ParsePanglaoDB(hsa.meta, merge = TRUE) +hsa.seu <- ParsePanglaoDB(hsa.meta[1:3, ], merge = TRUE) ```
diff --git a/vignettes/scfetch.Rmd b/vignettes/scfetch.Rmd index 20290ae..f78144b 100644 --- a/vignettes/scfetch.Rmd +++ b/vignettes/scfetch.Rmd @@ -1,17 +1,11 @@ --- title: > - scfetch User Guide + scfetch - Access and Format Single-cell RNA-seq Datasets from Public Resources author: - name: Yabing Song - affiliation: - - &id1 School of Life Sciences, Tsinghua University - email: songyb0519@gmail.com date: "`r BiocStyle::doc_date()`" -package: "`r BiocStyle::pkg_ver('scfetch')`" -abstract: > - The goal of `scfetch` is to access and format scRNA-seq datasets. It can be used to download scRNA-seq datasets from widely used public resources, including GEO, Zenodo, CELLxGENE, PanglaoDB and UCSC Cell Browser. And, it can also be used to perform object conversion between SeuratObject, loom, h5ad, SingleCellExperiment and CellDataSet/cell_data_set. output: - BiocStyle::html_document: + html_document: toc_depth: 4 toc_float: true fig_caption: TRUE @@ -20,10 +14,6 @@ vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- - -```{r style, echo = FALSE, results = 'asis'} -BiocStyle::markdown() -``` ```{r setup, echo=FALSE, warning=FALSE} library(knitr) @@ -37,7 +27,19 @@ knitr::opts_chunk$set( ) ``` -# Getting started +# Introduction + +`scfetch` is designed to help users download and prepare single-cell datasets from public resources. It can be used to: + +* **Download fastq files** from `GEO/SRA`, **format fastq files** to standard style that can be identified by 10x software (e.g. CellRanger). +* **Download bam files** from `GEO/SRA`, support **downloading original 10x generated bam files (with custom tags) and normal bam files**, and **convert bam files to fastq files**. +* Download scRNA-seq **matrix** and **annotation (e.g. cell type)** information from `GEO`, `PanglaoDB` and `UCSC Cell Browser`, **load the downloaded matrix to `Seurat`**. +* Download processed objects from `Zenodo` and `CELLxGENE`.
+* **Formats conversion between widely used single cell objects** (`SeuratObject`, `AnnData`, `SingleCellExperiment`, `CellDataSet/cell_data_set` and `loom`). + +
+ +# Installation `scfetch` is an R package distributed as part of the [CRAN](https://cran.r-project.org/). To install the package, start R and enter: @@ -47,10 +49,10 @@ To install the package, start R and enter: install.packages("scfetch") # if you install from CRAN, you should install the following packages # install.packages("devtools") #In case you have not installed it. -devtools::install_github('alexvpickering/GEOfastq') # download fastq -devtools::install_github('cellgeni/sceasy') # format conversion -devtools::install_github('mojaveazure/seurat-disk') # format conversion -devtools::install_github('satijalab/seurat-wrappers') # format conversion +devtools::install_github("alexvpickering/GEOfastq") # download fastq +devtools::install_github("cellgeni/sceasy") # format conversion +devtools::install_github("mojaveazure/seurat-disk") # format conversion +devtools::install_github("satijalab/seurat-wrappers") # format conversion # install via Github (v0.5.0) devtools::install_github("showteeth/scfetch") @@ -72,18 +74,6 @@ Once `scfetch` is installed, it can be loaded by the following command. library("scfetch") ``` -# Introduction - - - -`scfetch` is designed to accelerate users download and prepare single-cell datasets from public resources. It can be used to: - -* **Download fastq files** from `GEO/SRA`, **foramt fastq files** to standard style that can be identified by 10x softwares (e.g. CellRanger). -* **Download bam files** from `GEO/SRA`, support **downloading original 10x generated bam files (with custom tags) and normal bam files**, and **convert bam files to fastq files**. -* Download scRNA-seq **matrix** and **annotation (e.g. cell type)** information from `GEO`, `PanglanDB` and `UCSC Cell Browser`, **load the downnloaded matrix to `Seurat`**. -* Download processed objects from `Zeenodo` and `CELLxGENE`. 
-* **Formats conversion between widely used single cell objects** (`SeuratObject`, `AnnData`, `SingleCellExperiment`, `CellDataSet/cell_data_set` and `loom`). -
# Downloas fastq and bam @@ -110,11 +100,11 @@ With the dataframe contains gsm and run number, `scfetch` will download sra file ```{r dwonload_sra, eval=FALSE} # a small test GSE130636.runs <- GSE130636.runs[GSE130636.runs$run %in% c("SRR9004346", "SRR9004351"), ] -# download +# download, you may need to set prefetch.path +out.folder <- tempdir() GSE130636.down <- DownloadSRA( gsm.df = GSE130636.runs, - prefetch.path = "/Users/soyabean/software/sratoolkit.3.0.6-mac64/bin/prefetch", - out.folder = "/Users/soyabean/Desktop/tmp/scdown/download_fastq" + out.folder = out.folder ) # GSE130636.down is null or dataframe contains failed runs ``` @@ -133,11 +123,11 @@ The returned result is a vector of failed sra files. If not `NULL`, users can re ```{r split_sra, eval=FALSE} # parallel-fastq-dump requires sratools.path +# you may need to set split.cmd.path and sratools.path +sra.folder <- tempdir() GSE130636.split <- SplitSRA( - sra.folder = "/Users/soyabean/Desktop/tmp/scdown/download_fastq", - fastq.type = "10x", - split.cmd.path = "/Applications/anaconda3/bin/parallel-fastq-dump", - sratools.path = "/usr/local/bin", split.cmd.threads = 4 + sra.folder = sra.folder, + fastq.type = "10x", split.cmd.threads = 4 ) ``` @@ -185,11 +175,11 @@ The returned result is a dataframe containing failed runs (either failed to down ```{r dwonload_bam, eval=FALSE} # a small test GSE138266.runs <- GSE138266.runs[GSE138266.runs$run %in% c("SRR10211566"), ] -# download +# download, you may need to set prefetch.path +out.folder <- tempdir() GSE138266.down <- DownloadBam( gsm.df = GSE138266.runs, - prefetch.path = "/Users/soyabean/software/sratoolkit.3.0.6-mac64/bin/prefetch", - out.folder = "/Users/soyabean/Desktop/tmp/scdown/download_bam" + out.folder = out.folder ) # GSE138266.down is null or dataframe contains failed runs ``` @@ -205,10 +195,10 @@ With downloaded bam files, `scfetch` provides function `Bam2Fastq` to convert ba The returned result is a vector of bam files failed to convert 
to fastq files. If not `NULL`, users can re-run `Bam2Fastq` by setting `bam.path` to the returned result. ```{r convert_bam_fastq, eval=FALSE} +bam.folder <- tempdir() +# you may need to set bamtofastq.path and bamtofastq.paras GSE138266.convert <- Bam2Fastq( - bam.folder = "/Users/soyabean/Desktop/tmp/scdown/download_bam", - bamtofastq.path = "/Users/soyabean/software/bamtofastq_macos", - bamtofastq.paras = "--nthreads 4" + bam.folder = bam.folder ) ``` @@ -257,9 +247,10 @@ With the count matrix, `ParseGEO` will load the matrix to Seurat automatically. ```{r geo_parse, eval=FALSE} # for cellranger output +out.folder <- tempdir() GSE200257.seu <- ParseGEO( acce = "GSE200257", platform = NULL, supp.idx = 1, down.supp = TRUE, supp.type = "10x", - out.folder = "/Users/soyabean/Desktop/tmp/scdown/dwonload_geo" + out.folder = out.folder ) # for count matrix, no need to specify out.folder, download count matrix to tmp folder GSE94820.seu <- ParseGEO(acce = "GSE94820", platform = NULL, supp.idx = 1, down.supp = TRUE, supp.type = "count") @@ -275,7 +266,7 @@ GSE94820.seu <- ParseGEO(acce = "GSE94820", platform = NULL, supp.idx = 1, down. Since [PanglaoDB](https://panglaodb.se/about.html) is no longer maintained, `scfetch` has cached all metadata and cell type composition and use these cached data by default to accelerate, users can access the cached data with `PanglaoDBMeta` (all metadata) and `PanglaoDBComposition` (all cell type composition). -### Summary attributes +### Summarise attributes `scfetch` provides `StatDBAttribute` to summary attributes of [PanglaoDB](https://panglaodb.se/index.html): @@ -293,7 +284,10 @@ StatDBAttribute(df = PanglaoDBMeta, filter = c("species", "protocol"), database `scfetch` uses cached metadata and cell type composition by default, users can change this by setting `local.data = FALSE`. 
```{r panglaodb_meta, eval=FALSE} -hsa.meta <- ExtractPanglaoDBMeta(species = "Homo sapiens", protocol = c("Smart-seq2", "10x chromium"), show.cell.type = TRUE, cell.num = c(1000, 2000)) +hsa.meta <- ExtractPanglaoDBMeta( + species = "Homo sapiens", protocol = c("Smart-seq2", "10x chromium"), + show.cell.type = TRUE, cell.num = c(1000, 2000) +) ```
@@ -315,7 +309,8 @@ After manually check the extracted metadata, `scfetch` provides `ParsePanglaoDB` With the count matrix, `ParsePanglaoDB` will load the matrix to Seurat automatically. If multiple datasets available, users can choose to merge the SeuratObject with `merge`. ```{r panglaodb_parse, eval=FALSE} -hsa.seu <- ParsePanglaoDB(hsa.meta, merge = TRUE) +# small test +hsa.seu <- ParsePanglaoDB(hsa.meta[1:3, ], merge = TRUE) ```
@@ -329,11 +324,12 @@ The [UCSC Cell Browser](https://cells.ucsc.edu/?#) is a web-based tool that allo `scfetch` provides `ShowCBDatasets` to show all available datasets. Due to the large number of datasets, `ShowCBDatasets` enables users to perform *lazy load* of dataset json files instead of downloading the json files online (time-consuming!!!). This *lazy load* requires users to provide `json.folder` to save json files and set `lazy = TRUE` (for the first time of run, `ShowCBDatasets` will download current json files to `json.folder`, for next time of run, with `lazy = TRUE`, `ShowCBDatasets` will load the downloaded json files from `json.folder`.). And, `ShowCBDatasets` supports updating the local datasets with `update = TRUE`. ```{r cb_show, eval=FALSE} +json.folder <- tempdir() # first time run, the json files are stored under json.folder -# ucsc.cb.samples = ShowCBDatasets(lazy = TRUE, json.folder = "/Users/soyabean/Desktop/tmp/scdown/cell_browser/json", update = TRUE) +# ucsc.cb.samples = ShowCBDatasets(lazy = TRUE, json.folder = json.folder, update = TRUE) # second time run, load the downloaded json files -ucsc.cb.samples <- ShowCBDatasets(lazy = TRUE, json.folder = "/Users/soyabean/Desktop/tmp/scdown/cell_browser/json", update = FALSE) +ucsc.cb.samples <- ShowCBDatasets(lazy = TRUE, json.folder = json.folder, update = FALSE) # always read online # ucsc.cb.samples = ShowCBDatasets(lazy = FALSE) @@ -353,7 +349,7 @@ unique(unlist(sapply(unique(gsub(pattern = "\\|parent", replacement = "", x = uc
-### Summary attributes +### Summarise attributes `scfetch` provides `StatDBAttribute` to summary attributes of [UCSC Cell Browser](https://cells.ucsc.edu/?#): @@ -378,7 +374,7 @@ hbb.sample.df <- ExtractCBDatasets(all.samples.df = ucsc.cb.samples, organ = c(" `scfetch` provides `ExtractCBComposition` to extract cell type annotation and composition. ```{r cb_celltype, eval=FALSE} -hbb.sample.ct <- ExtractCBComposition(json.folder = "/Users/soyabean/Desktop/tmp/scdown/cell_browser/json", sample.df = hbb.sample.df) +hbb.sample.ct <- ExtractCBComposition(json.folder = json.folder, sample.df = hbb.sample.df) ```
@@ -432,9 +428,10 @@ After manually check the extracted metadata, users can **download the specified The returned result is a dataframe containing failed objects. If not `NULL`, users can re-run `ParseZenodo` by setting `doi.df` to the returned result. ```{r zenodo_parse, eval=FALSE} +out.folder <- tempdir() multi.dois.parse <- ParseZenodo( doi = c("1111", "10.5281/zenodo.7243603", "10.5281/zenodo.7244441"), - file.ext = c("rdata", "rds"), out.folder = "/Users/soyabean/Desktop/tmp/scdown/download_zenodo" + file.ext = c("rdata", "rds"), out.folder = out.folder ) ``` @@ -455,7 +452,7 @@ all.cellxgene.datasets <- ShowCELLxGENEDatasets()
-### Summary attributes +### Summarise attributes `scfetch` provides `StatDBAttribute` to summary attributes of [CELLxGENE](https://cellxgene.cziscience.com/): @@ -486,9 +483,10 @@ After manually check the extracted metadata, users can **download the specified The returned result is a dataframe containing failed datasets. If not `NULL`, users can re-run `ParseCELLxGENE` by setting `meta` to the returned result. ```{r cellxgene_parse, eval=FALSE} +out.folder <- tempdir() ParseCELLxGENE( meta = human.10x.cellxgene.meta[1:5, ], file.ext = "rds", - out.folder = "/Users/soyabean/Desktop/tmp/scdown/download_cellxgene" + out.folder = out.folder ) ``` @@ -563,9 +561,11 @@ cds3.obj <- ExportSeurat(seu.obj = pbmc_small, assay = "RNA", to = "cell_data_se The conversion is performed with functions implemented in `sceasy`: ```{r seu2anndata, eval=FALSE} # remove pbmc_small.h5ad first +anndata.file <- tempfile(pattern = "pbmc_small_", fileext = ".h5ad") +# you may need to set conda.path ExportSeurat( - seu.obj = pbmc_small, assay = "RNA", to = "AnnData", conda.path = "/Applications/anaconda3", - anndata.file = "/Users/soyabean/Desktop/tmp/scdown/conversion/pbmc_small.h5ad" + seu.obj = pbmc_small, assay = "RNA", to = "AnnData", + anndata.file = anndata.file ) ``` @@ -575,9 +575,10 @@ ExportSeurat( The conversion is performed with functions implemented in `SeuratDisk`: ```{r seu2loom, eval=FALSE} +loom.file <- tempfile(pattern = "pbmc_small_", fileext = ".loom") ExportSeurat( seu.obj = pbmc_small, assay = "RNA", to = "loom", - loom.file = "/Users/soyabean/Desktop/tmp/scdown/conversion/pbmc_small.loom" + loom.file = loom.file ) ``` @@ -614,9 +615,9 @@ seu.obj.cds3 <- ImportSeurat(obj = cds3.obj, from = "cell_data_set", count.assay The conversion is performed with functions implemented in `sceasy`: ```{r anndata2seu, eval=FALSE} +# you may need to set conda.path seu.obj.h5ad <- ImportSeurat( - anndata.file = "/Users/soyabean/Desktop/tmp/scdown/conversion/pbmc_small.h5ad", - from = 
"AnnData", assay = "RNA", conda.path = "/Applications/anaconda3" + anndata.file = anndata.file, from = "AnnData", assay = "RNA" ) ``` @@ -628,7 +629,7 @@ The conversion is performed with functions implemented in `SeuratDisk` and `Seur ```{r loom2seu, eval=FALSE} # loom will lose reduction -seu.obj.loom <- ImportSeurat(loom.file = "/Users/soyabean/Desktop/tmp/scdown/conversion/pbmc_small.loom", from = "loom") +seu.obj.loom <- ImportSeurat(loom.file = loom.file, from = "loom") ```
@@ -641,9 +642,10 @@ The conversion is performed with functions implemented in `zellkonverter`. ```{r sce2anndata, eval=FALSE} # remove seger.h5ad first +seger.anndata.file <- tempfile(pattern = "seger_", fileext = ".h5ad") SCEAnnData( from = "SingleCellExperiment", to = "AnnData", sce = seger, X_name = "counts", - anndata.file = "/Users/soyabean/Desktop/tmp/scdown/conversion/seger.h5ad" + anndata.file = seger.anndata.file ) ``` @@ -654,7 +656,7 @@ SCEAnnData( ```{r anndata2sce, eval=FALSE} seger.anndata <- SCEAnnData( from = "AnnData", to = "SingleCellExperiment", - anndata.file = "/Users/soyabean/Desktop/tmp/scdown/conversion/seger.h5ad" + anndata.file = seger.anndata.file ) ``` @@ -668,9 +670,10 @@ The conversion is performed with functions implemented in `LoomExperiment`. ```{r sce2loom, eval=FALSE} # remove seger.loom first +seger.loom.file <- tempfile(pattern = "seger_", fileext = ".loom") SCELoom( from = "SingleCellExperiment", to = "loom", sce = seger, - loom.file = "/Users/soyabean/Desktop/tmp/scdown/conversion/seger.loom" + loom.file = seger.loom.file ) ``` @@ -681,7 +684,7 @@ SCELoom( ```{r loom2sce, eval=FALSE} seger.loom <- SCELoom( from = "loom", to = "SingleCellExperiment", - loom.file = "/Users/soyabean/Desktop/tmp/scdown/conversion/seger.loom" + loom.file = seger.loom.file ) ```