Skip to content

Commit 30e6c70

Browse files
committed
fix: various improvements to loading tracks, closes showteeth#46
1 parent 6cd38ab commit 30e6c70

File tree

3 files changed

+76
-39
lines changed

3 files changed

+76
-39
lines changed

NAMESPACE

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,17 +65,26 @@ importFrom(GenomicRanges,GRanges)
6565
importFrom(GenomicRanges,end)
6666
importFrom(GenomicRanges,makeGRangesFromDataFrame)
6767
importFrom(GenomicRanges,reduce)
68+
importFrom(GenomicRanges,resize)
69+
importFrom(GenomicRanges,restrict)
6870
importFrom(GenomicRanges,setdiff)
6971
importFrom(GenomicRanges,start)
72+
importFrom(GenomicRanges,strand)
7073
importFrom(GenomicRanges,trim)
74+
importFrom(GenomicRanges,width)
7175
importFrom(IRanges,IRanges)
7276
importFrom(IRanges,findOverlaps)
7377
importFrom(IRanges,subsetByOverlaps)
7478
importFrom(Rsamtools,ScanBamParam)
79+
importFrom(Rsamtools,countBam)
80+
importFrom(Rsamtools,idxstatsBam)
7581
importFrom(Rsamtools,indexBam)
82+
importFrom(Rsamtools,scanBam)
7683
importFrom(dplyr,"%>%")
7784
importFrom(dplyr,all_of)
7885
importFrom(dplyr,arrange)
86+
importFrom(dplyr,bind_rows)
87+
importFrom(dplyr,desc)
7988
importFrom(dplyr,filter)
8089
importFrom(dplyr,group_by)
8190
importFrom(dplyr,mutate)
@@ -145,6 +154,7 @@ importFrom(scales,rescale)
145154
importFrom(scales,scientific)
146155
importFrom(stats,as.formula)
147156
importFrom(stats,reshape)
157+
importFrom(utils,head)
148158
importFrom(utils,menu)
149159
importFrom(utils,read.csv)
150160
importFrom(utils,read.table)

R/LoadTrack.R

Lines changed: 59 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
#' @param format Track file format, chosen from bam, wig, bw(bigwig), bedgraph(bedGraph) and txt.
66
#' @param region Region to extract coverage for, eg: chr14:21,677,306-21,737,601 or chr14:21,677,306.
77
#' Default: NULL, coverage is extracted from the first annotated chromosome/sequence.
8-
#' @param extend Extend length of \code{region}. Default: 2000.
8+
#' @param extend Extend length of \code{region}. Default: 0.
99
#' @param gtf.gr Granges object of GTF, created with \code{\link[rtracklayer]{import.gff}}. Default: NULL.
1010
#' @param gene.name The name of gene. Default: HNRNPC.
1111
#' @param gene.name.type Gene name type (filed of \code{gtf.gr}), chosen from gene_name and gene_id.
@@ -19,20 +19,22 @@
1919
#' Default: RPKM.
2020
#' @param single.nuc Logical value, whether to visualize at single nucleotide level. Default: FALSE.
2121
#' @param single.nuc.region Region for \code{single.nuc}. Default: NULL
22-
#' @param bin.size Size of the bins, in bases. Default: 10. Only used for BAM files, ignored for Wig, Bigwig, etc.
22+
#' @param bin.size Approximate size of the bins, in bases. Default: 10. Only used for BAM files, ignored for Wig, Bigwig, etc.
2323
#' Set to NULL to turn binning off.
24+
#' @param bin.func Function used to summarize read counts over bins when norm.method = "None". Can be one of "sum",
25+
#' "mean", "median". Default: "sum". Only used for BAM files, ignored for Wig, Bigwig, etc.
2426
#' @param bc.extra.para Extra parameters for \code{bamCoverage}, eg: "--effectiveGenomeSize 2700000000 --ignoreForNormalization chrX"
2527
#' @param n.cores The number of cores to be used for this job. Default: 1.
2628
#'
2729
#' @return A dataframe.
2830
#' @importFrom rtracklayer import
29-
#' @importFrom Rsamtools indexBam ScanBamParam
31+
#' @importFrom Rsamtools indexBam ScanBamParam scanBam countBam idxstatsBam
3032
#' @importFrom utils read.csv
3133
#' @importFrom GenomicAlignments alphabetFrequencyFromBam readGAlignments coverage
32-
#' @importFrom GenomicRanges GRanges
34+
#' @importFrom GenomicRanges GRanges restrict width resize strand
3335
#' @importFrom IRanges IRanges subsetByOverlaps
3436
#' @importFrom dplyr %>%
35-
#' @importFrom dplyr select filter mutate all_of group_by summarize
37+
#' @importFrom dplyr select filter mutate all_of group_by summarize arrange desc bind_rows
3638
#' @importFrom BiocParallel register MulticoreParam bplapply
3739
#' @importFrom ggplot2 cut_width
3840
#' @export
@@ -58,14 +60,14 @@
5860
LoadTrackFile <- function(
5961
track.file, track.folder = NULL,
6062
format = c("bam", "wig", "bw", "bedgraph", "txt"),
61-
region = NULL, extend = 2000,
63+
region = NULL, extend = 0,
6264
gtf.gr = NULL, gene.name = "HNRNPC",
6365
gene.name.type = c("gene_name", "gene_id"),
6466
meta.info = NULL, meta.file = "",
6567
bamcoverage.path = NULL,
6668
norm.method = c("RPKM", "CPM", "BPM", "RPGC", "None"),
6769
single.nuc = FALSE, single.nuc.region = NULL,
68-
bin.size = 10, bc.extra.para = NULL, n.cores = 1) {
70+
bin.size = 10, bin.func = "sum", bc.extra.para = NULL, n.cores = 1) {
6971
# check parameters
7072
format <- match.arg(arg = format)
7173
gene.name.type <- match.arg(arg = gene.name.type)
@@ -78,24 +80,38 @@ LoadTrackFile <- function(
7880

7981
# get genomic region if supplied, else it is guessed from input
8082
if (is.null(region)) {
81-
message("No 'region' specified; extracting coverage for an example range\n(<=100,000 bases, first annotated sequence)")
83+
message("No 'region' specified; extracting coverage based on file content")
8284
if (format == "bam") {
83-
seqnames <- Rsamtools::scanBamHeader(track.file[1]) %>%
84-
lapply(function(x) x$targets) %>%
85-
unname() %>%
86-
unlist()
85+
file_stats <- Rsamtools::countBam(track.file[1])
86+
message(paste0(
87+
"Estimating coverage for file '", file_stats$file,
88+
"' with ", file_stats$records, " reads and ",
89+
file_stats$nucleotides, " nucleotides"
90+
))
91+
record_stats <- Rsamtools::idxstatsBam(track.file[1]) %>%
92+
dplyr::arrange(dplyr::desc(.data$mapped)) %>%
93+
dplyr::slice(1)
94+
record_range <- Rsamtools::scanBam(track.file[1], param = Rsamtools::ScanBamParam(what = c("rname", "pos"))) %>%
95+
as.data.frame() %>%
96+
dplyr::filter(.data$rname == as.character(record_stats$seqnames)) %>%
97+
dplyr::pull(.data$pos) %>%
98+
range()
99+
record_range[2] <- min(record_range[1] + 100000, record_range[2])
87100
gr <- GenomicRanges::GRanges(
88-
seqnames = names(seqnames[1]),
89-
IRanges(start = 1, end = min(100000, seqnames[1]))
101+
seqnames = as.character(record_stats$seqnames),
102+
IRanges::IRanges(start = record_range[1], end = record_range[2])
90103
)
104+
message(paste0("Extracted range of length ", diff(record_range),
105+
" from SeqRecord '", record_stats$seqnames,
106+
"' (", record_range[1], ":", record_range[2],")"
107+
))
91108
} else if (format %in% c("wig", "bw", "bedgraph")) {
92109
gr <- range(rtracklayer::import(track.file[1]))
93110
seqnames <- as.character(seqnames(gr))
94111
if (GenomicRanges::width(gr) <= 100000) {
95112
gr <- GenomicRanges::resize(gr, width = 100000)
96113
}
97114
}
98-
message(paste0("Coverage extracted from sequence/chromosome: ", names(seqnames[1])))
99115
} else {
100116
gr <- PrepareRegion(
101117
region = region,
@@ -146,13 +162,13 @@ LoadTrackFile <- function(
146162
message("Calculating coverage with GenomicAlignments when 'norm.method = None'")
147163
if (is.null(n.cores) || n.cores == 1) {
148164
track.list <- lapply(
149-
track.file, import_bam_ga, gr, bin.size
165+
track.file, import_bam_ga, gr, bin.size, bin.func
150166
)
151167
} else {
152168
track.list <- BiocParallel::bplapply(
153169
track.file,
154170
BPPARAM = BiocParallel::MulticoreParam(),
155-
FUN = import_bam_ga, gr, bin.size
171+
FUN = import_bam_ga, gr, bin.size, bin.func
156172
)
157173
}
158174
} else {
@@ -249,25 +265,26 @@ import_txt <- function(x) {
249265
return(single.track.df)
250266
}
251267

252-
import_bam_ga <- function(x, gr, bin.size) {
268+
import_bam_ga <- function(x, gr, bin_size, bin.func) {
253269
# get basename
254-
track.file.base <- basename(x)
270+
base_name <- basename(x)
255271
# load track
256272
param <- Rsamtools::ScanBamParam(which = gr)
257273
ga <- GenomicAlignments::readGAlignments(x, param = param)
258-
ga.cov <- GenomicAlignments::coverage(ga)
259-
ga.cov.gr <- GenomicRanges::GRanges(ga.cov)
260-
ga.cov.df <- IRanges::subsetByOverlaps(ga.cov.gr, gr) %>%
261-
as.data.frame()
262-
# valid the region
263-
gr.df <- as.data.frame(gr)
264-
ga.cov.df[1, "start"] <- gr.df[1, "start"]
265-
ga.cov.df[nrow(ga.cov.df), "end"] <- gr.df[1, "end"]
266-
# optional binning
267-
ga.cov.df <- bin_coverage(ga.cov.df, bin.size)
274+
ga_cov_df <- lapply(c("+", "-", "*"), function(ga_strand) {
275+
ga_cov <- GenomicAlignments::coverage(ga[strand(ga) == ga_strand])
276+
ga_cov_gr <- GenomicRanges::GRanges(ga_cov)
277+
GenomicRanges::strand(ga_cov_gr) <- ga_strand
278+
IRanges::subsetByOverlaps(ga_cov_gr, gr) %>%
279+
GenomicRanges::restrict(start = start(gr), end = end(gr), keep.all.ranges = FALSE) %>%
280+
as.data.frame() %>%
281+
dplyr::filter(!(.data$score == 0 & .data$start == start(gr) & .data$end == end(gr)))
282+
}) %>%
283+
dplyr::bind_rows()
284+
ga_cov_df <- bin_coverage(ga_cov_df, bin_size, bin.func)
268285
# add track file
269-
ga.cov.df$TrackFile <- track.file.base
270-
return(ga.cov.df)
286+
ga_cov_df$TrackFile <- base_name
287+
return(ga_cov_df)
271288
}
272289

273290
index_bam <- function(x) {
@@ -330,15 +347,21 @@ bam_coverage <- function(
330347
return(single.track.df)
331348
}
332349

333-
bin_coverage <- function(df, bin.size = 10) {
334-
if (!is.null(bin.size) && is.numeric(bin.size)) {
350+
bin_coverage <- function(df, bin_size = 10, bin_func = "sum") {
351+
if (!is.null(bin_size) && is.numeric(bin_size)) {
335352
binned_df <- df %>%
336353
dplyr::mutate(
337-
bin = ggplot2::cut_width(start, width = bin.size, center = bin.size / 2, labels = FALSE) * bin.size
354+
bin = ggplot2::cut_width(start, width = bin_size, center = bin_size / 2, labels = FALSE)
338355
) %>%
339356
dplyr::group_by(.data$seqnames, .data$bin, .data$strand) %>%
340-
dplyr::summarize(score = mean(.data$score, na.rm = TRUE), .groups = "drop") %>%
341-
dplyr::mutate(start = .data$bin - (min(.data$bin) - 1), end = .data$bin, width = bin.size) %>%
357+
dplyr::summarize(
358+
start = start[1],
359+
end = tail(end, 1),
360+
width = end - start + 1,
361+
score = do.call(bin_func, list(x = .data$score, na.rm = TRUE)),
362+
.groups = "drop"
363+
) %>%
364+
dplyr::arrange(seqnames, strand, start) %>%
342365
dplyr::select(dplyr::all_of(c("seqnames", "start", "end", "width", "strand", "score"))) %>%
343366
as.data.frame()
344367
return(binned_df)

man/LoadTrackFile.Rd

Lines changed: 7 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)