5
5
# ' @param format Track file format, chosen from bam, wig, bw(bigwig), bedgraph(bedGraph) and txt.
6
6
# ' @param region Region to extract coverage for, eg: chr14:21,677,306-21,737,601 or chr14:21,677,306.
7
7
# ' Default: NULL, coverage is extracted from the first annotated chromosome/sequence.
8
- # ' @param extend Extend length of \code{region}. Default: 2000 .
8
+ # ' @param extend Extend length of \code{region}. Default: 0 .
9
9
# ' @param gtf.gr Granges object of GTF, created with \code{\link[rtracklayer]{import.gff}}. Default: NULL.
10
10
# ' @param gene.name The name of gene. Default: HNRNPC.
11
11
# ' @param gene.name.type Gene name type (filed of \code{gtf.gr}), chosen from gene_name and gene_id.
19
19
# ' Default: RPKM.
20
20
# ' @param single.nuc Logical value, whether to visualize at single nucleotide level. Default: FALSE.
21
21
# ' @param single.nuc.region Region for \code{single.nuc}. Default: NULL
22
- # ' @param bin.size Size of the bins, in bases. Default: 10. Only used for BAM files, ignored for Wig, Bigwig, etc.
22
+ # ' @param bin.size Approximate size of the bins, in bases. Default: 10. Only used for BAM files, ignored for Wig, Bigwig, etc.
23
23
# ' Set to NULL to turn binning off.
24
+ # ' @param bin.func Function used to summarize read counts over bins when norm.method = "None". Can be one of "sum",
25
+ # ' "mean", "median". Default: "sum". Only used for BAM files, ignored for Wig, Bigwig, etc.
24
26
# ' @param bc.extra.para Extra parameters for \code{bamCoverage}, eg: "--effectiveGenomeSize 2700000000 --ignoreForNormalization chrX"
25
27
# ' @param n.cores The number of cores to be used for this job. Default: 1.
26
28
# '
27
29
# ' @return A dataframe.
28
30
# ' @importFrom rtracklayer import
29
- # ' @importFrom Rsamtools indexBam ScanBamParam
31
+ # ' @importFrom Rsamtools indexBam ScanBamParam scanBam countBam idxstatsBam
30
32
# ' @importFrom utils read.csv
31
33
# ' @importFrom GenomicAlignments alphabetFrequencyFromBam readGAlignments coverage
32
- # ' @importFrom GenomicRanges GRanges
34
+ # ' @importFrom GenomicRanges GRanges restrict width resize strand
33
35
# ' @importFrom IRanges IRanges subsetByOverlaps
34
36
# ' @importFrom dplyr %>%
35
- # ' @importFrom dplyr select filter mutate all_of group_by summarize
37
+ # ' @importFrom dplyr select filter mutate all_of group_by summarize arrange desc bind_rows
36
38
# ' @importFrom BiocParallel register MulticoreParam bplapply
37
39
# ' @importFrom ggplot2 cut_width
38
40
# ' @export
58
60
LoadTrackFile <- function (
59
61
track.file , track.folder = NULL ,
60
62
format = c(" bam" , " wig" , " bw" , " bedgraph" , " txt" ),
61
- region = NULL , extend = 2000 ,
63
+ region = NULL , extend = 0 ,
62
64
gtf.gr = NULL , gene.name = " HNRNPC" ,
63
65
gene.name.type = c(" gene_name" , " gene_id" ),
64
66
meta.info = NULL , meta.file = " " ,
65
67
bamcoverage.path = NULL ,
66
68
norm.method = c(" RPKM" , " CPM" , " BPM" , " RPGC" , " None" ),
67
69
single.nuc = FALSE , single.nuc.region = NULL ,
68
- bin.size = 10 , bc.extra.para = NULL , n.cores = 1 ) {
70
+ bin.size = 10 , bin.func = " sum " , bc.extra.para = NULL , n.cores = 1 ) {
69
71
# check parameters
70
72
format <- match.arg(arg = format )
71
73
gene.name.type <- match.arg(arg = gene.name.type )
@@ -78,24 +80,38 @@ LoadTrackFile <- function(
78
80
79
81
# get genomic region if supplied, else it is guessed from input
80
82
if (is.null(region )) {
81
- message(" No 'region' specified; extracting coverage for an example range \n (<=100,000 bases, first annotated sequence) " )
83
+ message(" No 'region' specified; extracting coverage based on file content " )
82
84
if (format == " bam" ) {
83
- seqnames <- Rsamtools :: scanBamHeader(track.file [1 ]) %> %
84
- lapply(function (x ) x $ targets ) %> %
85
- unname() %> %
86
- unlist()
85
+ file_stats <- Rsamtools :: countBam(track.file [1 ])
86
+ message(paste0(
87
+ " Estimating coverage for file '" , file_stats $ file ,
88
+ " ' with " , file_stats $ records , " reads and " ,
89
+ file_stats $ nucleotides , " nucleotides"
90
+ ))
91
+ record_stats <- Rsamtools :: idxstatsBam(track.file [1 ]) %> %
92
+ dplyr :: arrange(dplyr :: desc(.data $ mapped )) %> %
93
+ dplyr :: slice(1 )
94
+ record_range <- Rsamtools :: scanBam(track.file [1 ], param = Rsamtools :: ScanBamParam(what = c(" rname" , " pos" ))) %> %
95
+ as.data.frame() %> %
96
+ dplyr :: filter(.data $ rname == as.character(record_stats $ seqnames )) %> %
97
+ dplyr :: pull(.data $ pos ) %> %
98
+ range()
99
+ record_range [2 ] <- min(record_range [1 ] + 100000 , record_range [2 ])
87
100
gr <- GenomicRanges :: GRanges(
88
- seqnames = names( seqnames [ 1 ] ),
89
- IRanges(start = 1 , end = min( 100000 , seqnames [ 1 ]) )
101
+ seqnames = as.character( record_stats $ seqnames ),
102
+ IRanges :: IRanges (start = record_range [ 1 ] , end = record_range [ 2 ] )
90
103
)
104
+ message(paste0(" Extracted range of length " , diff(record_range ),
105
+ " from SeqRecord '" , record_stats $ seqnames ,
106
+ " ' (" , record_range [1 ], " :" , record_range [2 ]," )"
107
+ ))
91
108
} else if (format %in% c(" wig" , " bw" , " bedgraph" )) {
92
109
gr <- range(rtracklayer :: import(track.file [1 ]))
93
110
seqnames <- as.character(seqnames(gr ))
94
111
if (GenomicRanges :: width(gr ) < = 100000 ) {
95
112
gr <- GenomicRanges :: resize(gr , width = 100000 )
96
113
}
97
114
}
98
- message(paste0(" Coverage extracted from sequence/chromosome: " , names(seqnames [1 ])))
99
115
} else {
100
116
gr <- PrepareRegion(
101
117
region = region ,
@@ -146,13 +162,13 @@ LoadTrackFile <- function(
146
162
message(" Calculating coverage with GenomicAlignments when 'norm.method = None'" )
147
163
if (is.null(n.cores ) || n.cores == 1 ) {
148
164
track.list <- lapply(
149
- track.file , import_bam_ga , gr , bin.size
165
+ track.file , import_bam_ga , gr , bin.size , bin.func
150
166
)
151
167
} else {
152
168
track.list <- BiocParallel :: bplapply(
153
169
track.file ,
154
170
BPPARAM = BiocParallel :: MulticoreParam(),
155
- FUN = import_bam_ga , gr , bin.size
171
+ FUN = import_bam_ga , gr , bin.size , bin.func
156
172
)
157
173
}
158
174
} else {
@@ -249,25 +265,26 @@ import_txt <- function(x) {
249
265
return (single.track.df )
250
266
}
251
267
252
- import_bam_ga <- function (x , gr , bin.size ) {
268
+ import_bam_ga <- function (x , gr , bin_size , bin.func ) {
253
269
# get basename
254
- track.file.base <- basename(x )
270
+ base_name <- basename(x )
255
271
# load track
256
272
param <- Rsamtools :: ScanBamParam(which = gr )
257
273
ga <- GenomicAlignments :: readGAlignments(x , param = param )
258
- ga.cov <- GenomicAlignments :: coverage(ga )
259
- ga.cov.gr <- GenomicRanges :: GRanges(ga.cov )
260
- ga.cov.df <- IRanges :: subsetByOverlaps(ga.cov.gr , gr ) %> %
261
- as.data.frame()
262
- # valid the region
263
- gr.df <- as.data.frame(gr )
264
- ga.cov.df [1 , " start" ] <- gr.df [1 , " start" ]
265
- ga.cov.df [nrow(ga.cov.df ), " end" ] <- gr.df [1 , " end" ]
266
- # optional binning
267
- ga.cov.df <- bin_coverage(ga.cov.df , bin.size )
274
+ ga_cov_df <- lapply(c(" +" , " -" , " *" ), function (ga_strand ) {
275
+ ga_cov <- GenomicAlignments :: coverage(ga [strand(ga ) == ga_strand ])
276
+ ga_cov_gr <- GenomicRanges :: GRanges(ga_cov )
277
+ GenomicRanges :: strand(ga_cov_gr ) <- ga_strand
278
+ IRanges :: subsetByOverlaps(ga_cov_gr , gr ) %> %
279
+ GenomicRanges :: restrict(start = start(gr ), end = end(gr ), keep.all.ranges = FALSE ) %> %
280
+ as.data.frame() %> %
281
+ dplyr :: filter(! (.data $ score == 0 & .data $ start == start(gr ) & .data $ end == end(gr )))
282
+ }) %> %
283
+ dplyr :: bind_rows()
284
+ ga_cov_df <- bin_coverage(ga_cov_df , bin_size , bin.func )
268
285
# add track file
269
- ga.cov.df $ TrackFile <- track.file.base
270
- return (ga.cov.df )
286
+ ga_cov_df $ TrackFile <- base_name
287
+ return (ga_cov_df )
271
288
}
272
289
273
290
index_bam <- function (x ) {
@@ -330,15 +347,21 @@ bam_coverage <- function(
330
347
return (single.track.df )
331
348
}
332
349
333
- bin_coverage <- function (df , bin.size = 10 ) {
334
- if (! is.null(bin.size ) && is.numeric(bin.size )) {
350
+ bin_coverage <- function (df , bin_size = 10 , bin_func = " sum " ) {
351
+ if (! is.null(bin_size ) && is.numeric(bin_size )) {
335
352
binned_df <- df %> %
336
353
dplyr :: mutate(
337
- bin = ggplot2 :: cut_width(start , width = bin.size , center = bin.size / 2 , labels = FALSE ) * bin.size
354
+ bin = ggplot2 :: cut_width(start , width = bin_size , center = bin_size / 2 , labels = FALSE )
338
355
) %> %
339
356
dplyr :: group_by(.data $ seqnames , .data $ bin , .data $ strand ) %> %
340
- dplyr :: summarize(score = mean(.data $ score , na.rm = TRUE ), .groups = " drop" ) %> %
341
- dplyr :: mutate(start = .data $ bin - (min(.data $ bin ) - 1 ), end = .data $ bin , width = bin.size ) %> %
357
+ dplyr :: summarize(
358
+ start = start [1 ],
359
+ end = tail(end , 1 ),
360
+ width = end - start + 1 ,
361
+ score = do.call(bin_func , list (x = .data $ score , na.rm = TRUE )),
362
+ .groups = " drop"
363
+ ) %> %
364
+ dplyr :: arrange(seqnames , strand , start ) %> %
342
365
dplyr :: select(dplyr :: all_of(c(" seqnames" , " start" , " end" , " width" , " strand" , " score" ))) %> %
343
366
as.data.frame()
344
367
return (binned_df )
0 commit comments