Bioconductor · Roleren · Sep 14, 2020 · Sep 14, 2020
diff --git a/.gitignore b/.gitignore
@@ -1 +1,4 @@
 /R/.Rhistory
+.Rproj.user
+GenomicFeatures.Rproj
+.Rbuildignore
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: GenomicFeatures
 Title: Conveniently import and query gene models
-Version: 1.41.3
+Version: 1.41.4
 Encoding: UTF-8
 Author: M. Carlson, H. Pagès, P. Aboyoun, S. Falcon, M. Morgan,
 	D. Sarkar, M. Lawrence, V. Obenchain

diff --git a/R/coverageByTranscript.R b/R/coverageByTranscript.R
@@ -35,97 +35,206 @@
     ans
 }
 
+### OLD VERSION
+# ### Computing coverage by transcript (or CDS) of a set of ranges is an
+# ### operation that feels a lot like extracting transcript sequences from a
+# ### genome. Defined as an ordinary function for now.
+# coverageByTranscript <- function(x, transcripts, ignore.strand=FALSE)
+# {
+#     if (!is(transcripts, "GRangesList")) {
+#         transcripts <- try(exonsBy(transcripts, by="tx", use.names=TRUE),
+#                            silent=TRUE)
+#         if (is(transcripts, "try-error"))
+#             stop(wmsg("failed to extract the exon ranges ",
+#                       "from 'transcripts' with ",
+#                       "exonsBy(transcripts, by=\"tx\", use.names=TRUE)"))
+#     }
+#     if (!isTRUEorFALSE(ignore.strand))
+#         stop(wmsg("'ignore.strand' must be TRUE or FALSE"))
+#
+#   ## STEP 1 - Infer seqlengths that are missing from 'x' and 'transcripts'.
+#
+#     ## This will guarantee that subsetting the named RleList object
+#     ## representing the read coverage by the GRanges object representing
+#     ## the set of unique exons won't fail due to out-of-bounds ranges in
+#     ## the subscript. See STEP 3 below.
+#     seqinfo(x) <- .merge_seqinfo_and_infer_missing_seqlengths(x, transcripts)
+#
+#   ## STEP 2 - Compute unique exons ('uex').
+#
+#     ex <- unlist(transcripts, use.names=FALSE)
+#     ## We could simply do 'uex <- unique(ex)' here but we're going to need
+#     ## 'sm' and 'is_unique' later to compute the "reverse index" so we compute
+#     ## them now and use them to extract the unique exons. That way we hash
+#     ## 'ex' only once (the expensive operation).
+#     sm <- selfmatch(ex)  # uses a hash table internally
+#     is_unique <- sm == seq_along(sm)
+#     uex2ex <- which(is_unique)  # index of unique exons
+#     uex <- ex[uex2ex]  # unique exons
+#
+#   ## STEP 3 - Compute coverage for each unique exon ('uex_cvg').
+#
+#     #There doesn't seem to be much benefit in doing this.
+#     #x <- subsetByOverlaps(x, transcripts, ignore.strand=TRUE)
+#     if (ignore.strand) {
+#         cvg <- coverage(x)
+#         ## Because we've inferred the seqlengths that are missing from 'x'
+#         ## and 'transcripts' (see STEP 1 above), 'uex' should never contain
+#         ## out-of-bounds ranges i.e. the list elements in 'cvg' should always
+#         ## be Rle objects that are long enough with respect to the ranges
+#         ## in 'uex'.
+#         uex_cvg <- cvg[uex]  # parallel to 'uex'
+#     } else {
+#         x1 <- x[strand(x) %in% c("+", "*")]
+#         x2 <- x[strand(x) %in% c("-", "*")]
+#         cvg1 <- coverage(x1)
+#         cvg2 <- coverage(x2)
+#         is_plus_ex <- strand(uex) == "+"
+#         is_minus_ex <- strand(uex) == "-"
+#         if (!identical(is_plus_ex, !is_minus_ex))
+#             stop(wmsg("'transcripts' has exons on the * strand. ",
+#                       "This is not supported at the moment."))
+#         ## Because we've inferred the seqlengths that are missing from 'x'
+#         ## and 'transcripts' (see STEP 1 above), 'uex' should never contain
+#         ## out-of-bounds ranges i.e. the list elements in 'cvg1' and 'cvg2'
+#         ## should always be Rle objects that are long enough with respect to
+#         ## the ranges in 'uex'.
+#         uex_cvg <- cvg1[uex]
+#         uex_cvg[is_minus_ex] <- cvg2[uex[is_minus_ex]]
+#     }
+#
+#   ## STEP 4 - Flip coverage for exons on minus strand.
+#
+#     ## It feels like this is not as fast as it could be (the bottleneck being
+#     ## subsetting an Rle object which needs to be revisited at some point).
+#     uex_cvg <- revElements(uex_cvg, strand(uex) == "-")
+#
+#   ## STEP 5 - Compute coverage by original exon ('ex_cvg').
+#
+#     ex2uex <- (seq_along(sm) - cumsum(!is_unique))[sm]  # reverse index
+#     #stopifnot(identical(ex2uex[uex2ex], seq_along(uex2ex)))  # sanity
+#     #stopifnot(identical(ex2uex[sm], ex2uex))  # sanity
+#     #stopifnot(all(uex[ex2uex] == ex))  # sanity
+#
+#     ex_cvg <- uex_cvg[ex2uex]  # parallel go 'ex'
+#
+#   ## STEP 6 - Compute coverage of each transcript by concatenating coverage of
+#   ##          its exons.
+#
+#     ans <- IRanges:::regroupBySupergroup(ex_cvg, transcripts)
+#
+#   ## STEP 7 - Propagate 'mcols(transcripts)'.
+#
+#     mcols(ans) <- mcols(transcripts)
+#     ans
+# }
+
 ### Computing coverage by transcript (or CDS) of a set of ranges is an
 ### operation that feels a lot like extracting transcript sequences from a
 ### genome. Defined as an ordinary function for now.
-coverageByTranscript <- function(x, transcripts, ignore.strand=FALSE)
-{
-    if (!is(transcripts, "GRangesList")) {
-        transcripts <- try(exonsBy(transcripts, by="tx", use.names=TRUE),
-                           silent=TRUE)
-        if (is(transcripts, "try-error"))
-            stop(wmsg("failed to extract the exon ranges ",
-                      "from 'transcripts' with ",
-                      "exonsBy(transcripts, by=\"tx\", use.names=TRUE)"))
-    }
-    if (!isTRUEorFALSE(ignore.strand))
-        stop(wmsg("'ignore.strand' must be TRUE or FALSE"))
-
+coverageByTranscript <- function (x, transcripts, ignore.strand = FALSE,
+                                   weight = 1L) {
+  if (!is(transcripts, "GRangesList")) {
+    transcripts <- try(exonsBy(transcripts, by = "tx", use.names = TRUE),
+                       silent = TRUE)
+    if (is(transcripts, "try-error"))
+      stop(wmsg("failed to extract the exon ranges ",
+                "from 'transcripts' with ", "exonsBy(transcripts, by=\"tx\", use.names=TRUE)"))
+  }
+  if (!isTRUEorFALSE(ignore.strand))
+    stop(wmsg("'ignore.strand' must be TRUE or FALSE"))
   ## STEP 1 - Infer seqlengths that are missing from 'x' and 'transcripts'.
 
-    ## This will guarantee that subsetting the named RleList object
-    ## representing the read coverage by the GRanges object representing
-    ## the set of unique exons won't fail due to out-of-bounds ranges in
-    ## the subscript. See STEP 3 below.
-    seqinfo(x) <- .merge_seqinfo_and_infer_missing_seqlengths(x, transcripts)
+  ## This will guarantee that subsetting the named RleList object
+  ## representing the read coverage by the GRanges object representing
+  ## the set of unique exons won't fail due to out-of-bounds ranges in
+  ## the subscript. See STEP 3 below.
+  seqinfo(x) <- .merge_seqinfo_and_infer_missing_seqlengths(x, transcripts)
 
   ## STEP 2 - Compute unique exons ('uex').
 
-    ex <- unlist(transcripts, use.names=FALSE)
-    ## We could simply do 'uex <- unique(ex)' here but we're going to need
-    ## 'sm' and 'is_unique' later to compute the "reverse index" so we compute
-    ## them now and use them to extract the unique exons. That way we hash
-    ## 'ex' only once (the expensive operation).
-    sm <- selfmatch(ex)  # uses a hash table internally
-    is_unique <- sm == seq_along(sm)
-    uex2ex <- which(is_unique)  # index of unique exons
-    uex <- ex[uex2ex]  # unique exons
+  ex <- unlist(transcripts, use.names=FALSE)
+  ## We could simply do 'uex <- unique(ex)' here but we're going to need
+  ## 'sm' and 'is_unique' later to compute the "reverse index" so we compute
+  ## them now and use them to extract the unique exons. That way we hash
+  ## 'ex' only once (the expensive operation).
+  sm <- selfmatch(ex)  # uses a hash table internally
+  is_unique <- sm == seq_along(sm)
+  uex2ex <- which(is_unique)  # index of unique exons
+  uex <- ex[uex2ex]  # unique exons
 
   ## STEP 3 - Compute coverage for each unique exon ('uex_cvg').
 
-    #There doesn't seem to be much benefit in doing this.
-    #x <- subsetByOverlaps(x, transcripts, ignore.strand=TRUE)
-    if (ignore.strand) {
-        cvg <- coverage(x)
-        ## Because we've inferred the seqlengths that are missing from 'x'
-        ## and 'transcripts' (see STEP 1 above), 'uex' should never contain
-        ## out-of-bounds ranges i.e. the list elements in 'cvg' should always
-        ## be Rle objects that are long enough with respect to the ranges
-        ## in 'uex'.
-        uex_cvg <- cvg[uex]  # parallel to 'uex'
+  #There doesn't seem to be much benefit in doing this.
+  #x <- subsetByOverlaps(x, transcripts, ignore.strand=TRUE)
+
+  ## Fix GAlignments not allowing mcol weight, remove when they fix it
+  ## in GAlignments definition of coverage.
+  ## Because we've inferred the seqlengths that are missing from 'x'
+  ## and 'transcripts' (see STEP 1 above), 'uex' should never contain
+  ## out-of-bounds ranges i.e. the list elements in 'cvg' should always
+  ## be Rle objects that are long enough with respect to the ranges
+  ## in 'uex'.
+  if ((is(x, "GAlignments") | is(x, "GAlignmentPairs"))
+      & is.character(weight)) {
+    if (!(weight %in% colnames(mcols(x))))
+      stop("weight is character and not mcol of x,",
+           " check spelling of weight.")
+    weight <- mcols(x)[, weight]
+    x <- grglist(x) # convert to grl
+    weight = weight[groupings(x)] # repeat weight per group
+  }
+
+  if (ignore.strand) {
+    cvg <- coverage(x, weight = weight)
+    uex_cvg <- cvg[uex]
+  }
+  else {
+    pluss <- BiocGenerics::`%in%`(strand(x), c("+", "*"))
+    minus <- BiocGenerics::`%in%`(strand(x), c("-", "*"))
+    x1 <- x[pluss]
+    x2 <- x[minus]
+    if (length(weight) > 1) {
+      # Add unlist in case of GAlignments
+      cvg1 <- coverage(x1, weight = weight[as.logical(unlist(pluss))])
+      cvg2 <- coverage(x2, weight = weight[as.logical(unlist(minus))])
     } else {
-        x1 <- x[strand(x) %in% c("+", "*")]
-        x2 <- x[strand(x) %in% c("-", "*")]
-        cvg1 <- coverage(x1)
-        cvg2 <- coverage(x2)
-        is_plus_ex <- strand(uex) == "+"
-        is_minus_ex <- strand(uex) == "-"
-        if (!identical(is_plus_ex, !is_minus_ex))
-            stop(wmsg("'transcripts' has exons on the * strand. ",
-                      "This is not supported at the moment."))
-        ## Because we've inferred the seqlengths that are missing from 'x'
-        ## and 'transcripts' (see STEP 1 above), 'uex' should never contain
-        ## out-of-bounds ranges i.e. the list elements in 'cvg1' and 'cvg2'
-        ## should always be Rle objects that are long enough with respect to
-        ## the ranges in 'uex'.
-        uex_cvg <- cvg1[uex]
-        uex_cvg[is_minus_ex] <- cvg2[uex[is_minus_ex]]
+      cvg1 <- coverage(x1, weight = weight)
+      cvg2 <- coverage(x2, weight = weight)
     }
 
+    is_plus_ex <- strand(uex) == "+"
+    is_minus_ex <- strand(uex) == "-"
+    if (!identical(is_plus_ex, !is_minus_ex))
+      stop(wmsg("'transcripts' has exons on the * strand. ",
+                "This is not supported at the moment."))
+    uex_cvg <- cvg1[uex]
+    uex_cvg[is_minus_ex] <- cvg2[uex[is_minus_ex]]
+  }
   ## STEP 4 - Flip coverage for exons on minus strand.
 
-    ## It feels like this is not as fast as it could be (the bottleneck being
-    ## subsetting an Rle object which needs to be revisited at some point).
-    uex_cvg <- revElements(uex_cvg, strand(uex) == "-")
+  ## It feels like this is not as fast as it could be (the bottleneck being
+  ## subsetting an Rle object which needs to be revisited at some point).
+  uex_cvg <- revElements(uex_cvg, strand(uex) == "-")
 
   ## STEP 5 - Compute coverage by original exon ('ex_cvg').
 
-    ex2uex <- (seq_along(sm) - cumsum(!is_unique))[sm]  # reverse index
-    #stopifnot(identical(ex2uex[uex2ex], seq_along(uex2ex)))  # sanity
-    #stopifnot(identical(ex2uex[sm], ex2uex))  # sanity
-    #stopifnot(all(uex[ex2uex] == ex))  # sanity
+  ex2uex <- (seq_along(sm) - cumsum(!is_unique))[sm]  # reverse index
+  #stopifnot(identical(ex2uex[uex2ex], seq_along(uex2ex)))  # sanity
+  #stopifnot(identical(ex2uex[sm], ex2uex))  # sanity
+  #stopifnot(all(uex[ex2uex] == ex))  # sanity
 
-    ex_cvg <- uex_cvg[ex2uex]  # parallel go 'ex'
+  ex_cvg <- uex_cvg[ex2uex]  # parallel go 'ex'
 
   ## STEP 6 - Compute coverage of each transcript by concatenating coverage of
   ##          its exons.
 
-    ans <- IRanges:::regroupBySupergroup(ex_cvg, transcripts)
+  ans <- IRanges:::regroupBySupergroup(ex_cvg, transcripts)
 
   ## STEP 7 - Propagate 'mcols(transcripts)'.
 
-    mcols(ans) <- mcols(transcripts)
-    ans
+  mcols(ans) <- mcols(transcripts)
+  ans
 }
 
 

diff --git a/man/coverageByTranscript.Rd b/man/coverageByTranscript.Rd
@@ -14,7 +14,7 @@
 }
 
 \usage{
-coverageByTranscript(x, transcripts, ignore.strand=FALSE)
+coverageByTranscript(x, transcripts, ignore.strand=FALSE, weight = 1L)
 
 pcoverageByTranscript(x, transcripts, ignore.strand=FALSE, ...)
 }
@@ -65,6 +65,13 @@ pcoverageByTranscript(x, transcripts, ignore.strand=FALSE, ...)
     the range to contribute coverage to the exon. If TRUE then the strand
     is ignored.
   }
+  \item{weight}{
+    weight assigns a weight to each range in x.
+
+If x is an IntegerRanges or Views object: each of these arguments must be an integer or numeric vector parallel to x (will get recycled if necessary). Alternatively, each of these arguments can also be specified as a single string naming a metadata column in x (i.e. a column in mcols(x)) to be used as the shift (or weight) vector. Note that when x is an IPos object, each of these arguments can only be a single number.
+
+If x is an IntegerRangesList object: each of these arguments must be a numeric vector or list-like object of the same length as x (will get recycled if necessary). If it's a numeric vector, it's first turned into a list with as.list. After recycling, each list element shift[[i]] (or weight[[i]]) must be an integer or numeric vector parallel to x[[i]] (will get recycled if necessary).
+  }
   \item{...}{
     Additional arguments passed to the internal call to
     \code{\link[GenomicRanges]{grglist}()}.