diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100755 index 0000000..64837d1 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,17 @@ +Package: intervals +Version: 0.15.0 +Date: 2014-09-19 +Type: Package +Title: Tools for working with points and intervals +Author: Richard Bourgon +Maintainer: Edzer Pebesma +Depends: R (>= 2.9.0), methods +Suggests: +Description: Tools for working with and comparing sets of points and intervals. +License: Artistic-2.0 +LazyLoad: yes +URL: http://github.com/edzer/intervals +Packaged: 2013-11-03 17:10:26 UTC; hornik +NeedsCompilation: yes +Repository: CRAN +Date/Publication: 2013-11-03 18:23:55 diff --git a/MD5 b/MD5 new file mode 100644 index 0000000..797f767 --- /dev/null +++ b/MD5 @@ -0,0 +1,62 @@ +e3b52f25c29216dab4411db63fc9924d *DESCRIPTION +8fad5c1380be8ac95292709b667580c0 *NAMESPACE +395419ebcf2325cd51382cf869b7ce5c *NEWS +5d3f4dfbcb8c9119dd1562ccbc412467 *R/Intervals-class.R +b324268152a9423e6c65c0c988d6c54b *R/as.matrix.R +38bfefc3e9a840c8c923c89880c1587c *R/c.R +34cd12ab4fe68a81147862e97b15949f *R/close_intervals-methods.R +b3eb24412f49233d264cfa4b59d7e2e4 *R/closed-methods.R +ff9d706b9f496d57e129b6900b73f897 *R/clusters.R +bf6fd3b2e3d8a0423e0d5cc56886861b *R/distance_to_nearest-methods.R +eab17d98c078805adf0e123cb637cc15 *R/empty-methods.R +fc7e40041f4ae6b2796830c2fa03fd59 *R/expand-methods.R +af2a9dcabf565023997fdd01481ebd1c *R/head-methods.R +36b33045bde48137c282a6019157e0c9 *R/interval_complement-methods.R +90cc9b22d5562826c992b7e8afb189d6 *R/interval_difference-methods.R +521baba587949ec360abde100182c525 *R/interval_included-methods.R +1564356296f5c31f7f4b8e2af3f81198 *R/interval_intersection-methods.R +22e0554a7dd6017d28ed6af0b739a005 *R/interval_overlap-methods.R +c4db0a892e9af14025c7140d24cbc146 *R/interval_union-methods.R +606491d4aac135785542a9ccfc8a3070 *R/is.na-methods.R +c0676ed4c23fb7e3ddbc2943706e449e *R/plot.Intervals_virtual.R +c98be2c3ff6e8bb290722d3b0d7cb40f *R/reduce-methods.R +dd2d5638db27a01280f3ba7b51499ca2 *R/show-methods.R +9fb026e61d588212f18e5ce675e25b6d *R/size-methods.R +f60a5d3db5f7ffdc7b2e2f89ad73aa8e *R/split.R +1f063de2558da95b89939fe6ded3cb17 *R/t-methods.R +269d9904ee4f9077977318a06c0ea80b *R/type-methods.R +68f0dd826dd4eacc4f1dd5b9563ca0a6 *R/which_nearest-methods.R +27f6ca46de01553328e15e76bebe5643 *build/vignette.rds +92331b5a7c5d0c6c690134be22d279fd *data/sgd.rdata +a2f1dae28510f0ca5a3cb5089741ff79 *inst/doc/intervals_overview.R +61b77aed42f5d4a9517527dfd20dcd9f *inst/doc/intervals_overview.Rnw +746f4bf572ae118beddaa1361890addd *inst/doc/intervals_overview.pdf +35de4238afd3ef852acdffeaee785700 *man/Intervals-class.Rd +0177e9e1b85eacda57458bfe8298c35c *man/Intervals_virtual-class.Rd +e3e3202f6d824d8f23a435eb9ebf58d1 *man/Intervals_virtual_or_numeric-class.Rd +fa157f068564f9751e7a820fd76113cc *man/as.matrix.Intervals.Rd +fbfe10a5da6b8e08f574a14e7f6a3e5e *man/c.Intervals.Rd +90ca50aa8682d0b5d29a7296a7ae6452 *man/close_intervals-methods.Rd +2abf435889e72a82124c8500f4be5c98 *man/clusters-methods.Rd +75da89fdf7d4e884ff0d8d194d6e1f01 *man/distance_to_nearest-methods.Rd +1dba3b2837c85159bbaf2688ae0adbad *man/empty-methods.Rd +e382cd554673c0068670a67278c6115a *man/expand-methods.Rd +096699acb78d27aa015bfcf956f4f557 *man/interval_complement-methods.Rd +d5e010258d8a735cf40deb9f5a46dfaf *man/interval_difference-methods.Rd +f294dea1388f4faa935666279c852ceb *man/interval_included-methods.Rd +5795101b91efd8d1858b2a883716d9da *man/interval_intersection-methods.Rd +0800a00a632882b9723a3df5d80d3160 *man/interval_overlap-methods.Rd +8c3d9577d7b5ee414115951dc669f921 *man/interval_union-methods.Rd +e4fe3b5b457265fdfc8bf6b6ff82ae8b *man/intervals-package.Rd +a0f85d87af5844eb046f54e67d62331a *man/plot.Intervals.Rd +5c627704aac7e9dcaea89534d6a7a6fe *man/reduce-methods.Rd +27dce9eb576a05a2deaa48fd10c8c4d2 *man/sgd.Rd +c02935db7a4e097bf4316e967f41cb67 *man/size-methods.Rd +4a9e6b81bd312461016f35fa3b3909b0 *man/split.Intervals_virtual.Rd +32a4b517594cd57c13e31a54ce542808 *man/which_nearest-methods.Rd +8e00c2c424c18850880d2273098bfa60 *src/Endpoint.cpp +7a32b00c4f944861c37a65658958b93c *src/Endpoint.h +21e3c6ce0a8cb6f07ce6e0e0cc049316 *src/plot_overlap.cpp +07904a7e49cad3fa6b13cb957818460b *src/reduce.cpp +5419ee07a71faefc60ec68a9dd67d451 *src/which_nearest.cpp +477a72cf673ae334fb367f84bc7870e0 *tests/intervals_test_code.R diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..367056c --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,65 @@ +######## Compiled code + +useDynLib( "intervals" ) + +######## Imports + +# Class, method, and regular imports from the methods package are not listed +# here, since we depend on the package. + +import("methods") + +importFrom( "utils", "head", "tail" ) + +######## Exports + +export( "Intervals", "Intervals_full" ) + +S3method( "split", "Intervals_virtual" ) +S3method( "as.matrix", "Intervals_virtual" ) +S3method( "c", "Intervals" ) +S3method( "c", "Intervals_full" ) +S3method( "plot", "Intervals" ) +S3method( "plot", "Intervals_full" ) +exportClasses( + "Intervals", + "Intervals_full", + "Intervals_virtual", + "Intervals_virtual_or_numeric" +) + +exportMethods( + "[", + "[<-", + "as.matrix", + "adjust_closure", + "close_intervals", + "closed", + "closed<-", + "clusters", + "coerce", + "contract", + "distance_to_nearest", + "empty", + "expand", + "head", + "initialize", + "interval_complement", + "interval_difference", + "interval_included", + "interval_intersection", + "interval_overlap", + "interval_union", + "is.na", + "open_intervals", + "plot", + "reduce", + "show", + "size", + "split", + "t", + "tail", + "type", + "type<-", + "which_nearest" +) diff --git a/NEWS b/NEWS new file mode 100644 index 0000000..6e0a514 --- /dev/null +++ b/NEWS @@ -0,0 +1,74 @@ + ******** VERSION 0.13.3 ******** + +BUG FIXES + + - The S3 plot() method for class Intervals was ignoring the y argument. (The + method for class Intervals_full did, however, use the argument properly.) + This has been fixed. + + ******** VERSION 0.13.2 ******** + +BUG FIXES + + - The initialization method for the Intervals class threw errors when empty + objects were created. This has been fixed. + + ******** VERSION 0.13.0 ******** +NEW FEATURES + + - New interval_included methods, similar to interval_overlap, but requiring + full inclusion. + + ******** VERSION 0.12.3 ******** +NEW FEATURES + + - S3 as.matrix methods for Intervals and Intervals_full objects. + + ******** VERSION 0.12.0 ******** +NEW FEATURES + + - S3 plot methods for Intervals and Intervals_full objects. + + ******** VERSION 0.11.1 ******** + +SIGNIFICANT USER-VISIBLE CHANGES + + - The interval_overlap and distance_to_nearest methods are now just wrappers + for which_nearest. Because of this, the argument names for interval_overlap, + have been changed from 'target' and 'query' to 'from' and 'to', + respectively, in order to match the other two functions. Argument order is + the same as before, but functions which call arguments by name and use the + old names will now generate an (informative) error message. + +BUG FIXES + + - The C++ code for interval_overlap correctly handles all varieties of + endpoint closure, but does not handle cases like (0,2) vs. (1,3) over + Z. (These intervals do not overlap, even though the right endpoint of the + first interval is nominally after the left endpoint of the second interval.) + This should have been handled by the R wrapper -- by applying + close_intervals before passing on to compiled code -- but this step had + previously been omitted. This is now fixed. + +NEW FEATURES + + - The interval_overlap, which_nearest, and distance_to_nearest methods now all + accept a numeric vector in both the 'from' and 'to' arguments. + +IMPROVEMENTS + + - The distance_to_nearest methods were previously based on old code using + approxfun(). They now work of the same algorithm used for interval_overlap + and which_nearest. This algorithm simultaneously generates all three types + of information: identity of overlaps and nearest neighbors, as well as + distance to nearest neighbors. + + ******** VERSION 0.11.0 ******** + +NEW FEATURES + + - Adding a NEWS file! + + - Added which_nearest methods, as a complement to distance_to_nearest. The C++ + code previously used for interval_overlap calculations was augmented to + provide this functionality. \ No newline at end of file diff --git a/R/Intervals-class.R b/R/Intervals-class.R new file mode 100644 index 0000000..7a260bf --- /dev/null +++ b/R/Intervals-class.R @@ -0,0 +1,351 @@ +# We define two classes for two-column interval endpoint matrices. The basic class +# has a two-element boolean vector indicating whether endpoints are closed or +# not. The full class caries a matrix with one boolean per endpoint, permitting +# full control. + + + + +######## Class definitions + +#### Virtual + +# (R 2.6.2) If I add the "VIRTUAL" tag to the representation, I am not able to +# extend this class! I believe this arises because I am already extending a base +# class, but I am not sure. This tag would be more appropriate, but I leave it +# off... + +setClass( + "Intervals_virtual", + representation( type = "character" ), + prototype( + matrix( 0, 0, 2 ), + type = "R" + ), + contains = "matrix", + validity = function( object ) { + # Check main matrix + if ( !is.double( object@.Data ) || ncol( object@.Data ) != 2 ) + return( "The 'Intervals' classes are based on two-column, numeric matrices." ) + # Check 'type' string + if ( length( object@type ) != 1 || !( object@type %in% c( "Z", "R" ) ) ) + return( "The 'type' slot should be 'Z' or 'R'." ) + # For type 'Z', check for integral endpoints + if ( object@type == "Z" && !all( object@.Data[ is.finite( object@.Data ) ] %% 1 == 0 ) ) + return( "Non-integer-valued endpoints not permitted for type 'Z'." ) + # Check for valid intervals + if ( any( object@.Data[,2] < object@.Data[,1], na.rm = TRUE ) ) + return( "One or more intervals with second endpoint before first." ) + return( TRUE ) + } + ) + +setMethod( + "initialize", + signature( "Intervals_virtual" ), + function( .Object, .Data, ... ) { + if ( missing( .Data ) ) callNextMethod( .Object, ... ) + else { + if ( is.data.frame( .Data ) ) + .Data <- as.matrix( .Data ) + if ( !is.matrix( .Data ) ) + .Data <- matrix( .Data, ncol = 2 ) + if ( is.integer( .Data ) ) { + # warning( "Converting endpoints from 'integer' to 'numeric' data type. See class documentation.", call. = FALSE ) + .Data <- matrix( as.numeric( .Data ), nrow( .Data ), ncol( .Data ) ) + } + callNextMethod( .Object, .Data, ... ) + } + } + ) + +#### Intervals + +# Common endpoint closure state for all intervals + +setClass( + "Intervals", + representation( closed = "logical" ), + prototype( closed = c( TRUE, TRUE ) ), + contains = "Intervals_virtual", + validity = function( object ) { + # Check 'closed' slot + if ( length( object@closed ) != 2 || any( is.na( object@closed ) ) ) + return( "The 'closed' slot should be a logical of length 2. NA values are not permitted." ) + return( TRUE ) + } + ) + +setMethod( + "initialize", + signature( "Intervals" ), + function( .Object, .Data, closed, ... ) { + if ( missing( .Data ) ) + callNextMethod( .Object, ... ) + else { + if ( missing( closed ) ) + callNextMethod( .Object, .Data, ... ) + else { + if ( length( closed ) == 1 ) closed <- c( closed, closed ) + callNextMethod( .Object, .Data, closed = closed, ... ) + } + } + } + ) + +#### Intervals_full + +# Full control of endpoint closure. Note that if the 'closed' slot is omitted, +# we use an 'initialize' method to create an appropriately sized matrix of TRUE +# values. We also permit vector input, with recycling, for the 'closed' slot. + +setClass( + "Intervals_full", + representation( closed = "matrix" ), + prototype( closed = matrix( TRUE, 0, 2 ) ), + contains = "Intervals_virtual", + validity = function( object ) { + # Check 'closed' slot + if ( + !is.logical( object@closed ) || + dim( object@.Data ) != dim( object@closed ) || + any( is.na( object@closed ) ) + ) + return( "The 'closed' slot should be a logical matrix with the same dimensions as the main endpoints matrix. NA values are not permitted." ) + return( TRUE ) + } + ) + +setMethod( + "initialize", + signature( "Intervals_full" ), + function( .Object, .Data, closed, ... ) { + if ( missing( .Data ) ) + callNextMethod( .Object, ... ) + else { + if ( !is.matrix( .Data ) ) + .Data <- matrix( .Data, ncol = 2 ) + if ( missing( closed ) ) + closed <- matrix( TRUE, nrow( .Data ), 2 ) + if ( is.vector( closed ) ) { + if ( length( closed ) > 2 ) + stop( "The 'closed' argument should be a matrix, or a vector of length 1 or 2." ) + closed <- matrix( + if ( nrow( .Data ) == 0 ) logical() else closed, + nrow( .Data ), + 2, + byrow = TRUE + ) + } + callNextMethod( .Object, .Data, closed = closed, ... ) + } + } + ) + + + + +######## Constructor functions + +Intervals <- function( ... ) + new( "Intervals", ... ) + +Intervals_full <- function( ... ) + new( "Intervals_full", ... ) + + + + +######## Subsetting + +setMethod( + "[", + signature( "Intervals" ), + function( x, i, j, ..., drop ) { + if ( missing(i) ) i <- rep( TRUE, nrow(x) ) + if ( missing(j) ) { + # Preserve class. Note that both [i,] and [i] syntax subset rows. + if ( any( is.na( i ) ) ) + warning( "NA indices encountered.", call. = FALSE ) + x@.Data <- x@.Data[i,,drop=FALSE] + return( x ) + } + else return( x@.Data[i,j] ) + } + ) + +# Note: row name handling for matices is completely inadequate for our purposes +# here. Lots of obviously desirable behavior (like setting rownames(x)[i] when x +# doesn't yet have rownames) tends to produce errors. + +setMethod( + "[<-", + signature( x = "Intervals", i = "ANY", j = "missing", value = "Intervals_virtual" ), + function( x, i, j, value ) { + #### Error checking + if ( type(x) != type(value) ) + stop( "Types do not match (Z vs. R)." ) + if ( is.character(i) ) i <- match( i, rownames( x ) ) + if ( any( is.na( i ) ) ) + stop( "Cannot assign to NA indices or row names which do not exist." ) + n <- length( (1:nrow(x))[i] ) + if ( n != nrow( value ) ) + stop( "Replacement object is of the wrong size." ) + #### Coerce up? + coerce_x <- FALSE + if ( is( value, "Intervals_full" ) ) coerce_x <- TRUE + else { + matches <- all( closed(x) == closed(value) ) + if ( !matches ) { + if ( type(x) == "Z" ) value <- adjust_closure( value, closed(x)[1], closed(x)[2] ) + else coerce_x <- TRUE + } + } + if ( coerce_x ) { + warning( "Coercion to 'Intervals_full' required.", call. = FALSE ) + x <- as( x, "Intervals_full" ) + x[ i, ] <- value + return(x) + } + #### Data + x@.Data[i,] <- value@.Data + #### Rownames + has_names_x <- !is.null( rownames(x) ) + has_names_value <- !is.null( rownames(value) ) + if ( has_names_x ) { + if ( has_names_value ) rownames(x)[i] <- rownames(value) + else rownames(x)[i] <- "" + } + else { + if ( has_names_value ) { + rownames(x) <- rep( "", nrow(x) ) + rownames(x)[i] <- rownames(value) + } + } + return(x) + } + ) + +setMethod( + "[", + signature( "Intervals_full" ), + function( x, i, j, ..., drop ) { + if ( missing(i) ) i <- rep( TRUE, nrow(x) ) + if ( missing(j) ) { + # Preserve class. Note that both [i,] and [i] syntax subset rows. + if ( is.character(i) ) i <- match( i, rownames( x ) ) + if ( any( is.na( i ) ) ) + warning( "NA indices encountered.", call. = FALSE ) + x@.Data <- x@.Data[i,,drop=FALSE] + x@closed <- x@closed[i,,drop=FALSE] + # We may have NAs in closed if present in i, if any(is.na(i)) == TRUE + x@closed[ is.na(x@closed) ] <- TRUE + return( x ) + } + else return( x@.Data[i,j] ) + } + ) + +setMethod( + "[<-", + signature( x = "Intervals_full", i = "ANY", j = "missing", value = "Intervals_virtual" ), + function( x, i, j, value ) { + #### Error checking + if ( type(x) != type(value) ) + stop( "Types do not match (Z vs. R)." ) + if ( is.character(i) ) i <- match( i, rownames( x ) ) + if ( any( is.na( i ) ) ) + stop( "Cannot assign to NA indices or row names which do not exist." ) + n <- length( (1:nrow(x))[i] ) + if ( n != nrow( value ) ) + stop( "Replacement object is of the wrong size." ) + #### Data + x@.Data[i,] <- value@.Data + if ( is( value, "Intervals" ) ) + x@closed[i,] <- matrix( value@closed, n, 2, byrow = TRUE ) + else + x@closed[i,] <- value@closed + #### Rownames + has_names_x <- !is.null( rownames(x) ) + has_names_value <- !is.null( rownames(value) ) + if ( has_names_x ) { + if ( has_names_value ) rownames(x)[i] <- rownames(value) + else rownames(x)[i] <- "" + } + else { + if ( has_names_value ) { + rownames(x) <- rep( "", nrow(x) ) + rownames(x)[i] <- rownames(value) + } + } + return(x) + } + ) + + + + +######## Coercion + +setMethod( + "coerce", + signature( from = "Intervals", to = "Intervals_full" ), + function( from, to, strict ) { + new( + "Intervals_full", + from@.Data, + type = type( from ), + closed = cbind( + rep( closed(from)[1], nrow( from ) ), + rep( closed(from)[2], nrow( from ) ) + ) + ) + } + ) + +setMethod( + "coerce", + signature( from = "Intervals_full", to = "Intervals" ), + function( from, to, strict ) { + if ( nrow( from ) == 0 ) new_closed <- rep( TRUE, 2 ) + else new_closed <- closed( from )[1,] + if ( !all( t( closed( from ) ) == new_closed ) ) + stop( "Intervals do not all have the same endpoint closure." ) + new( + "Intervals", + from@.Data, + type = type( from ), + closed = new_closed + ) + } + ) + +setMethod( + "coerce", + signature( from = "Intervals_virtual", to = "character" ), + function( from, to, strict ) { + if ( nrow( from ) == 0 ) + return( character() ) + else { + cl <- closed( from ) + # So we only write main code once + if ( is( from, "Intervals" ) ) + cl <- matrix( cl, nrow(from), 2, byrow = TRUE ) + result <- paste( + ifelse( cl[,1], "[", "(" ), + from[,1], ", ", from[,2], + ifelse( cl[,2], "]", ")" ), + sep = "" + ) + names( result ) <- rownames( from ) + return( result ) + } + } + ) + + + + +######## Union + +setClassUnion( "Intervals_virtual_or_numeric", c( "Intervals_virtual", "numeric" ) ) diff --git a/R/as.matrix.R b/R/as.matrix.R new file mode 100644 index 0000000..5c7cec6 --- /dev/null +++ b/R/as.matrix.R @@ -0,0 +1,5 @@ +# S3 methods + +as.matrix.Intervals_virtual <- function( x, ... ) as( x, "matrix" ) + +setMethod( "as.matrix", "Intervals_virtual", as.matrix.Intervals_virtual ) diff --git a/R/c.R b/R/c.R new file mode 100644 index 0000000..c062421 --- /dev/null +++ b/R/c.R @@ -0,0 +1,63 @@ +# As of 2.8.1, we are still not able to write S4 methods for rbind(). Our +# original plan was to use combine() as in biobase, but this seems to be causing +# clashes for the generic function when both packages are loaded. Improved S4 +# support for functions whose first argument is ... seems to be on the way, so +# we'll just do S3 for now, and move up to S4 once it's implemented. Note that +# as of 2.8.1, rbind.data.frame() still exists and is used for data frames. + +# Update! S3 method dispatch for rbind() is non-standard (see its documentation) +# and it produced unexpected dispatch to the matrix method when presented with a +# mix of Intervals and Intervals_full objects. As a consequence, we switched to +# c(), which uses standard S3 dispatch. + +c.Intervals <- function( ... ) { + args <- list(...) + # Drop NULL arguments + if ( any( sapply( args, is.null ) ) ) + args <- args[ !sapply( args, is.null ) ] + # Check if we should just return a list + classes <- sapply( args, class ) + if ( !all( classes %in% c( "Intervals", "Intervals_full" ) ) ) + return( list( ... ) ) + same_class <- all( classes == "Intervals" ) + # We are in fact dealing with intervals only... + if ( !all( sapply( args, type ) == type( args[[1]] ) ) ) + stop( "All arguments should have the same 'type' slot." ) + # Check for common closure + same_closed <- all( sapply( args[-1], function(x) identical( closed( args[[1]] ), closed( x ) ) ) ) + # Coerce up if necessary + if ( !same_class || ( type( args[[1]] ) == "R" & !same_closed ) ) { + warning( "Coercion to 'Intervals_full' required.", call. = FALSE ) + return( do.call( c, lapply( args, as, "Intervals_full" ) ) ) + } + # Convert to common closure for Z + if ( type( args[[1]] ) == "Z" & !same_closed ) + args <- lapply( + args, + adjust_closure, + close_left = closed( args[[1]] )[1], + close_right = closed( args[[1]] )[2] + ) + result <- args[[1]] + result@.Data <- do.call( rbind, lapply( args, function(x) x@.Data ) ) + return( result ) +} + +c.Intervals_full <- function( ... ) { + args <- list(...) + if ( any( sapply( args, is.null ) ) ) + args <- args[ !sapply( args, is.null ) ] + classes <- sapply( args, class ) + if ( !all( classes %in% c( "Intervals", "Intervals_full" ) ) ) + return( list( ... ) ) + if ( !all( sapply( args, type ) == type( args[[1]] ) ) ) + stop( "All arguments should have the same 'type' slot." ) + if ( !all( classes == "Intervals_full" ) ) { + warning( "Coercion to 'Intervals_full' required.", call. = FALSE ) + args <- lapply( args, as, "Intervals_full" ) + } + result <- args[[1]] + result@.Data <- do.call( rbind, lapply( args, function(y) y@.Data ) ) + closed(result) <- do.call( rbind, lapply( args, closed ) ) + return(result) +} diff --git a/R/close_intervals-methods.R b/R/close_intervals-methods.R new file mode 100644 index 0000000..fdde110 --- /dev/null +++ b/R/close_intervals-methods.R @@ -0,0 +1,57 @@ +setGeneric( "close_intervals", def = function(x) standardGeneric( "close_intervals" ) ) + +setMethod( + "close_intervals", + signature( "Intervals_virtual" ), + function( x ) adjust_closure( x, close_left = TRUE, close_right = TRUE ) + ) + +setGeneric( "open_intervals", def = function(x) standardGeneric( "open_intervals" ) ) + +setMethod( + "open_intervals", + signature( "Intervals_virtual" ), + function( x ) adjust_closure( x, close_left = FALSE, close_right = FALSE ) + ) + +setGeneric( "adjust_closure", def = function(x, ...) standardGeneric( "adjust_closure" ) ) + +setMethod( + "adjust_closure", + signature( "Intervals" ), + function(x, close_left = TRUE, close_right = TRUE) { + if ( type(x) == "R" ) + stop( "Only applicable to type 'Z'." ) + if ( any( empty(x), na.rm = TRUE ) ) { + warning( "Empty intervals encountered and removed.", call. = FALSE ) + x <- x[ is.na(x) | !empty(x), ] + } + if ( !closed(x)[1] && close_left ) x[,1] <- x[,1] + 1 + if ( closed(x)[1] && !close_left ) x[,1] <- x[,1] - 1 + if ( !closed(x)[2] && close_right ) x[,2] <- x[,2] - 1 + if ( closed(x)[2] && !close_right ) x[,2] <- x[,2] + 1 + closed(x) <- c( close_left, close_right ) + return( x ) + } + ) + +setMethod( + "adjust_closure", + signature( "Intervals_full" ), + function(x, close_left = TRUE, close_right = TRUE) { + if ( type(x) == "R" ) + stop( "Only applicable to type 'Z'." ) + if ( any( empty(x), na.rm = TRUE ) ) { + warning( "Empty intervals encountered and removed.", call. = FALSE ) + x <- x[ is.na(x) | !empty(x), ] + } + # Left side + if ( close_left ) x[ !closed(x)[,1], 1 ] <- x[ !closed(x)[,1], 1 ] + 1 + else x[ closed(x)[,1], 1 ] <- x[ closed(x)[,1], 1 ] - 1 + # Right side + if ( close_right ) x[ !closed(x)[,2], 2 ] <- x[ !closed(x)[,2], 2 ] - 1 + else x[ closed(x)[,2], 2 ] <- x[ closed(x)[,2], 2 ] + 1 + closed(x) <- c( close_left, close_right ) + return( x ) + } + ) diff --git a/R/closed-methods.R b/R/closed-methods.R new file mode 100644 index 0000000..0ece401 --- /dev/null +++ b/R/closed-methods.R @@ -0,0 +1,40 @@ +setGeneric( "closed", function(x) standardGeneric( "closed" ) ) + +setMethod( + "closed", + signature( "Intervals_virtual" ), + function( x ) x@closed + ) + +setGeneric( "closed<-", function( x, value ) standardGeneric( "closed<-" ) ) + +setReplaceMethod( + "closed", "Intervals", + function( x, value ) { + if ( !is.vector( value ) || !( length( value ) %in% 1:2 ) ) + stop( "The 'closed' argument should be a vector of length 1 or 2." ) + x@closed[ 1:2 ] <- value + return(x) + } + ) + +setReplaceMethod( + "closed", "Intervals_full", + function( x, value ) { + error_msg <- "The 'value' argument should be a matrix, or a vector of length 1 or 2." + if ( is.vector( value ) ) { + if ( length( value ) > 2 ) + stop( error_msg ) + value <- matrix( + if ( nrow( x ) == 0 ) logical() else value, + nrow( x ), + 2, + byrow = TRUE + ) + } + if ( !is.matrix( value ) || nrow( value ) != nrow( x ) || ncol( value ) != 2 ) + stop( error_msg ) + x@closed <- value + return(x) + } + ) diff --git a/R/clusters.R b/R/clusters.R new file mode 100644 index 0000000..7d2d503 --- /dev/null +++ b/R/clusters.R @@ -0,0 +1,34 @@ +setGeneric( "clusters", function( x, ... ) standardGeneric( "clusters" ) ) + +setMethod( + "clusters", + signature( "numeric" ), + function( x, w, which = FALSE, check_valid = TRUE ) { + if ( is.integer(x) ) x <- as.numeric(x) + regions <- reduce( Intervals( cbind( x, x + w ), type = "R" ), check_valid ) + clusters <- interval_overlap( regions, x, check_valid ) + clusters <- clusters[ sapply( clusters, length ) > 1 ] + if ( which ) return( clusters ) + else return( lapply( clusters, function(i) x[i] ) ) + } + ) + +setMethod( + "clusters", + signature( "Intervals_virtual" ), + function( x, w, which = FALSE, check_valid = TRUE ) { + if ( type(x) == "Z" && ( w %% 1 != 0 ) ) + stop( "Non-integer 'w' supplied for intervals over Z.", call. = FALSE ) + regions <- reduce( + new( + class(x), + cbind( x[,1], x[,2] + w ), closed = closed(x), type = type(x) + ), + check_valid + ) + clusters <- interval_overlap( regions, x, check_valid ) + clusters <- clusters[ sapply( clusters, length ) > 1 ] + if ( which ) return( clusters ) + else return( lapply( clusters, function(i) x[i,] ) ) + } + ) diff --git a/R/distance_to_nearest-methods.R b/R/distance_to_nearest-methods.R new file mode 100644 index 0000000..511dc8b --- /dev/null +++ b/R/distance_to_nearest-methods.R @@ -0,0 +1,11 @@ +setGeneric( "distance_to_nearest", function( from, to, ... ) standardGeneric( "distance_to_nearest" ) ) + +setMethod( + "distance_to_nearest", + signature( "Intervals_virtual_or_numeric", "Intervals_virtual_or_numeric" ), + function( from, to, check_valid = TRUE ) { + result <- which_nearest( from, to, check_valid )$distance_to_nearest + names( result ) <- rownames( from ) + return( result ) + } + ) diff --git a/R/empty-methods.R b/R/empty-methods.R new file mode 100644 index 0000000..d122d5a --- /dev/null +++ b/R/empty-methods.R @@ -0,0 +1,32 @@ +setGeneric( "empty", def = function(x) standardGeneric( "empty" ) ) + +setMethod( + "empty", + signature( "Intervals" ), + function(x) { + result <- rep( FALSE, nrow(x) ) + result[ is.na( x[,1] ) | is.na( x[,2] ) ] <- NA + if ( !all( closed(x) ) ) { + # Valid objects have x[,1] <= x[,2], so we only check this case. + result[ x[,1] == x[,2] ] <- TRUE + if ( type(x) == "Z" && !any( closed( x ) ) ) + result[ x[,1] + 1 == x[,2] ] <- TRUE + } + return( result ) + } + ) + +setMethod( + "empty", + signature( "Intervals_full" ), + function(x) { + result <- rep( FALSE, nrow(x) ) + result[ is.na( x[,1] ) | is.na( x[,2] ) ] <- NA + any_open <- !( closed(x)[,1] & closed(x)[,2] ) + both_open <- !closed(x)[,1] & !closed(x)[,2] + result[ any_open & ( x[,1] == x[,2] ) ] <- TRUE + if ( type(x) == "Z" ) + result[ both_open & ( x[,1] + 1 == x[,2] ) ] <- TRUE + return( result ) + } + ) diff --git a/R/expand-methods.R b/R/expand-methods.R new file mode 100644 index 0000000..0b42bf5 --- /dev/null +++ b/R/expand-methods.R @@ -0,0 +1,59 @@ +# Not exported +adjust <- function( x, delta, type, direction = 1 ) { + signs <- rep( c( direction, -direction ), c( nrow(x), nrow(x) ) ) + if ( nrow(x) %% length(delta) != 0 ) + warning( "Length of 'delta' does not evenly divide number of intervals.", call. = FALSE ) + delta <- rep( delta, length = nrow( x ) ) + switch( + type, + relative = x * ( 1 - delta ) ^ ( sign(x) * signs ), + absolute = x - delta * signs + ) +} + +setGeneric( "expand", def = function( x, ... ) standardGeneric( "expand" ) ) + +setMethod( + "expand", + signature( "Intervals_virtual" ), + function( + x, + delta = 0, + type = c( "absolute", "relative" ) + ) + { + if ( any( delta < 0, na.rm = TRUE ) ) + stop( "The 'delta' argument should not contain negative values." ) + type = match.arg( type ) + if ( type(x) == "Z" && ( type == "relative" || any( delta %% 1 != 0, na.rm = TRUE ) ) ) + stop( "Only absolute, integer-valued expansion permitted for type 'Z'." ) + x@.Data <- adjust( x, delta, type, 1 ) + return( x ) + } + ) + +setGeneric( "contract", def = function( x, ... ) standardGeneric( "contract" ) ) + +setMethod( + "contract", + signature( "Intervals_virtual" ), + function( + x, + delta = 0, + type = c( "absolute", "relative" ) + ) + { + if ( any( delta < 0, na.rm = TRUE ) ) + stop( "The 'delta' argument should not contain negative values." ) + type = match.arg( type ) + if ( type(x) == "Z" && ( type == "relative" || any( delta %% 1 != 0, na.rm = TRUE ) ) ) + stop( "Only absolute, integer-valued contraction permitted for type 'Z'." ) + x@.Data <- adjust( x, delta, type, -1 ) + drop <- x[,1] > x[,2] | empty(x) + if ( any( drop ) ) { + warning( "Some empty intervals eliminated.", call. = FALSE ) + x <- x[ !drop, ] + } + return( x ) + } + ) diff --git a/R/head-methods.R b/R/head-methods.R new file mode 100644 index 0000000..2fc1d77 --- /dev/null +++ b/R/head-methods.R @@ -0,0 +1,20 @@ +setGeneric( "head", function( x, ... ) standardGeneric( "head" ) ) + +setMethod( + "head", + signature( "Intervals_virtual" ), + function( x, n = 6 ) { + if ( nrow(x) > 0 ) x[ 1:min( n, nrow(x) ), ] else x + } + ) + +setGeneric( "tail", function( x, ... ) standardGeneric( "tail" ) ) + +setMethod( + "tail", + signature( "Intervals_virtual" ), + function( x, n = 6 ) { + if ( nrow(x) > 0 ) x[ max( 1, nrow(x) - n + 1 ):nrow(x), ] else x + } + ) + diff --git a/R/interval_complement-methods.R b/R/interval_complement-methods.R new file mode 100644 index 0000000..1cfdce2 --- /dev/null +++ b/R/interval_complement-methods.R @@ -0,0 +1,44 @@ +setGeneric( "interval_complement", def = function(x, ...) standardGeneric( "interval_complement" ) ) + +setMethod( + "interval_complement", + signature( "Intervals_virtual" ), + function(x, check_valid = TRUE) { + # Sort and clean up + x <- reduce( x, check_valid ) + # When the data type of the endpoints matrix is integer, + # complications arise from maximum/minimum representable integer + # values. For Intervals objects, the complement will often + # need to include the minimal/maximal represenable integer, but one + # or both endpoints may be open. In such cases, we adjust close with + # a warning. For the moment, we force numeric endpoints throughout + # the package. + endpoints <- + if ( nrow(x) == 0 ) + matrix( c( -Inf, Inf ), 1 ) + else + rbind( + if ( is.finite( x[1,1] ) ) c( -Inf, x[1,1] ) else NULL, + cbind( x[-nrow(x),2], x[-1,1] ), + if ( is.finite( x[nrow(x),2] ) ) c( x[nrow(x),2], Inf ) else NULL + ) + closed <- + if ( class(x) == "Intervals" ) + # Note that we ignore closure for non-finite endpoints. + !closed(x)[2:1] + else + if ( nrow(x) == 0 ) TRUE + else + !rbind( + if ( is.finite( x[1,1] ) ) c( TRUE, closed(x)[1,1] ) else NULL, + cbind( closed(x)[-nrow(x),2], closed(x)[-1,1] ), + if ( is.finite( x[nrow(x),2] ) ) c( closed(x)[nrow(x),2], TRUE ) else NULL + ) + new( + class(x), + endpoints, + type = type(x), + closed = closed + ) + } + ) diff --git a/R/interval_difference-methods.R b/R/interval_difference-methods.R new file mode 100644 index 0000000..d8185c6 --- /dev/null +++ b/R/interval_difference-methods.R @@ -0,0 +1,17 @@ +setGeneric( "interval_difference", def = function(x, y, ...) standardGeneric( "interval_difference" ) ) + +setMethod( + "interval_difference", + signature( "Intervals_virtual", "Intervals_virtual" ), + function(x, y, check_valid = TRUE) { + if ( check_valid ) { + validObject( x ) + validObject( y ) + } + interval_intersection( + x, + interval_complement( y, check_valid = FALSE ), + check_valid = FALSE + ) + } + ) diff --git a/R/interval_included-methods.R b/R/interval_included-methods.R new file mode 100644 index 0000000..f6d16d4 --- /dev/null +++ b/R/interval_included-methods.R @@ -0,0 +1,61 @@ +setGeneric( "interval_included", def = function( from, to, ... ) standardGeneric( "interval_included" ) ) + +setMethod( + "interval_included", + signature( "Intervals", "Intervals" ), + function( from, to, check_valid = TRUE ) { + # For inclusion, both endpoints from a "to" interval should be + # inside an particular "from" interval. Because interval_overlap + # treats numerics as double-closed single-point intervals, some care + # is required with open endpoints for "to": these should be + # considered to overlap closed OR open "from" endpoints with which + # they coincide. We handle this, over R, by adjusting the "from" + # closure. + if ( any( empty(to) ) ) { + warning( "Some empty 'to' intervals encountered. Setting to NA...", call. = FALSE ) + to[ empty(to), ] <- NA + } + if ( type(to) == "Z" ) + to <- close_intervals(to) + else + closed( from ) <- closed( from ) | !closed( to ) + mapply( + intersect, + interval_overlap( from, to[,1], check_valid ), + interval_overlap( from, to[,2], check_valid ) + ) + } + ) + +setMethod( + "interval_included", + signature( "Intervals_full", "Intervals_full" ), + function( from, to, check_valid = TRUE ) { + # For the same reasons given above, open endpoints in "to" need to + # be re-checked against open endpoints in "from", since equality of + # the endpoint value is consistent with inclusion. + if ( any( empty(to) ) ) { + warning( "Some empty 'to' intervals encountered. Setting to NA...", call. = FALSE ) + to[ empty(to), ] <- NA + } + if ( type(to) == "Z" ) + to <- close_intervals(to) + # Left side + left <- interval_overlap( from, to[,1], check_valid ) + to[ closed(to)[,1], 1 ] <- NA + left_open <- interval_overlap( from[,1], to[,1], check_valid ) + # Right side + right <- interval_overlap( from, to[,2], check_valid ) + to[ closed(to)[,2], 2 ] <- NA + right_open <- interval_overlap( from[,2], to[,2], check_valid ) + mapply( + function(l,lo,r,ro) intersect( c(l,lo), c(r,ro) ), + left, left_open, right, right_open + ) + } + ) + +# Note: there isn't much sense in asking what's included in a set of +# points. Further, overlap and inclusion are the same when "from" contains +# actual intervals while "to" contains point. For these reasons, there are no +# methods for class "numeric." diff --git a/R/interval_intersection-methods.R b/R/interval_intersection-methods.R new file mode 100644 index 0000000..79f29a8 --- /dev/null +++ b/R/interval_intersection-methods.R @@ -0,0 +1,37 @@ +setGeneric( "interval_intersection", def = function( x, ... ) standardGeneric( "interval_intersection" ) ) + +setMethod( + "interval_intersection", + signature( "Intervals_virtual" ), + function( x, ..., check_valid = TRUE ) { + args <- c( list(x), list(...) ) + if ( check_valid ) for ( y in args ) validObject( y ) + complements <- lapply( args, interval_complement, check_valid = FALSE ) + interval_complement( + do.call( + interval_union, + c( complements, list( check_valid = FALSE ) ) + ) + ) + } + ) + +setMethod( + "interval_intersection", + signature( "missing" ), + function( x, ..., check_valid = TRUE ) { + # Permitting do.call use with named lists, since do.call will put + # elements whose names are not "x" into the ... argument. Stripping + # names, however, puts arguments in place positionally. + args <- list(...) + names( args ) <- NULL + if ( length( args ) == 0 ) return ( NULL ) + else + return( + do.call( + interval_intersection, + c( args, list( check_valid = check_valid ) ) + ) + ) + } + ) diff --git a/R/interval_overlap-methods.R b/R/interval_overlap-methods.R new file mode 100644 index 0000000..5a5e3fe --- /dev/null +++ b/R/interval_overlap-methods.R @@ -0,0 +1,30 @@ +setGeneric( "interval_overlap", def = function( from, to, ... ) standardGeneric( "interval_overlap" ) ) + +setMethod( + "interval_overlap", + signature( "Intervals_virtual_or_numeric", "Intervals_virtual_or_numeric" ), + function( from, to, check_valid = TRUE ) { + result <- which_nearest( from, to, check_valid )$which_overlap + names( result ) <- rownames( from ) + return( result ) + } + ) + +argument_error <- paste( + "The 'from' and 'to' arguments are required. Note that the", + " interval_overlap argument names changed at v. 0.11.1.", + " See documentation.", + sep = "\n" + ) + +setMethod( + "interval_overlap", + signature( from = "missing", to = "ANY" ), + function( from, to, check_valid, ... ) stop( argument_error ) + ) + +setMethod( + "interval_overlap", + signature( from = "ANY", to = "missing" ), + function( from, to, check_valid, ... ) stop( argument_error ) + ) diff --git a/R/interval_union-methods.R b/R/interval_union-methods.R new file mode 100644 index 0000000..16a9768 --- /dev/null +++ b/R/interval_union-methods.R @@ -0,0 +1,30 @@ +setGeneric( "interval_union", def = function( x, ... ) standardGeneric( "interval_union" ) ) + +setMethod( + "interval_union", + signature( "Intervals_virtual" ), + function( x, ..., check_valid = TRUE ) { + reduce( c( x, ... ), check_valid ) + } + ) + +setMethod( + "interval_union", + signature( "missing" ), + function( x, ..., check_valid = TRUE ) { + # Permitting do.call use with named lists, since do.call will put + # elements whose names are not "x" into the ... argument. Stripping + # names, however, puts arguments in place positionally. + args <- list(...) + names( args ) <- NULL + if ( length( args ) == 0 ) return ( NULL ) + else + return( + do.call( + interval_union, + c( args, list( check_valid = check_valid ) ) + ) + ) + } + ) + diff --git a/R/is.na-methods.R b/R/is.na-methods.R new file mode 100644 index 0000000..685bd6e --- /dev/null +++ b/R/is.na-methods.R @@ -0,0 +1,5 @@ +setMethod( + "is.na", + signature( x = "Intervals_virtual" ), + function(x) is.na( x[,1] ) | is.na( x[,2] ) + ) diff --git a/R/plot.Intervals_virtual.R b/R/plot.Intervals_virtual.R new file mode 100644 index 0000000..d02c9a3 --- /dev/null +++ b/R/plot.Intervals_virtual.R @@ -0,0 +1,78 @@ +plot.Intervals_full <- function( + x, y = NULL, + axes = TRUE, + xlab = "", ylab = "", + xlim = NULL, ylim = NULL, + col = "black", lwd = 1, + cex = 1, + use_points = TRUE, + use_names = TRUE, + names_cex = 1, + ... + ) +{ + # Subset first, for efficiency, and so that the maximal y plotted is + # appropriate for the region shown. + + if ( any( is.na( x ) ) ) x <- x[ !is.na(x), ] + + if ( is.null(xlim) ) + xlim <- range( x@.Data ) + else + x <- x[ x[,2] >= xlim[1] & x[,1] <= xlim[2], ] + + if ( is.null(y) ) + y <- .Call( "_plot_overlap", x@.Data, closed(x), is( x, "Intervals_full" ) ) + + if ( is.null(ylim) ) + ylim <- c( 0, max( y ) ) + + plot( + 0, 0, + type = "n", + xlim = xlim, ylim = ylim, + axes = FALSE, + xlab = xlab, ylab = ylab, + ... + ) + # Careful with non-finite endpoints, which segments() ignores. + segments( + pmax( x[,1], par("usr")[1] ), y, + pmin( x[,2], par("usr")[2] ), y, + col = col, + lwd = lwd + ) + if ( use_points ) { + # Careful with points... + adjust <- ( x[,1] == x[,2] ) & !closed(x)[,1] + closed(x)[ adjust, 2 ] <- FALSE + points( + x@.Data, rep( y, 2 ), + pch = 21, cex = cex, + col = col, bg = ifelse( closed(x), col, "white" ) + ) + } + if ( use_names && !is.null( rownames(x) ) ) { + mids <- ( x[,1] + x[,2] ) / 2 + text( + mids, y, + rownames( x ), + pos = 3, offset = .5, + cex = names_cex, + xpd = NA + ) + } + if ( axes ) + axis( 1 ) +} + +plot.Intervals <- function( x, y = NULL, ... ) { + plot( as( x, "Intervals_full" ), y, ... ) +} + +setMethod( "plot", c( "Intervals", "missing" ), function( x, y, ... ) plot.Intervals( x, ... ) ) +setMethod( "plot", c( "Intervals", "ANY" ), function( x, y, ... ) plot.Intervals( x, y, ... ) ) + +setMethod( "plot", c( "Intervals_full", "missing" ), function( x, y, ... ) plot.Intervals_full( x, ... ) ) +setMethod( "plot", c( "Intervals_full", "ANY" ), function( x, y, ... ) plot.Intervals_full( x, y, ... ) ) + diff --git a/R/reduce-methods.R b/R/reduce-methods.R new file mode 100644 index 0000000..68efee4 --- /dev/null +++ b/R/reduce-methods.R @@ -0,0 +1,25 @@ +setGeneric( "reduce", def = function( x, ... ) standardGeneric( "reduce" ) ) + +setMethod( + "reduce", + signature( "Intervals_virtual" ), + function( x, check_valid = TRUE ) { + if ( check_valid ) validObject( x ) + has_na <- is.na( x[,1] ) | is.na( x[,2] ) + if ( any( has_na ) ) { + warning( "Intervals with NA endpoints removed.", call. = FALSE ) + x <- x[ !has_na, ] + } + if ( any( empty( x ) ) ) + x <- x[ !empty(x), ] + # In order to collapse over abutting intervals over Z + if ( type(x) == "Z" ) x <- open_intervals( x ) + result <- .Call( + "_reduce", + x@.Data, + closed( x ), + is( x, "Intervals_full" ) + ) + new( class(x), result[[1]], closed = result[[2]], type = type(x) ) + } + ) diff --git a/R/show-methods.R b/R/show-methods.R new file mode 100644 index 0000000..7ef5253 --- /dev/null +++ b/R/show-methods.R @@ -0,0 +1,24 @@ +setMethod( + "show", + signature( "Intervals_virtual" ), + function( object ) { + cat( + "Object of class ", + class( object ), + "\n", + nrow( object ), + " interval", + ifelse( nrow( object ) == 1, "", "s" ), + " over ", + type(object), + ":\n", + sep = "" + ) + ints <- as( object, "character") + if ( !is.null( rownames( object ) ) ) { + fmt <- sprintf( "%%%is", max( nchar( rownames( object ) ) ) ) + ints <- paste( sprintf( fmt, rownames( object ) ), ints ) + } + cat( ints, sep = "\n" ) + } + ) diff --git a/R/size-methods.R b/R/size-methods.R new file mode 100644 index 0000000..8f023ca --- /dev/null +++ b/R/size-methods.R @@ -0,0 +1,37 @@ +# For R, size is Lebesgue measure, so closure is irrelevant. Note that we don't +# use the close_intervals method here, which is important since this method +# requires empty, and empty currently uses size. We need to avoid circular +# dependency. + +setGeneric( "size", def = function( x, ... ) standardGeneric( "size" ) ) + +setMethod( + "size", + signature( "Intervals" ), + function( x, as = type(x) ) { + result <- x[,2] - x[,1] + if ( as == "Z" ) { + ties <- x[,2] == x[,1] + ties[ is.na( ties ) ] <- FALSE + result[ ties ] <- ifelse( all( closed(x) ), 1, 0 ) + result[ !ties ] <- result[ !ties ] + sum( closed(x) ) - 1 # NAs just stay NA + } + return( result ) + } + ) + +setMethod( + "size", + signature( "Intervals_full" ), + function( x, as = type(x) ) { + result <- x[,2] - x[,1] + if ( as == "Z" ) { + ties <- x[,2] == x[,1] + ties[ is.na( ties ) ] <- FALSE + rs <- rowSums( closed(x) ) + result[ ties ] <- ifelse( rs[ ties ] == 2, 1, 0 ) + result[ !ties ] <- result[ !ties ] + rs[ !ties ] - 1 + } + return( result ) + } + ) diff --git a/R/split.R b/R/split.R new file mode 100644 index 0000000..f568d10 --- /dev/null +++ b/R/split.R @@ -0,0 +1,6 @@ +# Cause split.data.frame to be used. See "Methods for S3 Generic Functions" in +# help for "Methods", for guidance and examples. + +split.Intervals_virtual <- split.data.frame + +setMethod( "split", "Intervals_virtual", split.Intervals_virtual ) diff --git a/R/t-methods.R b/R/t-methods.R new file mode 100644 index 0000000..bcfe483 --- /dev/null +++ b/R/t-methods.R @@ -0,0 +1,7 @@ +setGeneric( "t", function(x) standardGeneric( "t" ) ) + +setMethod( + "t", + signature( "Intervals_virtual" ), + function(x) t( x@.Data ) + ) diff --git a/R/type-methods.R b/R/type-methods.R new file mode 100644 index 0000000..653115d --- /dev/null +++ b/R/type-methods.R @@ -0,0 +1,21 @@ +setGeneric( "type", function(x) standardGeneric( "type" ) ) + +setMethod( + "type", + signature( "Intervals_virtual" ), + function( x ) x@type + ) + +setGeneric( "type<-", function( x, value ) standardGeneric( "type<-" ) ) + +setReplaceMethod( + "type", "Intervals_virtual", + function( x, value ) { + if ( length( value ) != 1 || !( value %in% c( "Z", "R" ) ) ) + stop( "The 'type' slot should be 'Z' or 'R'." ) + if ( value == "Z" && !all( x@.Data %% 1 == 0, na.rm = TRUE ) ) + stop( "Non-integer-valued endpoints not permitted for type 'Z'." ) + x@type <- value + return(x) + } + ) diff --git a/R/which_nearest-methods.R b/R/which_nearest-methods.R new file mode 100644 index 0000000..b58a8f9 --- /dev/null +++ b/R/which_nearest-methods.R @@ -0,0 +1,83 @@ +setGeneric( "which_nearest", def = function( from, to, ... ) standardGeneric( "which_nearest" ) ) + +setMethod( + "which_nearest", + signature( "Intervals_virtual", "Intervals_virtual" ), + function( from, to, check_valid = TRUE ) { + if ( check_valid && !( validObject(to) && validObject(from) ) ) + stop( "The 'to' and/or 'from' objects are invalid." ) + if ( type(to) != type(from) ) + stop( "Both 'to' and 'from' should have the same type." ) + if ( any( empty( to ), na.rm = TRUE ) ) { + warning( "Some empty 'to' intervals encountered. Setting to NA...", call. = FALSE ) + to[ empty(to), ] <- NA + } + if ( any( empty( from ), na.rm = TRUE ) ) { + warning( "Some empty 'from' intervals encountered. Setting to NA...", call. = FALSE ) + from[ empty(from), ] <- NA + } + if( type(to) == "Z" ) { + to <- close_intervals( to ) + from <- close_intervals( from ) + } + result <- .Call( + "_which_nearest", + to@.Data, from@.Data, + closed(to), closed(from), + class(to) == "Intervals_full", class(from) == "Intervals_full" + ) + result[[1]][ !is.finite( result[[1]] ) ] <- as.numeric( NA ) + data.frame( + distance_to_nearest = result[[1]], + which_nearest = I( result[[2]] ), + which_overlap = I( result[[3]] ), + row.names = rownames( from ) + ) + } + ) + +setMethod( + "which_nearest", + signature( "numeric", "Intervals_virtual" ), + function( from, to, check_valid = TRUE ) { + if ( type( to ) == "Z" ) { + non_int <- ( from %% 1 != 0 ) + if ( any( non_int, na.rm = TRUE ) ) + stop( "The 'to' object is of type 'Z'. Non-integral values are not permitted in 'from'.", call. = FALSE ) + } + which_nearest( + new( class( to ), cbind( from, from ), closed = TRUE, type = type( to ) ), + to, + check_valid = check_valid + ) + } + ) + +setMethod( + "which_nearest", + signature( "Intervals_virtual", "numeric" ), + function( from, to, check_valid = TRUE ) { + if ( type( from ) == "Z" ) { + non_int <- ( to %% 1 != 0 ) + if ( any( non_int, na.rm = TRUE ) ) + stop( "The 'from' object is of type 'Z'. Non-integral values are not permitted in 'to'.", call. = FALSE ) + } + which_nearest( + from, + new( class( from ), cbind( to, to ), closed = TRUE, type = type( from ) ), + check_valid = check_valid + ) + } + ) + +setMethod( + "which_nearest", + signature( "numeric", "numeric" ), + function( from, to, check_valid = TRUE ) { + which_nearest( + Intervals( cbind( from, from ) ), + Intervals( cbind( to, to ) ), + check_valid = FALSE + ) + } + ) diff --git a/data/sgd.rdata b/data/sgd.rdata new file mode 100755 index 0000000..07bb309 Binary files /dev/null and b/data/sgd.rdata differ diff --git a/man/Intervals-class.Rd b/man/Intervals-class.Rd new file mode 100644 index 0000000..3185a2d --- /dev/null +++ b/man/Intervals-class.Rd @@ -0,0 +1,218 @@ +\name{Intervals-class} + +\docType{class} + +\alias{Intervals} +\alias{Intervals_full} + +\alias{Intervals-class} +\alias{Intervals_full-class} + +\alias{[,Intervals-method} +\alias{[,Intervals_full-method} +\alias{[<-,Intervals,ANY,missing,Intervals_virtual-method} +\alias{[<-,Intervals_full,ANY,missing,Intervals_virtual-method} +\alias{closed<-} +\alias{closed<-,Intervals-method} +\alias{closed<-,Intervals_full-method} +\alias{coerce,Intervals,Intervals_full-method} +\alias{coerce,Intervals_full,Intervals-method} +\alias{initialize,Intervals-method} +\alias{initialize,Intervals_full-method} + + + + +\title{Classes "Intervals" and "Intervals_full"} + +\description{ + \code{"Intervals"} objects are two-column matrices which represent + sets, possibly non-disjoint and in no particular order, of intervals + on either the integers or the real line. All intervals in each object + have the same endpoint closure pattern. \code{"Intervals_full"} + objects are similar, but permit interval-by-interval endpoint closure + specification. +} + +\section{Objects from the Class}{ + Objects can be created by calls of the form \code{new("Intervals", + \dots)}, or better, by using the constructor functions + \code{\link{Intervals}(\dots)} and + \code{\link{Intervals_full}(\dots)}. +} + +\section{Slots}{ + + \describe{ + + \item{\code{.Data}:}{See \code{"\linkS4class{Intervals_virtual}"}.} + + \item{\code{closed}:}{ + For \code{"Intervals"} objects, a two-element logical vector. For + \code{"Intervals_full"} objects, a two-column logical matrix with + the same dimensions as \code{.Data}. If omitted in a \code{new} + call, the \code{closed} slot will be initialized to an object of + appropriate type and size, with all entries \code{TRUE}. If + \code{closed} is a vector of length 1, or a vector of length 2 for + the \code{"Intervals_full"} class, an appropriate object will be + made by reusing the supplied values row-wise. See the example + below. + } + + \item{\code{type}:}{See \code{"\linkS4class{Intervals_virtual}"}.} + + } + +} + +\section{Extends}{ + + Class \code{"\linkS4class{Intervals_virtual}"}, directly. + + Class \code{"\linkS4class{matrix}"}, by class + \code{"Intervals_virtual"}, distance 2. + + Class \code{"\linkS4class{array}"}, by class + \code{"Intervals_virtual"}, distance 3. + + Class \code{"\linkS4class{structure}"}, by class + \code{"Intervals_virtual"}, distance 4. + + Class \code{"\linkS4class{vector}"}, by class + \code{"Intervals_virtual"}, distance 5, with explicit coerce. + +} + +\seealso{ + See \code{"\linkS4class{Intervals_virtual}"}. +} + +\section{S3 methods}{ + + As of R 2.8.1, it still does not seem possible to write S4 methods for + \code{rbind} or \code{c}. To concatenate sets of intervals into a + single sets, the S3 methods \code{\link{c.Intervals}} and + \code{\link{c.Intervals_full}} are provided. While \code{rbind} might + seem more natural, its S3 dispatch is non-standard and it could not be + used. Both methods are documented separately. + +} + +\section{S4 methods}{ + + \describe{ + + \item{[}{\code{ signature(x = "Intervals")} } + \item{[}{\code{ signature(x = "Intervals_full")} } + \item{[<-}{\code{ signature(x = "Intervals", i = "ANY", j = "missing", value = "Intervals_virtual")} } + \item{[<-}{\code{ signature(x = "Intervals_full", i = "ANY", j = "missing", value = "Intervals_virtual")} } + \item{adjust\_closure}{\code{ signature(x = "Intervals")} } + \item{adjust\_closure}{\code{ signature(x = "Intervals_full")} } + \item{closed<-}{\code{ signature(x = "Intervals")} } + \item{closed<-}{\code{ signature(x = "Intervals_full")} } + \item{coerce}{\code{ signature(from = "Intervals", to = "Intervals_full")} } + \item{coerce}{\code{ signature(from = "Intervals_full", to = "Intervals")} } + \item{empty}{\code{ signature(x = "Intervals")} } + \item{empty}{\code{ signature(x = "Intervals_full")} } + \item{initialize}{\code{ signature(.Object = "Intervals")} } + \item{initialize}{\code{ signature(.Object = "Intervals_full")} } + \item{size}{\code{ signature(x = "Intervals")} } + \item{size}{\code{ signature(x = "Intervals_full")} } + + % \item{\code{[}:}{ + % When used to subset rows, class is preserved; when used to subset + % columns, the \code{closed} and \code{type} slots are discarded and + % an appropriately subset version of \code{.Data} is returned. See + % example below. + % } + % + % \item{\code{closed<-}:}{ + % Replacement accessor for \code{closed} slot. See the example + % below. See description of the \code{closed} slot above for details + % on how one- or two-element logical vectors are interpreted. + % } + % + % \item{\code{coerce}:}{ + % Coercion methods are provided for converting back and forth + % between \code{"Intervals"} and \code{"Intervals_full"} + % objects. See example below. An error will be generated when + % attemption to down-class a \code{"Intervals_full"} object which + % does not have the same closure settings for every interval. + % + % A coercion method is also provided for pretty character strings. + % } + + } + +} + +\note{ + We do not currently permit an integer data type for the endpoints + matrix, even when \code{type == "Z"}, because this creates + complications when taking complements -- which is most easily handled + through the use of \code{-Inf} and \code{Inf}. This is particularly + awkward for objects of class \code{"Intervals"}, since current endpoint + closure settings may not permit inclusion of the minimal/maximal + integer. This issue may be addressed, however, in future updates. (We + do, however, check that endpoints are congruent to 0 mod 1 when + \code{type == "Z"}.) + + When creating object, non-matrix endpoint sources will be converted to + a two-column matrix, for convenience. Recycling is supported for the + \code{closed} slot when creating new objects. +} + +\section{Warning}{ + Validity checking takes place when, for example, using the + \code{type<-} replacement accessor: if one attempts to set type to + \code{"Z"} but the endpoint matrix contains non-integer values, an + error is generated. Because accessors are not used for the endpoint + matrix itself, though, it is possible to create invalid \code{"Z"} + objects by setting endpoints to inappropriate values. +} + +\examples{ + +# The "Intervals" class + +i <- Intervals( + matrix( + c(1,2, + 3,5, + 4,6, + 8,9 + ), + byrow = TRUE, + ncol = 2 + ), + closed = c( TRUE, TRUE ), + type = "Z" + ) + +# Row subsetting preserves class. Column subsetting causes coercion to +# "matrix" class. + +i +i[1:2,] +i[,1:2] + +# Full endpoint control + +j <- as( i, "Intervals_full" ) +closed(j)[ 3:4, 2 ] <- FALSE +closed(j)[ 4, 1 ] <- FALSE +j + +# Rownames may be used + +rownames(j) <- c( "apple", "banana", "cherry", "date" ) +j + +# Assignment preserves class, coercing if necessary + +j[2:3] <- i[1:2,] +j + +} + +\keyword{classes} diff --git a/man/Intervals_virtual-class.Rd b/man/Intervals_virtual-class.Rd new file mode 100644 index 0000000..3b7913c --- /dev/null +++ b/man/Intervals_virtual-class.Rd @@ -0,0 +1,99 @@ +\name{Intervals_virtual-class} + +\docType{class} + +\alias{Intervals_virtual-class} + +\alias{closed} +\alias{closed,Intervals_virtual-method} +\alias{closed,Intervals_virtual-method} +\alias{coerce,Intervals_virtual,character-method} +\alias{head,Intervals_virtual-method} +\alias{initialize,Intervals_virtual-method} +\alias{is.na,Intervals_virtual-method} +\alias{show,Intervals_virtual-method} +\alias{t,Intervals_virtual-method} +\alias{tail,Intervals_virtual-method} +\alias{type} +\alias{type,Intervals_virtual-method} +\alias{type<-} +\alias{type<-,Intervals_virtual-method} + +\title{Class "Intervals_virtual"} + +\description{ + A virtual class from which the \code{"Intervals"} and + \code{"Intervals_full"} classes derive. +} + +\section{Slots}{ + + \describe{ + + \item{\code{.Data}:}{ + Object of class \code{"matrix"}. A two-column, numeric (see below) + format is required. For a valid object, no value in the first + column may exceed its partner in the second column. (Note that + this \emph{does} permit empty interval rows, when both endpoints + are of equal value and not both closed.) Only integral (though not + \code{"integer"} class) endpoints are permitted if \code{type} is + \code{"Z"}. See the note on this point in documentation for + \code{"\linkS4class{Intervals}"}. + } + + \item{\code{type}:}{ + Object of class \code{"character"}. A one-element character vector + with either \code{"Z"} or \code{"R"} is required. + } + + } + +} + +\section{Extends}{ + + Class \code{"\linkS4class{matrix}"}, from data part. + + Class \code{"\linkS4class{array}"}, by class "matrix", distance 2. + + Class \code{"\linkS4class{structure}"}, by class "matrix", distance 3. + + Class \code{"\linkS4class{vector}"}, by class "matrix", distance 4, + with explicit coerce. + +} + +\section{Methods}{ + \describe{ + \item{close\_intervals}{ \code{signature(x = "Intervals_virtual")} } + \item{closed}{ \code{signature(x = "Intervals_virtual")} } + \item{clusters}{ \code{signature(x = "Intervals_virtual")} } + \item{coerce}{ \code{signature(from = "Intervals_virtual", to = "character")} } + \item{contract}{ \code{signature(x = "Intervals_virtual")} } + \item{expand}{ \code{signature(x = "Intervals_virtual")} } + \item{head}{ \code{signature(x = "Intervals_virtual")} } + \item{initialize}{ \code{signature(.Object = "Intervals_virtual")} } + \item{interval\_complement}{ \code{signature(x = "Intervals_virtual")} } + \item{interval\_difference}{ \code{signature(x = "Intervals_virtual", y = "Intervals_virtual")} } + \item{interval\_intersection}{ \code{signature(x = "Intervals_virtual")} } + \item{interval\_union}{ \code{signature(x = "Intervals_virtual")} } + \item{is.na}{ \code{signature(x = "Intervals_virtual")} } + \item{open\_intervals}{ \code{signature(x = "Intervals_virtual")} } + \item{reduce}{ \code{signature(x = "Intervals_virtual")} } + \item{show}{ \code{signature(object = "Intervals_virtual")} } + \item{t}{ \code{signature(x = "Intervals_virtual")} } + \item{tail}{ \code{signature(x = "Intervals_virtual")} } + \item{type}{ \code{signature(x = "Intervals_virtual")} } + \item{type<-}{ \code{signature(x = "Intervals_virtual")} } + \item{which\_nearest}{ \code{signature(from = "numeric", to = "Intervals_virtual")} } + \item{which\_nearest}{ \code{signature(from = "Intervals_virtual", to = "numeric")} } + \item{which\_nearest}{ \code{signature(from = "Intervals_virtual", to = "Intervals_virtual")} } + } +} + +\seealso{ + See the \code{"\linkS4class{Intervals}"} and + \code{"\linkS4class{Intervals_full}"} classes. +} + +\keyword{classes} diff --git a/man/Intervals_virtual_or_numeric-class.Rd b/man/Intervals_virtual_or_numeric-class.Rd new file mode 100644 index 0000000..a5bfe94 --- /dev/null +++ b/man/Intervals_virtual_or_numeric-class.Rd @@ -0,0 +1,23 @@ +\name{Intervals_virtual_or_numeric-class} + +\docType{class} + +\alias{Intervals_virtual_or_numeric-class} + +\title{Class "Intervals_virtual_or_numeric"} + +\description{ + A class union combining \code{"\linkS4class{Intervals_virtual}"} and + \code{"\linkS4class{numeric}"}. Used by, e.g., + \code{\link{distance_to_nearest}} and \code{\link{which_nearest}}. +} + +\section{Methods}{ + \describe{ + \item{distance_to_nearest}{\code{signature(from = "Intervals_virtual_or_numeric", to = "Intervals_virtual_or_numeric")}} + \item{interval_overlap}{\code{signature(from = "Intervals_virtual_or_numeric", to = "Intervals_virtual_or_numeric")}} + } +} + + +\keyword{classes} \ No newline at end of file diff --git a/man/as.matrix.Intervals.Rd b/man/as.matrix.Intervals.Rd new file mode 100644 index 0000000..57a77a3 --- /dev/null +++ b/man/as.matrix.Intervals.Rd @@ -0,0 +1,29 @@ +\name{as.matrix} + +\alias{as.matrix} +\alias{as.matrix.Intervals_virtual} +\alias{as.matrix,Intervals_virtual-method} + +\title{Extract matrix of endpoints} + +\description{ S3 and S4 methods for extracting the matrix of endpoints + from S4 objects. } + +\usage{ +\S3method{as.matrix}{Intervals_virtual}(x, ...) + +\S4method{as.matrix}{Intervals_virtual}(x, ...) +} + +\arguments{ + + \item{x}{\code{"Intervals"} or \code{"Intervals_full"} objects.} + + \item{...}{Unused, but required by the S3 generic.} + +} + +\value{ + A two-column matrix, equivalent to \code{x@.Data} or \code{as(x, + "matrix")}. +} diff --git a/man/c.Intervals.Rd b/man/c.Intervals.Rd new file mode 100644 index 0000000..fc4f0c1 --- /dev/null +++ b/man/c.Intervals.Rd @@ -0,0 +1,74 @@ +\name{c} + +\alias{c} +\alias{c.Intervals} +\alias{c.Intervals_full} + +\title{Combine different interval matrix objects} + +\description{ + S3 methods for concatenating sets of intervals into a single set. +} + +\usage{ +\S3method{c}{Intervals}(...) +\S3method{c}{Intervals_full}(...) +} + +\arguments{ + + \item{...}{\code{"Intervals"} or \code{"Intervals_full"} objects.} + +} + +\details{ + + All objects are expected to have the same value in the \code{type} + slot. If the \code{closed} slots differ for + \code{"\linkS4class{Intervals}"} objects and \code{type == "Z"}, the + objects will be adjusted to have \code{closed} values matching that of + \code{x}; if \code{type == "R"}, however, then all objects must first + be coerced to class \code{"\linkS4class{Intervals_full}"}, with a + warning. This coercion also occurs when a mixture of object types is + passed in. A \code{NULL} in any argument is ignored. + +} + +\value{ + A single \code{"\linkS4class{Intervals}"} or + \code{"\linkS4class{Intervals_full}"} object. Input objects are + concatenated in their order of appearance in the the argument list. + + If any input argument is not a set of intervals, \code{list(...)} is + returned instead. +} + +\note{ + These methods will be converted to S4 once the necessary dispatch on + \code{...} is supported. +} + +\examples{ +f1 <- Intervals( 1:2, type = "Z" ) +g1 <- open_intervals( f1 + 5 ) + +# Combining Intervals objects over Z may require closure adjustment +c( f1, g1 ) + +f2 <- f1; g2 <- g1 +type( f2 ) <- type( g2 ) <- "R" + +# Combine Intervals objects over R which have different closure requires +# coercion + +h <- c( f2, g2 ) + +# Coercion for mixed combinations as well +c( h, g2 + 10 ) + +\dontrun{ +# Combining different types is not permitted +c( h, g1 + 10 ) +} + +} \ No newline at end of file diff --git a/man/close_intervals-methods.Rd b/man/close_intervals-methods.Rd new file mode 100644 index 0000000..e7c2323 --- /dev/null +++ b/man/close_intervals-methods.Rd @@ -0,0 +1,87 @@ +\name{close_intervals} + +\alias{close_intervals} +\alias{close_intervals,Intervals_virtual-method} + +\alias{open_intervals} +\alias{open_intervals,Intervals_virtual-method} + +\alias{adjust_closure} +\alias{adjust_closure,Intervals-method} +\alias{adjust_closure,Intervals_full-method} + + +\title{Re-represent integer intervals with open or closed endpoints} + +\description{ + Given an integer interval matrix, adjust endpoints so that all + intervals have the requested closure status. +} + +\usage{ +\S4method{close_intervals}{Intervals_virtual}(x) + +\S4method{open_intervals}{Intervals_virtual}(x) + +\S4method{adjust_closure}{Intervals}(x, close_left = TRUE, close_right = TRUE) + +\S4method{adjust_closure}{Intervals_full}(x, close_left = TRUE, close_right = TRUE) +} + +\arguments{ + + \item{x}{ + An object of appropriate class, and for which \code{x@type == + "Z"}. If \code{x@type == "R"}, an error is generated. + } + + \item{close_left}{ + Should the left endpoints be closed or open? + } + + \item{close_right}{ + Should the right endpoints be closed or open? + } + +} + +\value{ + An object of the same class as \code{x}, with endpoints adjusted as + necessary and all \code{closed(x)} set to either \code{TRUE} or + \code{FALSE}, as appropriate. +} + +\note{ + The \code{close_intervals} and \code{open_intervals} are for + convenience, and just call \code{adjust_closure} with the approriate + arguments. + + The \code{x} object may contain empty intervals, with at least one + open endpoint, and still be valid. (Intervals are invalid if their + second endpoint is less than their first.) The \code{close_intervals} + method would, in such cases, create an invalid result; to prevent + this, empty intervals are detected and removed, with a warning. + + This package does not make a distinction between closed and open + infinite endpoints: an interval with an infinite endpoint extends to + (plus or minus) infinity regardless of the closure state. For example, + \code{\link{distance_to_nearest}} will return a \code{0} when + \code{Inf} is compared to both \code{"[0, Inf)"} and \code{"[0, Inf]"}. +} + +\examples{ +x <- Intervals( + c( 1, 5, 10, 1, 6, 20 ), + closed = c( TRUE, FALSE ), + type = "Z" + ) + +# Empties are dropped +close_intervals(x) +adjust_closure(x, FALSE, TRUE) + +# Intervals_full +y <- as( x, "Intervals_full" ) +closed(y)[1,2] <- TRUE +open_intervals(y) +} \ No newline at end of file diff --git a/man/clusters-methods.Rd b/man/clusters-methods.Rd new file mode 100644 index 0000000..41ac0a4 --- /dev/null +++ b/man/clusters-methods.Rd @@ -0,0 +1,109 @@ +\name{clusters} + +\alias{clusters} +\alias{clusters,numeric-method} +\alias{clusters,Intervals_virtual-method} + +\title{Identify clusters in a collection of positions or intervals} + +\description{ + This function uses tools in the \pkg{intervals} package to quickly + identify clusters -- contiguous collections of positions or intervals + which are separated by no more than a given distance from their + neighbors to either side. +} + +\usage{ +\S4method{clusters}{numeric}(x, w, which = FALSE, check_valid = TRUE) + +\S4method{clusters}{Intervals_virtual}(x, w, which = FALSE, check_valid = TRUE) +} + +\arguments{ + + \item{x}{An appropriate object.} + + \item{w}{ + Maximum permitted distance between a cluster member and its + neighbors to either side. + } + + \item{which}{ + Should indices into the \code{x} object be returned instead of + actual subsets? + } + + \item{check_valid}{ + Should \code{\link{validObject}} be called before passing to + compiled code? Also see \code{\link{interval_overlap}} and + \code{\link{reduce}}. + } + +} + +\details{ + A cluster is defined to be a maximal collection, with at least two + members, of components of \code{x} which are separated by no more than + \code{w}. Note that when \code{x} represents intervals, an interval + must actually \emph{contain a point} at distance \code{w} or less from + a neighboring interval to be assigned to the same cluster. If the ends + of both intervals in question are open and exactly at distance + \code{w}, they will not be deemed to be cluster co-members. See the + example below. +} + +\note{ + Implementation is by a call to \code{\link{reduce}} followed by a call + to \code{\link{interval_overlap}}. The \code{clusters} methods are + included to illustrate the utility of the core functions in the + \pkg{intervals} package, although they are also useful in their own + right. +} + +\value{ + A list whose components are the clusters. Each component is thus a + subset of \code{x}, or, if \code{which == TRUE}, a vector of + indices into the \code{x} object. (The indices correspond to row + numbers when \code{x} is of class \code{"Intervals_virtual"}.) +} + +\examples{ +# Numeric method +w <- 20 +x <- sample( 1000, 100 ) +c1 <- clusters( x, w ) + +# Check results +sapply( c1, function( x ) all( diff(x) <= w ) ) +d1 <- diff( sort(x) ) +all.equal( + as.numeric( d1[ d1 <= w ] ), + unlist( sapply( c1, diff ) ) + ) + +# Intervals method, starting with a reduced object so we know that all +# intervals are disjoint and sorted. +B <- 100 +left <- runif( B, 0, 1e4 ) +right <- left + rexp( B, rate = 1/10 ) +y <- reduce( Intervals( cbind( left, right ) ) ) + +gaps <- function(x) x[-1,1] - x[-nrow(x),2] +hist( gaps(y), breaks = 30 ) + +w <- 200 +c2 <- clusters( y, w ) +head( c2 ) +sapply( c2, function(x) all( gaps(x) <= w ) ) + +# Clusters and open end points. See "Details". +z <- Intervals( + matrix( 1:4, 2, 2, byrow = TRUE ), + closed = c( TRUE, FALSE ) + ) +z +clusters( z, 1 ) +closed(z)[1] <- FALSE +z +clusters( z, 1 ) +} \ No newline at end of file diff --git a/man/distance_to_nearest-methods.Rd b/man/distance_to_nearest-methods.Rd new file mode 100644 index 0000000..8eddaaa --- /dev/null +++ b/man/distance_to_nearest-methods.Rd @@ -0,0 +1,58 @@ +\name{distance_to_nearest} + +\alias{distance_to_nearest} +\alias{distance_to_nearest,Intervals_virtual_or_numeric,Intervals_virtual_or_numeric-method} + +\title{Compute distance to nearest position in a set of intervals} + +\description{ + For each point or interval in the \code{from} argument, compute the + distance to the nearest position in the \code{to} argument. +} + +\usage{ +\S4method{distance_to_nearest}{Intervals_virtual_or_numeric,Intervals_virtual_or_numeric}(from, to, check_valid = TRUE) +} + +\arguments{ + + \item{from}{An object of appropriate type.} + + \item{to}{An object of appropriate type.} + + \item{check_valid}{ + Should \code{\link{validObject}} be called before passing to + compiled code? Also see \code{\link{interval_overlap}}. + } + +} + +\value{ + A vector of distances, with one entry per point or interval in + \code{from}. Any intervals in \code{from} which are either empty (see + \code{\link{empty}}) or have \code{NA} endpoints produce a \code{NA} + result. +} + +\note{ + This function is now just a wrapper for \code{\link{which_nearest}}. +} + +\seealso{ + See \code{\link{which_nearest}}, which also returns indices for the + interval or intervals (in case of ties) at the distance reported. +} + +\examples{ +# Point to interval + +to <- Intervals( c(0,5,3,Inf) ) +from <- -5:10 +plot( from, distance_to_nearest( from, to ), type = "l" ) +segments( to[,1], 1, pmin(to[,2], par("usr")[2]), 1, col = "red" ) + +# Interval to interval + +from <- Intervals( c(-Inf,-Inf,3.5,-1,1,4) ) +distance_to_nearest( from, to ) +} diff --git a/man/empty-methods.Rd b/man/empty-methods.Rd new file mode 100644 index 0000000..edb40eb --- /dev/null +++ b/man/empty-methods.Rd @@ -0,0 +1,76 @@ +\name{empty} + +\alias{empty} +\alias{empty,Intervals-method} +\alias{empty,Intervals_full-method} + +\title{Identify empty interval rows} + +\description{ + A valid interval matrix may contain empty intervals: those with common + endpoints, at least one of which is open. The \code{empty} method + identifies these rows. +} + +\usage{ +\S4method{empty}{Intervals}(x) + +\S4method{empty}{Intervals_full}(x) +} + +\arguments{ + \item{x}{An \code{"Intervals"} or \code{"Intervals_full"} object.} +} + +\details{ + Intervals are deemed to be empty when their endpoints are equal and + not both closed, or for \code{type == "Z"}, when their endpoints differ + by 1 and both are open. The matrices \code{x} and \code{x[!empty(x),]} + represent the same subset of the integers or the real line. +} + +\value{ + A boolean vector with length equal to \code{nrow(x)}. +} + +\section{Warning}{ + Exact equality (\code{==}) comparisons are used by \code{empty}. See + the package vignette for a discussion of equality and floating point + numbers. +} + +\note{ + Note that intervals of size 0 may not be empty over the reals, and + intervals whose second endpoint is strictly greater than the first + \emph{may} be empty over the integers, if both endpoints are open. +} + +\seealso{ + See \code{\link{size}} to compute the size of each interval in an + object. +} + +\examples{ +z1 <- Intervals( cbind( 1, 1:3 ), type = "Z" ) +z2 <- z1; closed(z2)[1] <- FALSE +z3 <- z1; closed(z3) <- FALSE + +empty(z1) +empty(z2) +empty(z3) + +r1 <- z1; type(r1) <- "R" +r2 <- z2; type(r2) <- "R" +r3 <- z3; type(r3) <- "R" + +empty(r1) +empty(r2) +empty(r3) + +s1 <- Intervals_full( matrix( 1, 3, 2 ), type = "Z" ) +closed(s1)[2,2] <- FALSE +closed(s1)[3,] <- FALSE + +empty(s1) +} + diff --git a/man/expand-methods.Rd b/man/expand-methods.Rd new file mode 100644 index 0000000..ae5da1d --- /dev/null +++ b/man/expand-methods.Rd @@ -0,0 +1,96 @@ +\name{expand} + +\alias{expand} +\alias{expand,Intervals_virtual-method} + +\alias{contract} +\alias{contract,Intervals_virtual-method} + +\title{Expand or contract intervals} + +\description{ + It is often useful to shrink or grow each interval in a set of + intervals: to smooth over small, uninteresting gaps, or to address + possible imprecision resulting from floating point arithmetic. The + \code{expand} and \code{contract} methods implement this, using either + absolute or relative difference. +} + +\usage{ +\S4method{expand}{Intervals_virtual}(x, delta = 0, type = c("absolute", "relative")) + +\S4method{contract}{Intervals_virtual}(x, delta = 0, type = c("absolute", "relative")) +} + + +\arguments{ + + \item{x}{ An \code{"Intervals"} or \code{"Intervals_full"} object.} + + \item{delta}{ + A non-negative adjustement value. A vector is permitted, + and its entries will be recycled if necessary. + } + + \item{type}{ + Should adjustment be based on relative or absolute difference. When + \code{type == "relative"} intervals are expanded/contracted to + include/exclude points for which a relative difference with respect + to the nominal value is less than or equal to \code{delta}. (See the + note below.) When \code{type == "absolute"}, absolute rather than + relative difference is used, i.e., all intervals are expanded or + contracted by the same amount. + } + +} + +\value{ + A single object of appropriate class, with endpoint positions adjusted + as requested. Expansion returns an object with the same dimension as + \code{x}; contraction may lead to the elimination of now-empty rows. +} + +\note{ + Here, the relative difference between \emph{x} and \emph{y} is + |\emph{x} - \emph{y}|/max(|\emph{x}|, |\emph{y}|). +} + +\examples{ +# Using adjustment to remove small gaps + +x <- Intervals( c(1,10,100,8,50,200), type = "Z" ) +close_intervals( contract( reduce( expand(x, 1) ), 1 ) ) + +# Finding points for which, as a result of possible floating point +# error, intersection may be ambiguous. Whether y1 intersects y2[2,] +# depends on precision. + +delta <- .Machine$double.eps^0.5 +y1 <- Intervals( c( .5, 1 - delta / 2 ) ) +y2 <- Intervals( c( .25, 1, .75, 2 ) ) + +# Nominal + +interval_intersection( y1, y2 ) + +# Inner limit + +inner <- interval_intersection( + contract( y1, delta, "relative" ), + contract( y2, delta, "relative" ) + ) + +# Outer limit + +outer <- interval_intersection( + expand( y1, delta, "relative" ), + expand( y2, delta, "relative" ) + ) + +# The ambiguous set, corresponding to points which may or may not be in +# the intersection -- depending on numerical values for endpoints +# which are, with respect to relative difference, indistinguishable from +# the nominal values. + +interval_difference( outer, inner ) +} \ No newline at end of file diff --git a/man/interval_complement-methods.Rd b/man/interval_complement-methods.Rd new file mode 100644 index 0000000..2cd50f6 --- /dev/null +++ b/man/interval_complement-methods.Rd @@ -0,0 +1,38 @@ +\name{interval_complement} + +\alias{interval_complement} +\alias{interval_complement,Intervals_virtual-method} + +\title{Compute the complement of a set of intervals} + +\description{ + Compute the complement of a set of intervals. +} + +\usage{ +\S4method{interval_complement}{Intervals_virtual}(x, check_valid = TRUE) +} + +\arguments{ + \item{x}{An \code{"Intervals"} or \code{"Intervals_full"} object.} + + \item{check_valid}{ + Should \code{\link{validObject}} be called before passing to + compiled code? Also see \code{\link{interval_overlap}}. + } +} + +\value{ + An object of the same class as \code{x}, compactly representing the + complement of the intervals described in \code{x}. +} + +\note{ + For objects of class \code{"Intervals"}, closure on \code{-Inf} or + \code{Inf} endpoints is set to match that of all the intervals with + finite endpoints. For objects of class \code{"Intervals_full"}, + non-finite endpoints are left open (although in general, this package + does not make a distinction between closed and open infinite + endpoints). +} + diff --git a/man/interval_difference-methods.Rd b/man/interval_difference-methods.Rd new file mode 100644 index 0000000..9c0447f --- /dev/null +++ b/man/interval_difference-methods.Rd @@ -0,0 +1,40 @@ +\name{interval_difference} + +\alias{interval_difference} +\alias{interval_difference,Intervals_virtual,Intervals_virtual-method} + +\title{Compute set difference} + +\description{ + Compute the set difference between two objects. +} + +\usage{ +\S4method{interval_difference}{Intervals_virtual,Intervals_virtual}(x, y, check_valid = TRUE) +} + +\arguments{ + \item{x}{An \code{"Intervals"} or \code{"Intervals_full"} object.} + + \item{y}{ + An \code{"Intervals"} or \code{"Intervals_full"} object, with a + \code{type} slot matching that of \code{x}. + } + + \item{check_valid}{ + Should \code{\link{validObject}} be called on \code{x} and \code{y} + before passing to compiled code? Also see + \code{\link{interval_overlap}}. + } +} + +\value{ + An object representing the subset of the integers or real line, as + determined by \code{type(x)}, found in \code{x} but not in \code{y}. +} + +\seealso{ + These methods are just wrappers for + \code{\link{interval_intersection}} and + \code{\link{interval_complement}}. +} \ No newline at end of file diff --git a/man/interval_included-methods.Rd b/man/interval_included-methods.Rd new file mode 100644 index 0000000..795e536 --- /dev/null +++ b/man/interval_included-methods.Rd @@ -0,0 +1,138 @@ +\name{interval_included} + +\alias{interval_included} +\alias{interval_included,Intervals,Intervals-method} +\alias{interval_included,Intervals_full,Intervals_full-method} + +\title{Assess inclusion of one set of intervals with respect to another} + +\description{ + Determine which intervals in the one set are completely included in + the intervals of a second set. +} + +\usage{ +\S4method{interval_included}{Intervals,Intervals}(from, to, check_valid = TRUE) +\S4method{interval_included}{Intervals_full,Intervals_full}(from, to, check_valid = TRUE) +} + +\arguments{ + + \item{from}{ + An \code{"Intervals"} or \code{"Intervals_full"} object, or a + vector of class \code{"numeric"}. + } + + \item{to}{ + An \code{"Intervals"} or \code{"Intervals_full"} object, or a + vector of class \code{"numeric"}. + } + + \item{check_valid}{ + Should \code{\link{validObject}} be called before passing to + compiled code? This, among other things, verifies that endpoints are + of data type \code{"numeric"} and the \code{closed} vector/matrix is + appropriately sized and of the correct data type. (Compiled code + does no further checking.) + } + +} + +\value{ + A list, with one element for each row/component of \code{from}. The + elements are vectors of indices, indicating which \code{to} rows (or + components, for the \code{"numeric"} method) are completely included + within each interval in \code{from}. A list element of length 0 + indicates no included elements. Note that empty \code{to} elements are + not included in anything, and empty \code{from} elements do not + include anything. +} + +\seealso{ + See \code{\link{interval_overlap}} for partial overlaps -- i.e., at at + least a point. +} + +\examples{ +# Note that 'from' and 'to' contain valid but empty intervals. + +to <- Intervals( + matrix( + c( + 2, 6, + 2, 8, + 2, 9, + 4, 4, + 6, 8 + ), + ncol = 2, byrow = TRUE + ), + closed = c( TRUE, FALSE ), + type = "Z" + ) + +from <- Intervals( + matrix( + c( + 2, 8, + 8, 9, + 6, 9, + 11, 12, + 3, 3 + ), + ncol = 2, byrow = TRUE + ), + closed = c( TRUE, FALSE ), + type = "Z" + ) +rownames(from) <- letters[1:nrow(from)] + +from +to +interval_included(from, to) + +closed(to) <- TRUE +to +interval_included(from, to) + +# Intervals_full + +F <- FALSE +T <- TRUE + +to <- Intervals_full( + rep( c(2,8), c(4,4) ), + closed = matrix( c(F,F,T,T,F,T,F,T), ncol = 2 ), + type = "R" + ) + +type( from ) <- "R" +from <- as( from, "Intervals_full" ) + +from +to +interval_included(from, to) + +# Testing + +B <- 1000 + +x1 <- rexp( B, 1/1000 ) +s1 <- runif( B, max=5 ) +x2 <- rexp( B, 1/1000 ) +s2 <- runif( B, max=3 ) + +from <- Intervals_full( cbind( x1, x1 + s1 ) ) +to <- Intervals_full( cbind( x2, x2 + s2 ) ) + +ii <- interval_included( from, to ) +ii_match <- which( sapply( ii, length ) > 0 ) + +from[ ii_match[1:3], ] +lapply( ii[ ii_match[1:3] ], function(x) to[x,] ) + +included <- to[ unlist( ii ), ] +dim( included ) + +interval_intersection( included, interval_complement( from ) ) +} diff --git a/man/interval_intersection-methods.Rd b/man/interval_intersection-methods.Rd new file mode 100644 index 0000000..4c176d4 --- /dev/null +++ b/man/interval_intersection-methods.Rd @@ -0,0 +1,43 @@ +\name{interval_intersection} + +\alias{interval_intersection} +\alias{interval_intersection,Intervals_virtual-method} +\alias{interval_intersection,missing-method} + +\title{Compute the intersection of one or more sets of intervals} + +\description{ + Given one or more sets of intervals, produce a new set compactly + representing points contained in at least one interval of each input + object. +} + +\usage{ +\S4method{interval_intersection}{Intervals_virtual}(x, ..., check_valid = TRUE) + +\S4method{interval_intersection}{missing}(x, ..., check_valid = TRUE) +} + +\arguments{ + + \item{x}{ An \code{"Intervals"} or \code{"Intervals_full"} object.} + + \item{\dots}{Additional objects of the same classes permitted for \code{x}.} + + \item{check_valid}{ + Should \code{\link{validObject}} be called before passing to + compiled code? Also see \code{\link{interval_overlap}}. + } + +} + +\value{ + A single object representing points contained in each of the objects + supplied in the \code{x} and \code{...} arguments. +} + +\seealso{ + See \code{\link{interval_union}} and + \code{\link{interval_complement}}, which are used to produce the + results. +} \ No newline at end of file diff --git a/man/interval_overlap-methods.Rd b/man/interval_overlap-methods.Rd new file mode 100644 index 0000000..cf053ed --- /dev/null +++ b/man/interval_overlap-methods.Rd @@ -0,0 +1,128 @@ +\name{interval_overlap} + +\alias{interval_overlap} +\alias{interval_overlap,Intervals_virtual_or_numeric,Intervals_virtual_or_numeric-method} +\alias{interval_overlap,missing,ANY-method} +\alias{interval_overlap,ANY,missing-method} + +\title{Assess overlap from one set of intervals to another} + +\description{ + Asses overlap from intervals in one set to intervals in another set, + and return the relevant indices. +} + +\usage{ +\S4method{interval_overlap}{Intervals_virtual_or_numeric,Intervals_virtual_or_numeric}(from, to, check_valid = TRUE) +} + +\arguments{ + + \item{from}{ + An \code{"Intervals"} or \code{"Intervals_full"} object, or a + vector of class \code{"numeric"}. \emph{Note!} Prior to v. 0.11.1, + this argument was called \code{target}. + } + + \item{to}{ + An \code{"Intervals"} or \code{"Intervals_full"} object, or a + vector of class \code{"numeric"}. \emph{Note!} Prior to v. 0.11.1, + this argument was called \code{query}. + } + + \item{check_valid}{ + Should \code{\link{validObject}} be called before passing to + compiled code? This, among other things, verifies that endpoints are + of data type \code{"numeric"} and the \code{closed} vector/matrix is + appropriately sized and of the correct data type. (Compiled code + does no further checking.) + } + +} + +\details{ + Intervals which meet at endpoints overlap only if both endpoints are + closed. Intervals in \code{to} with \code{NA} endpoints are + ignored, with a warning; in \code{from}, such intervals produce no + matches. Intervals in either \code{to} or \code{from} which are + actually empty have their endpoints set to \code{NA} before + proceeding, with warning, and so do not generate matches. If + eith \code{to} or \code{from} is a vector of class \code{"numeric"}, + overlap will be assess for the corresponding set of points. +} + +\value{ + A list, with one element for each row/component of \code{from}. The + elements are vectors of indices, indicating which \code{to} rows (or + components, for the \code{"numeric"} method) overlap each interval in + \code{from}. A list element of length 0 indicates no overlapping + elements. +} + +\note{ + If you want real (\code{type == "R"}) intervals that overlap in a set + of positive measure --- not just at endpoints --- set all endpoints to + open (i.e., \code{close(from) <- FALSE; closed(to) <- FALSE}) first. + + This function is now just a wrapper for \code{\link{which_nearest}}. +} + +\seealso{ + See \code{\link{which_nearest}} for details on nearby as well as + overlapping intervals in \code{to}. +} + +\examples{ +# Note that 'from' contains a valid but empty interval. + +to <- Intervals( + matrix( + c( + 2, 8, + 3, 4, + 5, 10 + ), + ncol = 2, byrow = TRUE + ), + closed = c( TRUE, FALSE ), + type = "Z" + ) + +from <- Intervals( + matrix( + c( + 2, 8, + 8, 9, + 6, 9, + 11, 12, + 3, 3 + ), + ncol = 2, byrow = TRUE + ), + closed = c( TRUE, FALSE ), + type = "Z" + ) +rownames(from) <- letters[1:nrow(from)] + +empty(to) +empty(from) + +interval_overlap(from, to) + +# Non-empty real intevals of size 0 can overlap other intervals. + +u <- to +type(u) <- "R" + +v <- Intervals_full( rep(3,4) ) +closed(v)[2,] <- FALSE +v +empty(v) +size(v) + +interval_overlap(v, u) + +# Working with points + +interval_overlap( from, c( 2, 3, 6, NA ) ) +} diff --git a/man/interval_union-methods.Rd b/man/interval_union-methods.Rd new file mode 100644 index 0000000..0c9d1f9 --- /dev/null +++ b/man/interval_union-methods.Rd @@ -0,0 +1,55 @@ +\name{interval_union} + +\alias{interval_union} +\alias{interval_union,Intervals_virtual-method} +\alias{interval_union,missing-method} + +\title{Compute the union of intervals in one or more interval matrices} + +\description{ + Compute the union of intervals in one or more interval matrices. The + intervals contained in a single interval matrix object need not, in + general, be disjoint; \code{interval_union}, however, always returns a + matrix with sorted, disjoint intervals. +} + +\usage{ +\S4method{interval_union}{Intervals_virtual}(x, ..., check_valid = TRUE) + +\S4method{interval_union}{missing}(x, ..., check_valid = TRUE) +} + +\arguments{ + + \item{x}{ An \code{"Intervals"} or \code{"Intervals_full"} object.} + + \item{\dots}{ + Optionally, additional objects which can be combined with + \code{x}. See \code{\link{c.Intervals}} for details on mixing + different types of objects. + } + + \item{check_valid}{ + Should \code{\link{validObject}} be called before passing to + compiled code? Also see \code{\link{interval_overlap}}. + } + +} + +\details{ + All supplied objects are combined using \code{\link[=c.Intervals]{c}} + and then then passed to \code{\link{reduce}}. The \code{missing} + method is only to permit use of \code{\link{do.call}} with named list, + since no named element will typically match \code{x}. +} + +\value{ + A single object of appropriate class, compactly representing the union + of all intervals in \code{x}, and optionally, in \code{...} as + well. For class \code{"Intervals"}, the result will have the same + \code{closed} values as \code{x}. +} + +\seealso{ + See \code{\link{reduce}}, which is used to produce the results. +} \ No newline at end of file diff --git a/man/intervals-package.Rd b/man/intervals-package.Rd new file mode 100644 index 0000000..daa5761 --- /dev/null +++ b/man/intervals-package.Rd @@ -0,0 +1,69 @@ +\name{intervals-package} + +\alias{intervals-package} +\alias{intervals} + +\docType{package} + +\title{ +Tools for working with points and intervals +} + +\description{ +Tools for working with and comparing sets of points and intervals. +} + +\details{ + + Index: + \describe{ + \item{\code{\link{Intervals-class}}}{Classes \code{"Intervals"} and \code{"Intervals_full"}.} + \item{\code{\link{Intervals_virtual-class}}}{Class \code{"Intervals_virtual"}.} + \item{\code{\link{Intervals_virtual_or_numeric-class}}}{Class union \code{"Intervals_virtual_or_numeric"}.} + \item{\code{\link[=as.matrix.Intervals_virtual]{as.matrix}}}{Coerce endpoints to a matrix.} + \item{\code{\link[=c.Intervals]{c}}}{Concatenate different sets of intervals.} + \item{\code{\link{close_intervals}}}{Re-represent integer intervals with open or closed endpoints.} + \item{\code{\link{closed}}}{Accessor for \code{closed} slot: closure vector/matrix.} + \item{\code{\link{clusters}}}{Identify clusters in a collection of positions or intervals.} + \item{\code{\link{contract}}}{Contract sets.} + \item{\code{\link{distance_to_nearest}}}{Compute distance to nearest position in a set of intervals.} + \item{\code{\link{empty}}}{Identify empty interval rows.} + \item{\code{\link{expand}}}{Expand sets.} + \item{\code{\link{interval_complement}}}{Compute the complement of a set of intervals.} + \item{\code{\link{interval_difference}}}{Compute set difference.} + \item{\code{\link{interval_included}}}{Assess inclusion of one set of intervals with respect to another.} + \item{\code{\link{interval_intersection}}}{Compute the intersection of one or more sets of intervals.} + \item{\code{\link{interval_overlap}}}{Assess which query intervals overlap which targets.} + \item{\code{\link{interval_union}}}{Compute the union of intervals in one or more interval matrices.} + \item{\code{\link{is.na}}}{Identify interval rows with \code{NA} endpoints.} + \item{\code{\link[=plot.Intervals]{plot}}}{S3 plotting methods for intervals objects.} + \item{\code{\link{reduce}}}{Compactly re-represent the points in a set of intervals.} + \item{\code{\link{sgd}}}{Yeast gene model sample data.} + \item{\code{\link{size}}}{Compute interval sizes.} + \item{\code{\link[=split.Intervals_virtual]{split}}}{Split an intervals object according to a factor.} + \item{\code{\link{type}}}{Accessor for \code{type} slot: Z or R.} + \item{\code{\link{which_nearest}}}{Identify nearest member(s) in a set of intervals.} + } + + Further information is available in the following vignettes: + \describe{ + \item{\code{intervals_overview}}{Overview of the intervals package.} + } + +} + +\author{ +Richard Bourgon +} + +\keyword{ package } + +\section{Acknowledgments}{ + Thanks to Julien Gagneur, Simon Anders, and Wolfgang Huber for + numerous helpful suggestions about the package content and code. +} + +\seealso{ + See the \pkg{genomeIntervals} package in Bioconductor, which extends + the functionality of this package. +} diff --git a/man/plot.Intervals.Rd b/man/plot.Intervals.Rd new file mode 100644 index 0000000..73b6c6e --- /dev/null +++ b/man/plot.Intervals.Rd @@ -0,0 +1,152 @@ +\name{plot.Intervals} + +\alias{plot} +\alias{plot.Intervals} +\alias{plot.Intervals_full} +\alias{plot,Intervals,missing-method} +\alias{plot,Intervals_full,missing-method} +\alias{plot,Intervals,ANY-method} +\alias{plot,Intervals_full,ANY-method} + +\title{Plotting methods for interval objects} + +\description{ + S3 methods for plotting \code{"Intervals"} and \code{"Intervals_full"} + objects. +} + +\usage{ +\S3method{plot}{Intervals}(x, y, ...) +\S3method{plot}{Intervals_full}( + x, y = NULL, + axes = TRUE, + xlab = "", ylab = "", + xlim = NULL, ylim = NULL, + col = "black", lwd = 1, + cex = 1, + use_points = TRUE, + use_names = TRUE, + names_cex = 1, + ... + ) + +\S4method{plot}{Intervals,missing}(x, y, ...) +\S4method{plot}{Intervals_full,missing}(x, y, ...) +\S4method{plot}{Intervals,ANY}(x, y, ...) +\S4method{plot}{Intervals_full,ANY}(x, y, ...) +} + +\arguments{ + + \item{x}{An \code{"Intervals"} or \code{"Intervals_full"} object.} + + + \item{y}{ + Optional vector of heights at which to plot intervals. If omitted, + \code{y} will be automatically computed to generate a compact plot + but with no overlap. + } + + + \item{axes}{As for \code{\link{plot.default}}.} + + \item{xlab}{As for \code{\link{plot.default}}.} + + \item{ylab}{As for \code{\link{plot.default}}.} + + \item{xlim}{As for \code{\link{plot.default}}.} + + \item{ylim}{ + If not explicitly supplied, \code{ylim} is set to the maximum value + required for intervals which are visible for the given \code{xlim}. + } + + \item{col}{ + Color used for segments and endpoint points and interiors. Recycled + if necessary. + } + + \item{lwd}{Line width for segments. See \code{\link{par}}.} + + \item{cex}{ + Endpoint magnification. Only relevant if \code{use_points = + TRUE}. See \code{\link{par}}. + } + + \item{use_points}{Should points be plotted at interval endpoints?} + + \item{use_names}{ + Should \code{rownames(x)} by used for segment labels in the plot? + } + + \item{names_cex}{ + Segment label magnification. Only relevant if \code{use_names = + TRUE}. + } + + \item{...}{Other arguments for \code{\link{plot.default}}.} + +} + +\details{ + Intervals with \code{NA} for either endpoint are not + plotted. Vertical placement is on the integers, beginning with 0. +} + +\value{None.} + +\examples{ +# Note plot symbol for empty interval in 'from'. + +from <- Intervals( + matrix( + c( + 2, 8, + 8, 9, + 6, 9, + 11, 12, + 3, 3 + ), + ncol = 2, byrow = TRUE + ), + closed = c( FALSE, TRUE ), + type = "Z" + ) + +rownames(from) <- c("a","b","c","d","e") + +to <- Intervals( + matrix( + c( + 2, 8, + 3, 4, + 5, 10 + ), + ncol = 2, byrow = TRUE + ), + closed = c( FALSE, TRUE ), + type = "Z" + ) + +rownames(to) <- c("x","y","z") + +empty(from) + +plot( + c(from,to), + col = rep(1:2, c(nrow(from), nrow(to))) + ) + +legend("topright", c("from","to"), col=1:2, lwd=1) + +# More intervals. The maximal height shown is adapted to the plotting +# window. + +B <- 10000 +left <- runif( B, 0, 1e5 ) +right <- left + rexp( B, rate = 1/10 ) +x <- Intervals( cbind( left, right ) ) + +plot(x, use_points=FALSE) +plot(x, use_points=FALSE, xlim = c(0, 500)) +} \ No newline at end of file diff --git a/man/reduce-methods.Rd b/man/reduce-methods.Rd new file mode 100644 index 0000000..0ab7566 --- /dev/null +++ b/man/reduce-methods.Rd @@ -0,0 +1,42 @@ +\name{reduce} + +\alias{reduce} +\alias{reduce,Intervals_virtual-method} + +\title{Compactly re-represent the points in a set of intervals} + +\description{ + In general, \code{"\linkS4class{Intervals}"} and + \code{"\linkS4class{Intervals_full}"} objects may be redundant, the + intervals they contain may be in arbitrary order, and they may contain + non-informative intervals for which one or both endpoints are + \code{NA}. The \code{reduce} function re-represents the underlying + subsets of the integers or the real line in the unique, minimal form, + removing intervals with \code{NA} endpoints (with warning). +} + +\usage{ +\S4method{reduce}{Intervals_virtual}( x, check_valid = TRUE ) +} + +\arguments{ + + \item{x}{ An \code{"Intervals"} or \code{"Intervals_full"} object.} + + \item{check_valid}{ + Should \code{\link{validObject}} be called before passing to + compiled code? Also see \code{\link{interval_overlap}}. + } + +} + +\value{ + A single object of appropriate class, compactly representing the + union of all intervals in \code{x}. All intervals in \code{reduce(x)} + have numeric (i.e., not \code{NA}) endpoints. +} + +\seealso{ + See \code{\link{interval_union}}, which is really just concatenates its + arguments and then calls \code{reduce}. +} \ No newline at end of file diff --git a/man/sgd.Rd b/man/sgd.Rd new file mode 100644 index 0000000..9021231 --- /dev/null +++ b/man/sgd.Rd @@ -0,0 +1,91 @@ +\name{sgd} + +\alias{sgd} + +\docType{data} + +\title{Yeast gene model sample data} + +\description{ + This data set contains a data frame describing a subset of + the chromosome feature data represented in Fall 2007 version of + \file{saccharomyces\_cerevisiae.gff}, available for download from the + \emph{Saccharomyces} Genome Database (\url{http://www.yeastgenome.org}). +} + +\usage{data(sgd)} + +\format{ + A data frame with 14080 observations on the following 8 variables. + \describe{ + \item{\code{SGDID}}{SGD feature ID.} + \item{\code{type}}{ + Only four feature types have been retatined: \code{"CDS"}, + \code{"five_prime_UTR_intron"}, \code{"intron"}, and \code{"ORF"}. Note + that \code{"ORF"} correspond to a whole gene while \code{"CDS"}, to an + exon. \emph{S. cerevisae} does not, however, have many + multi-exonic genes. + } + \item{\code{feature_name}}{A character vector} + \item{\code{parent_feature_name}}{ + The \code{feature_name} of the a larger element to which the + current feature belongs. All retained \code{"CDS"} entries, for + example, belong to an \code{"ORF"} entry. + } + \item{\code{chr}}{ + The chromosome on which the feature occurs. + } + \item{\code{start}}{Feature start base.} + \item{\code{stop}}{Feature stop base.} + \item{\code{strand}}{Is the feature on the Watson or Crick strand?} + } +} + +\examples{ + +# An example to compute "promoters", defined to be the 500 bases +# upstream from an ORF annotation, provided these bases don't intersect +# another orf. See documentation for the sgd data set for more details +# on the annotation set. + +use_chr <- "chr01" + +data( sgd ) +sgd <- subset( sgd, chr == use_chr ) + +orf <- Intervals( + subset( sgd, type == "ORF", c( "start", "stop" ) ), + type = "Z" + ) +rownames( orf ) <- subset( sgd, type == "ORF" )$feature_name + +W <- subset( sgd, type == "ORF", "strand" ) == "W" + +promoters_W <- Intervals( + cbind( orf[W,1] - 500, orf[W,1] - 1 ), + type = "Z" + ) + +promoters_W <- interval_intersection( + promoters_W, + interval_complement( orf ) + ) + +# Many Watson-strand genes have another ORF upstream at a distance of +# less than 500 bp + +hist( size( promoters_W ) ) + +# All CDS entries are completely within their corresponding ORF entry. + +cds_W <- Intervals( + subset( sgd, type == "CDS" & strand == "W", c( "start", "stop" ) ), + type = "Z" + ) +rownames( cds_W ) <- NULL + +interval_intersection( cds_W, interval_complement( orf[W,] ) ) + +} + +\keyword{datasets} diff --git a/man/size-methods.Rd b/man/size-methods.Rd new file mode 100644 index 0000000..4476031 --- /dev/null +++ b/man/size-methods.Rd @@ -0,0 +1,73 @@ +\name{size} + +\alias{size} +\alias{size,Intervals-method} +\alias{size,Intervals_full-method} + +\title{Compute interval sizes} + +\description{ + Compute the size, in either Z or R as appropriate, for each interval + in an interval matrix. +} + +\usage{ +\S4method{size}{Intervals}(x, as = type(x)) + +\S4method{size}{Intervals_full}(x, as = type(x)) +} + +\arguments{ + + \item{x}{An \code{"Intervals"} or \code{"Intervals_full"} object.} + + \item{as}{ + Should the intervals be thought of as in Z or R? This is usually + determined automatically from the \code{type} slot, but because + changing type may cause object copying, it is sometimes convenient + to temporarily override this slot without actually resetting it. + } + +} + +\details{ + For type \code{"Z"} objects, counting measure; for type \code{"R"} + objects, Lebesgue measure. For type \code{"Z"} objects, intervals of + form (\emph{a},\emph{a}] and (\emph{a},\emph{a}) are both of length + 0. +} + +\value{ + A numeric vector with length equal to \code{nrow(x)}. +} + +\seealso{ + See \code{\link{empty}} to identify empty intervals. Note that when + \code{type(x) == "R"}, a size of 0 does not imply that an interval is + empty. +} + +\examples{ +z1 <- Intervals( cbind( 1, 1:3 ), type = "Z" ) +z2 <- z1; closed(z2)[1] <- FALSE +z3 <- z1; closed(z3) <- FALSE + +size(z1) +size(z2) +size(z3) + +r1 <- z1; type(r1) <- "R" +r2 <- z2; type(r2) <- "R" +r3 <- z3; type(r3) <- "R" + +size(r1) +size(r2) +size(r3) + +s1 <- Intervals_full( matrix( 1, 3, 2 ), type = "Z" ) +closed(s1)[2,2] <- FALSE +closed(s1)[3,] <- FALSE + +size(s1) +} + diff --git a/man/split.Intervals_virtual.Rd b/man/split.Intervals_virtual.Rd new file mode 100644 index 0000000..d4e2752 --- /dev/null +++ b/man/split.Intervals_virtual.Rd @@ -0,0 +1,37 @@ +\name{split} + +\alias{split} +\alias{split.Intervals_virtual} +\alias{split,Intervals_virtual-method} + +\title{Split an intervals object according to a factor} + +\description{ + S3 and S4 methods for splitting \code{"Intervals"} or + \code{"Intervals_full"} objects. +} + +\usage{ +\S3method{split}{Intervals_virtual}(x, f, drop = FALSE, ...) + +\S4method{split}{Intervals_virtual}(x, f, drop = FALSE, ...) +} + +\arguments{ + + \item{x}{ \code{"Intervals"} or \code{"Intervals_full"} objects. } + + \item{f}{ Passed to \code{\link{split.data.frame}}. } + + \item{drop}{ Passed to \code{\link{split.data.frame}}. } + + \item{...}{ Passed to \code{\link{split.data.frame}}. } + +} + +\value{ A list of objects of the same class as \code{x}, split by the + levels of \code{f}. Until R 2.15, special handling was not + required. Subsequent changes to the \pkg{base} package + \code{\link{split}} function required an explicit method here, but + code already provided by \code{\link{split.data.frame}} was + sufficient. } diff --git a/man/which_nearest-methods.Rd b/man/which_nearest-methods.Rd new file mode 100644 index 0000000..d9271b0 --- /dev/null +++ b/man/which_nearest-methods.Rd @@ -0,0 +1,108 @@ +\name{which_nearest} + +\alias{which_nearest} +\alias{which_nearest,numeric,Intervals_virtual-method} +\alias{which_nearest,Intervals_virtual,Intervals_virtual-method} +\alias{which_nearest,Intervals_virtual,numeric-method} +\alias{which_nearest,numeric,numeric-method} + +\title{Identify nearest member(s) in a set of intervals} + +\description{ + For each point or interval in the \code{from} argument, + identify the nearest member or members (in case of ties) of the + interval set in the \code{to} argument. +} + +\usage{ +\S4method{which_nearest}{numeric,Intervals_virtual}(from, to, check_valid = TRUE) + +\S4method{which_nearest}{Intervals_virtual,numeric}(from, to, check_valid = TRUE) + +\S4method{which_nearest}{Intervals_virtual,Intervals_virtual}(from, to, check_valid = TRUE) + +\S4method{which_nearest}{numeric,numeric}(from, to, check_valid = TRUE) +} + +\arguments{ + + \item{from}{An object of appropriate type.} + + \item{to}{An object of appropriate type.} + + \item{check_valid}{ + Should \code{\link{validObject}} be called before passing to + compiled code? Also see \code{\link{interval_overlap}}. + } + +} + +\value{ + A data frame with three columns: \code{distance_to_nearest}, + \code{which_nearest}, and \code{which_overlap}. The last two are + actually lists, since there may be zero, one, or more + nearest/overlapping intervals in the \code{to} object for any given + interval in \code{from}. + + Empty intervals in \code{to}, or intervals with \code{NA} endpoints, + produce a \code{NA} distance result, and no nearest or overlapping + hits. +} + +\note{ + (v. 0.11.0) The code used for the \code{distance_to_nearest} column + here is completely distinct from that used for the original + \code{\link{distance_to_nearest}} function. For the moment, they will + co-exist for testing purposes, but this function's code will + eventually replace the older code. + + Note that a naive way of implementing \code{which_nearest} would be to + use the simpler, old implementation of \code{distance_to_nearest}, use + \code{expand} to grow all intervals by the correspnoding amount, and + then use \code{interval_overlap} to identify target. This approach, + however, will miss a small fraction of targets due to floating point + issues. +} + +\examples{ +# Point to interval. Empty rows, or those with NA endpoints, do not +# generate hits. Note that distance_to_nearest can be 0 but without +# overlap, depending on endpoint closure. + +to <- Intervals_full( c(-1,0,NA,5,-1,3,10,Inf) ) +closed(to)[1,] <- FALSE +closed(to)[2,2] <- FALSE +from <- c( NA, -3:5 ) + +to +cbind( from, which_nearest( from, to ) ) + +# Completely empty to object + +which_nearest( from, to[1,] ) + +# Interval to interval + +from <- Intervals( c(-Inf,-Inf,3.5,-1,1,4) ) +from +which_nearest( from, to ) + +# Checking behavior with ties + +from <- Intervals_full( c(2,2,4,4,3,3,5,5) ) +closed( from )[2:3,] <- FALSE +to <- Intervals_full( c(0,0,6,6,1,1,7,8) ) +closed( to )[2:3,] <- FALSE + +from +to +which_nearest( from, to ) + +from <- Intervals_full( c(1,3,6,2,4,7) ) +to <- Intervals_full( c(4,4,5,5) ) +closed( to )[1,] <- FALSE + +from +to +which_nearest( from, to ) +} diff --git a/src/Endpoint.cpp b/src/Endpoint.cpp new file mode 100644 index 0000000..a22b1cd --- /dev/null +++ b/src/Endpoint.cpp @@ -0,0 +1,73 @@ +#include "Endpoint.h" + + + + +//////// Ordering for tied endpoints + +/* + We use two different tied endpoint orderings, one for reduce, and a different + one for interval_overlap. These should be set by any code that uses sorting, + so we initialize to -1, and have the sort method check this and throw an error + if it's still found. + + Let Q/T be query/target, L/R be left/right, and O/C be open/closed. Our + ordering, when pos is effectively tied, is then: + + QRO < TRO ... < TLC < QLC < QRC < TRC ... < TLO < QLO + 0 1 2 3 4 5 6 7 + + The basic principals are, for similar closure, start targets before + queries but finish them after queries. For abutting intervals, start new + intervals before finishing old ones, unless one or both endpoints are + open, in which case we should finish old intervals first. +*/ + +int Endpoint::state_array[2][2][2] = {{{0,0},{0,0}},{{0,0},{0,0}}}; + + + + +//////// Endpoint methods + +Endpoint::Endpoint(int i, double p, bool q, bool l, bool c) { + index = i; pos = p; query = q; left = l; closed = c; +} + +void Endpoint::R_print() const { + Rprintf( + "index = %i, pos = %f (%s, %s, %s)\n", + index, pos, + query ? "query" : "target", + left ? "left" : "right", + closed ? "closed" : "open" + ); +} + + + + +//////// Endpoints methods + +Endpoints::Endpoints( const double * pos, const int * closed, int n, bool query, bool is_full ) { + /* + The pos pointer should point to an n x 2 array of endpoints, and the closed + pointer, to either an array of booleans of the same size (if full = true) + or an array of two booleans (if full = false). Note that R uses int, not + bool, for logicals. Intervals with R numeric NA in either slot are + dropped. + */ + int i; + this->reserve( 2 * n ); + for ( i = 0; i < n; i++ ) { + if ( ISNA( pos[i] ) || ISNA( pos[i+n] ) ) continue; + this->push_back( Endpoint( i, pos[i], query, true, (bool) closed[ is_full ? i : 0 ] ) ); + this->push_back( Endpoint( i, pos[i+n], query, false, (bool) closed[ is_full ? i+n : 1 ] ) ); + } +} + +void Endpoints::R_print() const { + Endpoints::const_iterator it; + for ( it = this->begin(); it < this->end(); it++ ) + it->R_print(); +} diff --git a/src/Endpoint.h b/src/Endpoint.h new file mode 100644 index 0000000..57c679b --- /dev/null +++ b/src/Endpoint.h @@ -0,0 +1,69 @@ +#ifndef ENDPOINT_H + +#define ENDPOINT_H +#include +#include +#include + + + + +//////// Endpoint class + +class Endpoint{ + +private: + + static int state_array[2][2][2]; + + inline int state() const { return( state_array[query][left][closed] ); } + +public: + + int index; + double pos; + bool query, left, closed; + + Endpoint(int i, double p, bool q, bool l, bool c); + + inline bool operator< (const Endpoint& other) const { + /* + We assume that the calling code is aware of the difficulty in assessing + equality for floating point numbers and that values are passed in as + exactly equal (in their floating point representation) if and only if + exact equality is intended by the calling code. Given this assumption, + there is no need for a relative difference approach here. + */ + if ( this->pos == other.pos ) + return( this->state() < other.state() ); + else + return( this->pos < other.pos ); + } + + static void set_state_array( const int new_array[2][2][2] ) { + int i, j, k; + for( i = 0; i < 2; i++ ) + for( j = 0; j < 2; j++ ) + for( k = 0; k < 2; k++ ) + Endpoint::state_array[i][j][k] = new_array[i][j][k]; + } + + void R_print() const; + +}; + + + + +//////// Endpoints class + +class Endpoints : public std::vector< Endpoint > { +public: + Endpoints( const double * pos, const int * closed, int n, bool query, bool is_full ); + void R_print() const; +}; + + + + +#endif // #ifndef ENDPOINT_H diff --git a/src/plot_overlap.cpp b/src/plot_overlap.cpp new file mode 100644 index 0000000..7ae66ae --- /dev/null +++ b/src/plot_overlap.cpp @@ -0,0 +1,85 @@ +#include +#include +#include "Endpoint.h" +#include +#include +#include + +/* + What we require to prevent segfaults is the same as for + interval_overlap.cpp. See details there. Everything should be checked by the + calling code in R. + + For plotting without visual overlap, we do not care about endpoint closure, + and we always open new intervals before closing old ones. + + When an interval opens, we assign it the minimal free interior option, if + there is one; otherwise, we assign it the current count of open + intervals. When an interval closes, if it does not have the current maximum y, + its y gets added to the free_interior set. +*/ + + + + +const int reduce_order[2][2][2] = { + {{2,2},{1,1}}, // Target: {{ ), ] }, { (, [ }} + {{0,0},{0,0}} // Query: {{ ), ] }, { (, [ }} +}; + + + + +extern "C" +{ + + SEXP _plot_overlap(SEXP e, SEXP c, SEXP full) { + + // Load data and sort + int n = nrows(e); + bool full_bool = *LOGICAL(full); + Endpoints ep ( REAL(e), LOGICAL(c), n, false, full_bool ); + + // Set sorting order, then sort + Endpoint::set_state_array( reduce_order ); + sort( ep.begin(), ep.end() ); + + // Process + int i; + int active_count = 0; + std::set free_interior; + std::vector y (n); + Endpoints::const_iterator it; + + // Initialize to NA + for ( i = 0; i < n; i++ ) y[i] = R_NaInt; + + for ( it = ep.begin(); it < ep.end(); it++ ) { + if ( it->left ) { + // Opening an interval + if ( free_interior.size() > 0 ) { + y[ it->index ] = *free_interior.begin(); + free_interior.erase( free_interior.begin() ); + } + else y[ it->index ] = active_count; + active_count++; + } + else{ + // Closing an interval + active_count--; + if ( y[ it->index ] < active_count + free_interior.size() ) + free_interior.insert( y[ it->index ] ); + } + } + + // Prepare and return result. + SEXP result; + + PROTECT( result = allocVector( INTSXP, n ) ); + copy( y.begin(), y.end(), INTEGER( result ) ); + UNPROTECT(1); + return( result ); + + } + +} diff --git a/src/reduce.cpp b/src/reduce.cpp new file mode 100644 index 0000000..28c14af --- /dev/null +++ b/src/reduce.cpp @@ -0,0 +1,97 @@ +#include +#include +#include "Endpoint.h" +#include +#include + +/* + What we require to prevent segfaults is the same as for + interval_overlap.cpp. See details there. Everything should be checked by the + calling code in R. +*/ + + + + +const int reduce_order[2][2][2] = { + {{2,4},{3,1}}, // Target: {{ ), ] }, { (, [ }} + {{0,0},{0,0}} // Query: {{ ), ] }, { (, [ }} +}; + + + + +extern "C" +{ + + SEXP _reduce(SEXP e, SEXP c, SEXP full) { + + // Load data and sort + int n = nrows(e); + bool full_bool = *LOGICAL(full); + Endpoints ep ( REAL(e), LOGICAL(c), n, false, full_bool ); + + // Set sorting order, then sort + Endpoint::set_state_array( reduce_order ); + sort( ep.begin(), ep.end() ); + + // Process + int score = 0; + std::vector start, end; + std::vector start_c, end_c; + Endpoints::const_iterator it; + + for ( it = ep.begin(); it < ep.end(); it++ ) { + if ( score == 0 ) { + if ( !it->left ) + error("Internal error: unexpected endpoint type when score = 0."); + start.push_back( it->pos ); + if ( full_bool ) start_c.push_back( (int) it->closed ); + } + score += ( it->left ? +1 : -1 ); + if ( score == 0 ) { + if ( it->left ) + error("Internal error: unexpected endpoint type when score = 0."); + end.push_back( it->pos ); + if ( full_bool ) end_c.push_back( (int) it->closed ); + } + } + + if ( start.size() != end.size() ) + error("Internal error: mismatched start and end endpoint sets."); + + // Prepare and return result. + SEXP result; + + PROTECT( result = allocVector( VECSXP, 2 ) ); + + SET_VECTOR_ELT( result, 0, allocMatrix( REALSXP, start.size(), 2 ) ); + copy( start.begin(), start.end(), REAL( VECTOR_ELT( result, 0 ) ) ); + copy( + end.begin(), end.end(), + REAL( VECTOR_ELT( result, 0 ) ) + start.size() + ); + + if ( full_bool ) { + SET_VECTOR_ELT( result, 1, allocMatrix( LGLSXP, start.size(), 2 ) ); + copy( + start_c.begin(), start_c.end(), + LOGICAL( VECTOR_ELT( result, 1 ) ) + ); + copy( + end_c.begin(), end_c.end(), + LOGICAL( VECTOR_ELT( result, 1 ) ) + start.size() + ); + } + else { + SET_VECTOR_ELT( result, 1, allocVector( LGLSXP, 2 ) ); + LOGICAL( VECTOR_ELT( result, 1 ) )[0] = LOGICAL(c)[0]; + LOGICAL( VECTOR_ELT( result, 1 ) )[1] = LOGICAL(c)[1]; + } + + UNPROTECT(1); + return( result ); + + } + +} diff --git a/src/which_nearest.cpp b/src/which_nearest.cpp new file mode 100644 index 0000000..dab5c41 --- /dev/null +++ b/src/which_nearest.cpp @@ -0,0 +1,215 @@ +#include +#include +#include "Endpoint.h" +#include +#include +#include +#include + +/* + #### What we require to prevent segfaults: + + 1. For Intervals_full objects, the endpoint and closure matrices must be of + the same dimension, and have two columns each. For Intervals objects, we + expect a closure vector of length 2. + + 2. Endpoints must be type double, and closure must be R logical, which ends up + as int in C++. + + The validity function for the classes should verify both of these properties, + so we do no additional checks here, i.e., we assume that we've been passed + valid objects. + + #### To-do + + - Try hash_set instead of set, if it compiles OK. + + #### The "active" and "pending" sets + + - The q_active and t_active sets record indices of intervals which are + currently open as we proceed through the list of points. + + - The q_pending set records the query interval (or intervals, in the case of + ties) which has closed most recently to the left. If multiple target intervals + come by before the next query interval is opened, each will need to be compare + to the right endpoint of the intervals in q_pending. Once we open a new query + interval, q_pending is cleared. + + - The t_pending set records indices of target intervals which have closed + since the last time a query interval opened -- and distances to the left were + therefor checked. Each of these target intervals needs to be checked against + the next query interval to become active, as well as any additional query + intervals that start at the same location. The t_pending set will be cleared + when we get ready to add a new index to it but see that (i) a query interval + has been opened since the last time we tried to add an index, and (ii) that + pos has changed. + + the last closure of a query interval. When we open a new query interval, + each of these target intervals needs to be compared to the new query + interval's left endpoint. Once we do this -- and provided that the next query + interval does not start in the same place -- then t_pending is cleared. + +*/ + +const int overlap_order[2][2][2] = { + {{1,5},{6,2}}, // Target: {{ ), ] }, { (, [ }} + {{0,4},{7,3}} // Query: {{ ), ] }, { (, [ }} +}; + +extern "C" +{ + + SEXP _which_nearest(SEXP qe, SEXP te, SEXP qc, SEXP tc, SEXP q_full, SEXP t_full) { + + // Load data, combine + int qn = nrows(qe); + int tn = nrows(te); + + Endpoints q ( REAL(qe), LOGICAL(qc), qn, true, *LOGICAL(q_full) ); + Endpoints t ( REAL(te), LOGICAL(tc), tn, false, *LOGICAL(t_full) ); + + double* q_right = REAL(qe) + qn; + double* t_right = REAL(te) + tn; + + q.insert( q.end(), t.begin(), t.end() ); + + // Set sorting order, then sort + Endpoint::set_state_array( overlap_order ); + sort(q.begin(), q.end()); + + // Process overlaps + std::set q_active, t_active; + std::vector< std::set > indices ( tn ); + + // For distance_to_nearest and which_nearest + double q_check_pos = std::numeric_limits::infinity(); + std::set q_pending, t_pending; + std::vector delta ( tn ); + std::vector< std::set > which ( tn ); + double diff; + + // For looping + Endpoints::const_iterator it; + std::set::iterator set_it; + int i; + + // Initialize delta + for ( i = 0; i < tn; i++ ) + delta[i] = std::numeric_limits::infinity(); + + for ( it = q.begin(); it < q.end(); it++ ) { + if ( it->query ) { + if ( it->left ) { + // Query left + for( set_it = t_active.begin(); set_it != t_active.end(); set_it++ ) { + indices[ *set_it ].insert( it->index + 1 ); + if ( delta[ *set_it ] > 0 ) which[ *set_it ].clear(); + delta[ *set_it ] = 0; + which[ *set_it ].insert( it->index + 1 ); + } + for( set_it = t_pending.begin(); set_it != t_pending.end(); set_it++ ) { + diff = it->pos - t_right[ *set_it ]; + if ( delta[ *set_it ] == diff ) + which[ *set_it ].insert( it->index + 1 ); + if ( delta[ *set_it ] > diff ) { + which[ *set_it ].clear(); + delta[ *set_it ] = diff; + which[ *set_it ].insert( it->index + 1 ); + } + } + q_active.insert( it->index ); + q_check_pos = it->pos; + q_pending.clear(); + } + else { + // Query right + q_active.erase( it->index ); + if ( q_right[ it->index ] > q_right[ *q_pending.begin() ] ) + q_pending.clear(); + q_pending.insert( it->index ); + } + } + else { + if ( it->left ) { + // Target left + /* + Note that some care is required here. It is possible that there is + another interval at distance 0 to the left, but which, due to + endpoint closure, does not actually overlap the target interval we + are activating. In this case, q_active could be empty but we should + still set delta to 0 for this target interval and add the index of + the query interval immediately to the left to the which + set. Similarly, if there are both overlapping and non-overlapping + distance-zero query intervals, we want to return delta = 0 and the + indices for both types. + */ + if ( q_active.size() > 0 ) { + delta[ it->index ] = 0; + for( set_it = q_active.begin(); set_it != q_active.end(); set_it++ ) { + indices[ it->index ].insert( *set_it + 1 ); + which[ it->index ].insert( *set_it + 1 ); + } + } + for ( set_it = q_pending.begin(); set_it != q_pending.end(); set_it++ ) { + diff = it->pos - q_right[ *set_it ]; + if ( delta[ it->index ] == diff ) + which[ it->index ].insert( *set_it + 1 ); + if ( delta[ it->index ] > diff ) { + which[ it->index ].clear(); + delta[ it->index ] = diff; + which[ it->index ].insert( *set_it + 1 ); + } + } + t_active.insert( it->index ); + } + else { + // Target right + t_active.erase( it->index ); + if ( it->pos > q_check_pos ) { + t_pending.clear(); + q_check_pos = std::numeric_limits::infinity(); + } + t_pending.insert( it->index ); + } + } + } + + // Prepare and return result. + SEXP result; + + PROTECT( result = allocVector( VECSXP, 3 ) ); + + SET_VECTOR_ELT( result, 0, allocVector( REALSXP, tn ) ); // delta + SET_VECTOR_ELT( result, 1, allocVector( VECSXP, tn ) ); // which + SET_VECTOR_ELT( result, 2, allocVector( VECSXP, tn ) ); // which_overlap + + copy( + delta.begin(), delta.end(), + REAL( VECTOR_ELT( result, 0 ) ) + ); + + for( i = 0; i < tn; i++ ) { + SET_VECTOR_ELT( + VECTOR_ELT( result, 1 ), i, + allocVector( INTSXP, which[i].size() ) + ); + copy( + which[i].begin(), which[i].end(), + INTEGER( VECTOR_ELT( VECTOR_ELT( result, 1 ), i ) ) + ); + SET_VECTOR_ELT( + VECTOR_ELT( result, 2 ), i, + allocVector( INTSXP, indices[i].size() ) + ); + copy( + indices[i].begin(), indices[i].end(), + INTEGER( VECTOR_ELT( VECTOR_ELT( result, 2 ), i ) ) + ); + } + + UNPROTECT(1); + return( result ); + + } + +} diff --git a/tests/intervals_test_code.R b/tests/intervals_test_code.R new file mode 100644 index 0000000..ea506c0 --- /dev/null +++ b/tests/intervals_test_code.R @@ -0,0 +1,140 @@ +library( "intervals" ) + +######## NAs and empty intervals + +u <- Intervals_full( as.numeric(NA), type = "Z" ) +u <- c( u, u ) +v <- Intervals_full( c(1,3,1,Inf), type = "Z" ) +x <- Intervals( 1, closed = FALSE, type = "Z" ) # empty +w <- c( x, u, v ) +rownames(w) <- letters[ 1:nrow(w) ] + +x +u +v +w + +is.na(w) +empty(w) + +distance_to_nearest( u, v ) +distance_to_nearest( w, v ) + +pts <- c( -Inf, Inf, NA, NaN, 0:2 ) + +distance_to_nearest( pts, v ) + +identical( + distance_to_nearest( pts, v ), + distance_to_nearest( pts, open_intervals( v ) ) + ) + +interval_overlap( w, v ) + +reduce( w ) + +open_intervals(w) + + + + +######## Subset assignment + +u <- Intervals( 1:8, type = "Z" ) +rownames( u ) <- letters[ 1:nrow(u) ] +v <- Intervals( -4:-1, type = "Z" ) +rownames( v ) <- letters[ nrow(u) + 1:nrow(v) ] +w <- open_intervals(u) + +u +v +w + +# Basic +z <- u; z[2:3,] <- v +z + +# With names +z <- u; z[c("a","d"),] <- v +z + +# Missing row name +result <- try( { z <- u; z[c("a","e"),] <- v }, TRUE ) +result + +# Closure adjustment +z <- w; z[2:3,] <- v +z + +# Size mismatch error +result <- try( { z <- w; z[2:3,] <- v[1,] }, TRUE ) +result + +# Intervals_full method +x <- Intervals_full( 1:6, c(TRUE,FALSE) ) +rownames( x ) <- letters[ 1:nrow(x) ] +y <- Intervals_full( -4:-1 ) + +x +y + +# Missing value names +z <- x; z[2,] <- y[1,] +z + +# Missing x names +z <- y; z[1,] <- x[1,] +z + +# Type mismatch error +result <- try( { z <- x; z[2:3,] <- v }, TRUE ) +result + +# Coercion up +type(v) <- "R" +closed(v) <- c( FALSE, TRUE ) +x +v +z <- x; z[2:3,] <- v +z + +# With warning due to assignment +z <- v; z[1,] <- x[3,] +z + +# No row name copying with matrices +A <- matrix( 0, 2, 2 ) +rownames(A) <- c("1","2") +z <- x; z[1:2,] <- A +z + + + + +######## distance_to_nearest() behavior + +a <- Intervals_full( c(2,5), FALSE ) +b <- Intervals_full( c(1,5,2,10), matrix(c(TRUE,FALSE,FALSE,TRUE),2,2) ) + +a +b + +distance_to_nearest(a,b) + +type(a) <- "Z" +type(b) <- "Z" + +distance_to_nearest(a,b) + +a <- as( a, "Intervals" ) +b <- as( open_intervals( b ), "Intervals" ) + +a +b + +distance_to_nearest(a,b) + +type(a) <- "R" +type(b) <- "R" + +distance_to_nearest(a,b) diff --git a/vignettes/intervals_overview.R b/vignettes/intervals_overview.R new file mode 100644 index 0000000..5c1564c --- /dev/null +++ b/vignettes/intervals_overview.R @@ -0,0 +1,167 @@ +### R code from vignette source 'intervals_overview.Rnw' + +################################################### +### code chunk number 1: width +################################################### +options( width = 80 ) + + +################################################### +### code chunk number 2: Intervals +################################################### +library( intervals ) +x <- Intervals( matrix( 1:6, ncol = 2 ) ) +x +x[2,2] <- NA +x[3,1] <- 6 +x + + +################################################### +### code chunk number 3: Intervals_full +################################################### +y <- as( x, "Intervals_full" ) +y <- c( y, Intervals_full( c(2,3,5,7) ) ) +closed(y)[2:3,1] <- FALSE +closed(y)[4,2] <- FALSE +rownames(y) <- letters[1:5] +y + + +################################################### +### code chunk number 4: size +################################################### +size(x) +empty(x) +empty(y) + + +################################################### +### code chunk number 5: plotting +################################################### +plot( y ) + + +################################################### +### code chunk number 6: set_operations +################################################### +reduce( y ) +interval_intersection( x, x + 2 ) +interval_complement( x ) + + +################################################### +### code chunk number 7: set_operations +################################################### +interval_union( x, interval_complement( x ) ) + + +################################################### +### code chunk number 8: distance +################################################### +B <- 100000 +left <- runif( B, 0, 1e8 ) +right <- left + rexp( B, rate = 1/10 ) +v <- Intervals( cbind( left, right ) ) +head( v ) +mean( size( v ) ) +dim( reduce( v ) ) +system.time( d <- distance_to_nearest( sample( 1e8, B ), v ) ) + + +################################################### +### code chunk number 9: distanceplot +################################################### +hist( d, main = "Distance to nearest interval" ) + + +################################################### +### code chunk number 10: overlap +################################################### +rownames(v) <- sprintf( "%06i", 1:nrow(v) ) +io <- interval_overlap( v, v ) +head( io, n = 3 ) +n <- sapply( io, length ) +sum( n > 1 ) +k <- which.max( n ) +io[ k ] +v[ k, ] +v[ io[[ k ]], ] + + +################################################### +### code chunk number 11: clusters +################################################### +B <- 100 +left <- runif( B, 0, 1e4 ) +right <- left + rexp( B, rate = 1/10 ) +y <- reduce( Intervals( cbind( left, right ) ) ) +w <- 100 +c2 <- clusters( y, w ) +c2[1:3] + + +################################################### +### code chunk number 12: expand +################################################### +delta <- .Machine[[ "double.eps" ]]^0.5 +y1 <- Intervals( c( .5, 1 - delta / 2 ) ) +y2 <- Intervals( c( .25, 1, .75, 2 ) ) +y1 +y2 +all.equal( y1[1,2], y2[2,1] ) +interval_intersection( y1, y2 ) + + +################################################### +### code chunk number 13: expand +################################################### +contract( y1, delta, "relative" ) + + +################################################### +### code chunk number 14: expand +################################################### +inner <- interval_intersection( + contract( y1, delta, "relative" ), + contract( y2, delta, "relative" ) + ) +inner +outer <- interval_intersection( + expand( y1, delta, "relative" ), + expand( y2, delta, "relative" ) + ) +outer + + +################################################### +### code chunk number 15: expand +################################################### +interval_difference( outer, inner ) + + +################################################### +### code chunk number 16: gaps +################################################### +x <- Intervals( c(1,10,100,8,50,200), type = "Z" ) +x +w <- 2 +close_intervals( contract( reduce( expand(x, w/2) ), w/2 ) ) + + +################################################### +### code chunk number 17: integer_range +################################################### +.Machine$integer.max +numeric_max <- with( .Machine, double.base^double.digits ) +options( digits = ceiling( log10( numeric_max ) ) ) +numeric_max + + +################################################### +### code chunk number 18: sessionInfo +################################################### +si <- as.character( toLatex( sessionInfo() ) ) +cat( si[ -grep( "Locale", si ) ], sep = "\n" ) + + diff --git a/vignettes/intervals_overview.Rnw b/vignettes/intervals_overview.Rnw new file mode 100644 index 0000000..34e18b7 --- /dev/null +++ b/vignettes/intervals_overview.Rnw @@ -0,0 +1,453 @@ +@ +\documentclass[a4paper]{article} + +\usepackage{Sweave, amssymb, amsmath} + +\usepackage[ +pdftex, +pdfpagelabels, +plainpages=false, +pdfborder={0 0 0}, +pdfstartview=FitH, +bookmarks=true, +bookmarksnumbered=true, +bookmarksopen=true +] +{hyperref} + +\title{Overview of the \emph{intervals} package} +\author{Richard Bourgon} +\date{06 June 2009} + +% The is for R CMD check, which finds it in spite of the "%", and also for +% automatic creation of links in the HTML documentation for the package: +% \VignetteIndexEntry{Overview of the intervals package.} + + + + +\begin{document} + + + + +%%%%%%%% Setup + +% Don't reform code +\SweaveOpts{keep.source=TRUE, eps=FALSE} + +% Size for figures +\setkeys{Gin}{width=.7\textwidth} + +% Fonts +\DefineVerbatimEnvironment{Sinput}{Verbatim}{xleftmargin=1cm,fontshape=sl,fontsize=\small} +\DefineVerbatimEnvironment{Soutput}{Verbatim}{xleftmargin=1cm,fontsize=\small} + +% Reduce characters per line in R output + +@ +<>= +options( width = 80 ) +@ + +% Make title +\maketitle + +% Typesetting commands + +\newcommand{\R}{\mathbb{R}} +\newcommand{\Z}{\mathbb{Z}} + + + + +%%%%%%%% TOC + +\tableofcontents + +\vspace{.25in} + + +%%%%%%%% Main text + +\section{Introduction} + +The \emph{intervals} packages defines two S4 classes which represent collections +of intervals over either the integers ($\Z$) or the real number line ($\R$). An +instance of either class consists of a two-column matrix of endpoints, plus +additional slots describing endpoint closure and whether the intervals are to be +thought of as being over $\Z$ or $\R$. + +@ +<>= +library( intervals ) +x <- Intervals( matrix( 1:6, ncol = 2 ) ) +x +x[2,2] <- NA +x[3,1] <- 6 +x +@ + +Objects of class \texttt{Intervals} represent collections of intervals with +common endpoint closure, e.g., all left-closed, right-open. More control over +endpoints is permitted with the \texttt{Intervals\_full} class. (Both classes +are derived from \texttt{Intervals\_virtual}, which is not intended for use by +package users.) + +@ +<>= +y <- as( x, "Intervals_full" ) +y <- c( y, Intervals_full( c(2,3,5,7) ) ) +closed(y)[2:3,1] <- FALSE +closed(y)[4,2] <- FALSE +rownames(y) <- letters[1:5] +y +@ + +The \texttt{size} method gives measure --- counting measure over $\Z$ or +Lebesgue measure over $\R$ --- for each interval represented in an object. The +\texttt{empty} method identifies intervals that are in fact empty sets, which +over $\R$ is not the same thing as having size 0. (Valid objects must have each +right endpoint no less than the corresponding left endpoint. When one or both +endpoints are open, however, valid intervals may be empty.) + +@ +<>= +size(x) +empty(x) +empty(y) +@ + +\begin{figure}[!tb] + \centering +@ +<>= +plot( y ) +@ +\caption{The \texttt{Intervals\_full} object \texttt{y}, plotted with +\texttt{plot(y)}. The second row contains an \texttt{NA} endpoint, and is not +shown in the plot. The empty interval is plotted as a hollow point. By default, +vertical placement avoids overlaps but is compact. } +\label{fig:plotting} +\end{figure} + + + + +\section{Interpretation of objects} + +An \texttt{Intervals} or \texttt{Intervals\_full} object can be thought of in +two different modes, each of which is useful in certain contexts: + +\begin{enumerate} + + \item As a (non-unique) representation of a subset of $\Z$ or $\R$. + + \item As a collection of (possibly overlapping) intervals, each of which has a + meaningful identity. + +\end{enumerate} + + + + +\subsection{As a subset of $\Z$ or $\R$} + +The \emph{intervals} package provides a number of basic tools for working in the +first mode, where an object represents a subset of $\Z$ or $\R$ but the rows of +the endpoint matrix do not have any external identity. Basic tools include +\texttt{reduce}, which returns a sorted minimal representation equivalent to the +original (dropping any intervals with \texttt{NA} endpoints), as well as +\texttt{interval\_union}, \texttt{interval\_complement}, and +\texttt{interval\_intersection}. + +@ +<>= +reduce( y ) +interval_intersection( x, x + 2 ) +interval_complement( x ) +@ + +Note that combining \texttt{x} and its complement in order to compute a union +requires mixing endpoint closure types; coercion to \texttt{Intervals\_full} is +automatic. + +@ +<>= +interval_union( x, interval_complement( x ) ) +@ + +The \texttt{distance\_to\_nearest} function treats its \texttt{to} argument in +the first mode, as just a subset of $\Z$ or $\R$; it treats its \texttt{from} +argument, however, in the second mode, returning one distance for every row of +the \texttt{from} object. In the example below, we also look at performance for +large data sets (less than one second on a 2 GHz Intel Core 2 Duo Macintosh, +although the time shown below will likely differ). A histogram of \texttt{d} is +given in Figure~\ref{fig:distance}. + +@ +<>= +B <- 100000 +left <- runif( B, 0, 1e8 ) +right <- left + rexp( B, rate = 1/10 ) +v <- Intervals( cbind( left, right ) ) +head( v ) +mean( size( v ) ) +dim( reduce( v ) ) +system.time( d <- distance_to_nearest( sample( 1e8, B ), v ) ) +@ + +\begin{figure}[!tb] + \centering +@ +<>= +hist( d, main = "Distance to nearest interval" ) +@ + \caption{Histogram of distances from a random set of points to the nearest + point in \texttt{v}. There is also a \texttt{distance\_to\_nearest} method for + comparing two sets of intervals.} + \label{fig:distance} +\end{figure} + + + + +\subsection{As a set of meaningful, possibly overlapping intervals} + +In some applications, each row of an object's endpoint matrix has a meaningful +identity, and particular points from $\Z$ or $\R$ may be found in more than one +row. To support this mode, objects may be given row names, which are propagated +through calculations when appropriate. The \texttt{c} methods simply stack +objects (like \texttt{rbind}), preserving row names and retaining redundancy, if +any. + +The \texttt{interval\_overlap} method works in this mode. In the next example we +use it to identify rows of \texttt{v} which are at least partially redundant, +i.e., which intersect at least one other row of \texttt{v}. All rows overlap +themselves, so we look for rows that overlap at least two rows: + +@ +<>= +rownames(v) <- sprintf( "%06i", 1:nrow(v) ) +io <- interval_overlap( v, v ) +head( io, n = 3 ) +n <- sapply( io, length ) +sum( n > 1 ) +k <- which.max( n ) +io[ k ] +v[ k, ] +v[ io[[ k ]], ] +@ + +The \texttt{which\_nearest} method also respects row identity, for both its +\texttt{to} and \texttt{from} arguments. In addition to computing the distance +from each \texttt{from} interval to the nearest point in \texttt{to}, it also +returns the row index of the \texttt{to} interval (or intervals, in case of +ties) located at the indicated distance. + +Another function which operates in this mode is \texttt{clusters}, which takes a +set of points or intervals and identifies maximal groups which cluster together +--- which are separated from one another by no more than a user-specified +threshold. The following code is taken from the \texttt{clusters} documentation: + +@ +<>= +B <- 100 +left <- runif( B, 0, 1e4 ) +right <- left + rexp( B, rate = 1/10 ) +y <- reduce( Intervals( cbind( left, right ) ) ) +w <- 100 +c2 <- clusters( y, w ) +c2[1:3] +@ + + + + +\section{Floating point and intervals over $\R$} + +When \texttt{type == "R"}, interval endpoints are not truly in $\R$, but rather, +in the subset which can be represented by floating point arithmetic. (For the +moment, this is also true when \texttt{type == "Z"}. See +Section~\ref{sec:representation}.) This limits the endpoint values which can be +represented; more importantly, if computations are performed on interval +endpoints, it means that floating point error can affect whether or not +endpoints coincide, whether intervals which meet at or near endpoints overlap +one another, etc. + +In spite of this potentially serious limitation, it is still often convenient to +work with intervals with non-integer endpoints, including data where adjacent +intervals exactly meet at a non-integer endpoint. To address this, the +\emph{intervals} package takes the following approach: + +\begin{itemize} + + \item Floating point representations of interval endpoints are assumed to be + \emph{exactly equal} (in the sense of \texttt{==} in R or C++) if and only + if the user intends the real values corresponding to these representations + to be exactly equal. + + \item For cases where floating point error and approximate equality are a + concern, tools are provided to permit distinguishing between ambiguous and + unambiguous intersection, union, etc. + +\end{itemize} + +In the next example, \texttt{y1} does not literally overlap \texttt{y2[2,]}, +although R's \texttt{all.equal} function asserts that the gap between them is +smaller than the default tolerance for equivalence up to floating point +precision. + +@ +<>= +delta <- .Machine[[ "double.eps" ]]^0.5 +y1 <- Intervals( c( .5, 1 - delta / 2 ) ) +y2 <- Intervals( c( .25, 1, .75, 2 ) ) +y1 +y2 +all.equal( y1[1,2], y2[2,1] ) +interval_intersection( y1, y2 ) +@ + +The \texttt{expand} and \texttt{contract} methods, used with \texttt{type = +"relative"}, permit consideration of the maximal and minimal interval sets which +are consistent with the nominal endpoints --- from the point of view of endpoint +relative difference. The \texttt{contract} method, for example, contracts each +interval in a collection so that the relative difference between original and +contracted endpoints is equal to tolerance \texttt{delta}. Thus, if a relative +difference less than or equal to \texttt{delta} is our criterion for approximate +floating point equality, the contracted object has endpoints which are +approximately equal to those of the original --- even though the contracted object +is a proper subset of the original. The \texttt{expand} method is similar, but +generates a proper superset. + +@ +<>= +contract( y1, delta, "relative" ) +@ + +We compute two separate intersections which bound the nominal intersection: + +@ +<>= +inner <- interval_intersection( + contract( y1, delta, "relative" ), + contract( y2, delta, "relative" ) + ) +inner +outer <- interval_intersection( + expand( y1, delta, "relative" ), + expand( y2, delta, "relative" ) + ) +outer +@ + +Finally, we identify points which may or may not be in the intersection, +depending on whether we make a conservative, literal, or anti-conservative +interpretation of the nominal endpoints. + +@ +<>= +interval_difference( outer, inner ) +@ + +The \texttt{expand} and \texttt{contract} methods have other uses as +well. Here, we eliminate gaps of size 2 or smaller: + +@ +<>= +x <- Intervals( c(1,10,100,8,50,200), type = "Z" ) +x +w <- 2 +close_intervals( contract( reduce( expand(x, w/2) ), w/2 ) ) +@ + + + + +\section{Notes on implementation} + +\subsection{Endpoint representation} +\label{sec:representation} + +For the moment, interval endpoints are always stored using R's \emph{numeric} +data type. Although this is wasteful from an memory and speed point of view, +we do it for two reasons. First, use of R's \texttt{Inf} and \texttt{-Inf} --- +not possible with the \emph{integer} type --- is very convenient when +computing complements. Second, the range of integers which can be represented +using the \emph{numeric} data type is considerably greater: + +@ +<>= +.Machine$integer.max +numeric_max <- with( .Machine, double.base^double.digits ) +options( digits = ceiling( log10( numeric_max ) ) ) +numeric_max +@ + + + + +\subsection{Efficiency} + +All computations are accomplished by treating intervals as pairs of tagged +endpoints, sorting these endpoints (along with their tags), and then making a +single pass through the results. Computational complexity for set operations is +therefore $O(n \log n)$, where input object $i$ contains $n_i$ rows and $n = +\sum_i n_i$. The same sorting approach is also used for +\texttt{interval\_overlap}, although if every interval in a query object of $m$ +rows overlaps every intervals in a target object of $n$ rows, generating output +alone must of necessity be $O(mn)$. + +Sorted endpoint vectors are not retained in memory. If one wishes to query a +particular object over and over, repeated sorting would be inefficient; in +practice so far, however, such repeated querying has not been needed. + + + + +\subsection{Checking validity} + +The code behind \texttt{which\_nearest} and \texttt{reduce} (key methods in +the \emph{intervals} package, which may be directly called by the user and are +also used internally in numerous locations) is written in C++ for +efficiency. The compiled code makes a number of assumptions about the +\texttt{SEXP} objects passed in as arguments, but does not explicitly check +these assumptions. Nonetheless, when the R wrappers for the compiled code are +applied to \emph{valid} objects from the \texttt{Intervals} or +\texttt{Intervals\_full} classes, all assumptions will always be met. This +design decision was taken so that the requirements for individual objects and +their contents could be gathered together in a single, natural location: the +classes' \texttt{validity} functions. + +The \emph{intervals} package provides replacement methods --- e.g., +\texttt{type} and \texttt{closed} --- which implement error checking and +preserve object validity. R's implementation of S4 classes, however, leaves +object data slots exposed to the user. As a consequence, a user can directly +manipulate the data slots of a valid \texttt{Intervals} or +\texttt{Intervals\_full} object in a way that invalidates the object, but does +not generate any warning or error. + +To prevent invalid objects from being passed to compiled code --- and +potentially generating segmentation faults or other problems --- all wrapper +code in this package includes a \texttt{check\_valid} argument. This argument is +set to \texttt{TRUE} by default, so that \texttt{validObject} is called on +relevant objects before handing them off to the compiled code. For efficiency, +users may choose to override this extra check if they are certain they have not +manually assigned inappropriate content to objects' data slots. + + + + +\section{Session information} + +@ +<>= +si <- as.character( toLatex( sessionInfo() ) ) +cat( si[ -grep( "Locale", si ) ], sep = "\n" ) +@ + + + + +\end{document} diff --git a/vignettes/intervals_overview.pdf b/vignettes/intervals_overview.pdf new file mode 100644 index 0000000..e888056 Binary files /dev/null and b/vignettes/intervals_overview.pdf differ