Long read datastore (#2)
Add basic functionality for building, loading, and retrieving sequences from
a datastore of long reads.
Ben J. Ward authored Aug 10, 2019
1 parent 55b4be8 commit 1d3dfd9
Showing 8 changed files with 287 additions and 17 deletions.
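As a rough usage sketch of the build / load / retrieve functionality this commit adds (the file names, datastore name, and the 1000 bp minimum length are made-up examples; the constructor, `open` method, and indexing are the ones defined in src/long-reads.jl):

```julia
using FASTX, ReadDatastores

# Build a datastore on disk from a FASTQ file of long reads,
# discarding any read shorter than the chosen minimum length.
fq = FASTQ.Reader(open("long_reads.fastq"))
ds = LongReadDatastore(fq, "long_reads.lrds", "my-long-reads", UInt64(1000))
close(fq)

# Re-open an existing datastore and retrieve a sequence by index.
ds = open(LongReadDatastore, "long_reads.lrds")
seq = ds[1]
n   = length(ds)   # number of reads stored
```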
2 changes: 2 additions & 0 deletions .github/CODEOWNERS
@@ -0,0 +1,2 @@
# These owners are the default owners for everything in the repo.
* [email protected]
2 changes: 1 addition & 1 deletion README.md
@@ -4,7 +4,7 @@
[![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/BioJulia/ReadDatastores.jl/blob/master/LICENSE)
[![Stable documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://biojulia.github.io/ReadDatastores.jl/stable)
[![Latest documentation](https://img.shields.io/badge/docs-latest-blue.svg)](https://biojulia.github.io/ReadDatastores.jl/latest/)
![Lifecycle](https://img.shields.io/badge/lifecycle-stable-brightgreen.svg)
[![Lifecycle](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip)
[![Chat](https://img.shields.io/gitter/room/BioJulia/ReadDatastores.svg)](https://gitter.im/BioJulia/ReadDatastores.jl)


26 changes: 25 additions & 1 deletion src/ReadDatastores.jl
@@ -4,14 +4,16 @@ export
PairedReadDatastore,
PairedReadOrientation,
FwRv,
RvFw
RvFw,
LongReadDatastore

using BioSequences, FASTX

const SeqDataStoreMAGIC = 0x05D5

@enum Filetype::UInt16 begin
PairedDS = 1
LongDS = 2
end

###
@@ -22,6 +24,28 @@ end
write(fd, s) + write(fd, '\0')
end

@inline function write_flat_vector(fd::IO, v::Vector{T}) where {T}
n = UInt64(length(v))
write(fd, n)
write(fd, v)
end

@inline function read_flat_vector!(fd::IO, v::Vector{T}) where {T}
n = read(fd, UInt64)
resize!(v, n)
unsafe_read(fd, pointer(v), n * sizeof(T))
return v
end

@inline function read_flat_vector(fd::IO, ::Type{T}) where {T}
n = read(fd, UInt64)
v = Vector{T}(undef, n)
unsafe_read(fd, pointer(v), n * sizeof(T))
return v
end


include("paired-reads.jl")
include("long-reads.jl")

end # module
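For illustration only, a minimal round trip of the length-prefixed flat-vector helpers added above, assuming they are called by their module-qualified names since they are not exported:

```julia
using ReadDatastores

buf = IOBuffer()
# write_flat_vector stores a UInt64 element count followed by the raw element bytes.
ReadDatastores.write_flat_vector(buf, UInt32[1, 2, 3])
seekstart(buf)
# read_flat_vector reads the count back and then that many elements.
ReadDatastores.read_flat_vector(buf, UInt32)   # UInt32[0x00000001, 0x00000002, 0x00000003]
```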
164 changes: 164 additions & 0 deletions src/long-reads.jl
@@ -0,0 +1,164 @@
struct ReadPosSize
offset::UInt64
sequence_size::UInt64
end

Base.:(==)(x::ReadPosSize, y::ReadPosSize) = x.offset == y.offset && x.sequence_size == y.sequence_size

struct LongReadDatastore
filename::String
name::String
default_name::String
read_to_file_positions::Vector{ReadPosSize}
stream::IO
end

const LRDS = LongReadDatastore

index(lrds::LRDS) = lrds.read_to_file_positions

const LongDS_Version = 0x0001

###
### LongReadDatastore Header
###

# | Field                         | Value  | Type   | Bytes |
# |:-----------------------------:|:------:|:------:|:-----:|
# | Magic number                  | 0x05D5 | UInt16 | 2     |
# | Datastore type                | 0x0002 | UInt16 | 2     |
# | Version number                | 0x0001 | UInt16 | 2     |
# | Index position in file        | N/A    | UInt64 | 8     |
# | Default name of the datastore | N/A    | String | N     |

function LongReadDatastore(rdr::FASTQ.Reader, outfile::String, name::String, min_size::UInt64)
discarded = 0

read_to_file_position = Vector{ReadPosSize}()
ofs = open(outfile, "w")

write(ofs, SeqDataStoreMAGIC, LongDS, LongDS_Version, zero(UInt64))

writestring(ofs, name)

record = FASTQ.Record()
seq = LongSequence{DNAAlphabet{4}}(min_size)

@info "Building long read datastore from FASTQ file"

@info "Writing long reads to datastore"

while !eof(rdr)
read!(rdr, record)
seq_len = FASTQ.seqlen(record)
if seq_len < min_size
discarded = discarded + 1
continue
end
offset = position(ofs)
resize!(seq, seq_len)
copyto!(seq, record)
push!(read_to_file_position, ReadPosSize(offset, seq_len))
write(ofs, seq.data)
end

@info "Done writing paired read sequences to datastore"
@info string(discarded, " reads were discarded due to a too short sequence")

fpos = UInt64(position(ofs))

@info "Writing index to datastore"

write_flat_vector(ofs, read_to_file_position)

# Go to the top and dump the number of reads and the position of the index.
seek(ofs, sizeof(SeqDataStoreMAGIC) + sizeof(Filetype) + sizeof(LongDS_Version))
write(ofs, fpos)
close(ofs)

@info string("Built long read datastore with ", length(read_to_file_position), " reads")

stream = open(outfile, "r+")
return LongReadDatastore(outfile, name, name, read_to_file_position, stream)
end

function Base.open(::Type{LongReadDatastore}, filename::String)
fd = open(filename, "r")
magic = read(fd, UInt16)
dstype = reinterpret(Filetype, read(fd, UInt16))
version = read(fd, UInt16)

@assert magic == SeqDataStoreMAGIC
@assert dstype == LongDS
@assert version == LongDS_Version

fpos = read(fd, UInt64)

default_name = readuntil(fd, '\0')

seek(fd, fpos)

read_to_file_position = read_flat_vector(fd, ReadPosSize)

return LongReadDatastore(filename, default_name, default_name, read_to_file_position, fd)
end

###
### Getting a sequence
###

Base.length(lrds::LRDS) = length(lrds.read_to_file_positions)

Base.firstindex(lrds::LRDS) = 1
Base.lastindex(lrds::LRDS) = length(lrds)
Base.eachindex(lrds::LRDS) = Base.OneTo(lastindex(lrds))

@inline function Base.checkbounds(lrds::LRDS, i::Integer)
if firstindex(lrds) ≤ i ≤ lastindex(lrds)
return true
end
throw(BoundsError(lrds, i))
end

@inline inbounds_position_and_size(lrds::LRDS, idx::Integer) = @inbounds lrds.read_to_file_positions[idx]

@inline function position_and_size(lrds::LRDS, idx::Integer)
checkbounds(lrds, idx)
return inbounds_position_and_size(lrds, idx)
end

@inline function unsafe_load_read!(lrds::LRDS, pos_size::ReadPosSize, seq::LongSequence{DNAAlphabet{4}})
seek(lrds.stream, pos_size.offset)
resize!(seq, pos_size.sequence_size)
unsafe_read(lrds.stream, pointer(seq.data), length(seq.data) * sizeof(UInt64))
return seq
end

@inline function inbounds_load_read!(lrds::LRDS, idx::Integer, seq::LongSequence{DNAAlphabet{4}})
pos_size = inbounds_position_and_size(lrds, idx)
return unsafe_load_read!(lrds, pos_size, seq)
end

@inline function load_read!(lrds::LRDS, idx::Integer, seq::LongSequence{DNAAlphabet{4}})
checkbounds(lrds, idx)
return inbounds_load_read!(lrds, idx, seq)
end

@inline function Base.getindex(lrds::LRDS, idx::Integer)
@boundscheck checkbounds(lrds, idx)
pos_size = inbounds_position_and_size(lrds, idx)
seq = LongDNASeq(pos_size.sequence_size)
return unsafe_load_read!(lrds, pos_size, seq)
end

Base.IteratorSize(lrds::LRDS) = Base.HasLength()
Base.IteratorEltype(lrds::LRDS) = Base.HasEltype()
Base.eltype(lrds::LRDS) = LongSequence{DNAAlphabet{4}}

@inline function Base.iterate(lrds::LRDS, state = 1)
@inbounds if firstindex(lrds) ≤ state ≤ lastindex(lrds)
return lrds[state], state + 1
else
return nothing
end
end
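As a hedged illustration of the header layout documented in the comment table at the top of src/long-reads.jl, the fixed-size fields could be inspected by hand roughly like this (the datastore file name is hypothetical):

```julia
io = open("long_reads.lrds")
magic     = read(io, UInt16)     # expected to equal SeqDataStoreMAGIC (0x05D5)
dstype    = read(io, UInt16)     # 0x0002 marks a long read datastore
version   = read(io, UInt16)     # 0x0001, the current LongDS_Version
index_pos = read(io, UInt64)     # file offset of the read index
name      = readuntil(io, '\0')  # NUL-terminated default datastore name
close(io)
```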
31 changes: 16 additions & 15 deletions src/paired-reads.jl
@@ -64,7 +64,8 @@ function PairedReadDatastore(rdrx::FASTQ.Reader, rdry::FASTQ.Reader,

pairs = discarded = truncated = 0

@info "Starting datastore build from FASTQ files" sizepos readpos
@info "Building paired read datastore from FASTQ files"
@info "Writing paired reads to datastore"

while !eof(rdrx) && !eof(rdry)
# Read in the two records.
@@ -117,10 +118,10 @@ function PairedReadDatastore(rdrx::FASTQ.Reader, rdry::FASTQ.Reader,

close(fd)

@info "Done writing paired read sequences to datastore."
@info string(discarded, " read pairs were discarded due to a too short sequence.")
@info string(truncated, " reads were truncated to ", maxsize, " base pairs.")
@info string("Created paired sequence datastore with ", pairs, " sequence pairs.")
@info "Done writing paired read sequences to datastore"
@info string(discarded, " read pairs were discarded due to a too short sequence")
@info string(truncated, " reads were truncated to ", maxsize, " base pairs")
@info string("Created paired sequence datastore with ", pairs, " sequence pairs")

stream = open(outfile, "r+")
return PairedReadDatastore(outfile, name, name,
@@ -171,19 +172,19 @@ function Base.show(io::IO, prds::PairedReadDatastore)
end

bytes_per_read(prds::PRDS) = (prds.chunksize + 1) * sizeof(UInt64)
read_offset_in_file(prds::PRDS, idx::Integer) = prds.readpos_offset + (bytes_per_read(prds) * (idx - 1))
@inline unsafe_read_offset_in_file(prds::PRDS, idx::Integer) = prds.readpos_offset + (bytes_per_read(prds) * (idx - 1))

function load_read!(prds::PRDS, seq::LongSequence{DNAAlphabet{4}})
@inline function inbounds_load_read!(prds::PRDS, idx::Integer, seq::LongSequence{DNAAlphabet{4}})
seek(prds.stream, unsafe_read_offset_in_file(prds, idx))
seqlen = read(prds.stream, UInt64)
resize!(seq, seqlen)
unsafe_read(prds.stream, pointer(seq.data), length(seq.data) * sizeof(UInt64))
return seq
end

function load_read!(prds::PRDS, idx::Integer, seq::LongSequence{DNAAlphabet{4}})
seek(prds.stream, read_offset_in_file(prds, idx))
load_read!(prds, seq)
return seq
@inline function load_read!(prds::PRDS, idx::Integer, seq::LongSequence{DNAAlphabet{4}})
checkbounds(prds, idx)
return inbounds_load_read!(prds, idx, seq)
end

@inline function Base.checkbounds(prds::PRDS, i::Integer)
@@ -193,14 +194,14 @@ end
throw(BoundsError(prds, i))
end

firstindex(prds::PRDS) = 1
lastindex(prds::PRDS) = length(prds)
eachindex(prds::PRDS) = Base.OneTo(lastindex(prds))
Base.firstindex(prds::PRDS) = 1
Base.lastindex(prds::PRDS) = length(prds)
Base.eachindex(prds::PRDS) = Base.OneTo(lastindex(prds))

@inline function Base.getindex(prds::PRDS, idx::Integer)
@boundscheck checkbounds(prds, idx)
seq = LongDNASeq(prds.readsize)
return load_read!(prds, idx, seq)
return inbounds_load_read!(prds, idx, seq)
end

Base.IteratorSize(prds::PRDS) = Base.HasLength()