Bump BioSequences/FASTX to v3/v2 (#16)
banhbio authored Mar 6, 2023
1 parent e7d198c commit ac218f6
Showing 9 changed files with 34 additions and 34 deletions.

.github/workflows/testing.yml (2 changes: 1 addition & 1 deletion)
@@ -12,7 +12,7 @@ jobs:
       fail-fast: false
       matrix:
         julia-version:
-          - '1.0' # LTS
+          - '1.6' # LTS
           - '1'
         julia-arch: [x86]
         os: [ubuntu-latest, windows-latest, macOS-latest] # TODO: Work on windows-latest

Project.toml (6 changes: 3 additions & 3 deletions)
@@ -8,9 +8,9 @@ BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
 FASTX = "c2308a5c-f048-11e8-3e8a-31650f418d12"
 
 [compat]
-BioSequences = "2"
-FASTX = "1.1"
-julia = "1"
+BioSequences = "3"
+FASTX = "2"
+julia = "1.6"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

src/ReadDatastores.jl (2 changes: 1 addition & 1 deletion)
@@ -241,7 +241,7 @@ data blob (i.e. `sizeof(BioSequences.encoded_data(seq))`).
 sequence type.
 """
 function _load_sequence_data!(ds::ReadDatastore{T}, seq::T) where {T<:LongSequence}
-    seqdata = BioSequences.encoded_data(seq)
+    seqdata = seq.data
     GC.@preserve seqdata unsafe_read(stream(ds), pointer(seqdata), sizeof(seqdata))
     return seq
 end
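
Note on the pattern above: BioSequences v3 removed the `encoded_data` accessor this package used, so the datastore now reaches the packed `UInt64` words through the sequence's `data` field. A minimal sketch of the layout this relies on, assuming the v3 representation of a `LongSequence` as one vector of packed 64-bit words (the length 100 is arbitrary):

    using BioSequences

    seq = LongSequence{DNAAlphabet{4}}(undef, 100)  # uninitialized 100-mer, 4 bits per symbol
    seqdata = seq.data                              # Vector{UInt64} of packed symbols
    # 100 symbols at 4 bits each pack into cld(100, 16) = 7 words.
    @assert length(seqdata) == 7
    @assert sizeof(seqdata) == 7 * sizeof(UInt64)   # the byte count unsafe_read fills above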

src/linked-reads.jl (24 changes: 12 additions & 12 deletions)
@@ -10,13 +10,13 @@ const LinkedTag = UInt32
 mutable struct LinkedReadData{A<:DNAAlphabet}
     seq1::LongSequence{A}
     seq2::LongSequence{A}
-    seqlen1::UInt64
-    seqlen2::UInt64
+    seqsize1::UInt64
+    seqsize2::UInt64
     tag::LinkedTag
 end
 
 Base.isless(a::LinkedReadData, b::LinkedReadData) = a.tag < b.tag
-LinkedReadData{A}(len) where {A<:DNAAlphabet} = LinkedReadData{A}(LongSequence{A}(len), LongSequence{A}(len), zero(UInt64), zero(UInt64), zero(LinkedTag))
+LinkedReadData{A}(len) where {A<:DNAAlphabet} = LinkedReadData{A}(LongSequence{A}(undef, len), LongSequence{A}(undef, len), zero(UInt64), zero(UInt64), zero(LinkedTag))
 
 const LinkedDS_Version = 0x0003

@@ -38,10 +38,10 @@ function _extract_tag_and_sequences!(current_data::LinkedReadData, fwrec::FASTQ.
         end
     end
     current_data.tag = newtag
-    current_data.seqlen1 = UInt64(min(max_read_len, FASTQ.seqlen(fwrec)))
-    current_data.seqlen2 = UInt64(min(max_read_len, FASTQ.seqlen(rvrec)))
-    copyto!(current_data.seq1, 1, fwrec, 1, current_data.seqlen1)
-    copyto!(current_data.seq2, 1, rvrec, 1, current_data.seqlen2)
+    current_data.seqsize1 = UInt64(min(max_read_len, FASTQ.seqsize(fwrec)))
+    current_data.seqsize2 = UInt64(min(max_read_len, FASTQ.seqsize(rvrec)))
+    copyto!(current_data.seq1, 1, fwrec, 1, current_data.seqsize1)
+    copyto!(current_data.seq2, 1, rvrec, 1, current_data.seqsize2)
 end
 
 struct LinkedReads{A<:DNAAlphabet} <: ShortReads{A}
@@ -121,7 +121,7 @@ function LinkedReads{A}(fwq::FASTQ.Reader, rvq::FASTQ.Reader, outfile::String, n
     fwrec = FASTQ.Record()
     rvrec = FASTQ.Record()
     chunk_data = [LinkedReadData{A}(max_read_len) for _ in 1:chunksize]
-    datachunksize = length(BioSequences.encoded_data(first(chunk_data).seq1))
+    datachunksize = length(first(chunk_data).seq1.data)
 
     while !eof(fwq) && !eof(rvq)
         # Read in `chunksize` read pairs.
@@ -150,10 +150,10 @@ function LinkedReads{A}(fwq::FASTQ.Reader, rvq::FASTQ.Reader, outfile::String, n
         for j in 1:chunkfill
             cd_j = chunk_data[j]
             write(chunk_fd, cd_j.tag)
-            write(chunk_fd, cd_j.seqlen1)
-            write(chunk_fd, BioSequences.encoded_data(cd_j.seq1))
-            write(chunk_fd, cd_j.seqlen2)
-            write(chunk_fd, BioSequences.encoded_data(cd_j.seq2))
+            write(chunk_fd, cd_j.seqsize1)
+            write(chunk_fd, cd_j.seq1.data)
+            write(chunk_fd, cd_j.seqsize2)
+            write(chunk_fd, cd_j.seq2.data)
         end
         close(chunk_fd)
         push!(chunk_files, string("sorted_chunk_", length(chunk_files), ".data"))
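
The `seqlen` to `seqsize` renames above track FASTX v2, which calls a record's sequence length `seqsize`. A small sketch of the pattern `_extract_tag_and_sequences!` follows, with a made-up in-memory record and `max_read_len` purely for illustration; it assumes FASTX v2 plus the five-argument `copyto!` already used in this file:

    using FASTX, BioSequences

    rec = first(FASTQ.Reader(IOBuffer("@r1\nACGTACGTACGT\n+\nIIIIIIIIIIII\n")))
    FASTQ.seqsize(rec)                 # 12; FASTX v1 spelled this FASTQ.seqlen

    max_read_len = 8                   # hypothetical datastore read-length cap
    buf = LongSequence{DNAAlphabet{4}}(undef, max_read_len)
    n = min(max_read_len, FASTQ.seqsize(rec))
    copyto!(buf, 1, rec, 1, n)         # copy at most max_read_len symbols into the buffer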

src/long-reads.jl (6 changes: 3 additions & 3 deletions)
@@ -94,7 +94,7 @@ function LongReads{A}(rdr::FASTQ.Reader, outfile::String, name::Union{String,Sym
     writestring(ofs, String(name))
 
     record = FASTQ.Record()
-    seq = LongSequence{A}(min_size)
+    seq = LongSequence{A}(undef, min_size)
 
     @info "Building long read datastore from FASTQ file"

@@ -109,7 +109,7 @@ function LongReads{A}(rdr::FASTQ.Reader, outfile::String, name::Union{String,Sym
             end
             rethrow()
         end
-        seq_len = FASTQ.seqlen(record)
+        seq_len = FASTQ.seqsize(record)
         if seq_len < min_size
             discarded = discarded + 1
             continue
@@ -186,6 +186,6 @@ end
 @inline function Base.getindex(lrds::LongReads, idx::Integer)
     @boundscheck checkbounds(lrds, idx)
     pos_size = _inbounds_index_of_sequence(lrds, idx)
-    seq = eltype(lrds)(pos_size.sequence_size)
+    seq = eltype(lrds)(undef, pos_size.sequence_size)
     return inbounds_load_sequence!(lrds, pos_size, seq)
 end
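
The constructor edits in this file are the other half of the BioSequences v3 migration: an uninitialized sequence of a given length is now requested explicitly with `undef`, where v2 accepted a bare length. A minimal sketch (lengths are arbitrary):

    using BioSequences

    seq = LongSequence{DNAAlphabet{4}}(undef, 150)  # BioSequences v2 allowed LongSequence{A}(150)
    @assert length(seq) == 150
    resize!(seq, 100)                               # the datastores later shrink to each read's actual size
    @assert length(seq) == 100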

src/paired-reads.jl (10 changes: 5 additions & 5 deletions)
Expand Up @@ -107,11 +107,11 @@ function PairedReads{A}(rdrx::FASTQ.Reader, rdry::FASTQ.Reader,
# Create and allocate the sequence and record objects.
lread = FASTQ.Record()
rread = FASTQ.Record()
lseq = LongSequence{A}(maxsize)
rseq = LongSequence{A}(maxsize)
lseq = LongSequence{A}(undef, maxsize)
rseq = LongSequence{A}(undef, maxsize)

#chunksize::UInt64 = BioSequences.seq_data_len(DNAAlphabet{4}, maxsize)
chunksize::UInt64 = length(BioSequences.encoded_data(lseq))
chunksize::UInt64 = length(lseq.data)
bps = UInt64(BioSequences.bits_per_symbol(A()))

fd = open(outfile * ".prseq", "w")
@@ -143,8 +143,8 @@ function PairedReads{A}(rdrx::FASTQ.Reader, rdry::FASTQ.Reader,
             rethrow()
         end
 
-        llen = UInt64(FASTQ.seqlen(lread))
-        rlen = UInt64(FASTQ.seqlen(rread))
+        llen = UInt64(FASTQ.seqsize(lread))
+        rlen = UInt64(FASTQ.seqsize(rread))
         # If either read is too short, discard them both.
         if llen < minsize || rlen < minsize
             discarded += 1
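
The `chunksize` line keeps its old meaning: the number of 64-bit words one encoded read occupies on disk, now taken from the length of the sequence's `data` vector rather than the old `encoded_data`/`seq_data_len` helpers. A sketch of how that word count depends on the alphabet, assuming BioSequences v3 packs symbols contiguously into `UInt64` words (`maxsize` is arbitrary):

    using BioSequences

    maxsize = 300
    lseq2 = LongSequence{DNAAlphabet{2}}(undef, maxsize)  # 2 bits/symbol, 32 symbols per word
    lseq4 = LongSequence{DNAAlphabet{4}}(undef, maxsize)  # 4 bits/symbol, 16 symbols per word
    @assert length(lseq2.data) == cld(maxsize, 32)        # 10 words
    @assert length(lseq4.data) == cld(maxsize, 16)        # 19 words
    @assert BioSequences.bits_per_symbol(DNAAlphabet{2}()) == 2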

src/sequence-buffer.jl (6 changes: 3 additions & 3 deletions)
@@ -41,7 +41,7 @@ end
 
 @inline function _load_sequence_data!(seq::LongSequence{A}, sb::DatastoreBuffer, offset::Integer) where {A<:DNAAlphabet}
     bufdata = buffer_array(sb)
-    seqdata = BioSequences.encoded_data(seq)
+    seqdata = seq.data
     GC.@preserve bufdata begin
         for i in eachindex(seqdata)
             seqdata[i] = unsafe_load(convert(Ptr{UInt64}, pointer(bufdata, offset + 1)))
@@ -74,7 +74,7 @@ end
     buffer_offset = file_offset - buffer_position(sb)
     sequence_length = unsafe_load(convert(Ptr{UInt64}, pointer(buffer_array(sb), buffer_offset + 1)))
     buffer_offset = buffer_offset + sizeof(UInt64)
-    seq = eltype(sb)(sequence_length)
+    seq = eltype(sb)(undef, sequence_length)
 
     return _load_sequence_data!(seq, sb, buffer_offset)
 end
@@ -125,7 +125,7 @@ end
     @boundscheck checkbounds(sb, idx)
     file_index = _inbounds_index_of_sequence(datastore(sb), idx)
     _check_for_buffer_refresh!(sb, file_index)
-    seq = eltype(sb)(file_index.sequence_size)
+    seq = eltype(sb)(undef, file_index.sequence_size)
     buffer_offset = file_index.offset - buffer_position(sb)
     return _load_sequence_data!(seq, sb, buffer_offset)
 end

src/short-reads.jl (6 changes: 3 additions & 3 deletions)
@@ -7,8 +7,8 @@ abstract type ShortReads{A<:DNAAlphabet} <: ReadDatastore{LongSequence{A}} end
 @inline function inbounds_load_sequence!(ds::ShortReads{A}, i::Integer, seq::LongSequence{A}) where {A<:DNAAlphabet}
     pos = _offset_of_sequence(ds, i)
     seek(stream(ds), pos)
-    seqlen = read(stream(ds), UInt64)
-    resize!(seq, seqlen)
+    seqsize = read(stream(ds), UInt64)
+    resize!(seq, seqsize)
     return _load_sequence_data!(ds, seq)
 end

@@ -19,7 +19,7 @@ end
 
 @inline function Base.getindex(sr::ShortReads{A}, idx::Integer) where {A<:DNAAlphabet}
     @boundscheck checkbounds(sr, idx)
-    seq = eltype(sr)(max_read_length(sr))
+    seq = eltype(sr)(undef, max_read_length(sr))
     return inbounds_load_sequence!(sr, idx, seq)
 end


test/long-reads.jl (6 changes: 3 additions & 3 deletions)
@@ -1,15 +1,15 @@
 @testset "Long read datastores" begin
-    function get_fastq_seqs(file)
+    function get_fastq_seqs(::Type{A}, file) where {A<:DNAAlphabet}
         seqs = map(open(FASTQ.Reader, file) do rdr
             collect(rdr)
         end) do rec
-            FASTQ.sequence(LongDNASeq, rec)
+            FASTQ.sequence(LongSequence{A}, rec)
         end
         return seqs
     end
 
     function check_round_trip(::Type{A}, FQ) where {A<:DNAAlphabet}
-        seqs = get_fastq_seqs(FQ)
+        seqs = get_fastq_seqs(A, FQ)
         fq = open(FASTQ.Reader, FQ)
         ds = LongReads{A}(fq, "human-nanopore", "human-nanopore", 0)
         ds2 = open(LongReads{A}, "human-nanopore.loseq")
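
The test helper now threads the alphabet through to `FASTX.sequence`, since BioSequences v3 dropped the `LongDNASeq` alias the old call relied on. A small sketch of the updated call against an in-memory record instead of the test FASTQ file:

    using FASTX, BioSequences

    rec = first(FASTQ.Reader(IOBuffer("@r1\nACGT\n+\nIIII\n")))
    s4 = FASTQ.sequence(LongSequence{DNAAlphabet{4}}, rec)  # replaces FASTQ.sequence(LongDNASeq, rec)
    s2 = FASTQ.sequence(LongSequence{DNAAlphabet{2}}, rec)  # the 2-bit alphabet also works
    @assert s4 == dna"ACGT"
    @assert length(s2) == 4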
