Skip to content

Commit fb74387

Browse files
authored
feat: Index for bam::IndexedReader (#387)
* Add an IndexView for accessing a sam/bam/cram's index * fix copy and paste error * remove unnecessary cast to u32 * index.stats: return Option<_> * clippy lints * fmt * add test for idxstats * add test for number_mapped_unmapped * add tests for cram index as well * set reference for indexed cram reader * add slow_idxstats from samtools bam_index.c * fix slow_idxstats, refactor as methods of IndexedReader * Add set_read_options 'convenience' method to bam::Read * cargo fmt * add missing test bam/cram indices * rename to set_cram_options, add doc * replace panic with BamUnsorted error * Add tid == -1 comment * remove todo comment * fix doctest path to file
1 parent 7d4990b commit fb74387

File tree

8 files changed

+281
-14
lines changed

8 files changed

+281
-14
lines changed

src/bam/mod.rs

Lines changed: 271 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,9 @@ use crate::utils::path_as_bytes;
3232
pub use crate::bam::buffer::RecordBuffer;
3333
pub use crate::bam::header::Header;
3434
pub use crate::bam::record::Record;
35-
use std::convert::TryInto;
35+
use hts_sys::{hts_fmt_option, sam_fields};
36+
use std::convert::{TryFrom, TryInto};
37+
use std::mem::MaybeUninit;
3638

3739
/// # Safety
3840
///
@@ -106,7 +108,7 @@ pub trait Read: Sized {
106108
/// match result {
107109
/// Ok(_) => {
108110
/// println!("Read sequence: {:?}", record.seq().as_bytes());
109-
/// },
111+
/// }
110112
/// Err(_) => panic!("BAM parsing failed...")
111113
/// }
112114
/// }
@@ -217,6 +219,31 @@ pub trait Read: Sized {
217219
///
218220
/// * `tpool` - thread pool to use for compression work.
219221
fn set_thread_pool(&mut self, tpool: &ThreadPool) -> Result<()>;
222+
223+
/// If the underlying file is in CRAM format, allows modifying CRAM options.
224+
/// Note that this method does *not* check that the underlying file actually is in CRAM format.
225+
///
226+
/// # Examples
227+
///
228+
/// Set the required fields to RNAME and FLAG,
229+
/// potentially allowing htslib to skip over the rest,
230+
/// resulting in faster iteration:
231+
/// ```
232+
/// use rust_htslib::bam::{Read, Reader};
233+
/// use hts_sys;
234+
/// let mut cram = Reader::from_path(&"test/test_cram.cram").unwrap();
235+
/// cram.set_cram_options(hts_sys::hts_fmt_option_CRAM_OPT_REQUIRED_FIELDS,
236+
/// hts_sys::sam_fields_SAM_RNAME | hts_sys::sam_fields_SAM_FLAG).unwrap();
237+
/// ```
238+
fn set_cram_options(&mut self, fmt_opt: hts_fmt_option, fields: sam_fields) -> Result<()> {
239+
unsafe {
240+
if hts_sys::hts_set_opt(self.htsfile(), fmt_opt, fields) != 0 {
241+
Err(Error::HtsSetOpt)
242+
} else {
243+
Ok(())
244+
}
245+
}
246+
}
220247
}
221248

222249
/// A BAM reader.
@@ -471,6 +498,7 @@ pub enum FetchDefinition<'a> {
471498
/// Only reads with the BAM flag BAM_FUNMAP (which might not be all reads with reference = -1)
472499
Unmapped,
473500
}
501+
474502
impl<'a, X: Into<FetchCoordinate>, Y: Into<FetchCoordinate>> From<(i32, X, Y)>
475503
for FetchDefinition<'a>
476504
{
@@ -480,6 +508,7 @@ impl<'a, X: Into<FetchCoordinate>, Y: Into<FetchCoordinate>> From<(i32, X, Y)>
480508
FetchDefinition::Region(tup.0, start.0, stop.0)
481509
}
482510
}
511+
483512
impl<'a, X: Into<FetchCoordinate>, Y: Into<FetchCoordinate>> From<(u32, X, Y)>
484513
for FetchDefinition<'a>
485514
{
@@ -563,7 +592,7 @@ impl<'a, T: AsRef<[u8]>, X: Into<FetchCoordinate>, Y: Into<FetchCoordinate>> Fro
563592
pub struct IndexedReader {
564593
htsfile: *mut htslib::htsFile,
565594
header: Rc<HeaderView>,
566-
idx: *mut htslib::hts_idx_t,
595+
idx: Rc<IndexView>,
567596
itr: Option<*mut htslib::hts_itr_t>,
568597
tpool: Option<ThreadPool>,
569598
}
@@ -609,7 +638,7 @@ impl IndexedReader {
609638
Ok(IndexedReader {
610639
htsfile,
611640
header: Rc::new(HeaderView::new(header)),
612-
idx,
641+
idx: Rc::new(IndexView::new(idx)),
613642
itr: None,
614643
tpool: None,
615644
})
@@ -637,7 +666,7 @@ impl IndexedReader {
637666
Ok(IndexedReader {
638667
htsfile,
639668
header: Rc::new(HeaderView::new(header)),
640-
idx,
669+
idx: Rc::new(IndexView::new(idx)),
641670
itr: None,
642671
tpool: None,
643672
})
@@ -706,12 +735,7 @@ impl IndexedReader {
706735
match tid {
707736
Some(tid) => {
708737
//'large position' spec says target len must will fit into an i64.
709-
let len: i64 = self
710-
.header
711-
.target_len(tid as u32)
712-
.unwrap()
713-
.try_into()
714-
.unwrap();
738+
let len: i64 = self.header.target_len(tid).unwrap().try_into().unwrap();
715739
self._fetch_by_coord_tuple(tid as i32, 0, len)
716740
}
717741
None => self._fetch_by_str(s),
@@ -726,7 +750,7 @@ impl IndexedReader {
726750
if let Some(itr) = self.itr {
727751
unsafe { htslib::hts_itr_destroy(itr) }
728752
}
729-
let itr = unsafe { htslib::sam_itr_queryi(self.idx, tid as i32, beg as i64, end as i64) };
753+
let itr = unsafe { htslib::sam_itr_queryi(self.index().inner_ptr(), tid, beg, end) };
730754
if itr.is_null() {
731755
self.itr = None;
732756
Err(Error::Fetch)
@@ -744,7 +768,7 @@ impl IndexedReader {
744768
let rptr = rstr.as_ptr();
745769
let itr = unsafe {
746770
htslib::sam_itr_querys(
747-
self.idx,
771+
self.index().inner_ptr(),
748772
self.header().inner_ptr() as *mut hts_sys::sam_hdr_t,
749773
rptr,
750774
)
@@ -783,6 +807,163 @@ impl IndexedReader {
783807
pub fn set_reference<P: AsRef<Path>>(&mut self, path: P) -> Result<()> {
784808
unsafe { set_fai_filename(self.htsfile, path) }
785809
}
810+
811+
pub fn index(&self) -> &IndexView {
812+
&self.idx
813+
}
814+
815+
// Analogous to slow_idxstats in samtools, see
816+
// https://github.com/samtools/samtools/blob/556c60fdff977c0e6cadc4c2581661f187098b4d/bam_index.c#L140-L199
817+
unsafe fn slow_idxstats(&mut self) -> Result<Vec<(i64, u64, u64, u64)>> {
818+
self.set_cram_options(
819+
hts_sys::hts_fmt_option_CRAM_OPT_REQUIRED_FIELDS,
820+
hts_sys::sam_fields_SAM_RNAME | hts_sys::sam_fields_SAM_FLAG,
821+
)?;
822+
let header = self.header();
823+
let h = header.inner;
824+
let mut ret;
825+
let mut last_tid = -2;
826+
let fp = self.htsfile();
827+
828+
let nref =
829+
usize::try_from(hts_sys::sam_hdr_nref(h)).map_err(|_| Error::NoSequencesInReference)?;
830+
if nref == 0 {
831+
return Ok(vec![]);
832+
}
833+
let mut counts = vec![vec![0; 2]; nref + 1];
834+
let mut bb: hts_sys::bam1_t = MaybeUninit::zeroed().assume_init();
835+
let b = &mut bb as *mut hts_sys::bam1_t;
836+
loop {
837+
ret = hts_sys::sam_read1(fp, h, b);
838+
if ret < 0 {
839+
break;
840+
}
841+
let tid = (*b).core.tid;
842+
if tid >= nref as i32 || tid < -1 {
843+
return Err(Error::InvalidTid { tid });
844+
}
845+
846+
if tid != last_tid {
847+
if (last_tid >= -1) && (counts[tid as usize][0] + counts[tid as usize][1]) > 0 {
848+
return Err(Error::BamUnsorted);
849+
}
850+
last_tid = tid;
851+
}
852+
853+
let idx = if ((*b).core.flag as u32 & hts_sys::BAM_FUNMAP) > 0 {
854+
1
855+
} else {
856+
0
857+
};
858+
counts[(*b).core.tid as usize][idx] += 1;
859+
}
860+
861+
if ret == -1 {
862+
let res = (0..nref)
863+
.map(|i| {
864+
(
865+
i as i64,
866+
header.target_len(i as u32).unwrap(),
867+
counts[i][0],
868+
counts[i][1],
869+
)
870+
})
871+
.chain([(-1, 0, counts[nref][0], counts[nref][1])])
872+
.collect();
873+
Ok(res)
874+
} else {
875+
Err(Error::SlowIdxStats)
876+
}
877+
}
878+
879+
/// Similar to samtools idxstats, this returns a vector of tuples
880+
/// containing the target id, length, number of mapped reads, and number of unmapped reads.
881+
/// The last entry in the vector corresponds to the unmapped reads for the entire file, with
882+
/// the tid set to -1.
883+
pub fn index_stats(&mut self) -> Result<Vec<(i64, u64, u64, u64)>> {
884+
let header = self.header();
885+
let index = self.index();
886+
if index.inner_ptr().is_null() {
887+
panic!("Index is null");
888+
}
889+
// the quick index stats method only works for BAM files, not SAM or CRAM
890+
unsafe {
891+
if (*self.htsfile()).format.format != htslib::htsExactFormat_bam {
892+
return self.slow_idxstats();
893+
}
894+
}
895+
Ok((0..header.target_count())
896+
.map(|tid| {
897+
let (mapped, unmapped) = index.number_mapped_unmapped(tid);
898+
let tlen = header.target_len(tid).unwrap();
899+
(tid as i64, tlen, mapped, unmapped)
900+
})
901+
.chain([(-1, 0, 0, index.number_unmapped())])
902+
.collect::<_>())
903+
}
904+
}
905+
906+
#[derive(Debug)]
907+
pub struct IndexView {
908+
inner: *mut hts_sys::hts_idx_t,
909+
owned: bool,
910+
}
911+
912+
impl IndexView {
913+
fn new(hts_idx: *mut hts_sys::hts_idx_t) -> Self {
914+
Self {
915+
inner: hts_idx,
916+
owned: true,
917+
}
918+
}
919+
920+
#[inline]
921+
pub fn inner(&self) -> &hts_sys::hts_idx_t {
922+
unsafe { self.inner.as_ref().unwrap() }
923+
}
924+
925+
#[inline]
926+
// Pointer to inner hts_idx_t struct
927+
pub fn inner_ptr(&self) -> *const hts_sys::hts_idx_t {
928+
self.inner
929+
}
930+
931+
#[inline]
932+
pub fn inner_mut(&mut self) -> &mut hts_sys::hts_idx_t {
933+
unsafe { self.inner.as_mut().unwrap() }
934+
}
935+
936+
#[inline]
937+
// Mutable pointer to hts_idx_t struct
938+
pub fn inner_ptr_mut(&mut self) -> *mut hts_sys::hts_idx_t {
939+
self.inner
940+
}
941+
942+
/// Get the number of mapped and unmapped reads for a given target id
943+
/// FIXME only valid for BAM, not SAM/CRAM
944+
fn number_mapped_unmapped(&self, tid: u32) -> (u64, u64) {
945+
let (mut mapped, mut unmapped) = (0, 0);
946+
unsafe {
947+
hts_sys::hts_idx_get_stat(self.inner, tid as i32, &mut mapped, &mut unmapped);
948+
}
949+
(mapped, unmapped)
950+
}
951+
952+
/// Get the total number of unmapped reads in the file
953+
/// FIXME only valid for BAM, not SAM/CRAM
954+
fn number_unmapped(&self) -> u64 {
955+
unsafe { hts_sys::hts_idx_get_n_no_coor(self.inner) }
956+
}
957+
}
958+
959+
impl Drop for IndexView {
960+
fn drop(&mut self) {
961+
if self.owned {
962+
unsafe {
963+
htslib::hts_idx_destroy(self.inner);
964+
}
965+
}
966+
}
786967
}
787968

788969
impl Read for IndexedReader {
@@ -851,7 +1032,6 @@ impl Drop for IndexedReader {
8511032
if self.itr.is_some() {
8521033
htslib::hts_itr_destroy(self.itr.unwrap());
8531034
}
854-
htslib::hts_idx_destroy(self.idx);
8551035
htslib::hts_close(self.htsfile);
8561036
}
8571037
}
@@ -1615,6 +1795,7 @@ CCCCCCCCCCCCCCCCCCC"[..],
16151795
let bam = IndexedReader::from_path(&"test/test.bam").expect("Expected valid index.");
16161796
_test_read_indexed_common(bam);
16171797
}
1798+
16181799
#[test]
16191800
fn test_read_indexed_different_index_name() {
16201801
let bam = IndexedReader::from_path_and_index(
@@ -2818,4 +2999,80 @@ CCCCCCCCCCCCCCCCCCC"[..],
28182999
assert_eq!(header_refseqs[0].get("SN").unwrap(), "ref_1",);
28193000
assert_eq!(header_refseqs[0].get("LN").unwrap(), "10000000",);
28203001
}
3002+
3003+
#[test]
3004+
fn test_idxstats_bam() {
3005+
let mut reader = IndexedReader::from_path("test/test.bam").unwrap();
3006+
let expected = vec![
3007+
(0, 15072423, 6, 0),
3008+
(1, 15279345, 0, 0),
3009+
(2, 13783700, 0, 0),
3010+
(3, 17493793, 0, 0),
3011+
(4, 20924149, 0, 0),
3012+
(-1, 0, 0, 0),
3013+
];
3014+
let actual = reader.index_stats().unwrap();
3015+
assert_eq!(expected, actual);
3016+
}
3017+
3018+
#[test]
3019+
fn test_number_mapped_and_unmapped_bam() {
3020+
let reader = IndexedReader::from_path("test/test.bam").unwrap();
3021+
let expected = (6, 0);
3022+
let actual = reader.index().number_mapped_unmapped(0);
3023+
assert_eq!(expected, actual);
3024+
}
3025+
3026+
#[test]
3027+
fn test_number_unmapped_global_bam() {
3028+
let reader = IndexedReader::from_path("test/test_unmapped.bam").unwrap();
3029+
let expected = 8;
3030+
let actual = reader.index().number_unmapped();
3031+
assert_eq!(expected, actual);
3032+
}
3033+
3034+
#[test]
3035+
fn test_idxstats_cram() {
3036+
let mut reader = IndexedReader::from_path("test/test_cram.cram").unwrap();
3037+
reader.set_reference("test/test_cram.fa").unwrap();
3038+
let expected = vec![
3039+
(0, 120, 2, 0),
3040+
(1, 120, 2, 0),
3041+
(2, 120, 2, 0),
3042+
(-1, 0, 0, 0),
3043+
];
3044+
let actual = reader.index_stats().unwrap();
3045+
assert_eq!(expected, actual);
3046+
}
3047+
3048+
#[test]
3049+
fn test_slow_idxstats_cram() {
3050+
let mut reader = IndexedReader::from_path("test/test_cram.cram").unwrap();
3051+
reader.set_reference("test/test_cram.fa").unwrap();
3052+
let expected = vec![
3053+
(0, 120, 2, 0),
3054+
(1, 120, 2, 0),
3055+
(2, 120, 2, 0),
3056+
(-1, 0, 0, 0),
3057+
];
3058+
let actual = reader.index_stats().unwrap();
3059+
assert_eq!(expected, actual);
3060+
}
3061+
3062+
// #[test]
3063+
// fn test_number_mapped_and_unmapped_cram() {
3064+
// let mut reader = IndexedReader::from_path("test/test_cram.cram").unwrap();
3065+
// reader.set_reference("test/test_cram.fa").unwrap();
3066+
// let expected = (2, 0);
3067+
// let actual = reader.index().number_mapped_unmapped(0);
3068+
// assert_eq!(expected, actual);
3069+
// }
3070+
//
3071+
// #[test]
3072+
// fn test_number_unmapped_global_cram() {
3073+
// let mut reader = IndexedReader::from_path("test/test_unmapped.cram").unwrap();
3074+
// let expected = 8;
3075+
// let actual = reader.index().number_unmapped();
3076+
// assert_eq!(expected, actual);
3077+
// }
28213078
}

0 commit comments

Comments
 (0)