Skip to content

Commit 490e367

Browse files
authored
GenotypeAllele encoding and documentation for pushing to Record (#285)
* Add From<i32> for GenotypeAllele and deprecate old from_encoded method. * Add module level documentation for writing a simple VCF file. * Add examples/doc tests for set_rid, push_genotypes, and push_format_float. * Fix unnecessary s. * Better intra-doc links for Record writing functions.
1 parent d387e17 commit 490e367

File tree

2 files changed

+149
-14
lines changed

2 files changed

+149
-14
lines changed

src/bcf/mod.rs

+58-6
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010
//! Note that BCF corresponds to the in-memory representation of BCF/VCF records in Htslib
1111
//! itself. Thus, it comes without a runtime penalty for parsing, in contrast to reading VCF
1212
//! files.
13-
//! # Example
13+
//!
14+
//! # Example (reading)
15+
//!
1416
//! - Obtaining 0-based locus index of the VCF record.
1517
//! - Obtaining alleles of the VCF record.
1618
//! - calculate alt-allele dosage in a mutli-sample VCF / BCF
@@ -54,6 +56,52 @@
5456
//! }
5557
//! }
5658
//! ```
59+
//!
60+
//! # Example (writing)
61+
//!
62+
//! - Setting up a VCF writer from scratch (including a simple header)
63+
//! - Creating a VCF record and writing it to the VCF file
64+
//!
65+
//! ```
66+
//! use rust_htslib::bcf::{Format, Writer};
67+
//! use rust_htslib::bcf::header::Header;
68+
//! use rust_htslib::bcf::record::GenotypeAllele;
69+
//!
70+
//! // Create minimal VCF header with a single contig and a single sample
71+
//! let mut header = Header::new();
72+
//! let header_contig_line = r#"##contig=<ID=1,length=10>"#;
73+
//! header.push_record(header_contig_line.as_bytes());
74+
//! let header_gt_line = r#"##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">"#;
75+
//! header.push_record(header_gt_line.as_bytes());
76+
//! header.push_sample("test_sample".as_bytes());
77+
//!
78+
//! // Write uncompressed VCF to stdout with above header and get an empty record
79+
//! let mut vcf = Writer::from_stdout(&header, true, Format::VCF).unwrap();
80+
//! let mut record = vcf.empty_record();
81+
//!
82+
//! // Set chrom and pos to 1 and 7, respectively - note the 0-based positions
83+
//! let rid = vcf.header().name2rid(b"1").unwrap();
84+
//! record.set_rid(Some(rid));
85+
//! record.set_pos(6);
86+
//!
87+
//! // Set record genotype to 0|1 - note first allele is always unphased
88+
//! let alleles = &[GenotypeAllele::Unphased(0), GenotypeAllele::Phased(1)];
89+
//! record.push_genotypes(alleles).unwrap();
90+
//!
91+
//! // Write record
92+
//! vcf.write(&record).unwrap()
93+
//! ```
94+
//!
95+
//! This will print the following VCF to stdout:
96+
//!
97+
//! ```lang-none
98+
//! ##fileformat=VCFv4.2
99+
//! ##FILTER=<ID=PASS,Description="All filters passed">
100+
//! ##contig=<ID=1,length=10>
101+
//! ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
102+
//! #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT test_sample
103+
//! 1 7 . . . 0 . . GT 0|1
104+
//! ```
57105
58106
use std::ffi;
59107
use std::path::Path;
@@ -830,7 +878,7 @@ mod tests {
830878
// the artificial "not observed" allele is present in each record.
831879
assert_eq!(record.alleles().iter().last().unwrap(), b"<X>");
832880

833-
let mut fmt = record.format(b"PL");
881+
let fmt = record.format(b"PL");
834882
let pl = fmt.integer().expect("Error reading format.");
835883
assert_eq!(pl.len(), 1);
836884
if i == 59 {
@@ -923,7 +971,7 @@ mod tests {
923971
let mut buffer = Buffer::new();
924972
for (i, rec) in vcf.records().enumerate() {
925973
println!("record {}", i);
926-
let mut record = rec.expect("Error reading record.");
974+
let record = rec.expect("Error reading record.");
927975
assert_eq!(
928976
record
929977
.info_shared_buffer(b"S1", &mut buffer)
@@ -966,7 +1014,7 @@ mod tests {
9661014
let f1 = [false, true];
9671015
let mut buffer = Buffer::new();
9681016
for (i, rec) in vcf.records().enumerate() {
969-
let mut record = rec.expect("Error reading record.");
1017+
let record = rec.expect("Error reading record.");
9701018
assert_eq!(
9711019
record
9721020
.info_shared_buffer(b"F1", &mut buffer)
@@ -996,7 +1044,7 @@ mod tests {
9961044
let mut vcf = Reader::from_path(&"test/test_string.vcf").expect("Error opening file.");
9971045
let expected = ["./1", "1|1", "0/1", "0|1", "1|.", "1/1"];
9981046
for (rec, exp_gt) in vcf.records().zip(expected.iter()) {
999-
let mut rec = rec.expect("Error reading record.");
1047+
let rec = rec.expect("Error reading record.");
10001048
let genotypes = rec.genotypes().expect("Error reading genotypes");
10011049
assert_eq!(&format!("{}", genotypes.get(0)), exp_gt);
10021050
}
@@ -1399,6 +1447,8 @@ mod tests {
13991447
let converted: i32 = allele.into();
14001448
let expected = 4;
14011449
assert_eq!(converted, expected);
1450+
let reverse_conversion = GenotypeAllele::from(expected);
1451+
assert_eq!(allele, reverse_conversion);
14021452
}
14031453

14041454
#[test]
@@ -1407,6 +1457,8 @@ mod tests {
14071457
let converted: i32 = allele.into();
14081458
let expected = 1;
14091459
assert_eq!(converted, expected);
1460+
let reverse_conversion = GenotypeAllele::from(expected);
1461+
assert_eq!(allele, reverse_conversion);
14101462
}
14111463

14121464
#[test]
@@ -1416,7 +1468,7 @@ mod tests {
14161468
let _header = bcf.header();
14171469
// FORMAT fields of first record of the vcf should look like:
14181470
// GT:FS1:FN1 ./1:LongString1:1 1/1:ss1:2
1419-
let mut first_record = bcf.records().next().unwrap().expect("Fail to read record");
1471+
let first_record = bcf.records().next().unwrap().expect("Fail to read record");
14201472
let sample_count = usize::try_from(first_record.sample_count()).unwrap();
14211473
assert_eq!(sample_count, 2);
14221474
let mut n_ref = vec![0; sample_count];

src/bcf/record.rs

+91-8
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,8 @@ impl Record {
184184

185185
/// Get the reference id of the record.
186186
///
187-
/// To look up the contig name, use `bcf::header::HeaderView::rid2name`.
187+
/// To look up the contig name,
188+
/// use [`HeaderView::rid2name`](../header/struct.HeaderView.html#method.rid2name).
188189
///
189190
/// # Returns
190191
///
@@ -197,7 +198,32 @@ impl Record {
197198
}
198199
}
199200

200-
// Update the internal reference ID number.
201+
/// Update the reference id of the record.
202+
///
203+
/// To look up reference id for a contig name,
204+
/// use [`HeaderView::name2rid`](../header/struct.HeaderView.html#method.name2rid).
205+
///
206+
/// # Example
207+
///
208+
/// Example assumes we have a Record `record` from a VCF with a header containing region
209+
/// named `1`. See [module documentation](../index.html#example-writing) for how to set
210+
/// up VCF, header, and record.
211+
///
212+
/// ```
213+
/// # use rust_htslib::bcf::{Format, Writer};
214+
/// # use rust_htslib::bcf::header::Header;
215+
/// # let mut header = Header::new();
216+
/// # let header_contig_line = r#"##contig=<ID=1,length=10>"#;
217+
/// # header.push_record(header_contig_line.as_bytes());
218+
/// # header.push_sample("test_sample".as_bytes());
219+
/// # let mut vcf = Writer::from_stdout(&header, true, Format::VCF).unwrap();
220+
/// # let mut record = vcf.empty_record();
221+
/// let rid = record.header().name2rid(b"1").ok();
222+
/// record.set_rid(rid);
223+
/// assert_eq!(record.rid(), rid);
224+
/// let name = record.header().rid2name(record.rid().unwrap()).ok();
225+
/// assert_eq!(Some("1".as_bytes()), name);
226+
/// ```
201227
pub fn set_rid(&mut self, rid: Option<u32>) {
202228
match rid {
203229
Some(rid) => self.inner_mut().rid = rid as i32,
@@ -424,6 +450,29 @@ impl Record {
424450
/// # Errors
425451
///
426452
/// Returns error if GT tag is not present in header.
453+
///
454+
/// # Example
455+
///
456+
/// Example assumes we have a Record `record` from a VCF with a `GT` `FORMAT` tag.
457+
/// See [module documentation](../index.html#example-writing) for how to set up
458+
/// VCF, header, and record.
459+
///
460+
/// ```
461+
/// # use rust_htslib::bcf::{Format, Writer};
462+
/// # use rust_htslib::bcf::header::Header;
463+
/// # use rust_htslib::bcf::record::GenotypeAllele;
464+
/// # let mut header = Header::new();
465+
/// # let header_contig_line = r#"##contig=<ID=1,length=10>"#;
466+
/// # header.push_record(header_contig_line.as_bytes());
467+
/// # let header_gt_line = r#"##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">"#;
468+
/// # header.push_record(header_gt_line.as_bytes());
469+
/// # header.push_sample("test_sample".as_bytes());
470+
/// # let mut vcf = Writer::from_stdout(&header, true, Format::VCF).unwrap();
471+
/// # let mut record = vcf.empty_record();
472+
/// let alleles = &[GenotypeAllele::Unphased(1), GenotypeAllele::Unphased(1)];
473+
/// record.push_genotypes(alleles);
474+
/// assert_eq!("1/1", &format!("{}", record.genotypes().unwrap().get(0)));
475+
/// ```
427476
pub fn push_genotypes(&mut self, genotypes: &[GenotypeAllele]) -> Result<()> {
428477
let encoded: Vec<i32> = genotypes.iter().map(|gt| i32::from(*gt)).collect();
429478
self.push_format_integer(b"GT", &encoded)
@@ -494,6 +543,28 @@ impl Record {
494543
/// # Errors
495544
///
496545
/// Returns error if tag is not present in header.
546+
///
547+
/// # Example
548+
///
549+
/// Example assumes we have a Record `record` from a VCF with an `AF` `FORMAT` tag.
550+
/// See [module documentation](../index.html#example-writing) for how to set up
551+
/// VCF, header, and record.
552+
///
553+
/// ```
554+
/// # use rust_htslib::bcf::{Format, Writer};
555+
/// # use rust_htslib::bcf::header::Header;
556+
/// # use rust_htslib::bcf::record::GenotypeAllele;
557+
/// # let mut header = Header::new();
558+
/// # let header_contig_line = r#"##contig=<ID=1,length=10>"#;
559+
/// # header.push_record(header_contig_line.as_bytes());
560+
/// # let header_af_line = r#"##FORMAT=<ID=AF,Number=1,Type=Float,Description="Frequency">"#;
561+
/// # header.push_record(header_af_line.as_bytes());
562+
/// # header.push_sample("test_sample".as_bytes());
563+
/// # let mut vcf = Writer::from_stdout(&header, true, Format::VCF).unwrap();
564+
/// # let mut record = vcf.empty_record();
565+
/// record.push_format_float(b"AF", &[0.5]);
566+
/// assert_eq!(0.5, record.format(b"AF").float().unwrap()[0][0]);
567+
/// ```
497568
pub fn push_format_float(&mut self, tag: &[u8], data: &[f32]) -> Result<()> {
498569
self.push_format(tag, data, htslib::BCF_HT_REAL)
499570
}
@@ -747,6 +818,10 @@ pub enum GenotypeAllele {
747818

748819
impl GenotypeAllele {
749820
/// Decode given integer according to BCF standard.
821+
#[deprecated(
822+
since = "0.36.0",
823+
note = "Please use the conversion trait From<i32> for GenotypeAllele instead."
824+
)]
750825
pub fn from_encoded(encoded: i32) -> Self {
751826
match (encoded, encoded & 1) {
752827
(0, 0) => GenotypeAllele::UnphasedMissing,
@@ -783,7 +858,19 @@ impl From<GenotypeAllele> for i32 {
783858
GenotypeAllele::Unphased(a) => (a, 0),
784859
GenotypeAllele::Phased(a) => (a, 1),
785860
};
786-
allele + 1 << 1 | phased
861+
(allele + 1) << 1 | phased
862+
}
863+
}
864+
865+
impl From<i32> for GenotypeAllele {
866+
fn from(encoded: i32) -> GenotypeAllele {
867+
match (encoded, encoded & 1) {
868+
(0, 0) => GenotypeAllele::UnphasedMissing,
869+
(1, 1) => GenotypeAllele::PhasedMissing,
870+
(e, 1) => GenotypeAllele::Phased((e >> 1) - 1),
871+
(e, 0) => GenotypeAllele::Unphased((e >> 1) - 1),
872+
_ => panic!("unexpected phasing type"),
873+
}
787874
}
788875
}
789876

@@ -825,11 +912,7 @@ impl<'a, B: Borrow<Buffer> + 'a> Genotypes<'a, B> {
825912
/// this method will return `[Unphased(1), Phased(1)]`.
826913
pub fn get(&self, i: usize) -> Genotype {
827914
let igt = self.encoded[i];
828-
Genotype(
829-
igt.iter()
830-
.map(|&e| GenotypeAllele::from_encoded(e))
831-
.collect(),
832-
)
915+
Genotype(igt.iter().map(|&e| GenotypeAllele::from(e)).collect())
833916
}
834917
}
835918

0 commit comments

Comments
 (0)