Skip to content

Commit 7ea9b2c

Browse files
committed
Update to v0.12.10
1 parent 3f5c3d4 commit 7ea9b2c

8 files changed

+193
-69
lines changed

CHANGELOG.md

+6
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
# Change Log
22

3+
## v0.12.10 - 2025-02-24
4+
5+
### Fixed
6+
7+
- Fix handling of chromosome names with colons, eg. 'HLA-DRB1*10:01:01' (github #11)
8+
39
## v0.12.9 - 2025-01-10
410

511
### Added

Cargo.lock

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "sawfish"
3-
version = "0.12.9"
3+
version = "0.12.10"
44
authors = ["Chris Saunders <[email protected]>"]
55
description = "Structural variant analysis for mapped PacBio HiFi reads"
66
edition = "2021"

LICENSE-THIRDPARTY.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -1702,7 +1702,7 @@
17021702
},
17031703
{
17041704
"name": "sawfish",
1705-
"version": "0.12.9",
1705+
"version": "0.12.10",
17061706
"authors": "Chris Saunders <[email protected]>",
17071707
"repository": null,
17081708
"license": null,

docs/user_guide.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ SV haplotype contig alignments are output to `${OUTPUT_DIR}/contig.alignment.bam
206206
this file can be viewed in alignment browsers such as IGV.
207207

208208
Aligned contigs are provided for all single breakpoint SV calls. To find the contig for a given SV, locate the SV's VCF ID field, such as `sawfish:0:2803:1:2`,
209-
and take the prefix from this ID that includes the first three digits, in this case `sawfish:0:2803:1` -- this is `QNAME` value of the corresponding SV
209+
and take the prefix from this ID that includes the first three digits, in this case `sawfish:0:2803:1`. This is the `QNAME` value of the corresponding SV
210210
haplotype alignment(s) in the contig alignment BAM file.
211211

212212
Contigs are not available for multi-breakpoint events such as inversions. However, contigs are available for each individual breakpoint

src/genome_regions.rs

+4-4
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use log::info;
66
use rust_vc_utils::ChromList;
77
use unwrap::unwrap;
88

9-
use crate::genome_segment::samtools_region_string_splitter;
9+
use crate::genome_segment::parse_samtools_region_string;
1010

1111
/// A set of chromosome regions which can be efficiently queried
1212
///
@@ -144,10 +144,10 @@ impl GenomeRegions {
144144
) -> Self {
145145
let mut regions = Self::new(overlaps_allowed);
146146
for target_region in target_regions {
147-
let (_chrom_index, chrom_label, start, end) =
148-
samtools_region_string_splitter(chrom_list, target_region);
147+
let (chrom_index, start, end) = parse_samtools_region_string(chrom_list, target_region);
149148

150-
regions.add_region(&chrom_label, start, end);
149+
let chrom_label = &chrom_list.data[chrom_index].label;
150+
regions.add_region(chrom_label, start, end);
151151
}
152152
regions
153153
}

src/genome_segment.rs

+101-51
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,7 @@ impl GenomeSegment {
2525
///
2626
#[allow(dead_code)]
2727
pub fn from_region_str(chrom_list: &ChromList, str: &str) -> Self {
28-
let (chrom_index, _chrom_label, start, end) =
29-
samtools_region_string_splitter(chrom_list, str);
28+
let (chrom_index, start, end) = parse_samtools_region_string(chrom_list, str);
3029
Self {
3130
chrom_index,
3231
range: IntRange::from_pair(start, end),
@@ -109,43 +108,58 @@ pub fn get_segment_dir_distance(gs1: &GenomeSegment, gs2: &GenomeSegment) -> Opt
109108
}
110109
}
111110

112-
/// Convert from a string in 'samtools' region format (e.g. chr20:100-200) to a tuple of
113-
/// (chrom_index, chrom_label, start, end)
114-
/// ...where start and end are converted to the zero-indexed half-open convention used for bed
111+
/// Parse the chromosome string out of a samtools-style region string
115112
///
116-
/// Commas will be stripped out of coordinates if present
113+
/// Return the index of the chromosome from the expected chromosome list, and
114+
/// an optional position string following the chromosome name
117115
///
118-
pub fn samtools_region_string_splitter(
116+
fn parse_chrom_index_from_samtools_region_string<'a>(
119117
chrom_list: &ChromList,
120-
str: &str,
121-
) -> (usize, String, i64, i64) {
122-
let s1 = str.split(':').collect::<Vec<_>>();
118+
str: &'a str,
119+
) -> (usize, Option<&'a str>) {
120+
// Note that rsplitn orders words in reverse order compared to how they appear in the string:
121+
let s1 = str.rsplitn(2, ':').collect::<Vec<_>>();
123122
let s1l = s1.len();
124123
assert!(
125124
s1l > 0 && s1l < 3,
126-
"Unexpected format in genome region string {}",
127-
str
125+
"Unexpected format in genome region string '{str}'"
128126
);
129-
let chrom = s1[0].to_string();
130-
let chrom_index = match chrom_list.label_to_index.get(s1[0]) {
131-
Some(x) => *x,
132-
None => {
133-
panic!("Can't find chromosome '{}' in bam file header", chrom);
134-
}
135-
};
136-
let chrom_size = chrom_list.data[chrom_index].length as i64;
137-
let (start, end) = if s1l == 1 {
138-
(0, chrom_size)
127+
let chrom = *s1.last().unwrap();
128+
if let Some(&chrom_index) = chrom_list.label_to_index.get(chrom) {
129+
let pos_string = if s1l == 2 { Some(s1[0]) } else { None };
130+
(chrom_index, pos_string)
131+
} else if let Some(&chrom_index) = chrom_list.label_to_index.get(str) {
132+
(chrom_index, None)
139133
} else {
140-
let s2 = s1[1].split('-').collect::<Vec<_>>();
134+
let msg = if str != chrom {
135+
format!("Unexpected format in genome region string '{str}': can't find chromosome name '{chrom}' or '{str}' in bam file header")
136+
} else {
137+
format!("Unexpected format in genome region string '{str}': can't find chromosome '{chrom}' in bam file header")
138+
};
139+
panic!("{}", msg);
140+
}
141+
}
142+
143+
/// Parse position range from samtools-style genomic interval string, return
144+
/// start-end coordinate in bedtools zero-index half-open format.
145+
///
146+
/// In the samtools-style string, "100-300" would return (99,300). Just "100"
147+
/// should retunr (99, chrom_length)
148+
///
149+
/// # Arguments
150+
/// * `region_str` - Only used to improve error messages
151+
///
152+
fn parse_samtools_pos_range(
153+
region_str: &str,
154+
pos_range_str: Option<&str>,
155+
chrom_size: i64,
156+
) -> (i64, i64) {
157+
if let Some(pos_range_str) = pos_range_str {
158+
let s2 = pos_range_str.split('-').collect::<Vec<_>>();
141159
let s2l = s2.len();
142-
assert!(
143-
s2l > 0 && s2l < 3,
144-
"Unexpected format in genome region string {}",
145-
str
146-
);
147-
// Strip any commas out of the number field (don't know if samtools does this but just a
148-
// nice ease of use bonus:
160+
assert!(s2l <= 2, "Unexpected format in position range '{pos_range_str}' from genome region string {region_str}");
161+
162+
// Strip any commas out of the number field (same as tabix cmdline behavior)
149163
let s2 = s2
150164
.into_iter()
151165
.map(|s| {
@@ -155,14 +169,33 @@ pub fn samtools_region_string_splitter(
155169
})
156170
.collect::<Vec<_>>();
157171
let start = s2[0].parse::<i64>().unwrap() - 1;
158-
if s2l == 1 {
159-
(start, chrom_size)
172+
let end = if s2l == 1 {
173+
chrom_size
160174
} else {
161-
let end = s2[1].parse::<i64>().unwrap();
162-
(start, end)
163-
}
164-
};
165-
(chrom_index, chrom, start, end)
175+
s2[1].parse::<i64>().unwrap()
176+
};
177+
(start, end)
178+
} else {
179+
(0, chrom_size)
180+
}
181+
}
182+
183+
/// Convert from a string in 'samtools' region format (e.g. chr20:100-200) to a tuple of
184+
/// (chrom_index, chrom_label, start, end)
185+
/// ...where start and end are converted to the zero-indexed half-open convention used for bed
186+
///
187+
/// Commas will be stripped out of coordinates if present
188+
///
189+
/// This parser makes a 'best-effort' to parse contig names with colons in them, such as HLA alleles
190+
/// like "HLA-DRB1*10:01:01". Given that samtools region format already has an optinoal colon, it may
191+
/// be impossible to resolve some cases.
192+
///
193+
pub fn parse_samtools_region_string(chrom_list: &ChromList, region_str: &str) -> (usize, i64, i64) {
194+
let (chrom_index, pos_str) =
195+
parse_chrom_index_from_samtools_region_string(chrom_list, region_str);
196+
let chrom_size = chrom_list.data[chrom_index].length as i64;
197+
let (start, end) = parse_samtools_pos_range(region_str, pos_str, chrom_size);
198+
(chrom_index, start, end)
166199
}
167200

168201
#[allow(dead_code)]
@@ -235,37 +268,54 @@ mod tests {
235268

236269
#[test]
237270
fn test_samtools_region_string_splitter() {
238-
let mut chrom_list = ChromList::default();
239-
chrom_list.add_chrom("chr1", 10000);
240-
chrom_list.add_chrom("chr2", 10000);
241-
chrom_list.add_chrom("chr3", 10000);
242-
let chrom_list = chrom_list;
271+
let chrom_list = {
272+
let mut x = ChromList::default();
273+
x.add_chrom("chr1", 10000);
274+
x.add_chrom("chr2", 10000);
275+
x.add_chrom("chr3", 10000);
276+
x
277+
};
243278

244279
// A simple case
245280
let s = "chr2:1000-2000";
246-
let (chrom_index, chrom_label, start, end) =
247-
samtools_region_string_splitter(&chrom_list, s);
281+
let (chrom_index, start, end) = parse_samtools_region_string(&chrom_list, s);
248282
assert_eq!(chrom_index, 1);
249-
assert_eq!(chrom_label, "chr2");
250283
assert_eq!(start, 999);
251284
assert_eq!(end, 2000);
252285

253286
// Simple case with commas
254287
let s = "chr2:1,000-2,000";
255-
let (chrom_index, chrom_label, start, end) =
256-
samtools_region_string_splitter(&chrom_list, s);
288+
let (chrom_index, start, end) = parse_samtools_region_string(&chrom_list, s);
257289
assert_eq!(chrom_index, 1);
258-
assert_eq!(chrom_label, "chr2");
259290
assert_eq!(start, 999);
260291
assert_eq!(end, 2000);
261292

262293
// No end
263294
let s = "chr2:1,000";
264-
let (chrom_index, chrom_label, start, end) =
265-
samtools_region_string_splitter(&chrom_list, s);
295+
let (chrom_index, start, end) = parse_samtools_region_string(&chrom_list, s);
266296
assert_eq!(chrom_index, 1);
267-
assert_eq!(chrom_label, "chr2");
268297
assert_eq!(start, 999);
269298
assert_eq!(end, 10000);
270299
}
300+
301+
#[test]
302+
fn test_samtools_region_string_splitter_hla() {
303+
let chrom_list = {
304+
let mut x = ChromList::default();
305+
x.add_chrom("HLA-DRB1*10:01:01", 10000);
306+
x
307+
};
308+
309+
let s = "HLA-DRB1*10:01:01:1000-2000";
310+
let (chrom_index, start, end) = parse_samtools_region_string(&chrom_list, s);
311+
assert_eq!(chrom_index, 0);
312+
assert_eq!(start, 999);
313+
assert_eq!(end, 2000);
314+
315+
let s = "HLA-DRB1*10:01:01";
316+
let (chrom_index, start, end) = parse_samtools_region_string(&chrom_list, s);
317+
assert_eq!(chrom_index, 0);
318+
assert_eq!(start, 0);
319+
assert_eq!(end, 10000);
320+
}
271321
}

src/joint_call/get_refined_svs.rs

+78-10
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,13 @@ struct BndAltInfo {
200200
insert_seq: Vec<u8>,
201201
}
202202

203+
/// Parse a VCF breakend record alt-allele string into component parts describing the breakpoint
204+
///
205+
/// Example BND alt allele string is "TCT]chr1:500]", see VCF spec for full description.
206+
///
207+
/// # Arguments
208+
/// * homlen - this is used to correct the remote breakend into a left-shifted position when the breakpoint is inverted
209+
///
203210
fn parse_bnd_alt_allele(chrom_list: &ChromList, alt_allele: &[u8], homlen: i64) -> BndAltInfo {
204211
let alt_allele = std::str::from_utf8(alt_allele).unwrap();
205212

@@ -224,17 +231,21 @@ fn parse_bnd_alt_allele(chrom_list: &ChromList, alt_allele: &[u8], homlen: i64)
224231
alt_allele
225232
);
226233

227-
let mate_location = words[1].split(':').collect::<Vec<_>>();
228-
assert_eq!(
229-
mate_location.len(),
230-
2,
231-
"Unexpected BND alt allele format `{}`",
232-
alt_allele
233-
);
234+
let (breakend2_chrom_index, breakend2_raw_pos) = {
235+
// Note that rsplitn orders words in reverse order compared to how they appear in the string:
236+
let mate_location = words[1].rsplitn(2, ':').collect::<Vec<_>>();
237+
assert_eq!(
238+
mate_location.len(),
239+
2,
240+
"Unexpected BND alt allele format `{}`",
241+
alt_allele
242+
);
234243

235-
let breakend2_chrom_name = mate_location[0];
236-
let breakend2_chrom_index = *chrom_list.label_to_index.get(breakend2_chrom_name).unwrap();
237-
let breakend2_raw_pos = mate_location[1].parse::<i64>().unwrap() - 1;
244+
let breakend2_chrom_name = mate_location[1];
245+
let breakend2_chrom_index = *chrom_list.label_to_index.get(breakend2_chrom_name).unwrap();
246+
let breakend2_raw_pos = mate_location[0].parse::<i64>().unwrap() - 1;
247+
(breakend2_chrom_index, breakend2_raw_pos)
248+
};
238249

239250
let breakend1_direction = if !words[0].is_empty() && words[2].is_empty() {
240251
BreakendDirection::LeftAnchor
@@ -838,3 +849,60 @@ pub fn get_sample_sv_groups(
838849

839850
sv_groups
840851
}
852+
853+
#[cfg(test)]
854+
mod tests {
855+
use super::*;
856+
857+
#[test]
858+
fn test_parse_bnd_alt_allele() {
859+
let chrom_list = {
860+
let mut x = ChromList::default();
861+
x.add_chrom("chr1", 1000);
862+
x
863+
};
864+
865+
let alt_allele = b"TCT]chr1:500]";
866+
867+
let homlen = 10;
868+
let bnd_alt_info = parse_bnd_alt_allele(&chrom_list, alt_allele, homlen);
869+
870+
assert_eq!(
871+
bnd_alt_info.breakend1_direction,
872+
BreakendDirection::LeftAnchor
873+
);
874+
assert_eq!(
875+
bnd_alt_info.breakend2_direction,
876+
BreakendDirection::LeftAnchor
877+
);
878+
assert_eq!(bnd_alt_info.breakend2_chrom_index, 0);
879+
assert_eq!(bnd_alt_info.breakend2_pos, 489);
880+
assert_eq!(bnd_alt_info.insert_seq, b"CT");
881+
}
882+
883+
#[test]
884+
fn test_parse_bnd_alt_allele_hla() {
885+
let chrom_list = {
886+
let mut x = ChromList::default();
887+
x.add_chrom("HLA-DRB1*10:01:01", 1000);
888+
x
889+
};
890+
891+
let alt_allele = b"TCT]HLA-DRB1*10:01:01:500]";
892+
893+
let homlen = 10;
894+
let bnd_alt_info = parse_bnd_alt_allele(&chrom_list, alt_allele, homlen);
895+
896+
assert_eq!(
897+
bnd_alt_info.breakend1_direction,
898+
BreakendDirection::LeftAnchor
899+
);
900+
assert_eq!(
901+
bnd_alt_info.breakend2_direction,
902+
BreakendDirection::LeftAnchor
903+
);
904+
assert_eq!(bnd_alt_info.breakend2_chrom_index, 0);
905+
assert_eq!(bnd_alt_info.breakend2_pos, 489);
906+
assert_eq!(bnd_alt_info.insert_seq, b"CT");
907+
}
908+
}

0 commit comments

Comments
 (0)