@@ -25,8 +25,7 @@ impl GenomeSegment {
25
25
///
26
26
#[ allow( dead_code) ]
27
27
pub fn from_region_str ( chrom_list : & ChromList , str : & str ) -> Self {
28
- let ( chrom_index, _chrom_label, start, end) =
29
- samtools_region_string_splitter ( chrom_list, str) ;
28
+ let ( chrom_index, start, end) = parse_samtools_region_string ( chrom_list, str) ;
30
29
Self {
31
30
chrom_index,
32
31
range : IntRange :: from_pair ( start, end) ,
@@ -109,43 +108,58 @@ pub fn get_segment_dir_distance(gs1: &GenomeSegment, gs2: &GenomeSegment) -> Opt
109
108
}
110
109
}
111
110
112
- /// Convert from a string in 'samtools' region format (e.g. chr20:100-200) to a tuple of
113
- /// (chrom_index, chrom_label, start, end)
114
- /// ...where start and end are converted to the zero-indexed half-open convention used for bed
111
+ /// Parse the chromosome string out of a samtools-style region string
115
112
///
116
- /// Commas will be stripped out of coordinates if present
113
+ /// Return the index of the chromosome from the expected chromosome list, and
114
+ /// an optional position string following the chromosome name
117
115
///
118
- pub fn samtools_region_string_splitter (
116
+ fn parse_chrom_index_from_samtools_region_string < ' a > (
119
117
chrom_list : & ChromList ,
120
- str : & str ,
121
- ) -> ( usize , String , i64 , i64 ) {
122
- let s1 = str. split ( ':' ) . collect :: < Vec < _ > > ( ) ;
118
+ str : & ' a str ,
119
+ ) -> ( usize , Option < & ' a str > ) {
120
+ // Note that rsplitn orders words in reverse order compared to how they appear in the string:
121
+ let s1 = str. rsplitn ( 2 , ':' ) . collect :: < Vec < _ > > ( ) ;
123
122
let s1l = s1. len ( ) ;
124
123
assert ! (
125
124
s1l > 0 && s1l < 3 ,
126
- "Unexpected format in genome region string {}" ,
127
- str
125
+ "Unexpected format in genome region string '{str}'"
128
126
) ;
129
- let chrom = s1[ 0 ] . to_string ( ) ;
130
- let chrom_index = match chrom_list. label_to_index . get ( s1[ 0 ] ) {
131
- Some ( x) => * x,
132
- None => {
133
- panic ! ( "Can't find chromosome '{}' in bam file header" , chrom) ;
134
- }
135
- } ;
136
- let chrom_size = chrom_list. data [ chrom_index] . length as i64 ;
137
- let ( start, end) = if s1l == 1 {
138
- ( 0 , chrom_size)
127
+ let chrom = * s1. last ( ) . unwrap ( ) ;
128
+ if let Some ( & chrom_index) = chrom_list. label_to_index . get ( chrom) {
129
+ let pos_string = if s1l == 2 { Some ( s1[ 0 ] ) } else { None } ;
130
+ ( chrom_index, pos_string)
131
+ } else if let Some ( & chrom_index) = chrom_list. label_to_index . get ( str) {
132
+ ( chrom_index, None )
139
133
} else {
140
- let s2 = s1[ 1 ] . split ( '-' ) . collect :: < Vec < _ > > ( ) ;
134
+ let msg = if str != chrom {
135
+ format ! ( "Unexpected format in genome region string '{str}': can't find chromosome name '{chrom}' or '{str}' in bam file header" )
136
+ } else {
137
+ format ! ( "Unexpected format in genome region string '{str}': can't find chromosome '{chrom}' in bam file header" )
138
+ } ;
139
+ panic ! ( "{}" , msg) ;
140
+ }
141
+ }
142
+
143
+ /// Parse position range from samtools-style genomic interval string, return
144
+ /// start-end coordinate in bedtools zero-index half-open format.
145
+ ///
146
+ /// In the samtools-style string, "100-300" would return (99,300). Just "100"
147
+ /// should return (99, chrom_length)
148
+ ///
149
+ /// # Arguments
150
+ /// * `region_str` - Only used to improve error messages
151
+ ///
152
+ fn parse_samtools_pos_range (
153
+ region_str : & str ,
154
+ pos_range_str : Option < & str > ,
155
+ chrom_size : i64 ,
156
+ ) -> ( i64 , i64 ) {
157
+ if let Some ( pos_range_str) = pos_range_str {
158
+ let s2 = pos_range_str. split ( '-' ) . collect :: < Vec < _ > > ( ) ;
141
159
let s2l = s2. len ( ) ;
142
- assert ! (
143
- s2l > 0 && s2l < 3 ,
144
- "Unexpected format in genome region string {}" ,
145
- str
146
- ) ;
147
- // Strip any commas out of the number field (don't know if samtools does this but just a
148
- // nice ease of use bonus:
160
+ assert ! ( s2l <= 2 , "Unexpected format in position range '{pos_range_str}' from genome region string {region_str}" ) ;
161
+
162
+ // Strip any commas out of the number field (same as tabix cmdline behavior)
149
163
let s2 = s2
150
164
. into_iter ( )
151
165
. map ( |s| {
@@ -155,14 +169,33 @@ pub fn samtools_region_string_splitter(
155
169
} )
156
170
. collect :: < Vec < _ > > ( ) ;
157
171
let start = s2[ 0 ] . parse :: < i64 > ( ) . unwrap ( ) - 1 ;
158
- if s2l == 1 {
159
- ( start , chrom_size)
172
+ let end = if s2l == 1 {
173
+ chrom_size
160
174
} else {
161
- let end = s2[ 1 ] . parse :: < i64 > ( ) . unwrap ( ) ;
162
- ( start, end)
163
- }
164
- } ;
165
- ( chrom_index, chrom, start, end)
175
+ s2[ 1 ] . parse :: < i64 > ( ) . unwrap ( )
176
+ } ;
177
+ ( start, end)
178
+ } else {
179
+ ( 0 , chrom_size)
180
+ }
181
+ }
182
+
183
+ /// Convert from a string in 'samtools' region format (e.g. chr20:100-200) to a tuple of
184
+ /// (chrom_index, start, end)
185
+ /// ...where start and end are converted to the zero-indexed half-open convention used for bed
186
+ ///
187
+ /// Commas will be stripped out of coordinates if present
188
+ ///
189
+ /// This parser makes a 'best-effort' to parse contig names with colons in them, such as HLA alleles
190
+ /// like "HLA-DRB1*10:01:01". Given that samtools region format already has an optional colon, it may
191
+ /// be impossible to resolve some cases.
192
+ ///
193
+ pub fn parse_samtools_region_string ( chrom_list : & ChromList , region_str : & str ) -> ( usize , i64 , i64 ) {
194
+ let ( chrom_index, pos_str) =
195
+ parse_chrom_index_from_samtools_region_string ( chrom_list, region_str) ;
196
+ let chrom_size = chrom_list. data [ chrom_index] . length as i64 ;
197
+ let ( start, end) = parse_samtools_pos_range ( region_str, pos_str, chrom_size) ;
198
+ ( chrom_index, start, end)
166
199
}
167
200
168
201
#[ allow( dead_code) ]
@@ -235,37 +268,54 @@ mod tests {
235
268
236
269
#[ test]
237
270
fn test_samtools_region_string_splitter ( ) {
238
- let mut chrom_list = ChromList :: default ( ) ;
239
- chrom_list. add_chrom ( "chr1" , 10000 ) ;
240
- chrom_list. add_chrom ( "chr2" , 10000 ) ;
241
- chrom_list. add_chrom ( "chr3" , 10000 ) ;
242
- let chrom_list = chrom_list;
271
+ let chrom_list = {
272
+ let mut x = ChromList :: default ( ) ;
273
+ x. add_chrom ( "chr1" , 10000 ) ;
274
+ x. add_chrom ( "chr2" , 10000 ) ;
275
+ x. add_chrom ( "chr3" , 10000 ) ;
276
+ x
277
+ } ;
243
278
244
279
// A simple case
245
280
let s = "chr2:1000-2000" ;
246
- let ( chrom_index, chrom_label, start, end) =
247
- samtools_region_string_splitter ( & chrom_list, s) ;
281
+ let ( chrom_index, start, end) = parse_samtools_region_string ( & chrom_list, s) ;
248
282
assert_eq ! ( chrom_index, 1 ) ;
249
- assert_eq ! ( chrom_label, "chr2" ) ;
250
283
assert_eq ! ( start, 999 ) ;
251
284
assert_eq ! ( end, 2000 ) ;
252
285
253
286
// Simple case with commas
254
287
let s = "chr2:1,000-2,000" ;
255
- let ( chrom_index, chrom_label, start, end) =
256
- samtools_region_string_splitter ( & chrom_list, s) ;
288
+ let ( chrom_index, start, end) = parse_samtools_region_string ( & chrom_list, s) ;
257
289
assert_eq ! ( chrom_index, 1 ) ;
258
- assert_eq ! ( chrom_label, "chr2" ) ;
259
290
assert_eq ! ( start, 999 ) ;
260
291
assert_eq ! ( end, 2000 ) ;
261
292
262
293
// No end
263
294
let s = "chr2:1,000" ;
264
- let ( chrom_index, chrom_label, start, end) =
265
- samtools_region_string_splitter ( & chrom_list, s) ;
295
+ let ( chrom_index, start, end) = parse_samtools_region_string ( & chrom_list, s) ;
266
296
assert_eq ! ( chrom_index, 1 ) ;
267
- assert_eq ! ( chrom_label, "chr2" ) ;
268
297
assert_eq ! ( start, 999 ) ;
269
298
assert_eq ! ( end, 10000 ) ;
270
299
}
300
+
301
+ #[ test]
302
+ fn test_samtools_region_string_splitter_hla ( ) {
303
+ let chrom_list = {
304
+ let mut x = ChromList :: default ( ) ;
305
+ x. add_chrom ( "HLA-DRB1*10:01:01" , 10000 ) ;
306
+ x
307
+ } ;
308
+
309
+ let s = "HLA-DRB1*10:01:01:1000-2000" ;
310
+ let ( chrom_index, start, end) = parse_samtools_region_string ( & chrom_list, s) ;
311
+ assert_eq ! ( chrom_index, 0 ) ;
312
+ assert_eq ! ( start, 999 ) ;
313
+ assert_eq ! ( end, 2000 ) ;
314
+
315
+ let s = "HLA-DRB1*10:01:01" ;
316
+ let ( chrom_index, start, end) = parse_samtools_region_string ( & chrom_list, s) ;
317
+ assert_eq ! ( chrom_index, 0 ) ;
318
+ assert_eq ! ( start, 0 ) ;
319
+ assert_eq ! ( end, 10000 ) ;
320
+ }
271
321
}
0 commit comments