-
Notifications
You must be signed in to change notification settings - Fork 20
feat: add support for over-writing genotypes for males on x non par #1030
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
aabf64f
5a32dbb
d30d2f8
524de9d
d095e9c
116bb9b
55f7bc8
bf285b9
caf5d75
3f78376
68ae34f
7e8311d
cdca1d6
154fa90
fe766bc
82baf0c
bca37f5
d8bdd9b
bc5e368
62031cc
8d3ba82
a5e50d0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
import hail as hl | ||
|
||
from v03_pipeline.lib.annotations import sv | ||
from v03_pipeline.lib.misc.pedigree import Family | ||
from v03_pipeline.lib.model import ReferenceGenome, Sex | ||
|
||
|
||
def overwrite_male_non_par_calls( | ||
mt: hl.MatrixTable, | ||
families: set[Family], | ||
) -> hl.MatrixTable: | ||
male_sample_ids = { | ||
s.sample_id for f in families for s in f.samples.values() if s.sex == Sex.MALE | ||
} | ||
male_sample_ids = ( | ||
hl.set(male_sample_ids) if male_sample_ids else hl.empty_set(hl.str) | ||
) | ||
par_intervals = hl.array( | ||
[ | ||
i | ||
for i in hl.get_reference(ReferenceGenome.GRCh38).par | ||
if i.start.contig == ReferenceGenome.GRCh38.x_contig | ||
], | ||
) | ||
non_par_interval = hl.interval( | ||
par_intervals[0].end, | ||
par_intervals[1].start, | ||
) | ||
bpblanken marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# NB: making use of existing formatting_annotation_fns. | ||
# We choose to annotate & drop here as the sample level | ||
# fields are dropped by the time we format variants. | ||
mt = mt.annotate_rows( | ||
start_locus=sv.start_locus(mt), | ||
end_locus=sv.end_locus(mt), | ||
) | ||
mt = mt.annotate_entries( | ||
GT=hl.if_else( | ||
( | ||
male_sample_ids.contains(mt.s) | ||
& non_par_interval.overlaps( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think this is correct - if a single SV call overlaps both non-par and par regions I think we want to keep the diploid call because its real for the par region. I think we only want to change it if the SV is 100% overlapped by non-par region There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hrm, I think the current behavior is to "obtain those calls overlapping non-PAR regions and update the GT to 1/1 in males." I think your logic to keep as diploid makes sense though, if we're ok with the behavior change. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not like super confident in my knowledge of this data/biology, so it might be better to maintain parity There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it might be worth asking lynn + alba on this. I think we can get a fast answer. |
||
hl.interval( | ||
mt.start_locus, | ||
mt.end_locus, | ||
), | ||
) | ||
& mt.GT.is_het() | ||
), | ||
hl.Call([1], phased=False), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tweaked this to 1) only change het diploid calls and 2) replace with a haploid call. |
||
mt.GT, | ||
), | ||
) | ||
return mt.drop('start_locus', 'end_locus') |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
import unittest | ||
|
||
import hail as hl | ||
|
||
from v03_pipeline.lib.misc.io import import_callset, select_relevant_fields | ||
from v03_pipeline.lib.misc.pedigree import Family, Sample | ||
from v03_pipeline.lib.misc.sample_ids import subset_samples | ||
from v03_pipeline.lib.misc.sv import overwrite_male_non_par_calls | ||
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, Sex | ||
|
||
TEST_SV_VCF = 'v03_pipeline/var/test/callsets/sv_1.vcf' | ||
|
||
|
||
class SVTest(unittest.TestCase): | ||
def test_overwrite_male_non_par_calls(self) -> None: | ||
mt = import_callset(TEST_SV_VCF, ReferenceGenome.GRCh38, DatasetType.SV) | ||
mt = select_relevant_fields( | ||
mt, | ||
DatasetType.SV, | ||
) | ||
mt = subset_samples( | ||
mt, | ||
hl.Table.parallelize( | ||
[{'s': sample_id} for sample_id in ['RGP_164_1', 'RGP_164_2']], | ||
hl.tstruct(s=hl.dtype('str')), | ||
key='s', | ||
), | ||
) | ||
mt = overwrite_male_non_par_calls( | ||
mt, | ||
{ | ||
Family( | ||
family_guid='family_1', | ||
samples={ | ||
'RGP_164_1': Sample(sample_id='RGP_164_1', sex=Sex.FEMALE), | ||
'RGP_164_2': Sample(sample_id='RGP_164_2', sex=Sex.MALE), | ||
}, | ||
), | ||
}, | ||
) | ||
mt = mt.filter_rows(mt.locus.contig == 'chrX') | ||
self.assertEqual( | ||
[ | ||
hl.Locus(contig='chrX', position=3, reference_genome='GRCh38'), | ||
hl.Locus(contig='chrX', position=2781700, reference_genome='GRCh38'), | ||
], | ||
mt.locus.collect(), | ||
) | ||
self.assertEqual( | ||
[ | ||
hl.Call(alleles=[0, 0], phased=False), | ||
# END of this variant < start of the non-par region. | ||
hl.Call(alleles=[0, 1], phased=False), | ||
hl.Call(alleles=[0, 0], phased=False), | ||
hl.Call(alleles=[1], phased=False), | ||
], | ||
mt.GT.collect(), | ||
) | ||
self.assertFalse( | ||
hasattr(mt, 'start_locus'), | ||
) |
This file was deleted.
Uh oh!
There was an error while loading. Please reload this page.