Skip to content

feat: add support for over-writing genotypes for males on x non par #1030

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 22 commits into from
Feb 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion v03_pipeline/lib/misc/io_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ def test_file_size_mb(self) -> None:
# find v03_pipeline/var/test/callsets/mito_1.mt -type f | grep -v 'crc' | xargs ls -alt {} | awk '{sum += $5; print sum}'
# 191310
self.assertEqual(file_size_bytes(TEST_MITO_MT), 191310)
self.assertEqual(file_size_bytes(TEST_SV_VCF), 20040)

def test_compute_hail_n_partitions(self) -> None:
self.assertEqual(compute_hail_n_partitions(23), 1)
Expand Down
52 changes: 52 additions & 0 deletions v03_pipeline/lib/misc/sv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import hail as hl

from v03_pipeline.lib.annotations import sv
from v03_pipeline.lib.misc.pedigree import Family
from v03_pipeline.lib.model import ReferenceGenome, Sex


def overwrite_male_non_par_calls(
    mt: hl.MatrixTable,
    families: set[Family],
) -> hl.MatrixTable:
    """Replace het diploid GT calls with a haploid call for males on chrX non-PAR.

    An entry's ``GT`` is rewritten to ``hl.Call([1], phased=False)`` when all
    of the following hold:

    * the entry's sample (``mt.s``) is a male sample of one of ``families``;
    * the interval from the variant's start locus to its end locus overlaps
      the chrX region lying between the GRCh38 PAR intervals;
    * the existing ``GT`` is het.

    Every other entry keeps its existing ``GT``.

    Args:
        mt: Matrix table with a sample field ``s``, an entry field ``GT`` and
            the row fields needed by ``sv.start_locus`` / ``sv.end_locus``.
        families: Families whose samples' ``sex`` is used to collect the male
            sample ids.

    Returns:
        The matrix table with the updated ``GT`` entries; the temporary
        ``start_locus`` / ``end_locus`` row fields are dropped again.
    """
    male_sample_ids = {
        s.sample_id for f in families for s in f.samples.values() if s.sex == Sex.MALE
    }
    # The empty case is built explicitly with a typed empty set (presumably
    # because an element type cannot be inferred from an empty Python set).
    # The element type must be the Hail type ``hl.tstr``; ``hl.str`` is the
    # to-string conversion function, not a type.
    male_sample_ids = (
        hl.set(male_sample_ids) if male_sample_ids else hl.empty_set(hl.tstr)
    )
    par_intervals = hl.array(
        [
            i
            for i in hl.get_reference(ReferenceGenome.GRCh38).par
            if i.start.contig == ReferenceGenome.GRCh38.x_contig
        ],
    )
    # NOTE(review): assumes the reference lists exactly two chrX PAR intervals
    # in positional order, so the non-PAR region is the gap between them —
    # TODO confirm.
    non_par_interval = hl.interval(
        par_intervals[0].end,
        par_intervals[1].start,
    )
    # NB: making use of existing formatting_annotation_fns.
    # We choose to annotate & drop here as the sample level
    # fields are dropped by the time we format variants.
    mt = mt.annotate_rows(
        start_locus=sv.start_locus(mt),
        end_locus=sv.end_locus(mt),
    )
    # NOTE(review): any overlap with the non-PAR region triggers the overwrite,
    # including SVs that also extend into a PAR region. Review discussion
    # raised whether only SVs fully contained in non-PAR should be changed
    # (the diploid call being real for the PAR part); the overlap behavior was
    # kept for parity with the existing behavior, pending confirmation from
    # the data owners.
    mt = mt.annotate_entries(
        GT=hl.if_else(
            (
                male_sample_ids.contains(mt.s)
                & non_par_interval.overlaps(
                    hl.interval(
                        mt.start_locus,
                        mt.end_locus,
                    ),
                )
                & mt.GT.is_het()
            ),
            # Only het diploid calls are changed, and they are replaced with
            # a haploid call.
            hl.Call([1], phased=False),
            mt.GT,
        ),
    )
    return mt.drop('start_locus', 'end_locus')
61 changes: 61 additions & 0 deletions v03_pipeline/lib/misc/sv_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import unittest

import hail as hl

from v03_pipeline.lib.misc.io import import_callset, select_relevant_fields
from v03_pipeline.lib.misc.pedigree import Family, Sample
from v03_pipeline.lib.misc.sample_ids import subset_samples
from v03_pipeline.lib.misc.sv import overwrite_male_non_par_calls
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, Sex

TEST_SV_VCF = 'v03_pipeline/var/test/callsets/sv_1.vcf'


class SVTest(unittest.TestCase):
    """Tests for the SV helpers in ``v03_pipeline.lib.misc.sv``."""

    def test_overwrite_male_non_par_calls(self) -> None:
        """Check chrX loci and GTs after overwriting calls for one female and one male sample."""
        callset_mt = import_callset(
            TEST_SV_VCF,
            ReferenceGenome.GRCh38,
            DatasetType.SV,
        )
        callset_mt = select_relevant_fields(callset_mt, DatasetType.SV)

        # Restrict the callset to the two samples that make up the test family.
        sample_ht = hl.Table.parallelize(
            [{'s': sample_id} for sample_id in ['RGP_164_1', 'RGP_164_2']],
            hl.tstruct(s=hl.dtype('str')),
            key='s',
        )
        callset_mt = subset_samples(callset_mt, sample_ht)

        family = Family(
            family_guid='family_1',
            samples={
                'RGP_164_1': Sample(sample_id='RGP_164_1', sex=Sex.FEMALE),
                'RGP_164_2': Sample(sample_id='RGP_164_2', sex=Sex.MALE),
            },
        )
        overwritten_mt = overwrite_male_non_par_calls(callset_mt, {family})
        chrx_mt = overwritten_mt.filter_rows(overwritten_mt.locus.contig == 'chrX')

        expected_loci = [
            hl.Locus(contig='chrX', position=3, reference_genome='GRCh38'),
            hl.Locus(contig='chrX', position=2781700, reference_genome='GRCh38'),
        ]
        self.assertEqual(expected_loci, chrx_mt.locus.collect())

        expected_calls = [
            hl.Call(alleles=[0, 0], phased=False),
            # END of this variant < start of the non-par region.
            hl.Call(alleles=[0, 1], phased=False),
            hl.Call(alleles=[0, 0], phased=False),
            hl.Call(alleles=[1], phased=False),
        ]
        self.assertEqual(expected_calls, chrx_mt.GT.collect())

        # The helper's temporary row annotation must not leak into the result.
        self.assertFalse(hasattr(chrx_mt, 'start_locus'))
3 changes: 3 additions & 0 deletions v03_pipeline/lib/misc/terra_data_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
# Metric field names requested from BigQuery — presumably column names;
# the query that consumes them is outside this view.
BIGQUERY_METRICS = [
    'collaborator_sample_id',
    'predicted_sex',
    'contamination_rate',
    'percent_bases_at_20x',
    'mean_coverage',
]
BIGQUERY_RESOURCE = 'bigquery'
# Expected shape of a table name: "datarepo-<word>.datarepo_<word>".
# The separator dot is escaped so that only a literal "." is accepted;
# unescaped, it would match any single character.
TABLE_NAME_VALIDATION_REGEX = r'datarepo-\w+\.datarepo_\w+'
Expand Down
4 changes: 4 additions & 0 deletions v03_pipeline/lib/model/dataset_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,3 +387,7 @@ def export_vcf_annotation_fns(self) -> list[Callable[..., hl.Expression]]:
sv.info,
],
}[self]

@property
def overwrite_male_non_par_calls(self) -> bool:
    """Return True only when this dataset type is ``DatasetType.SV``."""
    return self == DatasetType.SV
4 changes: 4 additions & 0 deletions v03_pipeline/lib/model/definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ def mito_contig(self) -> str:
ReferenceGenome.GRCh38: 'chrM',
}[self]

@property
def x_contig(self) -> str:
    """Return the X chromosome contig name: 'X' for GRCh37, 'chrX' otherwise."""
    if self == ReferenceGenome.GRCh37:
        return 'X'
    return 'chrX'

def contig_recoding(self, include_mt: bool = False) -> dict[str, str]:
recode = {
ReferenceGenome.GRCh37: {
Expand Down

This file was deleted.

Loading