Skip to content

Commit

Permalink
#1246 - Unique variants per VCF
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed Jan 31, 2025
1 parent bef976c commit 7cacd5e
Show file tree
Hide file tree
Showing 8 changed files with 81 additions and 22 deletions.
2 changes: 1 addition & 1 deletion upload/grids.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def get_colmodels(self, remove_server_side_only=False):
class UploadPipelineModifiedVariantsGrid(JqGridUserRowConfig):
model = ModifiedImportedVariant
caption = 'Modified Imported Variant'
fields = ["variant__variantannotation__transcript_version__gene_version__gene__identifier", "variant__variantannotation__transcript_version__gene_version__gene_symbol__symbol",
fields = ["operation", "variant__variantannotation__transcript_version__gene_version__gene__identifier", "variant__variantannotation__transcript_version__gene_version__gene_symbol__symbol",
'old_multiallelic', 'old_variant']

colmodel_overrides = {"variant__variantannotation__transcript_version__gene_version__gene__identifier": {"hidden": True},
Expand Down
4 changes: 2 additions & 2 deletions upload/management/commands/vcf_remove_non_standard_alts.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def handle(self, *args, **options):
else:
columns = line.split("\t")
# Check alt is ok
alt = columns[VCFColumns.ALT]
alt = columns[VCFColumns.ALT].upper()
if alt_standard_bases_pattern.sub("", alt):
skip_reason = None
if alt.startswith("<") and alt.endswith(">"):
Expand All @@ -48,7 +48,7 @@ def handle(self, *args, **options):
else:
skip_reason = "Symbolic variants disabled via settings."
else:
skip_reason = "non-standard bases in ALT sequence"
skip_reason = f"non-standard bases in ALT sequence: {alt}"
if skip_reason:
skipped_records[skip_reason] += 1
continue
Expand Down
21 changes: 21 additions & 0 deletions upload/migrations/0024_modifiedimportedvariant_operation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Generated by Django 4.2.11 on 2025-01-31 12:12

from django.db import migrations, models


class Migration(migrations.Migration):
dependencies = [
("upload", "0023_uploadstep_split_file_rows"),
]

operations = [
migrations.AddField(
model_name="modifiedimportedvariant",
name="operation",
field=models.CharField(
choices=[("N", "Normalization"), ("R", "Removed Duplicate")],
default="N",
max_length=1,
),
),
]
41 changes: 31 additions & 10 deletions upload/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from django.contrib.auth.models import User, Group
from django.core.exceptions import ObjectDoesNotExist, PermissionDenied
from django.db import models, transaction
from django.db.models import Func, F, Value, CharField
from django.db.models.aggregates import Max
from django.db.models.deletion import CASCADE, SET_NULL
from django.db.models.query import QuerySet
Expand All @@ -35,7 +36,7 @@
from snpdb.tasks.soft_delete_tasks import soft_delete_vcfs
from snpdb.user_settings_manager import UserSettingsManager
from upload.models.models_enums import UploadedFileTypes, VCFPipelineStage, \
UploadStepTaskType, TimeFilterMethod, VCFImportInfoSeverity, UploadStepOrigin
UploadStepTaskType, TimeFilterMethod, VCFImportInfoSeverity, UploadStepOrigin, ModifiedImportedVariantOperation
from variantgrid.celery import app


Expand Down Expand Up @@ -241,8 +242,7 @@ def _get_vcf_import_info(self, severity, hide_accepted=True):
kwargs["accepted_date__isnull"] = True
vcf_import_info = []
for import_info in VCFImportInfo.objects.filter(**kwargs).select_subclasses():
msg = import_info.message
if msg:
if import_info.message or import_info.has_more_details:
vcf_import_info.append(import_info)
return vcf_import_info

Expand Down Expand Up @@ -576,13 +576,32 @@ def get_for_pipeline(upload_pipeline) -> 'ModifiedImportedVariants':

@property
def message(self):
qs = self.modifiedimportedvariant_set.filter(old_variant__isnull=False)
num_normalised = qs.count()
if num_normalised:
msg = f"{num_normalised} variants normalised during import."
else:
msg = None
return msg
messages = []
miv_qs = self.modifiedimportedvariant_set.all()
if num_normalised := miv_qs.filter(operation=ModifiedImportedVariantOperation.NORMALIZATION,
old_variant__isnull=False).count():
messages.append(f"{num_normalised} normalised")

multi_allelics_qs = miv_qs.filter(operation=ModifiedImportedVariantOperation.NORMALIZATION,
old_multiallelic__isnull=False)
# The field has a unique id at the end, eg 1 or 2 below:
#
# NC_000001.10|145016034|A|AA,AC|1
# NC_000001.10|145016034|A|AA,AC|2
num_multiallelic = multi_allelics_qs.annotate(stripped_multiallelic=Func(
F('old_multiallelic'),
Value(r'(.*)\|\d+$'),
Value(r'\1'),
function='regexp_replace',
output_field=CharField(),
)).values_list('stripped_multiallelic', flat=True).distinct().count()
if num_multiallelic:
messages.append(f"{num_multiallelic} multi-allelic split")

if num_rmdup := miv_qs.filter(operation=ModifiedImportedVariantOperation.RMDUP).count():
messages.append(f"{num_rmdup} duplicates removed")

return ", ".join(messages)


class ModifiedImportedVariant(models.Model):
Expand All @@ -595,6 +614,8 @@ class ModifiedImportedVariant(models.Model):

import_info = models.ForeignKey(ModifiedImportedVariants, on_delete=CASCADE, null=True)
variant = models.ForeignKey(Variant, on_delete=CASCADE)
operation = models.CharField(max_length=1, choices=ModifiedImportedVariantOperation.choices,
default=ModifiedImportedVariantOperation.NORMALIZATION)
# OLD_MULTIALLELIC from vt: @see https://genome.sph.umich.edu/wiki/Vt#Decompose
old_multiallelic = models.TextField(null=True)
# OLD_VARIANT from vt: @see https://genome.sph.umich.edu/wiki/Vt#Normalization
Expand Down
5 changes: 5 additions & 0 deletions upload/models/models_enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,8 @@ class VCFImportInfoSeverity(models.TextChoices):
class UploadStepOrigin(models.TextChoices):
USER_ADDITION = 'A', "User Addition"
IMPORT_TASK_FACTORY = 'I', "Import Task Factory"


class ModifiedImportedVariantOperation(models.TextChoices):
NORMALIZATION = 'N', "Normalization"
RMDUP = 'R', "Removed Duplicate"
5 changes: 3 additions & 2 deletions upload/vcf/abstract_bulk_vcf_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
from library.genomics.vcf_enums import VCFSymbolicAllele
from library.genomics.vcf_utils import vcf_get_ref_alt_svlen_and_modification
from snpdb.variant_pk_lookup import VariantPKLookup
from upload.models import UploadStep, ModifiedImportedVariant, UploadStepTaskType, VCFPipelineStage, SimpleVCFImportInfo
from upload.models import UploadStep, ModifiedImportedVariant, UploadStepTaskType, VCFPipelineStage, \
SimpleVCFImportInfo, ModifiedImportedVariantOperation
from upload.tasks.vcf.import_sql_copy_task import ImportModifiedImportedVariantSQLCopyTask
from upload.vcf.sql_copy_files import write_sql_copy_csv

Expand Down Expand Up @@ -108,7 +109,7 @@ def add_modified_imported_variant(self, variant: cyvcf2.Variant, variant_hash, m
for ov in ModifiedImportedVariant.bcftools_format_old_variant(bcftools_old_variant, svlen, self.genome_build):
# These 2 need to be in sync
miv_hash_list.append(variant_hash)
miv_list.append((old_multiallelic, old_variant, ov))
miv_list.append((ModifiedImportedVariantOperation.NORMALIZATION, old_multiallelic, old_variant, ov))

def process_modified_imported_variants(self, variant_ids_by_hash):
modified_imported_variants = []
Expand Down
23 changes: 17 additions & 6 deletions upload/vcf/bulk_genotype_vcf_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from snpdb.models import CohortGenotype, VariantCoordinate, VCFFilter
from snpdb.models.models_enums import ProcessingStatus
from upload.models import UploadPipeline, PipelineFailedJobTerminateEarlyException, \
VCFImporter, UploadStep, UploadStepTaskType, VCFPipelineStage
VCFImporter, UploadStep, UploadStepTaskType, VCFPipelineStage, ModifiedImportedVariantOperation
from upload.tasks.vcf.import_sql_copy_task import ImportCohortGenotypeSQLCopyTask
from upload.vcf.abstract_bulk_vcf_processor import AbstractBulkVCFProcessor
from upload.vcf.sql_copy_files import write_sql_copy_csv, COHORT_GENOTYPE_HEADER
Expand Down Expand Up @@ -451,13 +451,16 @@ def batch_process_check(self, minimum_insert_size=None):
variant_ids = self.variant_pk_lookup.get_variant_ids(self.variant_hashes)
self.set_max_variant(self.variant_hashes, variant_ids)

# this can create MIVs (rmdupes)
self.process_cohort_genotypes(self.variant_hashes, variant_ids)

if self.modified_imported_variants:
variant_ids_by_hash = dict(zip(self.variant_hashes, variant_ids))
self.process_modified_imported_variants(variant_ids_by_hash)

self.process_cohort_genotypes(variant_ids)
self.variant_hashes = []

def process_cohort_genotypes(self, variant_ids):
def process_cohort_genotypes(self, variant_hashes, variant_ids):
cohort_genotypes_common = []
cohort_genotypes_rare = []

Expand All @@ -470,8 +473,17 @@ def process_cohort_genotypes(self, variant_ids):
raise ValueError(f"Number of variant ids ({num_variants}) != num {name} ({len(array)})")

# If you add any columns here, need to adjust COHORT_GT_NUM_ADDED_FIELDS
for variant_id, filters, cohort_gt, gnomad_af in zip(variant_ids, self.variant_filters,
self.cohort_genotypes, self.variant_gnomad_af):
last_variant_id = None
for variant_hash, variant_id, filters, cohort_gt, gnomad_af in zip(variant_hashes, variant_ids,
self.variant_filters,
self.cohort_genotypes, self.variant_gnomad_af):
# File is sorted, so dupes will be next to each other. Remove and make ModifiedImportedVariant
if variant_id == last_variant_id:
self.modified_imported_variant_hashes.append(variant_hash)
self.modified_imported_variants.append((ModifiedImportedVariantOperation.RMDUP, None, None, None))
continue

last_variant_id = variant_id
common = gnomad_af and variant_id not in self.uncommon_variant_ids
if common:
cgc_id = self.cohort_genotype_collection.common_collection_id
Expand Down Expand Up @@ -500,7 +512,6 @@ def process_cohort_genotypes(self, variant_ids):
self.create_cohort_genotype_job(table_name, num_cohort_genotypes, cohort_genotypes_filename)
self.cohort_genotype_file_id += 1

self.variant_hashes = []
self.cohort_genotypes = []
self.variant_gnomad_af = []
self.check_pipeline_for_failures() # Need to do this every so often
Expand Down
2 changes: 1 addition & 1 deletion upload/vcf/sql_copy_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
'samples_allele_depth', 'samples_allele_frequency', 'samples_read_depth',
'samples_genotype_quality', 'samples_phred_likelihood', 'samples_filters',
'format', 'info']
MODIFIED_IMPORTED_VARIANT_HEADER = ['import_info_id', 'variant_id',
MODIFIED_IMPORTED_VARIANT_HEADER = ['import_info_id', 'variant_id', 'operation',
'old_multiallelic', 'old_variant', 'old_variant_formatted']

GENE_COVERAGE_HEADER = [
Expand Down

0 comments on commit 7cacd5e

Please sign in to comment.