From 6d297ef5f6c6cb70cd3ac149539c69603d355836 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Fri, 24 Nov 2023 16:36:25 +1030 Subject: [PATCH 01/29] #850 - new VEP 110 fields --- annotation/management/commands/vep_run.py | 9 +++- annotation/vep_annotation.py | 49 ++++++++++++++----- .../settings/components/default_settings.py | 6 +++ 3 files changed, 51 insertions(+), 13 deletions(-) diff --git a/annotation/management/commands/vep_run.py b/annotation/management/commands/vep_run.py index fe6b2aa7e..e93ee9927 100644 --- a/annotation/management/commands/vep_run.py +++ b/annotation/management/commands/vep_run.py @@ -20,10 +20,12 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('--test', action='store_true') + parser.add_argument('--cnv', action='store_true') parser.add_argument('--genome-build', required=True) def handle(self, *args, **options): test = options["test"] + cnv = options["cnv"] build_name = options["genome_build"] genome_build = GenomeBuild.get_name_or_alias(build_name) @@ -47,9 +49,14 @@ def handle(self, *args, **options): vcf_filename = os.path.join(settings.ANNOTATION_VCF_DUMP_DIR, f"{base_name}.vcf") output_filename = os.path.join(output_dir, f"{base_name}.{vep_suffix}.vcf.gz") + if cnv: + pipeline_type = VariantAnnotationPipelineType.CNV + else: + pipeline_type = VariantAnnotationPipelineType.STANDARD + return_code, std_out, std_err = run_vep(vcf_filename, output_filename, genome_build, genome_build.annotation_consortium, - VariantAnnotationPipelineType.STANDARD) + pipeline_type) if return_code != 0: logging.info(std_out) logging.error(std_err) diff --git a/annotation/vep_annotation.py b/annotation/vep_annotation.py index d9cfde420..fd80fbd02 100644 --- a/annotation/vep_annotation.py +++ b/annotation/vep_annotation.py @@ -131,26 +131,28 @@ def get_vep_command(vcf_filename, output_filename, genome_build: GenomeBuild, an ]) # Plugins that require data - ok for these to fail when retrieving vep config - PLUGINS = 
{VEPPlugin.MASTERMIND: lambda: f"Mastermind,{vc['mastermind']},1", # 1 to not filter - VEPPlugin.MAXENTSCAN: lambda: f"MaxEntScan,{vc['maxentscan']}", - VEPPlugin.DBNSFP: lambda: _get_dbnsfp_plugin_command(genome_build, vc), - VEPPlugin.DBSCSNV: lambda: f"dbscSNV,{vc['dbscsnv']}", - VEPPlugin.SPLICEAI: lambda: f"SpliceAI,snv={vc['spliceai_snv']},indel={vc['spliceai_indel']}"} + plugin_data_func = { + VEPPlugin.MASTERMIND: lambda: f"Mastermind,{vc['mastermind']},1", # 1 to not filter + VEPPlugin.MAXENTSCAN: lambda: f"MaxEntScan,{vc['maxentscan']}", + VEPPlugin.DBNSFP: lambda: _get_dbnsfp_plugin_command(genome_build, vc), + VEPPlugin.DBSCSNV: lambda: f"dbscSNV,{vc['dbscsnv']}", + VEPPlugin.SPLICEAI: lambda: f"SpliceAI,snv={vc['spliceai_snv']},indel={vc['spliceai_indel']}" + } if vc.columns_version >= 2: cmd.extend(["--plugin", "NMD"]) - for vep_plugin, plugin_arg_func in PLUGINS.items(): - try: - cmd.extend(["--plugin", plugin_arg_func()]) - except Exception as e: - logging.warning(e) - logging.warning("No annotation set for plugin: %s", vep_plugin) + if vc.columns_version >= 3: + plugin_data_func.update({ + VEPPlugin.ALPHAMISSENSE: lambda: f"AlphaMissense,file={vc['alphamissense']}", + VEPPlugin.MAVEDB: lambda: f"MaveDB,file={vc['mave']},single_aminoacid_changes=0,transcript_match=0 ", + }) # Custom for vep_custom, prefix in dict(VEPCustom.choices).items(): try: - if fields := ColumnVEPField.get_source_fields(genome_build, vep_custom=vep_custom): + q = ColumnVEPField.get_columns_version_q(vc.columns_version) + if fields := ColumnVEPField.get_source_fields(genome_build, q, vep_custom=vep_custom): prefix_lc = prefix.lower() if cfg := vc[prefix_lc]: # annotation settings are lower case cmd.extend(_get_custom_params_list(fields, prefix, cfg)) @@ -162,6 +164,29 @@ def get_vep_command(vcf_filename, output_filename, genome_build: GenomeBuild, an # Not all annotations available for all builds - ok to just warn logging.warning("Skipped custom annotation: %s", prefix) + else: 
+ plugin_data_func = { + # TODO: Need to decide on overlap criteria + # percentage : percentage overlap between SVs (default: 80) + # reciprocal : calculate reciprocal overlap, options: 0 or 1. (default: 0) + # (overlap is expressed as % of input SV by default) + # cols : colon delimited list of data types to return from the INFO fields (only AF by default) + # same_type : 1/0 only report SV of the same type (eg deletions for deletions, off by default) + # distance : the distance the ends of the overlapping SVs should be within. + # match_type : only report reference SV which lie within or completely surround the input SV + # options: within, surrounding + VEPPlugin.STRUCTURALVARIANTOVERLAP: lambda: f"StructuralVariantOverlap,file={vc['structuralvariantoverlap']}", + } + + for vep_plugin, plugin_arg_func in plugin_data_func.items(): + try: + cmd.extend(["--plugin", plugin_arg_func()]) + except Exception as e: + logging.warning(e) + logging.warning("No annotation set for plugin: %s", vep_plugin) + + + return cmd diff --git a/variantgrid/settings/components/default_settings.py b/variantgrid/settings/components/default_settings.py index b273b5d51..db9986e9d 100644 --- a/variantgrid/settings/components/default_settings.py +++ b/variantgrid/settings/components/default_settings.py @@ -230,11 +230,13 @@ # so you can change just that variable and have everything else work # The names correspond to VEPPlugin or VEPCustom entries (but lower case) "vep_config": { + "alphamissense": "annotation_data/GRCh37/AlphaMissense_hg19.tsv.gz", "cosmic": "annotation_data/GRCh37/CosmicCodingMuts_v95_20211101_grch37.normal.vcf.gz", "dbnsfp": "annotation_data/GRCh37/dbNSFP4.0a.grch37.stripped.gz", "dbscsnv": "annotation_data/GRCh37/dbscSNV1.1_GRCh37.txt.gz", "gnomad2": "annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz", "mastermind": "annotation_data/GRCh37/mastermind_cited_variants_reference-2022.04.02-grch37.vcf.gz", + "mave": None, # n/a for GRCh37 "maxentscan": 
"annotation_data/all_builds/maxentscan", 'phastcons100way': "annotation_data/GRCh37/hg19.100way.phastCons.bw", 'phastcons46way': "annotation_data/GRCh37/hg19.phastCons46way.placental.bw", @@ -245,6 +247,7 @@ "repeatmasker": "annotation_data/GRCh37/repeatmasker_hg19.bed.gz", "spliceai_snv": "annotation_data/GRCh37/spliceai_scores.raw.snv.hg19.vcf.gz", "spliceai_indel": "annotation_data/GRCh37/spliceai_scores.raw.indel.hg19.vcf.gz", + "structuralvariantoverlap": "annotation_data/GRCh37/gnomad_v2.1_sv.sites.grch37.converted.vcf.gz", "topmed": "annotation_data/GRCh37/TOPMED_GRCh37.vcf.gz", "uk10k": "annotation_data/GRCh37/UK10K_COHORT.20160215.sites.vcf.gz", } @@ -262,12 +265,14 @@ # so you can change just that variable and have everything else work # The names correspond to VEPPlugin or VEPCustom entries (but lower case) "vep_config": { + "alphamissense": "annotation_data/GRCh38/AlphaMissense_hg38.tsv.gz", "cosmic": "annotation_data/GRCh38/CosmicCodingMuts_v95_20211101_grch38.normal.vcf.gz", "dbnsfp": "annotation_data/GRCh38/dbNSFP4.0a.grch38.stripped.gz", "dbscsnv": "annotation_data/GRCh38/dbscSNV1.1_GRCh38.txt.gz", "gnomad2": "annotation_data/GRCh38/gnomad2.1.1_GRCh38_combined_af.vcf.bgz", "gnomad3": "annotation_data/GRCh38/gnomad3.1_GRCh38_merged.vcf.bgz", "mastermind": "annotation_data/GRCh38/mastermind_cited_variants_reference-2022.04.02-grch38.vcf.gz", + "mave": "annotation_data/GRCh38/MaveDB_variants.tsv.gz", "maxentscan": "annotation_data/all_builds/maxentscan", 'phastcons100way': "annotation_data/GRCh38/hg38.phastCons100way.bw", 'phastcons46way': None, # n/a for GRCh38 @@ -278,6 +283,7 @@ "repeatmasker": "annotation_data/GRCh38/repeatmasker_hg38.bed.gz", "spliceai_snv": "annotation_data/GRCh38/spliceai_scores.raw.snv.hg38.vcf.gz", "spliceai_indel": "annotation_data/GRCh38/spliceai_scores.raw.indel.hg38.vcf.gz", + "structuralvariantoverlap": "annotation_data/GRCh38/gnomad.v4.0.sv.merged.vcf.gz", "topmed": "annotation_data/GRCh38/TOPMED_GRCh38_20180418.vcf.gz", 
"uk10k": "annotation_data/GRCh38/UK10K_COHORT.20160215.sites.GRCh38.vcf.gz", } From 9922a51658fad64c011c2bab8ea10534b68eb13e Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Mon, 27 Nov 2023 22:48:13 +1030 Subject: [PATCH 02/29] #938 - Add X/hemi stuff --- .../annotation_data/generate_annotation/gnomad4_data.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/annotation/annotation_data/generate_annotation/gnomad4_data.py b/annotation/annotation_data/generate_annotation/gnomad4_data.py index 8f68fc48d..01f8add6c 100755 --- a/annotation/annotation_data/generate_annotation/gnomad4_data.py +++ b/annotation/annotation_data/generate_annotation/gnomad4_data.py @@ -20,6 +20,7 @@ COUNTS = ['AC', 'AN'] OTHER_INFOS = ["nhomalt", "non_par", "faf95", "faf99", "fafmax_faf95_max", "fafmax_faf99_max"] GNOMAD_SUB_POPS = ["afr", "amr", "asj", "eas", "fin", "mid", "nfe", "remaining", "sas"] # Will get AF for each +CHR_X_ONLY = ["AC_XY", "AN_XY", "AF_XY"] # popmax/grpmax is calculated using non-bottlenecked genetic ancestry groups BOTTLENECKED_SUB_POPS = ["asj", "fin", "mid", "remaining"] @@ -85,6 +86,8 @@ def write_scripts(args): # To remove all INFO tags except "FOO" and "BAR", use "^INFO/FOO,INFO/BAR" # @see https://samtools.github.io/bcftools/bcftools.html#annotate """ my_columns = columns.copy() + if chrom == "X": + my_columns.extend(CHR_X_ONLY) info_columns = [f"INFO/{i}" for i in my_columns] keep_columns = ','.join(info_columns) # AC/AN are special format fields @@ -112,6 +115,7 @@ def write_scripts(args): # Merge exomes/genome VCFs # if we leave out rule, will take from 1st file which is ok for PAR as will be the same skip_columns = {"non_par"} + # Default rule = "sum" if not below (or skipped) rule_ops = { # Will take higher of whatever is there in genomes/exomes "faf95": "max", @@ -120,7 +124,7 @@ def write_scripts(args): "fafmax_faf99_max": "max", } info_rules = [] - for c in columns: + for c in my_columns: if c not in skip_columns: op = 
rule_ops.get(c, "sum") info_rules.append(f"{c}:{op}") @@ -193,6 +197,9 @@ def write_vcf_header(): ##INFO= ##INFO= ##INFO= +##INFO= +##INFO= +##INFO= ##INFO= ##INFO= ##INFO= From f701538a8d8082811cf4b3367fcd905b345f1f6e Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Tue, 28 Nov 2023 13:36:26 +1030 Subject: [PATCH 03/29] Consistent names (we use filename to obtain gnomAD version in VAV) --- .../generate_annotation/gnomad4_data.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/annotation/annotation_data/generate_annotation/gnomad4_data.py b/annotation/annotation_data/generate_annotation/gnomad4_data.py index 01f8add6c..7dd190e1b 100755 --- a/annotation/annotation_data/generate_annotation/gnomad4_data.py +++ b/annotation/annotation_data/generate_annotation/gnomad4_data.py @@ -14,6 +14,7 @@ from argparse import ArgumentParser from datetime import datetime +GNOMAD_VERSION="4.0" GRCh38 = "GRCh38" # We deliberately leave out AF and "grpmax" stuff as we recalculate that later in 'calculate_allele_frequency' @@ -31,7 +32,7 @@ def get_args(): parser.add_argument("--test", action='store_true', help="Only download 5k of each file.") # parser.add_argument("--genome-fasta", help='Fasta (correct for build)') parser.add_argument("--chrom-mapping-file", help='bcftools chromosome conversion') - parser.add_argument("--version", help='gnomAD version (default: 4.0)', default='4.0') + parser.add_argument("--version", help=f'gnomAD version (default: {GNOMAD_VERSION})', default=GNOMAD_VERSION) parser.add_argument("--path", help='Colon separated paths for tabix/bgzip/vt/bcftools') parser.add_argument("--gnomad-input-vcf") parser.add_argument("--af-output-vcf") @@ -75,7 +76,7 @@ def write_scripts(args): chrom_scripts = [] af_vcfs = [] for chrom in CHROMOSOMES: - prefix = f"gnomad4_chr{chrom}" + prefix = f"gnomad{GNOMAD_VERSION}_{GRCh38}_chr{chrom}" chrom_script = f"{prefix}.sh" chrom_scripts.append(chrom_script) with open(chrom_script, "w") as cs: @@ -94,7 
+95,7 @@ def write_scripts(args): output_vcf = f"{prefix}_{vcf_type}.filtered_info.vcf.gz" annotate_args = f"--rename-chrs={args.chrom_mapping_file}" - gnomad_vcf_filename = f"gnomad.{vcf_type}.v4.0.sites.chr{chrom}.vcf.bgz" + gnomad_vcf_filename = f"gnomad.{vcf_type}.{GNOMAD_VERSION}_{GRCh38}.sites.chr{chrom}.vcf.bgz" # bcftools merge doesn't work with type='A' # bcftools now works with AC/AN etc - see https://github.com/samtools/bcftools/issues/1394 @@ -150,8 +151,8 @@ def write_scripts(args): with open(merge_script_filename, "w") as ms: ms.write(bash_header) quoted_files = ' '.join([f"'{f}'" for f in af_vcfs]) - gnomad_combined_af_vcf = f"gnomad4_combined_af.vcf.bgz" - ms.write(f"zcat {vcf_header} {quoted_files} | bgzip > {gnomad_combined_af_vcf}\n") + gnomad_combined_af_vcf = f"gnomad{GNOMAD_VERSION}_{GRCh38}_combined_af.vcf.bgz" + ms.write(f"cat {vcf_header} {quoted_files} > {gnomad_combined_af_vcf}\n") ms.write(f"tabix {gnomad_combined_af_vcf}\n") launch_script_filename = f"gnomad4_launch.sh" From 6c4bd6460c3f73d94b3b78ae20e92704e12bfef5 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Tue, 28 Nov 2023 14:29:13 +1030 Subject: [PATCH 04/29] consolidate download scripts --- .../generate_annotation/gnomad2.1_download.sh | 19 +++++++++++ .../generate_annotation/gnomad4_download.sh | 22 +++++++++++++ .../gnomad4_download_exomes.sh | 10 ------ .../gnomad4_download_genomes.sh | 10 ------ .../gnomad4_download_structural.sh | 33 ------------------- 5 files changed, 41 insertions(+), 53 deletions(-) create mode 100644 annotation/annotation_data/generate_annotation/gnomad2.1_download.sh create mode 100644 annotation/annotation_data/generate_annotation/gnomad4_download.sh delete mode 100644 annotation/annotation_data/generate_annotation/gnomad4_download_exomes.sh delete mode 100644 annotation/annotation_data/generate_annotation/gnomad4_download_genomes.sh delete mode 100644 annotation/annotation_data/generate_annotation/gnomad4_download_structural.sh diff --git 
a/annotation/annotation_data/generate_annotation/gnomad2.1_download.sh b/annotation/annotation_data/generate_annotation/gnomad2.1_download.sh new file mode 100644 index 000000000..b629dee4a --- /dev/null +++ b/annotation/annotation_data/generate_annotation/gnomad2.1_download.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# gnomad v2.1.1 + +# Exomes +for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do + wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/2.1.1/vcf/exomes/gnomad.exomes.r2.1.1.sites.${chrom}.vcf.bgz + wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/2.1.1/vcf/exomes/gnomad.exomes.r2.1.1.sites.${chrom}.vcf.bgz.tbi +done + +# Genomes +for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do + wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.${chrom}.vcf.bgz + wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.${chrom}.vcf.bgz.tbi +done + +# Structural +wget https://gnomad-public-us-east-1.s3.amazonaws.com/papers/2019-sv/gnomad_v2.1_sv.sites.vcf.gz +wget https://gnomad-public-us-east-1.s3.amazonaws.com/papers/2019-sv/gnomad_v2.1_sv.sites.vcf.gz.tbi diff --git a/annotation/annotation_data/generate_annotation/gnomad4_download.sh b/annotation/annotation_data/generate_annotation/gnomad4_download.sh new file mode 100644 index 000000000..59ec1e333 --- /dev/null +++ b/annotation/annotation_data/generate_annotation/gnomad4_download.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# gnomad v4.0 + +# Exomes +for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do + wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr${chrom}.vcf.bgz + wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr${chrom}.vcf.bgz.tbi +done + +# Genomes +for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
18 19 20 21 22 X Y; do + wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr${chrom}.vcf.bgz + wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr${chrom}.vcf.bgz.tbi +done + +# Structural +for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do + GNOMAD_VCF=gnomad.v4.0.sv.chr${chrom}.vcf.gz + wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/genome_sv/${GNOMAD_VCF} + wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/genome_sv/${GNOMAD_VCF}.tbi +done \ No newline at end of file diff --git a/annotation/annotation_data/generate_annotation/gnomad4_download_exomes.sh b/annotation/annotation_data/generate_annotation/gnomad4_download_exomes.sh deleted file mode 100644 index 1375d380b..000000000 --- a/annotation/annotation_data/generate_annotation/gnomad4_download_exomes.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - - -# Structural variants - -# gnomad v4 -for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do - wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr${chrom}.vcf.bgz - wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr${chrom}.vcf.bgz.tbi -done \ No newline at end of file diff --git a/annotation/annotation_data/generate_annotation/gnomad4_download_genomes.sh b/annotation/annotation_data/generate_annotation/gnomad4_download_genomes.sh deleted file mode 100644 index dcb8b239d..000000000 --- a/annotation/annotation_data/generate_annotation/gnomad4_download_genomes.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - - -# Structural variants - -# gnomad v4 -for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do - wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr${chrom}.vcf.bgz - wget 
https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr${chrom}.vcf.bgz.tbi -done \ No newline at end of file diff --git a/annotation/annotation_data/generate_annotation/gnomad4_download_structural.sh b/annotation/annotation_data/generate_annotation/gnomad4_download_structural.sh deleted file mode 100644 index a2098050e..000000000 --- a/annotation/annotation_data/generate_annotation/gnomad4_download_structural.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -THIS_DIR=$(realpath "$(dirname "${BASH_SOURCE[0]}")") - -# Structural variants -SV_COLUMNS=INFO/SVLEN,INFO/SVTYPE,INFO/END -COLS=INFO/AC,INFO/AN,INFO/AF -OTHER_COUNTS=INFO/N_HOMREF,INFO/N_HET,INFO/N_HOMALT -SUBPOPS=INFO/afr_AF,INFO/amr_AF,INFO/asj_AF,INFO/eas_AF,INFO/fin_AF,INFO/mid_AF,INFO/nfe_AF,INFO/oth_AF,INFO/sas_AF - -KEEP_COLUMNS=${SV_COLUMNS},${COLS},${OTHER_COUNTS},${SUBPOPS} -CHROM_MAPPING_FILE=${THIS_DIR}/../../../snpdb/genome/chrom_mapping_GRCh38.map -GENOME_FASTA=/data/annotation/fasta/GCF_000001405.40_GRCh38.p14_genomic.fna.gz - -# gnomad v4 -for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do - GNOMAD_VCF=gnomad.v4.0.sv.chr${chrom}.vcf.gz - wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/genome_sv/${GNOMAD_VCF} - wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/genome_sv/${GNOMAD_VCF}.tbi - - OUTPUT_VCF= - # bcftools annotate --exclude 'AC=0' --remove '^{KEEP_COLUMNS}' --rename-chrs={CHROM_MAPPING_FILE} | vt normalize - -r ${GENOME_FASTA} -o + | vt uniq + -o ${OUTPUT_VCF} - -done - - -# OTHER_INFOS = ["AC_popmax", "AN_popmax", "AF_popmax", "popmax", "nhomalt", "nhomalt_popmax", "nonpar"] -# GNOMAD_SUB_POPS = ["afr", "amr", "asj", "eas", "fin", "mid", "nfe", "oth", "sas"] # Will get AF for each - -# These have been removed in v4 - "AC_popmax", "AN_popmax", "AF_popmax" -# nonpar is now "par" - - From 5860cc153c550151e5428c884a41fcf2ecab27cd Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: 
Wed, 29 Nov 2023 15:58:49 +1030 Subject: [PATCH 05/29] #938 - gnomAD - consolidate scripts --- .../generate_annotation/gnomad2_data.py | 270 ------------------ .../gnomad3.1.2_download.sh | 7 + .../gnomad3_create_genome_scripts.py | 114 -------- ...mad4_download.sh => gnomad4.0_download.sh} | 0 .../{gnomad4_data.py => gnomad_data.py} | 210 +++++++++----- 5 files changed, 150 insertions(+), 451 deletions(-) delete mode 100755 annotation/annotation_data/generate_annotation/gnomad2_data.py create mode 100644 annotation/annotation_data/generate_annotation/gnomad3.1.2_download.sh delete mode 100755 annotation/annotation_data/generate_annotation/gnomad3_create_genome_scripts.py rename annotation/annotation_data/generate_annotation/{gnomad4_download.sh => gnomad4.0_download.sh} (100%) rename annotation/annotation_data/generate_annotation/{gnomad4_data.py => gnomad_data.py} (50%) diff --git a/annotation/annotation_data/generate_annotation/gnomad2_data.py b/annotation/annotation_data/generate_annotation/gnomad2_data.py deleted file mode 100755 index bf03d8218..000000000 --- a/annotation/annotation_data/generate_annotation/gnomad2_data.py +++ /dev/null @@ -1,270 +0,0 @@ -#!/usr/bin/env python3 -""" -We want to do this per-chrom so we can process in parallel - -Steps are: - 1. Download exomes.vcf + genome.vcf, removing most INFO fields before writing to disk (to reduce disk space) - 2. Merge exome + genome, summing counts - 3. Run through this script with --af to calculate allele frequency, write TSV (more efficient than VCF) - 4. 
Cat them all together again -""" - -import gzip -import os -from argparse import ArgumentParser -from datetime import datetime - -from cyvcf2 import VCF - -GRCh37 = "GRCh37" -GRCh38 = "GRCh38" -BUILDS = [GRCh37, GRCh38] - -COUNTS = ['AC', 'AN'] -OTHER_INFOS = ["nhomalt"] -GNOMAD_SUB_POPS = ["afr", "amr", "asj", "eas", "fin", "nfe", "oth", "sas"] # Will get AN/AC for these -CHROMOSOMES = list(map(str, range(1, 23))) + ['X', 'Y'] - - -def get_args(): - parser = ArgumentParser(description="Merge exome+genome VCFs for VariantGrid VEP pipeline") - parser.add_argument("--test", action='store_true', help="Only download 5k of each file.") - parser.add_argument("--genome-build", help='GRCh37 or GRCh38') - parser.add_argument("--genome-fasta", help='Fasta (correct for build)') - parser.add_argument("--genome-fasta-has-chr", type=bool, help='GnomAD has "chr1", set to false if ref uses "1"') - parser.add_argument("--version", help='gnomAD version (default: 2.1.1)', default='2.1.1') - parser.add_argument("--gnomad-input-vcf") - parser.add_argument("--af-output-vcf") - - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument('--scripts', action='store_true', help="Generate scripts") - group.add_argument('--af', action='store_true', help="Calculate allele frequency from VCF") - - args = parser.parse_args() - if args.scripts: - if args.genome_build is None or args.genome_build not in BUILDS: - parser.error(f"--genome-build must be one of {','.join(BUILDS)}") - if args.genome_fasta is None: - parser.error("--genome-fasta required for --scripts") - else: - if args.gnomad_input_vcf is None: - parser.error("--gnomad-input-vcf required for --af") - if args.af_output_vcf is None: - parser.error("--af-output-vcf required for --af") - - return args - - -def main(args): - if args.scripts: - write_scripts(args) - else: - calculate_allele_frequency(args.gnomad_input_vcf, args.af_output_vcf) - - -def write_scripts(args): - genome_build = args.genome_build - version = 
args.version - genome_fasta = args.genome_fasta - if args.test: - # only download 5k lines of file - extra_filters = "| bgzip -d | head -5000 | bcftools view -O z" - else: - extra_filters = "" # nothing - - if not args.genome_fasta_has_chr: - chrom_mapping_file = None - else: - chrom_mapping_file = write_chrom_mapping_file() - - columns = get_columns() - bash_header = "#!/bin/bash\nset -e # fail on error\n" - - chrom_scripts = [] - af_vcfs = [] - for chrom in CHROMOSOMES: - prefix = f"gnomad_{genome_build}_chr{chrom}" - chrom_script = f"{prefix}.sh" - chrom_scripts.append(chrom_script) - with open(chrom_script, "w") as cs: - cs.write(bash_header) - - output_vcfs = [] - for vcf_type in ["exomes", "genomes"]: - if genome_build == GRCh37: - url = f"https://storage.googleapis.com/gnomad-public/release/{version}/vcf/{vcf_type}/gnomad.{vcf_type}.r{version}.sites.{chrom}.vcf.bgz" - else: - url = f"https://storage.googleapis.com/gnomad-public/release/{version}/liftover_grch38/vcf/{vcf_type}/gnomad.{vcf_type}.r{version}.sites.{chrom}.liftover_grch38.vcf.bgz" - - # To remove all INFO tags except "FOO" and "BAR", use "^INFO/FOO,INFO/BAR" - # @see https://samtools.github.io/bcftools/bcftools.html#annotate """ - my_columns = columns.copy() - if vcf_type == "genomes": # No SAS in genomes - if chrom == 'Y': - continue # No Y in genomes - my_columns.remove("AC_sas") - my_columns.remove("AN_sas") - - info_columns = [f"INFO/{i}" for i in my_columns] - keep_columns = ','.join(info_columns) # AC/AN are special format fields - output_vcf = f"{prefix}_{vcf_type}.filtered_info.vcf.gz" - if chrom_mapping_file: - annotate_args = f"--rename-chrs={chrom_mapping_file}" - else: - annotate_args = "" - - # bcftools merge doesn't work with type='A' or special AC/AN INFO fields w/o a FORMAT (which gnomAD doesn't have) - modify_fields = "sed -e 's/,Number=A,/,Number=1,/' -e 's/ID=AC,/ID=AC_count,/' -e 's/ID=AN,/ID=AN_count,/' -e 's/AC=/AC_count=/' -e 's/AN=/AN_count=/'" - # gnomAD appears to 
already be decomposed - vt decompose + -s -o + - cs.write("\necho Download and clean as we go to save disk\n") - cs.write(f"wget --quiet -O - {url} {extra_filters} | bcftools annotate --exclude 'AC=0' --remove '^{keep_columns}' {annotate_args} | {modify_fields} | vt normalize - -r {genome_fasta} -o + | vt uniq + -o {output_vcf}\n") - output_vcfs.append(output_vcf) - - combined_vcf = f"{prefix}.combined.vcf.gz" - if len(output_vcfs) == 1: # Just 1, rename it - output_vcf = output_vcfs[0] - cs.write(f"mv {output_vcf} {combined_vcf}\n") - else: - for ov in output_vcfs: - cs.write(f"tabix {ov}\n") - - # Merge - adding them together... - renamed_columns = [f"{c}_count" if c in ['AC', 'AN'] else c for c in columns] - info_rules = [f"{c}:sum" for c in renamed_columns] - info_rules_arg = ','.join(info_rules) - cs.write("\n\necho Merging VCFs - will keep flags from genomes.\n") - cs.write(f"bcftools merge --merge none --info-rules '{info_rules_arg}' '{output_vcfs[0]}' '{output_vcfs[1]}' -O z -o {combined_vcf}\n") - - # Now process them with this script - cs.write("\n\necho Calculate Allele Frequency\n") - script_filename = os.path.realpath(__file__) - allele_frequency_vcf = f"{prefix}.af.vcf.gz" - cs.write(f"{script_filename} --af --gnomad-input-vcf={combined_vcf} --af-output-vcf={allele_frequency_vcf}\n") - af_vcfs.append(allele_frequency_vcf) - - if args.test: - break # Only do 1 chrom - - # Write merge script - merge_script_filename = f"gnomad_{genome_build}_merge.sh" - vcf_header = write_vcf_header() - - with open(merge_script_filename, "w") as ms: - ms.write(bash_header) - quoted_files = ' '.join([f"'{f}'" for f in af_vcfs]) - gnomad_combined_af_vcf = f"gnomad_{genome_build}_combined_af.vcf.bgz" - ms.write(f"gzcat {vcf_header} {quoted_files} | bgzip > {gnomad_combined_af_vcf}\n") - ms.write(f"tabix {gnomad_combined_af_vcf}\n") - - launch_script_filename = f"gnomad_{genome_build}_launch.sh" - with open(launch_script_filename, "w") as ms: - ms.write(bash_header) - 
ms.write('SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}")\n') - for cs in chrom_scripts: - ms.write(f"${{SCRIPT_DIR}}/{cs} > {cs}.log 2> {cs}.stderr.log &\n") - - ms.write("echo Waiting for all chroms to finish...\n") - ms.write("wait\n") - ms.write(f"${{SCRIPT_DIR}}/{merge_script_filename}\n") - - -def get_columns(): - columns = COUNTS + OTHER_INFOS - for g in GNOMAD_SUB_POPS: - for f in ["AC", "AN"]: - columns.append(f"{f}_{g.lower()}") - return columns - - -def get_af_info(): - af_info = [ - ("AF", None, "AC_count", "AN_count"), - ] - for g in GNOMAD_SUB_POPS: - af_info.append((f'AF_{g}', g, f'AC_{g}', f'AN_{g}')) - return af_info - - -def write_vcf_header(): - """ Needs to be gzipped so can be concatenated with other gzipped files """ - - now = datetime.now() - file_date = "%d%02d%02d" % (now.year, now.month, now.day) - source = __file__ - meta = """##fileformat=VCFv4.2 -##fileDate=%(file_date)s -##source=%(source)s -##INFO= -##INFO= -##INFO= -##INFO= -""" % {"file_date": file_date, "source": source} - - af_info = get_af_info() - for info_id, pop_name, ac_name, an_name in af_info: - if pop_name: - af_desc = f"for {pop_name}" - else: - af_desc = "" - af_desc += f" made from (exomes_{ac_name} + genomes_{ac_name}) / (exomes_{an_name} + genomes_{an_name})" - meta += f'##INFO=\n' - - vcf_header = "vcf_header.txt.gz" - with gzip.open(vcf_header, "wt") as f: - f.write(meta) - header_cols = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] - header = "#" + '\t'.join(header_cols) - f.write(header + "\n") - return vcf_header - - -def write_chrom_mapping_file(): - chrom_mapping_file = "chrom_mapping.txt" - with open(chrom_mapping_file, "w") as f: - for c in CHROMOSOMES: - f.write(f"chr{c}\t{c}\n") - return chrom_mapping_file - - -def calculate_allele_frequency(gnomad_input_vcf, af_output_vcf): - # We have to re-calculate POPMAX as we can't merge it - af_info = get_af_info() - info_names = [ai[0] for ai in af_info] + OTHER_INFOS + ["AF_popmax", "popmax", 
"gnomad_filtered"] - - with gzip.open(af_output_vcf, "wt") as f: - for variant in VCF(gnomad_input_vcf): - chrom = variant.CHROM - pos = str(variant.POS) - variant_id = variant.ID or '.' - ref = variant.REF - alt = variant.ALT[0] # no multi-alts - - af_popmax = 0 - popmax = '.' - infos = [] - for _, pop_name, ac_name, an_name in af_info: - ac = variant.INFO.get(ac_name, 0) - an = variant.INFO.get(an_name) - #print(f"{ac_name}/{an_name} {ac}/{an}") - if an: - af = ac / an - if pop_name and af > af_popmax: # Only use subpops - af_popmax = af - popmax = pop_name - af = f'{af:.6f}' - else: - af = '.' - infos.append(af) - - for o in OTHER_INFOS: - infos.append(str(variant.INFO.get(o, '.'))) - gnomad_filtered = '0' if variant.FILTER is None else '1' - infos.extend([str(af_popmax), popmax, gnomad_filtered]) - info_str = ";".join([i + "=" + v for i, v in zip(info_names, infos)]) - columns = [chrom, pos, variant_id, ref, alt, '.', '.', info_str] - f.write("\t".join(columns) + "\n") - - -if __name__ == "__main__": - args = get_args() - main(args) diff --git a/annotation/annotation_data/generate_annotation/gnomad3.1.2_download.sh b/annotation/annotation_data/generate_annotation/gnomad3.1.2_download.sh new file mode 100644 index 000000000..6e32e0975 --- /dev/null +++ b/annotation/annotation_data/generate_annotation/gnomad3.1.2_download.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# 3.1 only had genomes +for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do + wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/3.1.2/vcf/genomes/gnomad.genomes.v3.1.2.sites.chr${chrom}.vcf.bgz + wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/3.1.2/vcf/genomes/gnomad.genomes.v3.1.2.sites.chr${chrom}.vcf.bgz.tbi +done diff --git a/annotation/annotation_data/generate_annotation/gnomad3_create_genome_scripts.py b/annotation/annotation_data/generate_annotation/gnomad3_create_genome_scripts.py deleted file mode 100755 index 822013d1f..000000000 --- 
a/annotation/annotation_data/generate_annotation/gnomad3_create_genome_scripts.py +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env python3 -""" - The gnomAD v3.1.2 data set contains 76,156 whole genomes (and no exomes), all mapped to the GRCh38 reference sequence. -""" - -from argparse import ArgumentParser - -GRCh38 = "GRCh38" -BUILDS = [GRCh38] - -COUNTS = ['AC', 'AN', 'AF'] -OTHER_INFOS = ["AC_popmax", "AN_popmax", "AF_popmax", "popmax", "nhomalt", "nhomalt_popmax", "nonpar"] -GNOMAD_SUB_POPS = ["afr", "amr", "asj", "eas", "fin", "nfe", "oth", "sas"] # Will get AF for each -CHROMOSOMES = list(map(str, range(1, 23))) + ['X', 'Y'] - - -def get_args(): - parser = ArgumentParser(description="Get, strip and merge gnomAD VCFs for VariantGrid VEP pipeline") - parser.add_argument("--test", action='store_true', help="Only download 5k of each file.") - parser.add_argument("--genome-fasta", required=True, help='Fasta (correct for build)') - parser.add_argument("--chrom_mapping_file", help="Mapping file to convert chroms (if you get 'the sequence 'chr1' was not found)'") - return parser.parse_args() - - -def main(args): - genome_build = GRCh38 - genome_fasta = args.genome_fasta - if args.test: - # only download 5k lines of file - extra_filters = "| bgzip -d | head -5000 | bcftools view -O z" - else: - extra_filters = "" # nothing - - # To remove all INFO tags except "FOO" and "BAR", use "^INFO/FOO,INFO/BAR" - # @see https://samtools.github.io/bcftools/bcftools.html#annotate """ - info_columns = [f"INFO/{i}" for i in get_columns()] - keep_columns = ','.join(info_columns) # AC/AN are special format fields - bash_header = "#!/bin/bash\nset -e # fail on error\n" - - chrom_scripts = [] - chrom_vcfs = [] - for chrom in CHROMOSOMES: - prefix = f"gnomad_{genome_build}_chr{chrom}" - chrom_script = f"{prefix}.sh" - chrom_scripts.append(chrom_script) - with open(chrom_script, "w") as cs: - cs.write(bash_header) - # gnomAD3.1 only has genomes, no exomes - url = 
f"https://storage.googleapis.com/gcp-public-data--gnomad/release/3.1.2/vcf/genomes/gnomad.genomes.v3.1.3.sites.chr{chrom}.vcf.bgz" - output_vcf = f"{prefix}.filtered_info.vcf.gz" - if args.chrom_mapping_file: - annotate_args = f"--rename-chrs={args.chrom_mapping_file}" - else: - annotate_args = "" - - # gnomAD appears to already be decomposed - vt decompose + -s -o + - cs.write("\necho Download and clean as we go to save disk\n") - cs.write(f"wget --quiet -O - {url} {extra_filters} | bcftools annotate --exclude 'AC=0' --remove '^{keep_columns}' {annotate_args} | vt normalize - -r {genome_fasta} -o + | vt uniq + -o {output_vcf}\n") - - chrom_vcfs.append(output_vcf) - if args.test: - break # Only do 1 chrom - - # Write merge script - merge_script_filename = f"gnomad_{genome_build}_merge.sh" - - vcf_header = write_vcf_header() - - with open(merge_script_filename, "w") as ms: - ms.write(bash_header) - quoted_files = ' '.join([f"'{f}'" for f in chrom_vcfs]) - gnomad_combined_af_vcf = f"gnomad3_{genome_build}_combined.vcf.bgz" - ms.write(f"zcat {chrom_vcfs[0]} | head -1000 | grep '^#' | bgzip > vcf_header.bgz") - ms.write(f"gzcat vcf_header.bgz {quoted_files} | bgzip > {gnomad_combined_af_vcf}\n") - ms.write(f"tabix {gnomad_combined_af_vcf}\n") - - launch_script_filename = f"gnomad_{genome_build}_launch.sh" - with open(launch_script_filename, "w") as ms: - ms.write(bash_header) - ms.write('SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}")\n') - for cs in chrom_scripts: - ms.write(f"${{SCRIPT_DIR}}/{cs} > {cs}.log 2> {cs}.stderr.log &\n") - - ms.write("echo Waiting for all chroms to finish...\n") - ms.write("wait\n") - ms.write(f"${{SCRIPT_DIR}}/{merge_script_filename}\n") - - -def write_vcf_header(): - """ Needs to be gzipped so can be concatenated with other gzipped files """ - vcf_header = "" - return vcf_header - - -def get_columns(): - columns = COUNTS + OTHER_INFOS - for g in GNOMAD_SUB_POPS: - # gnomAD 3 changed from underscore to dash - # 3.1.2 changed back to underscore 
- columns.append(f"AF_{g.lower()}") - return columns - - -def write_chrom_mapping_file(): - chrom_mapping_file = "chrom_mapping.txt" - with open(chrom_mapping_file, "w") as f: - for c in CHROMOSOMES: - f.write(f"chr{c}\t{c}\n") - return chrom_mapping_file - - -if __name__ == "__main__": - args = get_args() - main(args) diff --git a/annotation/annotation_data/generate_annotation/gnomad4_download.sh b/annotation/annotation_data/generate_annotation/gnomad4.0_download.sh similarity index 100% rename from annotation/annotation_data/generate_annotation/gnomad4_download.sh rename to annotation/annotation_data/generate_annotation/gnomad4.0_download.sh diff --git a/annotation/annotation_data/generate_annotation/gnomad4_data.py b/annotation/annotation_data/generate_annotation/gnomad_data.py similarity index 50% rename from annotation/annotation_data/generate_annotation/gnomad4_data.py rename to annotation/annotation_data/generate_annotation/gnomad_data.py index 7dd190e1b..aa9c55ce0 100755 --- a/annotation/annotation_data/generate_annotation/gnomad4_data.py +++ b/annotation/annotation_data/generate_annotation/gnomad_data.py @@ -13,27 +13,64 @@ import os from argparse import ArgumentParser from datetime import datetime - -GNOMAD_VERSION="4.0" -GRCh38 = "GRCh38" - -# We deliberately leave out AF and "grpmax" stuff as we recalculate that later in 'calculate_allele_frequency' -COUNTS = ['AC', 'AN'] -OTHER_INFOS = ["nhomalt", "non_par", "faf95", "faf99", "fafmax_faf95_max", "fafmax_faf99_max"] -GNOMAD_SUB_POPS = ["afr", "amr", "asj", "eas", "fin", "mid", "nfe", "remaining", "sas"] # Will get AF for each -CHR_X_ONLY = ["AC_XY", "AN_XY", "AF_XY"] +from typing import Tuple, List + +GNOMAD_V_2_1 = "2.1.1" +GNOMAD_V_3_1_2 = "3.1.2" +GNOMAD_V_4_0 = "4.0" + +GNOMAD_VERSIONS = { + GNOMAD_V_2_1, + GNOMAD_V_3_1_2, + GNOMAD_V_4_0, +} + +FILENAMES = { + GNOMAD_V_2_1: "gnomad.%(capture_type)s.r2.1.1.sites.%(chrom)s.vcf.bgz", + GNOMAD_V_3_1_2: 
"gnomad.%(capture_type)s.v3.1.2.sites.chr{chrom}.vcf.bgz", + GNOMAD_V_4_0: "gnomad.%(capture_type)s.v4.0.sites.chr%(chrom)s.vcf.bgz", +} + + +GENOME_BUILDS = {"GRCh37", "GRCh38"} + +def get_infos_for_version(gnomad_version) -> Tuple[List[str], List[str], List[str], List[str]]: + # We deliberately leave out AF and "grpmax" stuff as we recalculate that later in 'calculate_allele_frequency' + info_fields = ['AC', 'AN', "nhomalt", "nonpar"] + popmax_fields = ["AF_popmax", "AC_popmax", "AN_popmax", "popmax", "nhomalt_popmax"] + grpmax_fields = ["AF_grpmax", "AC_grpmax", "AN_grpmax", "grpmax", "nhomalt_grpmax"] + sub_pops = ["afr", "amr", "asj", "eas", "fin", "nfe", "oth", "sas"] # Will get AF for each + chr_x_male = ["AC_male", "AN_male", "AF_male"] + chr_x_xy = ["AC_XY", "AN_XY", "AF_XY"] + + if gnomad_version == GNOMAD_V_4_0: + popmax_fields = grpmax_fields + chr_x_male = chr_x_xy + info_fields.extend(["faf95", "faf99", "fafmax_faf95_max", "fafmax_faf99_max"]) + # Others are now called remaining + sub_pops.remove("oth") + sub_pops.append("remaining") # + sub_pops.append("mid") # Middle easterners added in v4 + + info_fields.remove("nonpar") + info_fields.append("non_par") + + return info_fields, chr_x_male, popmax_fields, sub_pops # popmax/grpmax is calculated using non-bottlenecked genetic ancestry groups -BOTTLENECKED_SUB_POPS = ["asj", "fin", "mid", "remaining"] +BOTTLENECKED_SUB_POPS = {"asj", "fin", "mid", "oth", "remaining"} def get_args(): + available_builds = ", ".join(GENOME_BUILDS) + available_versions = ", ".join(GNOMAD_VERSIONS) + parser = ArgumentParser(description="Merge exome+genome VCFs for VariantGrid VEP pipeline") - parser.add_argument("--test", action='store_true', help="Only download 5k of each file.") - # parser.add_argument("--genome-fasta", help='Fasta (correct for build)') + parser.add_argument("--test", action='store_true', help="Only do chrY (quick test)") parser.add_argument("--chrom-mapping-file", help='bcftools chromosome conversion') - 
parser.add_argument("--version", help=f'gnomAD version (default: {GNOMAD_VERSION})', default=GNOMAD_VERSION) - parser.add_argument("--path", help='Colon separated paths for tabix/bgzip/vt/bcftools') + parser.add_argument("--genome-build", help=f'GenomeBuild (one of {available_builds})') + parser.add_argument("--version", help=f'gnomAD version (one of {available_versions})') + parser.add_argument("--path", help='Optional Colon separated paths for tabix/bgzip/vt/bcftools') parser.add_argument("--gnomad-input-vcf") parser.add_argument("--af-output-vcf") @@ -48,6 +85,9 @@ def get_args(): if args.af_output_vcf is None: parser.error("--af-output-vcf required for --af") + if args.version not in GNOMAD_VERSIONS: + parser.error(f"Version must be one of: {available_versions}") + return args @@ -55,7 +95,7 @@ def main(args): if args.scripts: write_scripts(args) else: - calculate_allele_frequency(args.gnomad_input_vcf, args.af_output_vcf) + calculate_allele_frequency(args.version, args.gnomad_input_vcf, args.af_output_vcf) def write_scripts(args): @@ -63,42 +103,50 @@ def write_scripts(args): raise ValueError("--chrom-mapping-file is required for write scripts step") if args.test: - CHROMOSOMES = ["Y"] # Just do Y + chromosomes = ["Y"] # Just do Y else: - CHROMOSOMES = list(map(str, range(1, 23))) + ['X', 'Y'] + chromosomes = list(map(str, range(1, 23))) + ['X', 'Y'] + + info_fields, chr_x_male, popmax_fields, sub_pops = get_infos_for_version(args.version) - columns = get_columns() + columns = get_columns(info_fields, sub_pops) bash_header = "#!/bin/bash\nset -e # fail on error\n" if args.path: bash_header += "PATH=${PATH}:" + args.path + "\n" + filename_template = FILENAMES[args.version] + chrom_scripts = [] af_vcfs = [] - for chrom in CHROMOSOMES: - prefix = f"gnomad{GNOMAD_VERSION}_{GRCh38}_chr{chrom}" + for chrom in chromosomes: + prefix = f"gnomad{args.version}_{args.genome_build}_chr{chrom}" chrom_script = f"{prefix}.sh" chrom_scripts.append(chrom_script) - with 
open(chrom_script, "w") as cs: + with (open(chrom_script, "w") as cs): cs.write(bash_header) output_vcfs = [] - for vcf_type in ["exomes", "genomes"]: + for capture_type in ["exomes", "genomes"]: # To remove all INFO tags except "FOO" and "BAR", use "^INFO/FOO,INFO/BAR" # @see https://samtools.github.io/bcftools/bcftools.html#annotate """ my_columns = columns.copy() if chrom == "X": - my_columns.extend(CHR_X_ONLY) + my_columns.extend(chr_x_male) info_columns = [f"INFO/{i}" for i in my_columns] keep_columns = ','.join(info_columns) # AC/AN are special format fields - output_vcf = f"{prefix}_{vcf_type}.filtered_info.vcf.gz" + output_vcf = f"{prefix}_{capture_type}.filtered_info.vcf.gz" annotate_args = f"--rename-chrs={args.chrom_mapping_file}" - gnomad_vcf_filename = f"gnomad.{vcf_type}.{GNOMAD_VERSION}_{GRCh38}.sites.chr{chrom}.vcf.bgz" + gnomad_vcf_filename = filename_template % { + "capture_type": capture_type, + "chrom": chrom, + } - # bcftools merge doesn't work with type='A' # bcftools now works with AC/AN etc - see https://github.com/samtools/bcftools/issues/1394 + # but make sure you are using v18 + # bcftools merge doesn't work with type='A' modify_fields2 = "sed -e 's/,Number=A,/,Number=1,/'" # gnomAD appears to already be decomposed - vt decompose + -s -o + # We no longer remove AC=0 as we want to keep AN (total counts) for pops for later AF calculations @@ -115,7 +163,7 @@ def write_scripts(args): # Merge exomes/genome VCFs # if we leave out rule, will take from 1st file which is ok for PAR as will be the same - skip_columns = {"non_par"} + skip_columns = {"nonpar", "non_par"} # Default rule = "sum" if not below (or skipped) rule_ops = { # Will take higher of whatever is there in genomes/exomes @@ -145,17 +193,18 @@ def write_scripts(args): af_vcfs.append(allele_frequency_vcf) # Write merge script - merge_script_filename = f"gnomad4_merge.sh" - vcf_header = write_vcf_header() + merge_script_filename = f"gnomad{args.version}_merge.sh" + vcf_header = 
write_vcf_header(args.version, info_fields, popmax_fields, sub_pops) with open(merge_script_filename, "w") as ms: ms.write(bash_header) quoted_files = ' '.join([f"'{f}'" for f in af_vcfs]) - gnomad_combined_af_vcf = f"gnomad{GNOMAD_VERSION}_{GRCh38}_combined_af.vcf.bgz" - ms.write(f"cat {vcf_header} {quoted_files} > {gnomad_combined_af_vcf}\n") + gnomad_combined_af_vcf = f"gnomad{args.version}_{args.genome_build}_combined_af.vcf.bgz" + # We produce gzipped files, but want bgzipped, so need to cat then bgzip + ms.write(f"zcat {vcf_header} {quoted_files} | bgzip > {gnomad_combined_af_vcf}\n") ms.write(f"tabix {gnomad_combined_af_vcf}\n") - launch_script_filename = f"gnomad4_launch.sh" + launch_script_filename = f"gnomad{args.version}_launch.sh" with open(launch_script_filename, "w") as ms: ms.write(bash_header) ms.write('SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}")\n') @@ -167,50 +216,66 @@ def write_scripts(args): ms.write(f"${{SCRIPT_DIR}}/{merge_script_filename}\n") -def get_columns(): - columns = COUNTS + OTHER_INFOS - for g in GNOMAD_SUB_POPS: +def get_columns(info_fields, sub_pops): + columns = info_fields.copy() + for g in sub_pops: for f in ["AC", "AN"]: columns.append(f"{f}_{g.lower()}") return columns -def get_af_info(): +def get_af_info(sub_pops): af_info = [ ("AF", None, "AC", "AN"), ] - for g in GNOMAD_SUB_POPS: + for g in sub_pops: af_info.append((f'AF_{g}', g, f'AC_{g}', f'AN_{g}')) return af_info -def write_vcf_header(): +def write_vcf_header(version, info_fields, popmax_fields, sub_pops): """ Needs to be gzipped so can be concatenated with other gzipped files """ + all_info = set(info_fields + popmax_fields + ["gnomad_filtered"]) + field_headers = { + 'AC': '##INFO=', + 'AN': '##INFO=', + 'AC_XY': '##INFO=', + 'AF_XY': '##INFO=', + 'AN_XY': '##INFO=', + 'faf95': '##INFO=', + 'faf99': '##INFO=', + 'fafmax_faf95_max': '##INFO=', + 'fafmax_faf99_max': '##INFO=', + 'AF_popmax': '##INFO="', + 'AC_popmax': '##INFO=', + 'AN_popmax': '##INFO=', + 'popmax': 
'##INFO=', + 'AF_grpmax': '##INFO="', + 'AC_grpmax': '##INFO=', + 'AN_grpmax': '##INFO=', + 'grpmax': '##INFO=', + 'nhomalt': '##INFO=', + 'gnomad_filtered': '##INFO=', + 'nonpar': '##INFO=', + 'non_par': '##INFO=', + } + + info_headers = "" + for field in all_info: + if header := field_headers.get(field): + info_headers += header + "\n" + now = datetime.now() file_date = "%d%02d%02d" % (now.year, now.month, now.day) source = __file__ meta = """##fileformat=VCFv4.2 ##fileDate=%(file_date)s ##source=%(source)s -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -""" % {"file_date": file_date, "source": source} - - af_info = get_af_info() +%(info_headers)s +""" % {"file_date": file_date, "source": source, "info_headers": info_headers} + + af_info = get_af_info(sub_pops) for info_id, pop_name, ac_name, an_name in af_info: if pop_name: af_desc = f"for {pop_name}" @@ -219,7 +284,7 @@ def write_vcf_header(): af_desc += f" made from (exomes_{ac_name} + genomes_{ac_name}) / (exomes_{an_name} + genomes_{an_name})" meta += f'##INFO=\n' - vcf_header = "vcf_header.txt.gz" + vcf_header = f"gnomad_{version}_vcf_header.txt.gz" with gzip.open(vcf_header, "wt") as f: f.write(meta) header_cols = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] @@ -228,12 +293,13 @@ def write_vcf_header(): return vcf_header -def calculate_allele_frequency(gnomad_input_vcf, af_output_vcf): +def calculate_allele_frequency(version, gnomad_input_vcf, af_output_vcf): + """ We have to re-calculate POPMAX as we can't merge it """ + from cyvcf2 import VCF # Import here, so that rest of script can run on HPC easier - # We have to re-calculate POPMAX as we can't merge it - af_info = get_af_info() - info_names = [ai[0] for ai in af_info] + COUNTS + OTHER_INFOS + ["AF_grpmax", "AC_grpmax", "AN_grpmax", "grpmax", "gnomad_filtered"] + info_fields, _, popmax_fields, sub_pops = get_infos_for_version(version) + af_info = 
get_af_info(sub_pops) with gzip.open(af_output_vcf, "wt") as f: for variant in VCF(gnomad_input_vcf): @@ -247,7 +313,7 @@ def calculate_allele_frequency(gnomad_input_vcf, af_output_vcf): ac_popmax = 0 an_popmax = 0 popmax = '.' - infos = [] + infos = {} for _, pop_name, ac_name, an_name in af_info: ac = variant.INFO.get(ac_name, 0) an = variant.INFO.get(an_name) @@ -262,13 +328,23 @@ def calculate_allele_frequency(gnomad_input_vcf, af_output_vcf): af = f'{af:.6f}' else: af = '.' - infos.append(af) + infos["AF"] = af - for o in COUNTS + OTHER_INFOS: - infos.append(str(variant.INFO.get(o, '.'))) + for o in info_fields: + infos[o] = str(variant.INFO.get(o, '.')) gnomad_filtered = '0' if variant.FILTER is None else '1' - infos.extend([str(af_popmax), str(ac_popmax), str(an_popmax), popmax, gnomad_filtered]) - info_str = ";".join([i + "=" + v for i, v in zip(info_names, infos)]) + infos["gnomad_filtered"] = gnomad_filtered + + for p in popmax_fields: # can be popmax or grpmax + if p.startswith("AF_"): + infos[p] = str(af_popmax) + elif p.startswith("AC_"): + infos[p] = str(ac_popmax) + elif p.startswith(("AN_")): + infos[p] = str(an_popmax) + elif p in {"popmax", "grpmax"}: + infos[p] = popmax + info_str = ";".join([f"{k}={v}" for k, v in infos.items()]) columns = [chrom, pos, variant_id, ref, alt, '.', '.', info_str] f.write("\t".join(columns) + "\n") From 3abf2d6701cbddb11c78a26af415cc5e4a95bb73 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Wed, 29 Nov 2023 18:13:27 +1030 Subject: [PATCH 06/29] #850 - new VEP 110 fields --- .../0024_new_vep_annotation_gnomad3.py | 83 +++++------ ...variantannotation_gnomad_faf95_and_more.py | 83 +++++++++++ .../migrations/0082_new_vep_110_columns_v3.py | 134 ++++++++++++++++++ annotation/models/models.py | 27 ++-- annotation/models/models_enums.py | 1 + annotation/vep_annotation.py | 3 +- ...lemergelog_allele_linking_tool_and_more.py | 28 ++++ .../migrations/0107_new_vep_110_columns_v3.py | 120 ++++++++++++++++ 
snpdb/models/models_enums.py | 4 +- .../settings/components/default_settings.py | 1 + 10 files changed, 431 insertions(+), 53 deletions(-) create mode 100644 annotation/migrations/0081_rename_faf95_variantannotation_gnomad_faf95_and_more.py create mode 100644 annotation/migrations/0082_new_vep_110_columns_v3.py create mode 100644 snpdb/migrations/0106_alter_allelemergelog_allele_linking_tool_and_more.py create mode 100644 snpdb/migrations/0107_new_vep_110_columns_v3.py diff --git a/annotation/migrations/0024_new_vep_annotation_gnomad3.py b/annotation/migrations/0024_new_vep_annotation_gnomad3.py index 39a1dc859..740ad1a80 100644 --- a/annotation/migrations/0024_new_vep_annotation_gnomad3.py +++ b/annotation/migrations/0024_new_vep_annotation_gnomad3.py @@ -7,6 +7,9 @@ def _new_vep_annotation_gnomad3(apps, _schema_editor): # Separate out gnomAD 2 vs 3 + GNOMAD_2 = 'g' + GNOMAD_3 = 'n' + # Make everything from gnomAD2 GRCh37 specific EXCEPT gnomadAF (still want that one) ColumnVEPField = apps.get_model("annotation", "ColumnVEPField") GenomeBuild = apps.get_model("snpdb", "GenomeBuild") @@ -15,7 +18,7 @@ def _new_vep_annotation_gnomad3(apps, _schema_editor): grch38 = GenomeBuild.objects.get(pk="GRCh38") # All existing gnomAD are now GRCh37 only (will insert new legacy one below) - ColumnVEPField.objects.filter(vep_custom='g').update(genome_build=grch37) + ColumnVEPField.objects.filter(vep_custom=GNOMAD_2).update(genome_build=grch37) # Might as well hide these now as we can - GRCh37 has 46, GRCh37 has 30 ColumnVEPField.objects.filter(column__in=["phylop_46_way_mammalian", "phastcons_46_way_mammalian"]).update(genome_build=grch37) @@ -23,48 +26,48 @@ def _new_vep_annotation_gnomad3(apps, _schema_editor): COLUMN_VEP_FIELD = [ # Legacy - {'column': 'gnomad2_liftover_af', 'vep_plugin': None, 'source_field_has_custom_prefix': True, - 'vep_custom': 'g', 'variant_grid_column_id': 'gnomad2_liftover_af', 'source_field': 'AF', 'category': 'F'}, + {'column': 
'gnomad2_liftover_af', 'source_field_has_custom_prefix': True, + 'vep_custom': GNOMAD_2, 'variant_grid_column_id': 'gnomad2_liftover_af', 'source_field': 'AF', 'category': 'F'}, # gnomAD 3 - {'column': 'gnomad3_ac', 'vep_plugin': None, 'source_field_has_custom_prefix': True, - 'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_ac', 'source_field': 'AC', 'category': 'F'}, - {'column': 'gnomad3_af', 'vep_plugin': None, 'source_field_has_custom_prefix': True, - 'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_af', 'source_field': 'AF', 'category': 'F'}, - {'column': 'gnomad3_an', 'vep_plugin': None, 'source_field_has_custom_prefix': True, - 'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_an', 'source_field': 'AN', 'category': 'F'}, - {'column': 'gnomad3_afr_af', 'vep_plugin': None, 'source_field_has_custom_prefix': True, - 'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_afr_af', 'source_field': 'AF-afr', 'category': 'F'}, - {'column': 'gnomad3_amr_af', 'vep_plugin': None, 'source_field_has_custom_prefix': True, - 'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_amr_af', 'source_field': 'AF-amr', 'category': 'F'}, - {'column': 'gnomad3_asj_af', 'vep_plugin': None, 'source_field_has_custom_prefix': True, - 'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_asj_af', 'source_field': 'AF-asj', 'category': 'F'}, - {'column': 'gnomad3_eas_af', 'vep_plugin': None, 'source_field_has_custom_prefix': True, - 'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_eas_af', 'source_field': 'AF-eas', 'category': 'F'}, - {'column': 'gnomad3_filtered', 'vep_plugin': None, 'source_field_has_custom_prefix': True, - 'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_filtered', 'source_field': 'FILTER', 'category': 'F'}, - {'column': 'gnomad3_fin_af', 'vep_plugin': None, 'source_field_has_custom_prefix': True, - 'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_fin_af', 'source_field': 'AF-fin', 'category': 'F'}, - {'column': 'gnomad3_hom_alt', 'vep_plugin': None, 
'source_field_has_custom_prefix': True, - 'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_hom_alt', 'source_field': 'nhomalt', 'category': 'F'}, - {'column': 'gnomad3_nfe_af', 'vep_plugin': None, 'source_field_has_custom_prefix': True, - 'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_nfe_af', 'source_field': 'AF-nfe', 'category': 'F'}, - {'column': 'gnomad3_oth_af', 'vep_plugin': None, 'source_field_has_custom_prefix': True, - 'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_oth_af', 'source_field': 'AF-oth', 'category': 'F'}, - {'column': 'gnomad3_popmax', 'vep_plugin': None, 'source_field_has_custom_prefix': True, - 'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_popmax', 'source_field': 'popmax', 'category': 'F'}, - {'column': 'gnomad3_popmax_ac', 'vep_plugin': None, 'source_field_has_custom_prefix': True, - 'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_popmax_ac', 'source_field': 'AC_popmax', 'category': 'F'}, - {'column': 'gnomad3_popmax_af', 'vep_plugin': None, 'source_field_has_custom_prefix': True, - 'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_popmax_af', 'source_field': 'AF_popmax', 'category': 'F'}, - {'column': 'gnomad3_popmax_an', 'vep_plugin': None, 'source_field_has_custom_prefix': True, - 'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_popmax_an', 'source_field': 'AN_popmax', 'category': 'F'}, - {'column': 'gnomad3_popmax_hom_alt', 'vep_plugin': None, 'source_field_has_custom_prefix': True, - 'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_popmax_hom_alt', 'source_field': 'nhomalt_popmax', 'category': 'F'}, - {'column': 'gnomad3_sas_af', 'vep_plugin': None, 'source_field_has_custom_prefix': True, - 'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_sas_af', 'source_field': 'AF-sas', 'category': 'F'}, + {'column': 'gnomad3_ac', 'source_field_has_custom_prefix': True, + 'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_ac', 'source_field': 'AC', 'category': 'F'}, + {'column': 'gnomad3_af', 
'source_field_has_custom_prefix': True, + 'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_af', 'source_field': 'AF', 'category': 'F'}, + {'column': 'gnomad3_an', 'source_field_has_custom_prefix': True, + 'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_an', 'source_field': 'AN', 'category': 'F'}, + {'column': 'gnomad3_afr_af', 'source_field_has_custom_prefix': True, + 'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_afr_af', 'source_field': 'AF-afr', 'category': 'F'}, + {'column': 'gnomad3_amr_af', 'source_field_has_custom_prefix': True, + 'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_amr_af', 'source_field': 'AF-amr', 'category': 'F'}, + {'column': 'gnomad3_asj_af', 'source_field_has_custom_prefix': True, + 'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_asj_af', 'source_field': 'AF-asj', 'category': 'F'}, + {'column': 'gnomad3_eas_af', 'source_field_has_custom_prefix': True, + 'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_eas_af', 'source_field': 'AF-eas', 'category': 'F'}, + {'column': 'gnomad3_filtered', 'source_field_has_custom_prefix': True, + 'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_filtered', 'source_field': 'FILTER', 'category': 'F'}, + {'column': 'gnomad3_fin_af', 'source_field_has_custom_prefix': True, + 'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_fin_af', 'source_field': 'AF-fin', 'category': 'F'}, + {'column': 'gnomad3_hom_alt', 'source_field_has_custom_prefix': True, + 'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_hom_alt', 'source_field': 'nhomalt', 'category': 'F'}, + {'column': 'gnomad3_nfe_af', 'source_field_has_custom_prefix': True, + 'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_nfe_af', 'source_field': 'AF-nfe', 'category': 'F'}, + {'column': 'gnomad3_oth_af', 'source_field_has_custom_prefix': True, + 'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_oth_af', 'source_field': 'AF-oth', 'category': 'F'}, + {'column': 
'gnomad3_popmax', 'source_field_has_custom_prefix': True, + 'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_popmax', 'source_field': 'popmax', 'category': 'F'}, + {'column': 'gnomad3_popmax_ac', 'source_field_has_custom_prefix': True, + 'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_popmax_ac', 'source_field': 'AC_popmax', 'category': 'F'}, + {'column': 'gnomad3_popmax_af', 'source_field_has_custom_prefix': True, + 'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_popmax_af', 'source_field': 'AF_popmax', 'category': 'F'}, + {'column': 'gnomad3_popmax_an', 'source_field_has_custom_prefix': True, + 'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_popmax_an', 'source_field': 'AN_popmax', 'category': 'F'}, + {'column': 'gnomad3_popmax_hom_alt', 'source_field_has_custom_prefix': True, + 'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_popmax_hom_alt', 'source_field': 'nhomalt_popmax', 'category': 'F'}, + {'column': 'gnomad3_sas_af', 'source_field_has_custom_prefix': True, + 'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_sas_af', 'source_field': 'AF-sas', 'category': 'F'}, ] bulk_insert_class_data(apps, "annotation", [("ColumnVEPField", COLUMN_VEP_FIELD)]) - ColumnVEPField.objects.filter(vep_custom='n').update(genome_build=grch38) + ColumnVEPField.objects.filter(vep_custom=GNOMAD_3).update(genome_build=grch38) class Migration(migrations.Migration): diff --git a/annotation/migrations/0081_rename_faf95_variantannotation_gnomad_faf95_and_more.py b/annotation/migrations/0081_rename_faf95_variantannotation_gnomad_faf95_and_more.py new file mode 100644 index 000000000..d66e2bedb --- /dev/null +++ b/annotation/migrations/0081_rename_faf95_variantannotation_gnomad_faf95_and_more.py @@ -0,0 +1,83 @@ +# Generated by Django 4.2.2 on 2023-11-29 07:20 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('annotation', 
'0080_columnvepfield_pipeline_type_variantannotation_faf95_and_more'), + ] + + operations = [ + migrations.RenameField( + model_name='variantannotation', + old_name='faf95', + new_name='gnomad_faf95', + ), + migrations.RenameField( + model_name='variantannotation', + old_name='faf99', + new_name='gnomad_faf99', + ), + migrations.RenameField( + model_name='variantannotation', + old_name='fafmax_faf95_max', + new_name='gnomad_fafmax_faf95_max', + ), + migrations.RenameField( + model_name='variantannotation', + old_name='fafmax_faf99_max', + new_name='gnomad_fafmax_faf99_max', + ), + migrations.AddField( + model_name='variantannotation', + name='gnomad_hemi_count', + field=models.IntegerField(blank=True, null=True), + ), + migrations.AddField( + model_name='variantannotation', + name='gnomad_non_par', + field=models.BooleanField(blank=True, null=True), + ), + migrations.AddField( + model_name='variantannotation', + name='gnomad_xy_ac', + field=models.IntegerField(blank=True, null=True), + ), + migrations.AddField( + model_name='variantannotation', + name='gnomad_xy_af', + field=models.FloatField(blank=True, null=True), + ), + migrations.AddField( + model_name='variantannotation', + name='gnomad_xy_an', + field=models.IntegerField(blank=True, null=True), + ), + migrations.AddField( + model_name='varianttranscriptannotation', + name='alphamissense_class', + field=models.CharField(blank=True, choices=[('b', 'likely_benign'), ('a', 'ambiguous'), ('p', 'likely_pathogenic')], max_length=1, null=True), + ), + migrations.AddField( + model_name='varianttranscriptannotation', + name='alphamissense_pathogenicity', + field=models.FloatField(blank=True, null=True), + ), + migrations.AddField( + model_name='varianttranscriptannotation', + name='mavedb_score', + field=models.FloatField(blank=True, null=True), + ), + migrations.AddField( + model_name='varianttranscriptannotation', + name='mavedb_urn', + field=models.TextField(blank=True, null=True), + ), + migrations.AlterField( + 
model_name='columnvepfield', + name='category', + field=models.CharField(choices=[('C', 'Conservation'), ('E', 'External ID'), ('F', 'Frequency Data'), ('f', 'Functional Effect'), ('G', 'Gene Annotations'), ('H', 'HGVS'), ('L', 'Literature'), ('N', 'Nearby Features'), ('P', 'Pathogenicity Predictions'), ('Y', 'Phenotype'), ('D', 'Protein Domains'), ('Q', 'Sequence'), ('S', 'Splicing Predictions'), ('V', 'Variant Data')], max_length=1), + ), + ] diff --git a/annotation/migrations/0082_new_vep_110_columns_v3.py b/annotation/migrations/0082_new_vep_110_columns_v3.py new file mode 100644 index 000000000..c9b259034 --- /dev/null +++ b/annotation/migrations/0082_new_vep_110_columns_v3.py @@ -0,0 +1,134 @@ +# Generated by Django 4.2.2 on 2023-11-29 06:00 + +from django.db import migrations + +from library.django_utils import bulk_insert_class_data + + +def _new_vep_110_annotation(apps, _schema_editor): + VEP_CUSTOM_GNOMAD_3 = 'n' + VEP_CUSTOM_GNOMAD_4 = 'o' + + VEP_PLUGIN_MAVEDB = 'V' + VEP_PLUGIN_ALPHAMISSENSE = 'A' + + FREQUENCY_DATA = 'F' + FUNCTIONAL_EFFECT = 'f' + PATHOGENICITY_PREDICTIONS = 'P' + + + ColumnVEPField = apps.get_model("annotation", "ColumnVEPField") + GenomeBuild = apps.get_model("snpdb", "GenomeBuild") + + grch37 = GenomeBuild.objects.get(pk="GRCh37") + grch38 = GenomeBuild.objects.get(pk="GRCh38") + + # Make existing gnomAD3 have max column version of 2 + ColumnVEPField.objects.filter(vep_custom=VEP_CUSTOM_GNOMAD_3).update(max_vep_columns_version=2) + + raise ValueError("Still need to add new fields from gnomad2 stuff") + + + COLUMN_VEP_FIELD = [ + # gnomAD 4 + {'column': 'gnomad4_ac', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_ac', 'source_field': 'AC', 'category': FREQUENCY_DATA}, + {'column': 'gnomad4_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_af', 
'source_field': 'AF', 'category': FREQUENCY_DATA}, + {'column': 'gnomad4_an', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_an', 'source_field': 'AN', 'category': FREQUENCY_DATA}, + {'column': 'gnomad4_afr_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_afr_af', 'source_field': 'AF_afr', 'category': FREQUENCY_DATA}, + {'column': 'gnomad4_amr_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_amr_af', 'source_field': 'AF_amr', 'category': FREQUENCY_DATA}, + {'column': 'gnomad4_asj_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_asj_af', 'source_field': 'AF_asj', 'category': FREQUENCY_DATA}, + {'column': 'gnomad4_eas_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_eas_af', 'source_field': 'AF_eas', 'category': FREQUENCY_DATA}, + {'column': 'gnomad4_fin_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_fin_af', 'source_field': 'AF_fin', 'category': FREQUENCY_DATA}, + {'column': 'gnomad4_mid_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_mid_af', 'source_field': 'AF_mid', 'category': FREQUENCY_DATA}, + {'column': 'gnomad4_nfe_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_nfe_af', 'source_field': 'AF_nfe', 'category': FREQUENCY_DATA}, + {'column': 'gnomad4_oth_af', 'source_field_has_custom_prefix': True, 
'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_oth_af', 'source_field': 'AF_remaining', 'category': FREQUENCY_DATA}, + {'column': 'gnomad4_sas_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_sas_af', 'source_field': 'AF_sas', 'category': FREQUENCY_DATA}, + + {'column': 'gnomad4_filtered', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_filtered', 'source_field': 'gnomad_filtered', 'category': FREQUENCY_DATA}, + {'column': 'gnomad4_hom_alt', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_hom_alt', 'source_field': 'nhomalt', 'category': FREQUENCY_DATA}, + {'column': 'gnomad4_popmax', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax', 'source_field': 'grpmax', 'category': FREQUENCY_DATA}, + {'column': 'gnomad4_popmax_ac', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_ac', 'source_field': 'AC_grpmax', 'category': FREQUENCY_DATA}, + {'column': 'gnomad4_popmax_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_af', 'source_field': 'AF_grpmax', 'category': FREQUENCY_DATA}, + {'column': 'gnomad4_popmax_an', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_an', 'source_field': 'AN_grpmax', 'category': FREQUENCY_DATA}, + + {'column': 'gnomad4_xy_ac', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': 
VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_xy_ac', 'source_field': 'AC_XY', + 'category': FREQUENCY_DATA}, + {'column': 'gnomad4_xy_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_xy_af', 'source_field': 'AF_XY', + 'category': FREQUENCY_DATA}, + {'column': 'gnomad4_xy_an', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_an', 'source_field': 'AN_XY', + 'category': FREQUENCY_DATA}, + + {'column': 'gnomad4_faf95', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_faf95', 'source_field': 'faf95', + 'category': FREQUENCY_DATA}, + {'column': 'gnomad4_faf99', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_faf99', 'source_field': 'faf99', + 'category': FREQUENCY_DATA}, + {'column': 'gnomad4_fafmax_faf95_max', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_fafmax_faf95_max', 'source_field': 'fafmax_faf95_max', + 'category': FREQUENCY_DATA}, + {'column': 'gnomad4_fafmax_faf99_max', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_fafmax_faf99_max', 'source_field': 'fafmax_faf99_max', + 'category': FREQUENCY_DATA}, + + # I left this out don't think it really matters +# {'column': 'gnomad4_popmax_hom_alt', 'source_field_has_custom_prefix': True, +# 'vep_custom': GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_hom_alt', 'source_field': 'nhomalt_grpmax', 'category': FREQUENCY_DATA}, + + # MAVE + {'column': 'mavedb_score', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_plugin': 
VEP_PLUGIN_MAVEDB, 'variant_grid_column_id': 'mavedb_score', + 'source_field': 'score', + 'category': FUNCTIONAL_EFFECT}, + {'column': 'mavedb_urn', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'vep_plugin': VEP_PLUGIN_MAVEDB, 'variant_grid_column_id': 'mavedb_urn', + 'source_field': 'urn', + 'category': FUNCTIONAL_EFFECT}, + + # AlphaMissense + {'column': 'alphamissense_class', 'min_vep_columns_version': 3, + 'vep_plugin': VEP_PLUGIN_ALPHAMISSENSE, 'variant_grid_column_id': 'alphamissense_class', + 'source_field': 'am_class', + 'category': PATHOGENICITY_PREDICTIONS}, + {'column': 'alphamissense_pathogenicity', 'min_vep_columns_version': 3, + 'vep_plugin': VEP_PLUGIN_ALPHAMISSENSE, 'variant_grid_column_id': 'alphamissense_pathogenicity', + 'source_field': 'am_pathogenicity', + 'category': PATHOGENICITY_PREDICTIONS}, + ] + bulk_insert_class_data(apps, "annotation", [("ColumnVEPField", COLUMN_VEP_FIELD)]) + ColumnVEPField.objects.filter(vep_custom=VEP_CUSTOM_GNOMAD_4).update(genome_build=grch38) + + + + +class Migration(migrations.Migration): + + dependencies = [ + ('annotation', '0081_rename_faf95_variantannotation_gnomad_faf95_and_more'), + ("snpdb", "0107_new_vep_110_columns_v3"), # Defines new columns + ] + + operations = [ + migrations.RunPython(_new_vep_110_annotation) + ] diff --git a/annotation/models/models.py b/annotation/models/models.py index 0287a467f..c9a308e3e 100644 --- a/annotation/models/models.py +++ b/annotation/models/models.py @@ -835,6 +835,12 @@ class AbstractVariantAnnotation(models.Model): splice_region = models.TextField(null=True, blank=True) symbol = models.TextField(null=True, blank=True) + alphamissense_class = models.CharField(max_length=1, choices=AlphaMissensePrediction.choices, null=True, blank=True) + alphamissense_pathogenicity = models.FloatField(null=True, blank=True) + + mavedb_score = models.FloatField(null=True, blank=True) + mavedb_urn = models.TextField(null=True, blank=True) + class Meta: 
abstract = True @@ -888,6 +894,7 @@ class VariantAnnotation(AbstractVariantAnnotation): # Population frequency af_1kg = models.FloatField(null=True, blank=True) af_uk10k = models.FloatField(null=True, blank=True) + topmed_af = models.FloatField(null=True, blank=True) gnomad_af = models.FloatField(null=True, blank=True) gnomad2_liftover_af = models.FloatField(null=True, blank=True) gnomad_ac = models.IntegerField(null=True, blank=True) @@ -903,16 +910,20 @@ class VariantAnnotation(AbstractVariantAnnotation): gnomad_oth_af = models.FloatField(null=True, blank=True) gnomad_sas_af = models.FloatField(null=True, blank=True) # filtering allele frequencies (new in gnomADv4) - faf95 = models.FloatField(null=True, blank=True) - faf99 = models.FloatField(null=True, blank=True) - fafmax_faf95_max = models.FloatField(null=True, blank=True) - fafmax_faf99_max = models.FloatField(null=True, blank=True) + gnomad_faf95 = models.FloatField(null=True, blank=True) + gnomad_faf99 = models.FloatField(null=True, blank=True) + gnomad_fafmax_faf95_max = models.FloatField(null=True, blank=True) + gnomad_fafmax_faf99_max = models.FloatField(null=True, blank=True) + gnomad_xy_af = models.FloatField(null=True, blank=True) + gnomad_xy_ac = models.IntegerField(null=True, blank=True) + gnomad_xy_an = models.IntegerField(null=True, blank=True) + gnomad_hemi_count = models.IntegerField(null=True, blank=True) # This is set from gnomad_xy_ac if gnomad_non_par gnomad_popmax_af = models.FloatField(null=True, blank=True) gnomad_popmax_ac = models.IntegerField(null=True, blank=True) gnomad_popmax_an = models.IntegerField(null=True, blank=True) gnomad_popmax_hom_alt = models.IntegerField(null=True, blank=True) - topmed_af = models.FloatField(null=True, blank=True) gnomad_filtered = models.BooleanField(null=True, blank=True) + gnomad_non_par = models.BooleanField(null=True, blank=True) # Not pseudoautosomal regions gnomad_popmax = models.CharField(max_length=3, choices=GnomADPopulation.choices, null=True, 
blank=True) # From https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4267638/ @@ -966,12 +977,6 @@ class VariantAnnotation(AbstractVariantAnnotation): spliceai_pred_ds_dl = models.FloatField(null=True, blank=True) spliceai_gene_symbol = models.TextField(null=True, blank=True) - alphamissense_class = models.CharField(max_length=1, choices=AlphaMissensePrediction.choices, null=True, blank=True) - alphamissense_pathogenicity = models.FloatField(null=True, blank=True) - - mavedb_score = models.FloatField(null=True, blank=True) - mavedb_urn = models.TextField(null=True, blank=True) - repeat_masker = models.TextField(null=True, blank=True) overlapping_symbols = models.TextField(null=True, blank=True) # Summary of most_damaging fields for faster DamageNode queries diff --git a/annotation/models/models_enums.py b/annotation/models/models_enums.py index d476c08ab..bda376d97 100644 --- a/annotation/models/models_enums.py +++ b/annotation/models/models_enums.py @@ -111,6 +111,7 @@ class ColumnAnnotationCategory(models.TextChoices): CONSERVATION = 'C', "Conservation" EXTERNAL_ID = 'E', "External ID" FREQUENCY_DATA = 'F', "Frequency Data" + FUNCTIONAL_EFFECT = 'f', "Functional Effect" GENE_ANNOTATIONS = 'G', 'Gene Annotations' HGVS = 'H', "HGVS" LITERATURE = 'L', 'Literature' diff --git a/annotation/vep_annotation.py b/annotation/vep_annotation.py index fd80fbd02..ee43f94b1 100644 --- a/annotation/vep_annotation.py +++ b/annotation/vep_annotation.py @@ -281,7 +281,8 @@ def vep_int_version(vep_string_version): kwargs["dbnsfp"] = 'n/a' # we use our own gnomAD custom annotation, not the default VEP one - if cvf := ColumnVEPField.objects.filter(variant_grid_column='gnomad_af', genome_build=genome_build).first(): + q_cvf = ColumnVEPField.get_columns_version_q(vep_config.columns_version) + if cvf := ColumnVEPField.objects.filter(q_cvf, variant_grid_column='gnomad_af', genome_build=genome_build).first(): try: # annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz # 
gnomad3.1_GRCh38_merged.vcf.bgz diff --git a/snpdb/migrations/0106_alter_allelemergelog_allele_linking_tool_and_more.py b/snpdb/migrations/0106_alter_allelemergelog_allele_linking_tool_and_more.py new file mode 100644 index 000000000..7a6e3f756 --- /dev/null +++ b/snpdb/migrations/0106_alter_allelemergelog_allele_linking_tool_and_more.py @@ -0,0 +1,28 @@ +# Generated by Django 4.2.2 on 2023-11-29 05:58 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('snpdb', '0105_alter_settingsoverride_default_genome_build_and_more'), + ] + + operations = [ + migrations.AlterField( + model_name='allelemergelog', + name='allele_linking_tool', + field=models.CharField(choices=[('SC', 'Identical Contig/Version'), ('CA', 'ClinGen Allele Registry'), ('DB', 'dbSNP API'), ('NR', 'NCBI Remap'), ('PC', 'Picard LiftoverVCF'), ('CM', 'CrossMap')], max_length=2), + ), + migrations.AlterField( + model_name='liftover', + name='conversion_tool', + field=models.CharField(choices=[('SC', 'Identical Contig/Version'), ('CA', 'ClinGen Allele Registry'), ('DB', 'dbSNP API'), ('NR', 'NCBI Remap'), ('PC', 'Picard LiftoverVCF'), ('CM', 'CrossMap')], max_length=2), + ), + migrations.AlterField( + model_name='variantallele', + name='allele_linking_tool', + field=models.CharField(choices=[('SC', 'Identical Contig/Version'), ('CA', 'ClinGen Allele Registry'), ('DB', 'dbSNP API'), ('NR', 'NCBI Remap'), ('PC', 'Picard LiftoverVCF'), ('CM', 'CrossMap')], max_length=2), + ), + ] diff --git a/snpdb/migrations/0107_new_vep_110_columns_v3.py b/snpdb/migrations/0107_new_vep_110_columns_v3.py new file mode 100644 index 000000000..d0d003f71 --- /dev/null +++ b/snpdb/migrations/0107_new_vep_110_columns_v3.py @@ -0,0 +1,120 @@ +# Generated by Django 4.2.2 on 2023-11-29 06:39 + +from django.db import migrations + +from library.django_utils import bulk_insert_class_data + + + +def _new_vep_110_annotation(apps, _schema_editor): + TRANSCRIPT_LEVEL = 'T' + 
VARIANT_LEVEL = 'V' + + raise ValueError("This is not complete yet!") + + + NEW_VARIANT_GRID_COLUMNS = [ + {'grid_column_name': 'gnomad_mid_af', + 'variant_column': 'variantannotation__gnomad_mid_af', + 'annotation_level': VARIANT_LEVEL, + 'width': None, + 'label': 'gnomAD MID AF', + 'description': "Allele Frequency (0-1) among Middle Eastern genotypes (exome+genome) (GRCh38/gnomad4 only)", + 'model_field': True, + 'queryset_field': True}, + {'grid_column_name': 'gnomad_faf95', + 'variant_column': 'variantannotation__gnomad_faf95', + 'annotation_level': VARIANT_LEVEL, + 'width': None, + 'label': 'gnomAD FAF95', + 'description': "Filtering allele frequency (using Poisson 95% CI) (max of exomes/genomes) (GRCh38/gnomad4 only)", + 'model_field': True, + 'queryset_field': True}, + {'grid_column_name': 'gnomad_faf99', + 'variant_column': 'variantannotation__gnomad_faf99', + 'annotation_level': VARIANT_LEVEL, + 'width': None, + 'label': 'gnomAD FAF99', + 'description': "Filtering allele frequency (using Poisson 99% CI) (max of exomes/genomes) (GRCh38/gnomad4 only)", + 'model_field': True, + 'queryset_field': True}, + {'grid_column_name': 'gnomad_fafmax_faf95_max', + 'variant_column': 'variantannotation__gnomad_fafmax_faf95_max', + 'annotation_level': VARIANT_LEVEL, + 'width': None, + 'label': 'gnomAD FAF95 Max', + 'description': "Maximum filtering allele frequency (using Poisson 95% CI) across genetic_ancestry groups (max of exomes/genomes) (GRCh38/gnomad4 only)", + 'model_field': True, + 'queryset_field': True}, + {'grid_column_name': 'gnomad_fafmax_faf99_max', + 'variant_column': 'variantannotation__gnomad_fafmax_faf99_max', + 'annotation_level': VARIANT_LEVEL, + 'width': None, + 'label': 'gnomAD FAF99 Max', + 'description': "Maximum filtering allele frequency (using Poisson 99% CI) across genetic_ancestry groups (max of exomes/genomes) (GRCh38/gnomad4 only)", + 'model_field': True, + 'queryset_field': True}, + + {'grid_column_name': 'gnomad_hemi_count', + 
'variant_column': 'variantannotation__gnomad_hemi_count', + 'annotation_level': VARIANT_LEVEL, + 'width': None, + 'label': 'gnomAD2 Hemizygous count', + 'description': "XY count (in non-PAR regions) - only on chrX", + 'model_field': True, + 'queryset_field': True}, + + # TODO: + # alphamissense_class + # alphamissense_pathogenicity + # mavedb_score + # mavedb_urn + + ] + + NEW_COLUMN_VCF_INFO = [ + {'info_id': 'GNOMAD3_AC', + 'column_id': 'gnomad_ac', + 'number': None, + 'type': 'F', + 'description': "gnomAD: Alternate Allele Count (GRCh38 only)"}, + {'info_id': 'GNOMAD3_AN', + 'column_id': 'gnomad_an', + 'number': None, + 'type': 'I', + 'description': "gnomAD: Total number of alleles (GRCh38 only)"}, + {'info_id': 'GNOMAD3_POPMAX_AC', + 'column_id': 'gnomad_popmax_ac', + 'number': None, + 'type': 'I', + 'description': "gnomAD: Allele count in the population with the maximum AF (GRCh38 only)"}, + {'info_id': 'GNOMAD3_POPMAX_AN', + 'column_id': 'gnomad_popmax_an', + 'number': None, + 'type': 'I', + 'description': "gnomAD: Total number of alleles in the population with the maximum AF (GRCh38 only)"}, + {'info_id': 'GNOMAD3_POPMAX_HOM_ALT', + 'column_id': 'gnomad_popmax_hom_alt', + 'number': None, + 'type': 'I', + 'description': "gnomAD: Count of homozygous individuals in the population with the maximum allele frequency (GRCh38 only)"}, + {'info_id': 'GNOMAD2_LIFTOVER_AF', + 'column_id': 'gnomad2_liftover_af', + 'number': None, + 'type': 'F', + 'description': "gnomAD: Allele Frequency from gnomAD2 liftover (GRCh38 only)"}, + ] + + bulk_insert_class_data(apps, "snpdb", [("VariantGridColumn", NEW_VARIANT_GRID_COLUMNS)]) + bulk_insert_class_data(apps, "snpdb", [("ColumnVCFInfo", NEW_COLUMN_VCF_INFO)]) + + +class Migration(migrations.Migration): + + dependencies = [ + ('snpdb', '0106_alter_allelemergelog_allele_linking_tool_and_more'), + ] + + operations = [ + migrations.RunPython(_new_vep_110_annotation) + ] diff --git a/snpdb/models/models_enums.py 
b/snpdb/models/models_enums.py index ec1d4c686..a51e2e8b9 100644 --- a/snpdb/models/models_enums.py +++ b/snpdb/models/models_enums.py @@ -138,7 +138,9 @@ class AlleleConversionTool(models.TextChoices): SAME_CONTIG = "SC", "Identical Contig/Version" CLINGEN_ALLELE_REGISTRY = 'CA', "ClinGen Allele Registry" DBSNP = 'DB', "dbSNP API" - NCBI_REMAP = 'NR', "NCBI Remap" + NCBI_REMAP = 'NR', "NCBI Remap" # This is obsolete as of November 2023 + PICARD = "PC", "Picard LiftoverVCF" + CROSSMAP = "CM", "CrossMap" @classmethod def vcf_tuples_in_destination_build(cls, conversion_tool): diff --git a/variantgrid/settings/components/default_settings.py b/variantgrid/settings/components/default_settings.py index db9986e9d..a60a34fd9 100644 --- a/variantgrid/settings/components/default_settings.py +++ b/variantgrid/settings/components/default_settings.py @@ -271,6 +271,7 @@ "dbscsnv": "annotation_data/GRCh38/dbscSNV1.1_GRCh38.txt.gz", "gnomad2": "annotation_data/GRCh38/gnomad2.1.1_GRCh38_combined_af.vcf.bgz", "gnomad3": "annotation_data/GRCh38/gnomad3.1_GRCh38_merged.vcf.bgz", + "gnomad4": "annotation_data/GRCh38/gnomad4.0_GRCh38_combined_af.vcf.bgz", "mastermind": "annotation_data/GRCh38/mastermind_cited_variants_reference-2022.04.02-grch38.vcf.gz", "mave": "annotation_data/GRCh38/MaveDB_variants.tsv.gz", "maxentscan": "annotation_data/all_builds/maxentscan", From ebd2d129ebd5b01eac5712e546c29b8b692928e7 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Wed, 29 Nov 2023 23:33:16 +1030 Subject: [PATCH 07/29] #938 - need to add version on AF script --- .../annotation_data/generate_annotation/gnomad_data.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/annotation/annotation_data/generate_annotation/gnomad_data.py b/annotation/annotation_data/generate_annotation/gnomad_data.py index aa9c55ce0..f024a3d78 100755 --- a/annotation/annotation_data/generate_annotation/gnomad_data.py +++ b/annotation/annotation_data/generate_annotation/gnomad_data.py @@ -79,7 
+79,12 @@ def get_args(): group.add_argument('--af', action='store_true', help="Calculate allele frequency from VCF") args = parser.parse_args() - if not args.scripts: + if args.scripts: + if args.genome_build is None: + parser.error("--genome-build required for --scripts") + if args.genome_build not in GENOME_BUILDS: + parser.error(f"--genome-build must be one of {', '.join(GENOME_BUILDS)}") + else: if args.gnomad_input_vcf is None: parser.error("--gnomad-input-vcf required for --af") if args.af_output_vcf is None: @@ -189,7 +194,7 @@ def write_scripts(args): # cs.write("source /home/a1059391/venv/dave_venv/bin/activate\n") script_filename = os.path.realpath(__file__) allele_frequency_vcf = f"{prefix}.af.vcf.gz" - cs.write(f"{script_filename} --af --gnomad-input-vcf={combined_vcf} --af-output-vcf={allele_frequency_vcf}\n") + cs.write(f"{script_filename} --af --gnomad-input-vcf={combined_vcf} --af-output-vcf={allele_frequency_vcf} --version={args.version}\n") af_vcfs.append(allele_frequency_vcf) # Write merge script From 541fb2ecd8c33c8dda20a2ca8eca73331a9231c1 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Thu, 30 Nov 2023 11:33:57 +1030 Subject: [PATCH 08/29] #938 - fix VCF header, get AF script to include all fields correctly --- .../generate_annotation/gnomad_data.py | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/annotation/annotation_data/generate_annotation/gnomad_data.py b/annotation/annotation_data/generate_annotation/gnomad_data.py index f024a3d78..2d461b990 100755 --- a/annotation/annotation_data/generate_annotation/gnomad_data.py +++ b/annotation/annotation_data/generate_annotation/gnomad_data.py @@ -248,11 +248,14 @@ def write_vcf_header(version, info_fields, popmax_fields, sub_pops): 'AC_XY': '##INFO=', 'AF_XY': '##INFO=', 'AN_XY': '##INFO=', + 'AC_male': '##INFO=', + 'AN_male': '##INFO=', + 'AF_male': '##INFO=', 'faf95': '##INFO=', 'faf99': '##INFO=', 'fafmax_faf95_max': '##INFO=', 'fafmax_faf99_max': 
'##INFO=', - 'AF_popmax': '##INFO="', + 'AF_popmax': '##INFO=', 'AC_popmax': '##INFO=', 'AN_popmax': '##INFO=', 'popmax': '##INFO=', @@ -277,8 +280,7 @@ def write_vcf_header(version, info_fields, popmax_fields, sub_pops): meta = """##fileformat=VCFv4.2 ##fileDate=%(file_date)s ##source=%(source)s -%(info_headers)s -""" % {"file_date": file_date, "source": source, "info_headers": info_headers} +%(info_headers)s""" % {"file_date": file_date, "source": source, "info_headers": info_headers} af_info = get_af_info(sub_pops) for info_id, pop_name, ac_name, an_name in af_info: @@ -303,7 +305,7 @@ def calculate_allele_frequency(version, gnomad_input_vcf, af_output_vcf): from cyvcf2 import VCF # Import here, so that rest of script can run on HPC easier - info_fields, _, popmax_fields, sub_pops = get_infos_for_version(version) + info_fields, chr_x_male, popmax_fields, sub_pops = get_infos_for_version(version) af_info = get_af_info(sub_pops) with gzip.open(af_output_vcf, "wt") as f: @@ -319,10 +321,10 @@ def calculate_allele_frequency(version, gnomad_input_vcf, af_output_vcf): an_popmax = 0 popmax = '.' infos = {} - for _, pop_name, ac_name, an_name in af_info: + for af_name, pop_name, ac_name, an_name in af_info: ac = variant.INFO.get(ac_name, 0) an = variant.INFO.get(an_name) - #print(f"{ac_name}/{an_name} {ac}/{an}") + # print(f"{pop_name=},{ac_name=},{an_name=} {ac=}/{an=}") if an: af = ac / an if pop_name and (pop_name not in BOTTLENECKED_SUB_POPS) and af > af_popmax: @@ -333,9 +335,12 @@ def calculate_allele_frequency(version, gnomad_input_vcf, af_output_vcf): af = f'{af:.6f}' else: af = '.' 
- infos["AF"] = af - for o in info_fields: + infos[af_name] = af + infos[ac_name] = ac + infos[an_name] = an + + for o in info_fields + chr_x_male: infos[o] = str(variant.INFO.get(o, '.')) gnomad_filtered = '0' if variant.FILTER is None else '1' infos["gnomad_filtered"] = gnomad_filtered From 787f408d2f9e06c81cd53d90ba7fa9ff69b7b781 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Thu, 30 Nov 2023 18:11:53 +1030 Subject: [PATCH 09/29] #850 - new VEP 110 fields --- .../migrations/0082_new_vep_110_columns_v3.py | 131 +++++++++++------- .../migrations/0107_new_vep_110_columns_v3.py | 85 ++++++++++-- 2 files changed, 155 insertions(+), 61 deletions(-) diff --git a/annotation/migrations/0082_new_vep_110_columns_v3.py b/annotation/migrations/0082_new_vep_110_columns_v3.py index c9b259034..278cdb5ae 100644 --- a/annotation/migrations/0082_new_vep_110_columns_v3.py +++ b/annotation/migrations/0082_new_vep_110_columns_v3.py @@ -16,114 +16,143 @@ def _new_vep_110_annotation(apps, _schema_editor): FUNCTIONAL_EFFECT = 'f' PATHOGENICITY_PREDICTIONS = 'P' - ColumnVEPField = apps.get_model("annotation", "ColumnVEPField") - GenomeBuild = apps.get_model("snpdb", "GenomeBuild") - - grch37 = GenomeBuild.objects.get(pk="GRCh37") - grch38 = GenomeBuild.objects.get(pk="GRCh38") # Make existing gnomAD3 have max column version of 2 ColumnVEPField.objects.filter(vep_custom=VEP_CUSTOM_GNOMAD_3).update(max_vep_columns_version=2) - raise ValueError("Still need to add new fields from gnomad2 stuff") - - COLUMN_VEP_FIELD = [ - # gnomAD 4 + # gnomAD 2.1 additional fields - issue #231 + {'column': 'gnomad2_ac', 'variant_grid_column_id': 'gnomad_ac', + 'genome_build_id': 'GRCh37', 'pipeline_type': 'S', 'category': 'F', 'source_field': 'AC', + 'source_field_processing_description': 'Sum of exome AC + genome AC', + 'vep_custom': 'g', 'source_field_has_custom_prefix': True}, + {'column': 'gnomad2_popmax_ac', 'variant_grid_column_id': 'gnomad_popmax_ac', + 'genome_build_id': 'GRCh37', 
'pipeline_type': 'S', 'category': 'F', 'source_field': 'AC_popmax', + 'source_field_processing_description': 'Sum of exome AC_popmax + genome AC_popmax', + 'vep_custom': 'g', 'source_field_has_custom_prefix': True}, + {'column': 'gnomad2_an', 'variant_grid_column_id': 'gnomad_an', + 'genome_build_id': 'GRCh37', 'pipeline_type': 'S', 'category': 'F', 'source_field': 'AN', + 'source_field_processing_description': 'Sum of exome AN + genome AN', + 'vep_custom': 'g', 'source_field_has_custom_prefix': True}, + {'column': 'gnomad2_popmax_an', 'variant_grid_column_id': 'gnomad_popmax_an', + 'genome_build_id': 'GRCh37', 'pipeline_type': 'S', 'category': 'F', 'source_field': 'AN_popmax', + 'source_field_processing_description': 'Sum of exome AN_popmax + genome AN_popmax', + 'vep_custom': 'g', 'source_field_has_custom_prefix': True}, + {'column': 'gnomad2_nonpar', 'variant_grid_column_id': 'gnomad_non_par', + 'genome_build_id': 'GRCh37', 'pipeline_type': 'S', 'category': 'F', 'source_field': 'nonpar', + 'source_field_processing_description': 'nonpar from genomes', + 'vep_custom': 'g', 'source_field_has_custom_prefix': True}, + + # gnomAD 4 - issue #938 {'column': 'gnomad4_ac', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, - 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_ac', 'source_field': 'AC', 'category': FREQUENCY_DATA}, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_ac', 'source_field': 'AC', + 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, {'column': 'gnomad4_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, - 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_af', 'source_field': 'AF', 'category': FREQUENCY_DATA}, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_af', 'source_field': 'AF', + 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, {'column': 'gnomad4_an', 'source_field_has_custom_prefix': True, 
'min_vep_columns_version': 3, - 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_an', 'source_field': 'AN', 'category': FREQUENCY_DATA}, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_an', 'source_field': 'AN', + 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, {'column': 'gnomad4_afr_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, - 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_afr_af', 'source_field': 'AF_afr', 'category': FREQUENCY_DATA}, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_afr_af', 'source_field': 'AF_afr', + 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, {'column': 'gnomad4_amr_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, - 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_amr_af', 'source_field': 'AF_amr', 'category': FREQUENCY_DATA}, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_amr_af', 'source_field': 'AF_amr', + 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, {'column': 'gnomad4_asj_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, - 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_asj_af', 'source_field': 'AF_asj', 'category': FREQUENCY_DATA}, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_asj_af', 'source_field': 'AF_asj', + 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, {'column': 'gnomad4_eas_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, - 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_eas_af', 'source_field': 'AF_eas', 'category': FREQUENCY_DATA}, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_eas_af', 'source_field': 'AF_eas', + 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, {'column': 'gnomad4_fin_af', 'source_field_has_custom_prefix': True, 
'min_vep_columns_version': 3, - 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_fin_af', 'source_field': 'AF_fin', 'category': FREQUENCY_DATA}, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_fin_af', 'source_field': 'AF_fin', + 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, {'column': 'gnomad4_mid_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, - 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_mid_af', 'source_field': 'AF_mid', 'category': FREQUENCY_DATA}, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_mid_af', 'source_field': 'AF_mid', + 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, {'column': 'gnomad4_nfe_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, - 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_nfe_af', 'source_field': 'AF_nfe', 'category': FREQUENCY_DATA}, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_nfe_af', 'source_field': 'AF_nfe', + 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, {'column': 'gnomad4_oth_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, - 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_oth_af', 'source_field': 'AF_remaining', 'category': FREQUENCY_DATA}, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_oth_af', 'source_field': 'AF_remaining', + 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, {'column': 'gnomad4_sas_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, - 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_sas_af', 'source_field': 'AF_sas', 'category': FREQUENCY_DATA}, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_sas_af', 'source_field': 'AF_sas', + 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, + {'column': 'gnomad4_nonpar', 'variant_grid_column_id': 
'gnomad_non_par', + 'genome_build_id': 'GRCh37', 'pipeline_type': 'S', 'category': 'F', 'source_field': 'non_par', + 'source_field_processing_description': 'nonpar from genomes', + 'vep_custom': 'g', 'source_field_has_custom_prefix': True}, {'column': 'gnomad4_filtered', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, - 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_filtered', 'source_field': 'gnomad_filtered', 'category': FREQUENCY_DATA}, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_filtered', + 'source_field': 'gnomad_filtered', 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, {'column': 'gnomad4_hom_alt', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, - 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_hom_alt', 'source_field': 'nhomalt', 'category': FREQUENCY_DATA}, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_hom_alt', 'source_field': 'nhomalt', + 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, {'column': 'gnomad4_popmax', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, - 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax', 'source_field': 'grpmax', 'category': FREQUENCY_DATA}, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax', 'source_field': 'grpmax', + 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, {'column': 'gnomad4_popmax_ac', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, - 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_ac', 'source_field': 'AC_grpmax', 'category': FREQUENCY_DATA}, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_ac', 'source_field': 'AC_grpmax', + 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, {'column': 'gnomad4_popmax_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, - 
'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_af', 'source_field': 'AF_grpmax', 'category': FREQUENCY_DATA}, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_af', 'source_field': 'AF_grpmax', + 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, {'column': 'gnomad4_popmax_an', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, - 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_an', 'source_field': 'AN_grpmax', 'category': FREQUENCY_DATA}, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_an', 'source_field': 'AN_grpmax', + 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, {'column': 'gnomad4_xy_ac', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_xy_ac', 'source_field': 'AC_XY', - 'category': FREQUENCY_DATA}, + 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, {'column': 'gnomad4_xy_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_xy_af', 'source_field': 'AF_XY', - 'category': FREQUENCY_DATA}, + 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, {'column': 'gnomad4_xy_an', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, - 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_an', 'source_field': 'AN_XY', - 'category': FREQUENCY_DATA}, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_xy_an', 'source_field': 'AN_XY', + 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, {'column': 'gnomad4_faf95', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_faf95', 'source_field': 'faf95', - 'category': FREQUENCY_DATA}, + 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, 
{'column': 'gnomad4_faf99', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_faf99', 'source_field': 'faf99', - 'category': FREQUENCY_DATA}, + 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, {'column': 'gnomad4_fafmax_faf95_max', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, - 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_fafmax_faf95_max', 'source_field': 'fafmax_faf95_max', - 'category': FREQUENCY_DATA}, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_fafmax_faf95_max', + 'source_field': 'fafmax_faf95_max', 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, {'column': 'gnomad4_fafmax_faf99_max', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, - 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_fafmax_faf99_max', 'source_field': 'fafmax_faf99_max', - 'category': FREQUENCY_DATA}, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_fafmax_faf99_max', + 'source_field': 'fafmax_faf99_max', 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, # I left this out don't think it really matters -# {'column': 'gnomad4_popmax_hom_alt', 'source_field_has_custom_prefix': True, -# 'vep_custom': GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_hom_alt', 'source_field': 'nhomalt_grpmax', 'category': FREQUENCY_DATA}, + # {'column': 'gnomad4_popmax_hom_alt', 'source_field_has_custom_prefix': True, + # 'vep_custom': GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_hom_alt', 'source_field': 'nhomalt_grpmax', 'category': FREQUENCY_DATA}, # MAVE {'column': 'mavedb_score', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, 'vep_plugin': VEP_PLUGIN_MAVEDB, 'variant_grid_column_id': 'mavedb_score', - 'source_field': 'score', - 'category': FUNCTIONAL_EFFECT}, + 'source_field': 'score', 'category': FUNCTIONAL_EFFECT, 
'genome_build_id': 'GRCh38'}, {'column': 'mavedb_urn', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, 'vep_plugin': VEP_PLUGIN_MAVEDB, 'variant_grid_column_id': 'mavedb_urn', - 'source_field': 'urn', - 'category': FUNCTIONAL_EFFECT}, + 'source_field': 'urn', 'category': FUNCTIONAL_EFFECT, 'genome_build_id': 'GRCh38'}, # AlphaMissense {'column': 'alphamissense_class', 'min_vep_columns_version': 3, 'vep_plugin': VEP_PLUGIN_ALPHAMISSENSE, 'variant_grid_column_id': 'alphamissense_class', - 'source_field': 'am_class', - 'category': PATHOGENICITY_PREDICTIONS}, + 'source_field': 'am_class', 'category': PATHOGENICITY_PREDICTIONS}, {'column': 'alphamissense_pathogenicity', 'min_vep_columns_version': 3, 'vep_plugin': VEP_PLUGIN_ALPHAMISSENSE, 'variant_grid_column_id': 'alphamissense_pathogenicity', - 'source_field': 'am_pathogenicity', - 'category': PATHOGENICITY_PREDICTIONS}, + 'source_field': 'am_pathogenicity', 'category': PATHOGENICITY_PREDICTIONS}, + ] bulk_insert_class_data(apps, "annotation", [("ColumnVEPField", COLUMN_VEP_FIELD)]) - ColumnVEPField.objects.filter(vep_custom=VEP_CUSTOM_GNOMAD_4).update(genome_build=grch38) - - class Migration(migrations.Migration): - dependencies = [ ('annotation', '0081_rename_faf95_variantannotation_gnomad_faf95_and_more'), ("snpdb", "0107_new_vep_110_columns_v3"), # Defines new columns diff --git a/snpdb/migrations/0107_new_vep_110_columns_v3.py b/snpdb/migrations/0107_new_vep_110_columns_v3.py index d0d003f71..b94c06283 100644 --- a/snpdb/migrations/0107_new_vep_110_columns_v3.py +++ b/snpdb/migrations/0107_new_vep_110_columns_v3.py @@ -13,15 +13,23 @@ def _new_vep_110_annotation(apps, _schema_editor): raise ValueError("This is not complete yet!") + NEW_VARIANT_GRID_COLUMNS = [ - {'grid_column_name': 'gnomad_mid_af', - 'variant_column': 'variantannotation__gnomad_mid_af', - 'annotation_level': VARIANT_LEVEL, + {'grid_column_name': 'alphamissense_class', + 'variant_column': 
'variantannotation__alphamissense_class', + 'annotation_level': TRANSCRIPT_LEVEL, 'width': None, - 'label': 'gnomAD MID AF', - 'description': "Allele Frequency (0-1) among Middle Eastern genotypes (exome+genome) (GRCh38/gnomad4 only)", + 'label': 'AlphaMissense Class', 'model_field': True, 'queryset_field': True}, + {'grid_column_name': 'alphamissense_pathogenicity', + 'variant_column': 'variantannotation__alphamissense_pathogenicity', + 'annotation_level': TRANSCRIPT_LEVEL, + 'width': None, + 'label': 'AlphaMissense Pathogenicity', + 'model_field': True, + 'queryset_field': True}, + {'grid_column_name': 'gnomad_faf95', 'variant_column': 'variantannotation__gnomad_faf95', 'annotation_level': VARIANT_LEVEL, @@ -55,6 +63,49 @@ def _new_vep_110_annotation(apps, _schema_editor): 'model_field': True, 'queryset_field': True}, + {'grid_column_name': 'gnomad_mid_af', + 'variant_column': 'variantannotation__gnomad_mid_af', + 'annotation_level': VARIANT_LEVEL, + 'width': None, + 'label': 'gnomAD MID AF', + 'description': "Allele Frequency (0-1) among Middle Eastern genotypes (exome+genome) (GRCh38/gnomad4 only)", + 'model_field': True, + 'queryset_field': True}, + + {'grid_column_name': 'gnomad_non_par', + 'variant_column': 'variantannotation__gnomad_non_par', + 'annotation_level': VARIANT_LEVEL, + 'width': None, + 'label': 'gnomAD non-PAR', + 'description': "non_par in genomes or exomes", + 'model_field': True, + 'queryset_field': True}, + + {'grid_column_name': 'gnomad_xy_ac', + 'variant_column': 'variantannotation__gnomad_xy_ac', + 'annotation_level': VARIANT_LEVEL, + 'width': None, + 'label': 'gnomAD XY AC', + 'description': "Allele Count in XY", + 'model_field': True, + 'queryset_field': True}, + {'grid_column_name': 'gnomad_xy_af', + 'variant_column': 'variantannotation__gnomad_xy_af', + 'annotation_level': VARIANT_LEVEL, + 'width': None, + 'label': 'gnomAD XY AF', + 'description': "Allele Frequency in XY", + 'model_field': True, + 'queryset_field': True}, + 
{'grid_column_name': 'gnomad_xy_an', + 'variant_column': 'variantannotation__gnomad_xy_an', + 'annotation_level': VARIANT_LEVEL, + 'width': None, + 'label': 'gnomAD XY AN', + 'description': "Allele Number in XY", + 'model_field': True, + 'queryset_field': True}, + {'grid_column_name': 'gnomad_hemi_count', 'variant_column': 'variantannotation__gnomad_hemi_count', 'annotation_level': VARIANT_LEVEL, @@ -64,12 +115,21 @@ def _new_vep_110_annotation(apps, _schema_editor): 'model_field': True, 'queryset_field': True}, - # TODO: - # alphamissense_class - # alphamissense_pathogenicity - # mavedb_score - # mavedb_urn + {'grid_column_name': 'mavedb_score', + 'variant_column': 'variantannotation__mavedb_score', + 'annotation_level': TRANSCRIPT_LEVEL, + 'width': None, + 'label': 'MAVEdb score', + 'model_field': True, + 'queryset_field': True}, + {'grid_column_name': 'mavedb_urn', + 'variant_column': 'variantannotation__mavedb_urn', + 'annotation_level': TRANSCRIPT_LEVEL, + 'width': None, + 'label': 'MAVEdb urn', + 'model_field': True, + 'queryset_field': True}, ] NEW_COLUMN_VCF_INFO = [ @@ -103,8 +163,13 @@ def _new_vep_110_annotation(apps, _schema_editor): 'number': None, 'type': 'F', 'description': "gnomAD: Allele Frequency from gnomAD2 liftover (GRCh38 only)"}, + + # TODO: + ] + raise ValueError("Huge amount of NEW_COLUMN_VCF_INFO to do") + bulk_insert_class_data(apps, "snpdb", [("VariantGridColumn", NEW_VARIANT_GRID_COLUMNS)]) bulk_insert_class_data(apps, "snpdb", [("ColumnVCFInfo", NEW_COLUMN_VCF_INFO)]) From ab2538d3d5b6a751370d17cd111c96b699d5dba5 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Fri, 1 Dec 2023 11:35:15 +1030 Subject: [PATCH 10/29] #850 - new VEP 110 fields --- .../migrations/0082_new_vep_110_columns_v3.py | 4 +- .../migrations/0107_new_vep_110_columns_v3.py | 101 +++++++++++------- 2 files changed, 67 insertions(+), 38 deletions(-) diff --git a/annotation/migrations/0082_new_vep_110_columns_v3.py b/annotation/migrations/0082_new_vep_110_columns_v3.py 
index 278cdb5ae..dd2fabce2 100644 --- a/annotation/migrations/0082_new_vep_110_columns_v3.py +++ b/annotation/migrations/0082_new_vep_110_columns_v3.py @@ -83,9 +83,9 @@ def _new_vep_110_annotation(apps, _schema_editor): 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, {'column': 'gnomad4_nonpar', 'variant_grid_column_id': 'gnomad_non_par', - 'genome_build_id': 'GRCh37', 'pipeline_type': 'S', 'category': 'F', 'source_field': 'non_par', + 'genome_build_id': 'GRCh38', 'pipeline_type': 'S', 'category': 'F', 'source_field': 'non_par', 'source_field_processing_description': 'nonpar from genomes', - 'vep_custom': 'g', 'source_field_has_custom_prefix': True}, + 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'source_field_has_custom_prefix': True}, {'column': 'gnomad4_filtered', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_filtered', 'source_field': 'gnomad_filtered', 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, diff --git a/snpdb/migrations/0107_new_vep_110_columns_v3.py b/snpdb/migrations/0107_new_vep_110_columns_v3.py index b94c06283..cadaccfb0 100644 --- a/snpdb/migrations/0107_new_vep_110_columns_v3.py +++ b/snpdb/migrations/0107_new_vep_110_columns_v3.py @@ -5,15 +5,10 @@ from library.django_utils import bulk_insert_class_data - def _new_vep_110_annotation(apps, _schema_editor): TRANSCRIPT_LEVEL = 'T' VARIANT_LEVEL = 'V' - raise ValueError("This is not complete yet!") - - - NEW_VARIANT_GRID_COLUMNS = [ {'grid_column_name': 'alphamissense_class', 'variant_column': 'variantannotation__alphamissense_class', @@ -133,49 +128,83 @@ def _new_vep_110_annotation(apps, _schema_editor): ] NEW_COLUMN_VCF_INFO = [ - {'info_id': 'GNOMAD3_AC', - 'column_id': 'gnomad_ac', - 'number': None, + {'info_id': 'ALPHAMISSENSE_class', + 'column_id': 'alphamissense_class', + 'number': 1, + 'type': 'S', + 'description': 'AlphaMissense pathogenicity prediction'}, + {'info_id': 
'ALPHAMISSENSE_pathogenicity', + 'column_id': 'alphamissense_pathogenicity', + 'number': 1, 'type': 'F', - 'description': "gnomAD: Alternate Allele Count (GRCh38 only)"}, - {'info_id': 'GNOMAD3_AN', - 'column_id': 'gnomad_an', - 'number': None, - 'type': 'I', - 'description': "gnomAD: Total number of alleles (GRCh38 only)"}, - {'info_id': 'GNOMAD3_POPMAX_AC', - 'column_id': 'gnomad_popmax_ac', - 'number': None, + 'description': 'AlphaMissense pathogenicity prediction score'}, + {'info_id': 'GNOMAD_faf95', + 'column_id': 'gnomad_faf95', + 'number': 1, + 'type': 'F', + 'description': 'Filtering allele frequency (using Poisson 95% CI) (max of exomes/genomes) (GRCh38/gnomad4 only)'}, + {'info_id': 'GNOMAD_faf99', + 'column_id': 'gnomad_faf99', + 'number': 1, + 'type': 'F', + 'description': 'Filtering allele frequency (using Poisson 99% CI) (max of exomes/genomes) (GRCh38/gnomad4 only)'}, + {'info_id': 'GNOMAD_fafmax_faf95_max', + 'column_id': 'gnomad_fafmax_faf95_max', + 'number': 1, + 'type': 'F', + 'description': 'Maximum filtering allele frequency (using Poisson 95% CI) across genetic_ancestry groups (max of exomes/genomes) (GRCh38/gnomad4 only)'}, + {'info_id': 'GNOMAD_fafmax_faf99_max', + 'column_id': 'gnomad_fafmax_faf99_max', + 'number': 1, + 'type': 'F', + 'description': 'Maximum filtering allele frequency (using Poisson 99% CI) across genetic_ancestry groups (max of exomes/genomes) (GRCh38/gnomad4 only)'}, + {'info_id': 'GNOMAD_AF_mid', + 'column_id': 'gnomad_mid_af', + 'number': 1, + 'type': 'F', + 'description': 'Allele Frequency (0-1) among Middle Eastern genotypes (exome+genome) (GRCh38/gnomad4 only)'}, + {'info_id': 'GNOMAD_non_par', + 'column_id': 'gnomad_non_par', + 'number': 1, + 'type': 'F', + 'description': 'non_par in genomes or exomes'}, + {'info_id': 'GNOMAD_AC_XY', + 'column_id': 'gnomad_xy_ac', + 'number': 1, 'type': 'I', - 'description': "gnomAD: Allele count in the population with the maximum AF (GRCh38 only)"}, - {'info_id': 
'GNOMAD3_POPMAX_AN', - 'column_id': 'gnomad_popmax_an', - 'number': None, + 'description': 'Allele Count in XY'}, + {'info_id': 'GNOMAD_AF_XY', + 'column_id': 'gnomad_xy_af', + 'number': 1, + 'type': 'F', + 'description': 'Allele Frequency in XY'}, + {'info_id': 'GNOMAD_AN_XY', + 'column_id': 'gnomad_xy_an', + 'number': 1, 'type': 'I', - 'description': "gnomAD: Total number of alleles in the population with the maximum AF (GRCh38 only)"}, - {'info_id': 'GNOMAD3_POPMAX_HOM_ALT', - 'column_id': 'gnomad_popmax_hom_alt', - 'number': None, + 'description': 'Allele Number in XY'}, + {'info_id': 'GNOMAD_HEMI_COUNT', + 'column_id': 'gnomad_hemi_count', + 'number': 1, 'type': 'I', - 'description': "gnomAD: Count of homozygous individuals in the population with the maximum allele frequency (GRCh38 only)"}, - {'info_id': 'GNOMAD2_LIFTOVER_AF', - 'column_id': 'gnomad2_liftover_af', - 'number': None, + 'description': 'XY count (in non-PAR regions) - only on chrX'}, + {'info_id': 'MaveDB_score', + 'column_id': 'mavedb_score', + 'number': 1, 'type': 'F', - 'description': "gnomAD: Allele Frequency from gnomAD2 liftover (GRCh38 only)"}, - - # TODO: - + 'description': 'MaveDB score - see MaveDB for interpretation of scores'}, + {'info_id': 'MaveDB_urn', + 'column_id': 'mavedb_urn', + 'number': 1, + 'type': 'S', + 'description': 'MaveDB database identifier'} ] - raise ValueError("Huge amount of NEW_COLUMN_VCF_INFO to do") - bulk_insert_class_data(apps, "snpdb", [("VariantGridColumn", NEW_VARIANT_GRID_COLUMNS)]) bulk_insert_class_data(apps, "snpdb", [("ColumnVCFInfo", NEW_COLUMN_VCF_INFO)]) class Migration(migrations.Migration): - dependencies = [ ('snpdb', '0106_alter_allelemergelog_allele_linking_tool_and_more'), ] From 8010053366e7aa5d02c0023ff2b5b1706814daad Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Fri, 1 Dec 2023 13:33:24 +1030 Subject: [PATCH 11/29] #850 - new VEP 110 fields --- annotation/management/commands/vep_version.py | 9 +- 
.../migrations/0082_new_vep_110_columns_v3.py | 8 +- annotation/models/damage_enums.py | 16 ++- annotation/models/models.py | 15 ++- ..._columns_version3_grch37.vep_annotated.vcf | 101 +++++++++++++++ ..._columns_version3_grch38.vep_annotated.vcf | 116 ++++++++++++++++++ 6 files changed, 252 insertions(+), 13 deletions(-) create mode 100644 annotation/tests/test_data/test_columns_version3_grch37.vep_annotated.vcf create mode 100644 annotation/tests/test_data/test_columns_version3_grch38.vep_annotated.vcf diff --git a/annotation/management/commands/vep_version.py b/annotation/management/commands/vep_version.py index 06d47f749..c916d7765 100644 --- a/annotation/management/commands/vep_version.py +++ b/annotation/management/commands/vep_version.py @@ -1,6 +1,6 @@ from django.core.management.base import BaseCommand -from annotation.vep_annotation import get_vep_version, VEPConfig +from annotation.vep_annotation import get_vep_version, VEPConfig, vep_dict_to_variant_annotation_version_kwargs from snpdb.models.models_genome import GenomeBuild @@ -14,4 +14,11 @@ def handle(self, *args, **options): genome_build = GenomeBuild.get_name_or_alias(build_name) vep_config = VEPConfig(genome_build) vep_version = get_vep_version(genome_build, vep_config.annotation_consortium) + print("*" * 40) + print("VEP kwargs:") print(vep_version) + + vav_kwargs = vep_dict_to_variant_annotation_version_kwargs(vep_config, vep_version) + print("*" * 40) + print("VariantAnnotationVersion kwargs:") + print(vav_kwargs) diff --git a/annotation/migrations/0082_new_vep_110_columns_v3.py b/annotation/migrations/0082_new_vep_110_columns_v3.py index dd2fabce2..877aaca63 100644 --- a/annotation/migrations/0082_new_vep_110_columns_v3.py +++ b/annotation/migrations/0082_new_vep_110_columns_v3.py @@ -133,12 +133,12 @@ def _new_vep_110_annotation(apps, _schema_editor): # 'vep_custom': GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_hom_alt', 'source_field': 'nhomalt_grpmax', 'category': FREQUENCY_DATA}, # 
MAVE - {'column': 'mavedb_score', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + {'column': 'mavedb_score', 'min_vep_columns_version': 3, 'vep_plugin': VEP_PLUGIN_MAVEDB, 'variant_grid_column_id': 'mavedb_score', - 'source_field': 'score', 'category': FUNCTIONAL_EFFECT, 'genome_build_id': 'GRCh38'}, - {'column': 'mavedb_urn', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3, + 'source_field': 'MaveDB_score', 'category': FUNCTIONAL_EFFECT, 'genome_build_id': 'GRCh38'}, + {'column': 'mavedb_urn', 'min_vep_columns_version': 3, 'vep_plugin': VEP_PLUGIN_MAVEDB, 'variant_grid_column_id': 'mavedb_urn', - 'source_field': 'urn', 'category': FUNCTIONAL_EFFECT, 'genome_build_id': 'GRCh38'}, + 'source_field': 'MaveDB_urn', 'category': FUNCTIONAL_EFFECT, 'genome_build_id': 'GRCh38'}, # AlphaMissense {'column': 'alphamissense_class', 'min_vep_columns_version': 3, diff --git a/annotation/models/damage_enums.py b/annotation/models/damage_enums.py index d366f0ce3..16c4aaa1e 100644 --- a/annotation/models/damage_enums.py +++ b/annotation/models/damage_enums.py @@ -156,8 +156,16 @@ class ALoFTPrediction(models.TextChoices): DOMINANT = "d", "Dominant" -class AlphaMissensePrediction(models.TextChoices): +class AlphaMissensePrediction(AbstractPathogenicity): """ @see https://asia.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#alphamissense """ - LIKELY_BENIGN = 'b', "likely_benign" - AMBIGUOUS = "a", "ambiguous" - LIKELY_PATHOGENIC = "p", "likely_pathogenic" + LIKELY_BENIGN = 'b' + AMBIGUOUS = "a" + LIKELY_PATHOGENIC = "p" + + CHOICES = [ + (LIKELY_BENIGN, 'likely_benign'), + (AMBIGUOUS, 'ambiguous'), + (LIKELY_PATHOGENIC, 'likely_pathogenic'), + ] + MINIMUM_FLAG_DAMAGE_LEVEL = LIKELY_PATHOGENIC + VARIANT_PATH = "variantannotation__alphamissense_class" diff --git a/annotation/models/models.py b/annotation/models/models.py index c9a308e3e..b99d3fa45 100644 --- a/annotation/models/models.py +++ b/annotation/models/models.py @@ -459,6 
+459,9 @@ class ColumnVEPField(models.Model): min_vep_columns_version = models.IntegerField(null=True) max_vep_columns_version = models.IntegerField(null=True) + def __str__(self) -> str: + return self.column + @property def vep_info_field(self): """ For VCFs, be sure to set source_field_has_custom_prefix=True @@ -467,7 +470,7 @@ def vep_info_field(self): We need to adjust for this in BulkVEPVCFAnnotationInserter """ vif = self.source_field - if self.source_field_has_custom_prefix: + if self.vep_custom and self.source_field_has_custom_prefix: vif = self.get_vep_custom_display() + "_" + vif return vif @@ -550,11 +553,15 @@ def get_pathogenic_prediction_funcs(self) -> Dict[str, Callable]: 'mutation_taster_pred_most_damaging': lambda d: d in MutationTasterPrediction.get_damage_or_greater_levels(), 'polyphen2_hvar_pred_most_damaging': lambda d: d in Polyphen2Prediction.get_damage_or_greater_levels(), } - elif self.columns_version == 2: + elif self.columns_version in (2, 3): pathogenic_rankscore = settings.ANNOTATION_MIN_PATHOGENIC_RANKSCORE pathogenic_prediction_columns = ['bayesdel_noaf_rankscore', 'cadd_raw_rankscore', 'clinpred_rankscore', 'revel_rankscore', 'metalr_rankscore', 'vest4_rankscore'] - return {c: lambda d: float(d) >= pathogenic_rankscore for c in pathogenic_prediction_columns} + pp_funcs = {c: lambda d: float(d) >= pathogenic_rankscore for c in pathogenic_prediction_columns} + if self.columns_version == 3: + pp_funcs["alphamissense_class"] = lambda d: d in AlphaMissensePrediction.get_damage_or_greater_levels() + return pp_funcs + raise ValueError(f"Don't know fields for {self.columns_version=}") @cached_property @@ -835,7 +842,7 @@ class AbstractVariantAnnotation(models.Model): splice_region = models.TextField(null=True, blank=True) symbol = models.TextField(null=True, blank=True) - alphamissense_class = models.CharField(max_length=1, choices=AlphaMissensePrediction.choices, null=True, blank=True) + alphamissense_class = models.CharField(max_length=1, 
choices=AlphaMissensePrediction.CHOICES, null=True, blank=True) alphamissense_pathogenicity = models.FloatField(null=True, blank=True) mavedb_score = models.FloatField(null=True, blank=True) diff --git a/annotation/tests/test_data/test_columns_version3_grch37.vep_annotated.vcf b/annotation/tests/test_data/test_columns_version3_grch37.vep_annotated.vcf new file mode 100644 index 000000000..9e647723a --- /dev/null +++ b/annotation/tests/test_data/test_columns_version3_grch37.vep_annotated.vcf @@ -0,0 +1,101 @@ +##fileformat=VCFv4.1 +##INFO= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##VEP="v110" time="2023-12-01 11:54:41" cache="/data/annotation/VEP/vep_cache/homo_sapiens_refseq/110_GRCh37" ensembl-funcgen=110.24e6da6 ensembl-variation=110.d34d25e ensembl-io=110.b1a0d57 ensembl=110.9eadbc2 1000genomes="phase3" COSMIC="92" ClinVar="202012" HGMD-PUBLIC="20204" assembly="GRCh37.p13" dbSNP="154" gencode="GENCODE 19" genebuild="2011-04" gnomADe="r2.1" polyphen="2.2.2" refseq="2020-10-26 17:03:42 - GCF_000001405.25_GRCh37.p13_genomic.gff" regbuild="1.0" sift="sift5.2.2" +##INFO= +##Grantham=Grantham Matrix score - Grantham, R. Amino Acid Difference Formula to Help Explain Protein Evolution, Science 1974 Sep 6;185(4154):862-4. +##SpliceRegion=SpliceRegion predictions +##NMD=Nonsense-mediated mRNA decay escaping variants prediction +##Mastermind_MMID3=Mastermind MMID3 variant identifier(s), as gene:key. Link to the Genomenon Mastermind Genomic Search Engine. +##Mastermind_counts=Mastermind number of citations in the medical literature. Output includes three unique counts: MMCNT1|MMCNT2|MMCNT3. 
MMCNT1 - Count of Mastermind articles with cDNA matches for this specific variant; MMCNT2 - Count of Mastermind articles with variants either explicitly matching at the cDNA level or given only at protein level; MMCNT3 - Count of Mastermind articles including other DNA-level variants resulting in the same amino acid change. +##MaxEntScan_alt=MaxEntScan alternate sequence score +##MaxEntScan_diff=MaxEntScan score difference +##MaxEntScan_ref=MaxEntScan reference sequence score +##Aloft_Confidence=Aloft_Confidence from dbNSFP file +##Aloft_pred=Aloft_pred from dbNSFP file +##Aloft_prob_Dominant=Aloft_prob_Dominant from dbNSFP file +##Aloft_prob_Recessive=Aloft_prob_Recessive from dbNSFP file +##Aloft_prob_Tolerant=Aloft_prob_Tolerant from dbNSFP file +##BayesDel_noAF_rankscore=BayesDel_noAF_rankscore from dbNSFP file +##CADD_raw_rankscore=CADD_raw_rankscore from dbNSFP file +##ClinPred_rankscore=ClinPred_rankscore from dbNSFP file +##Ensembl_transcriptid=Ensembl_transcriptid from dbNSFP file +##GERP++_RS=GERP++_RS from dbNSFP file +##Interpro_domain=Interpro_domain from dbNSFP file +##MetaLR_rankscore=MetaLR_rankscore from dbNSFP file +##REVEL_rankscore=REVEL_rankscore from dbNSFP file +##VEST4_rankscore=VEST4_rankscore from dbNSFP file +##ada_score=dbscSNV ADA score +##rf_score=dbscSNV RF score +##SpliceAI_pred_DP_AG=SpliceAI predicted effect on splicing. Delta position for acceptor gain +##SpliceAI_pred_DP_AL=SpliceAI predicted effect on splicing. Delta position for acceptor loss +##SpliceAI_pred_DP_DG=SpliceAI predicted effect on splicing. Delta position for donor gain +##SpliceAI_pred_DP_DL=SpliceAI predicted effect on splicing. Delta position for donor loss +##SpliceAI_pred_DS_AG=SpliceAI predicted effect on splicing. Delta score for acceptor gain +##SpliceAI_pred_DS_AL=SpliceAI predicted effect on splicing. Delta score for acceptor loss +##SpliceAI_pred_DS_DG=SpliceAI predicted effect on splicing. 
Delta score for donor gain +##SpliceAI_pred_DS_DL=SpliceAI predicted effect on splicing. Delta score for donor loss +##SpliceAI_pred_SYMBOL=SpliceAI gene symbol +##am_class=AlphaMissense pathogenicity prediction; column from /data/annotation/VEP/annotation_data/GRCh37/AlphaMissense_hg19.tsv.gz +##am_pathogenicity=AlphaMissense pathogenicity score; column from /data/annotation/VEP/annotation_data/GRCh37/AlphaMissense_hg19.tsv.gz +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##VEP-command-line='vep --af --assembly GRCh37 --biotype --cache --canonical --check_existing --compress_output gzip --custom [PATH]/CosmicCodingMuts_v95_20211101_grch37.normal.vcf.gz,COSMIC,vcf,exact,0,CNT,LEGACY_ID --database 0 --dir [PATH]/vep_cache --distance 5000 --domains --exclude_predicted --fasta [PATH]/GCF_000001405.25_GRCh37.p13_genomic.fna.gz --flag_pick --force_overwrite --hgvs --input_file [PATH]/test_grch37.vcf --no_escape --no_stats --numbers --offline --output_file [PATH]/test_grch37.vep_annotated.vcf.gz --plugin [PATH]/AlphaMissense_hg19.tsv.gz --protein --pubmed --refseq --sift b --symbol --transcript_version --use_given_ref --variant_class --vcf' +#CHROM POS ID REF ALT QUAL FILTER INFO +1 69098 . C G . . variant_id=13629760;CSQ=G|missense_variant|MODERATE|OR4F5|79501|Transcript|NM_001005484.2|protein_coding|3/3||NM_001005484.2:c.71C>G|NP_001005484.2:p.Thr24Ser|131|71|24|T/S|aCt/aGt|||1||1|SNV|EntrezGene||YES|NP_001005484.2||||tolerated(1)||||||||58||||||||.&.&|.&.&|.&.&|.&.&|.&.&|0.05653|0.12098|0.08831|ENST00000641515&ENST00000335137|2.31|.&.|0.01092|0.14661|0.07811|||-5|35|-5|17|0.00|0.15|0.27|0.08|OR4F5||||||||||||||||||||||0.0020000000949949|0.966000020503998|0.894999980926514|1.23199999332428|||||||| +1 69589 . G A . . 
variant_id=13629761;CSQ=A|missense_variant|MODERATE|OR4F5|79501|Transcript|NM_001005484.2|protein_coding|3/3||NM_001005484.2:c.562G>A|NP_001005484.2:p.Val188Ile|622|562|188|V/I|Gtc/Atc|COSV58736794||1||1|SNV|EntrezGene||YES|NP_001005484.2||||tolerated(0.41)|||||1|1||29|||OR4F5:V167I|0&1&1||||.&.&|.&.&|.&.&|.&.&|.&.&|0.01817|0.11649|0.18729|ENST00000641515&ENST00000335137|0.138|GPCR&_rhodopsin-like&_7TM&GPCR&_rhodopsin-like&_7TM|0.00039|0.11576|0.05287|||29|-46|-12|-1|0.00|0.00|0.00|0.11|OR4F5||||||||||||||||||||||0|0.814000010490417|-1.23099994659424|-0.108000002801418||||||COSV58736794|1|COSM6847255 +13 95839002 . C T . . variant_id=13629762;CSQ=T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001105515.3|protein_coding|11/21||NM_001105515.3:c.1498G>A|NP_001098985.1:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001098985.1||||tolerated(0.11)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|benign|0.1101|rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001301829.2|protein_coding|11/30||NM_001301829.2:c.1498G>A|NP_001288758.1:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001288758.1||||tolerated(0.06)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.
&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|benign|0.1101|rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001301830.2|protein_coding|10/20||NM_001301830.2:c.1273G>A|NP_001288759.1:p.Glu425Lys|1410|1273|425|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001288759.1||||tolerated(0.11)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|||rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_005845.5|protein_coding|11/31||NM_005845.5:c.1498G>A|NP_005836.2:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1||1|SNV|EntrezGene||YES|NP_005836.2||||tolerated(0.06)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&
.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|benign|0.1101|rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224|1|COSM7286401 +15 32928050 . C T . . variant_id=13638004;CSQ=T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_001286479.3|protein_coding|11/12||NM_001286479.3:c.850C>T|NP_001273408.1:p.Arg284Ter|1398|850|284|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_001273408.1|||||||||0&1|0&1||||||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|||rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_001286480.3|protein_coding|12/13||NM_001286480.3:c.850C>T|NP_001273409.1:p.Arg284Ter|1506|850|284|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_001273409.1|||||||||0&1|0&1||||||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|
ARHGAP11A|||rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|intron_variant|MODIFIER|ARHGAP11A-SCG5|114118903|Transcript|NM_001368319.1|protein_coding||9/13|NM_001368319.1:c.1235+2741C>T|||||||rs776172390&COSV64380835||1|||SNV|EntrezGene||YES|NP_001355248.1|||||||||0&1|0&1|||||||||||||||||||||||||||||||||||||rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_014783.6|protein_coding|11/12||NM_014783.6:c.1417C>T|NP_055598.1:p.Arg473Ter|2125|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1||1|SNV|EntrezGene||YES|NP_055598.1|||||||||0&1|0&1||||||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|||rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_199357.3|protein_coding|11/11||NM_199357.3:c.1417C>T|NP_955389.1:p.Arg473Ter|2125|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_955389.1|||||||||0&1|0&1||||NMD_escaping_variant||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.
&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|||rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114 diff --git a/annotation/tests/test_data/test_columns_version3_grch38.vep_annotated.vcf b/annotation/tests/test_data/test_columns_version3_grch38.vep_annotated.vcf new file mode 100644 index 000000000..b74781756 --- /dev/null +++ b/annotation/tests/test_data/test_columns_version3_grch38.vep_annotated.vcf @@ -0,0 +1,116 @@ +##fileformat=VCFv4.1 +##INFO= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##VEP="v110" time="2023-12-01 11:54:29" cache="/data/annotation/VEP/vep_cache/homo_sapiens_refseq/110_GRCh38" ensembl=110.9eadbc2 ensembl-io=110.b1a0d57 ensembl-funcgen=110.24e6da6 ensembl-variation=110.d34d25e 1000genomes="phase3" COSMIC="97" ClinVar="202301" HGMD-PUBLIC="20204" assembly="GRCh38.p14" dbSNP="154" gencode="GENCODE 44" genebuild="2014-07" gnomADe="r2.1.1" gnomADg="v3.1.2" polyphen="2.2.3" refseq="110 - GCF_000001405.40_GRCh38.p14_genomic.gff" regbuild="1.0" sift="6.2.1" +##INFO= +##Grantham=Grantham Matrix score - Grantham, R. Amino Acid Difference Formula to Help Explain Protein Evolution, Science 1974 Sep 6;185(4154):862-4. +##SpliceRegion=SpliceRegion predictions +##NMD=Nonsense-mediated mRNA decay escaping variants prediction +##Mastermind_MMID3=Mastermind MMID3 variant identifier(s), as gene:key. Link to the Genomenon Mastermind Genomic Search Engine. +##Mastermind_counts=Mastermind number of citations in the medical literature. Output includes three unique counts: MMCNT1|MMCNT2|MMCNT3. 
MMCNT1 - Count of Mastermind articles with cDNA matches for this specific variant; MMCNT2 - Count of Mastermind articles with variants either explicitly matching at the cDNA level or given only at protein level; MMCNT3 - Count of Mastermind articles including other DNA-level variants resulting in the same amino acid change. +##MaxEntScan_alt=MaxEntScan alternate sequence score +##MaxEntScan_diff=MaxEntScan score difference +##MaxEntScan_ref=MaxEntScan reference sequence score +##Aloft_Confidence=Aloft_Confidence from dbNSFP file +##Aloft_pred=Aloft_pred from dbNSFP file +##Aloft_prob_Dominant=Aloft_prob_Dominant from dbNSFP file +##Aloft_prob_Recessive=Aloft_prob_Recessive from dbNSFP file +##Aloft_prob_Tolerant=Aloft_prob_Tolerant from dbNSFP file +##BayesDel_noAF_rankscore=BayesDel_noAF_rankscore from dbNSFP file +##CADD_raw_rankscore=CADD_raw_rankscore from dbNSFP file +##ClinPred_rankscore=ClinPred_rankscore from dbNSFP file +##Ensembl_transcriptid=Ensembl_transcriptid from dbNSFP file +##GERP++_RS=GERP++_RS from dbNSFP file +##Interpro_domain=Interpro_domain from dbNSFP file +##MetaLR_rankscore=MetaLR_rankscore from dbNSFP file +##REVEL_rankscore=REVEL_rankscore from dbNSFP file +##VEST4_rankscore=VEST4_rankscore from dbNSFP file +##ada_score=dbscSNV ADA score +##rf_score=dbscSNV RF score +##SpliceAI_pred_DP_AG=SpliceAI predicted effect on splicing. Delta position for acceptor gain +##SpliceAI_pred_DP_AL=SpliceAI predicted effect on splicing. Delta position for acceptor loss +##SpliceAI_pred_DP_DG=SpliceAI predicted effect on splicing. Delta position for donor gain +##SpliceAI_pred_DP_DL=SpliceAI predicted effect on splicing. Delta position for donor loss +##SpliceAI_pred_DS_AG=SpliceAI predicted effect on splicing. Delta score for acceptor gain +##SpliceAI_pred_DS_AL=SpliceAI predicted effect on splicing. Delta score for acceptor loss +##SpliceAI_pred_DS_DG=SpliceAI predicted effect on splicing. 
Delta score for donor gain +##SpliceAI_pred_DS_DL=SpliceAI predicted effect on splicing. Delta score for donor loss +##SpliceAI_pred_SYMBOL=SpliceAI gene symbol +##am_class=AlphaMissense pathogenicity prediction; column from /data/annotation/VEP/annotation_data/GRCh38/AlphaMissense_hg38.tsv.gz +##am_pathogenicity=AlphaMissense pathogenicity score; column from /data/annotation/VEP/annotation_data/GRCh38/AlphaMissense_hg38.tsv.gz +##MaveDB_nt=MaveDB HGVS (nucleotide); column from MaveDB_variants.tsv.gz +##MaveDB_pro=MaveDB HGVS (protein); column from MaveDB_variants.tsv.gz +##MaveDB_score=MaveDB score - see MaveDB for interpretation of scores; column from MaveDB_variants.tsv.gz +##MaveDB_urn=MaveDB database identifier; column from MaveDB_variants.tsv.gz +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##VEP-command-line='vep --af --assembly GRCh38 --biotype --cache --canonical --check_existing --compress_output gzip --custom [PATH]/CosmicCodingMuts_v95_20211101_grch38.normal.vcf.gz,COSMIC,vcf,exact,0,CNT,LEGACY_ID --database 0 --dir [PATH]/vep_cache --distance 5000 --domains --exclude_predicted --fasta [PATH]/GCF_000001405.39_GRCh38.p13_genomic.fna.gz --flag_pick --force_overwrite --hgvs --input_file [PATH]/test_grch38.vcf --no_escape --no_stats --numbers --offline --output_file [PATH]/test_grch38.vep_annotated.vcf.gz --plugin [PATH]/MaveDB_variants.tsv.gz,single_aminoacid_changes=0,transcript_match=0 --protein --pubmed --refseq --sift b --symbol --transcript_version --use_given_ref --variant_class --vcf' +#CHROM POS ID REF ALT QUAL FILTER INFO +1 69113 . T A . . 
variant_id=131165;CSQ=A|missense_variant|MODERATE|OR4F5|79501|Transcript|NM_001005484.2|protein_coding|3/3||NM_001005484.2:c.86T>A|NP_001005484.2:p.Leu29Gln|146|86|29|L/Q|cTg/cAg|||1||1|SNV|EntrezGene||YES|NP_001005484.2||||deleterious(0.02)||||||||113||||||||.&.&|.&.&|.&.&|.&.&|.&.&|0.18177|0.37495|0.36595|ENST00000641515&ENST00000335137|2.31|.&.|0.00994|0.27654|0.49146|||2|20|-5|20|0.05|0.17|0.01|0.04|OR4F5|||||||||NC_000001.11:69113-69113|1|1||0.000003|0.000000|0.000000|0.000000|0.000000|0.000000|5.681366254956992e-06|0.000000|0.000006|0.000000|0.000000||306558|176014||0.0|0.0|.|.|1|nfe|0|.|0.00100000004749745|0.967999994754791|1.01499998569489|0.986999988555908|||||||| +1 1020216 . CG GT . . variant_id=131166;CSQ=GT|missense_variant|MODERATE|AGRN|375790|Transcript|NM_001305275.2|protein_coding|1/39||NM_001305275.2:c.44_45delinsGT|NP_001292204.1:p.Pro15Arg|97-98|44-45|15|P/R|cCG/cGT|rs1553170743||1|||substitution|EntrezGene|||NP_001292204.1||||tolerated_low_confidence(0.08)||||uncertain_significance||1||103||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||1&0.619000017642975|0.651000022888184&0.649999976158142|1.41199994087219&0.852999985218048|0.0419999994337559&0.894999980926514||||||||,GT|missense_variant|MODERATE|AGRN|375790|Transcript|NM_198576.4|protein_coding|1/36||NM_198576.4:c.44_45delinsGT|NP_940978.2:p.Pro15Arg|97-98|44-45|15|P/R|cCG/cGT|rs1553170743||1||1|substitution|EntrezGene||YES|NP_940978.2||||tolerated_low_confidence(0.08)||||uncertain_significance||1||103||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||1&0.619000017642975|0.651000022888184&0.649999976158142|1.41199994087219&0.852999985218048|0.0419999994337559&0.894999980926514|||||||| +1 68440906 . C T . . 
variant_id=131167;CSQ=T|missense_variant|MODERATE|RPE65|6121|Transcript|NM_000329.3|protein_coding|6/14||NM_000329.3:c.590G>A|NP_000320.1:p.Gly197Glu|639|590|197|G/E|gGa/gAa|COSV52017509||-1||1|SNV|EntrezGene||YES|NP_000320.1||||deleterious(0)|||||1|1||98||||||||.&|.&|.&|.&|.&|0.93672|0.69004|0.89503|ENST00000262340|5.43||0.97965|0.97022|0.95374|||-35|43|13|-49|0.00|0.00|0.01|0.00|RPE65|likely_pathogenic|0.9466||||||||||||||||||||||||||||||||||1|0.999000012874603|5.75400018692017|1.02600002288818||||||COSV52017509|2|COSM5924206 +13 95186748 . C T . . variant_id=24601;CSQ=T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001105515.3|protein_coding|11/21||NM_001105515.3:c.1498G>A|NP_001098985.1:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001098985.1||||tolerated_low_confidence(0.07)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|likely_benign|0.1101|||||13:95186748-95186748|0.000145|rs145886106|109|87||0.000068|0.001160|0.000133|0.000000|0.000000|0.000000|0.0011598453539528064|0.000000|0.000005|0.000128|0.000000||1614060|75010||0.0002782499941531569|0.00025161998928524554|0.0009378800168633461|0.0008432099712081254|0|afr|0|.|0.86599999666214|0.990000009536743|1.43400001525879|1.02600002288818||rs145886106|0.000414118|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001301829.2|protein_coding|11/30||NM_001301829.2:c.1498G>A|NP_001288758.1:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001288758.1||||tolerated_low_
confidence(0.08)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|likely_benign|0.1101|||||13:95186748-95186748|0.000145|rs145886106|109|87||0.000068|0.001160|0.000133|0.000000|0.000000|0.000000|0.0011598453539528064|0.000000|0.000005|0.000128|0.000000||1614060|75010||0.0002782499941531569|0.00025161998928524554|0.0009378800168633461|0.0008432099712081254|0|afr|0|.|0.86599999666214|0.990000009536743|1.43400001525879|1.02600002288818||rs145886106|0.000414118|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001301830.2|protein_coding|10/20||NM_001301830.2:c.1273G>A|NP_001288759.1:p.Glu425Lys|1410|1273|425|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001288759.1||||tolerated_low_confidence(0.08)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|||||||13:95186748-95186748|0.000145|rs145886106|109|87||0.000068|0.001160|0.000133|0.000000|0.000000|0.000000|0.0011598453539528064|0.000000|0.000005|0.000128|0.000000||1614060|75010||0.0002782499941531569|0.00025161998928524554|0.0009378800168633461|0.0008432099712081254|0|afr|0|.|0.86599999666214|0.990000009536743|1
.43400001525879|1.02600002288818||rs145886106|0.000414118|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_005845.5|protein_coding|11/31||NM_005845.5:c.1498G>A|NP_005836.2:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1||1|SNV|EntrezGene||YES|NP_005836.2||||tolerated_low_confidence(0.07)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|likely_benign|0.1101|||||13:95186748-95186748|0.000145|rs145886106|109|87||0.000068|0.001160|0.000133|0.000000|0.000000|0.000000|0.0011598453539528064|0.000000|0.000005|0.000128|0.000000||1614060|75010||0.0002782499941531569|0.00025161998928524554|0.0009378800168633461|0.0008432099712081254|0|afr|0|.|0.86599999666214|0.990000009536743|1.43400001525879|1.02600002288818||rs145886106|0.000414118|||COSV65320224|1|COSM7286401 +15 32635849 . C T . . 
variant_id=42;CSQ=T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_001286479.3|protein_coding|11/12||NM_001286479.3:c.850C>T|NP_001273408.1:p.Arg284Ter|1398|850|284|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_001273408.1|||||||||0&1|0&1||||||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|||||||15:32635849-32635849|1.6e-05|rs776172390|13|2||0.000008|0.000027|0.000000|0.000000|0.000000|0.000000|2.6797438164911436e-05|0.000000|0.000008|0.000016|0.000000||1609462|74634||5.259999852569308e-06|2.530000074330019e-06|8.029999662539922e-06|3.000000106112566e-06|0|afr|0|.|0.00899999961256981|0.839999973773956|0.433999985456467|0.282000005245209||rs776172390|3.18550e-05|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_001286480.3|protein_coding|12/13||NM_001286480.3:c.850C>T|NP_001273409.1:p.Arg284Ter|1506|850|284|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_001273409.1|||||||||0&1|0&1||||||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|||||||15:32635849-32635849|1.6e-05|rs776172390|13|2||0.000008|0.000027|0.000000|0.000000|0.000000|0.000000|2.6797438164911436e-05|0.000000|0.000008|0.000016|0.000000||1609462|74634||5.259999852569308e-06|2.530000074330019e-06|8.029999662539922e-06|3.000000106112566e-06|0|afr|0|.|0.00899999961256981|0.839999973773956|0.433999985456467|0.282000005245209||rs776172390|3.18550e-05|||COSV64380835|3|COSM185114,T|intron_variant|MODIFIER|ARHGAP11A-SCG5|114118903|Transcript|NM_001368319.1|protein_coding||9/13|NM_001368319.1:c.1235+2741C>T|||||||rs776172390&COSV64380835||1|||SNV|EntrezGene
||YES|NP_001355248.1|||||||||0&1|0&1|||||||||||||||||||||||||||||||||||||||||15:32635849-32635849|1.6e-05|rs776172390|13|2||0.000008|0.000027|0.000000|0.000000|0.000000|0.000000|2.6797438164911436e-05|0.000000|0.000008|0.000016|0.000000||1609462|74634||5.259999852569308e-06|2.530000074330019e-06|8.029999662539922e-06|3.000000106112566e-06|0|afr|0|.|0.00899999961256981|0.839999973773956|0.433999985456467|0.282000005245209||rs776172390|3.18550e-05|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_014783.6|protein_coding|11/12||NM_014783.6:c.1417C>T|NP_055598.1:p.Arg473Ter|2125|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1||1|SNV|EntrezGene||YES|NP_055598.1|||||||||0&1|0&1||||||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|||||||15:32635849-32635849|1.6e-05|rs776172390|13|2||0.000008|0.000027|0.000000|0.000000|0.000000|0.000000|2.6797438164911436e-05|0.000000|0.000008|0.000016|0.000000||1609462|74634||5.259999852569308e-06|2.530000074330019e-06|8.029999662539922e-06|3.000000106112566e-06|0|afr|0|.|0.00899999961256981|0.839999973773956|0.433999985456467|0.282000005245209||rs776172390|3.18550e-05|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_199357.3|protein_coding|11/11||NM_199357.3:c.1417C>T|NP_955389.1:p.Arg473Ter|2125|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_955389.1|||||||||0&1|0&1||||NMD_escaping_variant||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|||||||15:32635849-32635849|1.6e-05|rs776172390|13|2||0.000008|0.000027|0.000000|0.000000|0.000000|0.000000|2.679743816491
1436e-05|0.000000|0.000008|0.000016|0.000000||1609462|74634||5.259999852569308e-06|2.530000074330019e-06|8.029999662539922e-06|3.000000106112566e-06|0|afr|0|.|0.00899999961256981|0.839999973773956|0.433999985456467|0.282000005245209||rs776172390|3.18550e-05|||COSV64380835|3|COSM185114 From b396b8caf7aab562d66dc3f1334f97faa8013a21 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Fri, 1 Dec 2023 13:50:54 +1030 Subject: [PATCH 12/29] Use new gnomADv2 fields for 37 test generation --- ..._columns_version2_grch37.vep_annotated.vcf | 95 ++++++++++--------- 1 file changed, 52 insertions(+), 43 deletions(-) diff --git a/annotation/tests/test_data/test_columns_version2_grch37.vep_annotated.vcf b/annotation/tests/test_data/test_columns_version2_grch37.vep_annotated.vcf index b87b50674..8a8253d6c 100644 --- a/annotation/tests/test_data/test_columns_version2_grch37.vep_annotated.vcf +++ b/annotation/tests/test_data/test_columns_version2_grch37.vep_annotated.vcf @@ -25,31 +25,30 @@ ##contig= ##contig= ##contig= -##VEP="v106" time="2022-05-19 12:29:43" cache="/data/annotation/VEP/vep_cache/homo_sapiens/106_GRCh37" ensembl-variation=106.2aa7a5d ensembl=106.f4b50c6 ensembl-io=106.6eafdaa ensembl-funcgen=106.027e023 1000genomes="phase3" COSMIC="92" ClinVar="202012" HGMD-PUBLIC="20204" assembly="GRCh37.p13" dbSNP="154" gencode="GENCODE 19" genebuild="2011-04" gnomAD="r2.1" polyphen="2.2.2" regbuild="1.0" sift="sift5.2.2" -##INFO= +##VEP="v110" time="2023-12-01 13:33:59" cache="/data/annotation/VEP/vep_cache/homo_sapiens_refseq/110_GRCh37" ensembl-io=110.b1a0d57 ensembl-variation=110.d34d25e ensembl-funcgen=110.24e6da6 ensembl=110.9eadbc2 1000genomes="phase3" COSMIC="92" ClinVar="202012" HGMD-PUBLIC="20204" assembly="GRCh37.p13" dbSNP="154" gencode="GENCODE 19" genebuild="2011-04" gnomADe="r2.1" polyphen="2.2.2" refseq="2020-10-26 17:03:42 - GCF_000001405.25_GRCh37.p13_genomic.gff" regbuild="1.0" sift="sift5.2.2" +##INFO= ##Grantham=Grantham Matrix score - Grantham, R. 
Amino Acid Difference Formula to Help Explain Protein Evolution, Science 1974 Sep 6;185(4154):862-4. ##SpliceRegion=SpliceRegion predictions -##LoFtool=LoFtool score for gene ##NMD=Nonsense-mediated mRNA decay escaping variants prediction ##Mastermind_MMID3=Mastermind MMID3 variant identifier(s), as gene:key. Link to the Genomenon Mastermind Genomic Search Engine. ##Mastermind_counts=Mastermind number of citations in the medical literature. Output includes three unique counts: MMCNT1|MMCNT2|MMCNT3. MMCNT1 - Count of Mastermind articles with cDNA matches for this specific variant; MMCNT2 - Count of Mastermind articles with variants either explicitly matching at the cDNA level or given only at protein level; MMCNT3 - Count of Mastermind articles including other DNA-level variants resulting in the same amino acid change. ##MaxEntScan_alt=MaxEntScan alternate sequence score ##MaxEntScan_diff=MaxEntScan score difference ##MaxEntScan_ref=MaxEntScan reference sequence score -##Aloft_Confidence=(from dbNSFP) Confidence level of Aloft_pred; values can be "High Confidence" (p < 0.05) or "Low Confidence" (p > 0.05) multiple values separated by ";", corresponding to Ensembl_proteinid. -##Aloft_pred=(from dbNSFP) final classification predicted by ALoFT; values can be Tolerant, Recessive or Dominant multiple values separated by ";", corresponding to Ensembl_proteinid. -##Aloft_prob_Dominant=(from dbNSFP) Probability of the SNP being classified as dominant disease-causing by ALoFT multiple values separated by ";", corresponding to Ensembl_proteinid. -##Aloft_prob_Recessive=(from dbNSFP) Probability of the SNP being classified as recessive disease-causing by ALoFT multiple values separated by ";", corresponding to Ensembl_proteinid. -##Aloft_prob_Tolerant=(from dbNSFP) Probability of the SNP being classified as benign by ALoFT multiple values separated by ";", corresponding to Ensembl_proteinid. 
-##BayesDel_noAF_rankscore=(from dbNSFP) BayesDel_noAF scores were ranked among all BayesDel_noAF scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of BayesDel_noAF scores in dbNSFP. -##CADD_raw_rankscore=(from dbNSFP) CADD raw scores were ranked among all CADD raw scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of CADD raw scores in dbNSFP. Please note the following copyright statement for CADD: "CADD scores (http://cadd.gs.washington.edu/) are Copyright 2013 University of Washington and Hudson-Alpha Institute for Biotechnology (all rights reserved) but are freely available for all academic, non-commercial applications. For commercial licensing information contact Jennifer McCullar (mccullaj@uw.edu)." -##ClinPred_rankscore=(from dbNSFP) ClinPred scores were ranked among all ClinPred scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of ClinPred scores in dbNSFP. -##Ensembl_transcriptid=(from dbNSFP) Ensembl transcript ids (Multiple entries separated by ";") -##GERP++_RS=(from dbNSFP) GERP++ RS score, the larger the score, the more conserved the site. Scores range from -12.3 to 6.17. -##Interpro_domain=(from dbNSFP) domain or conserved site on which the variant locates. Domain annotations come from Interpro database. The number in the brackets following a specific domain is the count of times Interpro assigns the variant position to that domain, typically coming from different predicting databases. Multiple entries separated by ";". -##MetaLR_rankscore=(from dbNSFP) MetaLR scores were ranked among all MetaLR scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of MetaLR scores in dbNSFP. The scores range from 0 to 1. -##REVEL_rankscore=(from dbNSFP) REVEL scores were ranked among all REVEL scores in dbNSFP. 
The rankscore is the ratio of the rank of the score over the total number of REVEL scores in dbNSFP. -##VEST4_rankscore=(from dbNSFP) VEST4 scores were ranked among all VEST4 scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of VEST4 scores in dbNSFP. In case there are multiple scores for the same variant, the largest score (most damaging) is presented. The scores range from 0 to 1. Please note VEST score is free for non-commercial use. For more details please refer to http://wiki.chasmsoftware.org/index.php/SoftwareLicense. Commercial users should contact the Johns Hopkins Technology Transfer office. +##Aloft_Confidence=Aloft_Confidence from dbNSFP file +##Aloft_pred=Aloft_pred from dbNSFP file +##Aloft_prob_Dominant=Aloft_prob_Dominant from dbNSFP file +##Aloft_prob_Recessive=Aloft_prob_Recessive from dbNSFP file +##Aloft_prob_Tolerant=Aloft_prob_Tolerant from dbNSFP file +##BayesDel_noAF_rankscore=BayesDel_noAF_rankscore from dbNSFP file +##CADD_raw_rankscore=CADD_raw_rankscore from dbNSFP file +##ClinPred_rankscore=ClinPred_rankscore from dbNSFP file +##Ensembl_transcriptid=Ensembl_transcriptid from dbNSFP file +##GERP++_RS=GERP++_RS from dbNSFP file +##Interpro_domain=Interpro_domain from dbNSFP file +##MetaLR_rankscore=MetaLR_rankscore from dbNSFP file +##REVEL_rankscore=REVEL_rankscore from dbNSFP file +##VEST4_rankscore=VEST4_rankscore from dbNSFP file ##ada_score=dbscSNV ADA score ##rf_score=dbscSNV RF score ##SpliceAI_pred_DP_AG=SpliceAI predicted effect on splicing. Delta position for acceptor gain @@ -61,30 +60,40 @@ ##SpliceAI_pred_DS_DG=SpliceAI predicted effect on splicing. Delta score for donor gain ##SpliceAI_pred_DS_DL=SpliceAI predicted effect on splicing. 
Delta score for donor loss ##SpliceAI_pred_SYMBOL=SpliceAI gene symbol -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##VEP-command-line='vep --af --assembly GRCh37 --biotype --cache --canonical --check_existing --compress_output gzip --custom [PATH]/CosmicCodingMuts_v95_20211101_grch37.normal.vcf.gz,COSMIC,vcf,exact,0,CNT,LEGACY_ID --database 0 --dir [PATH]/vep_cache --distance 5000 --domains --exclude_predicted --fasta [PATH]/GCF_000001405.25_GRCh37.p13_genomic.fna.gz --flag_pick --force_overwrite --hgvs --input_file [PATH]/test_grch37.vcf --no_escape --no_stats --numbers --offline --output_file [PATH]/test_grch37.vep_annotated.vcf.gz --plugin [PATH]/spliceai_scores.raw.indel.hg19.vcf.gz --protein --pubmed --refseq --sift b --symbol --transcript_version --use_given_ref --variant_class --vcf' #CHROM POS ID REF ALT QUAL FILTER INFO -1 69098 . C G . . variant_id=13629760;CSQ=G|missense_variant|MODERATE|OR4F5|ENSG00000186092|Transcript|ENST00000335137.3|protein_coding|1/1||ENST00000335137.3:c.8C>G|ENSP00000334393.3:p.Thr3Ser|8|8|3|T/S|aCt/aGt|||1||1|SNV|HGNC|14825|YES|ENSP00000334393||tolerated(1)|PANTHER:PTHR26451&PANTHER:PTHR26451:SF72&Gene3D:1.20.1070.10&Superfamily:SSF81321|||||||58|||||||||.&.&|.&.&|.&.&|.&.&|.&.&|0.05653|0.12098|0.08831|ENST00000641515&ENST00000335137|2.31|.&.|0.01092|0.14661|0.07811|||-5|35|-5|17|0.00|0.15|0.27|0.08|OR4F5|||||||||||||||||||||| -1 69589 . G A . . 
variant_id=13629761;CSQ=A|missense_variant|MODERATE|OR4F5|ENSG00000186092|Transcript|ENST00000335137.3|protein_coding|1/1||ENST00000335137.3:c.499G>A|ENSP00000334393.3:p.Val167Ile|499|499|167|V/I|Gtc/Atc|COSV58736794||1||1|SNV|HGNC|14825|YES|ENSP00000334393||tolerated(0.44)|PANTHER:PTHR26451&PANTHER:PTHR26451:SF72&Gene3D:1.20.1070.10&Pfam:PF13853&Superfamily:SSF81321&PROSITE_profiles:PS50262||||1|1||29||||OR4F5:V167I|0&1&1||||.&.&|.&.&|.&.&|.&.&|.&.&|0.01817|0.11649|0.18729|ENST00000641515&ENST00000335137|0.138|GPCR&_rhodopsin-like&_7TM&GPCR&_rhodopsin-like&_7TM|0.00039|0.11576|0.05287|||||||||||||||||||||||||||||||COSV58736794|1|COSM6847255 -13 95839002 . C T . . variant_id=13629762;CSQ=T|missense_variant|MODERATE|ABCC4|ENSG00000125257|Transcript|ENST00000376887.4|protein_coding|11/31||ENST00000376887.4:c.1498G>A|ENSP00000366084.4:p.Glu500Lys|1613|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1||1|SNV|HGNC|55|YES|ENSP00000366084||tolerated(0.06)|Superfamily:SSF52540&SMART:SM00382&Pfam:PF00005&Gene3D:3.40.50.300&PANTHER:PTHR24223&PANTHER:PTHR24223:SF205&PROSITE_profiles:PS50893||0.0004||0&1|0&1||56||0.0441||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662||||||||||||rs145886106|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|0|0|afr||rs145886106|0.000927261|||COSV65320224&COSV65320224&COSV65320224&COSV65320224&COSV65320224|1&1&1&1&1|COSM7286401&COSM7286401&COSM7286401&COSM7286401&COSM7286401,T|missense_variant|MODERATE|ABCC4|ENSG00000125257|Transcript|ENST00000412704.1|protein_coding|11/30||ENST00000412704.1:c.1498G>A|ENSP00000388657.1:p.Glu5
00Lys|1617|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|HGNC|55||ENSP00000388657||tolerated(0.06)|PROSITE_profiles:PS50893&PANTHER:PTHR24223:SF205&PANTHER:PTHR24223&Pfam:PF00005&Gene3D:3.40.50.300&SMART:SM00382&Superfamily:SSF52540||0.0004||0&1|0&1||56||0.0441||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662||||||||||||rs145886106|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|0|0|afr||rs145886106|0.000927261|||COSV65320224&COSV65320224&COSV65320224&COSV65320224&COSV65320224|1&1&1&1&1|COSM7286401&COSM7286401&COSM7286401&COSM7286401&COSM7286401,T|missense_variant|MODERATE|ABCC4|ENSG00000125257|Transcript|ENST00000431522.1|protein_coding|11/21||ENST00000431522.1:c.1498G>A|ENSP00000398562.1:p.Glu500Lys|1617|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|HGNC|55||ENSP00000398562||tolerated(0.11)|Superfamily:SSF52540&SMART:SM00382&Pfam:PF00005&Gene3D:3.40.50.300&PANTHER:PTHR24223:SF205&PANTHER:PTHR24223&PROSITE_profiles:PS50893||0.0004||0&1|0&1||56||0.0441||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662||||||||||||rs145886106|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|0|0|afr||rs145886106|0.00092726
1|||COSV65320224&COSV65320224&COSV65320224&COSV65320224&COSV65320224|1&1&1&1&1|COSM7286401&COSM7286401&COSM7286401&COSM7286401&COSM7286401,T|missense_variant|MODERATE|ABCC4|ENSG00000125257|Transcript|ENST00000536256.1|protein_coding|10/20||ENST00000536256.1:c.1273G>A|ENSP00000442024.1:p.Glu425Lys|1392|1273|425|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|HGNC|55||ENSP00000442024||tolerated(0.11)|PROSITE_profiles:PS50893&PANTHER:PTHR24223&PANTHER:PTHR24223:SF205&Pfam:PF00005&Gene3D:3.40.50.300&SMART:SM00382&Superfamily:SSF52540||0.0004||0&1|0&1||56||0.0441||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662||||||||||||rs145886106|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|0|0|afr||rs145886106|0.000927261|||COSV65320224&COSV65320224&COSV65320224&COSV65320224&COSV65320224|1&1&1&1&1|COSM7286401&COSM7286401&COSM7286401&COSM7286401&COSM7286401,T|3_prime_UTR_variant|MODIFIER|ABCC4|ENSG00000125257|Transcript|ENST00000538287.1|protein_coding|13/17||ENST00000538287.1:c.*1689G>A||1785|||||rs145886106&COSV65320224||-1|||SNV|HGNC|55||ENSP00000440160|||||0.0004||0&1|0&1||||0.0441||ABCC4:E500K|0&1&1|||||||||||||||||||||||||||||rs145886106|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|0|0|afr||rs145886106|0.000927261|||COSV65320224&COSV65320224&COSV65320224&COSV65320224&COSV65320224|1&1&1&1&1|COSM7286401&COSM7286401&COSM7286401&COSM7286401&COSM7286401 -15 32928050 . C T . . 
variant_id=13638004;CSQ=T|stop_gained|HIGH|ARHGAP11A|ENSG00000198826|Transcript|ENST00000361627.3|protein_coding|11/12||ENST00000361627.3:c.1417C>T|ENSP00000355090.3:p.Arg473Ter|2139|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1||1|SNV|HGNC|15783|YES|ENSP00000355090|||PANTHER:PTHR15670&PANTHER:PTHR15670:SF3||||0&1|0&1||||0.426|||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043||||||||||||rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas||rs776172390|3.4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114,T|stop_gained|HIGH|ARHGAP11A|ENSG00000198826|Transcript|ENST00000543522.1|protein_coding|12/13||ENST00000543522.1:c.850C>T|ENSP00000440073.1:p.Arg284Ter|1439|850|284|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|HGNC|15783||ENSP00000440073|||PANTHER:PTHR15670&PANTHER:PTHR15670:SF3||||0&1|0&1||||0.426|||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043||||||||||||rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas||rs776172390|3.4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114,T|stop_gained|HIGH|ARHGAP11A|ENSG00000198826|Transcript|ENST00000562481.1|protein_coding|5/6||ENST00000562481.1:c.469C>T|ENSP00000455593.1:p.Arg157Ter|468|469|157|R/*|Cga/Tga|rs776172390&COSV64380835||1|cds_start_NF&cds_end_NF||SNV|HGNC|15783||ENSP00000455593|||PANTHER:PTHR15670&PANTHER:PTHR15670:SF3||||0&1|0&1||||0.426|||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0
.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043||||||||||||rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas||rs776172390|3.4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114,T|stop_gained|HIGH|ARHGAP11A|ENSG00000198826|Transcript|ENST00000563864.1|protein_coding|11/11||ENST00000563864.1:c.1333C>T|ENSP00000456078.1:p.Arg445Ter|1980|1333|445|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|HGNC|15783||ENSP00000456078|||PANTHER:PTHR15670&PANTHER:PTHR15670:SF3||||0&1|0&1||||0.426|NMD_escaping_variant||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043||||||||||||rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas||rs776172390|3.4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114,T|non_coding_transcript_exon_variant|MODIFIER|ARHGAP11A|ENSG00000198826|Transcript|ENST00000564918.1|retained_intron|3/4||ENST00000564918.1:n.466C>T||466|||||rs776172390&COSV64380835||1|||SNV|HGNC|15783|||||||||0&1|0&1||||0.426||||||||||||||||||||||||||||||||rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas||rs776172390|3.4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114,T|stop_gained|HIGH|ARHGAP11A|ENSG00000198826|Transcript|ENST00000565905.1|protein_coding|11/12||ENST00000565905.1:c.850C>T|ENSP00000455754.1:p.Arg284Ter|1398|850|284|R/*|Cga/Tga
|rs776172390&COSV64380835||1|||SNV|HGNC|15783||ENSP00000455754|||PANTHER:PTHR15670:SF3&PANTHER:PTHR15670||||0&1|0&1||||0.426|||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043||||||||||||rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas||rs776172390|3.4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114,T|stop_gained|HIGH|ARHGAP11A|ENSG00000198826|Transcript|ENST00000567348.1|protein_coding|11/11||ENST00000567348.1:c.1417C>T|ENSP00000454575.1:p.Arg473Ter|2090|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|HGNC|15783||ENSP00000454575|||PANTHER:PTHR15670&PANTHER:PTHR15670:SF3||||0&1|0&1||||0.426|NMD_escaping_variant||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043||||||||||||rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas||rs776172390|3.4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114 +1 69098 . C G . . variant_id=13629760;CSQ=G|missense_variant|MODERATE|OR4F5|79501|Transcript|NM_001005484.2|protein_coding|3/3||NM_001005484.2:c.71C>G|NP_001005484.2:p.Thr24Ser|131|71|24|T/S|aCt/aGt|||1||1|SNV|EntrezGene||YES|NP_001005484.2||||tolerated(1)||||||||58||||||||.&.&|.&.&|.&.&|.&.&|.&.&|0.05653|0.12098|0.08831|ENST00000641515&ENST00000335137|2.31|.&.|0.01092|0.14661|0.07811|||-5|35|-5|17|0.00|0.15|0.27|0.08|OR4F5||||||||||||||||||||0.0020000000949949|0.966000020503998|0.894999980926514|1.23199999332428|||||||| +1 69589 . G A . . 
variant_id=13629761;CSQ=A|missense_variant|MODERATE|OR4F5|79501|Transcript|NM_001005484.2|protein_coding|3/3||NM_001005484.2:c.562G>A|NP_001005484.2:p.Val188Ile|622|562|188|V/I|Gtc/Atc|COSV58736794||1||1|SNV|EntrezGene||YES|NP_001005484.2||||tolerated(0.41)|||||1|1||29|||OR4F5:V167I|0&1&1||||.&.&|.&.&|.&.&|.&.&|.&.&|0.01817|0.11649|0.18729|ENST00000641515&ENST00000335137|0.138|GPCR&_rhodopsin-like&_7TM&GPCR&_rhodopsin-like&_7TM|0.00039|0.11576|0.05287|||29|-46|-12|-1|0.00|0.00|0.00|0.11|OR4F5||||||||||||||||||||0|0.814000010490417|-1.23099994659424|-0.108000002801418||||||COSV58736794|1|COSM6847255 +13 95839002 . C T . . variant_id=13629762;CSQ=T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001105515.3|protein_coding|11/21||NM_001105515.3:c.1498G>A|NP_001098985.1:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001098985.1||||tolerated(0.11)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001301829.2|protein_coding|11/30||NM_001301829.2:c.1498G>A|NP_001288758.1:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001288758.1||||tolerated(0.06)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&
|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001301830.2|protein_coding|10/20||NM_001301830.2:c.1273G>A|NP_001288759.1:p.Glu425Lys|1410|1273|425|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001288759.1||||tolerated(0.11)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_005845.5|protein_coding|11/31||NM_005845.5:c.1498G>A|NP_005836.2:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1||1|SNV|EntrezGene||YES|NP_005836.2||||tolerated(0.06)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.
&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224|1|COSM7286401 +15 32928050 . C T . . variant_id=13638004;CSQ=T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_001286479.3|protein_coding|11/12||NM_001286479.3:c.850C>T|NP_001273408.1:p.Arg284Ter|1398|850|284|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_001273408.1|||||||||0&1|0&1||||||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_001286480.3|protein_coding|12/13||NM_001286480.3:c.850C>T|NP_001273409.1:p.Arg284Ter|1506|850|284|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_001273409.1|||||||||0&1|0&1||||||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|4|1|0.000016|0.000000|0.00
0000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|intron_variant|MODIFIER|ARHGAP11A-SCG5|114118903|Transcript|NM_001368319.1|protein_coding||9/13|NM_001368319.1:c.1235+2741C>T|||||||rs776172390&COSV64380835||1|||SNV|EntrezGene||YES|NP_001355248.1|||||||||0&1|0&1|||||||||||||||||||||||||||||||||||rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_014783.6|protein_coding|11/12||NM_014783.6:c.1417C>T|NP_055598.1:p.Arg473Ter|2125|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1||1|SNV|EntrezGene||YES|NP_055598.1|||||||||0&1|0&1||||||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_199357.3|protein_coding|11/11||NM_199357.3:c.1417C>T|NP_955389.1:p.Arg473Ter|2125|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_955389.1|||||||||0&1|0&1||||NMD_escaping_variant||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|A
RHGAP11A|rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114 From bb9a2059d43482ed187e062f972d1b9da7def791 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Fri, 1 Dec 2023 13:51:06 +1030 Subject: [PATCH 13/29] #850 - new VEP 110 fields - alpha missense formatter --- .../vcf_files/bulk_vep_vcf_annotation_inserter.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py b/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py index e8f7580f3..446f10646 100644 --- a/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py +++ b/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py @@ -10,7 +10,7 @@ from annotation.models.damage_enums import SIFTPrediction, FATHMMPrediction, \ MutationAssessorPrediction, MutationTasterPrediction, Polyphen2Prediction, \ - PathogenicityImpact, ALoFTPrediction + PathogenicityImpact, ALoFTPrediction, AlphaMissensePrediction from annotation.models.models import ColumnVEPField, VariantAnnotation, \ VariantTranscriptAnnotation, VariantAnnotationVersion, VariantGeneOverlap from annotation.models.models_enums import VariantClass, VariantAnnotationPipelineType @@ -156,6 +156,7 @@ def _add_vep_field_handlers(self): "aloft_pred": get_choice_formatter_func(ALoFTPrediction.choices, empty_values=["."]), "aloft_high_confidence": format_aloft_high_confidence, "aloft_ensembl_transcript": format_empty_as_none, + "alphamissense_class": get_format_alphamissense_class_func(), "canonical": format_canonical, "cosmic_count": format_pick_highest_int, "cosmic_id": extract_cosmic, @@ -598,6 +599,16 @@ def format_vep_sift_to_choice(vep_sift): return SIFTPrediction.TOLERATED raise ValueError(f"Unknown SIFT value: '{vep_sift}'") +def 
get_format_alphamissense_class_func(): + """ GRCh37 has 'benign' while GRCh38 has 'likely_benign' + @see https://github.com/Ensembl/VEP_plugins/issues/668 + """ + cff = get_choice_formatter_func(AlphaMissensePrediction.CHOICES) + def _format_alphamissense_class(alphamissense_class): + if alphamissense_class == "benign": + alphamissense_class = "likely_benign" + return cff(alphamissense_class) + return _format_alphamissense_class def get_extract_existing_variation(prefix): def format_vep_existing_variation(vep_existing_variation): From b8cdbcddd701da152e5f7f54dd49adeed111ca2c Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Fri, 1 Dec 2023 15:19:45 +1030 Subject: [PATCH 14/29] #850 - unit tests --- analysis/tests/test_urls.py | 2 +- annotation/fake_annotation.py | 2 +- annotation/management/commands/vep_run.py | 5 +- .../migrations/0082_new_vep_110_columns_v3.py | 2 +- annotation/tests/test_annotation_vcf.py | 55 ++++++++++- ..._columns_version1_grch37.vep_annotated.vcf | 92 +++++++++++++++++++ ...columns_version1_grch38.vep_annotated.vcf} | 0 .../test_data/test_grch37.vep_annotated.vcf | 88 ------------------ .../bulk_vep_vcf_annotation_inserter.py | 12 ++- 9 files changed, 160 insertions(+), 98 deletions(-) create mode 100644 annotation/tests/test_data/test_columns_version1_grch37.vep_annotated.vcf rename annotation/tests/test_data/{test_grch38.vep_annotated.vcf => test_columns_version1_grch38.vep_annotated.vcf} (100%) delete mode 100644 annotation/tests/test_data/test_grch37.vep_annotated.vcf diff --git a/analysis/tests/test_urls.py b/analysis/tests/test_urls.py index e8af0d6c8..0f609db41 100644 --- a/analysis/tests/test_urls.py +++ b/analysis/tests/test_urls.py @@ -69,7 +69,7 @@ def setUpTestData(cls): father=father_cs, father_affected=False, proband=proband_cs) - vcf_filename = os.path.join(settings.BASE_DIR, "annotation/tests/test_data/test_grch37.vep_annotated.vcf") + vcf_filename = os.path.join(settings.BASE_DIR, 
"annotation/tests/test_data/test_columns_version1_grch37.vep_annotated.vcf") slowly_create_loci_and_variants_for_vcf(grch37, vcf_filename, get_variant_id_from_info=True) variant = Variant.objects.filter(Variant.get_no_reference_q()).first() CohortGenotype.objects.create(collection=collection, diff --git a/annotation/fake_annotation.py b/annotation/fake_annotation.py index de3438b66..3bca22a1b 100644 --- a/annotation/fake_annotation.py +++ b/annotation/fake_annotation.py @@ -87,7 +87,7 @@ def get_fake_annotation_version(genome_build: GenomeBuild): def create_fake_variants(genome_build: GenomeBuild): build_lc = genome_build.name.lower() - vcf_filename = os.path.join(settings.BASE_DIR, f"annotation/tests/test_data/test_{build_lc}.vep_annotated.vcf") + vcf_filename = os.path.join(settings.BASE_DIR, f"annotation/tests/test_data/test_columns_version1_{build_lc}.vep_annotated.vcf") slowly_create_loci_and_variants_for_vcf(genome_build, vcf_filename, get_variant_id_from_info=True) diff --git a/annotation/management/commands/vep_run.py b/annotation/management/commands/vep_run.py index e93ee9927..684332464 100644 --- a/annotation/management/commands/vep_run.py +++ b/annotation/management/commands/vep_run.py @@ -10,7 +10,7 @@ from django.core.management.base import BaseCommand from annotation.models import VariantAnnotationPipelineType -from annotation.vep_annotation import run_vep +from annotation.vep_annotation import run_vep, VEPConfig from snpdb.models.models_genome import GenomeBuild DO_SMALL = False @@ -28,6 +28,8 @@ def handle(self, *args, **options): cnv = options["cnv"] build_name = options["genome_build"] genome_build = GenomeBuild.get_name_or_alias(build_name) + vc = VEPConfig(genome_build) + if test: print("Re-generating VCF for unit test") @@ -36,6 +38,7 @@ def handle(self, *args, **options): unit_test_dir = os.path.join(settings.BASE_DIR, "annotation/tests/test_data") vcf_filename = os.path.join(unit_test_dir, f"{base_name}.vcf") output_dir = unit_test_dir + 
base_name = f"test_columns_version_{vc.columns_version}_{genome_build.name.lower()}" else: vep_suffix = f"vep_annotated_{genome_build.name}" output_dir = settings.ANNOTATION_VCF_DUMP_DIR diff --git a/annotation/migrations/0082_new_vep_110_columns_v3.py b/annotation/migrations/0082_new_vep_110_columns_v3.py index 877aaca63..3a87573ae 100644 --- a/annotation/migrations/0082_new_vep_110_columns_v3.py +++ b/annotation/migrations/0082_new_vep_110_columns_v3.py @@ -82,7 +82,7 @@ def _new_vep_110_annotation(apps, _schema_editor): 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_sas_af', 'source_field': 'AF_sas', 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'}, - {'column': 'gnomad4_nonpar', 'variant_grid_column_id': 'gnomad_non_par', + {'column': 'gnomad4_nonpar', 'variant_grid_column_id': 'gnomad_non_par', 'min_vep_columns_version': 3, 'genome_build_id': 'GRCh38', 'pipeline_type': 'S', 'category': 'F', 'source_field': 'non_par', 'source_field_processing_description': 'nonpar from genomes', 'vep_custom': VEP_CUSTOM_GNOMAD_4, 'source_field_has_custom_prefix': True}, diff --git a/annotation/tests/test_annotation_vcf.py b/annotation/tests/test_annotation_vcf.py index bf7fd6a33..2f97767fc 100644 --- a/annotation/tests/test_annotation_vcf.py +++ b/annotation/tests/test_annotation_vcf.py @@ -48,8 +48,8 @@ ANNOTATION=ANNOTATION_COLUMNS_V1) class TestAnnotationVCF(TestCase): TEST_DATA_DIR = os.path.join(settings.BASE_DIR, "annotation/tests/test_data") - TEST_ANNOTATION_VCF_GRCH37 = os.path.join(TEST_DATA_DIR, "test_grch37.vep_annotated.vcf") - TEST_ANNOTATION_VCF_GRCH38 = os.path.join(TEST_DATA_DIR, "test_grch38.vep_annotated.vcf") + TEST_ANNOTATION_VCF_GRCH37 = os.path.join(TEST_DATA_DIR, "test_columns_version1_grch37.vep_annotated.vcf") + TEST_ANNOTATION_VCF_GRCH38 = os.path.join(TEST_DATA_DIR, "test_columns_version1_grch38.vep_annotated.vcf") @classmethod def setUpTestData(cls): @@ -128,6 +128,10 @@ def _test_extra_grch37(self): 
self.assertEqual(va.predictions_num_pathogenic, 1) self.assertEqual(va.predictions_num_benign, 0) + def _test_24601_gnomad_grch38(self, va): + # This is from gnomAD v3 + self.assertAlmostEqual(va.gnomad_af, 0.000354913) + def test_import_variant_annotations_grch38(self): genome_build = GenomeBuild.get_name_or_alias('GRCh38') vav = self.variant_annotation_versions_by_build[genome_build.name] @@ -147,7 +151,7 @@ def test_import_variant_annotations_grch38(self): self.assertEqual(va.impact, PathogenicityImpact.MODERATE) self.assertEqual(va.dbsnp_rs_id, "rs145886106") self.assertEqual(va.cosmic_legacy_id, "COSM7286401") # Test it has collapsed dupes - self.assertAlmostEqual(va.gnomad_af, 0.000354913) + self._test_24601_gnomad_grch38(va) self.assertEqual(va.gnomad_filtered, False) # Test it converted FILTER properly to bool va = VariantAnnotation.objects.get(variant_id=42) @@ -230,6 +234,51 @@ def _test_extra_grch38(self): self.assertTrue(vta.nmd_escaping_variant) +ANNOTATION_COLUMNS_V3 = copy.deepcopy(TEST_ANNOTATION) +ANNOTATION_COLUMNS_V3[settings.BUILD_GRCH37]["columns_version"] = 3 +ANNOTATION_COLUMNS_V3[settings.BUILD_GRCH38]["columns_version"] = 3 + + +@override_settings(IMPORT_PROCESSING_DIR=TEST_IMPORT_PROCESSING_DIR, + VARIANT_ZYGOSITY_GLOBAL_COLLECTION="global", + ANNOTATION_VEP_FAKE_VERSION=True, + ANNOTATION=ANNOTATION_COLUMNS_V3) +class TestAnnotationVCF3(TestAnnotationVCF): + TEST_DATA_DIR = os.path.join(settings.BASE_DIR, "annotation/tests/test_data") + TEST_ANNOTATION_VCF_GRCH37 = os.path.join(TEST_DATA_DIR, "test_columns_version3_grch37.vep_annotated.vcf") + TEST_ANNOTATION_VCF_GRCH38 = os.path.join(TEST_DATA_DIR, "test_columns_version3_grch38.vep_annotated.vcf") + + def _test_extra_grch37(self): + # This is testing columns_version 2 + pass + + def _test_24601_gnomad_grch38(self, va): + """ gnomAD v4 """ + # AF total copied from https://gnomad.broadinstitute.org/variant/13-95186748-C-T?dataset=gnomad_r4 + self.assertAlmostEqual(va.gnomad_af, 0.00006753, 
places=6) + + def _test_extra_grch38(self): + # This is testing columns_version 2 + va = VariantAnnotation.objects.get(variant_id=24601) + self.assertAlmostEqual(va.metalr_rankscore, 0.80456) + self.assertAlmostEqual(va.revel_rankscore, 0.69527) + self.assertAlmostEqual(va.vest4_rankscore, 0.56662) + self.assertAlmostEqual(va.bayesdel_noaf_rankscore, 0.63287) + self.assertAlmostEqual(va.cadd_raw_rankscore, 0.41304) + self.assertAlmostEqual(va.clinpred_rankscore, 0.15198) + + va = VariantAnnotation.objects.get(variant_id=42) + self.assertEqual(va.aloft_high_confidence, True) + self.assertEqual(va.aloft_pred, ALoFTPrediction.RECESSIVE) + self.assertAlmostEqual(va.aloft_prob_dominant, 0.13585) + self.assertAlmostEqual(va.aloft_prob_recessive, 0.81255) + self.assertAlmostEqual(va.aloft_prob_tolerant, 0.0516) + + vta = VariantTranscriptAnnotation.objects.get(variant_id=42, hgvs_c='NM_199357.3:c.1417C>T') + self.assertTrue(vta.nmd_escaping_variant) + + + class TestVEP(TestCase): """ Random VEP annotation methods """ maxDiff = None diff --git a/annotation/tests/test_data/test_columns_version1_grch37.vep_annotated.vcf b/annotation/tests/test_data/test_columns_version1_grch37.vep_annotated.vcf new file mode 100644 index 000000000..26197612d --- /dev/null +++ b/annotation/tests/test_data/test_columns_version1_grch37.vep_annotated.vcf @@ -0,0 +1,92 @@ +##fileformat=VCFv4.1 +##INFO= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##VEP="v110" time="2023-12-01 15:10:26" cache="/data/annotation/VEP/vep_cache/homo_sapiens_refseq/110_GRCh37" ensembl-variation=110.d34d25e ensembl=110.9eadbc2 ensembl-funcgen=110.24e6da6 ensembl-io=110.b1a0d57 1000genomes="phase3" COSMIC="92" ClinVar="202012" HGMD-PUBLIC="20204" assembly="GRCh37.p13" dbSNP="154" 
gencode="GENCODE 19" genebuild="2011-04" gnomADe="r2.1" polyphen="2.2.2" refseq="2020-10-26 17:03:42 - GCF_000001405.25_GRCh37.p13_genomic.gff" regbuild="1.0" sift="sift5.2.2" +##INFO= +##Grantham=Grantham Matrix score - Grantham, R. Amino Acid Difference Formula to Help Explain Protein Evolution, Science 1974 Sep 6;185(4154):862-4. +##SpliceRegion=SpliceRegion predictions +##Mastermind_MMID3=Mastermind MMID3 variant identifier(s), as gene:key. Link to the Genomenon Mastermind Genomic Search Engine. +##Mastermind_counts=Mastermind number of citations in the medical literature. Output includes three unique counts: MMCNT1|MMCNT2|MMCNT3. MMCNT1 - Count of Mastermind articles with cDNA matches for this specific variant; MMCNT2 - Count of Mastermind articles with variants either explicitly matching at the cDNA level or given only at protein level; MMCNT3 - Count of Mastermind articles including other DNA-level variants resulting in the same amino acid change. +##MaxEntScan_alt=MaxEntScan alternate sequence score +##MaxEntScan_diff=MaxEntScan score difference +##MaxEntScan_ref=MaxEntScan reference sequence score +##CADD_phred=CADD_phred from dbNSFP file +##FATHMM_pred=FATHMM_pred from dbNSFP file +##GERP++_RS=GERP++_RS from dbNSFP file +##Interpro_domain=Interpro_domain from dbNSFP file +##MutationAssessor_pred=MutationAssessor_pred from dbNSFP file +##MutationTaster_pred=MutationTaster_pred from dbNSFP file +##Polyphen2_HVAR_pred=Polyphen2_HVAR_pred from dbNSFP file +##REVEL_score=REVEL_score from dbNSFP file +##ada_score=dbscSNV ADA score +##rf_score=dbscSNV RF score +##SpliceAI_pred_DP_AG=SpliceAI predicted effect on splicing. Delta position for acceptor gain +##SpliceAI_pred_DP_AL=SpliceAI predicted effect on splicing. Delta position for acceptor loss +##SpliceAI_pred_DP_DG=SpliceAI predicted effect on splicing. Delta position for donor gain +##SpliceAI_pred_DP_DL=SpliceAI predicted effect on splicing. 
Delta position for donor loss +##SpliceAI_pred_DS_AG=SpliceAI predicted effect on splicing. Delta score for acceptor gain +##SpliceAI_pred_DS_AL=SpliceAI predicted effect on splicing. Delta score for acceptor loss +##SpliceAI_pred_DS_DG=SpliceAI predicted effect on splicing. Delta score for donor gain +##SpliceAI_pred_DS_DL=SpliceAI predicted effect on splicing. Delta score for donor loss +##SpliceAI_pred_SYMBOL=SpliceAI gene symbol +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##VEP-command-line='vep --af --assembly GRCh37 --biotype --cache --canonical --check_existing --compress_output gzip --custom [PATH]/CosmicCodingMuts_v95_20211101_grch37.normal.vcf.gz,COSMIC,vcf,exact,0,CNT,LEGACY_ID --database 0 --dir [PATH]/vep_cache --distance 5000 --domains --exclude_predicted --fasta [PATH]/GCF_000001405.25_GRCh37.p13_genomic.fna.gz --flag_pick --force_overwrite --hgvs --input_file [PATH]/test_grch37.vcf --no_escape --no_stats --numbers --offline --output_file [PATH]/test_grch37.vep_annotated.vcf.gz --plugin [PATH]/spliceai_scores.raw.indel.hg19.vcf.gz --protein --pubmed --refseq --sift b --symbol --transcript_version --use_given_ref --variant_class --vcf' +#CHROM POS ID REF ALT QUAL FILTER INFO +1 69098 . C G . . variant_id=13629760;CSQ=G|missense_variant|MODERATE|OR4F5|79501|Transcript|NM_001005484.2|protein_coding|3/3||NM_001005484.2:c.71C>G|NP_001005484.2:p.Thr24Ser|131|71|24|T/S|aCt/aGt|||1||1|SNV|EntrezGene||YES|NP_001005484.2||||tolerated(1)||||||||58|||||||13.33|.&T|2.31|.&.|.&N|N|.&B|0.052|||-5|35|-5|17|0.00|0.15|0.27|0.08|OR4F5||||||||||||||||||||0.0020000000949949|0.966000020503998|0.894999980926514|1.23199999332428|||||||| +1 69589 . G A . . 
variant_id=13629761;CSQ=A|missense_variant|MODERATE|OR4F5|79501|Transcript|NM_001005484.2|protein_coding|3/3||NM_001005484.2:c.562G>A|NP_001005484.2:p.Val188Ile|622|562|188|V/I|Gtc/Atc|COSV58736794||1||1|SNV|EntrezGene||YES|NP_001005484.2||||tolerated(0.41)|||||1|1||29||OR4F5:V167I|0&1&1||||14.31|.&T|0.138|GPCR&_rhodopsin-like&_7TM&GPCR&_rhodopsin-like&_7TM|.&N|N|.&B|0.043|||29|-46|-12|-1|0.00|0.00|0.00|0.11|OR4F5||||||||||||||||||||0|0.814000010490417|-1.23099994659424|-0.108000002801418||||||COSV58736794|1|COSM6847255 +13 95839002 . C T . . variant_id=13629762;CSQ=T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001105515.3|protein_coding|11/21||NM_001105515.3:c.1498G>A|NP_001098985.1:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001098985.1||||tolerated(0.11)|||0.0004||0&1|0&1||56||ABCC4:E500K|0&1&1||||22.3|.&D&.&D&.&.|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|N&N&N&.&N&.|D&D&D&D|B&B&B&.&.&.|0.377|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001301829.2|protein_coding|11/30||NM_001301829.2:c.1498G>A|NP_001288758.1:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001288758.1||||tolerated(0.06)|||0.0004||0&1|0&1||56||ABCC4:E500K|0&1&1||||22.3|.&D&.&D&.&.|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|N&N&N&.&N&.|D&D&D&D|B&B&B&.&.&.|0.377|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC
4|rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001301830.2|protein_coding|10/20||NM_001301830.2:c.1273G>A|NP_001288759.1:p.Glu425Lys|1410|1273|425|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001288759.1||||tolerated(0.11)|||0.0004||0&1|0&1||56||ABCC4:E500K|0&1&1||||22.3|.&D&.&D&.&.|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|N&N&N&.&N&.|D&D&D&D|B&B&B&.&.&.|0.377|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_005845.5|protein_coding|11/31||NM_005845.5:c.1498G>A|NP_005836.2:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1||1|SNV|EntrezGene||YES|NP_005836.2||||tolerated(0.06)|||0.0004||0&1|0&1||56||ABCC4:E500K|0&1&1||||22.3|.&D&.&D&.&.|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|N&N&N&.&N&.|D&D&D&D|B&B&B&.&.&.|0.377|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224|1|COSM7286401 +15 32928050 . C T . . 
variant_id=13638004;CSQ=T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_001286479.3|protein_coding|11/12||NM_001286479.3:c.850C>T|NP_001273408.1:p.Arg284Ter|1398|850|284|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_001273408.1|||||||||0&1|0&1|||||||||34|.&.&.&.&.|2.5|.&.&.&.&.|.&.&.&.&.|A&A&A&D&D|.&.&.&.&.||||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_001286480.3|protein_coding|12/13||NM_001286480.3:c.850C>T|NP_001273409.1:p.Arg284Ter|1506|850|284|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_001273409.1|||||||||0&1|0&1|||||||||34|.&.&.&.&.|2.5|.&.&.&.&.|.&.&.&.&.|A&A&A&D&D|.&.&.&.&.||||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|intron_variant|MODIFIER|ARHGAP11A-SCG5|114118903|Transcript|NM_001368319.1|protein_coding||9/13|NM_001368319.1:c.1235+2741C>T|||||||rs776172390&COSV64380835||1|||SNV|EntrezGene||YES|NP_001355248.1|||||||||0&1|0&1||||||||||||||||||||||||||||rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_014783.6|protein_coding|11/12||NM_014783.6:c.1417C>T|NP_055598.1:p.Arg473Ter|2125|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1||1|SNV|EntrezGene||YES|NP_055598.1|||||||||0&1|0&1|||||
||||34|.&.&.&.&.|2.5|.&.&.&.&.|.&.&.&.&.|A&A&A&D&D|.&.&.&.&.||||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_199357.3|protein_coding|11/11||NM_199357.3:c.1417C>T|NP_955389.1:p.Arg473Ter|2125|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_955389.1|||||||||0&1|0&1|||||||||34|.&.&.&.&.|2.5|.&.&.&.&.|.&.&.&.&.|A&A&A&D&D|.&.&.&.&.||||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114 diff --git a/annotation/tests/test_data/test_grch38.vep_annotated.vcf b/annotation/tests/test_data/test_columns_version1_grch38.vep_annotated.vcf similarity index 100% rename from annotation/tests/test_data/test_grch38.vep_annotated.vcf rename to annotation/tests/test_data/test_columns_version1_grch38.vep_annotated.vcf diff --git a/annotation/tests/test_data/test_grch37.vep_annotated.vcf b/annotation/tests/test_data/test_grch37.vep_annotated.vcf deleted file mode 100644 index 59415bffe..000000000 --- a/annotation/tests/test_data/test_grch37.vep_annotated.vcf +++ /dev/null @@ -1,88 +0,0 @@ -##fileformat=VCFv4.1 -##INFO= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##VEP="v100" time="2021-02-26 10:31:47" 
cache="/media/dlawrence/SpinningIron/reference/VEP/vep_cache/homo_sapiens_refseq/100_GRCh37" ensembl-variation=100.b220ff4 ensembl=100.7e964b7 ensembl-io=100.f87ae4f ensembl-funcgen=100.f0c3948 1000genomes="phase3" COSMIC="90" ClinVar="201912" ESP="20141103" HGMD-PUBLIC="20194" assembly="GRCh37.p13" dbSNP="153" gencode="GENCODE 19" genebuild="2011-04" gnomAD="r2.1" polyphen="2.2.2" refseq="01_2015" regbuild="1.0" sift="sift5.2.2" -##INFO= -##Grantham=Grantham Matrix score - Grantham, R. Amino Acid Difference Formula to Help Explain Protein Evolution, Science 1974 Sep 6;185(4154):862-4. -##SpliceRegion=SpliceRegion predictions -##LoFtool=LoFtool score for gene -##Mastermind_MMID3=Mastermind MMID3 variant identifier(s), as gene:key, for MMCNT3. -##Mastermind_counts=Mastermind number of citations in the medical literature. Output includes three unique counts: MMCNT1|MMCNT2|MMCNT3. MMCNT1 - Count of Mastermind articles with cDNA matches for this specific variant; MMCNT2 - Count of Mastermind articles with variants either explicitly matching at the cDNA level or given only at protein level; MMCNT3 - Count of Mastermind articles including other DNA-level variants resulting in the same amino acid change. -##MaxEntScan_alt=MaxEntScan alternate sequence score -##MaxEntScan_diff=MaxEntScan score difference -##MaxEntScan_ref=MaxEntScan reference sequence score -##CADD_phred=(from dbNSFP) CADD phred-like score. This is phred-like rank score based on whole genome CADD raw scores. Please refer to Kircher et al. (2014) Nature Genetics 46(3):310-5 for details. The larger the score the more likely the SNP has damaging effect. Please note the following copyright statement for CADD: "CADD scores (http://cadd.gs.washington.edu/) are Copyright 2013 University of Washington and Hudson-Alpha Institute for Biotechnology (all rights reserved) but are freely available for all academic, non-commercial applications. 
For commercial licensing information contact Jennifer McCullar (mccullaj@uw.edu)." -##CADD_raw=(from dbNSFP) CADD raw score for functional prediction of a SNP. Please refer to Kircher et al. (2014) Nature Genetics 46(3):310-5 for details. The larger the score the more likely the SNP has damaging effect. Scores range from -6.458163 to 18.301497 in dbNSFP. Please note the following copyright statement for CADD: "CADD scores (http://cadd.gs.washington.edu/) are Copyright 2013 University of Washington and Hudson-Alpha Institute for Biotechnology (all rights reserved) but are freely available for all academic, non-commercial applications. For commercial licensing information contact Jennifer McCullar (mccullaj@uw.edu)." -##FATHMM_pred=(from dbNSFP) If a FATHMMori score is <=-1.5 (or rankscore >=0.81332) the corresponding nsSNV is predicted as "D(AMAGING)"; otherwise it is predicted as "T(OLERATED)". Multiple predictions separated by ";", corresponding to Ensembl_proteinid. -##GERP++_RS=(from dbNSFP) GERP++ RS score, the larger the score, the more conserved the site. Scores range from -12.3 to 6.17. -##Interpro_domain=(from dbNSFP) domain or conserved site on which the variant locates. Domain annotations come from Interpro database. The number in the brackets following a specific domain is the count of times Interpro assigns the variant position to that domain, typically coming from different predicting databases. Multiple entries separated by ";". -##MutationAssessor_pred=(from dbNSFP) MutationAssessor's functional impact of a variant - predicted functional, i.e. high ("H") or medium ("M"), or predicted non-functional, i.e. low ("L") or neutral ("N"). The MAori score cutoffs between "H" and "M", "M" and "L", and "L" and "N", are 3.5, 1.935 and 0.8, respectively. The rankscore cutoffs between "H" and "M", "M" and "L", and "L" and "N", are 0.9307, 0.52043 and 0.19675, respectively. 
-##MutationTaster_pred=(from dbNSFP) MutationTaster prediction, "A" ("disease_causing_automatic"), "D" ("disease_causing"), "N" ("polymorphism") or "P" ("polymorphism_automatic"). The score cutoff between "D" and "N" is 0.5 for MTnew and 0.31733 for the rankscore. -##Polyphen2_HVAR_pred=(from dbNSFP) Polyphen2 prediction based on HumVar, "D" ("probably damaging", HVAR score in [0.909,1] or rankscore in [0.65694,0.97581]), "P" ("possibly damaging", HVAR in [0.447,0.908] or rankscore in [0.47121,0.65622]) and "B" ("benign", HVAR score in [0,0.446] or rankscore in [0.01493,0.47076]). Score cutoff for binary classification is 0.5 for HVAR score or 0.48762 for rankscore, i.e. the prediction is "neutral" if the HVAR score is smaller than 0.5 (rankscore is smaller than 0.48762), and "deleterious" if the HVAR score is larger than 0.5 (rankscore is larger than 0.48762). Multiple entries are separated by ";", corresponding to Uniprot_acc. -##REVEL_score=(from dbNSFP) REVEL is an ensemble score based on 13 individual scores for predicting the pathogenicity of missense variants. Scores range from 0 to 1. The larger the score the more likely the SNP has damaging effect. "REVEL scores are freely available for non-commercial use. For other uses, please contact Weiva Sieh" (weiva.sieh@mssm.edu) -##ada_score=dbscSNV ADA score -##rf_score=dbscSNV RF score -##SpliceAI_pred_DP_AG=SpliceAI predicted effect on splicing. Delta position for acceptor gain -##SpliceAI_pred_DP_AL=SpliceAI predicted effect on splicing. Delta position for acceptor loss -##SpliceAI_pred_DP_DG=SpliceAI predicted effect on splicing. Delta position for donor gain -##SpliceAI_pred_DP_DL=SpliceAI predicted effect on splicing. Delta position for donor loss -##SpliceAI_pred_DS_AG=SpliceAI predicted effect on splicing. Delta score for acceptor gain -##SpliceAI_pred_DS_AL=SpliceAI predicted effect on splicing. Delta score for acceptor loss -##SpliceAI_pred_DS_DG=SpliceAI predicted effect on splicing. 
Delta score for donor gain -##SpliceAI_pred_DS_DL=SpliceAI predicted effect on splicing. Delta score for donor loss -##SpliceAI_pred_SYMBOL=SpliceAI gene symbol -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -#CHROM POS ID REF ALT QUAL FILTER INFO -1 69098 . C G . . variant_id=13629760;CSQ=G|missense_variant|MODERATE|OR4F5|79501|Transcript|NM_001005484.1|protein_coding|1/1||NM_001005484.1:c.8C>G|NP_001005484.1:p.Thr3Ser|8|8|3|T/S|aCt/aGt|||1||1|SNV|EntrezGene||YES|NP_001005484.1||||rseq_mrna_match|||tolerated(1)||||||||58||||||||13.33|1.049236|.&T|2.31|.&.|.&N|N|.&B|0.052|||-5|35|-5|17|0.00|0.15|0.27|0.08|OR4F5|||||||||||||||0.0020000000949949|0.966000020503998|0.894999980926514|1.23199999332428|||||||| -1 69589 . G A . . variant_id=13629761;CSQ=A|missense_variant|MODERATE|OR4F5|79501|Transcript|NM_001005484.1|protein_coding|1/1||NM_001005484.1:c.499G>A|NP_001005484.1:p.Val167Ile|499|499|167|V/I|Gtc/Atc|COSV58736794||1||1|SNV|EntrezGene||YES|NP_001005484.1||||rseq_mrna_match|||tolerated(0.44)|||||1|1||29|||OR4F5:V167I|0&1&1||||14.31|1.192033|.&T|0.138|GPCR&_rhodopsin-like&_7TM&GPCR&_rhodopsin-like&_7TM|.&N|N|.&B|0.043|||29|-46|-12|-1|0.00|0.00|0.00|0.11|OR4F5|||||||||||||||0|0.814000010490417|-1.23099994659424|-0.108000002801418||||||COSV58736794|1|COSM6847255 -13 95839002 . C T . . 
variant_id=13629762;CSQ=T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001105515.3|protein_coding|11/21||NM_001105515.3:c.1498G>A|NP_001098985.1:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106||-1|||SNV|EntrezGene|||NP_001098985.1||||rseq_mrna_match|||tolerated(0.11)|||0.0004|||||56||0.0441||||||22.3|2.646731|.&D&.&D&.&.|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|N&N&N&.&N&.|D&D&D&D|B&B&B&.&.&.|0.377|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|rs145886106|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|0|0|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224&COSV65320224&COSV65320224&COSV65320224&COSV65320224|1&1&1&1&1|COSM7286401&COSM7286401&COSM7286401&COSM7286401&COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001301829.2|protein_coding|11/30||NM_001301829.2:c.1498G>A|NP_001288758.1:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106||-1|||SNV|EntrezGene|||NP_001288758.1||||rseq_mrna_match|||tolerated(0.06)|||0.0004|||||56||0.0441||||||22.3|2.646731|.&D&.&D&.&.|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|N&N&N&.&N&.|D&D&D&D|B&B&B&.&.&.|0.377|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|rs145886106|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|0|0|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224&COSV65320224&COSV65320224&COSV65320224&COSV65320224|1&1&1&1&1|COSM7286401&COSM7286401&COSM7286401&COSM7286401&COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001301830.2|protein_coding|10/20||NM_001301830.2:c.127
3G>A|NP_001288759.1:p.Glu425Lys|1410|1273|425|E/K|Gaa/Aaa|rs145886106||-1|||SNV|EntrezGene|||NP_001288759.1||||rseq_mrna_match|||tolerated(0.11)|||0.0004|||||56||0.0441||||||22.3|2.646731|.&D&.&D&.&.|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|N&N&N&.&N&.|D&D&D&D|B&B&B&.&.&.|0.377|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|rs145886106|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|0|0|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224&COSV65320224&COSV65320224&COSV65320224&COSV65320224|1&1&1&1&1|COSM7286401&COSM7286401&COSM7286401&COSM7286401&COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_005845.4|protein_coding|11/31||NM_005845.4:c.1498G>A|NP_005836.2:p.Glu500Lys|1630|1498|500|E/K|Gaa/Aaa|rs145886106||-1||1|SNV|EntrezGene||YES|NP_005836.2||||rseq_mrna_nonmatch&rseq_3p_mismatch|||tolerated(0.06)|||0.0004|||||56||0.0441||||||22.3|2.646731|.&D&.&D&.&.|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|N&N&N&.&N&.|D&D&D&D|B&B&B&.&.&.|0.377|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|rs145886106|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|0|0|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224&COSV65320224&COSV65320224&COSV65320224&COSV65320224|1&1&1&1&1|COSM7286401&COSM7286401&COSM7286401&COSM7286401&COSM7286401 -15 32928050 . C T . . 
variant_id=13638004;CSQ=T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_001286479.2|protein_coding|11/12||NM_001286479.2:c.850C>T|NP_001273408.1:p.Arg284Ter|1398|850|284|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_001273408.1||||rseq_mrna_match||||||||0&1|0&1||||0.426||||||34|4.790614|.&.&.&.&.|2.5|.&.&.&.&.|.&.&.&.&.|A&A&A&D&D|.&.&.&.&.||||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_001286480.3|protein_coding|12/13||NM_001286480.3:c.850C>T|NP_001273409.1:p.Arg284Ter|1506|850|284|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_001273409.1||||rseq_mrna_match||||||||0&1|0&1||||0.426||||||34|4.790614|.&.&.&.&.|2.5|.&.&.&.&.|.&.&.&.&.|A&A&A&D&D|.&.&.&.&.||||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114,T|intron_variant|MODIFIER|ARHGAP11A-SCG5|114118903|Transcript|NM_001368319.1|protein_coding||9/13|NM_001368319.1:c.1235+2741C>T|||||||rs776172390&COSV64380835||1|||SNV|EntrezGene||YES|NP_001355248.1||||rseq_mrna_match||||||||0&1|0&1|||||||||||||||||||||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.
4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_014783.6|protein_coding|11/12||NM_014783.6:c.1417C>T|NP_055598.1:p.Arg473Ter|2125|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1||1|SNV|EntrezGene||YES|NP_055598.1||||rseq_mrna_match||||||||0&1|0&1||||0.426||||||34|4.790614|.&.&.&.&.|2.5|.&.&.&.&.|.&.&.&.&.|A&A&A&D&D|.&.&.&.&.||||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_199357.2|protein_coding|11/11||NM_199357.2:c.1417C>T|NP_955389.1:p.Arg473Ter|2153|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_955389.1||||rseq_mrna_match||||||||0&1|0&1||||0.426||||||34|4.790614|.&.&.&.&.|2.5|.&.&.&.&.|.&.&.&.&.|A&A&A&D&D|.&.&.&.&.||||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114 diff --git a/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py b/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py index 446f10646..6845acb88 100644 --- a/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py +++ b/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py @@ -188,13 +188,16 @@ def _add_vep_field_handlers(self): "topmed_af": format_pick_highest_float, "variant_class": 
get_choice_formatter_func(VariantClass.choices), } - if self.genome_build == GenomeBuild.grch38(): + + vc = VEPConfig(self.genome_build) + # gnomad3 wasn't combined using gnomad_data.py so just uses FILTER + # while combined exome/genomes use "gnomad_filtered=1" (which should auto-convert bool) + if self.genome_build == GenomeBuild.grch38() and vc.columns_version <= 2: self.field_formatters["gnomad_filtered"] = gnomad_filtered_func self.source_field_to_columns = defaultdict(set) self.ignored_vep_fields = self.VEP_NOT_COPIED_FIELDS.copy() - vc = VEPConfig(self.genome_build) cvf_filters = [ColumnVEPField.get_columns_version_q(vc.columns_version)] if self.annotation_run.pipeline_type == VariantAnnotationPipelineType.CNV: cvf_filters.extend([ @@ -582,7 +585,10 @@ def empty_to_none(it): # Field formatters def gnomad_filtered_func(raw_value): - """ We use FILTER in Gnomad3 (GRCh38 only) - need to convert back to bool """ + """ We use FILTER in Gnomad3 (GRCh38 only) - need to convert back to bool + In the combined exomes/genomes (gnomad2, gnomad4) we use gnomad_filtered=1 + So don't need to format this etc + """ return raw_value not in (None, "PASS") From 68459da14a1650ff07759fbc8dbc4284f0b3422f Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Sun, 3 Dec 2023 20:58:24 +1030 Subject: [PATCH 15/29] update to latest cdot data --- annotation/annotation_data/cdot_update.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annotation/annotation_data/cdot_update.sh b/annotation/annotation_data/cdot_update.sh index 5ea4563f4..ab3ce9485 100755 --- a/annotation/annotation_data/cdot_update.sh +++ b/annotation/annotation_data/cdot_update.sh @@ -1,6 +1,6 @@ #!/bin/bash -CDOT_VERSION=0.2.21 +CDOT_VERSION=0.2.22 THIS_DIR=$(realpath "$(dirname "${BASH_SOURCE[0]}")") VG_DIR=${THIS_DIR}/../.. 
DOWNLOAD_DIR=/tmp From dc33997ac3e1938d9927941f3c1b5239ef70a9b7 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Sun, 3 Dec 2023 21:47:09 +1030 Subject: [PATCH 16/29] cdot v0.2.22 (data) --- annotation/annotation_data/cdot_update.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/annotation/annotation_data/cdot_update.sh b/annotation/annotation_data/cdot_update.sh index ab3ce9485..8cccb389c 100755 --- a/annotation/annotation_data/cdot_update.sh +++ b/annotation/annotation_data/cdot_update.sh @@ -1,5 +1,7 @@ #!/bin/bash +set -e + CDOT_VERSION=0.2.22 THIS_DIR=$(realpath "$(dirname "${BASH_SOURCE[0]}")") VG_DIR=${THIS_DIR}/../.. @@ -9,10 +11,10 @@ echo "Downloading data in ${DOWNLOAD_DIR}" cd ${DOWNLOAD_DIR} wget \ - https://github.com/SACGF/cdot/releases/download/v${CDOT_VERSION}/cdot-${CDOT_VERSION}.ensembl.grch37.json.gz \ - https://github.com/SACGF/cdot/releases/download/v${CDOT_VERSION}/cdot-${CDOT_VERSION}.ensembl.grch38.json.gz \ - https://github.com/SACGF/cdot/releases/download/v${CDOT_VERSION}/cdot-${CDOT_VERSION}.refseq.grch37.json.gz \ - https://github.com/SACGF/cdot/releases/download/v${CDOT_VERSION}/cdot-${CDOT_VERSION}.refseq.grch38.json.gz + https://github.com/SACGF/cdot/releases/download/data_v${CDOT_VERSION}/cdot-${CDOT_VERSION}.ensembl.grch37.json.gz \ + https://github.com/SACGF/cdot/releases/download/data_v${CDOT_VERSION}/cdot-${CDOT_VERSION}.ensembl.grch38.json.gz \ + https://github.com/SACGF/cdot/releases/download/data_v${CDOT_VERSION}/cdot-${CDOT_VERSION}.refseq.grch37.json.gz \ + https://github.com/SACGF/cdot/releases/download/data_v${CDOT_VERSION}/cdot-${CDOT_VERSION}.refseq.grch38.json.gz cd ${VG_DIR} From 21450197166f7b6b09fb61d830b904d81d12471c Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Mon, 4 Dec 2023 15:38:37 +1030 Subject: [PATCH 17/29] #850 - Upgrade dbnsfp to 4.5 (and include Alpha Missense) --- .../dbnsfp_grch37_strip.sh | 32 ++++++++++-------- .../dbnsfp_grch38_strip.sh | 33 ++++++++++--------- 2 
files changed, 35 insertions(+), 30 deletions(-) diff --git a/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh b/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh index 076cd4042..4a02d506c 100755 --- a/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh +++ b/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh @@ -2,33 +2,37 @@ set -e -# All of this python is just to get the columns used in cut and tabix args +# Download 4.5 from https://sites.google.com/site/jpopgen/dbNSFP + +# https://m.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#dbnsfp +# zcat dbNSFP4.5a_variant.chr1.gz | head -n1 > header.txt +# mkdir /tmp/dbsnp37 +# zgrep -h -v ^#chr dbNSFP4.5a_variant.chr* | awk '$8 != "." ' | sort -T /tmp/dbsnp37 -k8,8 -k9,9n - | cat header.txt - | bgzip -c > dbNSFP4.5a_grch37.gz +# tabix -s 8 -b 9 -e 9 dbNSFP4.5a.grch37.gz + + +# All of this python is just to get the columns used in cut and tabix args at bottom of this file # Get dbNSFP fields used by VariantGrid - run python3 manage.py shell # In [12]: ",".join(ColumnVEPField.get_source_fields(vep_plugin='d')) # Get column names from dbNSFP data file -# df = pd.read_csv("./dbNSFP4.3a.grch37.gz", sep='\t', index_col=None, nrows=0) -# vep_fields = 'GERP++_RS,Interpro_domain,CADD_raw_rankscore,REVEL_rankscore,BayesDel_noAF_rankscore,ClinPred_rankscore,VEST4_rankscore,MetaLR_rankscore,Aloft_prob_Tolerant,Aloft_prob_Recessive,Aloft_prob_Dominant,Aloft_pred,Aloft_Confidence' +# import pandas as pd +# df = pd.read_csv("header.txt", sep='\t', index_col=None, nrows=0) +# vep_fields = 'GERP++_RS,Interpro_domain,CADD_raw_rankscore,REVEL_rankscore,BayesDel_noAF_rankscore,ClinPred_rankscore,VEST4_rankscore,MetaLR_rankscore,Aloft_prob_Tolerant,Aloft_prob_Recessive,Aloft_prob_Dominant,Aloft_pred,Aloft_Confidence,AlphaMissense_rankscore,AlphaMissense_pred' # columns = ['ref', 'alt', 'aaref', 'aaalt', 'hg19_chr', 'hg19_pos(1-based)', 
'Ensembl_transcriptid'] + vep_fields.split(",") # cols = [] # for i in columns: # cols.append(list(df.columns).index(i) + 1) -# ",".join([str(c) for c in sorted(cols)]) -# columns are: '3,4,5,6,8,9,15,69,74,84,104,107,113,114,115,116,117,119,156,640' - +# print(",".join([str(c) for c in sorted(cols)])) +# columns are: '3,4,5,6,8,9,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705' -# Download 4.3 from https://sites.google.com/site/jpopgen/dbNSFP -# https://m.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#dbnsfp -# zcat dbNSFP4.3a_variant.chr1.gz | head -n1 > h -# zgrep -h -v ^#chr dbNSFP4.3a_variant.chr* | awk '$8 != "." ' | sort -T /path/to/tmp_folder -k8,8 -k9,9n - | cat h - | bgzip -c > dbNSFP4.3a_grch37.gz -# tabix -s 8 -b 9 -e 9 dbNSFP4.3a.grch37.gz -IN_FILE=dbNSFP4.3a.grch37.gz -OUT_FILE=dbNSFP4.3a.grch37.stripped.gz +IN_FILE=dbNSFP4.5a.grch37.gz +OUT_FILE=dbNSFP4.5a.grch37.stripped.gz # Header needs to start with # -(echo -n "#" ; zcat ${IN_FILE} | cut -f 3,4,5,6,8,9,15,69,74,84,104,107,113,114,115,116,117,119,156,640 ) | bgzip > ${OUT_FILE} +(echo -n "#" ; zcat ${IN_FILE} | cut -f 3,4,5,6,8,9,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705 ) | bgzip > ${OUT_FILE} tabix -s 5 -b 6 -e 6 ${OUT_FILE} # cols are: 1=ref, 2=alt, 3=chr, 4=pos diff --git a/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh b/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh index a2666fc7c..1fcd065b4 100755 --- a/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh +++ b/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh @@ -2,34 +2,35 @@ set -e +# Download 4.5 from https://sites.google.com/site/jpopgen/dbNSFP + +# https://m.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#dbnsfp + +# zcat dbNSFP4.5a_variant.chr1.gz | head -n1 > header.txt +# mkdir /tmp/dbsnp38 +# zgrep -h -v ^#chr dbNSFP4.5a_variant.chr* | sort -T /tmp/dbsnp38 -k1,1 -k2,2n - | cat header.txt - | bgzip -c 
> dbNSFP4.5a_grch38.gz +# tabix -s 1 -b 2 -e 2 dbNSFP4.5a_grch38.gz + + # All of this python is just to get the columns used in cut and tabix args # Get dbNSFP fields used by VariantGrid - run python3 manage.py shell # In [12]: ",".join(ColumnVEPField.get_source_fields(vep_plugin='d')) # Get column names from dbNSFP data file -# df = pd.read_csv("./dbNSFP4.3a.grch38.gz", sep='\t', index_col=None, nrows=0) -# vep_fields = 'GERP++_RS,Interpro_domain,CADD_raw_rankscore,REVEL_rankscore,BayesDel_noAF_rankscore,ClinPred_rankscore,VEST4_rankscore,MetaLR_rankscore,Aloft_prob_Tolerant,Aloft_prob_Recessive,Aloft_prob_Dominant,Aloft_pred,Aloft_Confidence' +# df = pd.read_csv("header.txt", sep='\t', index_col=None, nrows=0) +# vep_fields = 'GERP++_RS,Interpro_domain,CADD_raw_rankscore,REVEL_rankscore,BayesDel_noAF_rankscore,ClinPred_rankscore,VEST4_rankscore,MetaLR_rankscore,Aloft_prob_Tolerant,Aloft_prob_Recessive,Aloft_prob_Dominant,Aloft_pred,Aloft_Confidence,AlphaMissense_rankscore,AlphaMissense_pred' # columns = ['#chr', 'pos(1-based)', 'ref', 'alt', 'aaref', 'aaalt', 'Ensembl_transcriptid'] + vep_fields.split(",") # cols = [] # for i in columns: # cols.append(list(df.columns).index(i) + 1) -# ",".join([str(c) for c in sorted(cols)]) -# columns are: '1,2,3,4,5,6,15,69,74,84,104,107,113,114,115,116,117,119,156,640' - -# Download 4.3 from https://sites.google.com/site/jpopgen/dbNSFP - -# https://m.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#dbnsfp - -# zcat dbNSFP4.3a_variant.chr1.gz | head -n1 > h -# zgrep -h -v ^#chr dbNSFP4.3a_variant.chr* | sort -T /path/to/tmp_folder -k1,1 -k2,2n - | cat h - | bgzip -c > dbNSFP4.3a_grch38.gz -# tabix -s 1 -b 2 -e 2 dbNSFP4.3a_grch38.gz - +# print(",".join([str(c) for c in sorted(cols)])) +# columns are: '1,2,3,4,5,6,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705' -IN_FILE=dbNSFP4.3a.grch38.gz -OUT_FILE=dbNSFP4.3a.grch38.stripped.gz +IN_FILE=dbNSFP4.5a.grch38.gz +OUT_FILE=dbNSFP4.5a.grch38.stripped.gz # Header 
needs to start with # -(echo -n "#" ; zcat ${IN_FILE} | cut -f 1,2,3,4,5,6,15,69,74,84,104,107,113,114,115,116,117,119,156,640 ) | bgzip > ${OUT_FILE} +(echo -n "#" ; zcat ${IN_FILE} | cut -f 1,2,3,4,5,6,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705 ) | bgzip > ${OUT_FILE} tabix -s 1 -b 2 -e 2 ${OUT_FILE} # cols are: 1=chr, 2=pos From 56dab0d54b85448c88e5517a4eb4460bcd93b593 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Tue, 5 Dec 2023 12:13:37 +1030 Subject: [PATCH 18/29] #850 - do per-chrom to make it faster --- .../generate_annotation/dbnsfp_grch37_strip.sh | 15 +++++++++------ .../generate_annotation/dbnsfp_grch38_strip.sh | 13 +++++++++---- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh b/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh index 4a02d506c..966afd597 100755 --- a/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh +++ b/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh @@ -27,12 +27,15 @@ set -e # print(",".join([str(c) for c in sorted(cols)])) # columns are: '3,4,5,6,8,9,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705' - - -IN_FILE=dbNSFP4.5a.grch37.gz +CUT_COLUMNS="3,4,5,6,8,9,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705" OUT_FILE=dbNSFP4.5a.grch37.stripped.gz +TMP_DIR=/tmp # /hpcfs/groups/phoenix-hpc-sacgf/scratch/dbnsfp_GRCh37 +mkdir -p ${TMP_DIR} -# Header needs to start with # -(echo -n "#" ; zcat ${IN_FILE} | cut -f 3,4,5,6,8,9,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705 ) | bgzip > ${OUT_FILE} -tabix -s 5 -b 6 -e 6 ${OUT_FILE} # cols are: 1=ref, 2=alt, 3=chr, 4=pos +# Sort chromosomes individually as that's much more efficient +cat header.txt | cut -f ${CUT_COLUMNS} | bgzip > ${OUT_FILE} +for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do + zgrep -h -v ^#chr dbNSFP4.5a_variant.chr${chrom}.gz | awk '$8 != "." 
' | cut -f ${CUT_COLUMNS} | sort -T ${TMP_DIR} -k8,8 -k9,9n - | bgzip >> ${OUT_FILE} +done +tabix -s 5 -b 6 -e 6 ${OUT_FILE} # cols are: 1=ref, 2=alt, 3=chr, 4=pos diff --git a/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh b/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh index 1fcd065b4..262b2ac79 100755 --- a/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh +++ b/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh @@ -27,10 +27,15 @@ set -e # print(",".join([str(c) for c in sorted(cols)])) # columns are: '1,2,3,4,5,6,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705' -IN_FILE=dbNSFP4.5a.grch38.gz +CUT_COLUMNS="1,2,3,4,5,6,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705" OUT_FILE=dbNSFP4.5a.grch38.stripped.gz +TMP_DIR=/tmp # /hpcfs/groups/phoenix-hpc-sacgf/scratch/dbnsfp4.5_GRCh38 +mkdir -p ${TMP_DIR} -# Header needs to start with # -(echo -n "#" ; zcat ${IN_FILE} | cut -f 1,2,3,4,5,6,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705 ) | bgzip > ${OUT_FILE} -tabix -s 1 -b 2 -e 2 ${OUT_FILE} # cols are: 1=chr, 2=pos +# Sort chromosomes individually as that's much more efficient +cat header.txt | cut -f ${CUT_COLUMNS} | bgzip > ${OUT_FILE} +for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do + zgrep -h -v ^#chr dbNSFP4.5a_variant.chr${chrom}.gz | cut -f ${CUT_COLUMNS} | sort -T ${TMP_DIR} -k1,1 -k2,2n - >> ${OUT_FILE} +done +tabix -s 1 -b 2 -e 2 ${OUT_FILE} # cols are: 1=chr, 2=pos From 9339bcb01c010294740e20f9ac2dbb63c5bb846f Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Tue, 5 Dec 2023 14:25:40 +1030 Subject: [PATCH 19/29] #850 - Need to bgzip data --- .../annotation_data/generate_annotation/dbnsfp_grch38_strip.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh b/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh 
index 262b2ac79..0a9199067 100755 --- a/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh +++ b/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh @@ -35,7 +35,7 @@ mkdir -p ${TMP_DIR} # Sort chromosomes individually as that's much more efficient cat header.txt | cut -f ${CUT_COLUMNS} | bgzip > ${OUT_FILE} for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do - zgrep -h -v ^#chr dbNSFP4.5a_variant.chr${chrom}.gz | cut -f ${CUT_COLUMNS} | sort -T ${TMP_DIR} -k1,1 -k2,2n - >> ${OUT_FILE} + zgrep -h -v ^#chr dbNSFP4.5a_variant.chr${chrom}.gz | cut -f ${CUT_COLUMNS} | sort -T ${TMP_DIR} -k1,1 -k2,2n - | bgzip >> ${OUT_FILE} done tabix -s 1 -b 2 -e 2 ${OUT_FILE} # cols are: 1=chr, 2=pos From 101c84584a2bd6846ef49d6c37ffd8a8c108c067 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Tue, 5 Dec 2023 15:53:53 +1030 Subject: [PATCH 20/29] #850 - Do bgzip afterwards, 37 need to shift sort out as columns were cut first --- .../generate_annotation/dbnsfp_grch37_strip.sh | 12 ++++++++---- .../generate_annotation/dbnsfp_grch38_strip.sh | 9 +++++---- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh b/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh index 966afd597..de6851765 100755 --- a/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh +++ b/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh @@ -28,14 +28,18 @@ set -e # columns are: '3,4,5,6,8,9,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705' CUT_COLUMNS="3,4,5,6,8,9,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705" -OUT_FILE=dbNSFP4.5a.grch37.stripped.gz +SEQ_COL=5 # hg19_chr +POS_COL=6 # hg19_pos(1-based) +OUT_FILE=dbNSFP4.5a.grch37.stripped TMP_DIR=/tmp # /hpcfs/groups/phoenix-hpc-sacgf/scratch/dbnsfp_GRCh37 mkdir -p ${TMP_DIR} # Sort chromosomes individually as that's much more efficient -cat 
header.txt | cut -f ${CUT_COLUMNS} | bgzip > ${OUT_FILE} +cat header.txt | cut -f ${CUT_COLUMNS} > ${OUT_FILE} for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do - zgrep -h -v ^#chr dbNSFP4.5a_variant.chr${chrom}.gz | awk '$8 != "." ' | cut -f ${CUT_COLUMNS} | sort -T ${TMP_DIR} -k8,8 -k9,9n - | bgzip >> ${OUT_FILE} + zgrep -h -v ^#chr dbNSFP4.5a_variant.chr${chrom}.gz | awk '$8 != "." ' | cut -f ${CUT_COLUMNS} | sort -T ${TMP_DIR} -k${SEQ_COL},${SEQ_COL} -k${POS_COL},${POS_COL}n - >> ${OUT_FILE} done -tabix -s 5 -b 6 -e 6 ${OUT_FILE} # cols are: 1=ref, 2=alt, 3=chr, 4=pos +bgzip ${OUT_FILE} +tabix -s ${SEQ_COL} -b ${POS_COL} -e ${POS_COL} ${OUT_FILE}.gz + diff --git a/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh b/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh index 0a9199067..af197a4d2 100755 --- a/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh +++ b/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh @@ -28,14 +28,15 @@ set -e # columns are: '1,2,3,4,5,6,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705' CUT_COLUMNS="1,2,3,4,5,6,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705" -OUT_FILE=dbNSFP4.5a.grch38.stripped.gz +OUT_FILE=dbNSFP4.5a.grch38.stripped TMP_DIR=/tmp # /hpcfs/groups/phoenix-hpc-sacgf/scratch/dbnsfp4.5_GRCh38 mkdir -p ${TMP_DIR} # Sort chromosomes individually as that's much more efficient -cat header.txt | cut -f ${CUT_COLUMNS} | bgzip > ${OUT_FILE} +cat header.txt | cut -f ${CUT_COLUMNS} > ${OUT_FILE} for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do - zgrep -h -v ^#chr dbNSFP4.5a_variant.chr${chrom}.gz | cut -f ${CUT_COLUMNS} | sort -T ${TMP_DIR} -k1,1 -k2,2n - | bgzip >> ${OUT_FILE} + zgrep -h -v ^#chr dbNSFP4.5a_variant.chr${chrom}.gz | cut -f ${CUT_COLUMNS} | sort -T ${TMP_DIR} -k1,1 -k2,2n - >> ${OUT_FILE} done -tabix -s 1 -b 2 -e 2 ${OUT_FILE} # cols are: 1=chr, 2=pos +bgzip 
${OUT_FILE} +tabix -s 1 -b 2 -e 2 ${OUT_FILE}.gz # cols are: 1=chr, 2=pos From 31999dd033a1dfd528b5b5fb4b2e954044d22007 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Tue, 5 Dec 2023 16:46:38 +1030 Subject: [PATCH 21/29] Move alpha missense to dbNSFP --- .../0083_one_off_move_alphamissense_dbnsfp.py | 31 +++++++++++++++ ...tation_alphamissense_rankscore_and_more.py | 35 +++++++++++++++++ annotation/models/damage_enums.py | 16 ++------ annotation/models/models.py | 11 +++--- annotation/models/models_enums.py | 1 - .../0108_one_off_move_alphamissense_dbnsfp.py | 39 +++++++++++++++++++ 6 files changed, 114 insertions(+), 19 deletions(-) create mode 100644 annotation/migrations/0083_one_off_move_alphamissense_dbnsfp.py create mode 100644 annotation/migrations/0084_rename_alphamissense_pathogenicity_variantannotation_alphamissense_rankscore_and_more.py create mode 100644 snpdb/migrations/0108_one_off_move_alphamissense_dbnsfp.py diff --git a/annotation/migrations/0083_one_off_move_alphamissense_dbnsfp.py b/annotation/migrations/0083_one_off_move_alphamissense_dbnsfp.py new file mode 100644 index 000000000..03dad05f9 --- /dev/null +++ b/annotation/migrations/0083_one_off_move_alphamissense_dbnsfp.py @@ -0,0 +1,31 @@ +# Generated by Django 4.2.2 on 2023-12-05 05:51 + +from django.db import migrations + +def _one_off_move_alphamissense_dbnsfp(apps, _schema_editor): + PATHOGENICITY_PREDICTIONS = 'P' + VEP_PLUGIN_DBNSFP = 'd' + + + ColumnVEPField = apps.get_model("annotation", "ColumnVEPField") + # The old alphamissense VEPFields cascade deleted from 0108_one_off_move_alphamissense_dbnsfp + + data = { + 'column': 'alphamissense_pathogenicity', 'min_vep_columns_version': 3, + 'variant_grid_column_id': 'alphamissense_rankscore', + 'vep_plugin': VEP_PLUGIN_DBNSFP, + 'source_field': 'AlphaMissense_rankscore', 'category': PATHOGENICITY_PREDICTIONS + } + ColumnVEPField.objects.create(**data) + + +class Migration(migrations.Migration): + + dependencies = [ + ('annotation', 
'0082_new_vep_110_columns_v3'), + ("snpdb", "0108_one_off_move_alphamissense_dbnsfp"), + ] + + operations = [ + migrations.RunPython(_one_off_move_alphamissense_dbnsfp) + ] diff --git a/annotation/migrations/0084_rename_alphamissense_pathogenicity_variantannotation_alphamissense_rankscore_and_more.py b/annotation/migrations/0084_rename_alphamissense_pathogenicity_variantannotation_alphamissense_rankscore_and_more.py new file mode 100644 index 000000000..6c30cca77 --- /dev/null +++ b/annotation/migrations/0084_rename_alphamissense_pathogenicity_variantannotation_alphamissense_rankscore_and_more.py @@ -0,0 +1,35 @@ +# Generated by Django 4.2.2 on 2023-12-05 06:16 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('annotation', '0083_one_off_move_alphamissense_dbnsfp'), + ] + + operations = [ + migrations.RenameField( + model_name='variantannotation', + old_name='alphamissense_pathogenicity', + new_name='alphamissense_rankscore', + ), + migrations.RemoveField( + model_name='variantannotation', + name='alphamissense_class', + ), + migrations.RemoveField( + model_name='varianttranscriptannotation', + name='alphamissense_class', + ), + migrations.RemoveField( + model_name='varianttranscriptannotation', + name='alphamissense_pathogenicity', + ), + migrations.AlterField( + model_name='columnvepfield', + name='vep_plugin', + field=models.CharField(choices=[('d', 'dbNSFP'), ('v', 'dbscSNV'), ('g', 'Grantham'), ('l', 'LoFtool'), ('n', 'Mastermind'), ('V', 'MaveDb'), ('m', 'MaxEntScan'), ('N', 'NMD'), ('a', 'SpliceAI'), ('s', 'SpliceRegion'), ('o', 'StructuralVariantOverlap')], max_length=1, null=True), + ), + ] diff --git a/annotation/models/damage_enums.py b/annotation/models/damage_enums.py index 16c4aaa1e..fdc588643 100644 --- a/annotation/models/damage_enums.py +++ b/annotation/models/damage_enums.py @@ -156,16 +156,8 @@ class ALoFTPrediction(models.TextChoices): DOMINANT = "d", "Dominant" -class 
AlphaMissensePrediction(AbstractPathogenicity): +class AlphaMissensePrediction(models.TextChoices): """ @see https://asia.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#alphamissense """ - LIKELY_BENIGN = 'b' - AMBIGUOUS = "a" - LIKELY_PATHOGENIC = "p" - - CHOICES = [ - (LIKELY_BENIGN, 'likely_benign'), - (AMBIGUOUS, 'ambiguous'), - (LIKELY_PATHOGENIC, 'likely_pathogenic'), - ] - MINIMUM_FLAG_DAMAGE_LEVEL = LIKELY_PATHOGENIC - VARIANT_PATH = "variantannotation__alphamissense_class" + LIKELY_BENIGN = 'b', 'likely_benign' + AMBIGUOUS = "a", 'ambiguous' + LIKELY_PATHOGENIC = "p", 'likely_pathogenic' diff --git a/annotation/models/models.py b/annotation/models/models.py index b99d3fa45..79cea1783 100644 --- a/annotation/models/models.py +++ b/annotation/models/models.py @@ -557,10 +557,10 @@ def get_pathogenic_prediction_funcs(self) -> Dict[str, Callable]: pathogenic_rankscore = settings.ANNOTATION_MIN_PATHOGENIC_RANKSCORE pathogenic_prediction_columns = ['bayesdel_noaf_rankscore', 'cadd_raw_rankscore', 'clinpred_rankscore', 'revel_rankscore', 'metalr_rankscore', 'vest4_rankscore'] - pp_funcs = {c: lambda d: float(d) >= pathogenic_rankscore for c in pathogenic_prediction_columns} if self.columns_version == 3: - pp_funcs["alphamissense_class"] = lambda d: d in AlphaMissensePrediction.get_damage_or_greater_levels() - return pp_funcs + pathogenic_prediction_columns.append("alphamissense_rankscore") + + return {c: lambda d: float(d) >= pathogenic_rankscore for c in pathogenic_prediction_columns} raise ValueError(f"Don't know fields for {self.columns_version=}") @@ -842,9 +842,6 @@ class AbstractVariantAnnotation(models.Model): splice_region = models.TextField(null=True, blank=True) symbol = models.TextField(null=True, blank=True) - alphamissense_class = models.CharField(max_length=1, choices=AlphaMissensePrediction.CHOICES, null=True, blank=True) - alphamissense_pathogenicity = models.FloatField(null=True, blank=True) - mavedb_score = models.FloatField(null=True, 
blank=True) mavedb_urn = models.TextField(null=True, blank=True) @@ -955,6 +952,8 @@ class VariantAnnotation(AbstractVariantAnnotation): clinpred_rankscore = models.FloatField(null=True, blank=True) vest4_rankscore = models.FloatField(null=True, blank=True) metalr_rankscore = models.FloatField(null=True, blank=True) + alphamissense_rankscore = models.FloatField(null=True, blank=True) + # ALoFT (from dbNSFP) aloft_prob_tolerant = models.FloatField(null=True, blank=True) aloft_prob_recessive = models.FloatField(null=True, blank=True) diff --git a/annotation/models/models_enums.py b/annotation/models/models_enums.py index bda376d97..92a24ee0c 100644 --- a/annotation/models/models_enums.py +++ b/annotation/models/models_enums.py @@ -125,7 +125,6 @@ class ColumnAnnotationCategory(models.TextChoices): class VEPPlugin(models.TextChoices): - ALPHAMISSENSE = 'A', 'AlphaMissense' DBNSFP = 'd', 'dbNSFP' DBSCSNV = 'v', 'dbscSNV' GRANTHAM = 'g', 'Grantham' diff --git a/snpdb/migrations/0108_one_off_move_alphamissense_dbnsfp.py b/snpdb/migrations/0108_one_off_move_alphamissense_dbnsfp.py new file mode 100644 index 000000000..21f4c7837 --- /dev/null +++ b/snpdb/migrations/0108_one_off_move_alphamissense_dbnsfp.py @@ -0,0 +1,39 @@ +# Generated by Django 4.2.2 on 2023-12-05 05:55 + +from django.db import migrations + + +def _one_off_move_alphamissense_dbnsfp(apps, _schema_editor): + # Getting rid of alphamissense_pathogenicity to replace with alphamissense_rankscore + + TRANSCRIPT_LEVEL = 'T' + + VariantGridColumn = apps.get_model("snpdb", "VariantGridColumn") + ColumnVCFInfo = apps.get_model("snpdb", "ColumnVCFInfo") + + VariantGridColumn.objects.filter(pk__in=['alphamissense_pathogenicity', 'alphamissense_class']).delete() + + alphamissense_rankscore = VariantGridColumn.objects.create(grid_column_name='alphamissense_rankscore', + variant_column='variantannotation__alphamissense_rankscore', + annotation_level=TRANSCRIPT_LEVEL, + width=None, + label='AlphaMissense RankScore', + 
model_field=True, + queryset_field=True) + + ColumnVCFInfo.objects.create(info_id='ALPHAMISSENSE_rankscore', + column=alphamissense_rankscore, + number=1, + type='F', + description='AlphaMissense pathogenicity rank score') + + +class Migration(migrations.Migration): + dependencies = [ + ('annotation', '0082_new_vep_110_columns_v3'), + ('snpdb', '0107_new_vep_110_columns_v3'), + ] + + operations = [ + migrations.RunPython(_one_off_move_alphamissense_dbnsfp) + ] From 5a04c47cafa7ca355d7d341615c97820eab6522b Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Tue, 5 Dec 2023 23:14:13 +1030 Subject: [PATCH 22/29] #850 - annotation --- ...pgeneannotationversion_options_and_more.py | 28 +++++++++++++++++++ annotation/models/models.py | 8 ++++-- .../bulk_vep_vcf_annotation_inserter.py | 2 +- annotation/vep_annotation.py | 1 - 4 files changed, 35 insertions(+), 4 deletions(-) create mode 100644 annotation/migrations/0085_alter_dbnsfpgeneannotationversion_options_and_more.py diff --git a/annotation/migrations/0085_alter_dbnsfpgeneannotationversion_options_and_more.py b/annotation/migrations/0085_alter_dbnsfpgeneannotationversion_options_and_more.py new file mode 100644 index 000000000..a72e77fa1 --- /dev/null +++ b/annotation/migrations/0085_alter_dbnsfpgeneannotationversion_options_and_more.py @@ -0,0 +1,28 @@ +# Generated by Django 4.1.4 on 2023-12-05 12:00 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ( + "annotation", + "0084_rename_alphamissense_pathogenicity_variantannotation_alphamissense_rankscore_and_more", + ), + ] + + operations = [ + migrations.AlterModelOptions( + name="dbnsfpgeneannotationversion", + options={}, + ), + migrations.AlterField( + model_name="dbnsfpgeneannotationversion", + name="md5_hash", + field=models.CharField(max_length=32), + ), + migrations.AlterUniqueTogether( + name="dbnsfpgeneannotationversion", + unique_together={("version", "md5_hash")}, + ), + ] diff --git 
a/annotation/models/models.py b/annotation/models/models.py index 79cea1783..fd9f41773 100644 --- a/annotation/models/models.py +++ b/annotation/models/models.py @@ -281,9 +281,13 @@ class ClinVarCitation(models.Model): class DBNSFPGeneAnnotationVersion(TimeStampedModel): - """ @see https://sites.google.com/site/jpopgen/dbNSFP """ + """ @see https://sites.google.com/site/jpopgen/dbNSFP + This isn't updated every release, so can have same hash across diff versions """ version = models.TextField(primary_key=True) - md5_hash = models.CharField(max_length=32, unique=True) + md5_hash = models.CharField(max_length=32) + + class Meta: + unique_together = ('version', 'md5_hash') def save(self, **kwargs): created = not self.pk diff --git a/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py b/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py index 6845acb88..4eb015447 100644 --- a/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py +++ b/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py @@ -609,7 +609,7 @@ def get_format_alphamissense_class_func(): """ GRCh37 has 'benign' while GRCh38 has 'likely_benign' @see https://github.com/Ensembl/VEP_plugins/issues/668 """ - cff = get_choice_formatter_func(AlphaMissensePrediction.CHOICES) + cff = get_choice_formatter_func(AlphaMissensePrediction.choices) def _format_alphamissense_class(alphamissense_class): if alphamissense_class == "benign": alphamissense_class = "likely_benign" diff --git a/annotation/vep_annotation.py b/annotation/vep_annotation.py index ee43f94b1..d8b0a11fc 100644 --- a/annotation/vep_annotation.py +++ b/annotation/vep_annotation.py @@ -144,7 +144,6 @@ def get_vep_command(vcf_filename, output_filename, genome_build: GenomeBuild, an if vc.columns_version >= 3: plugin_data_func.update({ - VEPPlugin.ALPHAMISSENSE: lambda: f"AlphaMissense,file={vc['alphamissense']}", VEPPlugin.MAVEDB: lambda: f"MaveDB,file={vc['mave']},single_aminoacid_changes=0,transcript_match=0 ", }) From 
14176f0846fb1cedb53a9e0c7909ad306ccd0e6c Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Wed, 6 Dec 2023 14:35:45 +1030 Subject: [PATCH 23/29] MAVE format. Be able to load page if huge logs --- annotation/templates/annotation/view_annotation_run.html | 4 ++-- annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/annotation/templates/annotation/view_annotation_run.html b/annotation/templates/annotation/view_annotation_run.html index 9b7883ac7..3dd3eeecb 100644 --- a/annotation/templates/annotation/view_annotation_run.html +++ b/annotation/templates/annotation/view_annotation_run.html @@ -74,8 +74,8 @@ {% labelled hint="chunky" label="VCF Dump Filename" %}{% code_shell annotation_run.vcf_dump_filename %}{% endlabelled %} {% labelled hint="chunky" label="VCF Annotated Filename" %}{% code_shell annotation_run.vcf_annotated_filename %}{% endlabelled %} {% labelled hint="chunky" label="Pipeline Command" %}{% code_shell annotation_run.pipeline_command %}{% endlabelled %} - {% labelled hint="chunky" label="Pipeline StdOut" %}{% code_shell annotation_run.pipeline_stdout %}{% endlabelled %} - {% labelled hint="chunky" label="Pipeline StdErr" %}{% code_shell annotation_run.pipeline_stderr %}{% endlabelled %} + {% labelled hint="chunky" label="Pipeline StdOut" %}{% code_shell annotation_run.pipeline_stdout|truncatechars:10000 %}{% endlabelled %} + {% labelled hint="chunky" label="Pipeline StdErr" %}{% code_shell annotation_run.pipeline_stderr|truncatechars:10000 %}{% endlabelled %} {% labelled hint="chunky" label="Error Exception" %}{% code_shell annotation_run.error_exceptionr %}{% endlabelled %} diff --git a/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py b/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py index 4eb015447..4eade20b7 100644 --- a/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py +++ b/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py @@ -136,6 +136,7 @@ 
def _get_vep_columns_from_csq(infos): def _add_vep_field_handlers(self): # TOPMED and 1k genomes can return multiple values - take highest + format_pick_lowest_float = get_clean_and_pick_single_value_func(min, float) format_pick_highest_float = get_clean_and_pick_single_value_func(max, float) format_pick_highest_int = get_clean_and_pick_single_value_func(max, int) remove_empty_multiples = get_clean_and_pick_single_value_func(join_uniq) @@ -173,6 +174,7 @@ def _add_vep_field_handlers(self): "mastermind_count_3_aa_change": get_clean_and_pick_single_value_func(operator.itemgetter(2), int), "mutation_assessor_pred_most_damaging": get_most_damaging_func(MutationAssessorPrediction), "mutation_taster_pred_most_damaging": get_most_damaging_func(MutationTasterPrediction), + "mavedb_score": format_pick_lowest_float, "nmd_escaping_variant": format_nmd_escaping_variant, # conservation fields are from BigWig, which can return multiple entries # for deletions. Higher = more conserved, so for rare disease filtering taking max makes sense From 3f825270f31ff93151a7c3272b6868583e3698b6 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Wed, 6 Dec 2023 16:50:11 +1030 Subject: [PATCH 24/29] Be able to reload annotation runs (was broken after we split standard/CNV) --- annotation/views.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/annotation/views.py b/annotation/views.py index 7d4b816c5..02387a6fd 100644 --- a/annotation/views.py +++ b/annotation/views.py @@ -394,8 +394,10 @@ def view_annotation_run(request, annotation_run_id): can_retry_annotation_run = False can_retry_annotation_run_upload = False if annotation_run.status == AnnotationStatus.ERROR: + # There may be other runs of different types (don't care about them) other_annotation_runs_qs = AnnotationRun.objects.filter( - annotation_range_lock=annotation_run.annotation_range_lock) + annotation_range_lock=annotation_run.annotation_range_lock, + pipeline_type=annotation_run.pipeline_type) 
other_annotation_runs_qs = other_annotation_runs_qs.exclude(status=AnnotationStatus.ERROR) can_retry_annotation_run = not other_annotation_runs_qs.exists() can_retry_annotation_run_upload = can_retry_annotation_run and annotation_run.vcf_annotated_filename From fc80a54d5447b546ca5c8e9fa2f1e960590daf50 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Wed, 6 Dec 2023 16:50:48 +1030 Subject: [PATCH 25/29] #938 - SV data processing scripts --- .../generate_annotation/gnomad4_process_sv.sh | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100755 annotation/annotation_data/generate_annotation/gnomad4_process_sv.sh diff --git a/annotation/annotation_data/generate_annotation/gnomad4_process_sv.sh b/annotation/annotation_data/generate_annotation/gnomad4_process_sv.sh new file mode 100755 index 000000000..a84704a19 --- /dev/null +++ b/annotation/annotation_data/generate_annotation/gnomad4_process_sv.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +export PATH=${PATH}:/hpcfs/groups/phoenix-hpc-sacgf/tools/tabix-0.2.6:/hpcfs/groups/phoenix-hpc-sacgf/tools/bcftools/current/bcftools + +# THIS_DIR=$(realpath "$(dirname "${BASH_SOURCE[0]}")") +THIS_DIR=/hpcfs/groups/phoenix-hpc-sacgf/reference/hg38/Misce/gnomAD4/sv +cd ${THIS_DIR} + +# Structural variants +SV_COLUMNS=INFO/SVLEN,INFO/SVTYPE,INFO/END +COLS=INFO/AC,INFO/AN,INFO/AF +OTHER_COUNTS=INFO/N_HOMREF,INFO/N_HET,INFO/N_HOMALT,INFO/POPMAX_AF,INFO/PAR +SUBPOPS=INFO/afr_AF,INFO/amr_AF,INFO/asj_AF,INFO/eas_AF,INFO/fin_AF,INFO/mid_AF,INFO/nfe_AF,INFO/oth_AF,INFO/sas_AF + +KEEP_COLUMNS=${SV_COLUMNS},${COLS},${OTHER_COUNTS},${SUBPOPS} +MAPPING_DIR=$(dirname ${THIS_DIR}) +CHROM_MAPPING_FILE=${MAPPING_DIR}/chrom_mapping_GRCh38.map +MERGE_VCF=gnomad.v4.0.sv.merged.vcf + +# gnomad v4 +merge_args=() +for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do + GNOMAD_VCF=gnomad.v4.0.sv.chr${chrom}.vcf.gz + #wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/genome_sv/${GNOMAD_VCF} + #wget 
https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/genome_sv/${GNOMAD_VCF}.tbi + + OUTPUT_VCF=gnomad.v4.0.sv.chr${chrom}.converted.vcf.gz + echo "Going from ${GNOMAD_VCF} -> ${OUTPUT_VCF}" + + # Don't normalize, as the refs are mostly "N" + bcftools annotate --exclude 'AC=0' --remove "^${KEEP_COLUMNS}" --rename-chrs=${CHROM_MAPPING_FILE} ${GNOMAD_VCF} -o ${OUTPUT_VCF} + merge_args+=(${OUTPUT_VCF}) +done + +bcftools concat --output-type b --output ${MERGE_VCF} ${merge_args[@]}; +bgzip ${MERGE_VCF} +tabix -p vcf ${MERGE_VCF}.gz \ No newline at end of file From 98611aee01371d879ba76e42a0b51ecfebe17ecf Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Wed, 6 Dec 2023 17:03:11 +1030 Subject: [PATCH 26/29] dbnsfp 4.5 processing scripts --- .../dbnsfp_grch37_strip.sh | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh b/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh index de6851765..e6d3ef889 100755 --- a/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh +++ b/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh @@ -27,19 +27,19 @@ set -e # print(",".join([str(c) for c in sorted(cols)])) # columns are: '3,4,5,6,8,9,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705' +# Note: We can't do this per-contig then join them, as some variants switch contigs between builds CUT_COLUMNS="3,4,5,6,8,9,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705" -SEQ_COL=5 # hg19_chr -POS_COL=6 # hg19_pos(1-based) -OUT_FILE=dbNSFP4.5a.grch37.stripped -TMP_DIR=/tmp # /hpcfs/groups/phoenix-hpc-sacgf/scratch/dbnsfp_GRCh37 -mkdir -p ${TMP_DIR} +SEQ_COL=3 # hg19_chr was col 5 (but 3rd after cut) +POS_COL=4 # hg19_pos(1-based) was 6 (but 4th after cut) -# Sort chromosomes individually as that's much more efficient -cat header.txt | cut -f ${CUT_COLUMNS} > ${OUT_FILE} -for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14
15 16 17 18 19 20 21 22 X Y; do - zgrep -h -v ^#chr dbNSFP4.5a_variant.chr${chrom}.gz | awk '$8 != "." ' | cut -f ${CUT_COLUMNS} | sort -T ${TMP_DIR} -k${SEQ_COL},${SEQ_COL} -k${POS_COL},${POS_COL}n - >> ${OUT_FILE} -done +version=4.5a +out_vcf=dbNSFP${version}_grch37.gz -bgzip ${OUT_FILE} -tabix -s ${SEQ_COL} -b ${POS_COL} -e ${POS_COL} ${OUT_FILE}.gz +# cd /hpcfs/groups/phoenix-hpc-sacgf/reference/annotation/dbnsfp/dbnsfp4.5 +zcat dbNSFP${version}_variant.chr1.gz | head -n1 > h +zgrep -h -v ^#chr dbNSFP${version}_variant.chr* | awk '$8 != "." ' | sort -T ${TMP_DIR} -k8,8 -k9,9n - | cat h - | bgzip -c > ${out_vcf} +zcat ${out_vcf} | cut -f ${CUT_COLUMNS} > dbNSFP${version}_grch37.stripped +bgzip dbNSFP${version}_grch37.stripped + +tabix -s ${SEQ_COL} -b ${POS_COL} -e ${POS_COL} dbNSFP${version}_grch37.stripped.gz From 49340d777b762281f547842f3d98b9c2d7816b6b Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Wed, 6 Dec 2023 17:06:29 +1030 Subject: [PATCH 27/29] Add missing info fields to get rid of warning --- annotation/annotation_data/generate_annotation/gnomad_data.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/annotation/annotation_data/generate_annotation/gnomad_data.py b/annotation/annotation_data/generate_annotation/gnomad_data.py index 2d461b990..7a2622a44 100755 --- a/annotation/annotation_data/generate_annotation/gnomad_data.py +++ b/annotation/annotation_data/generate_annotation/gnomad_data.py @@ -289,7 +289,9 @@ def write_vcf_header(version, info_fields, popmax_fields, sub_pops): else: af_desc = "" af_desc += f" made from (exomes_{ac_name} + genomes_{ac_name}) / (exomes_{an_name} + genomes_{an_name})" - meta += f'##INFO=\n' + meta += f'##INFO=\n' + meta += f'##INFO=\n' + meta += f'##INFO=\n' vcf_header = f"gnomad_{version}_vcf_header.txt.gz" with gzip.open(vcf_header, "wt") as f: From a5cd0990d52b3ae7ff853e630466f7be6e0860bc Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Thu, 7 Dec 2023 10:33:21 +1030 Subject: [PATCH 28/29] 
dbnsfp 4.5 processing scripts - left off "#" for GRCh37, get index cols right --- .../generate_annotation/dbnsfp_grch37_strip.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh b/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh index e6d3ef889..cfeada043 100755 --- a/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh +++ b/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh @@ -29,8 +29,8 @@ set -e # Note: We can't do this per-contig then join them, as some variants switch contigs between builds CUT_COLUMNS="3,4,5,6,8,9,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705" -SEQ_COL=3 # hg19_chr was col 5 (but 3rd after cut) -POS_COL=4 # hg19_pos(1-based) was 6 (but 4th after cut) +SEQ_COL=5 # hg19_chr (after cut) +POS_COL=6 # hg19_pos(1-based) (after cut) version=4.5a out_vcf=dbNSFP${version}_grch37.gz @@ -39,7 +39,7 @@ out_vcf=dbNSFP${version}_grch37.gz zcat dbNSFP${version}_variant.chr1.gz | head -n1 > h zgrep -h -v ^#chr dbNSFP${version}_variant.chr* | awk '$8 != "." 
' | sort -T ${TMP_DIR} -k8,8 -k9,9n - | cat h - | bgzip -c > ${out_vcf} -zcat ${out_vcf} | cut -f ${CUT_COLUMNS} > dbNSFP${version}_grch37.stripped -bgzip dbNSFP${version}_grch37.stripped +# Needs a '#' header +(echo -n "#" ; zcat ${out_vcf} | cut -f ${CUT_COLUMNS}) | bgzip > dbNSFP${version}_grch37.stripped.gz tabix -s ${SEQ_COL} -b ${POS_COL} -e ${POS_COL} dbNSFP${version}_grch37.stripped.gz From 263e3d3909164a652c6d9402864e990c1dca8d91 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Thu, 7 Dec 2023 16:48:54 +1030 Subject: [PATCH 29/29] #850 - Allow for custom MAVE "NA" results --- .../vcf_files/bulk_vep_vcf_annotation_inserter.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py b/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py index 4eade20b7..03e3ba757 100644 --- a/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py +++ b/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py @@ -136,7 +136,9 @@ def _get_vep_columns_from_csq(infos): def _add_vep_field_handlers(self): # TOPMED and 1k genomes can return multiple values - take highest - format_pick_lowest_float = get_clean_and_pick_single_value_func(min, float) + empty_mave_float_values = EMPTY_VALUES | {"NA"} + format_pick_lowest_float = get_clean_and_pick_single_value_func(min, float, + empty_values=empty_mave_float_values) format_pick_highest_float = get_clean_and_pick_single_value_func(max, float) format_pick_highest_int = get_clean_and_pick_single_value_func(max, int) remove_empty_multiples = get_clean_and_pick_single_value_func(join_uniq) @@ -643,17 +645,20 @@ def format_choice(raw_value): return format_choice -def get_clean_and_pick_single_value_func(pick_single_value_func, cast_func=None): +def get_clean_and_pick_single_value_func(pick_single_value_func, cast_func=None, empty_values=None): """ Returns a function to clean and pick single value. 
casting is performed before calling pick_single_value_func so you can call min/max """ + if empty_values is None: + empty_values = EMPTY_VALUES + def _clean_and_pick_single_value_func(raw_value): it = (tm for tm in raw_value.split(VEP_SEPARATOR) if tm != '') # Handle '.' if cast_func: - values = [cast_func(v) for v in it if v not in EMPTY_VALUES] + values = [cast_func(v) for v in it if v not in empty_values] else: - values = [v for v in it if v not in EMPTY_VALUES] + values = [v for v in it if v not in empty_values] value = None if values: value = pick_single_value_func(values)