From 6d297ef5f6c6cb70cd3ac149539c69603d355836 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Fri, 24 Nov 2023 16:36:25 +1030
Subject: [PATCH 01/29] #850 - new VEP 110 fields

---
 annotation/management/commands/vep_run.py     |  9 +++-
 annotation/vep_annotation.py                  | 49 ++++++++++++++-----
 .../settings/components/default_settings.py   |  6 +++
 3 files changed, 51 insertions(+), 13 deletions(-)

diff --git a/annotation/management/commands/vep_run.py b/annotation/management/commands/vep_run.py
index fe6b2aa7e..e93ee9927 100644
--- a/annotation/management/commands/vep_run.py
+++ b/annotation/management/commands/vep_run.py
@@ -20,10 +20,12 @@ class Command(BaseCommand):
 
     def add_arguments(self, parser):
         parser.add_argument('--test', action='store_true')
+        parser.add_argument('--cnv', action='store_true')
         parser.add_argument('--genome-build', required=True)
 
     def handle(self, *args, **options):
         test = options["test"]
+        cnv = options["cnv"]
         build_name = options["genome_build"]
         genome_build = GenomeBuild.get_name_or_alias(build_name)
 
@@ -47,9 +49,14 @@ def handle(self, *args, **options):
                 vcf_filename = os.path.join(settings.ANNOTATION_VCF_DUMP_DIR, f"{base_name}.vcf")
 
         output_filename = os.path.join(output_dir, f"{base_name}.{vep_suffix}.vcf.gz")
+        if cnv:
+            pipeline_type = VariantAnnotationPipelineType.CNV
+        else:
+            pipeline_type = VariantAnnotationPipelineType.STANDARD
+
         return_code, std_out, std_err = run_vep(vcf_filename, output_filename,
                                                 genome_build, genome_build.annotation_consortium,
-                                                VariantAnnotationPipelineType.STANDARD)
+                                                pipeline_type)
         if return_code != 0:
             logging.info(std_out)
             logging.error(std_err)
diff --git a/annotation/vep_annotation.py b/annotation/vep_annotation.py
index d9cfde420..fd80fbd02 100644
--- a/annotation/vep_annotation.py
+++ b/annotation/vep_annotation.py
@@ -131,26 +131,28 @@ def get_vep_command(vcf_filename, output_filename, genome_build: GenomeBuild, an
         ])
 
         # Plugins that require data - ok for these to fail when retrieving vep config
-        PLUGINS = {VEPPlugin.MASTERMIND: lambda: f"Mastermind,{vc['mastermind']},1",  # 1 to not filter
-                   VEPPlugin.MAXENTSCAN: lambda: f"MaxEntScan,{vc['maxentscan']}",
-                   VEPPlugin.DBNSFP: lambda: _get_dbnsfp_plugin_command(genome_build, vc),
-                   VEPPlugin.DBSCSNV: lambda: f"dbscSNV,{vc['dbscsnv']}",
-                   VEPPlugin.SPLICEAI: lambda: f"SpliceAI,snv={vc['spliceai_snv']},indel={vc['spliceai_indel']}"}
+        plugin_data_func = {
+            VEPPlugin.MASTERMIND: lambda: f"Mastermind,{vc['mastermind']},1",  # 1 to not filter
+            VEPPlugin.MAXENTSCAN: lambda: f"MaxEntScan,{vc['maxentscan']}",
+            VEPPlugin.DBNSFP: lambda: _get_dbnsfp_plugin_command(genome_build, vc),
+            VEPPlugin.DBSCSNV: lambda: f"dbscSNV,{vc['dbscsnv']}",
+            VEPPlugin.SPLICEAI: lambda: f"SpliceAI,snv={vc['spliceai_snv']},indel={vc['spliceai_indel']}"
+        }
 
         if vc.columns_version >= 2:
             cmd.extend(["--plugin", "NMD"])
 
-        for vep_plugin, plugin_arg_func in PLUGINS.items():
-            try:
-                cmd.extend(["--plugin", plugin_arg_func()])
-            except Exception as e:
-                logging.warning(e)
-                logging.warning("No annotation set for plugin: %s", vep_plugin)
+        if vc.columns_version >= 3:
+            plugin_data_func.update({
+                VEPPlugin.ALPHAMISSENSE: lambda: f"AlphaMissense,file={vc['alphamissense']}",
+                VEPPlugin.MAVEDB: lambda: f"MaveDB,file={vc['mave']},single_aminoacid_changes=0,transcript_match=0 ",
+            })
 
         # Custom
         for vep_custom, prefix in dict(VEPCustom.choices).items():
             try:
-                if fields := ColumnVEPField.get_source_fields(genome_build, vep_custom=vep_custom):
+                q = ColumnVEPField.get_columns_version_q(vc.columns_version)
+                if fields := ColumnVEPField.get_source_fields(genome_build, q, vep_custom=vep_custom):
                     prefix_lc = prefix.lower()
                     if cfg := vc[prefix_lc]:  # annotation settings are lower case
                         cmd.extend(_get_custom_params_list(fields, prefix, cfg))
@@ -162,6 +164,29 @@ def get_vep_command(vcf_filename, output_filename, genome_build: GenomeBuild, an
                 # Not all annotations available for all builds - ok to just warn
                 logging.warning("Skipped custom annotation: %s", prefix)
 
+    else:
+        plugin_data_func = {
+            # TODO: Need to decide on overlap criteria
+            # percentage : percentage overlap between SVs (default: 80)
+            # reciprocal : calculate reciprocal overlap, options: 0 or 1. (default: 0)
+            # (overlap is expressed as % of input SV by default)
+            # cols : colon delimited list of data types to return from the INFO fields (only AF by default)
+            # same_type : 1/0 only report SV of the same type (eg deletions for deletions, off by default)
+            # distance : the distance the ends of the overlapping SVs should be within.
+            # match_type : only report reference SV which lie within or completely surround the input SV
+            # options: within, surrounding
+            VEPPlugin.STRUCTURALVARIANTOVERLAP: lambda: f"StructuralVariantOverlap,file={vc['structuralvariantoverlap']}",
+        }
+
+    for vep_plugin, plugin_arg_func in plugin_data_func.items():
+        try:
+            cmd.extend(["--plugin", plugin_arg_func()])
+        except Exception as e:
+            logging.warning(e)
+            logging.warning("No annotation set for plugin: %s", vep_plugin)
+
+
+
     return cmd
 
 
diff --git a/variantgrid/settings/components/default_settings.py b/variantgrid/settings/components/default_settings.py
index b273b5d51..db9986e9d 100644
--- a/variantgrid/settings/components/default_settings.py
+++ b/variantgrid/settings/components/default_settings.py
@@ -230,11 +230,13 @@
         # so you can change just that variable and have everything else work
         # The names correspond to VEPPlugin or VEPCustom entries (but lower case)
         "vep_config": {
+            "alphamissense": "annotation_data/GRCh37/AlphaMissense_hg19.tsv.gz",
             "cosmic": "annotation_data/GRCh37/CosmicCodingMuts_v95_20211101_grch37.normal.vcf.gz",
             "dbnsfp": "annotation_data/GRCh37/dbNSFP4.0a.grch37.stripped.gz",
             "dbscsnv": "annotation_data/GRCh37/dbscSNV1.1_GRCh37.txt.gz",
             "gnomad2": "annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz",
             "mastermind": "annotation_data/GRCh37/mastermind_cited_variants_reference-2022.04.02-grch37.vcf.gz",
+            "mave": None, # n/a for GRCh37
             "maxentscan": "annotation_data/all_builds/maxentscan",
             'phastcons100way': "annotation_data/GRCh37/hg19.100way.phastCons.bw",
             'phastcons46way': "annotation_data/GRCh37/hg19.phastCons46way.placental.bw",
@@ -245,6 +247,7 @@
             "repeatmasker": "annotation_data/GRCh37/repeatmasker_hg19.bed.gz",
             "spliceai_snv": "annotation_data/GRCh37/spliceai_scores.raw.snv.hg19.vcf.gz",
             "spliceai_indel": "annotation_data/GRCh37/spliceai_scores.raw.indel.hg19.vcf.gz",
+            "structuralvariantoverlap": "annotation_data/GRCh37/gnomad_v2.1_sv.sites.grch37.converted.vcf.gz",
             "topmed": "annotation_data/GRCh37/TOPMED_GRCh37.vcf.gz",
             "uk10k": "annotation_data/GRCh37/UK10K_COHORT.20160215.sites.vcf.gz",
         }
@@ -262,12 +265,14 @@
         # so you can change just that variable and have everything else work
         # The names correspond to VEPPlugin or VEPCustom entries (but lower case)
         "vep_config": {
+            "alphamissense": "annotation_data/GRCh38/AlphaMissense_hg38.tsv.gz",
             "cosmic": "annotation_data/GRCh38/CosmicCodingMuts_v95_20211101_grch38.normal.vcf.gz",
             "dbnsfp": "annotation_data/GRCh38/dbNSFP4.0a.grch38.stripped.gz",
             "dbscsnv": "annotation_data/GRCh38/dbscSNV1.1_GRCh38.txt.gz",
             "gnomad2": "annotation_data/GRCh38/gnomad2.1.1_GRCh38_combined_af.vcf.bgz",
             "gnomad3": "annotation_data/GRCh38/gnomad3.1_GRCh38_merged.vcf.bgz",
             "mastermind": "annotation_data/GRCh38/mastermind_cited_variants_reference-2022.04.02-grch38.vcf.gz",
+            "mave": "annotation_data/GRCh38/MaveDB_variants.tsv.gz",
             "maxentscan": "annotation_data/all_builds/maxentscan",
             'phastcons100way': "annotation_data/GRCh38/hg38.phastCons100way.bw",
             'phastcons46way': None,  # n/a for GRCh38
@@ -278,6 +283,7 @@
             "repeatmasker": "annotation_data/GRCh38/repeatmasker_hg38.bed.gz",
             "spliceai_snv": "annotation_data/GRCh38/spliceai_scores.raw.snv.hg38.vcf.gz",
             "spliceai_indel": "annotation_data/GRCh38/spliceai_scores.raw.indel.hg38.vcf.gz",
+            "structuralvariantoverlap": "annotation_data/GRCh38/gnomad.v4.0.sv.merged.vcf.gz",
             "topmed": "annotation_data/GRCh38/TOPMED_GRCh38_20180418.vcf.gz",
             "uk10k": "annotation_data/GRCh38/UK10K_COHORT.20160215.sites.GRCh38.vcf.gz",
         }

From 9922a51658fad64c011c2bab8ea10534b68eb13e Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Mon, 27 Nov 2023 22:48:13 +1030
Subject: [PATCH 02/29] #938 - Add X/hemi stuff

---
 .../annotation_data/generate_annotation/gnomad4_data.py  | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/annotation/annotation_data/generate_annotation/gnomad4_data.py b/annotation/annotation_data/generate_annotation/gnomad4_data.py
index 8f68fc48d..01f8add6c 100755
--- a/annotation/annotation_data/generate_annotation/gnomad4_data.py
+++ b/annotation/annotation_data/generate_annotation/gnomad4_data.py
@@ -20,6 +20,7 @@
 COUNTS = ['AC', 'AN']
 OTHER_INFOS = ["nhomalt", "non_par", "faf95", "faf99", "fafmax_faf95_max", "fafmax_faf99_max"]
 GNOMAD_SUB_POPS = ["afr", "amr", "asj", "eas", "fin", "mid", "nfe", "remaining", "sas"]  # Will get AF for each
+CHR_X_ONLY = ["AC_XY", "AN_XY", "AF_XY"]
 
 # popmax/grpmax is calculated using non-bottlenecked genetic ancestry groups
 BOTTLENECKED_SUB_POPS = ["asj", "fin", "mid", "remaining"]
@@ -85,6 +86,8 @@ def write_scripts(args):
                 # To remove all INFO tags except "FOO" and "BAR", use "^INFO/FOO,INFO/BAR"
                 # @see https://samtools.github.io/bcftools/bcftools.html#annotate """
                 my_columns = columns.copy()
+                if chrom == "X":
+                    my_columns.extend(CHR_X_ONLY)
 
                 info_columns = [f"INFO/{i}" for i in my_columns]
                 keep_columns = ','.join(info_columns)  # AC/AN are special format fields
@@ -112,6 +115,7 @@ def write_scripts(args):
                 # Merge exomes/genome VCFs
                 # if we leave out rule, will take from 1st file which is ok for PAR as will be the same
                 skip_columns = {"non_par"}
+                # Default rule = "sum" if not below (or skipped)
                 rule_ops = {
                     # Will take higher of whatever is there in genomes/exomes
                     "faf95": "max",
@@ -120,7 +124,7 @@ def write_scripts(args):
                     "fafmax_faf99_max": "max",
                 }
                 info_rules = []
-                for c in columns:
+                for c in my_columns:
                     if c not in skip_columns:
                         op = rule_ops.get(c, "sum")
                         info_rules.append(f"{c}:{op}")
@@ -193,6 +197,9 @@ def write_vcf_header():
 ##INFO=<ID=AN_grpmax,Number=1,Type=Integer,Description="Allele Number for highest population">
 ##INFO=<ID=AC,Number=1,Type=Integer,Description="Alternate allele count (exomes + genomes)">
 ##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles  (exomes + genomes)">
+##INFO=<ID=AC_XY,Number=1,Type=Integer,Description="Alternate allele count for XY samples">
+##INFO=<ID=AF_XY,Number=1,Type=Float,Description="Alternate allele frequency in XY samples">
+##INFO=<ID=AN_XY,Number=1,Type=Integer,Description="Total number of alleles in XY samples">
 ##INFO=<ID=faf95,Number=1,Type=Float,Description="Filtering allele frequency (using Poisson 95%% CI) (max of exomes/genomes)">
 ##INFO=<ID=faf99,Number=1,Type=Float,Description="Filtering allele frequency (using Poisson 99%% CI) (max of exomes/genomes)">
 ##INFO=<ID=fafmax_faf95_max,Number=1,Type=Float,Description="Maximum filtering allele frequency (using Poisson 95%% CI) across genetic_ancestry groups (max of exomes/genomes)">

From f701538a8d8082811cf4b3367fcd905b345f1f6e Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Tue, 28 Nov 2023 13:36:26 +1030
Subject: [PATCH 03/29] Consistent names (we use filename to obtain gnomAD
 version in VAV)

---
 .../generate_annotation/gnomad4_data.py               | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/annotation/annotation_data/generate_annotation/gnomad4_data.py b/annotation/annotation_data/generate_annotation/gnomad4_data.py
index 01f8add6c..7dd190e1b 100755
--- a/annotation/annotation_data/generate_annotation/gnomad4_data.py
+++ b/annotation/annotation_data/generate_annotation/gnomad4_data.py
@@ -14,6 +14,7 @@
 from argparse import ArgumentParser
 from datetime import datetime
 
+GNOMAD_VERSION="4.0"
 GRCh38 = "GRCh38"
 
 # We deliberately leave out AF and "grpmax" stuff as we recalculate that later in 'calculate_allele_frequency'
@@ -31,7 +32,7 @@ def get_args():
     parser.add_argument("--test", action='store_true', help="Only download 5k of each file.")
     # parser.add_argument("--genome-fasta", help='Fasta (correct for build)')
     parser.add_argument("--chrom-mapping-file", help='bcftools chromosome conversion')
-    parser.add_argument("--version", help='gnomAD version (default: 4.0)', default='4.0')
+    parser.add_argument("--version", help=f'gnomAD version (default: {GNOMAD_VERSION})', default=GNOMAD_VERSION)
     parser.add_argument("--path", help='Colon separated paths for tabix/bgzip/vt/bcftools')
     parser.add_argument("--gnomad-input-vcf")
     parser.add_argument("--af-output-vcf")
@@ -75,7 +76,7 @@ def write_scripts(args):
     chrom_scripts = []
     af_vcfs = []
     for chrom in CHROMOSOMES:
-        prefix = f"gnomad4_chr{chrom}"
+        prefix = f"gnomad{GNOMAD_VERSION}_{GRCh38}_chr{chrom}"
         chrom_script = f"{prefix}.sh"
         chrom_scripts.append(chrom_script)
         with open(chrom_script, "w") as cs:
@@ -94,7 +95,7 @@ def write_scripts(args):
                 output_vcf = f"{prefix}_{vcf_type}.filtered_info.vcf.gz"
                 annotate_args = f"--rename-chrs={args.chrom_mapping_file}"
 
-                gnomad_vcf_filename = f"gnomad.{vcf_type}.v4.0.sites.chr{chrom}.vcf.bgz"
+                gnomad_vcf_filename = f"gnomad.{vcf_type}.{GNOMAD_VERSION}_{GRCh38}.sites.chr{chrom}.vcf.bgz"
 
                 # bcftools merge doesn't work with type='A'
                 # bcftools now works with AC/AN etc - see https://github.com/samtools/bcftools/issues/1394
@@ -150,8 +151,8 @@ def write_scripts(args):
     with open(merge_script_filename, "w") as ms:
         ms.write(bash_header)
         quoted_files = ' '.join([f"'{f}'" for f in af_vcfs])
-        gnomad_combined_af_vcf = f"gnomad4_combined_af.vcf.bgz"
-        ms.write(f"zcat {vcf_header} {quoted_files} | bgzip > {gnomad_combined_af_vcf}\n")
+        gnomad_combined_af_vcf = f"gnomad{GNOMAD_VERSION}_{GRCh38}_combined_af.vcf.bgz"
+        ms.write(f"cat {vcf_header} {quoted_files} > {gnomad_combined_af_vcf}\n")
         ms.write(f"tabix {gnomad_combined_af_vcf}\n")
 
     launch_script_filename = f"gnomad4_launch.sh"

From 6c4bd6460c3f73d94b3b78ae20e92704e12bfef5 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Tue, 28 Nov 2023 14:29:13 +1030
Subject: [PATCH 04/29] consolidate download scripts

---
 .../generate_annotation/gnomad2.1_download.sh | 19 +++++++++++
 .../generate_annotation/gnomad4_download.sh   | 22 +++++++++++++
 .../gnomad4_download_exomes.sh                | 10 ------
 .../gnomad4_download_genomes.sh               | 10 ------
 .../gnomad4_download_structural.sh            | 33 -------------------
 5 files changed, 41 insertions(+), 53 deletions(-)
 create mode 100644 annotation/annotation_data/generate_annotation/gnomad2.1_download.sh
 create mode 100644 annotation/annotation_data/generate_annotation/gnomad4_download.sh
 delete mode 100644 annotation/annotation_data/generate_annotation/gnomad4_download_exomes.sh
 delete mode 100644 annotation/annotation_data/generate_annotation/gnomad4_download_genomes.sh
 delete mode 100644 annotation/annotation_data/generate_annotation/gnomad4_download_structural.sh

diff --git a/annotation/annotation_data/generate_annotation/gnomad2.1_download.sh b/annotation/annotation_data/generate_annotation/gnomad2.1_download.sh
new file mode 100644
index 000000000..b629dee4a
--- /dev/null
+++ b/annotation/annotation_data/generate_annotation/gnomad2.1_download.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# gnomad v2.1.1
+
+# Exomes
+for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
+  wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/2.1.1/vcf/exomes/gnomad.exomes.r2.1.1.sites.${chrom}.vcf.bgz
+  wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/2.1.1/vcf/exomes/gnomad.exomes.r2.1.1.sites.${chrom}.vcf.bgz.tbi
+done
+
+# Genomes
+for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
+  wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.${chrom}.vcf.bgz
+  wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.${chrom}.vcf.bgz.tbi
+done
+
+# Structural
+wget https://gnomad-public-us-east-1.s3.amazonaws.com/papers/2019-sv/gnomad_v2.1_sv.sites.vcf.gz
+wget https://gnomad-public-us-east-1.s3.amazonaws.com/papers/2019-sv/gnomad_v2.1_sv.sites.vcf.gz.tbi
diff --git a/annotation/annotation_data/generate_annotation/gnomad4_download.sh b/annotation/annotation_data/generate_annotation/gnomad4_download.sh
new file mode 100644
index 000000000..59ec1e333
--- /dev/null
+++ b/annotation/annotation_data/generate_annotation/gnomad4_download.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# gnomad v4.0
+
+# Exomes
+for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
+  wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr${chrom}.vcf.bgz
+  wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr${chrom}.vcf.bgz.tbi
+done
+
+# Genomes
+for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
+  wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr${chrom}.vcf.bgz
+  wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr${chrom}.vcf.bgz.tbi
+done
+
+# Structural
+for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
+  GNOMAD_VCF=gnomad.v4.0.sv.chr${chrom}.vcf.gz
+  wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/genome_sv/${GNOMAD_VCF}
+  wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/genome_sv/${GNOMAD_VCF}.tbi
+done
\ No newline at end of file
diff --git a/annotation/annotation_data/generate_annotation/gnomad4_download_exomes.sh b/annotation/annotation_data/generate_annotation/gnomad4_download_exomes.sh
deleted file mode 100644
index 1375d380b..000000000
--- a/annotation/annotation_data/generate_annotation/gnomad4_download_exomes.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash
-
-
-# Structural variants
-
-# gnomad v4
-for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
-  wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr${chrom}.vcf.bgz
-  wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr${chrom}.vcf.bgz.tbi
-done
\ No newline at end of file
diff --git a/annotation/annotation_data/generate_annotation/gnomad4_download_genomes.sh b/annotation/annotation_data/generate_annotation/gnomad4_download_genomes.sh
deleted file mode 100644
index dcb8b239d..000000000
--- a/annotation/annotation_data/generate_annotation/gnomad4_download_genomes.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash
-
-
-# Structural variants
-
-# gnomad v4
-for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
-  wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr${chrom}.vcf.bgz
-  wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr${chrom}.vcf.bgz.tbi
-done
\ No newline at end of file
diff --git a/annotation/annotation_data/generate_annotation/gnomad4_download_structural.sh b/annotation/annotation_data/generate_annotation/gnomad4_download_structural.sh
deleted file mode 100644
index a2098050e..000000000
--- a/annotation/annotation_data/generate_annotation/gnomad4_download_structural.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-THIS_DIR=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
-
-# Structural variants
-SV_COLUMNS=INFO/SVLEN,INFO/SVTYPE,INFO/END
-COLS=INFO/AC,INFO/AN,INFO/AF
-OTHER_COUNTS=INFO/N_HOMREF,INFO/N_HET,INFO/N_HOMALT
-SUBPOPS=INFO/afr_AF,INFO/amr_AF,INFO/asj_AF,INFO/eas_AF,INFO/fin_AF,INFO/mid_AF,INFO/nfe_AF,INFO/oth_AF,INFO/sas_AF
-
-KEEP_COLUMNS=${SV_COLUMNS},${COLS},${OTHER_COUNTS},${SUBPOPS}
-CHROM_MAPPING_FILE=${THIS_DIR}/../../../snpdb/genome/chrom_mapping_GRCh38.map
-GENOME_FASTA=/data/annotation/fasta/GCF_000001405.40_GRCh38.p14_genomic.fna.gz
-
-# gnomad v4
-for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
-  GNOMAD_VCF=gnomad.v4.0.sv.chr${chrom}.vcf.gz
-  wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/genome_sv/${GNOMAD_VCF}
-  wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/genome_sv/${GNOMAD_VCF}.tbi
-
-  OUTPUT_VCF=
-  # bcftools annotate --exclude 'AC=0' --remove '^{KEEP_COLUMNS}' --rename-chrs={CHROM_MAPPING_FILE} | vt normalize - -r ${GENOME_FASTA} -o + | vt uniq + -o ${OUTPUT_VCF}
-
-done
-
-
-# OTHER_INFOS = ["AC_popmax", "AN_popmax", "AF_popmax", "popmax", "nhomalt", "nhomalt_popmax", "nonpar"]
-# GNOMAD_SUB_POPS = ["afr", "amr", "asj", "eas", "fin", "mid", "nfe", "oth", "sas"]  # Will get AF for each
-
-# These have been removed in v4 - "AC_popmax", "AN_popmax", "AF_popmax"
-# nonpar is now "par"
-
-

From 5860cc153c550151e5428c884a41fcf2ecab27cd Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Wed, 29 Nov 2023 15:58:49 +1030
Subject: [PATCH 05/29] #938 - gnomAD - consolidate scripts

---
 .../generate_annotation/gnomad2_data.py       | 270 ------------------
 .../gnomad3.1.2_download.sh                   |   7 +
 .../gnomad3_create_genome_scripts.py          | 114 --------
 ...mad4_download.sh => gnomad4.0_download.sh} |   0
 .../{gnomad4_data.py => gnomad_data.py}       | 210 +++++++++-----
 5 files changed, 150 insertions(+), 451 deletions(-)
 delete mode 100755 annotation/annotation_data/generate_annotation/gnomad2_data.py
 create mode 100644 annotation/annotation_data/generate_annotation/gnomad3.1.2_download.sh
 delete mode 100755 annotation/annotation_data/generate_annotation/gnomad3_create_genome_scripts.py
 rename annotation/annotation_data/generate_annotation/{gnomad4_download.sh => gnomad4.0_download.sh} (100%)
 rename annotation/annotation_data/generate_annotation/{gnomad4_data.py => gnomad_data.py} (50%)

diff --git a/annotation/annotation_data/generate_annotation/gnomad2_data.py b/annotation/annotation_data/generate_annotation/gnomad2_data.py
deleted file mode 100755
index bf03d8218..000000000
--- a/annotation/annotation_data/generate_annotation/gnomad2_data.py
+++ /dev/null
@@ -1,270 +0,0 @@
-#!/usr/bin/env python3
-"""
-We want to do this per-chrom so we can process in parallel
-
-Steps are:
-    1. Download exomes.vcf + genome.vcf, removing most INFO fields before writing to disk (to reduce disk space)
-    2. Merge exome + genome, summing counts
-    3. Run through this script with --af to calculate allele frequency, write TSV (more efficient than VCF)
-    4. Cat them all together again
-"""
-
-import gzip
-import os
-from argparse import ArgumentParser
-from datetime import datetime
-
-from cyvcf2 import VCF
-
-GRCh37 = "GRCh37"
-GRCh38 = "GRCh38"
-BUILDS = [GRCh37, GRCh38]
-
-COUNTS = ['AC', 'AN']
-OTHER_INFOS = ["nhomalt"]
-GNOMAD_SUB_POPS = ["afr", "amr", "asj", "eas", "fin", "nfe", "oth", "sas"]  # Will get AN/AC for these
-CHROMOSOMES = list(map(str, range(1, 23))) + ['X', 'Y']
-
-
-def get_args():
-    parser = ArgumentParser(description="Merge exome+genome VCFs for VariantGrid VEP pipeline")
-    parser.add_argument("--test", action='store_true', help="Only download 5k of each file.")
-    parser.add_argument("--genome-build", help='GRCh37 or GRCh38')
-    parser.add_argument("--genome-fasta", help='Fasta (correct for build)')
-    parser.add_argument("--genome-fasta-has-chr", type=bool, help='GnomAD has "chr1", set to false if ref uses "1"')
-    parser.add_argument("--version", help='gnomAD version (default: 2.1.1)', default='2.1.1')
-    parser.add_argument("--gnomad-input-vcf")
-    parser.add_argument("--af-output-vcf")
-
-    group = parser.add_mutually_exclusive_group(required=True)
-    group.add_argument('--scripts', action='store_true', help="Generate scripts")
-    group.add_argument('--af', action='store_true', help="Calculate allele frequency from VCF")
-
-    args = parser.parse_args()
-    if args.scripts:
-        if args.genome_build is None or args.genome_build not in BUILDS:
-            parser.error(f"--genome-build must be one of {','.join(BUILDS)}")
-        if args.genome_fasta is None:
-            parser.error("--genome-fasta required for --scripts")
-    else:
-        if args.gnomad_input_vcf is None:
-            parser.error("--gnomad-input-vcf required for --af")
-        if args.af_output_vcf is None:
-            parser.error("--af-output-vcf required for --af")
-
-    return args
-
-
-def main(args):
-    if args.scripts:
-        write_scripts(args)
-    else:
-        calculate_allele_frequency(args.gnomad_input_vcf, args.af_output_vcf)
-
-
-def write_scripts(args):
-    genome_build = args.genome_build
-    version = args.version
-    genome_fasta = args.genome_fasta
-    if args.test:
-        # only download 5k lines of file
-        extra_filters = "| bgzip -d | head -5000 | bcftools view -O z"
-    else:
-        extra_filters = ""  # nothing
-
-    if not args.genome_fasta_has_chr:
-        chrom_mapping_file = None
-    else:
-        chrom_mapping_file = write_chrom_mapping_file()
-
-    columns = get_columns()
-    bash_header = "#!/bin/bash\nset -e # fail on error\n"
-
-    chrom_scripts = []
-    af_vcfs = []
-    for chrom in CHROMOSOMES:
-        prefix = f"gnomad_{genome_build}_chr{chrom}"
-        chrom_script = f"{prefix}.sh"
-        chrom_scripts.append(chrom_script)
-        with open(chrom_script, "w") as cs:
-            cs.write(bash_header)
-
-            output_vcfs = []
-            for vcf_type in ["exomes", "genomes"]:
-                if genome_build == GRCh37:
-                    url = f"https://storage.googleapis.com/gnomad-public/release/{version}/vcf/{vcf_type}/gnomad.{vcf_type}.r{version}.sites.{chrom}.vcf.bgz"
-                else:
-                    url = f"https://storage.googleapis.com/gnomad-public/release/{version}/liftover_grch38/vcf/{vcf_type}/gnomad.{vcf_type}.r{version}.sites.{chrom}.liftover_grch38.vcf.bgz"
-
-                # To remove all INFO tags except "FOO" and "BAR", use "^INFO/FOO,INFO/BAR"
-                # @see https://samtools.github.io/bcftools/bcftools.html#annotate """
-                my_columns = columns.copy()
-                if vcf_type == "genomes":  # No SAS in genomes
-                    if chrom == 'Y':
-                        continue  # No Y in genomes
-                    my_columns.remove("AC_sas")
-                    my_columns.remove("AN_sas")
-
-                info_columns = [f"INFO/{i}" for i in my_columns]
-                keep_columns = ','.join(info_columns)  # AC/AN are special format fields
-                output_vcf = f"{prefix}_{vcf_type}.filtered_info.vcf.gz"
-                if chrom_mapping_file:
-                    annotate_args = f"--rename-chrs={chrom_mapping_file}"
-                else:
-                    annotate_args = ""
-
-                # bcftools merge doesn't work with type='A' or special AC/AN INFO fields w/o a FORMAT (which gnomAD doesn't have)
-                modify_fields = "sed -e 's/,Number=A,/,Number=1,/' -e 's/ID=AC,/ID=AC_count,/' -e 's/ID=AN,/ID=AN_count,/' -e 's/AC=/AC_count=/' -e 's/AN=/AN_count=/'"
-                # gnomAD appears to already be decomposed - vt decompose + -s -o +
-                cs.write("\necho Download and clean as we go to save disk\n")
-                cs.write(f"wget --quiet -O - {url} {extra_filters} | bcftools annotate --exclude 'AC=0' --remove '^{keep_columns}' {annotate_args} | {modify_fields} | vt normalize - -r {genome_fasta} -o + | vt uniq + -o {output_vcf}\n")
-                output_vcfs.append(output_vcf)
-
-            combined_vcf = f"{prefix}.combined.vcf.gz"
-            if len(output_vcfs) == 1:  # Just 1, rename it
-                output_vcf = output_vcfs[0]
-                cs.write(f"mv {output_vcf} {combined_vcf}\n")
-            else:
-                for ov in output_vcfs:
-                    cs.write(f"tabix {ov}\n")
-
-                # Merge - adding them together...
-                renamed_columns = [f"{c}_count" if c in ['AC', 'AN'] else c for c in columns]
-                info_rules = [f"{c}:sum" for c in renamed_columns]
-                info_rules_arg = ','.join(info_rules)
-                cs.write("\n\necho Merging VCFs - will keep flags from genomes.\n")
-                cs.write(f"bcftools merge --merge none --info-rules '{info_rules_arg}' '{output_vcfs[0]}' '{output_vcfs[1]}' -O z -o {combined_vcf}\n")
-
-            # Now process them with this script
-            cs.write("\n\necho Calculate Allele Frequency\n")
-            script_filename = os.path.realpath(__file__)
-            allele_frequency_vcf = f"{prefix}.af.vcf.gz"
-            cs.write(f"{script_filename} --af --gnomad-input-vcf={combined_vcf} --af-output-vcf={allele_frequency_vcf}\n")
-            af_vcfs.append(allele_frequency_vcf)
-
-        if args.test:
-            break  # Only do 1 chrom
-
-    # Write merge script
-    merge_script_filename = f"gnomad_{genome_build}_merge.sh"
-    vcf_header = write_vcf_header()
-
-    with open(merge_script_filename, "w") as ms:
-        ms.write(bash_header)
-        quoted_files = ' '.join([f"'{f}'" for f in af_vcfs])
-        gnomad_combined_af_vcf = f"gnomad_{genome_build}_combined_af.vcf.bgz"
-        ms.write(f"gzcat {vcf_header} {quoted_files} | bgzip > {gnomad_combined_af_vcf}\n")
-        ms.write(f"tabix {gnomad_combined_af_vcf}\n")
-
-    launch_script_filename = f"gnomad_{genome_build}_launch.sh"
-    with open(launch_script_filename, "w") as ms:
-        ms.write(bash_header)
-        ms.write('SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}")\n')
-        for cs in chrom_scripts:
-            ms.write(f"${{SCRIPT_DIR}}/{cs} > {cs}.log 2> {cs}.stderr.log &\n")
-
-        ms.write("echo Waiting for all chroms to finish...\n")
-        ms.write("wait\n")
-        ms.write(f"${{SCRIPT_DIR}}/{merge_script_filename}\n")
-
-
-def get_columns():
-    columns = COUNTS + OTHER_INFOS
-    for g in GNOMAD_SUB_POPS:
-        for f in ["AC", "AN"]:
-            columns.append(f"{f}_{g.lower()}")
-    return columns
-
-
-def get_af_info():
-    af_info = [
-        ("AF", None, "AC_count", "AN_count"),
-    ]
-    for g in GNOMAD_SUB_POPS:
-        af_info.append((f'AF_{g}', g, f'AC_{g}', f'AN_{g}'))
-    return af_info
-
-
-def write_vcf_header():
-    """ Needs to be gzipped so can be concatenated with other gzipped files """
-
-    now = datetime.now()
-    file_date = "%d%02d%02d" % (now.year, now.month, now.day)
-    source = __file__
-    meta = """##fileformat=VCFv4.2
-##fileDate=%(file_date)s
-##source=%(source)s
-##INFO=<ID=AF_popmax,Number=1,Type=Float,Description="Allele Frequency for highest population">
-##INFO=<ID=popmax,Number=1,Type=String,Description="Population with highest allele frequency (stored as AF_popmax)">
-##INFO=<ID=nhomalt,Number=1,Type=Integer,Description="Total number of homozygotest (exomes + genomes)">
-##INFO=<ID=gnomad_filtered,Number=1,Type=Integer,Description="Exomes or genomes had a filter entry (potential QC issues)">
-""" % {"file_date": file_date, "source": source}
-
-    af_info = get_af_info()
-    for info_id, pop_name, ac_name, an_name in af_info:
-        if pop_name:
-            af_desc = f"for {pop_name}"
-        else:
-            af_desc = ""
-        af_desc += f" made from (exomes_{ac_name} + genomes_{ac_name}) / (exomes_{an_name} + genomes_{an_name})"
-        meta += f'##INFO=<ID={info_id},Number=1,Type=Float,Description="Allele Frequency {af_desc}">\n'
-
-    vcf_header = "vcf_header.txt.gz"
-    with gzip.open(vcf_header, "wt") as f:
-        f.write(meta)
-        header_cols = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
-        header = "#" + '\t'.join(header_cols)
-        f.write(header + "\n")
-    return vcf_header
-
-
-def write_chrom_mapping_file():
-    chrom_mapping_file = "chrom_mapping.txt"
-    with open(chrom_mapping_file, "w") as f:
-        for c in CHROMOSOMES:
-            f.write(f"chr{c}\t{c}\n")
-    return chrom_mapping_file
-
-
-def calculate_allele_frequency(gnomad_input_vcf, af_output_vcf):
-    # We have to re-calculate POPMAX as we can't merge it
-    af_info = get_af_info()
-    info_names = [ai[0] for ai in af_info] + OTHER_INFOS + ["AF_popmax", "popmax", "gnomad_filtered"]
-
-    with gzip.open(af_output_vcf, "wt") as f:
-        for variant in VCF(gnomad_input_vcf):
-            chrom = variant.CHROM
-            pos = str(variant.POS)
-            variant_id = variant.ID or '.'
-            ref = variant.REF
-            alt = variant.ALT[0]  # no multi-alts
-
-            af_popmax = 0
-            popmax = '.'
-            infos = []
-            for _, pop_name, ac_name, an_name in af_info:
-                ac = variant.INFO.get(ac_name, 0)
-                an = variant.INFO.get(an_name)
-                #print(f"{ac_name}/{an_name} {ac}/{an}")
-                if an:
-                    af = ac / an
-                    if pop_name and af > af_popmax:  # Only use subpops
-                        af_popmax = af
-                        popmax = pop_name
-                    af = f'{af:.6f}'
-                else:
-                    af = '.'
-                infos.append(af)
-
-            for o in OTHER_INFOS:
-                infos.append(str(variant.INFO.get(o, '.')))
-            gnomad_filtered = '0' if variant.FILTER is None else '1'
-            infos.extend([str(af_popmax), popmax, gnomad_filtered])
-            info_str = ";".join([i + "=" + v for i, v in zip(info_names, infos)])
-            columns = [chrom, pos, variant_id, ref, alt, '.', '.', info_str]
-            f.write("\t".join(columns) + "\n")
-
-
-if __name__ == "__main__":
-    args = get_args()
-    main(args)
diff --git a/annotation/annotation_data/generate_annotation/gnomad3.1.2_download.sh b/annotation/annotation_data/generate_annotation/gnomad3.1.2_download.sh
new file mode 100644
index 000000000..6e32e0975
--- /dev/null
+++ b/annotation/annotation_data/generate_annotation/gnomad3.1.2_download.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# gnomAD 3.1 only had genomes (no exomes) - fetch each chromosome VCF and its tabix index
+for chrom in {1..22} X Y; do
+  wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/3.1.2/vcf/genomes/gnomad.genomes.v3.1.2.sites.chr${chrom}.vcf.bgz
+  wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/3.1.2/vcf/genomes/gnomad.genomes.v3.1.2.sites.chr${chrom}.vcf.bgz.tbi
+done
diff --git a/annotation/annotation_data/generate_annotation/gnomad3_create_genome_scripts.py b/annotation/annotation_data/generate_annotation/gnomad3_create_genome_scripts.py
deleted file mode 100755
index 822013d1f..000000000
--- a/annotation/annotation_data/generate_annotation/gnomad3_create_genome_scripts.py
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/usr/bin/env python3
-"""
-    The gnomAD v3.1.2 data set contains 76,156 whole genomes (and no exomes), all mapped to the GRCh38 reference sequence.
-"""
-
-from argparse import ArgumentParser
-
-GRCh38 = "GRCh38"
-BUILDS = [GRCh38]
-
-COUNTS = ['AC', 'AN', 'AF']
-OTHER_INFOS = ["AC_popmax", "AN_popmax", "AF_popmax", "popmax", "nhomalt", "nhomalt_popmax", "nonpar"]
-GNOMAD_SUB_POPS = ["afr", "amr", "asj", "eas", "fin", "nfe", "oth", "sas"]  # Will get AF for each
-CHROMOSOMES = list(map(str, range(1, 23))) + ['X', 'Y']
-
-
-def get_args():
-    parser = ArgumentParser(description="Get, strip and merge gnomAD VCFs for VariantGrid VEP pipeline")
-    parser.add_argument("--test", action='store_true', help="Only download 5k of each file.")
-    parser.add_argument("--genome-fasta", required=True, help='Fasta (correct for build)')
-    parser.add_argument("--chrom_mapping_file", help="Mapping file to convert chroms (if you get 'the sequence 'chr1' was not found)'")
-    return parser.parse_args()
-
-
-def main(args):
-    genome_build = GRCh38
-    genome_fasta = args.genome_fasta
-    if args.test:
-        # only download 5k lines of file
-        extra_filters = "| bgzip -d | head -5000 | bcftools view -O z"
-    else:
-        extra_filters = ""  # nothing
-
-    # To remove all INFO tags except "FOO" and "BAR", use "^INFO/FOO,INFO/BAR"
-    # @see https://samtools.github.io/bcftools/bcftools.html#annotate """
-    info_columns = [f"INFO/{i}" for i in get_columns()]
-    keep_columns = ','.join(info_columns)  # AC/AN are special format fields
-    bash_header = "#!/bin/bash\nset -e # fail on error\n"
-
-    chrom_scripts = []
-    chrom_vcfs = []
-    for chrom in CHROMOSOMES:
-        prefix = f"gnomad_{genome_build}_chr{chrom}"
-        chrom_script = f"{prefix}.sh"
-        chrom_scripts.append(chrom_script)
-        with open(chrom_script, "w") as cs:
-            cs.write(bash_header)
-            # gnomAD3.1 only has genomes, no exomes
-            url = f"https://storage.googleapis.com/gcp-public-data--gnomad/release/3.1.2/vcf/genomes/gnomad.genomes.v3.1.3.sites.chr{chrom}.vcf.bgz"
-            output_vcf = f"{prefix}.filtered_info.vcf.gz"
-            if args.chrom_mapping_file:
-                annotate_args = f"--rename-chrs={args.chrom_mapping_file}"
-            else:
-                annotate_args = ""
-
-            # gnomAD appears to already be decomposed - vt decompose + -s -o +
-            cs.write("\necho Download and clean as we go to save disk\n")
-            cs.write(f"wget --quiet -O - {url} {extra_filters} | bcftools annotate --exclude 'AC=0' --remove '^{keep_columns}' {annotate_args} | vt normalize - -r {genome_fasta} -o + | vt uniq + -o {output_vcf}\n")
-
-        chrom_vcfs.append(output_vcf)
-        if args.test:
-            break  # Only do 1 chrom
-
-    # Write merge script
-    merge_script_filename = f"gnomad_{genome_build}_merge.sh"
-
-    vcf_header = write_vcf_header()
-
-    with open(merge_script_filename, "w") as ms:
-        ms.write(bash_header)
-        quoted_files = ' '.join([f"'{f}'" for f in chrom_vcfs])
-        gnomad_combined_af_vcf = f"gnomad3_{genome_build}_combined.vcf.bgz"
-        ms.write(f"zcat {chrom_vcfs[0]} | head -1000 | grep '^#' | bgzip > vcf_header.bgz")
-        ms.write(f"gzcat vcf_header.bgz {quoted_files} | bgzip > {gnomad_combined_af_vcf}\n")
-        ms.write(f"tabix {gnomad_combined_af_vcf}\n")
-
-    launch_script_filename = f"gnomad_{genome_build}_launch.sh"
-    with open(launch_script_filename, "w") as ms:
-        ms.write(bash_header)
-        ms.write('SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}")\n')
-        for cs in chrom_scripts:
-            ms.write(f"${{SCRIPT_DIR}}/{cs} > {cs}.log 2> {cs}.stderr.log &\n")
-
-        ms.write("echo Waiting for all chroms to finish...\n")
-        ms.write("wait\n")
-        ms.write(f"${{SCRIPT_DIR}}/{merge_script_filename}\n")
-
-
-def write_vcf_header():
-    """ Needs to be gzipped so can be concatenated with other gzipped files """
-    vcf_header = ""
-    return vcf_header
-
-
-def get_columns():
-    columns = COUNTS + OTHER_INFOS
-    for g in GNOMAD_SUB_POPS:
-        # gnomAD 3 changed from underscore to dash
-        # 3.1.2 changed back to underscore
-        columns.append(f"AF_{g.lower()}")
-    return columns
-
-
-def write_chrom_mapping_file():
-    chrom_mapping_file = "chrom_mapping.txt"
-    with open(chrom_mapping_file, "w") as f:
-        for c in CHROMOSOMES:
-            f.write(f"chr{c}\t{c}\n")
-    return chrom_mapping_file
-
-
-if __name__ == "__main__":
-    args = get_args()
-    main(args)
diff --git a/annotation/annotation_data/generate_annotation/gnomad4_download.sh b/annotation/annotation_data/generate_annotation/gnomad4.0_download.sh
similarity index 100%
rename from annotation/annotation_data/generate_annotation/gnomad4_download.sh
rename to annotation/annotation_data/generate_annotation/gnomad4.0_download.sh
diff --git a/annotation/annotation_data/generate_annotation/gnomad4_data.py b/annotation/annotation_data/generate_annotation/gnomad_data.py
similarity index 50%
rename from annotation/annotation_data/generate_annotation/gnomad4_data.py
rename to annotation/annotation_data/generate_annotation/gnomad_data.py
index 7dd190e1b..aa9c55ce0 100755
--- a/annotation/annotation_data/generate_annotation/gnomad4_data.py
+++ b/annotation/annotation_data/generate_annotation/gnomad_data.py
@@ -13,27 +13,64 @@
 import os
 from argparse import ArgumentParser
 from datetime import datetime
-
-GNOMAD_VERSION="4.0"
-GRCh38 = "GRCh38"
-
-# We deliberately leave out AF and "grpmax" stuff as we recalculate that later in 'calculate_allele_frequency'
-COUNTS = ['AC', 'AN']
-OTHER_INFOS = ["nhomalt", "non_par", "faf95", "faf99", "fafmax_faf95_max", "fafmax_faf99_max"]
-GNOMAD_SUB_POPS = ["afr", "amr", "asj", "eas", "fin", "mid", "nfe", "remaining", "sas"]  # Will get AF for each
-CHR_X_ONLY = ["AC_XY", "AN_XY", "AF_XY"]
+from typing import Tuple, List
+
+GNOMAD_V_2_1 = "2.1.1"
+GNOMAD_V_3_1_2 = "3.1.2"
+GNOMAD_V_4_0 = "4.0"
+
+GNOMAD_VERSIONS = {
+    GNOMAD_V_2_1,
+    GNOMAD_V_3_1_2,
+    GNOMAD_V_4_0,
+}
+
+FILENAMES = {  # %-style templates, filled in with "capture_type" and "chrom"
+    GNOMAD_V_2_1: "gnomad.%(capture_type)s.r2.1.1.sites.%(chrom)s.vcf.bgz",
+    GNOMAD_V_3_1_2: "gnomad.%(capture_type)s.v3.1.2.sites.chr%(chrom)s.vcf.bgz",
+    GNOMAD_V_4_0: "gnomad.%(capture_type)s.v4.0.sites.chr%(chrom)s.vcf.bgz",
+}
+
+
+GENOME_BUILDS = {"GRCh37", "GRCh38"}
+
+def get_infos_for_version(gnomad_version) -> Tuple[List[str], List[str], List[str], List[str]]:
+    """ Returns (info_fields, chr_x_male, popmax_fields, sub_pops) INFO names for a gnomAD version.
+        AF and popmax/grpmax are left out of info_fields as 'calculate_allele_frequency' recalculates them """
+
+    if gnomad_version == GNOMAD_V_4_0:
+        # v4 renamed nonpar -> non_par and added the filtering allele frequency (faf) fields
+        info_fields = ['AC', 'AN', "nhomalt", "faf95", "faf99", "fafmax_faf95_max", "fafmax_faf99_max", "non_par"]
+        # v4 uses XY rather than male for the chrX only fields
+        chr_x_male = ["AC_XY", "AN_XY", "AF_XY"]
+        # v4 uses grpmax rather than popmax
+        popmax_fields = ["AF_grpmax", "AC_grpmax", "AN_grpmax", "grpmax", "nhomalt_grpmax"]
+        # "oth" is now called "remaining", and "mid" (Middle Eastern) was added in v4
+        sub_pops = ["afr", "amr", "asj", "eas", "fin", "nfe", "sas", "remaining", "mid"]
+    else:
+        # Names used by every version other than v4
+        info_fields = ['AC', 'AN', "nhomalt", "nonpar"]
+        chr_x_male = ["AC_male", "AN_male", "AF_male"]
+        popmax_fields = ["AF_popmax", "AC_popmax", "AN_popmax", "popmax", "nhomalt_popmax"]
+        sub_pops = ["afr", "amr", "asj", "eas", "fin", "nfe", "oth", "sas"]  # Will get AF for each
+
+    fields_for_version = (info_fields, chr_x_male, popmax_fields, sub_pops)
+    return fields_for_version
 
 # popmax/grpmax is calculated using non-bottlenecked genetic ancestry groups
-BOTTLENECKED_SUB_POPS = ["asj", "fin", "mid", "remaining"]
+BOTTLENECKED_SUB_POPS = {"asj", "fin", "mid", "oth", "remaining"}
 
 
 def get_args():
+    available_builds = ", ".join(GENOME_BUILDS)
+    available_versions = ", ".join(GNOMAD_VERSIONS)
+
     parser = ArgumentParser(description="Merge exome+genome VCFs for VariantGrid VEP pipeline")
-    parser.add_argument("--test", action='store_true', help="Only download 5k of each file.")
-    # parser.add_argument("--genome-fasta", help='Fasta (correct for build)')
+    parser.add_argument("--test", action='store_true', help="Only do chrY (quick test)")
     parser.add_argument("--chrom-mapping-file", help='bcftools chromosome conversion')
-    parser.add_argument("--version", help=f'gnomAD version (default: {GNOMAD_VERSION})', default=GNOMAD_VERSION)
-    parser.add_argument("--path", help='Colon separated paths for tabix/bgzip/vt/bcftools')
+    parser.add_argument("--genome-build", help=f'GenomeBuild (one of {available_builds})')
+    parser.add_argument("--version", help=f'gnomAD version (one of {available_versions})')
+    parser.add_argument("--path", help='Optional Colon separated paths for tabix/bgzip/vt/bcftools')
     parser.add_argument("--gnomad-input-vcf")
     parser.add_argument("--af-output-vcf")
 
@@ -48,6 +85,9 @@ def get_args():
         if args.af_output_vcf is None:
             parser.error("--af-output-vcf required for --af")
 
+    if args.version not in GNOMAD_VERSIONS:
+        parser.error(f"Version must be one of: {available_versions}")
+
     return args
 
 
@@ -55,7 +95,7 @@ def main(args):
     if args.scripts:
         write_scripts(args)
     else:
-        calculate_allele_frequency(args.gnomad_input_vcf, args.af_output_vcf)
+        calculate_allele_frequency(args.version, args.gnomad_input_vcf, args.af_output_vcf)
 
 
 def write_scripts(args):
@@ -63,42 +103,50 @@ def write_scripts(args):
         raise ValueError("--chrom-mapping-file is required for write scripts step")
 
     if args.test:
-        CHROMOSOMES = ["Y"]  # Just do Y
+        chromosomes = ["Y"]  # Just do Y
     else:
-        CHROMOSOMES = list(map(str, range(1, 23))) + ['X', 'Y']
+        chromosomes = list(map(str, range(1, 23))) + ['X', 'Y']
+
+    info_fields, chr_x_male, popmax_fields, sub_pops = get_infos_for_version(args.version)
 
-    columns = get_columns()
+    columns = get_columns(info_fields, sub_pops)
     bash_header = "#!/bin/bash\nset -e # fail on error\n"
 
     if args.path:
         bash_header += "PATH=${PATH}:" + args.path + "\n"
 
+    filename_template = FILENAMES[args.version]
+
     chrom_scripts = []
     af_vcfs = []
-    for chrom in CHROMOSOMES:
-        prefix = f"gnomad{GNOMAD_VERSION}_{GRCh38}_chr{chrom}"
+    for chrom in chromosomes:
+        prefix = f"gnomad{args.version}_{args.genome_build}_chr{chrom}"
         chrom_script = f"{prefix}.sh"
         chrom_scripts.append(chrom_script)
-        with open(chrom_script, "w") as cs:
+        with (open(chrom_script, "w") as cs):
             cs.write(bash_header)
 
             output_vcfs = []
-            for vcf_type in ["exomes", "genomes"]:
+            for capture_type in ["exomes", "genomes"]:
                 # To remove all INFO tags except "FOO" and "BAR", use "^INFO/FOO,INFO/BAR"
                 # @see https://samtools.github.io/bcftools/bcftools.html#annotate """
                 my_columns = columns.copy()
                 if chrom == "X":
-                    my_columns.extend(CHR_X_ONLY)
+                    my_columns.extend(chr_x_male)
 
                 info_columns = [f"INFO/{i}" for i in my_columns]
                 keep_columns = ','.join(info_columns)  # AC/AN are special format fields
-                output_vcf = f"{prefix}_{vcf_type}.filtered_info.vcf.gz"
+                output_vcf = f"{prefix}_{capture_type}.filtered_info.vcf.gz"
                 annotate_args = f"--rename-chrs={args.chrom_mapping_file}"
 
-                gnomad_vcf_filename = f"gnomad.{vcf_type}.{GNOMAD_VERSION}_{GRCh38}.sites.chr{chrom}.vcf.bgz"
+                gnomad_vcf_filename = filename_template % {
+                    "capture_type": capture_type,
+                    "chrom": chrom,
+                }
 
-                # bcftools merge doesn't work with type='A'
                 # bcftools now works with AC/AN etc - see https://github.com/samtools/bcftools/issues/1394
+                # but make sure you are using v18
+                # bcftools merge doesn't work with type='A'
                 modify_fields2 = "sed -e 's/,Number=A,/,Number=1,/'"
                 # gnomAD appears to already be decomposed - vt decompose + -s -o +
                 # We no longer remove AC=0 as we want to keep AN (total counts) for pops for later AF calculations
@@ -115,7 +163,7 @@ def write_scripts(args):
 
                 # Merge exomes/genome VCFs
                 # if we leave out rule, will take from 1st file which is ok for PAR as will be the same
-                skip_columns = {"non_par"}
+                skip_columns = {"nonpar", "non_par"}
                 # Default rule = "sum" if not below (or skipped)
                 rule_ops = {
                     # Will take higher of whatever is there in genomes/exomes
@@ -145,17 +193,18 @@ def write_scripts(args):
             af_vcfs.append(allele_frequency_vcf)
 
     # Write merge script
-    merge_script_filename = f"gnomad4_merge.sh"
-    vcf_header = write_vcf_header()
+    merge_script_filename = f"gnomad{args.version}_merge.sh"
+    vcf_header = write_vcf_header(args.version, info_fields, popmax_fields, sub_pops)
 
     with open(merge_script_filename, "w") as ms:
         ms.write(bash_header)
         quoted_files = ' '.join([f"'{f}'" for f in af_vcfs])
-        gnomad_combined_af_vcf = f"gnomad{GNOMAD_VERSION}_{GRCh38}_combined_af.vcf.bgz"
-        ms.write(f"cat {vcf_header} {quoted_files} > {gnomad_combined_af_vcf}\n")
+        gnomad_combined_af_vcf = f"gnomad{args.version}_{args.genome_build}_combined_af.vcf.bgz"
+        # We produce gzipped files, but want bgzipped, so need to cat then bgzip
+        ms.write(f"zcat {vcf_header} {quoted_files} | bgzip > {gnomad_combined_af_vcf}\n")
         ms.write(f"tabix {gnomad_combined_af_vcf}\n")
 
-    launch_script_filename = f"gnomad4_launch.sh"
+    launch_script_filename = f"gnomad{args.version}_launch.sh"
     with open(launch_script_filename, "w") as ms:
         ms.write(bash_header)
         ms.write('SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}")\n')
@@ -167,50 +216,66 @@ def write_scripts(args):
         ms.write(f"${{SCRIPT_DIR}}/{merge_script_filename}\n")
 
 
-def get_columns():
-    columns = COUNTS + OTHER_INFOS
-    for g in GNOMAD_SUB_POPS:
+def get_columns(info_fields, sub_pops):
+    columns = info_fields.copy()
+    for g in sub_pops:
         for f in ["AC", "AN"]:
             columns.append(f"{f}_{g.lower()}")
     return columns
 
 
-def get_af_info():
+def get_af_info(sub_pops):
     af_info = [
         ("AF", None, "AC", "AN"),
     ]
-    for g in GNOMAD_SUB_POPS:
+    for g in sub_pops:
         af_info.append((f'AF_{g}', g, f'AC_{g}', f'AN_{g}'))
     return af_info
 
 
-def write_vcf_header():
+def write_vcf_header(version, info_fields, popmax_fields, sub_pops):
     """ Needs to be gzipped so can be concatenated with other gzipped files """
 
+    all_info = set(info_fields + popmax_fields + ["gnomad_filtered"])
+    field_headers = {  # INFO meta line for each field name; fields without an entry get no header
+        'AC': '##INFO=<ID=AC,Number=1,Type=Integer,Description="Alternate allele count (exomes + genomes)">',
+        'AN': '##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles  (exomes + genomes)">',
+        'AC_XY': '##INFO=<ID=AC_XY,Number=1,Type=Integer,Description="Alternate allele count for XY samples">',
+        'AF_XY': '##INFO=<ID=AF_XY,Number=1,Type=Float,Description="Alternate allele frequency in XY samples">',
+        'AN_XY': '##INFO=<ID=AN_XY,Number=1,Type=Integer,Description="Total number of alleles in XY samples">',
+        'faf95': '##INFO=<ID=faf95,Number=1,Type=Float,Description="Filtering allele frequency (using Poisson 95% CI) (max of exomes/genomes)">',
+        'faf99': '##INFO=<ID=faf99,Number=1,Type=Float,Description="Filtering allele frequency (using Poisson 99% CI) (max of exomes/genomes)">',
+        'fafmax_faf95_max': '##INFO=<ID=fafmax_faf95_max,Number=1,Type=Float,Description="Maximum filtering allele frequency (using Poisson 95% CI) across genetic_ancestry groups (max of exomes/genomes)">',
+        'fafmax_faf99_max': '##INFO=<ID=fafmax_faf99_max,Number=1,Type=Float,Description="Maximum filtering allele frequency (using Poisson 99% CI) across genetic_ancestry groups (max of exomes/genomes)">',
+        'AF_popmax': '##INFO=<ID=AF_popmax,Number=1,Type=Float,Description="Allele Frequency for highest population">',
+        'AC_popmax': '##INFO=<ID=AC_popmax,Number=1,Type=Integer,Description="Allele Count for highest population">',
+        'AN_popmax': '##INFO=<ID=AN_popmax,Number=1,Type=Integer,Description="Allele Number for highest population">',
+        'popmax': '##INFO=<ID=popmax,Number=1,Type=String,Description="Ancestral group with highest allele frequency (stored as AF_popmax)">',
+        'AF_grpmax': '##INFO=<ID=AF_grpmax,Number=1,Type=Float,Description="Allele Frequency for highest population">',
+        'AC_grpmax': '##INFO=<ID=AC_grpmax,Number=1,Type=Integer,Description="Allele Count for highest population">',
+        'AN_grpmax': '##INFO=<ID=AN_grpmax,Number=1,Type=Integer,Description="Allele Number for highest population">',
+        'grpmax': '##INFO=<ID=grpmax,Number=1,Type=String,Description="Ancestral group with highest allele frequency (stored as AF_grpmax)">',
+        'nhomalt': '##INFO=<ID=nhomalt,Number=1,Type=Integer,Description="Total number of homozygotest (exomes + genomes)">',
+        'gnomad_filtered': '##INFO=<ID=gnomad_filtered,Number=1,Type=Integer,Description="Exomes or genomes had a filter entry (potential QC issues)">',
+        'nonpar': '##INFO=<ID=nonpar,Number=0,Type=Flag,Description="Variant (on sex chromosome) falls outside a pseudoautosomal region">',
+        'non_par': '##INFO=<ID=non_par,Number=0,Type=Flag,Description="Variant (on sex chromosome) falls outside a pseudoautosomal region">',
+    }
+
+    # Sorted as set order varies between runs. Joined without a trailing newline as the
+    # template below adds one - a doubled newline would put a blank line in the VCF header
+    known_fields = sorted(f for f in all_info if f in field_headers)
+    info_headers = "\n".join(field_headers[f] for f in known_fields)
+
     now = datetime.now()
     file_date = "%d%02d%02d" % (now.year, now.month, now.day)
     source = __file__
     meta = """##fileformat=VCFv4.2
 ##fileDate=%(file_date)s
 ##source=%(source)s
-##INFO=<ID=AF_grpmax,Number=1,Type=Float,Description="Allele Frequency for highest population">
-##INFO=<ID=AC_grpmax,Number=1,Type=Integer,Description="Allele Count for highest population">
-##INFO=<ID=AN_grpmax,Number=1,Type=Integer,Description="Allele Number for highest population">
-##INFO=<ID=AC,Number=1,Type=Integer,Description="Alternate allele count (exomes + genomes)">
-##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles  (exomes + genomes)">
-##INFO=<ID=AC_XY,Number=1,Type=Integer,Description="Alternate allele count for XY samples">
-##INFO=<ID=AF_XY,Number=1,Type=Float,Description="Alternate allele frequency in XY samples">
-##INFO=<ID=AN_XY,Number=1,Type=Integer,Description="Total number of alleles in XY samples">
-##INFO=<ID=faf95,Number=1,Type=Float,Description="Filtering allele frequency (using Poisson 95%% CI) (max of exomes/genomes)">
-##INFO=<ID=faf99,Number=1,Type=Float,Description="Filtering allele frequency (using Poisson 99%% CI) (max of exomes/genomes)">
-##INFO=<ID=fafmax_faf95_max,Number=1,Type=Float,Description="Maximum filtering allele frequency (using Poisson 95%% CI) across genetic_ancestry groups (max of exomes/genomes)">
-##INFO=<ID=fafmax_faf99_max,Number=1,Type=Float,Description="Maximum filtering allele frequency (using Poisson 99%% CI) across genetic_ancestry groups (max of exomes/genomes)">
-##INFO=<ID=grpmax,Number=1,Type=String,Description="Ancestral group with highest allele frequency (stored as AF_grpmax)">
-##INFO=<ID=nhomalt,Number=1,Type=Integer,Description="Total number of homozygotest (exomes + genomes)">
-##INFO=<ID=gnomad_filtered,Number=1,Type=Integer,Description="Exomes or genomes had a filter entry (potential QC issues)">
-""" % {"file_date": file_date, "source": source}
-
-    af_info = get_af_info()
+%(info_headers)s
+""" % {"file_date": file_date, "source": source, "info_headers": info_headers}
+
+    af_info = get_af_info(sub_pops)
     for info_id, pop_name, ac_name, an_name in af_info:
         if pop_name:
             af_desc = f"for {pop_name}"
@@ -219,7 +284,7 @@ def write_vcf_header():
         af_desc += f" made from (exomes_{ac_name} + genomes_{ac_name}) / (exomes_{an_name} + genomes_{an_name})"
         meta += f'##INFO=<ID={info_id},Number=1,Type=Float,Description="Allele Frequency {af_desc}">\n'
 
-    vcf_header = "vcf_header.txt.gz"
+    vcf_header = f"gnomad_{version}_vcf_header.txt.gz"
     with gzip.open(vcf_header, "wt") as f:
         f.write(meta)
         header_cols = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
@@ -228,12 +293,13 @@ def write_vcf_header():
     return vcf_header
 
 
-def calculate_allele_frequency(gnomad_input_vcf, af_output_vcf):
+def calculate_allele_frequency(version, gnomad_input_vcf, af_output_vcf):
+    """ We have to re-calculate POPMAX as we can't merge it """
+
     from cyvcf2 import VCF  # Import here, so that rest of script can run on HPC easier
 
-    # We have to re-calculate POPMAX as we can't merge it
-    af_info = get_af_info()
-    info_names = [ai[0] for ai in af_info] + COUNTS + OTHER_INFOS + ["AF_grpmax", "AC_grpmax", "AN_grpmax", "grpmax", "gnomad_filtered"]
+    info_fields, _, popmax_fields, sub_pops = get_infos_for_version(version)
+    af_info = get_af_info(sub_pops)
 
     with gzip.open(af_output_vcf, "wt") as f:
         for variant in VCF(gnomad_input_vcf):
@@ -247,7 +313,7 @@ def calculate_allele_frequency(gnomad_input_vcf, af_output_vcf):
             ac_popmax = 0
             an_popmax = 0
             popmax = '.'
-            infos = []
+            infos = {}
             for _, pop_name, ac_name, an_name in af_info:
                 ac = variant.INFO.get(ac_name, 0)
                 an = variant.INFO.get(an_name)
@@ -262,13 +328,23 @@ def calculate_allele_frequency(gnomad_input_vcf, af_output_vcf):
                     af = f'{af:.6f}'
                 else:
                     af = '.'
-                infos.append(af)
+                infos[ac_name.replace("AC", "AF", 1)] = af  # AF / AF_<pop> - a fixed "AF" key was overwritten by each pop
 
-            for o in COUNTS + OTHER_INFOS:
-                infos.append(str(variant.INFO.get(o, '.')))
+            for o in info_fields:
+                infos[o] = str(variant.INFO.get(o, '.'))
             gnomad_filtered = '0' if variant.FILTER is None else '1'
-            infos.extend([str(af_popmax), str(ac_popmax), str(an_popmax), popmax, gnomad_filtered])
-            info_str = ";".join([i + "=" + v for i, v in zip(info_names, infos)])
+            infos["gnomad_filtered"] = gnomad_filtered
+
+            for p in popmax_fields: # can be popmax or grpmax
+                if p.startswith("AF_"):
+                    infos[p] = str(af_popmax)
+                elif p.startswith("AC_"):
+                    infos[p] = str(ac_popmax)
+                elif p.startswith(("AN_")):
+                    infos[p] = str(an_popmax)
+                elif p in {"popmax", "grpmax"}:
+                    infos[p] = popmax
+            info_str = ";".join([f"{k}={v}" for k, v in infos.items()])
             columns = [chrom, pos, variant_id, ref, alt, '.', '.', info_str]
             f.write("\t".join(columns) + "\n")
 

From 3abf2d6701cbddb11c78a26af415cc5e4a95bb73 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Wed, 29 Nov 2023 18:13:27 +1030
Subject: [PATCH 06/29] #850 - new VEP 110 fields

---
 .../0024_new_vep_annotation_gnomad3.py        |  83 +++++------
 ...variantannotation_gnomad_faf95_and_more.py |  83 +++++++++++
 .../migrations/0082_new_vep_110_columns_v3.py | 134 ++++++++++++++++++
 annotation/models/models.py                   |  27 ++--
 annotation/models/models_enums.py             |   1 +
 annotation/vep_annotation.py                  |   3 +-
 ...lemergelog_allele_linking_tool_and_more.py |  28 ++++
 .../migrations/0107_new_vep_110_columns_v3.py | 120 ++++++++++++++++
 snpdb/models/models_enums.py                  |   4 +-
 .../settings/components/default_settings.py   |   1 +
 10 files changed, 431 insertions(+), 53 deletions(-)
 create mode 100644 annotation/migrations/0081_rename_faf95_variantannotation_gnomad_faf95_and_more.py
 create mode 100644 annotation/migrations/0082_new_vep_110_columns_v3.py
 create mode 100644 snpdb/migrations/0106_alter_allelemergelog_allele_linking_tool_and_more.py
 create mode 100644 snpdb/migrations/0107_new_vep_110_columns_v3.py

diff --git a/annotation/migrations/0024_new_vep_annotation_gnomad3.py b/annotation/migrations/0024_new_vep_annotation_gnomad3.py
index 39a1dc859..740ad1a80 100644
--- a/annotation/migrations/0024_new_vep_annotation_gnomad3.py
+++ b/annotation/migrations/0024_new_vep_annotation_gnomad3.py
@@ -7,6 +7,9 @@
 
 def _new_vep_annotation_gnomad3(apps, _schema_editor):
     # Separate out gnomAD 2 vs 3
+    GNOMAD_2 = 'g'
+    GNOMAD_3 = 'n'
+
     # Make everything from gnomAD2 GRCh37 specific EXCEPT gnomadAF (still want that one)
     ColumnVEPField = apps.get_model("annotation", "ColumnVEPField")
     GenomeBuild = apps.get_model("snpdb", "GenomeBuild")
@@ -15,7 +18,7 @@ def _new_vep_annotation_gnomad3(apps, _schema_editor):
     grch38 = GenomeBuild.objects.get(pk="GRCh38")
 
     # All existing gnomAD are now GRCh37 only (will insert new legacy one below)
-    ColumnVEPField.objects.filter(vep_custom='g').update(genome_build=grch37)
+    ColumnVEPField.objects.filter(vep_custom=GNOMAD_2).update(genome_build=grch37)
 
     # Might as well hide these now as we can - GRCh37 has 46, GRCh37 has 30
     ColumnVEPField.objects.filter(column__in=["phylop_46_way_mammalian", "phastcons_46_way_mammalian"]).update(genome_build=grch37)
@@ -23,48 +26,48 @@ def _new_vep_annotation_gnomad3(apps, _schema_editor):
 
     COLUMN_VEP_FIELD = [
         # Legacy
-        {'column': 'gnomad2_liftover_af', 'vep_plugin': None, 'source_field_has_custom_prefix': True,
-         'vep_custom': 'g', 'variant_grid_column_id': 'gnomad2_liftover_af', 'source_field': 'AF', 'category': 'F'},
+        {'column': 'gnomad2_liftover_af', 'source_field_has_custom_prefix': True,
+         'vep_custom': GNOMAD_2, 'variant_grid_column_id': 'gnomad2_liftover_af', 'source_field': 'AF', 'category': 'F'},
         # gnomAD 3
-        {'column': 'gnomad3_ac', 'vep_plugin': None, 'source_field_has_custom_prefix': True,
-         'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_ac', 'source_field': 'AC', 'category': 'F'},
-        {'column': 'gnomad3_af', 'vep_plugin': None, 'source_field_has_custom_prefix': True,
-         'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_af', 'source_field': 'AF', 'category': 'F'},
-        {'column': 'gnomad3_an', 'vep_plugin': None, 'source_field_has_custom_prefix': True,
-         'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_an', 'source_field': 'AN', 'category': 'F'},
-        {'column': 'gnomad3_afr_af', 'vep_plugin': None, 'source_field_has_custom_prefix': True,
-         'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_afr_af', 'source_field': 'AF-afr', 'category': 'F'},
-        {'column': 'gnomad3_amr_af', 'vep_plugin': None, 'source_field_has_custom_prefix': True,
-         'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_amr_af', 'source_field': 'AF-amr', 'category': 'F'},
-        {'column': 'gnomad3_asj_af', 'vep_plugin': None, 'source_field_has_custom_prefix': True,
-         'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_asj_af', 'source_field': 'AF-asj', 'category': 'F'},
-        {'column': 'gnomad3_eas_af', 'vep_plugin': None, 'source_field_has_custom_prefix': True,
-         'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_eas_af', 'source_field': 'AF-eas', 'category': 'F'},
-        {'column': 'gnomad3_filtered', 'vep_plugin': None, 'source_field_has_custom_prefix': True,
-         'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_filtered', 'source_field': 'FILTER', 'category': 'F'},
-        {'column': 'gnomad3_fin_af', 'vep_plugin': None, 'source_field_has_custom_prefix': True,
-         'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_fin_af', 'source_field': 'AF-fin', 'category': 'F'},
-        {'column': 'gnomad3_hom_alt', 'vep_plugin': None, 'source_field_has_custom_prefix': True,
-         'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_hom_alt', 'source_field': 'nhomalt', 'category': 'F'},
-        {'column': 'gnomad3_nfe_af', 'vep_plugin': None, 'source_field_has_custom_prefix': True,
-         'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_nfe_af', 'source_field': 'AF-nfe', 'category': 'F'},
-        {'column': 'gnomad3_oth_af', 'vep_plugin': None, 'source_field_has_custom_prefix': True,
-         'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_oth_af', 'source_field': 'AF-oth', 'category': 'F'},
-        {'column': 'gnomad3_popmax', 'vep_plugin': None, 'source_field_has_custom_prefix': True,
-         'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_popmax', 'source_field': 'popmax', 'category': 'F'},
-        {'column': 'gnomad3_popmax_ac', 'vep_plugin': None, 'source_field_has_custom_prefix': True,
-         'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_popmax_ac', 'source_field': 'AC_popmax', 'category': 'F'},
-        {'column': 'gnomad3_popmax_af', 'vep_plugin': None, 'source_field_has_custom_prefix': True,
-         'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_popmax_af', 'source_field': 'AF_popmax', 'category': 'F'},
-        {'column': 'gnomad3_popmax_an', 'vep_plugin': None, 'source_field_has_custom_prefix': True,
-         'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_popmax_an', 'source_field': 'AN_popmax', 'category': 'F'},
-        {'column': 'gnomad3_popmax_hom_alt', 'vep_plugin': None, 'source_field_has_custom_prefix': True,
-         'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_popmax_hom_alt', 'source_field': 'nhomalt_popmax', 'category': 'F'},
-        {'column': 'gnomad3_sas_af', 'vep_plugin': None, 'source_field_has_custom_prefix': True,
-         'vep_custom': 'n', 'variant_grid_column_id': 'gnomad_sas_af', 'source_field': 'AF-sas', 'category': 'F'},
+        {'column': 'gnomad3_ac', 'source_field_has_custom_prefix': True,
+         'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_ac', 'source_field': 'AC', 'category': 'F'},
+        {'column': 'gnomad3_af', 'source_field_has_custom_prefix': True,
+         'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_af', 'source_field': 'AF', 'category': 'F'},
+        {'column': 'gnomad3_an', 'source_field_has_custom_prefix': True,
+         'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_an', 'source_field': 'AN', 'category': 'F'},
+        {'column': 'gnomad3_afr_af', 'source_field_has_custom_prefix': True,
+         'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_afr_af', 'source_field': 'AF-afr', 'category': 'F'},
+        {'column': 'gnomad3_amr_af', 'source_field_has_custom_prefix': True,
+         'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_amr_af', 'source_field': 'AF-amr', 'category': 'F'},
+        {'column': 'gnomad3_asj_af', 'source_field_has_custom_prefix': True,
+         'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_asj_af', 'source_field': 'AF-asj', 'category': 'F'},
+        {'column': 'gnomad3_eas_af', 'source_field_has_custom_prefix': True,
+         'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_eas_af', 'source_field': 'AF-eas', 'category': 'F'},
+        {'column': 'gnomad3_filtered', 'source_field_has_custom_prefix': True,
+         'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_filtered', 'source_field': 'FILTER', 'category': 'F'},
+        {'column': 'gnomad3_fin_af', 'source_field_has_custom_prefix': True,
+         'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_fin_af', 'source_field': 'AF-fin', 'category': 'F'},
+        {'column': 'gnomad3_hom_alt', 'source_field_has_custom_prefix': True,
+         'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_hom_alt', 'source_field': 'nhomalt', 'category': 'F'},
+        {'column': 'gnomad3_nfe_af', 'source_field_has_custom_prefix': True,
+         'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_nfe_af', 'source_field': 'AF-nfe', 'category': 'F'},
+        {'column': 'gnomad3_oth_af', 'source_field_has_custom_prefix': True,
+         'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_oth_af', 'source_field': 'AF-oth', 'category': 'F'},
+        {'column': 'gnomad3_popmax', 'source_field_has_custom_prefix': True,
+         'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_popmax', 'source_field': 'popmax', 'category': 'F'},
+        {'column': 'gnomad3_popmax_ac', 'source_field_has_custom_prefix': True,
+         'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_popmax_ac', 'source_field': 'AC_popmax', 'category': 'F'},
+        {'column': 'gnomad3_popmax_af', 'source_field_has_custom_prefix': True,
+         'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_popmax_af', 'source_field': 'AF_popmax', 'category': 'F'},
+        {'column': 'gnomad3_popmax_an', 'source_field_has_custom_prefix': True,
+         'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_popmax_an', 'source_field': 'AN_popmax', 'category': 'F'},
+        {'column': 'gnomad3_popmax_hom_alt', 'source_field_has_custom_prefix': True,
+         'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_popmax_hom_alt', 'source_field': 'nhomalt_popmax', 'category': 'F'},
+        {'column': 'gnomad3_sas_af', 'source_field_has_custom_prefix': True,
+         'vep_custom': GNOMAD_3, 'variant_grid_column_id': 'gnomad_sas_af', 'source_field': 'AF-sas', 'category': 'F'},
     ]
     bulk_insert_class_data(apps, "annotation", [("ColumnVEPField", COLUMN_VEP_FIELD)])
-    ColumnVEPField.objects.filter(vep_custom='n').update(genome_build=grch38)
+    ColumnVEPField.objects.filter(vep_custom=GNOMAD_3).update(genome_build=grch38)
 
 
 class Migration(migrations.Migration):
diff --git a/annotation/migrations/0081_rename_faf95_variantannotation_gnomad_faf95_and_more.py b/annotation/migrations/0081_rename_faf95_variantannotation_gnomad_faf95_and_more.py
new file mode 100644
index 000000000..d66e2bedb
--- /dev/null
+++ b/annotation/migrations/0081_rename_faf95_variantannotation_gnomad_faf95_and_more.py
@@ -0,0 +1,83 @@
+# Generated by Django 4.2.2 on 2023-11-29 07:20
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Rename the VariantAnnotation FAF fields with a gnomad_ prefix, add gnomAD XY/hemi/non-PAR fields, add AlphaMissense/MaveDB fields to VariantTranscriptAnnotation, and update ColumnVEPField.category choices."""
+    dependencies = [
+        ('annotation', '0080_columnvepfield_pipeline_type_variantannotation_faf95_and_more'),
+    ]
+
+    operations = [
+        migrations.RenameField(
+            model_name='variantannotation',
+            old_name='faf95',
+            new_name='gnomad_faf95',
+        ),
+        migrations.RenameField(
+            model_name='variantannotation',
+            old_name='faf99',
+            new_name='gnomad_faf99',
+        ),
+        migrations.RenameField(
+            model_name='variantannotation',
+            old_name='fafmax_faf95_max',
+            new_name='gnomad_fafmax_faf95_max',
+        ),
+        migrations.RenameField(
+            model_name='variantannotation',
+            old_name='fafmax_faf99_max',
+            new_name='gnomad_fafmax_faf99_max',
+        ),
+        migrations.AddField(
+            model_name='variantannotation',
+            name='gnomad_hemi_count',
+            field=models.IntegerField(blank=True, null=True),
+        ),
+        migrations.AddField(
+            model_name='variantannotation',
+            name='gnomad_non_par',
+            field=models.BooleanField(blank=True, null=True),
+        ),
+        migrations.AddField(
+            model_name='variantannotation',
+            name='gnomad_xy_ac',
+            field=models.IntegerField(blank=True, null=True),
+        ),
+        migrations.AddField(
+            model_name='variantannotation',
+            name='gnomad_xy_af',
+            field=models.FloatField(blank=True, null=True),
+        ),
+        migrations.AddField(
+            model_name='variantannotation',
+            name='gnomad_xy_an',
+            field=models.IntegerField(blank=True, null=True),
+        ),
+        migrations.AddField(
+            model_name='varianttranscriptannotation',
+            name='alphamissense_class',
+            field=models.CharField(blank=True, choices=[('b', 'likely_benign'), ('a', 'ambiguous'), ('p', 'likely_pathogenic')], max_length=1, null=True),
+        ),
+        migrations.AddField(
+            model_name='varianttranscriptannotation',
+            name='alphamissense_pathogenicity',
+            field=models.FloatField(blank=True, null=True),
+        ),
+        migrations.AddField(
+            model_name='varianttranscriptannotation',
+            name='mavedb_score',
+            field=models.FloatField(blank=True, null=True),
+        ),
+        migrations.AddField(
+            model_name='varianttranscriptannotation',
+            name='mavedb_urn',
+            field=models.TextField(blank=True, null=True),
+        ),
+        migrations.AlterField(
+            model_name='columnvepfield',
+            name='category',
+            field=models.CharField(choices=[('C', 'Conservation'), ('E', 'External ID'), ('F', 'Frequency Data'), ('f', 'Functional Effect'), ('G', 'Gene Annotations'), ('H', 'HGVS'), ('L', 'Literature'), ('N', 'Nearby Features'), ('P', 'Pathogenicity Predictions'), ('Y', 'Phenotype'), ('D', 'Protein Domains'), ('Q', 'Sequence'), ('S', 'Splicing Predictions'), ('V', 'Variant Data')], max_length=1),
+        ),
+    ]
diff --git a/annotation/migrations/0082_new_vep_110_columns_v3.py b/annotation/migrations/0082_new_vep_110_columns_v3.py
new file mode 100644
index 000000000..c9b259034
--- /dev/null
+++ b/annotation/migrations/0082_new_vep_110_columns_v3.py
@@ -0,0 +1,134 @@
+# Generated by Django 4.2.2 on 2023-11-29 06:00
+
+from django.db import migrations
+
+from library.django_utils import bulk_insert_class_data
+
+
+def _new_vep_110_annotation(apps, _schema_editor):
+    """Cap gnomAD3 ColumnVEPFields at columns version 2, then insert gnomAD4, MaveDB and AlphaMissense fields (columns version 3+)."""
+    VEP_CUSTOM_GNOMAD_3 = 'n'
+    VEP_CUSTOM_GNOMAD_4 = 'o'
+
+    VEP_PLUGIN_MAVEDB = 'V'
+    VEP_PLUGIN_ALPHAMISSENSE = 'A'
+
+    FREQUENCY_DATA = 'F'
+    FUNCTIONAL_EFFECT = 'f'
+    PATHOGENICITY_PREDICTIONS = 'P'
+
+    ColumnVEPField = apps.get_model("annotation", "ColumnVEPField")
+    GenomeBuild = apps.get_model("snpdb", "GenomeBuild")
+
+    grch37 = GenomeBuild.objects.get(pk="GRCh37")
+    grch38 = GenomeBuild.objects.get(pk="GRCh38")
+
+    # Make existing gnomAD3 have max column version of 2
+    ColumnVEPField.objects.filter(vep_custom=VEP_CUSTOM_GNOMAD_3).update(max_vep_columns_version=2)
+
+    raise ValueError("Still need to add new fields from gnomad2 stuff")
+
+    # NOTE: work in progress - nothing below runs until the guard above is removed
+    COLUMN_VEP_FIELD = [
+        # gnomAD 4
+        {'column': 'gnomad4_ac', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_ac', 'source_field': 'AC', 'category': FREQUENCY_DATA},
+        {'column': 'gnomad4_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_af', 'source_field': 'AF', 'category': FREQUENCY_DATA},
+        {'column': 'gnomad4_an', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_an', 'source_field': 'AN', 'category': FREQUENCY_DATA},
+        {'column': 'gnomad4_afr_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_afr_af', 'source_field': 'AF_afr', 'category': FREQUENCY_DATA},
+        {'column': 'gnomad4_amr_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_amr_af', 'source_field': 'AF_amr', 'category': FREQUENCY_DATA},
+        {'column': 'gnomad4_asj_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_asj_af', 'source_field': 'AF_asj', 'category': FREQUENCY_DATA},
+        {'column': 'gnomad4_eas_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_eas_af', 'source_field': 'AF_eas', 'category': FREQUENCY_DATA},
+        {'column': 'gnomad4_fin_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_fin_af', 'source_field': 'AF_fin', 'category': FREQUENCY_DATA},
+        {'column': 'gnomad4_mid_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_mid_af', 'source_field': 'AF_mid', 'category': FREQUENCY_DATA},
+        {'column': 'gnomad4_nfe_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_nfe_af', 'source_field': 'AF_nfe', 'category': FREQUENCY_DATA},
+        {'column': 'gnomad4_oth_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_oth_af', 'source_field': 'AF_remaining', 'category': FREQUENCY_DATA},
+        {'column': 'gnomad4_sas_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_sas_af', 'source_field': 'AF_sas', 'category': FREQUENCY_DATA},
+
+        {'column': 'gnomad4_filtered', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_filtered', 'source_field': 'gnomad_filtered', 'category': FREQUENCY_DATA},
+        {'column': 'gnomad4_hom_alt', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_hom_alt', 'source_field': 'nhomalt', 'category': FREQUENCY_DATA},
+        {'column': 'gnomad4_popmax', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax', 'source_field': 'grpmax', 'category': FREQUENCY_DATA},
+        {'column': 'gnomad4_popmax_ac', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_ac', 'source_field': 'AC_grpmax', 'category': FREQUENCY_DATA},
+        {'column': 'gnomad4_popmax_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_af', 'source_field': 'AF_grpmax', 'category': FREQUENCY_DATA},
+        {'column': 'gnomad4_popmax_an', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_an', 'source_field': 'AN_grpmax', 'category': FREQUENCY_DATA},
+
+        {'column': 'gnomad4_xy_ac', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_xy_ac', 'source_field': 'AC_XY',
+         'category': FREQUENCY_DATA},
+        {'column': 'gnomad4_xy_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_xy_af', 'source_field': 'AF_XY',
+         'category': FREQUENCY_DATA},
+        {'column': 'gnomad4_xy_an', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_xy_an', 'source_field': 'AN_XY',
+         'category': FREQUENCY_DATA},
+
+        {'column': 'gnomad4_faf95', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_faf95', 'source_field': 'faf95',
+         'category': FREQUENCY_DATA},
+        {'column': 'gnomad4_faf99', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_faf99', 'source_field': 'faf99',
+         'category': FREQUENCY_DATA},
+        {'column': 'gnomad4_fafmax_faf95_max', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_fafmax_faf95_max', 'source_field': 'fafmax_faf95_max',
+         'category': FREQUENCY_DATA},
+        {'column': 'gnomad4_fafmax_faf99_max', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_fafmax_faf99_max', 'source_field': 'fafmax_faf99_max',
+         'category': FREQUENCY_DATA},
+
+        # Deliberately left out (popmax hom-alt count) - don't think it really matters
+#        {'column': 'gnomad4_popmax_hom_alt', 'source_field_has_custom_prefix': True,
+#         'vep_custom': GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_hom_alt', 'source_field': 'nhomalt_grpmax', 'category': FREQUENCY_DATA},
+
+        # MAVE
+        {'column': 'mavedb_score', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_plugin': VEP_PLUGIN_MAVEDB, 'variant_grid_column_id': 'mavedb_score',
+         'source_field': 'score',
+         'category': FUNCTIONAL_EFFECT},
+        {'column': 'mavedb_urn', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'vep_plugin': VEP_PLUGIN_MAVEDB, 'variant_grid_column_id': 'mavedb_urn',
+         'source_field': 'urn',
+         'category': FUNCTIONAL_EFFECT},
+
+        # AlphaMissense
+        {'column': 'alphamissense_class', 'min_vep_columns_version': 3,
+         'vep_plugin': VEP_PLUGIN_ALPHAMISSENSE, 'variant_grid_column_id': 'alphamissense_class',
+         'source_field': 'am_class',
+         'category': PATHOGENICITY_PREDICTIONS},
+        {'column': 'alphamissense_pathogenicity', 'min_vep_columns_version': 3,
+         'vep_plugin': VEP_PLUGIN_ALPHAMISSENSE, 'variant_grid_column_id': 'alphamissense_pathogenicity',
+         'source_field': 'am_pathogenicity',
+         'category': PATHOGENICITY_PREDICTIONS},
+    ]
+    bulk_insert_class_data(apps, "annotation", [("ColumnVEPField", COLUMN_VEP_FIELD)])
+    ColumnVEPField.objects.filter(vep_custom=VEP_CUSTOM_GNOMAD_4).update(genome_build=grch38)
+
+
+
+
+class Migration(migrations.Migration):
+    """Data migration: runs _new_vep_110_annotation (forward only - no reverse function is supplied)."""
+    dependencies = [
+        ('annotation', '0081_rename_faf95_variantannotation_gnomad_faf95_and_more'),
+        ("snpdb", "0107_new_vep_110_columns_v3"),  # Defines new columns
+    ]
+
+    operations = [
+        migrations.RunPython(_new_vep_110_annotation)
+    ]
diff --git a/annotation/models/models.py b/annotation/models/models.py
index 0287a467f..c9a308e3e 100644
--- a/annotation/models/models.py
+++ b/annotation/models/models.py
@@ -835,6 +835,12 @@ class AbstractVariantAnnotation(models.Model):
     splice_region = models.TextField(null=True, blank=True)
     symbol = models.TextField(null=True, blank=True)
 
+    alphamissense_class  = models.CharField(max_length=1, choices=AlphaMissensePrediction.choices, null=True, blank=True)
+    alphamissense_pathogenicity = models.FloatField(null=True, blank=True)
+
+    mavedb_score = models.FloatField(null=True, blank=True)
+    mavedb_urn = models.TextField(null=True, blank=True)
+
     class Meta:
         abstract = True
 
@@ -888,6 +894,7 @@ class VariantAnnotation(AbstractVariantAnnotation):
     # Population frequency
     af_1kg = models.FloatField(null=True, blank=True)
     af_uk10k = models.FloatField(null=True, blank=True)
+    topmed_af = models.FloatField(null=True, blank=True)
     gnomad_af = models.FloatField(null=True, blank=True)
     gnomad2_liftover_af = models.FloatField(null=True, blank=True)
     gnomad_ac = models.IntegerField(null=True, blank=True)
@@ -903,16 +910,20 @@ class VariantAnnotation(AbstractVariantAnnotation):
     gnomad_oth_af = models.FloatField(null=True, blank=True)
     gnomad_sas_af = models.FloatField(null=True, blank=True)
     # filtering allele frequencies (new in gnomADv4)
-    faf95 = models.FloatField(null=True, blank=True)
-    faf99 = models.FloatField(null=True, blank=True)
-    fafmax_faf95_max = models.FloatField(null=True, blank=True)
-    fafmax_faf99_max = models.FloatField(null=True, blank=True)
+    gnomad_faf95 = models.FloatField(null=True, blank=True)
+    gnomad_faf99 = models.FloatField(null=True, blank=True)
+    gnomad_fafmax_faf95_max = models.FloatField(null=True, blank=True)
+    gnomad_fafmax_faf99_max = models.FloatField(null=True, blank=True)
+    gnomad_xy_af = models.FloatField(null=True, blank=True)
+    gnomad_xy_ac = models.IntegerField(null=True, blank=True)
+    gnomad_xy_an = models.IntegerField(null=True, blank=True)
+    gnomad_hemi_count = models.IntegerField(null=True, blank=True)  # This is set from gnomad_xy_ac if gnomad_non_par
     gnomad_popmax_af = models.FloatField(null=True, blank=True)
     gnomad_popmax_ac = models.IntegerField(null=True, blank=True)
     gnomad_popmax_an = models.IntegerField(null=True, blank=True)
     gnomad_popmax_hom_alt = models.IntegerField(null=True, blank=True)
-    topmed_af = models.FloatField(null=True, blank=True)
     gnomad_filtered = models.BooleanField(null=True, blank=True)
+    gnomad_non_par = models.BooleanField(null=True, blank=True)  # Not pseudoautosomal regions
     gnomad_popmax = models.CharField(max_length=3, choices=GnomADPopulation.choices, null=True, blank=True)
 
     # From https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4267638/
@@ -966,12 +977,6 @@ class VariantAnnotation(AbstractVariantAnnotation):
     spliceai_pred_ds_dl = models.FloatField(null=True, blank=True)
     spliceai_gene_symbol = models.TextField(null=True, blank=True)
 
-    alphamissense_class  = models.CharField(max_length=1, choices=AlphaMissensePrediction.choices, null=True, blank=True)
-    alphamissense_pathogenicity = models.FloatField(null=True, blank=True)
-
-    mavedb_score = models.FloatField(null=True, blank=True)
-    mavedb_urn = models.TextField(null=True, blank=True)
-
     repeat_masker = models.TextField(null=True, blank=True)
     overlapping_symbols = models.TextField(null=True, blank=True)
     # Summary of most_damaging fields for faster DamageNode queries
diff --git a/annotation/models/models_enums.py b/annotation/models/models_enums.py
index d476c08ab..bda376d97 100644
--- a/annotation/models/models_enums.py
+++ b/annotation/models/models_enums.py
@@ -111,6 +111,7 @@ class ColumnAnnotationCategory(models.TextChoices):
     CONSERVATION = 'C', "Conservation"
     EXTERNAL_ID = 'E', "External ID"
     FREQUENCY_DATA = 'F', "Frequency Data"
+    FUNCTIONAL_EFFECT = 'f', "Functional Effect"
     GENE_ANNOTATIONS = 'G', 'Gene Annotations'
     HGVS = 'H', "HGVS"
     LITERATURE = 'L', 'Literature'
diff --git a/annotation/vep_annotation.py b/annotation/vep_annotation.py
index fd80fbd02..ee43f94b1 100644
--- a/annotation/vep_annotation.py
+++ b/annotation/vep_annotation.py
@@ -281,7 +281,8 @@ def vep_int_version(vep_string_version):
         kwargs["dbnsfp"] = 'n/a'
 
     # we use our own gnomAD custom annotation, not the default VEP one
-    if cvf := ColumnVEPField.objects.filter(variant_grid_column='gnomad_af', genome_build=genome_build).first():
+    q_cvf = ColumnVEPField.get_columns_version_q(vep_config.columns_version)
+    if cvf := ColumnVEPField.objects.filter(q_cvf, variant_grid_column='gnomad_af', genome_build=genome_build).first():
         try:
             # annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz
             # gnomad3.1_GRCh38_merged.vcf.bgz
diff --git a/snpdb/migrations/0106_alter_allelemergelog_allele_linking_tool_and_more.py b/snpdb/migrations/0106_alter_allelemergelog_allele_linking_tool_and_more.py
new file mode 100644
index 000000000..7a6e3f756
--- /dev/null
+++ b/snpdb/migrations/0106_alter_allelemergelog_allele_linking_tool_and_more.py
@@ -0,0 +1,28 @@
+# Generated by Django 4.2.2 on 2023-11-29 05:58
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Set the conversion-tool choices (now listing Picard LiftoverVCF and CrossMap) on AlleleMergeLog, Liftover and VariantAllele."""
+    dependencies = [
+        ('snpdb', '0105_alter_settingsoverride_default_genome_build_and_more'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='allelemergelog',
+            name='allele_linking_tool',
+            field=models.CharField(choices=[('SC', 'Identical Contig/Version'), ('CA', 'ClinGen Allele Registry'), ('DB', 'dbSNP API'), ('NR', 'NCBI Remap'), ('PC', 'Picard LiftoverVCF'), ('CM', 'CrossMap')], max_length=2),
+        ),
+        migrations.AlterField(
+            model_name='liftover',
+            name='conversion_tool',
+            field=models.CharField(choices=[('SC', 'Identical Contig/Version'), ('CA', 'ClinGen Allele Registry'), ('DB', 'dbSNP API'), ('NR', 'NCBI Remap'), ('PC', 'Picard LiftoverVCF'), ('CM', 'CrossMap')], max_length=2),
+        ),
+        migrations.AlterField(
+            model_name='variantallele',
+            name='allele_linking_tool',
+            field=models.CharField(choices=[('SC', 'Identical Contig/Version'), ('CA', 'ClinGen Allele Registry'), ('DB', 'dbSNP API'), ('NR', 'NCBI Remap'), ('PC', 'Picard LiftoverVCF'), ('CM', 'CrossMap')], max_length=2),
+        ),
+    ]
diff --git a/snpdb/migrations/0107_new_vep_110_columns_v3.py b/snpdb/migrations/0107_new_vep_110_columns_v3.py
new file mode 100644
index 000000000..d0d003f71
--- /dev/null
+++ b/snpdb/migrations/0107_new_vep_110_columns_v3.py
@@ -0,0 +1,120 @@
+# Generated by Django 4.2.2 on 2023-11-29 06:39
+
+from django.db import migrations
+
+from library.django_utils import bulk_insert_class_data
+
+
+
+def _new_vep_110_annotation(apps, _schema_editor):
+    """Insert the VariantGridColumn and ColumnVCFInfo rows for the new VEP 110 columns (work in progress, see guard below)."""
+    TRANSCRIPT_LEVEL = 'T'
+    VARIANT_LEVEL = 'V'
+
+    raise ValueError("This is not complete yet!")
+
+    NEW_VARIANT_GRID_COLUMNS = [
+        {'grid_column_name': 'gnomad_mid_af',
+         'variant_column': 'variantannotation__gnomad_mid_af',
+         'annotation_level': VARIANT_LEVEL,
+         'width': None,
+         'label': 'gnomAD MID AF',
+         'description': "Allele Frequency (0-1) among Middle Eastern genotypes (exome+genome) (GRCh38/gnomad4 only)",
+         'model_field': True,
+         'queryset_field': True},
+        {'grid_column_name': 'gnomad_faf95',
+         'variant_column': 'variantannotation__gnomad_faf95',
+         'annotation_level': VARIANT_LEVEL,
+         'width': None,
+         'label': 'gnomAD FAF95',
+         'description': "Filtering allele frequency (using Poisson 95% CI) (max of exomes/genomes) (GRCh38/gnomad4 only)",
+         'model_field': True,
+         'queryset_field': True},
+        {'grid_column_name': 'gnomad_faf99',
+         'variant_column': 'variantannotation__gnomad_faf99',
+         'annotation_level': VARIANT_LEVEL,
+         'width': None,
+         'label': 'gnomAD FAF99',
+         'description': "Filtering allele frequency (using Poisson 99% CI) (max of exomes/genomes) (GRCh38/gnomad4 only)",
+         'model_field': True,
+         'queryset_field': True},
+        {'grid_column_name': 'gnomad_fafmax_faf95_max',
+         'variant_column': 'variantannotation__gnomad_fafmax_faf95_max',
+         'annotation_level': VARIANT_LEVEL,
+         'width': None,
+         'label': 'gnomAD FAF95 Max',
+         'description': "Maximum filtering allele frequency (using Poisson 95% CI) across genetic_ancestry groups (max of exomes/genomes) (GRCh38/gnomad4 only)",
+         'model_field': True,
+         'queryset_field': True},
+        {'grid_column_name': 'gnomad_fafmax_faf99_max',
+         'variant_column': 'variantannotation__gnomad_fafmax_faf99_max',
+         'annotation_level': VARIANT_LEVEL,
+         'width': None,
+         'label': 'gnomAD FAF99 Max',
+         'description': "Maximum filtering allele frequency (using Poisson 99% CI) across genetic_ancestry groups (max of exomes/genomes) (GRCh38/gnomad4 only)",
+         'model_field': True,
+         'queryset_field': True},
+
+        {'grid_column_name': 'gnomad_hemi_count',
+         'variant_column': 'variantannotation__gnomad_hemi_count',
+         'annotation_level': VARIANT_LEVEL,
+         'width': None,
+         'label': 'gnomAD2 Hemizygous count',
+         'description': "XY count (in non-PAR regions) - only on chrX",
+         'model_field': True,
+         'queryset_field': True},
+
+        # TODO:
+        #     alphamissense_class
+        #     alphamissense_pathogenicity
+        #     mavedb_score
+        #     mavedb_urn
+
+    ]
+
+    NEW_COLUMN_VCF_INFO = [
+        {'info_id': 'GNOMAD3_AC',
+         'column_id': 'gnomad_ac',
+         'number': None,
+         'type': 'I',  # AC is an integer count, same as the AN / popmax count entries below
+         'description': "gnomAD: Alternate Allele Count  (GRCh38 only)"},
+        {'info_id': 'GNOMAD3_AN',
+         'column_id': 'gnomad_an',
+         'number': None,
+         'type': 'I',
+         'description': "gnomAD: Total number of alleles  (GRCh38 only)"},
+        {'info_id': 'GNOMAD3_POPMAX_AC',
+         'column_id': 'gnomad_popmax_ac',
+         'number': None,
+         'type': 'I',
+         'description': "gnomAD: Allele count in the population with the maximum AF  (GRCh38 only)"},
+        {'info_id': 'GNOMAD3_POPMAX_AN',
+         'column_id': 'gnomad_popmax_an',
+         'number': None,
+         'type': 'I',
+         'description': "gnomAD: Total number of alleles in the population with the maximum AF  (GRCh38 only)"},
+        {'info_id': 'GNOMAD3_POPMAX_HOM_ALT',
+         'column_id': 'gnomad_popmax_hom_alt',
+         'number': None,
+         'type': 'I',
+         'description': "gnomAD: Count of homozygous individuals in the population with the maximum allele frequency (GRCh38 only)"},
+        {'info_id': 'GNOMAD2_LIFTOVER_AF',
+         'column_id': 'gnomad2_liftover_af',
+         'number': None,
+         'type': 'F',
+         'description': "gnomAD: Allele Frequency from gnomAD2 liftover (GRCh38 only)"},
+    ]
+
+    bulk_insert_class_data(apps, "snpdb", [("VariantGridColumn", NEW_VARIANT_GRID_COLUMNS)])
+    bulk_insert_class_data(apps, "snpdb", [("ColumnVCFInfo", NEW_COLUMN_VCF_INFO)])
+
+
+class Migration(migrations.Migration):
+    """Data migration: runs _new_vep_110_annotation (forward only - no reverse function is supplied)."""
+    dependencies = [
+        ('snpdb', '0106_alter_allelemergelog_allele_linking_tool_and_more'),
+    ]
+
+    operations = [
+        migrations.RunPython(_new_vep_110_annotation)
+    ]
diff --git a/snpdb/models/models_enums.py b/snpdb/models/models_enums.py
index ec1d4c686..a51e2e8b9 100644
--- a/snpdb/models/models_enums.py
+++ b/snpdb/models/models_enums.py
@@ -138,7 +138,9 @@ class AlleleConversionTool(models.TextChoices):
     SAME_CONTIG = "SC", "Identical Contig/Version"
     CLINGEN_ALLELE_REGISTRY = 'CA', "ClinGen Allele Registry"
     DBSNP = 'DB', "dbSNP API"
-    NCBI_REMAP = 'NR', "NCBI Remap"
+    NCBI_REMAP = 'NR', "NCBI Remap"  # This is obsolete as of November 2023
+    PICARD = "PC", "Picard LiftoverVCF"
+    CROSSMAP = "CM", "CrossMap"
 
     @classmethod
     def vcf_tuples_in_destination_build(cls, conversion_tool):
diff --git a/variantgrid/settings/components/default_settings.py b/variantgrid/settings/components/default_settings.py
index db9986e9d..a60a34fd9 100644
--- a/variantgrid/settings/components/default_settings.py
+++ b/variantgrid/settings/components/default_settings.py
@@ -271,6 +271,7 @@
             "dbscsnv": "annotation_data/GRCh38/dbscSNV1.1_GRCh38.txt.gz",
             "gnomad2": "annotation_data/GRCh38/gnomad2.1.1_GRCh38_combined_af.vcf.bgz",
             "gnomad3": "annotation_data/GRCh38/gnomad3.1_GRCh38_merged.vcf.bgz",
+            "gnomad4": "annotation_data/GRCh38/gnomad4.0_GRCh38_combined_af.vcf.bgz",
             "mastermind": "annotation_data/GRCh38/mastermind_cited_variants_reference-2022.04.02-grch38.vcf.gz",
             "mave": "annotation_data/GRCh38/MaveDB_variants.tsv.gz",
             "maxentscan": "annotation_data/all_builds/maxentscan",

From ebd2d129ebd5b01eac5712e546c29b8b692928e7 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Wed, 29 Nov 2023 23:33:16 +1030
Subject: [PATCH 07/29] #938 - pass --version to generated AF script calls

---
 .../annotation_data/generate_annotation/gnomad_data.py   | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/annotation/annotation_data/generate_annotation/gnomad_data.py b/annotation/annotation_data/generate_annotation/gnomad_data.py
index aa9c55ce0..f024a3d78 100755
--- a/annotation/annotation_data/generate_annotation/gnomad_data.py
+++ b/annotation/annotation_data/generate_annotation/gnomad_data.py
@@ -79,7 +79,12 @@ def get_args():
     group.add_argument('--af', action='store_true', help="Calculate allele frequency from VCF")
 
     args = parser.parse_args()
-    if not args.scripts:
+    if args.scripts:
+        if args.genome_build is None:
+            parser.error("--genome-build required for --scripts")
+        if args.genome_build not in GENOME_BUILDS:
+            parser.error(f"--genome-build must be one of {', '.join(GENOME_BUILDS)}")
+    else:
         if args.gnomad_input_vcf is None:
             parser.error("--gnomad-input-vcf required for --af")
         if args.af_output_vcf is None:
@@ -189,7 +194,7 @@ def write_scripts(args):
             # cs.write("source /home/a1059391/venv/dave_venv/bin/activate\n")
             script_filename = os.path.realpath(__file__)
             allele_frequency_vcf = f"{prefix}.af.vcf.gz"
-            cs.write(f"{script_filename} --af --gnomad-input-vcf={combined_vcf} --af-output-vcf={allele_frequency_vcf}\n")
+            cs.write(f"{script_filename} --af --gnomad-input-vcf={combined_vcf} --af-output-vcf={allele_frequency_vcf} --version={args.version}\n")
             af_vcfs.append(allele_frequency_vcf)
 
     # Write merge script

From 541fb2ecd8c33c8dda20a2ca8eca73331a9231c1 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Thu, 30 Nov 2023 11:33:57 +1030
Subject: [PATCH 08/29] #938 - fix VCF header, get AF script to include all
 fields correctly

---
 .../generate_annotation/gnomad_data.py        | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/annotation/annotation_data/generate_annotation/gnomad_data.py b/annotation/annotation_data/generate_annotation/gnomad_data.py
index f024a3d78..2d461b990 100755
--- a/annotation/annotation_data/generate_annotation/gnomad_data.py
+++ b/annotation/annotation_data/generate_annotation/gnomad_data.py
@@ -248,11 +248,14 @@ def write_vcf_header(version, info_fields, popmax_fields, sub_pops):
         'AC_XY': '##INFO=<ID=AC_XY,Number=1,Type=Integer,Description="Alternate allele count for XY samples">',
         'AF_XY': '##INFO=<ID=AF_XY,Number=1,Type=Float,Description="Alternate allele frequency in XY samples">',
         'AN_XY': '##INFO=<ID=AN_XY,Number=1,Type=Integer,Description="Total number of alleles in XY samples">',
+        'AC_male': '##INFO=<ID=AC_male,Number=1,Type=Integer,Description="Alternate allele count for male samples">',
+        'AN_male': '##INFO=<ID=AN_male,Number=1,Type=Integer,Description="Total number of alleles in male samples">',
+        'AF_male': '##INFO=<ID=AF_male,Number=1,Type=Float,Description="Alternate allele frequency in male samples">',
         'faf95': '##INFO=<ID=faf95,Number=1,Type=Float,Description="Filtering allele frequency (using Poisson 95% CI) (max of exomes/genomes)">',
         'faf99': '##INFO=<ID=faf99,Number=1,Type=Float,Description="Filtering allele frequency (using Poisson 99% CI) (max of exomes/genomes)">',
         'fafmax_faf95_max': '##INFO=<ID=fafmax_faf95_max,Number=1,Type=Float,Description="Maximum filtering allele frequency (using Poisson 95% CI) across genetic_ancestry groups (max of exomes/genomes)">',
         'fafmax_faf99_max': '##INFO=<ID=fafmax_faf99_max,Number=1,Type=Float,Description="Maximum filtering allele frequency (using Poisson 99% CI) across genetic_ancestry groups (max of exomes/genomes)">',
-        'AF_popmax': '##INFO=<ID=AF_popmax,Number=1,Type=Float,Description="Allele Frequency for highest population">"',
+        'AF_popmax': '##INFO=<ID=AF_popmax,Number=1,Type=Float,Description="Allele Frequency for highest population">',
         'AC_popmax': '##INFO=<ID=AC_popmax,Number=1,Type=Integer,Description="Allele Count for highest population">',
         'AN_popmax': '##INFO=<ID=AN_popmax,Number=1,Type=Integer,Description="Allele Number for highest population">',
         'popmax': '##INFO=<ID=popmax,Number=1,Type=String,Description="Ancestral group with highest allele frequency (stored as AF_popmax)">',
@@ -277,8 +280,7 @@ def write_vcf_header(version, info_fields, popmax_fields, sub_pops):
     meta = """##fileformat=VCFv4.2
 ##fileDate=%(file_date)s
 ##source=%(source)s
-%(info_headers)s
-""" % {"file_date": file_date, "source": source, "info_headers": info_headers}
+%(info_headers)s""" % {"file_date": file_date, "source": source, "info_headers": info_headers}
 
     af_info = get_af_info(sub_pops)
     for info_id, pop_name, ac_name, an_name in af_info:
@@ -303,7 +305,7 @@ def calculate_allele_frequency(version, gnomad_input_vcf, af_output_vcf):
 
     from cyvcf2 import VCF  # Import here, so that rest of script can run on HPC easier
 
-    info_fields, _, popmax_fields, sub_pops = get_infos_for_version(version)
+    info_fields, chr_x_male, popmax_fields, sub_pops = get_infos_for_version(version)
     af_info = get_af_info(sub_pops)
 
     with gzip.open(af_output_vcf, "wt") as f:
@@ -319,10 +321,10 @@ def calculate_allele_frequency(version, gnomad_input_vcf, af_output_vcf):
             an_popmax = 0
             popmax = '.'
             infos = {}
-            for _, pop_name, ac_name, an_name in af_info:
+            for af_name, pop_name, ac_name, an_name in af_info:
                 ac = variant.INFO.get(ac_name, 0)
                 an = variant.INFO.get(an_name)
-                #print(f"{ac_name}/{an_name} {ac}/{an}")
+                # print(f"{pop_name=},{ac_name=},{an_name=} {ac=}/{an=}")
                 if an:
                     af = ac / an
                     if pop_name and (pop_name not in BOTTLENECKED_SUB_POPS) and af > af_popmax:
@@ -333,9 +335,12 @@ def calculate_allele_frequency(version, gnomad_input_vcf, af_output_vcf):
                     af = f'{af:.6f}'
                 else:
                     af = '.'
-                infos["AF"] = af
 
-            for o in info_fields:
+                infos[af_name] = af
+                infos[ac_name] = ac
+                infos[an_name] = an
+
+            for o in info_fields + chr_x_male:
                 infos[o] = str(variant.INFO.get(o, '.'))
             gnomad_filtered = '0' if variant.FILTER is None else '1'
             infos["gnomad_filtered"] = gnomad_filtered

From 787f408d2f9e06c81cd53d90ba7fa9ff69b7b781 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Thu, 30 Nov 2023 18:11:53 +1030
Subject: [PATCH 09/29] #850 - VEP 110: add gnomAD2 fields, new grid columns

---
 .../migrations/0082_new_vep_110_columns_v3.py | 131 +++++++++++-------
 .../migrations/0107_new_vep_110_columns_v3.py |  85 ++++++++++--
 2 files changed, 155 insertions(+), 61 deletions(-)

diff --git a/annotation/migrations/0082_new_vep_110_columns_v3.py b/annotation/migrations/0082_new_vep_110_columns_v3.py
index c9b259034..278cdb5ae 100644
--- a/annotation/migrations/0082_new_vep_110_columns_v3.py
+++ b/annotation/migrations/0082_new_vep_110_columns_v3.py
@@ -16,114 +16,143 @@ def _new_vep_110_annotation(apps, _schema_editor):
     FUNCTIONAL_EFFECT = 'f'
     PATHOGENICITY_PREDICTIONS = 'P'
 
-
     ColumnVEPField = apps.get_model("annotation", "ColumnVEPField")
-    GenomeBuild = apps.get_model("snpdb", "GenomeBuild")
-
-    grch37 = GenomeBuild.objects.get(pk="GRCh37")
-    grch38 = GenomeBuild.objects.get(pk="GRCh38")
 
     # Make existing gnomAD3 have max column version of 2
     ColumnVEPField.objects.filter(vep_custom=VEP_CUSTOM_GNOMAD_3).update(max_vep_columns_version=2)
 
-    raise ValueError("Still need to add new fields from gnomad2 stuff")
-
-
     COLUMN_VEP_FIELD = [
-        # gnomAD 4
+        # gnomAD 2.1 additional fields - issue #231
+        {'column': 'gnomad2_ac', 'variant_grid_column_id': 'gnomad_ac',
+         'genome_build_id': 'GRCh37', 'pipeline_type': 'S', 'category': 'F', 'source_field': 'AC',
+         'source_field_processing_description': 'Sum of exome AC + genome AC',
+         'vep_custom': 'g', 'source_field_has_custom_prefix': True},
+        {'column': 'gnomad2_popmax_ac', 'variant_grid_column_id': 'gnomad_popmax_ac',
+         'genome_build_id': 'GRCh37', 'pipeline_type': 'S', 'category': 'F', 'source_field': 'AC_popmax',
+         'source_field_processing_description': 'Sum of exome AC_popmax + genome AC_popmax',
+         'vep_custom': 'g', 'source_field_has_custom_prefix': True},
+        {'column': 'gnomad2_an', 'variant_grid_column_id': 'gnomad_an',
+         'genome_build_id': 'GRCh37', 'pipeline_type': 'S', 'category': 'F', 'source_field': 'AN',
+         'source_field_processing_description': 'Sum of exome AN + genome AN',
+         'vep_custom': 'g', 'source_field_has_custom_prefix': True},
+        {'column': 'gnomad2_popmax_an', 'variant_grid_column_id': 'gnomad_popmax_an',
+         'genome_build_id': 'GRCh37', 'pipeline_type': 'S', 'category': 'F', 'source_field': 'AN_popmax',
+         'source_field_processing_description': 'Sum of exome AN_popmax + genome AN_popmax',
+         'vep_custom': 'g', 'source_field_has_custom_prefix': True},
+        {'column': 'gnomad2_nonpar', 'variant_grid_column_id': 'gnomad_non_par',
+         'genome_build_id': 'GRCh37', 'pipeline_type': 'S', 'category': 'F', 'source_field': 'nonpar',
+         'source_field_processing_description': 'nonpar from genomes',
+         'vep_custom': 'g', 'source_field_has_custom_prefix': True},
+
+        # gnomAD 4 - issue #938
         {'column': 'gnomad4_ac', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
-         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_ac', 'source_field': 'AC', 'category': FREQUENCY_DATA},
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_ac', 'source_field': 'AC',
+         'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
         {'column': 'gnomad4_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
-         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_af', 'source_field': 'AF', 'category': FREQUENCY_DATA},
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_af', 'source_field': 'AF',
+         'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
         {'column': 'gnomad4_an', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
-         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_an', 'source_field': 'AN', 'category': FREQUENCY_DATA},
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_an', 'source_field': 'AN',
+         'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
         {'column': 'gnomad4_afr_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
-         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_afr_af', 'source_field': 'AF_afr', 'category': FREQUENCY_DATA},
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_afr_af', 'source_field': 'AF_afr',
+         'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
         {'column': 'gnomad4_amr_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
-         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_amr_af', 'source_field': 'AF_amr', 'category': FREQUENCY_DATA},
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_amr_af', 'source_field': 'AF_amr',
+         'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
         {'column': 'gnomad4_asj_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
-         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_asj_af', 'source_field': 'AF_asj', 'category': FREQUENCY_DATA},
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_asj_af', 'source_field': 'AF_asj',
+         'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
         {'column': 'gnomad4_eas_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
-         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_eas_af', 'source_field': 'AF_eas', 'category': FREQUENCY_DATA},
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_eas_af', 'source_field': 'AF_eas',
+         'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
         {'column': 'gnomad4_fin_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
-         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_fin_af', 'source_field': 'AF_fin', 'category': FREQUENCY_DATA},
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_fin_af', 'source_field': 'AF_fin',
+         'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
         {'column': 'gnomad4_mid_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
-         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_mid_af', 'source_field': 'AF_mid', 'category': FREQUENCY_DATA},
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_mid_af', 'source_field': 'AF_mid',
+         'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
         {'column': 'gnomad4_nfe_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
-         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_nfe_af', 'source_field': 'AF_nfe', 'category': FREQUENCY_DATA},
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_nfe_af', 'source_field': 'AF_nfe',
+         'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
         {'column': 'gnomad4_oth_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
-         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_oth_af', 'source_field': 'AF_remaining', 'category': FREQUENCY_DATA},
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_oth_af', 'source_field': 'AF_remaining',
+         'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
         {'column': 'gnomad4_sas_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
-         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_sas_af', 'source_field': 'AF_sas', 'category': FREQUENCY_DATA},
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_sas_af', 'source_field': 'AF_sas',
+         'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
 
+        {'column': 'gnomad4_nonpar', 'variant_grid_column_id': 'gnomad_non_par',
+         'genome_build_id': 'GRCh37', 'pipeline_type': 'S', 'category': 'F', 'source_field': 'non_par',
+         'source_field_processing_description': 'nonpar from genomes',
+         'vep_custom': 'g', 'source_field_has_custom_prefix': True},
         {'column': 'gnomad4_filtered', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
-         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_filtered', 'source_field': 'gnomad_filtered', 'category': FREQUENCY_DATA},
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_filtered',
+         'source_field': 'gnomad_filtered', 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
         {'column': 'gnomad4_hom_alt', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
-         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_hom_alt', 'source_field': 'nhomalt', 'category': FREQUENCY_DATA},
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_hom_alt', 'source_field': 'nhomalt',
+         'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
         {'column': 'gnomad4_popmax', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
-         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax', 'source_field': 'grpmax', 'category': FREQUENCY_DATA},
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax', 'source_field': 'grpmax',
+         'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
         {'column': 'gnomad4_popmax_ac', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
-         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_ac', 'source_field': 'AC_grpmax', 'category': FREQUENCY_DATA},
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_ac', 'source_field': 'AC_grpmax',
+         'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
         {'column': 'gnomad4_popmax_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
-         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_af', 'source_field': 'AF_grpmax', 'category': FREQUENCY_DATA},
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_af', 'source_field': 'AF_grpmax',
+         'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
         {'column': 'gnomad4_popmax_an', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
-         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_an', 'source_field': 'AN_grpmax', 'category': FREQUENCY_DATA},
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_an', 'source_field': 'AN_grpmax',
+         'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
 
         {'column': 'gnomad4_xy_ac', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
          'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_xy_ac', 'source_field': 'AC_XY',
-         'category': FREQUENCY_DATA},
+         'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
         {'column': 'gnomad4_xy_af', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
          'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_xy_af', 'source_field': 'AF_XY',
-         'category': FREQUENCY_DATA},
+         'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
         {'column': 'gnomad4_xy_an', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
-         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_an', 'source_field': 'AN_XY',
-         'category': FREQUENCY_DATA},
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_xy_an', 'source_field': 'AN_XY',
+         'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
 
         {'column': 'gnomad4_faf95', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
          'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_faf95', 'source_field': 'faf95',
-         'category': FREQUENCY_DATA},
+         'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
         {'column': 'gnomad4_faf99', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
          'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_faf99', 'source_field': 'faf99',
-         'category': FREQUENCY_DATA},
+         'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
         {'column': 'gnomad4_fafmax_faf95_max', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
-         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_fafmax_faf95_max', 'source_field': 'fafmax_faf95_max',
-         'category': FREQUENCY_DATA},
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_fafmax_faf95_max',
+         'source_field': 'fafmax_faf95_max', 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
         {'column': 'gnomad4_fafmax_faf99_max', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
-         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_fafmax_faf99_max', 'source_field': 'fafmax_faf99_max',
-         'category': FREQUENCY_DATA},
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_fafmax_faf99_max',
+         'source_field': 'fafmax_faf99_max', 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
 
         # I left this out don't think it really matters
-#        {'column': 'gnomad4_popmax_hom_alt', 'source_field_has_custom_prefix': True,
-#         'vep_custom': GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_hom_alt', 'source_field': 'nhomalt_grpmax', 'category': FREQUENCY_DATA},
+        #        {'column': 'gnomad4_popmax_hom_alt', 'source_field_has_custom_prefix': True,
+        #         'vep_custom': GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_hom_alt', 'source_field': 'nhomalt_grpmax', 'category': FREQUENCY_DATA},
 
         # MAVE
         {'column': 'mavedb_score', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
          'vep_plugin': VEP_PLUGIN_MAVEDB, 'variant_grid_column_id': 'mavedb_score',
-         'source_field': 'score',
-         'category': FUNCTIONAL_EFFECT},
+         'source_field': 'score', 'category': FUNCTIONAL_EFFECT, 'genome_build_id': 'GRCh38'},
         {'column': 'mavedb_urn', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
          'vep_plugin': VEP_PLUGIN_MAVEDB, 'variant_grid_column_id': 'mavedb_urn',
-         'source_field': 'urn',
-         'category': FUNCTIONAL_EFFECT},
+         'source_field': 'urn', 'category': FUNCTIONAL_EFFECT, 'genome_build_id': 'GRCh38'},
 
         # AlphaMissense
         {'column': 'alphamissense_class', 'min_vep_columns_version': 3,
          'vep_plugin': VEP_PLUGIN_ALPHAMISSENSE, 'variant_grid_column_id': 'alphamissense_class',
-         'source_field': 'am_class',
-         'category': PATHOGENICITY_PREDICTIONS},
+         'source_field': 'am_class', 'category': PATHOGENICITY_PREDICTIONS},
         {'column': 'alphamissense_pathogenicity', 'min_vep_columns_version': 3,
          'vep_plugin': VEP_PLUGIN_ALPHAMISSENSE, 'variant_grid_column_id': 'alphamissense_pathogenicity',
-         'source_field': 'am_pathogenicity',
-         'category': PATHOGENICITY_PREDICTIONS},
+         'source_field': 'am_pathogenicity', 'category': PATHOGENICITY_PREDICTIONS},
+
     ]
     bulk_insert_class_data(apps, "annotation", [("ColumnVEPField", COLUMN_VEP_FIELD)])
-    ColumnVEPField.objects.filter(vep_custom=VEP_CUSTOM_GNOMAD_4).update(genome_build=grch38)
-
-
 
 
 class Migration(migrations.Migration):
-
     dependencies = [
         ('annotation', '0081_rename_faf95_variantannotation_gnomad_faf95_and_more'),
         ("snpdb", "0107_new_vep_110_columns_v3"),  # Defines new columns
diff --git a/snpdb/migrations/0107_new_vep_110_columns_v3.py b/snpdb/migrations/0107_new_vep_110_columns_v3.py
index d0d003f71..b94c06283 100644
--- a/snpdb/migrations/0107_new_vep_110_columns_v3.py
+++ b/snpdb/migrations/0107_new_vep_110_columns_v3.py
@@ -13,15 +13,23 @@ def _new_vep_110_annotation(apps, _schema_editor):
     raise ValueError("This is not complete yet!")
 
 
+
     NEW_VARIANT_GRID_COLUMNS = [
-        {'grid_column_name': 'gnomad_mid_af',
-         'variant_column': 'variantannotation__gnomad_mid_af',
-         'annotation_level': VARIANT_LEVEL,
+        {'grid_column_name': 'alphamissense_class',
+         'variant_column': 'variantannotation__alphamissense_class',
+         'annotation_level': TRANSCRIPT_LEVEL,
          'width': None,
-         'label': 'gnomAD MID AF',
-         'description': "Allele Frequency (0-1) among Middle Eastern genotypes (exome+genome) (GRCh38/gnomad4 only)",
+         'label': 'AlphaMissense Class',
          'model_field': True,
          'queryset_field': True},
+        {'grid_column_name': 'alphamissense_pathogenicity',
+         'variant_column': 'variantannotation__alphamissense_pathogenicity',
+         'annotation_level': TRANSCRIPT_LEVEL,
+         'width': None,
+         'label': 'AlphaMissense Pathogenicity',
+         'model_field': True,
+         'queryset_field': True},
+
         {'grid_column_name': 'gnomad_faf95',
          'variant_column': 'variantannotation__gnomad_faf95',
          'annotation_level': VARIANT_LEVEL,
@@ -55,6 +63,49 @@ def _new_vep_110_annotation(apps, _schema_editor):
          'model_field': True,
          'queryset_field': True},
 
+        {'grid_column_name': 'gnomad_mid_af',
+         'variant_column': 'variantannotation__gnomad_mid_af',
+         'annotation_level': VARIANT_LEVEL,
+         'width': None,
+         'label': 'gnomAD MID AF',
+         'description': "Allele Frequency (0-1) among Middle Eastern genotypes (exome+genome) (GRCh38/gnomad4 only)",
+         'model_field': True,
+         'queryset_field': True},
+
+        {'grid_column_name': 'gnomad_non_par',
+         'variant_column': 'variantannotation__gnomad_non_par',
+         'annotation_level': VARIANT_LEVEL,
+         'width': None,
+         'label': 'gnomAD non-PAR',
+         'description': "non_par in genomes or exomes",
+         'model_field': True,
+         'queryset_field': True},
+
+        {'grid_column_name': 'gnomad_xy_ac',
+         'variant_column': 'variantannotation__gnomad_xy_ac',
+         'annotation_level': VARIANT_LEVEL,
+         'width': None,
+         'label': 'gnomAD XY AC',
+         'description': "Allele Count in XY",
+         'model_field': True,
+         'queryset_field': True},
+        {'grid_column_name': 'gnomad_xy_af',
+         'variant_column': 'variantannotation__gnomad_xy_af',
+         'annotation_level': VARIANT_LEVEL,
+         'width': None,
+         'label': 'gnomAD XY AF',
+         'description': "Allele Frequency in XY",
+         'model_field': True,
+         'queryset_field': True},
+        {'grid_column_name': 'gnomad_xy_an',
+         'variant_column': 'variantannotation__gnomad_xy_an',
+         'annotation_level': VARIANT_LEVEL,
+         'width': None,
+         'label': 'gnomAD XY AN',
+         'description': "Allele Number in XY",
+         'model_field': True,
+         'queryset_field': True},
+
         {'grid_column_name': 'gnomad_hemi_count',
          'variant_column': 'variantannotation__gnomad_hemi_count',
          'annotation_level': VARIANT_LEVEL,
@@ -64,12 +115,21 @@ def _new_vep_110_annotation(apps, _schema_editor):
          'model_field': True,
          'queryset_field': True},
 
-        # TODO:
-        #     alphamissense_class
-        #     alphamissense_pathogenicity
-        #     mavedb_score
-        #     mavedb_urn
+        {'grid_column_name': 'mavedb_score',
+         'variant_column': 'variantannotation__mavedb_score',
+         'annotation_level': TRANSCRIPT_LEVEL,
+         'width': None,
+         'label': 'MAVEdb score',
+         'model_field': True,
+         'queryset_field': True},
 
+        {'grid_column_name': 'mavedb_urn',
+         'variant_column': 'variantannotation__mavedb_urn',
+         'annotation_level': TRANSCRIPT_LEVEL,
+         'width': None,
+         'label': 'MAVEdb urn',
+         'model_field': True,
+         'queryset_field': True},
     ]
 
     NEW_COLUMN_VCF_INFO = [
@@ -103,8 +163,13 @@ def _new_vep_110_annotation(apps, _schema_editor):
          'number': None,
          'type': 'F',
          'description': "gnomAD: Allele Frequency from gnomAD2 liftover (GRCh38 only)"},
+
+         # TODO:
+
     ]
 
+    raise ValueError("Huge amount of NEW_COLUMN_VCF_INFO to do")
+
     bulk_insert_class_data(apps, "snpdb", [("VariantGridColumn", NEW_VARIANT_GRID_COLUMNS)])
     bulk_insert_class_data(apps, "snpdb", [("ColumnVCFInfo", NEW_COLUMN_VCF_INFO)])
 

From ab2538d3d5b6a751370d17cd111c96b699d5dba5 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Fri, 1 Dec 2023 11:35:15 +1030
Subject: [PATCH 10/29] #850 - VEP 110: fix gnomad4_nonpar, add ColumnVCFInfo

---
 .../migrations/0082_new_vep_110_columns_v3.py |   4 +-
 .../migrations/0107_new_vep_110_columns_v3.py | 101 +++++++++++-------
 2 files changed, 67 insertions(+), 38 deletions(-)

diff --git a/annotation/migrations/0082_new_vep_110_columns_v3.py b/annotation/migrations/0082_new_vep_110_columns_v3.py
index 278cdb5ae..dd2fabce2 100644
--- a/annotation/migrations/0082_new_vep_110_columns_v3.py
+++ b/annotation/migrations/0082_new_vep_110_columns_v3.py
@@ -83,9 +83,9 @@ def _new_vep_110_annotation(apps, _schema_editor):
          'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
 
         {'column': 'gnomad4_nonpar', 'variant_grid_column_id': 'gnomad_non_par',
-         'genome_build_id': 'GRCh37', 'pipeline_type': 'S', 'category': 'F', 'source_field': 'non_par',
+         'genome_build_id': 'GRCh38', 'pipeline_type': 'S', 'category': 'F', 'source_field': 'non_par',
          'source_field_processing_description': 'nonpar from genomes',
-         'vep_custom': 'g', 'source_field_has_custom_prefix': True},
+         'vep_custom': VEP_CUSTOM_GNOMAD_4, 'source_field_has_custom_prefix': True},
         {'column': 'gnomad4_filtered', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
          'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_filtered',
          'source_field': 'gnomad_filtered', 'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
diff --git a/snpdb/migrations/0107_new_vep_110_columns_v3.py b/snpdb/migrations/0107_new_vep_110_columns_v3.py
index b94c06283..cadaccfb0 100644
--- a/snpdb/migrations/0107_new_vep_110_columns_v3.py
+++ b/snpdb/migrations/0107_new_vep_110_columns_v3.py
@@ -5,15 +5,10 @@
 from library.django_utils import bulk_insert_class_data
 
 
-
 def _new_vep_110_annotation(apps, _schema_editor):
     TRANSCRIPT_LEVEL = 'T'
     VARIANT_LEVEL = 'V'
 
-    raise ValueError("This is not complete yet!")
-
-
-
     NEW_VARIANT_GRID_COLUMNS = [
         {'grid_column_name': 'alphamissense_class',
          'variant_column': 'variantannotation__alphamissense_class',
@@ -133,49 +128,83 @@ def _new_vep_110_annotation(apps, _schema_editor):
     ]
 
     NEW_COLUMN_VCF_INFO = [
-        {'info_id': 'GNOMAD3_AC',
-         'column_id': 'gnomad_ac',
-         'number': None,
+        {'info_id': 'ALPHAMISSENSE_class',
+         'column_id': 'alphamissense_class',
+         'number': 1,
+         'type': 'S',
+         'description': 'AlphaMissense pathogenicity prediction'},
+        {'info_id': 'ALPHAMISSENSE_pathogenicity',
+         'column_id': 'alphamissense_pathogenicity',
+         'number': 1,
          'type': 'F',
-         'description': "gnomAD: Alternate Allele Count  (GRCh38 only)"},
-        {'info_id': 'GNOMAD3_AN',
-         'column_id': 'gnomad_an',
-         'number': None,
-         'type': 'I',
-         'description': "gnomAD: Total number of alleles  (GRCh38 only)"},
-        {'info_id': 'GNOMAD3_POPMAX_AC',
-         'column_id': 'gnomad_popmax_ac',
-         'number': None,
+         'description': 'AlphaMissense pathogenicity prediction score'},
+        {'info_id': 'GNOMAD_faf95',
+         'column_id': 'gnomad_faf95',
+         'number': 1,
+         'type': 'F',
+         'description': 'Filtering allele frequency (using Poisson 95% CI) (max of exomes/genomes) (GRCh38/gnomad4 only)'},
+        {'info_id': 'GNOMAD_faf99',
+         'column_id': 'gnomad_faf99',
+         'number': 1,
+         'type': 'F',
+         'description': 'Filtering allele frequency (using Poisson 99% CI) (max of exomes/genomes) (GRCh38/gnomad4 only)'},
+        {'info_id': 'GNOMAD_fafmax_faf95_max',
+         'column_id': 'gnomad_fafmax_faf95_max',
+         'number': 1,
+         'type': 'F',
+         'description': 'Maximum filtering allele frequency (using Poisson 95% CI) across genetic_ancestry groups (max of exomes/genomes) (GRCh38/gnomad4 only)'},
+        {'info_id': 'GNOMAD_fafmax_faf99_max',
+         'column_id': 'gnomad_fafmax_faf99_max',
+         'number': 1,
+         'type': 'F',
+         'description': 'Maximum filtering allele frequency (using Poisson 99% CI) across genetic_ancestry groups (max of exomes/genomes) (GRCh38/gnomad4 only)'},
+        {'info_id': 'GNOMAD_AF_mid',
+         'column_id': 'gnomad_mid_af',
+         'number': 1,
+         'type': 'F',
+         'description': 'Allele Frequency (0-1) among Middle Eastern genotypes (exome+genome) (GRCh38/gnomad4 only)'},
+        {'info_id': 'GNOMAD_non_par',
+         'column_id': 'gnomad_non_par',
+         'number': 1,
+         'type': 'F',
+         'description': 'non_par in genomes or exomes'},
+        {'info_id': 'GNOMAD_AC_XY',
+         'column_id': 'gnomad_xy_ac',
+         'number': 1,
          'type': 'I',
-         'description': "gnomAD: Allele count in the population with the maximum AF  (GRCh38 only)"},
-        {'info_id': 'GNOMAD3_POPMAX_AN',
-         'column_id': 'gnomad_popmax_an',
-         'number': None,
+         'description': 'Allele Count in XY'},
+        {'info_id': 'GNOMAD_AF_XY',
+         'column_id': 'gnomad_xy_af',
+         'number': 1,
+         'type': 'F',
+         'description': 'Allele Frequency in XY'},
+        {'info_id': 'GNOMAD_AN_XY',
+         'column_id': 'gnomad_xy_an',
+         'number': 1,
          'type': 'I',
-         'description': "gnomAD: Total number of alleles in the population with the maximum AF  (GRCh38 only)"},
-        {'info_id': 'GNOMAD3_POPMAX_HOM_ALT',
-         'column_id': 'gnomad_popmax_hom_alt',
-         'number': None,
+         'description': 'Allele Number in XY'},
+        {'info_id': 'GNOMAD_HEMI_COUNT',
+         'column_id': 'gnomad_hemi_count',
+         'number': 1,
          'type': 'I',
-         'description': "gnomAD: Count of homozygous individuals in the population with the maximum allele frequency (GRCh38 only)"},
-        {'info_id': 'GNOMAD2_LIFTOVER_AF',
-         'column_id': 'gnomad2_liftover_af',
-         'number': None,
+         'description': 'XY count (in non-PAR regions) - only on chrX'},
+        {'info_id': 'MaveDB_score',
+         'column_id': 'mavedb_score',
+         'number': 1,
          'type': 'F',
-         'description': "gnomAD: Allele Frequency from gnomAD2 liftover (GRCh38 only)"},
-
-         # TODO:
-
+         'description': 'MaveDB score - see MaveDB for interpretation of scores'},
+        {'info_id': 'MaveDB_urn',
+         'column_id': 'mavedb_urn',
+         'number': 1,
+         'type': 'S',
+         'description': 'MaveDB database identifier'}
     ]
 
-    raise ValueError("Huge amount of NEW_COLUMN_VCF_INFO to do")
-
     bulk_insert_class_data(apps, "snpdb", [("VariantGridColumn", NEW_VARIANT_GRID_COLUMNS)])
     bulk_insert_class_data(apps, "snpdb", [("ColumnVCFInfo", NEW_COLUMN_VCF_INFO)])
 
 
 class Migration(migrations.Migration):
-
     dependencies = [
         ('snpdb', '0106_alter_allelemergelog_allele_linking_tool_and_more'),
     ]

From 8010053366e7aa5d02c0023ff2b5b1706814daad Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Fri, 1 Dec 2023 13:33:24 +1030
Subject: [PATCH 11/29] #850 - new VEP 110 fields

---
 annotation/management/commands/vep_version.py |   9 +-
 .../migrations/0082_new_vep_110_columns_v3.py |   8 +-
 annotation/models/damage_enums.py             |  16 ++-
 annotation/models/models.py                   |  15 ++-
 ..._columns_version3_grch37.vep_annotated.vcf | 101 +++++++++++++++
 ..._columns_version3_grch38.vep_annotated.vcf | 116 ++++++++++++++++++
 6 files changed, 252 insertions(+), 13 deletions(-)
 create mode 100644 annotation/tests/test_data/test_columns_version3_grch37.vep_annotated.vcf
 create mode 100644 annotation/tests/test_data/test_columns_version3_grch38.vep_annotated.vcf

diff --git a/annotation/management/commands/vep_version.py b/annotation/management/commands/vep_version.py
index 06d47f749..c916d7765 100644
--- a/annotation/management/commands/vep_version.py
+++ b/annotation/management/commands/vep_version.py
@@ -1,6 +1,6 @@
 from django.core.management.base import BaseCommand
 
-from annotation.vep_annotation import get_vep_version, VEPConfig
+from annotation.vep_annotation import get_vep_version, VEPConfig, vep_dict_to_variant_annotation_version_kwargs
 from snpdb.models.models_genome import GenomeBuild
 
 
@@ -14,4 +14,11 @@ def handle(self, *args, **options):
         genome_build = GenomeBuild.get_name_or_alias(build_name)
         vep_config = VEPConfig(genome_build)
         vep_version = get_vep_version(genome_build, vep_config.annotation_consortium)
+        print("*" * 40)
+        print("VEP kwargs:")
         print(vep_version)
+
+        vav_kwargs = vep_dict_to_variant_annotation_version_kwargs(vep_config, vep_version)
+        print("*" * 40)
+        print("VariantAnnotationVersion kwargs:")
+        print(vav_kwargs)
diff --git a/annotation/migrations/0082_new_vep_110_columns_v3.py b/annotation/migrations/0082_new_vep_110_columns_v3.py
index dd2fabce2..877aaca63 100644
--- a/annotation/migrations/0082_new_vep_110_columns_v3.py
+++ b/annotation/migrations/0082_new_vep_110_columns_v3.py
@@ -133,12 +133,12 @@ def _new_vep_110_annotation(apps, _schema_editor):
         #         'vep_custom': GNOMAD_4, 'variant_grid_column_id': 'gnomad_popmax_hom_alt', 'source_field': 'nhomalt_grpmax', 'category': FREQUENCY_DATA},
 
         # MAVE
-        {'column': 'mavedb_score', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+        {'column': 'mavedb_score', 'min_vep_columns_version': 3,
          'vep_plugin': VEP_PLUGIN_MAVEDB, 'variant_grid_column_id': 'mavedb_score',
-         'source_field': 'score', 'category': FUNCTIONAL_EFFECT, 'genome_build_id': 'GRCh38'},
-        {'column': 'mavedb_urn', 'source_field_has_custom_prefix': True, 'min_vep_columns_version': 3,
+         'source_field': 'MaveDB_score', 'category': FUNCTIONAL_EFFECT, 'genome_build_id': 'GRCh38'},
+        {'column': 'mavedb_urn', 'min_vep_columns_version': 3,
          'vep_plugin': VEP_PLUGIN_MAVEDB, 'variant_grid_column_id': 'mavedb_urn',
-         'source_field': 'urn', 'category': FUNCTIONAL_EFFECT, 'genome_build_id': 'GRCh38'},
+         'source_field': 'MaveDB_urn', 'category': FUNCTIONAL_EFFECT, 'genome_build_id': 'GRCh38'},
 
         # AlphaMissense
         {'column': 'alphamissense_class', 'min_vep_columns_version': 3,
diff --git a/annotation/models/damage_enums.py b/annotation/models/damage_enums.py
index d366f0ce3..16c4aaa1e 100644
--- a/annotation/models/damage_enums.py
+++ b/annotation/models/damage_enums.py
@@ -156,8 +156,16 @@ class ALoFTPrediction(models.TextChoices):
     DOMINANT = "d", "Dominant"
 
 
-class AlphaMissensePrediction(models.TextChoices):
+class AlphaMissensePrediction(AbstractPathogenicity):
     """ @see https://asia.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#alphamissense """
-    LIKELY_BENIGN = 'b', "likely_benign"
-    AMBIGUOUS = "a", "ambiguous"
-    LIKELY_PATHOGENIC = "p", "likely_pathogenic"
+    LIKELY_BENIGN = 'b'
+    AMBIGUOUS = "a"
+    LIKELY_PATHOGENIC = "p"
+
+    CHOICES = [
+        (LIKELY_BENIGN, 'likely_benign'),
+        (AMBIGUOUS, 'ambiguous'),
+        (LIKELY_PATHOGENIC, 'likely_pathogenic'),
+    ]
+    MINIMUM_FLAG_DAMAGE_LEVEL = LIKELY_PATHOGENIC
+    VARIANT_PATH = "variantannotation__alphamissense_class"
diff --git a/annotation/models/models.py b/annotation/models/models.py
index c9a308e3e..b99d3fa45 100644
--- a/annotation/models/models.py
+++ b/annotation/models/models.py
@@ -459,6 +459,9 @@ class ColumnVEPField(models.Model):
     min_vep_columns_version = models.IntegerField(null=True)
     max_vep_columns_version = models.IntegerField(null=True)
 
+    def __str__(self) -> str:
+        return self.column
+
     @property
     def vep_info_field(self):
         """ For VCFs, be sure to set source_field_has_custom_prefix=True
@@ -467,7 +470,7 @@ def vep_info_field(self):
             We need to adjust for this in BulkVEPVCFAnnotationInserter """
 
         vif = self.source_field
-        if self.source_field_has_custom_prefix:
+        if self.vep_custom and self.source_field_has_custom_prefix:
             vif = self.get_vep_custom_display() + "_" + vif
         return vif
 
@@ -550,11 +553,15 @@ def get_pathogenic_prediction_funcs(self) -> Dict[str, Callable]:
                 'mutation_taster_pred_most_damaging': lambda d: d in MutationTasterPrediction.get_damage_or_greater_levels(),
                 'polyphen2_hvar_pred_most_damaging': lambda d: d in Polyphen2Prediction.get_damage_or_greater_levels(),
             }
-        elif self.columns_version == 2:
+        elif self.columns_version in (2, 3):
             pathogenic_rankscore = settings.ANNOTATION_MIN_PATHOGENIC_RANKSCORE
             pathogenic_prediction_columns = ['bayesdel_noaf_rankscore', 'cadd_raw_rankscore', 'clinpred_rankscore',
                                              'revel_rankscore', 'metalr_rankscore', 'vest4_rankscore']
-            return {c: lambda d: float(d) >= pathogenic_rankscore for c in pathogenic_prediction_columns}
+            pp_funcs = {c: lambda d: float(d) >= pathogenic_rankscore for c in pathogenic_prediction_columns}
+            if self.columns_version == 3:
+                pp_funcs["alphamissense_class"] = lambda d: d in AlphaMissensePrediction.get_damage_or_greater_levels()
+            return pp_funcs
+
         raise ValueError(f"Don't know fields for {self.columns_version=}")
 
     @cached_property
@@ -835,7 +842,7 @@ class AbstractVariantAnnotation(models.Model):
     splice_region = models.TextField(null=True, blank=True)
     symbol = models.TextField(null=True, blank=True)
 
-    alphamissense_class  = models.CharField(max_length=1, choices=AlphaMissensePrediction.choices, null=True, blank=True)
+    alphamissense_class  = models.CharField(max_length=1, choices=AlphaMissensePrediction.CHOICES, null=True, blank=True)
     alphamissense_pathogenicity = models.FloatField(null=True, blank=True)
 
     mavedb_score = models.FloatField(null=True, blank=True)
diff --git a/annotation/tests/test_data/test_columns_version3_grch37.vep_annotated.vcf b/annotation/tests/test_data/test_columns_version3_grch37.vep_annotated.vcf
new file mode 100644
index 000000000..9e647723a
--- /dev/null
+++ b/annotation/tests/test_data/test_columns_version3_grch37.vep_annotated.vcf
@@ -0,0 +1,101 @@
+##fileformat=VCFv4.1
+##INFO=<ID=variant_id,Number=1,Type=Integer,Description="VariantGrid primary column">
+##contig=<ID=1,length=249250621,assembly=hg19>
+##contig=<ID=2,length=243199373,assembly=hg19>
+##contig=<ID=3,length=198022430,assembly=hg19>
+##contig=<ID=4,length=191154276,assembly=hg19>
+##contig=<ID=5,length=180915260,assembly=hg19>
+##contig=<ID=6,length=171115067,assembly=hg19>
+##contig=<ID=7,length=159138663,assembly=hg19>
+##contig=<ID=8,length=146364022,assembly=hg19>
+##contig=<ID=9,length=141213431,assembly=hg19>
+##contig=<ID=10,length=135534747,assembly=hg19>
+##contig=<ID=11,length=135006516,assembly=hg19>
+##contig=<ID=12,length=133851895,assembly=hg19>
+##contig=<ID=13,length=115169878,assembly=hg19>
+##contig=<ID=14,length=107349540,assembly=hg19>
+##contig=<ID=15,length=102531392,assembly=hg19>
+##contig=<ID=16,length=90354753,assembly=hg19>
+##contig=<ID=17,length=81195210,assembly=hg19>
+##contig=<ID=18,length=78077248,assembly=hg19>
+##contig=<ID=19,length=59128983,assembly=hg19>
+##contig=<ID=20,length=63025520,assembly=hg19>
+##contig=<ID=21,length=48129895,assembly=hg19>
+##contig=<ID=22,length=51304566,assembly=hg19>
+##contig=<ID=X,length=155270560,assembly=hg19>
+##contig=<ID=Y,length=59373566,assembly=hg19>
+##contig=<ID=MT,length=16569,assembly=hg19>
+##VEP="v110" time="2023-12-01 11:54:41" cache="/data/annotation/VEP/vep_cache/homo_sapiens_refseq/110_GRCh37" ensembl-funcgen=110.24e6da6 ensembl-variation=110.d34d25e ensembl-io=110.b1a0d57 ensembl=110.9eadbc2 1000genomes="phase3" COSMIC="92" ClinVar="202012" HGMD-PUBLIC="20204" assembly="GRCh37.p13" dbSNP="154" gencode="GENCODE 19" genebuild="2011-04" gnomADe="r2.1" polyphen="2.2.2" refseq="2020-10-26 17:03:42 - GCF_000001405.25_GRCh37.p13_genomic.gff" regbuild="1.0" sift="sift5.2.2"
+##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|ENSP|REFSEQ_MATCH|REFSEQ_OFFSET|SOURCE|SIFT|DOMAINS|HGVS_OFFSET|AF|CLIN_SIG|SOMATIC|PHENO|PUBMED|Grantham|SpliceRegion|NMD|Mastermind_MMID3|Mastermind_counts|MaxEntScan_alt|MaxEntScan_diff|MaxEntScan_ref|Aloft_Confidence|Aloft_pred|Aloft_prob_Dominant|Aloft_prob_Recessive|Aloft_prob_Tolerant|BayesDel_noAF_rankscore|CADD_raw_rankscore|ClinPred_rankscore|Ensembl_transcriptid|GERP++_RS|Interpro_domain|MetaLR_rankscore|REVEL_rankscore|VEST4_rankscore|ada_score|rf_score|SpliceAI_pred_DP_AG|SpliceAI_pred_DP_AL|SpliceAI_pred_DP_DG|SpliceAI_pred_DP_DL|SpliceAI_pred_DS_AG|SpliceAI_pred_DS_AL|SpliceAI_pred_DS_DG|SpliceAI_pred_DS_DL|SpliceAI_pred_SYMBOL|am_class|am_pathogenicity|gnomAD2|gnomAD2_AC|gnomAD2_AC_popmax|gnomAD2_AF|gnomAD2_AF_afr|gnomAD2_AF_amr|gnomAD2_AF_asj|gnomAD2_AF_eas|gnomAD2_AF_fin|gnomAD2_AF_nfe|gnomAD2_AF_oth|gnomAD2_AF_popmax|gnomAD2_AF_sas|gnomAD2_AN|gnomAD2_AN_popmax|gnomAD2_gnomad_filtered|gnomAD2_nhomalt|gnomAD2_nonpar|gnomAD2_popmax|phastCons100way_vertebrate|phastCons46way_mammalian|phyloP100way_vertebrate|phyloP46way_mammalian|REPEAT_MASKER|TopMed|TopMed_TOPMED|UK10k|UK10k_AF|COSMIC|COSMIC_CNT|COSMIC_LEGACY_ID">
+##Grantham=Grantham Matrix score - Grantham, R. Amino Acid Difference Formula to Help Explain Protein Evolution, Science 1974 Sep 6;185(4154):862-4.
+##SpliceRegion=SpliceRegion predictions
+##NMD=Nonsense-mediated mRNA decay escaping variants prediction
+##Mastermind_MMID3=Mastermind MMID3 variant identifier(s), as gene:key. Link to the Genomenon Mastermind Genomic Search Engine.
+##Mastermind_counts=Mastermind number of citations in the medical literature. Output includes three unique counts: MMCNT1|MMCNT2|MMCNT3. MMCNT1 - Count of Mastermind articles with cDNA matches for this specific variant; MMCNT2 - Count of Mastermind articles with variants either explicitly matching at the cDNA level or given only at protein level; MMCNT3 - Count of Mastermind articles including other DNA-level variants resulting in the same amino acid change.
+##MaxEntScan_alt=MaxEntScan alternate sequence score
+##MaxEntScan_diff=MaxEntScan score difference
+##MaxEntScan_ref=MaxEntScan reference sequence score
+##Aloft_Confidence=Aloft_Confidence from dbNSFP file
+##Aloft_pred=Aloft_pred from dbNSFP file
+##Aloft_prob_Dominant=Aloft_prob_Dominant from dbNSFP file
+##Aloft_prob_Recessive=Aloft_prob_Recessive from dbNSFP file
+##Aloft_prob_Tolerant=Aloft_prob_Tolerant from dbNSFP file
+##BayesDel_noAF_rankscore=BayesDel_noAF_rankscore from dbNSFP file
+##CADD_raw_rankscore=CADD_raw_rankscore from dbNSFP file
+##ClinPred_rankscore=ClinPred_rankscore from dbNSFP file
+##Ensembl_transcriptid=Ensembl_transcriptid from dbNSFP file
+##GERP++_RS=GERP++_RS from dbNSFP file
+##Interpro_domain=Interpro_domain from dbNSFP file
+##MetaLR_rankscore=MetaLR_rankscore from dbNSFP file
+##REVEL_rankscore=REVEL_rankscore from dbNSFP file
+##VEST4_rankscore=VEST4_rankscore from dbNSFP file
+##ada_score=dbscSNV ADA score
+##rf_score=dbscSNV RF score
+##SpliceAI_pred_DP_AG=SpliceAI predicted effect on splicing. Delta position for acceptor gain
+##SpliceAI_pred_DP_AL=SpliceAI predicted effect on splicing. Delta position for acceptor loss
+##SpliceAI_pred_DP_DG=SpliceAI predicted effect on splicing. Delta position for donor gain
+##SpliceAI_pred_DP_DL=SpliceAI predicted effect on splicing. Delta position for donor loss
+##SpliceAI_pred_DS_AG=SpliceAI predicted effect on splicing. Delta score for acceptor gain
+##SpliceAI_pred_DS_AL=SpliceAI predicted effect on splicing. Delta score for acceptor loss
+##SpliceAI_pred_DS_DG=SpliceAI predicted effect on splicing. Delta score for donor gain
+##SpliceAI_pred_DS_DL=SpliceAI predicted effect on splicing. Delta score for donor loss
+##SpliceAI_pred_SYMBOL=SpliceAI gene symbol
+##am_class=AlphaMissense pathogenicity prediction; column from /data/annotation/VEP/annotation_data/GRCh37/AlphaMissense_hg19.tsv.gz
+##am_pathogenicity=AlphaMissense pathogenicity score; column from /data/annotation/VEP/annotation_data/GRCh37/AlphaMissense_hg19.tsv.gz
+##INFO=<ID=gnomAD2,Number=.,Type=String,Description="[PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AC,Number=.,Type=String,Description="AC field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AC_popmax,Number=.,Type=String,Description="AC_popmax field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF,Number=.,Type=String,Description="AF field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_afr,Number=.,Type=String,Description="AF_afr field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_amr,Number=.,Type=String,Description="AF_amr field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_asj,Number=.,Type=String,Description="AF_asj field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_eas,Number=.,Type=String,Description="AF_eas field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_fin,Number=.,Type=String,Description="AF_fin field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_nfe,Number=.,Type=String,Description="AF_nfe field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_oth,Number=.,Type=String,Description="AF_oth field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_popmax,Number=.,Type=String,Description="AF_popmax field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_sas,Number=.,Type=String,Description="AF_sas field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AN,Number=.,Type=String,Description="AN field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AN_popmax,Number=.,Type=String,Description="AN_popmax field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_gnomad_filtered,Number=.,Type=String,Description="gnomad_filtered field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_nhomalt,Number=.,Type=String,Description="nhomalt field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_nonpar,Number=.,Type=String,Description="nonpar field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_popmax,Number=.,Type=String,Description="popmax field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=phastCons100way_vertebrate,Number=.,Type=String,Description="[PATH]/hg19.100way.phastCons.bw">
+##INFO=<ID=phastCons46way_mammalian,Number=.,Type=String,Description="[PATH]/hg19.phastCons46way.placental.bw">
+##INFO=<ID=phyloP100way_vertebrate,Number=.,Type=String,Description="[PATH]/hg19.100way.phyloP100way.bw">
+##INFO=<ID=phyloP46way_mammalian,Number=.,Type=String,Description="[PATH]/hg19.phyloP46way.placental.bw">
+##INFO=<ID=REPEAT_MASKER,Number=.,Type=String,Description="[PATH]/repeatmasker_hg19.bed.gz">
+##INFO=<ID=TopMed,Number=.,Type=String,Description="[PATH]/TOPMED_GRCh37.vcf.gz">
+##INFO=<ID=TopMed_TOPMED,Number=.,Type=String,Description="TOPMED field from [PATH]/TOPMED_GRCh37.vcf.gz">
+##INFO=<ID=UK10k,Number=.,Type=String,Description="[PATH]/UK10K_COHORT.20160215.sites.vcf.gz">
+##INFO=<ID=UK10k_AF,Number=.,Type=String,Description="AF field from [PATH]/UK10K_COHORT.20160215.sites.vcf.gz">
+##INFO=<ID=COSMIC,Number=.,Type=String,Description="[PATH]/CosmicCodingMuts_v95_20211101_grch37.normal.vcf.gz">
+##INFO=<ID=COSMIC_CNT,Number=.,Type=String,Description="CNT field from [PATH]/CosmicCodingMuts_v95_20211101_grch37.normal.vcf.gz">
+##INFO=<ID=COSMIC_LEGACY_ID,Number=.,Type=String,Description="LEGACY_ID field from [PATH]/CosmicCodingMuts_v95_20211101_grch37.normal.vcf.gz">
+##VEP-command-line='vep --af --assembly GRCh37 --biotype --cache --canonical --check_existing --compress_output gzip --custom [PATH]/CosmicCodingMuts_v95_20211101_grch37.normal.vcf.gz,COSMIC,vcf,exact,0,CNT,LEGACY_ID --database 0 --dir [PATH]/vep_cache --distance 5000 --domains --exclude_predicted --fasta [PATH]/GCF_000001405.25_GRCh37.p13_genomic.fna.gz --flag_pick --force_overwrite --hgvs --input_file [PATH]/test_grch37.vcf --no_escape --no_stats --numbers --offline --output_file [PATH]/test_grch37.vep_annotated.vcf.gz --plugin [PATH]/AlphaMissense_hg19.tsv.gz --protein --pubmed --refseq --sift b --symbol --transcript_version --use_given_ref --variant_class --vcf'
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+1	69098	.	C	G	.	.	variant_id=13629760;CSQ=G|missense_variant|MODERATE|OR4F5|79501|Transcript|NM_001005484.2|protein_coding|3/3||NM_001005484.2:c.71C>G|NP_001005484.2:p.Thr24Ser|131|71|24|T/S|aCt/aGt|||1||1|SNV|EntrezGene||YES|NP_001005484.2||||tolerated(1)||||||||58||||||||.&.&|.&.&|.&.&|.&.&|.&.&|0.05653|0.12098|0.08831|ENST00000641515&ENST00000335137|2.31|.&.|0.01092|0.14661|0.07811|||-5|35|-5|17|0.00|0.15|0.27|0.08|OR4F5||||||||||||||||||||||0.0020000000949949|0.966000020503998|0.894999980926514|1.23199999332428||||||||
+1	69589	.	G	A	.	.	variant_id=13629761;CSQ=A|missense_variant|MODERATE|OR4F5|79501|Transcript|NM_001005484.2|protein_coding|3/3||NM_001005484.2:c.562G>A|NP_001005484.2:p.Val188Ile|622|562|188|V/I|Gtc/Atc|COSV58736794||1||1|SNV|EntrezGene||YES|NP_001005484.2||||tolerated(0.41)|||||1|1||29|||OR4F5:V167I|0&1&1||||.&.&|.&.&|.&.&|.&.&|.&.&|0.01817|0.11649|0.18729|ENST00000641515&ENST00000335137|0.138|GPCR&_rhodopsin-like&_7TM&GPCR&_rhodopsin-like&_7TM|0.00039|0.11576|0.05287|||29|-46|-12|-1|0.00|0.00|0.00|0.11|OR4F5||||||||||||||||||||||0|0.814000010490417|-1.23099994659424|-0.108000002801418||||||COSV58736794|1|COSM6847255
+13	95839002	.	C	T	.	.	variant_id=13629762;CSQ=T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001105515.3|protein_coding|11/21||NM_001105515.3:c.1498G>A|NP_001098985.1:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001098985.1||||tolerated(0.11)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|benign|0.1101|rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001301829.2|protein_coding|11/30||NM_001301829.2:c.1498G>A|NP_001288758.1:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001288758.1||||tolerated(0.06)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|benign|0.1101|rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025749207|1.37699
997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001301830.2|protein_coding|10/20||NM_001301830.2:c.1273G>A|NP_001288759.1:p.Glu425Lys|1410|1273|425|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001288759.1||||tolerated(0.11)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|||rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_005845.5|protein_coding|11/31||NM_005845.5:c.1498G>A|NP_005836.2:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1||1|SNV|EntrezGene||YES|NP_005836.2||||tolerated(0.06)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|benign|0.1101|rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025
749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224|1|COSM7286401
+15	32928050	.	C	T	.	.	variant_id=13638004;CSQ=T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_001286479.3|protein_coding|11/12||NM_001286479.3:c.850C>T|NP_001273408.1:p.Arg284Ter|1398|850|284|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_001273408.1|||||||||0&1|0&1||||||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|||rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_001286480.3|protein_coding|12/13||NM_001286480.3:c.850C>T|NP_001273409.1:p.Arg284Ter|1506|850|284|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_001273409.1|||||||||0&1|0&1||||||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|||rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|intron_variant|MODIFIER|ARHGAP11A-SCG5|114118903|Transcript|NM_001368319.1|protein_coding||9/13|NM_001368319.1:c.1235+2741C>T|||||||rs776172390&COSV64380835||1|||SNV|EntrezGene||YES|NP_001355248.1|||||||||0&1|0&1|||||||||||||||||||||||||||||||||||||rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000
500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_014783.6|protein_coding|11/12||NM_014783.6:c.1417C>T|NP_055598.1:p.Arg473Ter|2125|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1||1|SNV|EntrezGene||YES|NP_055598.1|||||||||0&1|0&1||||||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|||rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_199357.3|protein_coding|11/11||NM_199357.3:c.1417C>T|NP_955389.1:p.Arg473Ter|2125|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_955389.1|||||||||0&1|0&1||||NMD_escaping_variant||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|||rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114
diff --git a/annotation/tests/test_data/test_columns_version3_grch38.vep_annotated.vcf b/annotation/tests/test_data/test_columns_version3_grch38.vep_annotated.vcf
new file mode 100644
index 000000000..b74781756
--- /dev/null
+++ b/annotation/tests/test_data/test_columns_version3_grch38.vep_annotated.vcf
@@ -0,0 +1,116 @@
+##fileformat=VCFv4.1
+##INFO=<ID=variant_id,Number=1,Type=Integer,Description="VariantGrid primary column">
+##contig=<ID=1,length=248956422,assembly=GRCh38>
+##contig=<ID=2,length=242193529,assembly=GRCh38>
+##contig=<ID=3,length=198295559,assembly=GRCh38>
+##contig=<ID=4,length=190214555,assembly=GRCh38>
+##contig=<ID=5,length=181538259,assembly=GRCh38>
+##contig=<ID=6,length=170805979,assembly=GRCh38>
+##contig=<ID=7,length=159345973,assembly=GRCh38>
+##contig=<ID=8,length=145138636,assembly=GRCh38>
+##contig=<ID=9,length=138394717,assembly=GRCh38>
+##contig=<ID=10,length=133797422,assembly=GRCh38>
+##contig=<ID=11,length=135086622,assembly=GRCh38>
+##contig=<ID=12,length=133275309,assembly=GRCh38>
+##contig=<ID=13,length=114364328,assembly=GRCh38>
+##contig=<ID=14,length=107043718,assembly=GRCh38>
+##contig=<ID=15,length=101991189,assembly=GRCh38>
+##contig=<ID=16,length=90338345,assembly=GRCh38>
+##contig=<ID=17,length=83257441,assembly=GRCh38>
+##contig=<ID=18,length=80373285,assembly=GRCh38>
+##contig=<ID=19,length=58617616,assembly=GRCh38>
+##contig=<ID=20,length=64444167,assembly=GRCh38>
+##contig=<ID=21,length=46709983,assembly=GRCh38>
+##contig=<ID=22,length=50818468,assembly=GRCh38>
+##contig=<ID=X,length=156040895,assembly=GRCh38>
+##contig=<ID=Y,length=57227415,assembly=GRCh38>
+##contig=<ID=MT,length=16569,assembly=GRCh38>
+##VEP="v110" time="2023-12-01 11:54:29" cache="/data/annotation/VEP/vep_cache/homo_sapiens_refseq/110_GRCh38" ensembl=110.9eadbc2 ensembl-io=110.b1a0d57 ensembl-funcgen=110.24e6da6 ensembl-variation=110.d34d25e 1000genomes="phase3" COSMIC="97" ClinVar="202301" HGMD-PUBLIC="20204" assembly="GRCh38.p14" dbSNP="154" gencode="GENCODE 44" genebuild="2014-07" gnomADe="r2.1.1" gnomADg="v3.1.2" polyphen="2.2.3" refseq="110 - GCF_000001405.40_GRCh38.p14_genomic.gff" regbuild="1.0" sift="6.2.1"
+##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|ENSP|REFSEQ_MATCH|REFSEQ_OFFSET|SOURCE|SIFT|DOMAINS|HGVS_OFFSET|AF|CLIN_SIG|SOMATIC|PHENO|PUBMED|Grantham|SpliceRegion|NMD|Mastermind_MMID3|Mastermind_counts|MaxEntScan_alt|MaxEntScan_diff|MaxEntScan_ref|Aloft_Confidence|Aloft_pred|Aloft_prob_Dominant|Aloft_prob_Recessive|Aloft_prob_Tolerant|BayesDel_noAF_rankscore|CADD_raw_rankscore|ClinPred_rankscore|Ensembl_transcriptid|GERP++_RS|Interpro_domain|MetaLR_rankscore|REVEL_rankscore|VEST4_rankscore|ada_score|rf_score|SpliceAI_pred_DP_AG|SpliceAI_pred_DP_AL|SpliceAI_pred_DP_DG|SpliceAI_pred_DP_DL|SpliceAI_pred_DS_AG|SpliceAI_pred_DS_AL|SpliceAI_pred_DS_DG|SpliceAI_pred_DS_DL|SpliceAI_pred_SYMBOL|am_class|am_pathogenicity|MaveDB_nt|MaveDB_pro|MaveDB_score|MaveDB_urn|gnomAD2|gnomAD2_AF|gnomAD4|gnomAD4_AC|gnomAD4_AC_grpmax|gnomAD4_AC_XY|gnomAD4_AF|gnomAD4_AF_afr|gnomAD4_AF_amr|gnomAD4_AF_asj|gnomAD4_AF_eas|gnomAD4_AF_fin|gnomAD4_AF_grpmax|gnomAD4_AF_mid|gnomAD4_AF_nfe|gnomAD4_AF_remaining|gnomAD4_AF_sas|gnomAD4_AF_XY|gnomAD4_AN|gnomAD4_AN_grpmax|gnomAD4_AN_XY|gnomAD4_faf95|gnomAD4_faf99|gnomAD4_fafmax_faf95_max|gnomAD4_fafmax_faf99_max|gnomAD4_gnomad_filtered|gnomAD4_grpmax|gnomAD4_nhomalt|gnomAD4_non_par|phastCons100way_vertebrate|phastCons30way_mammalian|phyloP100way_vertebrate|phyloP30way_mammalian|REPEAT_MASKER|TopMed|TopMed_TOPMED|UK10k|UK10k_AF|COSMIC|COSMIC_CNT|COSMIC_LEGACY_ID">
+##Grantham=Grantham Matrix score - Grantham, R. Amino Acid Difference Formula to Help Explain Protein Evolution, Science 1974 Sep 6;185(4154):862-4.
+##SpliceRegion=SpliceRegion predictions
+##NMD=Nonsense-mediated mRNA decay escaping variants prediction
+##Mastermind_MMID3=Mastermind MMID3 variant identifier(s), as gene:key. Link to the Genomenon Mastermind Genomic Search Engine.
+##Mastermind_counts=Mastermind number of citations in the medical literature. Output includes three unique counts: MMCNT1|MMCNT2|MMCNT3. MMCNT1 - Count of Mastermind articles with cDNA matches for this specific variant; MMCNT2 - Count of Mastermind articles with variants either explicitly matching at the cDNA level or given only at protein level; MMCNT3 - Count of Mastermind articles including other DNA-level variants resulting in the same amino acid change.
+##MaxEntScan_alt=MaxEntScan alternate sequence score
+##MaxEntScan_diff=MaxEntScan score difference
+##MaxEntScan_ref=MaxEntScan reference sequence score
+##Aloft_Confidence=Aloft_Confidence from dbNSFP file
+##Aloft_pred=Aloft_pred from dbNSFP file
+##Aloft_prob_Dominant=Aloft_prob_Dominant from dbNSFP file
+##Aloft_prob_Recessive=Aloft_prob_Recessive from dbNSFP file
+##Aloft_prob_Tolerant=Aloft_prob_Tolerant from dbNSFP file
+##BayesDel_noAF_rankscore=BayesDel_noAF_rankscore from dbNSFP file
+##CADD_raw_rankscore=CADD_raw_rankscore from dbNSFP file
+##ClinPred_rankscore=ClinPred_rankscore from dbNSFP file
+##Ensembl_transcriptid=Ensembl_transcriptid from dbNSFP file
+##GERP++_RS=GERP++_RS from dbNSFP file
+##Interpro_domain=Interpro_domain from dbNSFP file
+##MetaLR_rankscore=MetaLR_rankscore from dbNSFP file
+##REVEL_rankscore=REVEL_rankscore from dbNSFP file
+##VEST4_rankscore=VEST4_rankscore from dbNSFP file
+##ada_score=dbscSNV ADA score
+##rf_score=dbscSNV RF score
+##SpliceAI_pred_DP_AG=SpliceAI predicted effect on splicing. Delta position for acceptor gain
+##SpliceAI_pred_DP_AL=SpliceAI predicted effect on splicing. Delta position for acceptor loss
+##SpliceAI_pred_DP_DG=SpliceAI predicted effect on splicing. Delta position for donor gain
+##SpliceAI_pred_DP_DL=SpliceAI predicted effect on splicing. Delta position for donor loss
+##SpliceAI_pred_DS_AG=SpliceAI predicted effect on splicing. Delta score for acceptor gain
+##SpliceAI_pred_DS_AL=SpliceAI predicted effect on splicing. Delta score for acceptor loss
+##SpliceAI_pred_DS_DG=SpliceAI predicted effect on splicing. Delta score for donor gain
+##SpliceAI_pred_DS_DL=SpliceAI predicted effect on splicing. Delta score for donor loss
+##SpliceAI_pred_SYMBOL=SpliceAI gene symbol
+##am_class=AlphaMissense pathogenicity prediction; column from /data/annotation/VEP/annotation_data/GRCh38/AlphaMissense_hg38.tsv.gz
+##am_pathogenicity=AlphaMissense pathogenicity score; column from /data/annotation/VEP/annotation_data/GRCh38/AlphaMissense_hg38.tsv.gz
+##MaveDB_nt=MaveDB HGVS (nucleotide); column from MaveDB_variants.tsv.gz
+##MaveDB_pro=MaveDB HGVS (protein); column from MaveDB_variants.tsv.gz
+##MaveDB_score=MaveDB score - see MaveDB for interpretation of scores; column from MaveDB_variants.tsv.gz
+##MaveDB_urn=MaveDB database identifier; column from MaveDB_variants.tsv.gz
+##INFO=<ID=gnomAD2,Number=.,Type=String,Description="[PATH]/gnomad2.1.1_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF,Number=.,Type=String,Description="AF field from [PATH]/gnomad2.1.1_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4,Number=.,Type=String,Description="[PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_AC,Number=.,Type=String,Description="AC field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_AC_grpmax,Number=.,Type=String,Description="AC_grpmax field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_AC_XY,Number=.,Type=String,Description="AC_XY field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_AF,Number=.,Type=String,Description="AF field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_AF_afr,Number=.,Type=String,Description="AF_afr field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_AF_amr,Number=.,Type=String,Description="AF_amr field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_AF_asj,Number=.,Type=String,Description="AF_asj field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_AF_eas,Number=.,Type=String,Description="AF_eas field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_AF_fin,Number=.,Type=String,Description="AF_fin field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_AF_grpmax,Number=.,Type=String,Description="AF_grpmax field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_AF_mid,Number=.,Type=String,Description="AF_mid field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_AF_nfe,Number=.,Type=String,Description="AF_nfe field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_AF_remaining,Number=.,Type=String,Description="AF_remaining field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_AF_sas,Number=.,Type=String,Description="AF_sas field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_AF_XY,Number=.,Type=String,Description="AF_XY field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_AN,Number=.,Type=String,Description="AN field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_AN_grpmax,Number=.,Type=String,Description="AN_grpmax field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_AN_XY,Number=.,Type=String,Description="AN_XY field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_faf95,Number=.,Type=String,Description="faf95 field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_faf99,Number=.,Type=String,Description="faf99 field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_fafmax_faf95_max,Number=.,Type=String,Description="fafmax_faf95_max field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_fafmax_faf99_max,Number=.,Type=String,Description="fafmax_faf99_max field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_gnomad_filtered,Number=.,Type=String,Description="gnomad_filtered field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_grpmax,Number=.,Type=String,Description="grpmax field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_nhomalt,Number=.,Type=String,Description="nhomalt field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD4_non_par,Number=.,Type=String,Description="non_par field from [PATH]/gnomad4.0_GRCh38_combined_af.vcf.bgz">
+##INFO=<ID=phastCons100way_vertebrate,Number=.,Type=String,Description="[PATH]/hg38.phastCons100way.bw">
+##INFO=<ID=phastCons30way_mammalian,Number=.,Type=String,Description="[PATH]/hg38.phastCons30way.bw">
+##INFO=<ID=phyloP100way_vertebrate,Number=.,Type=String,Description="[PATH]/hg38.phyloP100way.bw">
+##INFO=<ID=phyloP30way_mammalian,Number=.,Type=String,Description="[PATH]/hg38.phyloP30way.bw">
+##INFO=<ID=REPEAT_MASKER,Number=.,Type=String,Description="[PATH]/repeatmasker_hg38.bed.gz">
+##INFO=<ID=TopMed,Number=.,Type=String,Description="[PATH]/TOPMED_GRCh38_20180418.vcf.gz">
+##INFO=<ID=TopMed_TOPMED,Number=.,Type=String,Description="TOPMED field from [PATH]/TOPMED_GRCh38_20180418.vcf.gz">
+##INFO=<ID=UK10k,Number=.,Type=String,Description="[PATH]/UK10K_COHORT.20160215.sites.GRCh38.vcf.gz">
+##INFO=<ID=UK10k_AF,Number=.,Type=String,Description="AF field from [PATH]/UK10K_COHORT.20160215.sites.GRCh38.vcf.gz">
+##INFO=<ID=COSMIC,Number=.,Type=String,Description="[PATH]/CosmicCodingMuts_v95_20211101_grch38.normal.vcf.gz">
+##INFO=<ID=COSMIC_CNT,Number=.,Type=String,Description="CNT field from [PATH]/CosmicCodingMuts_v95_20211101_grch38.normal.vcf.gz">
+##INFO=<ID=COSMIC_LEGACY_ID,Number=.,Type=String,Description="LEGACY_ID field from [PATH]/CosmicCodingMuts_v95_20211101_grch38.normal.vcf.gz">
+##VEP-command-line='vep --af --assembly GRCh38 --biotype --cache --canonical --check_existing --compress_output gzip --custom [PATH]/CosmicCodingMuts_v95_20211101_grch38.normal.vcf.gz,COSMIC,vcf,exact,0,CNT,LEGACY_ID --database 0 --dir [PATH]/vep_cache --distance 5000 --domains --exclude_predicted --fasta [PATH]/GCF_000001405.39_GRCh38.p13_genomic.fna.gz --flag_pick --force_overwrite --hgvs --input_file [PATH]/test_grch38.vcf --no_escape --no_stats --numbers --offline --output_file [PATH]/test_grch38.vep_annotated.vcf.gz --plugin [PATH]/MaveDB_variants.tsv.gz,single_aminoacid_changes=0,transcript_match=0 --protein --pubmed --refseq --sift b --symbol --transcript_version --use_given_ref --variant_class --vcf'
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+1	69113	.	T	A	.	.	variant_id=131165;CSQ=A|missense_variant|MODERATE|OR4F5|79501|Transcript|NM_001005484.2|protein_coding|3/3||NM_001005484.2:c.86T>A|NP_001005484.2:p.Leu29Gln|146|86|29|L/Q|cTg/cAg|||1||1|SNV|EntrezGene||YES|NP_001005484.2||||deleterious(0.02)||||||||113||||||||.&.&|.&.&|.&.&|.&.&|.&.&|0.18177|0.37495|0.36595|ENST00000641515&ENST00000335137|2.31|.&.|0.00994|0.27654|0.49146|||2|20|-5|20|0.05|0.17|0.01|0.04|OR4F5|||||||||NC_000001.11:69113-69113|1|1||0.000003|0.000000|0.000000|0.000000|0.000000|0.000000|5.681366254956992e-06|0.000000|0.000006|0.000000|0.000000||306558|176014||0.0|0.0|.|.|1|nfe|0|.|0.00100000004749745|0.967999994754791|1.01499998569489|0.986999988555908||||||||
+1	1020216	.	CG	GT	.	.	variant_id=131166;CSQ=GT|missense_variant|MODERATE|AGRN|375790|Transcript|NM_001305275.2|protein_coding|1/39||NM_001305275.2:c.44_45delinsGT|NP_001292204.1:p.Pro15Arg|97-98|44-45|15|P/R|cCG/cGT|rs1553170743||1|||substitution|EntrezGene|||NP_001292204.1||||tolerated_low_confidence(0.08)||||uncertain_significance||1||103||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||1&0.619000017642975|0.651000022888184&0.649999976158142|1.41199994087219&0.852999985218048|0.0419999994337559&0.894999980926514||||||||,GT|missense_variant|MODERATE|AGRN|375790|Transcript|NM_198576.4|protein_coding|1/36||NM_198576.4:c.44_45delinsGT|NP_940978.2:p.Pro15Arg|97-98|44-45|15|P/R|cCG/cGT|rs1553170743||1||1|substitution|EntrezGene||YES|NP_940978.2||||tolerated_low_confidence(0.08)||||uncertain_significance||1||103||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||1&0.619000017642975|0.651000022888184&0.649999976158142|1.41199994087219&0.852999985218048|0.0419999994337559&0.894999980926514||||||||
+1	68440906	.	C	T	.	.	variant_id=131167;CSQ=T|missense_variant|MODERATE|RPE65|6121|Transcript|NM_000329.3|protein_coding|6/14||NM_000329.3:c.590G>A|NP_000320.1:p.Gly197Glu|639|590|197|G/E|gGa/gAa|COSV52017509||-1||1|SNV|EntrezGene||YES|NP_000320.1||||deleterious(0)|||||1|1||98||||||||.&|.&|.&|.&|.&|0.93672|0.69004|0.89503|ENST00000262340|5.43||0.97965|0.97022|0.95374|||-35|43|13|-49|0.00|0.00|0.01|0.00|RPE65|likely_pathogenic|0.9466||||||||||||||||||||||||||||||||||1|0.999000012874603|5.75400018692017|1.02600002288818||||||COSV52017509|2|COSM5924206
+13	95186748	.	C	T	.	.	variant_id=24601;CSQ=T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001105515.3|protein_coding|11/21||NM_001105515.3:c.1498G>A|NP_001098985.1:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001098985.1||||tolerated_low_confidence(0.07)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|likely_benign|0.1101|||||13:95186748-95186748|0.000145|rs145886106|109|87||0.000068|0.001160|0.000133|0.000000|0.000000|0.000000|0.0011598453539528064|0.000000|0.000005|0.000128|0.000000||1614060|75010||0.0002782499941531569|0.00025161998928524554|0.0009378800168633461|0.0008432099712081254|0|afr|0|.|0.86599999666214|0.990000009536743|1.43400001525879|1.02600002288818||rs145886106|0.000414118|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001301829.2|protein_coding|11/30||NM_001301829.2:c.1498G>A|NP_001288758.1:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001288758.1||||tolerated_low_confidence(0.08)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|likely_benign|0.1101|||||13:9
5186748-95186748|0.000145|rs145886106|109|87||0.000068|0.001160|0.000133|0.000000|0.000000|0.000000|0.0011598453539528064|0.000000|0.000005|0.000128|0.000000||1614060|75010||0.0002782499941531569|0.00025161998928524554|0.0009378800168633461|0.0008432099712081254|0|afr|0|.|0.86599999666214|0.990000009536743|1.43400001525879|1.02600002288818||rs145886106|0.000414118|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001301830.2|protein_coding|10/20||NM_001301830.2:c.1273G>A|NP_001288759.1:p.Glu425Lys|1410|1273|425|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001288759.1||||tolerated_low_confidence(0.08)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|||||||13:95186748-95186748|0.000145|rs145886106|109|87||0.000068|0.001160|0.000133|0.000000|0.000000|0.000000|0.0011598453539528064|0.000000|0.000005|0.000128|0.000000||1614060|75010||0.0002782499941531569|0.00025161998928524554|0.0009378800168633461|0.0008432099712081254|0|afr|0|.|0.86599999666214|0.990000009536743|1.43400001525879|1.02600002288818||rs145886106|0.000414118|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_005845.5|protein_coding|11/31||NM_005845.5:c.1498G>A|NP_005836.2:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1||1|SNV|EntrezGene||YES|NP_005836.2||||tolerated_low_confidence(0.07)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST000005
36256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|likely_benign|0.1101|||||13:95186748-95186748|0.000145|rs145886106|109|87||0.000068|0.001160|0.000133|0.000000|0.000000|0.000000|0.0011598453539528064|0.000000|0.000005|0.000128|0.000000||1614060|75010||0.0002782499941531569|0.00025161998928524554|0.0009378800168633461|0.0008432099712081254|0|afr|0|.|0.86599999666214|0.990000009536743|1.43400001525879|1.02600002288818||rs145886106|0.000414118|||COSV65320224|1|COSM7286401
+15	32635849	.	C	T	.	.	variant_id=42;CSQ=T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_001286479.3|protein_coding|11/12||NM_001286479.3:c.850C>T|NP_001273408.1:p.Arg284Ter|1398|850|284|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_001273408.1|||||||||0&1|0&1||||||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|||||||15:32635849-32635849|1.6e-05|rs776172390|13|2||0.000008|0.000027|0.000000|0.000000|0.000000|0.000000|2.6797438164911436e-05|0.000000|0.000008|0.000016|0.000000||1609462|74634||5.259999852569308e-06|2.530000074330019e-06|8.029999662539922e-06|3.000000106112566e-06|0|afr|0|.|0.00899999961256981|0.839999973773956|0.433999985456467|0.282000005245209||rs776172390|3.18550e-05|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_001286480.3|protein_coding|12/13||NM_001286480.3:c.850C>T|NP_001273409.1:p.Arg284Ter|1506|850|284|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_001273409.1|||||||||0&1|0&1||||||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|||||||15:32635849-32635849|1.6e-05|rs776172390|13|2||0.000008|0.000027|0.000000|0.000000|0.000000|0.000000|2.6797438164911436e-05|0.000000|0.000008|0.000016|0.000000||1609462|74634||5.259999852569308e-06|2.530000074330019e-06|8.029999662539922e-06|3.000000106112566e-06|0|afr|0|.|0.00899999961256981|0.839999973773956|0.433999985456467|0.282000005245209||rs776172390|3.18550e-05|||COSV64380835|3|COSM185114,T|intron_variant|MODIFIER|ARHGAP11A-SCG5|114118903|Transcript|NM_001368319.1|protein_coding||9/13|NM_001368319.1:c.1235+2741C>T|||||||rs776172390&COSV64380
835||1|||SNV|EntrezGene||YES|NP_001355248.1|||||||||0&1|0&1|||||||||||||||||||||||||||||||||||||||||15:32635849-32635849|1.6e-05|rs776172390|13|2||0.000008|0.000027|0.000000|0.000000|0.000000|0.000000|2.6797438164911436e-05|0.000000|0.000008|0.000016|0.000000||1609462|74634||5.259999852569308e-06|2.530000074330019e-06|8.029999662539922e-06|3.000000106112566e-06|0|afr|0|.|0.00899999961256981|0.839999973773956|0.433999985456467|0.282000005245209||rs776172390|3.18550e-05|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_014783.6|protein_coding|11/12||NM_014783.6:c.1417C>T|NP_055598.1:p.Arg473Ter|2125|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1||1|SNV|EntrezGene||YES|NP_055598.1|||||||||0&1|0&1||||||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|||||||15:32635849-32635849|1.6e-05|rs776172390|13|2||0.000008|0.000027|0.000000|0.000000|0.000000|0.000000|2.6797438164911436e-05|0.000000|0.000008|0.000016|0.000000||1609462|74634||5.259999852569308e-06|2.530000074330019e-06|8.029999662539922e-06|3.000000106112566e-06|0|afr|0|.|0.00899999961256981|0.839999973773956|0.433999985456467|0.282000005245209||rs776172390|3.18550e-05|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_199357.3|protein_coding|11/11||NM_199357.3:c.1417C>T|NP_955389.1:p.Arg473Ter|2125|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_955389.1|||||||||0&1|0&1||||NMD_escaping_variant||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|||||||15:32635849-32635849|1.6e-05|rs776172390|13|2||0.000008|0.000027|0.000000|0.000000|0.000000|
0.000000|2.6797438164911436e-05|0.000000|0.000008|0.000016|0.000000||1609462|74634||5.259999852569308e-06|2.530000074330019e-06|8.029999662539922e-06|3.000000106112566e-06|0|afr|0|.|0.00899999961256981|0.839999973773956|0.433999985456467|0.282000005245209||rs776172390|3.18550e-05|||COSV64380835|3|COSM185114

From b396b8caf7aab562d66dc3f1334f97faa8013a21 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Fri, 1 Dec 2023 13:50:54 +1030
Subject: [PATCH 12/29] Use new gnomADv2 fields for 37 test generation

---
 ..._columns_version2_grch37.vep_annotated.vcf | 95 ++++++++++---------
 1 file changed, 52 insertions(+), 43 deletions(-)

diff --git a/annotation/tests/test_data/test_columns_version2_grch37.vep_annotated.vcf b/annotation/tests/test_data/test_columns_version2_grch37.vep_annotated.vcf
index b87b50674..8a8253d6c 100644
--- a/annotation/tests/test_data/test_columns_version2_grch37.vep_annotated.vcf
+++ b/annotation/tests/test_data/test_columns_version2_grch37.vep_annotated.vcf
@@ -25,31 +25,30 @@
 ##contig=<ID=X,length=155270560,assembly=hg19>
 ##contig=<ID=Y,length=59373566,assembly=hg19>
 ##contig=<ID=MT,length=16569,assembly=hg19>
-##VEP="v106" time="2022-05-19 12:29:43" cache="/data/annotation/VEP/vep_cache/homo_sapiens/106_GRCh37" ensembl-variation=106.2aa7a5d ensembl=106.f4b50c6 ensembl-io=106.6eafdaa ensembl-funcgen=106.027e023 1000genomes="phase3" COSMIC="92" ClinVar="202012" HGMD-PUBLIC="20204" assembly="GRCh37.p13" dbSNP="154" gencode="GENCODE 19" genebuild="2011-04" gnomAD="r2.1" polyphen="2.2.2" regbuild="1.0" sift="sift5.2.2"
-##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|ENSP|SOURCE|SIFT|DOMAINS|HGVS_OFFSET|AF|CLIN_SIG|SOMATIC|PHENO|PUBMED|Grantham|SpliceRegion|LoFtool|NMD|Mastermind_MMID3|Mastermind_counts|MaxEntScan_alt|MaxEntScan_diff|MaxEntScan_ref|Aloft_Confidence|Aloft_pred|Aloft_prob_Dominant|Aloft_prob_Recessive|Aloft_prob_Tolerant|BayesDel_noAF_rankscore|CADD_raw_rankscore|ClinPred_rankscore|Ensembl_transcriptid|GERP++_RS|Interpro_domain|MetaLR_rankscore|REVEL_rankscore|VEST4_rankscore|ada_score|rf_score|SpliceAI_pred_DP_AG|SpliceAI_pred_DP_AL|SpliceAI_pred_DP_DG|SpliceAI_pred_DP_DL|SpliceAI_pred_DS_AG|SpliceAI_pred_DS_AL|SpliceAI_pred_DS_DG|SpliceAI_pred_DS_DL|SpliceAI_pred_SYMBOL|gnomAD2|gnomAD2_AF|gnomAD2_AF_afr|gnomAD2_AF_amr|gnomAD2_AF_asj|gnomAD2_AF_eas|gnomAD2_AF_fin|gnomAD2_AF_nfe|gnomAD2_AF_oth|gnomAD2_AF_popmax|gnomAD2_AF_sas|gnomAD2_gnomad_filtered|gnomAD2_nhomalt|gnomAD2_popmax|REPEAT_MASKER|TopMed|TopMed_TOPMED|UK10k|UK10k_AF|COSMIC|COSMIC_CNT|COSMIC_LEGACY_ID">
+##VEP="v110" time="2023-12-01 13:33:59" cache="/data/annotation/VEP/vep_cache/homo_sapiens_refseq/110_GRCh37" ensembl-io=110.b1a0d57 ensembl-variation=110.d34d25e ensembl-funcgen=110.24e6da6 ensembl=110.9eadbc2 1000genomes="phase3" COSMIC="92" ClinVar="202012" HGMD-PUBLIC="20204" assembly="GRCh37.p13" dbSNP="154" gencode="GENCODE 19" genebuild="2011-04" gnomADe="r2.1" polyphen="2.2.2" refseq="2020-10-26 17:03:42 - GCF_000001405.25_GRCh37.p13_genomic.gff" regbuild="1.0" sift="sift5.2.2"
+##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|ENSP|REFSEQ_MATCH|REFSEQ_OFFSET|SOURCE|SIFT|DOMAINS|HGVS_OFFSET|AF|CLIN_SIG|SOMATIC|PHENO|PUBMED|Grantham|SpliceRegion|NMD|Mastermind_MMID3|Mastermind_counts|MaxEntScan_alt|MaxEntScan_diff|MaxEntScan_ref|Aloft_Confidence|Aloft_pred|Aloft_prob_Dominant|Aloft_prob_Recessive|Aloft_prob_Tolerant|BayesDel_noAF_rankscore|CADD_raw_rankscore|ClinPred_rankscore|Ensembl_transcriptid|GERP++_RS|Interpro_domain|MetaLR_rankscore|REVEL_rankscore|VEST4_rankscore|ada_score|rf_score|SpliceAI_pred_DP_AG|SpliceAI_pred_DP_AL|SpliceAI_pred_DP_DG|SpliceAI_pred_DP_DL|SpliceAI_pred_DS_AG|SpliceAI_pred_DS_AL|SpliceAI_pred_DS_DG|SpliceAI_pred_DS_DL|SpliceAI_pred_SYMBOL|gnomAD2|gnomAD2_AC|gnomAD2_AC_popmax|gnomAD2_AF|gnomAD2_AF_afr|gnomAD2_AF_amr|gnomAD2_AF_asj|gnomAD2_AF_eas|gnomAD2_AF_fin|gnomAD2_AF_nfe|gnomAD2_AF_oth|gnomAD2_AF_popmax|gnomAD2_AF_sas|gnomAD2_AN|gnomAD2_AN_popmax|gnomAD2_gnomad_filtered|gnomAD2_nhomalt|gnomAD2_nonpar|gnomAD2_popmax|phastCons100way_vertebrate|phastCons46way_mammalian|phyloP100way_vertebrate|phyloP46way_mammalian|REPEAT_MASKER|TopMed|TopMed_TOPMED|UK10k|UK10k_AF|COSMIC|COSMIC_CNT|COSMIC_LEGACY_ID">
 ##Grantham=Grantham Matrix score - Grantham, R. Amino Acid Difference Formula to Help Explain Protein Evolution, Science 1974 Sep 6;185(4154):862-4.
 ##SpliceRegion=SpliceRegion predictions
-##LoFtool=LoFtool score for gene
 ##NMD=Nonsense-mediated mRNA decay escaping variants prediction
 ##Mastermind_MMID3=Mastermind MMID3 variant identifier(s), as gene:key. Link to the Genomenon Mastermind Genomic Search Engine.
 ##Mastermind_counts=Mastermind number of citations in the medical literature. Output includes three unique counts: MMCNT1|MMCNT2|MMCNT3. MMCNT1 - Count of Mastermind articles with cDNA matches for this specific variant; MMCNT2 - Count of Mastermind articles with variants either explicitly matching at the cDNA level or given only at protein level; MMCNT3 - Count of Mastermind articles including other DNA-level variants resulting in the same amino acid change.
 ##MaxEntScan_alt=MaxEntScan alternate sequence score
 ##MaxEntScan_diff=MaxEntScan score difference
 ##MaxEntScan_ref=MaxEntScan reference sequence score
-##Aloft_Confidence=(from dbNSFP) Confidence level of Aloft_pred; values can be "High Confidence" (p < 0.05) or "Low Confidence" (p > 0.05) multiple values separated by ";", corresponding to Ensembl_proteinid.
-##Aloft_pred=(from dbNSFP) final classification predicted by ALoFT; values can be Tolerant, Recessive or Dominant multiple values separated by ";", corresponding to Ensembl_proteinid.
-##Aloft_prob_Dominant=(from dbNSFP) Probability of the SNP being classified as dominant disease-causing by ALoFT multiple values separated by ";", corresponding to Ensembl_proteinid.
-##Aloft_prob_Recessive=(from dbNSFP) Probability of the SNP being classified as recessive disease-causing by ALoFT multiple values separated by ";", corresponding to Ensembl_proteinid.
-##Aloft_prob_Tolerant=(from dbNSFP) Probability of the SNP being classified as benign by ALoFT multiple values separated by ";", corresponding to Ensembl_proteinid.
-##BayesDel_noAF_rankscore=(from dbNSFP) BayesDel_noAF scores were ranked among all BayesDel_noAF scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of BayesDel_noAF scores in dbNSFP.
-##CADD_raw_rankscore=(from dbNSFP) CADD raw scores were ranked among all CADD raw scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of CADD raw scores in dbNSFP. Please note the following copyright statement for CADD: "CADD scores (http://cadd.gs.washington.edu/) are Copyright 2013 University of Washington and Hudson-Alpha Institute for Biotechnology (all rights reserved) but are freely available for all academic, non-commercial applications. For commercial licensing information contact Jennifer McCullar (mccullaj@uw.edu)."
-##ClinPred_rankscore=(from dbNSFP) ClinPred scores were ranked among all ClinPred scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of ClinPred scores in dbNSFP.
-##Ensembl_transcriptid=(from dbNSFP) Ensembl transcript ids (Multiple entries separated by ";")
-##GERP++_RS=(from dbNSFP) GERP++ RS score, the larger the score, the more conserved the site. Scores range from -12.3 to 6.17.
-##Interpro_domain=(from dbNSFP) domain or conserved site on which the variant locates. Domain annotations come from Interpro database. The number in the brackets following a specific domain is the count of times Interpro assigns the variant position to that domain, typically coming from different predicting databases. Multiple entries separated by ";".
-##MetaLR_rankscore=(from dbNSFP) MetaLR scores were ranked among all MetaLR scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of MetaLR scores in dbNSFP. The scores range from 0 to 1.
-##REVEL_rankscore=(from dbNSFP) REVEL scores were ranked among all REVEL scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of REVEL scores in dbNSFP.
-##VEST4_rankscore=(from dbNSFP) VEST4 scores were ranked among all VEST4 scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of VEST4 scores in dbNSFP. In case there are multiple scores for the same variant, the largest score (most damaging) is presented. The scores range from 0 to 1. Please note VEST score is free for non-commercial use. For more details please refer to http://wiki.chasmsoftware.org/index.php/SoftwareLicense. Commercial users should contact the Johns Hopkins Technology Transfer office.
+##Aloft_Confidence=Aloft_Confidence from dbNSFP file
+##Aloft_pred=Aloft_pred from dbNSFP file
+##Aloft_prob_Dominant=Aloft_prob_Dominant from dbNSFP file
+##Aloft_prob_Recessive=Aloft_prob_Recessive from dbNSFP file
+##Aloft_prob_Tolerant=Aloft_prob_Tolerant from dbNSFP file
+##BayesDel_noAF_rankscore=BayesDel_noAF_rankscore from dbNSFP file
+##CADD_raw_rankscore=CADD_raw_rankscore from dbNSFP file
+##ClinPred_rankscore=ClinPred_rankscore from dbNSFP file
+##Ensembl_transcriptid=Ensembl_transcriptid from dbNSFP file
+##GERP++_RS=GERP++_RS from dbNSFP file
+##Interpro_domain=Interpro_domain from dbNSFP file
+##MetaLR_rankscore=MetaLR_rankscore from dbNSFP file
+##REVEL_rankscore=REVEL_rankscore from dbNSFP file
+##VEST4_rankscore=VEST4_rankscore from dbNSFP file
 ##ada_score=dbscSNV ADA score
 ##rf_score=dbscSNV RF score
 ##SpliceAI_pred_DP_AG=SpliceAI predicted effect on splicing. Delta position for acceptor gain
@@ -61,30 +60,40 @@
 ##SpliceAI_pred_DS_DG=SpliceAI predicted effect on splicing. Delta score for donor gain
 ##SpliceAI_pred_DS_DL=SpliceAI predicted effect on splicing. Delta score for donor loss
 ##SpliceAI_pred_SYMBOL=SpliceAI gene symbol
-##INFO=<ID=gnomAD2,Number=.,Type=String,Description="/data/annotation/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz (exact)">
-##INFO=<ID=gnomAD2_AF,Number=.,Type=String,Description="AF field from /data/annotation/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_AF_afr,Number=.,Type=String,Description="AF_afr field from /data/annotation/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_AF_amr,Number=.,Type=String,Description="AF_amr field from /data/annotation/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_AF_asj,Number=.,Type=String,Description="AF_asj field from /data/annotation/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_AF_eas,Number=.,Type=String,Description="AF_eas field from /data/annotation/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_AF_fin,Number=.,Type=String,Description="AF_fin field from /data/annotation/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_AF_nfe,Number=.,Type=String,Description="AF_nfe field from /data/annotation/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_AF_oth,Number=.,Type=String,Description="AF_oth field from /data/annotation/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_AF_popmax,Number=.,Type=String,Description="AF_popmax field from /data/annotation/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_AF_sas,Number=.,Type=String,Description="AF_sas field from /data/annotation/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_gnomad_filtered,Number=.,Type=String,Description="gnomad_filtered field from /data/annotation/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_nhomalt,Number=.,Type=String,Description="nhomalt field from /data/annotation/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_popmax,Number=.,Type=String,Description="popmax field from /data/annotation/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=REPEAT_MASKER,Number=.,Type=String,Description="/data/annotation/VEP/annotation_data/GRCh37/repeatmasker_hg19.bed.gz (overlap)">
-##INFO=<ID=TopMed,Number=.,Type=String,Description="/data/annotation/VEP/annotation_data/GRCh37/TOPMED_GRCh37.vcf.gz (exact)">
-##INFO=<ID=TopMed_TOPMED,Number=.,Type=String,Description="TOPMED field from /data/annotation/VEP/annotation_data/GRCh37/TOPMED_GRCh37.vcf.gz">
-##INFO=<ID=UK10k,Number=.,Type=String,Description="/data/annotation/VEP/annotation_data/GRCh37/UK10K_COHORT.20160215.sites.vcf.gz (exact)">
-##INFO=<ID=UK10k_AF,Number=.,Type=String,Description="AF field from /data/annotation/VEP/annotation_data/GRCh37/UK10K_COHORT.20160215.sites.vcf.gz">
-##INFO=<ID=COSMIC,Number=.,Type=String,Description="/data/annotation/VEP/annotation_data/GRCh37/CosmicCodingMuts_v95_20211101_grch37.normal.vcf.gz (exact)">
-##INFO=<ID=COSMIC_CNT,Number=.,Type=String,Description="CNT field from /data/annotation/VEP/annotation_data/GRCh37/CosmicCodingMuts_v95_20211101_grch37.normal.vcf.gz">
-##INFO=<ID=COSMIC_LEGACY_ID,Number=.,Type=String,Description="LEGACY_ID field from /data/annotation/VEP/annotation_data/GRCh37/CosmicCodingMuts_v95_20211101_grch37.normal.vcf.gz">
+##INFO=<ID=gnomAD2,Number=.,Type=String,Description="[PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AC,Number=.,Type=String,Description="AC field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AC_popmax,Number=.,Type=String,Description="AC_popmax field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF,Number=.,Type=String,Description="AF field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_afr,Number=.,Type=String,Description="AF_afr field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_amr,Number=.,Type=String,Description="AF_amr field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_asj,Number=.,Type=String,Description="AF_asj field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_eas,Number=.,Type=String,Description="AF_eas field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_fin,Number=.,Type=String,Description="AF_fin field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_nfe,Number=.,Type=String,Description="AF_nfe field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_oth,Number=.,Type=String,Description="AF_oth field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_popmax,Number=.,Type=String,Description="AF_popmax field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_sas,Number=.,Type=String,Description="AF_sas field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AN,Number=.,Type=String,Description="AN field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AN_popmax,Number=.,Type=String,Description="AN_popmax field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_gnomad_filtered,Number=.,Type=String,Description="gnomad_filtered field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_nhomalt,Number=.,Type=String,Description="nhomalt field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_nonpar,Number=.,Type=String,Description="nonpar field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_popmax,Number=.,Type=String,Description="popmax field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=phastCons100way_vertebrate,Number=.,Type=String,Description="[PATH]/hg19.100way.phastCons.bw">
+##INFO=<ID=phastCons46way_mammalian,Number=.,Type=String,Description="[PATH]/hg19.phastCons46way.placental.bw">
+##INFO=<ID=phyloP100way_vertebrate,Number=.,Type=String,Description="[PATH]/hg19.100way.phyloP100way.bw">
+##INFO=<ID=phyloP46way_mammalian,Number=.,Type=String,Description="[PATH]/hg19.phyloP46way.placental.bw">
+##INFO=<ID=REPEAT_MASKER,Number=.,Type=String,Description="[PATH]/repeatmasker_hg19.bed.gz">
+##INFO=<ID=TopMed,Number=.,Type=String,Description="[PATH]/TOPMED_GRCh37.vcf.gz">
+##INFO=<ID=TopMed_TOPMED,Number=.,Type=String,Description="TOPMED field from [PATH]/TOPMED_GRCh37.vcf.gz">
+##INFO=<ID=UK10k,Number=.,Type=String,Description="[PATH]/UK10K_COHORT.20160215.sites.vcf.gz">
+##INFO=<ID=UK10k_AF,Number=.,Type=String,Description="AF field from [PATH]/UK10K_COHORT.20160215.sites.vcf.gz">
+##INFO=<ID=COSMIC,Number=.,Type=String,Description="[PATH]/CosmicCodingMuts_v95_20211101_grch37.normal.vcf.gz">
+##INFO=<ID=COSMIC_CNT,Number=.,Type=String,Description="CNT field from [PATH]/CosmicCodingMuts_v95_20211101_grch37.normal.vcf.gz">
+##INFO=<ID=COSMIC_LEGACY_ID,Number=.,Type=String,Description="LEGACY_ID field from [PATH]/CosmicCodingMuts_v95_20211101_grch37.normal.vcf.gz">
+##VEP-command-line='vep --af --assembly GRCh37 --biotype --cache --canonical --check_existing --compress_output gzip --custom [PATH]/CosmicCodingMuts_v95_20211101_grch37.normal.vcf.gz,COSMIC,vcf,exact,0,CNT,LEGACY_ID --database 0 --dir [PATH]/vep_cache --distance 5000 --domains --exclude_predicted --fasta [PATH]/GCF_000001405.25_GRCh37.p13_genomic.fna.gz --flag_pick --force_overwrite --hgvs --input_file [PATH]/test_grch37.vcf --no_escape --no_stats --numbers --offline --output_file [PATH]/test_grch37.vep_annotated.vcf.gz --plugin [PATH]/spliceai_scores.raw.indel.hg19.vcf.gz --protein --pubmed --refseq --sift b --symbol --transcript_version --use_given_ref --variant_class --vcf'
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
-1	69098	.	C	G	.	.	variant_id=13629760;CSQ=G|missense_variant|MODERATE|OR4F5|ENSG00000186092|Transcript|ENST00000335137.3|protein_coding|1/1||ENST00000335137.3:c.8C>G|ENSP00000334393.3:p.Thr3Ser|8|8|3|T/S|aCt/aGt|||1||1|SNV|HGNC|14825|YES|ENSP00000334393||tolerated(1)|PANTHER:PTHR26451&PANTHER:PTHR26451:SF72&Gene3D:1.20.1070.10&Superfamily:SSF81321|||||||58|||||||||.&.&|.&.&|.&.&|.&.&|.&.&|0.05653|0.12098|0.08831|ENST00000641515&ENST00000335137|2.31|.&.|0.01092|0.14661|0.07811|||-5|35|-5|17|0.00|0.15|0.27|0.08|OR4F5||||||||||||||||||||||
-1	69589	.	G	A	.	.	variant_id=13629761;CSQ=A|missense_variant|MODERATE|OR4F5|ENSG00000186092|Transcript|ENST00000335137.3|protein_coding|1/1||ENST00000335137.3:c.499G>A|ENSP00000334393.3:p.Val167Ile|499|499|167|V/I|Gtc/Atc|COSV58736794||1||1|SNV|HGNC|14825|YES|ENSP00000334393||tolerated(0.44)|PANTHER:PTHR26451&PANTHER:PTHR26451:SF72&Gene3D:1.20.1070.10&Pfam:PF13853&Superfamily:SSF81321&PROSITE_profiles:PS50262||||1|1||29||||OR4F5:V167I|0&1&1||||.&.&|.&.&|.&.&|.&.&|.&.&|0.01817|0.11649|0.18729|ENST00000641515&ENST00000335137|0.138|GPCR&_rhodopsin-like&_7TM&GPCR&_rhodopsin-like&_7TM|0.00039|0.11576|0.05287|||||||||||||||||||||||||||||||COSV58736794|1|COSM6847255
-13	95839002	.	C	T	.	.	variant_id=13629762;CSQ=T|missense_variant|MODERATE|ABCC4|ENSG00000125257|Transcript|ENST00000376887.4|protein_coding|11/31||ENST00000376887.4:c.1498G>A|ENSP00000366084.4:p.Glu500Lys|1613|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1||1|SNV|HGNC|55|YES|ENSP00000366084||tolerated(0.06)|Superfamily:SSF52540&SMART:SM00382&Pfam:PF00005&Gene3D:3.40.50.300&PANTHER:PTHR24223&PANTHER:PTHR24223:SF205&PROSITE_profiles:PS50893||0.0004||0&1|0&1||56||0.0441||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662||||||||||||rs145886106|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|0|0|afr||rs145886106|0.000927261|||COSV65320224&COSV65320224&COSV65320224&COSV65320224&COSV65320224|1&1&1&1&1|COSM7286401&COSM7286401&COSM7286401&COSM7286401&COSM7286401,T|missense_variant|MODERATE|ABCC4|ENSG00000125257|Transcript|ENST00000412704.1|protein_coding|11/30||ENST00000412704.1:c.1498G>A|ENSP00000388657.1:p.Glu500Lys|1617|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|HGNC|55||ENSP00000388657||tolerated(0.06)|PROSITE_profiles:PS50893&PANTHER:PTHR24223:SF205&PANTHER:PTHR24223&Pfam:PF00005&Gene3D:3.40.50.300&SMART:SM00382&Superfamily:SSF52540||0.0004||0&1|0&1||56||0.0441||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-l
ike&AAA+_ATPase_domain|0.80456|0.69527|0.56662||||||||||||rs145886106|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|0|0|afr||rs145886106|0.000927261|||COSV65320224&COSV65320224&COSV65320224&COSV65320224&COSV65320224|1&1&1&1&1|COSM7286401&COSM7286401&COSM7286401&COSM7286401&COSM7286401,T|missense_variant|MODERATE|ABCC4|ENSG00000125257|Transcript|ENST00000431522.1|protein_coding|11/21||ENST00000431522.1:c.1498G>A|ENSP00000398562.1:p.Glu500Lys|1617|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|HGNC|55||ENSP00000398562||tolerated(0.11)|Superfamily:SSF52540&SMART:SM00382&Pfam:PF00005&Gene3D:3.40.50.300&PANTHER:PTHR24223:SF205&PANTHER:PTHR24223&PROSITE_profiles:PS50893||0.0004||0&1|0&1||56||0.0441||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662||||||||||||rs145886106|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|0|0|afr||rs145886106|0.000927261|||COSV65320224&COSV65320224&COSV65320224&COSV65320224&COSV65320224|1&1&1&1&1|COSM7286401&COSM7286401&COSM7286401&COSM7286401&COSM7286401,T|missense_variant|MODERATE|ABCC4|ENSG00000125257|Transcript|ENST00000536256.1|protein_coding|10/20||ENST00000536256.1:c.1273G>A|ENSP00000442024.1:p.Glu425Lys|1392|1273|425|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|HGNC|55||ENSP00000442024||tolerated(0.11)|PROSITE_profiles:PS50893&PANTHER:PTHR24223&PANTHER:PTHR24223:SF205&Pfam:PF00005&Gene3D:3.40.50.300&SMART:SM00382&Superfamily:SSF52540||0.0004||0&1|0&1||56||0.0441||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&
|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662||||||||||||rs145886106|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|0|0|afr||rs145886106|0.000927261|||COSV65320224&COSV65320224&COSV65320224&COSV65320224&COSV65320224|1&1&1&1&1|COSM7286401&COSM7286401&COSM7286401&COSM7286401&COSM7286401,T|3_prime_UTR_variant|MODIFIER|ABCC4|ENSG00000125257|Transcript|ENST00000538287.1|protein_coding|13/17||ENST00000538287.1:c.*1689G>A||1785|||||rs145886106&COSV65320224||-1|||SNV|HGNC|55||ENSP00000440160|||||0.0004||0&1|0&1||||0.0441||ABCC4:E500K|0&1&1|||||||||||||||||||||||||||||rs145886106|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|0|0|afr||rs145886106|0.000927261|||COSV65320224&COSV65320224&COSV65320224&COSV65320224&COSV65320224|1&1&1&1&1|COSM7286401&COSM7286401&COSM7286401&COSM7286401&COSM7286401
-15	32928050	.	C	T	.	.	variant_id=13638004;CSQ=T|stop_gained|HIGH|ARHGAP11A|ENSG00000198826|Transcript|ENST00000361627.3|protein_coding|11/12||ENST00000361627.3:c.1417C>T|ENSP00000355090.3:p.Arg473Ter|2139|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1||1|SNV|HGNC|15783|YES|ENSP00000355090|||PANTHER:PTHR15670&PANTHER:PTHR15670:SF3||||0&1|0&1||||0.426|||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043||||||||||||rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas||rs776172390|3.4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114,T|stop_gained|HIGH|ARHGAP11A|ENSG00000198826|Transcript|ENST00000543522.1|protein_coding|12/13||ENST00000543522.1:c.850C>T|ENSP00000440073.1:p.Arg284Ter|1439|850|284|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|HGNC|15783||ENSP00000440073|||PANTHER:PTHR15670&PANTHER:PTHR15670:SF3||||0&1|0&1||||0.426|||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043||||||||||||rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas||rs776172390|3.4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114,T|stop_gained|HIGH|ARHGAP11A|ENSG00000198826|Transcript|ENST00000562481.1|protein_coding|5/6||ENST00000562481.1:c.469C>T|ENSP00000455593.1:p.Arg157Ter|468|469|157|R/*|Cga/Tga|rs776172390&COSV64380835||1|cds_start_NF&cds_end_NF||SNV|HGNC|15783||ENSP00000455593|||PANTHER:PTHR15670&PANTHER:PTHR15670:SF3||||0&1|0&1||||0.426|||||||.&High&.&.&.&
|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043||||||||||||rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas||rs776172390|3.4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114,T|stop_gained|HIGH|ARHGAP11A|ENSG00000198826|Transcript|ENST00000563864.1|protein_coding|11/11||ENST00000563864.1:c.1333C>T|ENSP00000456078.1:p.Arg445Ter|1980|1333|445|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|HGNC|15783||ENSP00000456078|||PANTHER:PTHR15670&PANTHER:PTHR15670:SF3||||0&1|0&1||||0.426|NMD_escaping_variant||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043||||||||||||rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas||rs776172390|3.4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114,T|non_coding_transcript_exon_variant|MODIFIER|ARHGAP11A|ENSG00000198826|Transcript|ENST00000564918.1|retained_intron|3/4||ENST00000564918.1:n.466C>T||466|||||rs776172390&COSV64380835||1|||SNV|HGNC|15783|||||||||0&1|0&1||||0.426||||||||||||||||||||||||||||||||rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas||rs776172390|3.4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114,T|stop_gained|HIGH|ARHGAP11A|ENSG00000198826|Transcript|ENST00000565905.1|protein_coding|11/12||ENST00000565905.1:c.850C>T|ENSP00000455754.1:p.Arg284Ter|1
398|850|284|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|HGNC|15783||ENSP00000455754|||PANTHER:PTHR15670:SF3&PANTHER:PTHR15670||||0&1|0&1||||0.426|||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043||||||||||||rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas||rs776172390|3.4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114,T|stop_gained|HIGH|ARHGAP11A|ENSG00000198826|Transcript|ENST00000567348.1|protein_coding|11/11||ENST00000567348.1:c.1417C>T|ENSP00000454575.1:p.Arg473Ter|2090|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|HGNC|15783||ENSP00000454575|||PANTHER:PTHR15670&PANTHER:PTHR15670:SF3||||0&1|0&1||||0.426|NMD_escaping_variant||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043||||||||||||rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas||rs776172390|3.4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114
+1	69098	.	C	G	.	.	variant_id=13629760;CSQ=G|missense_variant|MODERATE|OR4F5|79501|Transcript|NM_001005484.2|protein_coding|3/3||NM_001005484.2:c.71C>G|NP_001005484.2:p.Thr24Ser|131|71|24|T/S|aCt/aGt|||1||1|SNV|EntrezGene||YES|NP_001005484.2||||tolerated(1)||||||||58||||||||.&.&|.&.&|.&.&|.&.&|.&.&|0.05653|0.12098|0.08831|ENST00000641515&ENST00000335137|2.31|.&.|0.01092|0.14661|0.07811|||-5|35|-5|17|0.00|0.15|0.27|0.08|OR4F5||||||||||||||||||||0.0020000000949949|0.966000020503998|0.894999980926514|1.23199999332428||||||||
+1	69589	.	G	A	.	.	variant_id=13629761;CSQ=A|missense_variant|MODERATE|OR4F5|79501|Transcript|NM_001005484.2|protein_coding|3/3||NM_001005484.2:c.562G>A|NP_001005484.2:p.Val188Ile|622|562|188|V/I|Gtc/Atc|COSV58736794||1||1|SNV|EntrezGene||YES|NP_001005484.2||||tolerated(0.41)|||||1|1||29|||OR4F5:V167I|0&1&1||||.&.&|.&.&|.&.&|.&.&|.&.&|0.01817|0.11649|0.18729|ENST00000641515&ENST00000335137|0.138|GPCR&_rhodopsin-like&_7TM&GPCR&_rhodopsin-like&_7TM|0.00039|0.11576|0.05287|||29|-46|-12|-1|0.00|0.00|0.00|0.11|OR4F5||||||||||||||||||||0|0.814000010490417|-1.23099994659424|-0.108000002801418||||||COSV58736794|1|COSM6847255
+13	95839002	.	C	T	.	.	variant_id=13629762;CSQ=T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001105515.3|protein_coding|11/21||NM_001105515.3:c.1498G>A|NP_001098985.1:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001098985.1||||tolerated(0.11)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001301829.2|protein_coding|11/30||NM_001301829.2:c.1498G>A|NP_001288758.1:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001288758.1||||tolerated(0.06)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||
rs145886106|0.000927261|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001301830.2|protein_coding|10/20||NM_001301830.2:c.1273G>A|NP_001288759.1:p.Glu425Lys|1410|1273|425|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001288759.1||||tolerated(0.11)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_005845.5|protein_coding|11/31||NM_005845.5:c.1498G>A|NP_005836.2:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1||1|SNV|EntrezGene||YES|NP_005836.2||||tolerated(0.06)|||0.0004||0&1|0&1||56|||ABCC4:E500K|0&1&1||||.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|.&.&.&.&.&.&|0.63287|0.41304|0.15198|ENST00000645237&ENST00000376887&ENST00000646439&ENST00000536256&ENST00000629385&ENST00000645532|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|0.80456|0.69527|0.56662|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs
145886106|0.000927261|||COSV65320224|1|COSM7286401
+15	32928050	.	C	T	.	.	variant_id=13638004;CSQ=T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_001286479.3|protein_coding|11/12||NM_001286479.3:c.850C>T|NP_001273408.1:p.Arg284Ter|1398|850|284|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_001273408.1|||||||||0&1|0&1||||||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_001286480.3|protein_coding|12/13||NM_001286480.3:c.850C>T|NP_001273409.1:p.Arg284Ter|1506|850|284|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_001273409.1|||||||||0&1|0&1||||||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|intron_variant|MODIFIER|ARHGAP11A-SCG5|114118903|Transcript|NM_001368319.1|protein_coding||9/13|NM_001368319.1:c.1235+2741C>T|||||||rs776172390&COSV64380835||1|||SNV|EntrezGene||YES|NP_001355248.1|||||||||0&1|0&1|||||||||||||||||||||||||||||||||||rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679
|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_014783.6|protein_coding|11/12||NM_014783.6:c.1417C>T|NP_055598.1:p.Arg473Ter|2125|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1||1|SNV|EntrezGene||YES|NP_055598.1|||||||||0&1|0&1||||||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_199357.3|protein_coding|11/11||NM_199357.3:c.1417C>T|NP_955389.1:p.Arg473Ter|2125|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_955389.1|||||||||0&1|0&1||||NMD_escaping_variant||||||.&High&.&.&.&|.&Recessive&.&.&.&|.&0.13585&.&.&.&|.&0.81255&.&.&.&|.&0.0516&.&.&.&|0.99401|0.93057||ENST00000565905&ENST00000361627&ENST00000567348&ENST00000563864&ENST00000543522|2.5|.&.&.&.&.|||0.36043|||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114

From bb9a2059d43482ed187e062f972d1b9da7def791 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Fri, 1 Dec 2023 13:51:06 +1030
Subject: [PATCH 13/29] #850 - new VEP 110 fields - alpha missense formatter

---
 .../vcf_files/bulk_vep_vcf_annotation_inserter.py   | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py b/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py
index e8f7580f3..446f10646 100644
--- a/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py
+++ b/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py
@@ -10,7 +10,7 @@
 
 from annotation.models.damage_enums import SIFTPrediction, FATHMMPrediction, \
     MutationAssessorPrediction, MutationTasterPrediction, Polyphen2Prediction, \
-    PathogenicityImpact, ALoFTPrediction
+    PathogenicityImpact, ALoFTPrediction, AlphaMissensePrediction
 from annotation.models.models import ColumnVEPField, VariantAnnotation, \
     VariantTranscriptAnnotation, VariantAnnotationVersion, VariantGeneOverlap
 from annotation.models.models_enums import VariantClass, VariantAnnotationPipelineType
@@ -156,6 +156,7 @@ def _add_vep_field_handlers(self):
             "aloft_pred": get_choice_formatter_func(ALoFTPrediction.choices, empty_values=["."]),
             "aloft_high_confidence": format_aloft_high_confidence,
             "aloft_ensembl_transcript": format_empty_as_none,
+            "alphamissense_class": get_format_alphamissense_class_func(),
             "canonical": format_canonical,
             "cosmic_count": format_pick_highest_int,
             "cosmic_id": extract_cosmic,
@@ -598,6 +599,16 @@ def format_vep_sift_to_choice(vep_sift):
         return SIFTPrediction.TOLERATED
     raise ValueError(f"Unknown SIFT value: '{vep_sift}'")
 
+def get_format_alphamissense_class_func():
+    """ GRCh37 has 'benign' while GRCh38 has 'likely_benign'
+        @see https://github.com/Ensembl/VEP_plugins/issues/668
+    """
+    cff = get_choice_formatter_func(AlphaMissensePrediction.CHOICES)
+    def _format_alphamissense_class(alphamissense_class):
+        if alphamissense_class == "benign":
+            alphamissense_class = "likely_benign"
+        return cff(alphamissense_class)
+    return _format_alphamissense_class
 
 def get_extract_existing_variation(prefix):
     def format_vep_existing_variation(vep_existing_variation):

From b8cdbcddd701da152e5f7f54dd49adeed111ca2c Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Fri, 1 Dec 2023 15:19:45 +1030
Subject: [PATCH 14/29] #850 - unit tests

---
 analysis/tests/test_urls.py                   |  2 +-
 annotation/fake_annotation.py                 |  2 +-
 annotation/management/commands/vep_run.py     |  5 +-
 .../migrations/0082_new_vep_110_columns_v3.py |  2 +-
 annotation/tests/test_annotation_vcf.py       | 55 ++++++++++-
 ..._columns_version1_grch37.vep_annotated.vcf | 92 +++++++++++++++++++
 ...columns_version1_grch38.vep_annotated.vcf} |  0
 .../test_data/test_grch37.vep_annotated.vcf   | 88 ------------------
 .../bulk_vep_vcf_annotation_inserter.py       | 12 ++-
 9 files changed, 160 insertions(+), 98 deletions(-)
 create mode 100644 annotation/tests/test_data/test_columns_version1_grch37.vep_annotated.vcf
 rename annotation/tests/test_data/{test_grch38.vep_annotated.vcf => test_columns_version1_grch38.vep_annotated.vcf} (100%)
 delete mode 100644 annotation/tests/test_data/test_grch37.vep_annotated.vcf

diff --git a/analysis/tests/test_urls.py b/analysis/tests/test_urls.py
index e8af0d6c8..0f609db41 100644
--- a/analysis/tests/test_urls.py
+++ b/analysis/tests/test_urls.py
@@ -69,7 +69,7 @@ def setUpTestData(cls):
                                        father=father_cs,
                                        father_affected=False,
                                        proband=proband_cs)
-        vcf_filename = os.path.join(settings.BASE_DIR, "annotation/tests/test_data/test_grch37.vep_annotated.vcf")
+        vcf_filename = os.path.join(settings.BASE_DIR, "annotation/tests/test_data/test_columns_version1_grch37.vep_annotated.vcf")
         slowly_create_loci_and_variants_for_vcf(grch37, vcf_filename, get_variant_id_from_info=True)
         variant = Variant.objects.filter(Variant.get_no_reference_q()).first()
         CohortGenotype.objects.create(collection=collection,
diff --git a/annotation/fake_annotation.py b/annotation/fake_annotation.py
index de3438b66..3bca22a1b 100644
--- a/annotation/fake_annotation.py
+++ b/annotation/fake_annotation.py
@@ -87,7 +87,7 @@ def get_fake_annotation_version(genome_build: GenomeBuild):
 
 def create_fake_variants(genome_build: GenomeBuild):
     build_lc = genome_build.name.lower()
-    vcf_filename = os.path.join(settings.BASE_DIR, f"annotation/tests/test_data/test_{build_lc}.vep_annotated.vcf")
+    vcf_filename = os.path.join(settings.BASE_DIR, f"annotation/tests/test_data/test_columns_version1_{build_lc}.vep_annotated.vcf")
     slowly_create_loci_and_variants_for_vcf(genome_build, vcf_filename, get_variant_id_from_info=True)
 
 
diff --git a/annotation/management/commands/vep_run.py b/annotation/management/commands/vep_run.py
index e93ee9927..684332464 100644
--- a/annotation/management/commands/vep_run.py
+++ b/annotation/management/commands/vep_run.py
@@ -10,7 +10,7 @@
 from django.core.management.base import BaseCommand
 
 from annotation.models import VariantAnnotationPipelineType
-from annotation.vep_annotation import run_vep
+from annotation.vep_annotation import run_vep, VEPConfig
 from snpdb.models.models_genome import GenomeBuild
 
 DO_SMALL = False
@@ -28,6 +28,8 @@ def handle(self, *args, **options):
         cnv = options["cnv"]
         build_name = options["genome_build"]
         genome_build = GenomeBuild.get_name_or_alias(build_name)
+        vc = VEPConfig(genome_build)
+
 
         if test:
             print("Re-generating VCF for unit test")
@@ -36,6 +38,7 @@ def handle(self, *args, **options):
             unit_test_dir = os.path.join(settings.BASE_DIR, "annotation/tests/test_data")
             vcf_filename = os.path.join(unit_test_dir, f"{base_name}.vcf")
             output_dir = unit_test_dir
+            base_name = f"test_columns_version{vc.columns_version}_{genome_build.name.lower()}"  # output name only (input path is set above); must match the fixture names the tests load
         else:
             vep_suffix = f"vep_annotated_{genome_build.name}"
             output_dir = settings.ANNOTATION_VCF_DUMP_DIR
diff --git a/annotation/migrations/0082_new_vep_110_columns_v3.py b/annotation/migrations/0082_new_vep_110_columns_v3.py
index 877aaca63..3a87573ae 100644
--- a/annotation/migrations/0082_new_vep_110_columns_v3.py
+++ b/annotation/migrations/0082_new_vep_110_columns_v3.py
@@ -82,7 +82,7 @@ def _new_vep_110_annotation(apps, _schema_editor):
          'vep_custom': VEP_CUSTOM_GNOMAD_4, 'variant_grid_column_id': 'gnomad_sas_af', 'source_field': 'AF_sas',
          'category': FREQUENCY_DATA, 'genome_build_id': 'GRCh38'},
 
-        {'column': 'gnomad4_nonpar', 'variant_grid_column_id': 'gnomad_non_par',
+        {'column': 'gnomad4_nonpar', 'variant_grid_column_id': 'gnomad_non_par', 'min_vep_columns_version': 3,
          'genome_build_id': 'GRCh38', 'pipeline_type': 'S', 'category': 'F', 'source_field': 'non_par',
          'source_field_processing_description': 'nonpar from genomes',
          'vep_custom': VEP_CUSTOM_GNOMAD_4, 'source_field_has_custom_prefix': True},
diff --git a/annotation/tests/test_annotation_vcf.py b/annotation/tests/test_annotation_vcf.py
index bf7fd6a33..2f97767fc 100644
--- a/annotation/tests/test_annotation_vcf.py
+++ b/annotation/tests/test_annotation_vcf.py
@@ -48,8 +48,8 @@
                    ANNOTATION=ANNOTATION_COLUMNS_V1)
 class TestAnnotationVCF(TestCase):
     TEST_DATA_DIR = os.path.join(settings.BASE_DIR, "annotation/tests/test_data")
-    TEST_ANNOTATION_VCF_GRCH37 = os.path.join(TEST_DATA_DIR, "test_grch37.vep_annotated.vcf")
-    TEST_ANNOTATION_VCF_GRCH38 = os.path.join(TEST_DATA_DIR, "test_grch38.vep_annotated.vcf")
+    TEST_ANNOTATION_VCF_GRCH37 = os.path.join(TEST_DATA_DIR, "test_columns_version1_grch37.vep_annotated.vcf")
+    TEST_ANNOTATION_VCF_GRCH38 = os.path.join(TEST_DATA_DIR, "test_columns_version1_grch38.vep_annotated.vcf")
 
     @classmethod
     def setUpTestData(cls):
@@ -128,6 +128,10 @@ def _test_extra_grch37(self):
         self.assertEqual(va.predictions_num_pathogenic, 1)
         self.assertEqual(va.predictions_num_benign, 0)
 
+    def _test_24601_gnomad_grch38(self, va):
+        # This is from gnomAD v3 - kept in its own method so TestAnnotationVCF3 can override it with the gnomAD v4 value
+        self.assertAlmostEqual(va.gnomad_af, 0.000354913)
+
     def test_import_variant_annotations_grch38(self):
         genome_build = GenomeBuild.get_name_or_alias('GRCh38')
         vav = self.variant_annotation_versions_by_build[genome_build.name]
@@ -147,7 +151,7 @@ def test_import_variant_annotations_grch38(self):
         self.assertEqual(va.impact, PathogenicityImpact.MODERATE)
         self.assertEqual(va.dbsnp_rs_id, "rs145886106")
         self.assertEqual(va.cosmic_legacy_id, "COSM7286401")  # Test it has collapsed dupes
-        self.assertAlmostEqual(va.gnomad_af, 0.000354913)
+        self._test_24601_gnomad_grch38(va)
         self.assertEqual(va.gnomad_filtered, False)  # Test it converted FILTER properly to bool
 
         va = VariantAnnotation.objects.get(variant_id=42)
@@ -230,6 +234,51 @@ def _test_extra_grch38(self):
         self.assertTrue(vta.nmd_escaping_variant)
 
 
+ANNOTATION_COLUMNS_V3 = copy.deepcopy(TEST_ANNOTATION)  # deep copy so the edits below leave TEST_ANNOTATION untouched
+ANNOTATION_COLUMNS_V3[settings.BUILD_GRCH37]["columns_version"] = 3
+ANNOTATION_COLUMNS_V3[settings.BUILD_GRCH38]["columns_version"] = 3
+
+
+@override_settings(IMPORT_PROCESSING_DIR=TEST_IMPORT_PROCESSING_DIR,
+                   VARIANT_ZYGOSITY_GLOBAL_COLLECTION="global",
+                   ANNOTATION_VEP_FAKE_VERSION=True,
+                   ANNOTATION=ANNOTATION_COLUMNS_V3)
+class TestAnnotationVCF3(TestAnnotationVCF):
+    TEST_DATA_DIR = os.path.join(settings.BASE_DIR, "annotation/tests/test_data")
+    TEST_ANNOTATION_VCF_GRCH37 = os.path.join(TEST_DATA_DIR, "test_columns_version3_grch37.vep_annotated.vcf")
+    TEST_ANNOTATION_VCF_GRCH38 = os.path.join(TEST_DATA_DIR, "test_columns_version3_grch38.vep_annotated.vcf")
+
+    def _test_extra_grch37(self):
+        # No-op. Original note: "This is testing columns_version 2" - NOTE(review): this class sets columns_version 3, confirm
+        pass
+
+    def _test_24601_gnomad_grch38(self, va):
+        """ gnomAD v4 (the same-named method earlier in this file asserts the gnomAD v3 value) """
+        # AF total copied from https://gnomad.broadinstitute.org/variant/13-95186748-C-T?dataset=gnomad_r4
+        self.assertAlmostEqual(va.gnomad_af, 0.00006753, places=6)
+
+    def _test_extra_grch38(self):
+        # Original note: "This is testing columns_version 2" - NOTE(review): this class sets columns_version 3, confirm intent
+        va = VariantAnnotation.objects.get(variant_id=24601)
+        self.assertAlmostEqual(va.metalr_rankscore, 0.80456)
+        self.assertAlmostEqual(va.revel_rankscore, 0.69527)
+        self.assertAlmostEqual(va.vest4_rankscore, 0.56662)
+        self.assertAlmostEqual(va.bayesdel_noaf_rankscore, 0.63287)
+        self.assertAlmostEqual(va.cadd_raw_rankscore, 0.41304)
+        self.assertAlmostEqual(va.clinpred_rankscore, 0.15198)
+
+        va = VariantAnnotation.objects.get(variant_id=42)
+        self.assertEqual(va.aloft_high_confidence, True)
+        self.assertEqual(va.aloft_pred, ALoFTPrediction.RECESSIVE)
+        self.assertAlmostEqual(va.aloft_prob_dominant, 0.13585)
+        self.assertAlmostEqual(va.aloft_prob_recessive, 0.81255)
+        self.assertAlmostEqual(va.aloft_prob_tolerant, 0.0516)
+
+        vta = VariantTranscriptAnnotation.objects.get(variant_id=42, hgvs_c='NM_199357.3:c.1417C>T')
+        self.assertTrue(vta.nmd_escaping_variant)
+
+
+
 class TestVEP(TestCase):
     """ Random VEP annotation methods """
     maxDiff = None
diff --git a/annotation/tests/test_data/test_columns_version1_grch37.vep_annotated.vcf b/annotation/tests/test_data/test_columns_version1_grch37.vep_annotated.vcf
new file mode 100644
index 000000000..26197612d
--- /dev/null
+++ b/annotation/tests/test_data/test_columns_version1_grch37.vep_annotated.vcf
@@ -0,0 +1,92 @@
+##fileformat=VCFv4.1
+##INFO=<ID=variant_id,Number=1,Type=Integer,Description="VariantGrid primary column">
+##contig=<ID=1,length=249250621,assembly=hg19>
+##contig=<ID=2,length=243199373,assembly=hg19>
+##contig=<ID=3,length=198022430,assembly=hg19>
+##contig=<ID=4,length=191154276,assembly=hg19>
+##contig=<ID=5,length=180915260,assembly=hg19>
+##contig=<ID=6,length=171115067,assembly=hg19>
+##contig=<ID=7,length=159138663,assembly=hg19>
+##contig=<ID=8,length=146364022,assembly=hg19>
+##contig=<ID=9,length=141213431,assembly=hg19>
+##contig=<ID=10,length=135534747,assembly=hg19>
+##contig=<ID=11,length=135006516,assembly=hg19>
+##contig=<ID=12,length=133851895,assembly=hg19>
+##contig=<ID=13,length=115169878,assembly=hg19>
+##contig=<ID=14,length=107349540,assembly=hg19>
+##contig=<ID=15,length=102531392,assembly=hg19>
+##contig=<ID=16,length=90354753,assembly=hg19>
+##contig=<ID=17,length=81195210,assembly=hg19>
+##contig=<ID=18,length=78077248,assembly=hg19>
+##contig=<ID=19,length=59128983,assembly=hg19>
+##contig=<ID=20,length=63025520,assembly=hg19>
+##contig=<ID=21,length=48129895,assembly=hg19>
+##contig=<ID=22,length=51304566,assembly=hg19>
+##contig=<ID=X,length=155270560,assembly=hg19>
+##contig=<ID=Y,length=59373566,assembly=hg19>
+##contig=<ID=MT,length=16569,assembly=hg19>
+##VEP="v110" time="2023-12-01 15:10:26" cache="/data/annotation/VEP/vep_cache/homo_sapiens_refseq/110_GRCh37" ensembl-variation=110.d34d25e ensembl=110.9eadbc2 ensembl-funcgen=110.24e6da6 ensembl-io=110.b1a0d57 1000genomes="phase3" COSMIC="92" ClinVar="202012" HGMD-PUBLIC="20204" assembly="GRCh37.p13" dbSNP="154" gencode="GENCODE 19" genebuild="2011-04" gnomADe="r2.1" polyphen="2.2.2" refseq="2020-10-26 17:03:42 - GCF_000001405.25_GRCh37.p13_genomic.gff" regbuild="1.0" sift="sift5.2.2"
+##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|ENSP|REFSEQ_MATCH|REFSEQ_OFFSET|SOURCE|SIFT|DOMAINS|HGVS_OFFSET|AF|CLIN_SIG|SOMATIC|PHENO|PUBMED|Grantham|SpliceRegion|Mastermind_MMID3|Mastermind_counts|MaxEntScan_alt|MaxEntScan_diff|MaxEntScan_ref|CADD_phred|FATHMM_pred|GERP++_RS|Interpro_domain|MutationAssessor_pred|MutationTaster_pred|Polyphen2_HVAR_pred|REVEL_score|ada_score|rf_score|SpliceAI_pred_DP_AG|SpliceAI_pred_DP_AL|SpliceAI_pred_DP_DG|SpliceAI_pred_DP_DL|SpliceAI_pred_DS_AG|SpliceAI_pred_DS_AL|SpliceAI_pred_DS_DG|SpliceAI_pred_DS_DL|SpliceAI_pred_SYMBOL|gnomAD2|gnomAD2_AC|gnomAD2_AC_popmax|gnomAD2_AF|gnomAD2_AF_afr|gnomAD2_AF_amr|gnomAD2_AF_asj|gnomAD2_AF_eas|gnomAD2_AF_fin|gnomAD2_AF_nfe|gnomAD2_AF_oth|gnomAD2_AF_popmax|gnomAD2_AF_sas|gnomAD2_AN|gnomAD2_AN_popmax|gnomAD2_gnomad_filtered|gnomAD2_nhomalt|gnomAD2_nonpar|gnomAD2_popmax|phastCons100way_vertebrate|phastCons46way_mammalian|phyloP100way_vertebrate|phyloP46way_mammalian|REPEAT_MASKER|TopMed|TopMed_TOPMED|UK10k|UK10k_AF|COSMIC|COSMIC_CNT|COSMIC_LEGACY_ID">
+##Grantham=Grantham Matrix score - Grantham, R. Amino Acid Difference Formula to Help Explain Protein Evolution, Science 1974 Sep 6;185(4154):862-4.
+##SpliceRegion=SpliceRegion predictions
+##Mastermind_MMID3=Mastermind MMID3 variant identifier(s), as gene:key. Link to the Genomenon Mastermind Genomic Search Engine.
+##Mastermind_counts=Mastermind number of citations in the medical literature. Output includes three unique counts: MMCNT1|MMCNT2|MMCNT3. MMCNT1 - Count of Mastermind articles with cDNA matches for this specific variant; MMCNT2 - Count of Mastermind articles with variants either explicitly matching at the cDNA level or given only at protein level; MMCNT3 - Count of Mastermind articles including other DNA-level variants resulting in the same amino acid change.
+##MaxEntScan_alt=MaxEntScan alternate sequence score
+##MaxEntScan_diff=MaxEntScan score difference
+##MaxEntScan_ref=MaxEntScan reference sequence score
+##CADD_phred=CADD_phred from dbNSFP file
+##FATHMM_pred=FATHMM_pred from dbNSFP file
+##GERP++_RS=GERP++_RS from dbNSFP file
+##Interpro_domain=Interpro_domain from dbNSFP file
+##MutationAssessor_pred=MutationAssessor_pred from dbNSFP file
+##MutationTaster_pred=MutationTaster_pred from dbNSFP file
+##Polyphen2_HVAR_pred=Polyphen2_HVAR_pred from dbNSFP file
+##REVEL_score=REVEL_score from dbNSFP file
+##ada_score=dbscSNV ADA score
+##rf_score=dbscSNV RF score
+##SpliceAI_pred_DP_AG=SpliceAI predicted effect on splicing. Delta position for acceptor gain
+##SpliceAI_pred_DP_AL=SpliceAI predicted effect on splicing. Delta position for acceptor loss
+##SpliceAI_pred_DP_DG=SpliceAI predicted effect on splicing. Delta position for donor gain
+##SpliceAI_pred_DP_DL=SpliceAI predicted effect on splicing. Delta position for donor loss
+##SpliceAI_pred_DS_AG=SpliceAI predicted effect on splicing. Delta score for acceptor gain
+##SpliceAI_pred_DS_AL=SpliceAI predicted effect on splicing. Delta score for acceptor loss
+##SpliceAI_pred_DS_DG=SpliceAI predicted effect on splicing. Delta score for donor gain
+##SpliceAI_pred_DS_DL=SpliceAI predicted effect on splicing. Delta score for donor loss
+##SpliceAI_pred_SYMBOL=SpliceAI gene symbol
+##INFO=<ID=gnomAD2,Number=.,Type=String,Description="[PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AC,Number=.,Type=String,Description="AC field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AC_popmax,Number=.,Type=String,Description="AC_popmax field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF,Number=.,Type=String,Description="AF field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_afr,Number=.,Type=String,Description="AF_afr field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_amr,Number=.,Type=String,Description="AF_amr field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_asj,Number=.,Type=String,Description="AF_asj field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_eas,Number=.,Type=String,Description="AF_eas field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_fin,Number=.,Type=String,Description="AF_fin field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_nfe,Number=.,Type=String,Description="AF_nfe field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_oth,Number=.,Type=String,Description="AF_oth field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_popmax,Number=.,Type=String,Description="AF_popmax field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AF_sas,Number=.,Type=String,Description="AF_sas field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AN,Number=.,Type=String,Description="AN field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_AN_popmax,Number=.,Type=String,Description="AN_popmax field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_gnomad_filtered,Number=.,Type=String,Description="gnomad_filtered field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_nhomalt,Number=.,Type=String,Description="nhomalt field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_nonpar,Number=.,Type=String,Description="nonpar field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=gnomAD2_popmax,Number=.,Type=String,Description="popmax field from [PATH]/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
+##INFO=<ID=phastCons100way_vertebrate,Number=.,Type=String,Description="[PATH]/hg19.100way.phastCons.bw">
+##INFO=<ID=phastCons46way_mammalian,Number=.,Type=String,Description="[PATH]/hg19.phastCons46way.placental.bw">
+##INFO=<ID=phyloP100way_vertebrate,Number=.,Type=String,Description="[PATH]/hg19.100way.phyloP100way.bw">
+##INFO=<ID=phyloP46way_mammalian,Number=.,Type=String,Description="[PATH]/hg19.phyloP46way.placental.bw">
+##INFO=<ID=REPEAT_MASKER,Number=.,Type=String,Description="[PATH]/repeatmasker_hg19.bed.gz">
+##INFO=<ID=TopMed,Number=.,Type=String,Description="[PATH]/TOPMED_GRCh37.vcf.gz">
+##INFO=<ID=TopMed_TOPMED,Number=.,Type=String,Description="TOPMED field from [PATH]/TOPMED_GRCh37.vcf.gz">
+##INFO=<ID=UK10k,Number=.,Type=String,Description="[PATH]/UK10K_COHORT.20160215.sites.vcf.gz">
+##INFO=<ID=UK10k_AF,Number=.,Type=String,Description="AF field from [PATH]/UK10K_COHORT.20160215.sites.vcf.gz">
+##INFO=<ID=COSMIC,Number=.,Type=String,Description="[PATH]/CosmicCodingMuts_v95_20211101_grch37.normal.vcf.gz">
+##INFO=<ID=COSMIC_CNT,Number=.,Type=String,Description="CNT field from [PATH]/CosmicCodingMuts_v95_20211101_grch37.normal.vcf.gz">
+##INFO=<ID=COSMIC_LEGACY_ID,Number=.,Type=String,Description="LEGACY_ID field from [PATH]/CosmicCodingMuts_v95_20211101_grch37.normal.vcf.gz">
+##VEP-command-line='vep --af --assembly GRCh37 --biotype --cache --canonical --check_existing --compress_output gzip --custom [PATH]/CosmicCodingMuts_v95_20211101_grch37.normal.vcf.gz,COSMIC,vcf,exact,0,CNT,LEGACY_ID --database 0 --dir [PATH]/vep_cache --distance 5000 --domains --exclude_predicted --fasta [PATH]/GCF_000001405.25_GRCh37.p13_genomic.fna.gz --flag_pick --force_overwrite --hgvs --input_file [PATH]/test_grch37.vcf --no_escape --no_stats --numbers --offline --output_file [PATH]/test_grch37.vep_annotated.vcf.gz --plugin [PATH]/spliceai_scores.raw.indel.hg19.vcf.gz --protein --pubmed --refseq --sift b --symbol --transcript_version --use_given_ref --variant_class --vcf'
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+1	69098	.	C	G	.	.	variant_id=13629760;CSQ=G|missense_variant|MODERATE|OR4F5|79501|Transcript|NM_001005484.2|protein_coding|3/3||NM_001005484.2:c.71C>G|NP_001005484.2:p.Thr24Ser|131|71|24|T/S|aCt/aGt|||1||1|SNV|EntrezGene||YES|NP_001005484.2||||tolerated(1)||||||||58|||||||13.33|.&T|2.31|.&.|.&N|N|.&B|0.052|||-5|35|-5|17|0.00|0.15|0.27|0.08|OR4F5||||||||||||||||||||0.0020000000949949|0.966000020503998|0.894999980926514|1.23199999332428||||||||
+1	69589	.	G	A	.	.	variant_id=13629761;CSQ=A|missense_variant|MODERATE|OR4F5|79501|Transcript|NM_001005484.2|protein_coding|3/3||NM_001005484.2:c.562G>A|NP_001005484.2:p.Val188Ile|622|562|188|V/I|Gtc/Atc|COSV58736794||1||1|SNV|EntrezGene||YES|NP_001005484.2||||tolerated(0.41)|||||1|1||29||OR4F5:V167I|0&1&1||||14.31|.&T|0.138|GPCR&_rhodopsin-like&_7TM&GPCR&_rhodopsin-like&_7TM|.&N|N|.&B|0.043|||29|-46|-12|-1|0.00|0.00|0.00|0.11|OR4F5||||||||||||||||||||0|0.814000010490417|-1.23099994659424|-0.108000002801418||||||COSV58736794|1|COSM6847255
+13	95839002	.	C	T	.	.	variant_id=13629762;CSQ=T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001105515.3|protein_coding|11/21||NM_001105515.3:c.1498G>A|NP_001098985.1:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001098985.1||||tolerated(0.11)|||0.0004||0&1|0&1||56||ABCC4:E500K|0&1&1||||22.3|.&D&.&D&.&.|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|N&N&N&.&N&.|D&D&D&D|B&B&B&.&.&.|0.377|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001301829.2|protein_coding|11/30||NM_001301829.2:c.1498G>A|NP_001288758.1:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001288758.1||||tolerated(0.06)|||0.0004||0&1|0&1||56||ABCC4:E500K|0&1&1||||22.3|.&D&.&D&.&.|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|N&N&N&.&N&.|D&D&D&D|B&B&B&.&.&.|0.377|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001301830.2|protein_coding|10/20||NM_001301830.2:c.1273G>A|NP_001288759.1:p.Glu425Lys|1410|1273|425|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1|||SNV|EntrezGene|||NP_001288759.1||||tolerated(0.11)|||0.0004|
|0&1|0&1||56||ABCC4:E500K|0&1&1||||22.3|.&D&.&D&.&.|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|N&N&N&.&N&.|D&D&D&D|B&B&B&.&.&.|0.377|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224|1|COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_005845.5|protein_coding|11/31||NM_005845.5:c.1498G>A|NP_005836.2:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106&COSV65320224||-1||1|SNV|EntrezGene||YES|NP_005836.2||||tolerated(0.06)|||0.0004||0&1|0&1||56||ABCC4:E500K|0&1&1||||22.3|.&D&.&D&.&.|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|N&N&N&.&N&.|D&D&D&D|B&B&B&.&.&.|0.377|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|rs145886106|41|36|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|282728|24956|0|0|.|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224|1|COSM7286401
+15	32928050	.	C	T	.	.	variant_id=13638004;CSQ=T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_001286479.3|protein_coding|11/12||NM_001286479.3:c.850C>T|NP_001273408.1:p.Arg284Ter|1398|850|284|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_001273408.1|||||||||0&1|0&1|||||||||34|.&.&.&.&.|2.5|.&.&.&.&.|.&.&.&.&.|A&A&A&D&D|.&.&.&.&.||||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_001286480.3|protein_coding|12/13||NM_001286480.3:c.850C>T|NP_001273409.1:p.Arg284Ter|1506|850|284|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_001273409.1|||||||||0&1|0&1|||||||||34|.&.&.&.&.|2.5|.&.&.&.&.|.&.&.&.&.|A&A&A&D&D|.&.&.&.&.||||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|intron_variant|MODIFIER|ARHGAP11A-SCG5|114118903|Transcript|NM_001368319.1|protein_coding||9/13|NM_001368319.1:c.1235+2741C>T|||||||rs776172390&COSV64380835||1|||SNV|EntrezGene||YES|NP_001355248.1|||||||||0&1|0&1||||||||||||||||||||||||||||rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_014783.6|protein_coding|11/12||NM_014783.6:c.1417C>T|NP_055598.1:p.Arg473Ter|2125|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1||1|SNV|EntrezGene||YES|NP_055598
.1|||||||||0&1|0&1|||||||||34|.&.&.&.&.|2.5|.&.&.&.&.|.&.&.&.&.|A&A&A&D&D|.&.&.&.&.||||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_199357.3|protein_coding|11/11||NM_199357.3:c.1417C>T|NP_955389.1:p.Arg473Ter|2125|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_955389.1|||||||||0&1|0&1|||||||||34|.&.&.&.&.|2.5|.&.&.&.&.|.&.&.&.&.|A&A&A&D&D|.&.&.&.&.||||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|4|1|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|247816|29766|0|0|.|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835|3|COSM185114
diff --git a/annotation/tests/test_data/test_grch38.vep_annotated.vcf b/annotation/tests/test_data/test_columns_version1_grch38.vep_annotated.vcf
similarity index 100%
rename from annotation/tests/test_data/test_grch38.vep_annotated.vcf
rename to annotation/tests/test_data/test_columns_version1_grch38.vep_annotated.vcf
diff --git a/annotation/tests/test_data/test_grch37.vep_annotated.vcf b/annotation/tests/test_data/test_grch37.vep_annotated.vcf
deleted file mode 100644
index 59415bffe..000000000
--- a/annotation/tests/test_data/test_grch37.vep_annotated.vcf
+++ /dev/null
@@ -1,88 +0,0 @@
-##fileformat=VCFv4.1
-##INFO=<ID=variant_id,Number=1,Type=Integer,Description="VariantGrid primary column">
-##contig=<ID=1,length=249250621,assembly=hg19>
-##contig=<ID=2,length=243199373,assembly=hg19>
-##contig=<ID=3,length=198022430,assembly=hg19>
-##contig=<ID=4,length=191154276,assembly=hg19>
-##contig=<ID=5,length=180915260,assembly=hg19>
-##contig=<ID=6,length=171115067,assembly=hg19>
-##contig=<ID=7,length=159138663,assembly=hg19>
-##contig=<ID=8,length=146364022,assembly=hg19>
-##contig=<ID=9,length=141213431,assembly=hg19>
-##contig=<ID=10,length=135534747,assembly=hg19>
-##contig=<ID=11,length=135006516,assembly=hg19>
-##contig=<ID=12,length=133851895,assembly=hg19>
-##contig=<ID=13,length=115169878,assembly=hg19>
-##contig=<ID=14,length=107349540,assembly=hg19>
-##contig=<ID=15,length=102531392,assembly=hg19>
-##contig=<ID=16,length=90354753,assembly=hg19>
-##contig=<ID=17,length=81195210,assembly=hg19>
-##contig=<ID=18,length=78077248,assembly=hg19>
-##contig=<ID=19,length=59128983,assembly=hg19>
-##contig=<ID=20,length=63025520,assembly=hg19>
-##contig=<ID=21,length=48129895,assembly=hg19>
-##contig=<ID=22,length=51304566,assembly=hg19>
-##contig=<ID=X,length=155270560,assembly=hg19>
-##contig=<ID=Y,length=59373566,assembly=hg19>
-##contig=<ID=MT,length=16569,assembly=hg19>
-##VEP="v100" time="2021-02-26 10:31:47" cache="/media/dlawrence/SpinningIron/reference/VEP/vep_cache/homo_sapiens_refseq/100_GRCh37" ensembl-variation=100.b220ff4 ensembl=100.7e964b7 ensembl-io=100.f87ae4f ensembl-funcgen=100.f0c3948 1000genomes="phase3" COSMIC="90" ClinVar="201912" ESP="20141103" HGMD-PUBLIC="20194" assembly="GRCh37.p13" dbSNP="153" gencode="GENCODE 19" genebuild="2011-04" gnomAD="r2.1" polyphen="2.2.2" refseq="01_2015" regbuild="1.0" sift="sift5.2.2"
-##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|ENSP|SWISSPROT|TREMBL|UNIPARC|REFSEQ_MATCH|REFSEQ_OFFSET|SOURCE|SIFT|DOMAINS|HGVS_OFFSET|AF|CLIN_SIG|SOMATIC|PHENO|PUBMED|Grantham|SpliceRegion|LoFtool|Mastermind_MMID3|Mastermind_counts|MaxEntScan_alt|MaxEntScan_diff|MaxEntScan_ref|CADD_phred|CADD_raw|FATHMM_pred|GERP++_RS|Interpro_domain|MutationAssessor_pred|MutationTaster_pred|Polyphen2_HVAR_pred|REVEL_score|ada_score|rf_score|SpliceAI_pred_DP_AG|SpliceAI_pred_DP_AL|SpliceAI_pred_DP_DG|SpliceAI_pred_DP_DL|SpliceAI_pred_DS_AG|SpliceAI_pred_DS_AL|SpliceAI_pred_DS_DG|SpliceAI_pred_DS_DL|SpliceAI_pred_SYMBOL|gnomAD2|gnomAD2_AF|gnomAD2_AF_afr|gnomAD2_AF_amr|gnomAD2_AF_asj|gnomAD2_AF_eas|gnomAD2_AF_fin|gnomAD2_AF_nfe|gnomAD2_AF_oth|gnomAD2_AF_popmax|gnomAD2_AF_sas|gnomAD2_gnomad_filtered|gnomAD2_nhomalt|gnomAD2_popmax|phastCons100way_vertebrate|phastCons46way_mammalian|phyloP100way_vertebrate|phyloP46way_mammalian|REPEAT_MASKER|TopMed|TopMed_TOPMED|UK10k|UK10k_AF|COSMIC|COSMIC_CNT|COSMIC_LEGACY_ID">
-##Grantham=Grantham Matrix score - Grantham, R. Amino Acid Difference Formula to Help Explain Protein Evolution, Science 1974 Sep 6;185(4154):862-4.
-##SpliceRegion=SpliceRegion predictions
-##LoFtool=LoFtool score for gene
-##Mastermind_MMID3=Mastermind MMID3 variant identifier(s), as gene:key, for MMCNT3.
-##Mastermind_counts=Mastermind number of citations in the medical literature. Output includes three unique counts: MMCNT1|MMCNT2|MMCNT3. MMCNT1 - Count of Mastermind articles with cDNA matches for this specific variant; MMCNT2 - Count of Mastermind articles with variants either explicitly matching at the cDNA level or given only at protein level; MMCNT3 - Count of Mastermind articles including other DNA-level variants resulting in the same amino acid change.
-##MaxEntScan_alt=MaxEntScan alternate sequence score
-##MaxEntScan_diff=MaxEntScan score difference
-##MaxEntScan_ref=MaxEntScan reference sequence score
-##CADD_phred=(from dbNSFP) CADD phred-like score. This is phred-like rank score based on whole genome CADD raw scores. Please refer to Kircher et al. (2014) Nature Genetics 46(3):310-5 for details. The larger the score the more likely the SNP has damaging effect. Please note the following copyright statement for CADD: "CADD scores (http://cadd.gs.washington.edu/) are Copyright 2013 University of Washington and Hudson-Alpha Institute for Biotechnology (all rights reserved) but are freely available for all academic, non-commercial applications. For commercial licensing information contact Jennifer McCullar (mccullaj@uw.edu)."
-##CADD_raw=(from dbNSFP) CADD raw score for functional prediction of a SNP. Please refer to Kircher et al. (2014) Nature Genetics 46(3):310-5 for details. The larger the score the more likely the SNP has damaging effect. Scores range from -6.458163 to 18.301497 in dbNSFP. Please note the following copyright statement for CADD: "CADD scores (http://cadd.gs.washington.edu/) are Copyright 2013 University of Washington and Hudson-Alpha Institute for Biotechnology (all rights reserved) but are freely available for all academic, non-commercial applications. For commercial licensing information contact Jennifer McCullar (mccullaj@uw.edu)."
-##FATHMM_pred=(from dbNSFP) If a FATHMMori score is <=-1.5 (or rankscore >=0.81332) the corresponding nsSNV is predicted as "D(AMAGING)"; otherwise it is predicted as "T(OLERATED)". Multiple predictions separated by ";", corresponding to Ensembl_proteinid.
-##GERP++_RS=(from dbNSFP) GERP++ RS score, the larger the score, the more conserved the site. Scores range from -12.3 to 6.17.
-##Interpro_domain=(from dbNSFP) domain or conserved site on which the variant locates. Domain annotations come from Interpro database. The number in the brackets following a specific domain is the count of times Interpro assigns the variant position to that domain, typically coming from different predicting databases. Multiple entries separated by ";".
-##MutationAssessor_pred=(from dbNSFP) MutationAssessor's functional impact of a variant - predicted functional, i.e. high ("H") or medium ("M"), or predicted non-functional, i.e. low ("L") or neutral ("N"). The MAori score cutoffs between "H" and "M", "M" and "L", and "L" and "N", are 3.5, 1.935 and 0.8, respectively. The rankscore cutoffs between "H" and "M", "M" and "L", and "L" and "N", are 0.9307, 0.52043 and 0.19675, respectively.
-##MutationTaster_pred=(from dbNSFP) MutationTaster prediction, "A" ("disease_causing_automatic"), "D" ("disease_causing"), "N" ("polymorphism") or "P" ("polymorphism_automatic"). The score cutoff between "D" and "N" is 0.5 for MTnew and 0.31733 for the rankscore.
-##Polyphen2_HVAR_pred=(from dbNSFP) Polyphen2 prediction based on HumVar, "D" ("probably damaging", HVAR score in [0.909,1] or rankscore in [0.65694,0.97581]), "P" ("possibly damaging", HVAR in [0.447,0.908] or rankscore in [0.47121,0.65622]) and "B" ("benign", HVAR score in [0,0.446] or rankscore in [0.01493,0.47076]). Score cutoff for binary classification is 0.5 for HVAR score or 0.48762 for rankscore, i.e. the prediction is "neutral" if the HVAR score is smaller than 0.5 (rankscore is smaller than 0.48762), and "deleterious" if the HVAR score is larger than 0.5 (rankscore is larger than 0.48762). Multiple entries are separated by ";", corresponding to Uniprot_acc.
-##REVEL_score=(from dbNSFP) REVEL is an ensemble score based on 13 individual scores for predicting the pathogenicity of missense variants. Scores range from 0 to 1. The larger the score the more likely the SNP has damaging effect. "REVEL scores are freely available for non-commercial use. For other uses, please contact Weiva Sieh" (weiva.sieh@mssm.edu)
-##ada_score=dbscSNV ADA score
-##rf_score=dbscSNV RF score
-##SpliceAI_pred_DP_AG=SpliceAI predicted effect on splicing. Delta position for acceptor gain
-##SpliceAI_pred_DP_AL=SpliceAI predicted effect on splicing. Delta position for acceptor loss
-##SpliceAI_pred_DP_DG=SpliceAI predicted effect on splicing. Delta position for donor gain
-##SpliceAI_pred_DP_DL=SpliceAI predicted effect on splicing. Delta position for donor loss
-##SpliceAI_pred_DS_AG=SpliceAI predicted effect on splicing. Delta score for acceptor gain
-##SpliceAI_pred_DS_AL=SpliceAI predicted effect on splicing. Delta score for acceptor loss
-##SpliceAI_pred_DS_DG=SpliceAI predicted effect on splicing. Delta score for donor gain
-##SpliceAI_pred_DS_DL=SpliceAI predicted effect on splicing. Delta score for donor loss
-##SpliceAI_pred_SYMBOL=SpliceAI gene symbol
-##INFO=<ID=gnomAD2,Number=.,Type=String,Description="/media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz (exact)">
-##INFO=<ID=gnomAD2_AF,Number=.,Type=String,Description="AF field from /media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_AF_afr,Number=.,Type=String,Description="AF_afr field from /media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_AF_amr,Number=.,Type=String,Description="AF_amr field from /media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_AF_asj,Number=.,Type=String,Description="AF_asj field from /media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_AF_eas,Number=.,Type=String,Description="AF_eas field from /media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_AF_fin,Number=.,Type=String,Description="AF_fin field from /media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_AF_nfe,Number=.,Type=String,Description="AF_nfe field from /media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_AF_oth,Number=.,Type=String,Description="AF_oth field from /media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_AF_popmax,Number=.,Type=String,Description="AF_popmax field from /media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_AF_sas,Number=.,Type=String,Description="AF_sas field from /media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_gnomad_filtered,Number=.,Type=String,Description="gnomad_filtered field from /media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_nhomalt,Number=.,Type=String,Description="nhomalt field from /media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=gnomAD2_popmax,Number=.,Type=String,Description="popmax field from /media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/gnomad2.1.1_GRCh37_combined_af.vcf.bgz">
-##INFO=<ID=phastCons100way_vertebrate,Number=.,Type=String,Description="/media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/hg19.100way.phastCons.bw (overlap)">
-##INFO=<ID=phastCons46way_mammalian,Number=.,Type=String,Description="/media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/hg19.phastCons46way.placental.bw (overlap)">
-##INFO=<ID=phyloP100way_vertebrate,Number=.,Type=String,Description="/media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/hg19.100way.phyloP100way.bw (overlap)">
-##INFO=<ID=phyloP46way_mammalian,Number=.,Type=String,Description="/media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/hg19.phyloP46way.placental.bw (overlap)">
-##INFO=<ID=REPEAT_MASKER,Number=.,Type=String,Description="/media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/repeatmasker_hg19.bed.gz (overlap)">
-##INFO=<ID=TopMed,Number=.,Type=String,Description="/media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/TOPMED_GRCh37.vcf.gz (exact)">
-##INFO=<ID=TopMed_TOPMED,Number=.,Type=String,Description="TOPMED field from /media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/TOPMED_GRCh37.vcf.gz">
-##INFO=<ID=UK10k,Number=.,Type=String,Description="/media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/UK10K_COHORT.20160215.sites.vcf.gz (exact)">
-##INFO=<ID=UK10k_AF,Number=.,Type=String,Description="AF field from /media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/UK10K_COHORT.20160215.sites.vcf.gz">
-##INFO=<ID=COSMIC,Number=.,Type=String,Description="/media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/CosmicCodingMuts.normal.grch37.vcf.gz (exact)">
-##INFO=<ID=COSMIC_CNT,Number=.,Type=String,Description="CNT field from /media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/CosmicCodingMuts.normal.grch37.vcf.gz">
-##INFO=<ID=COSMIC_LEGACY_ID,Number=.,Type=String,Description="LEGACY_ID field from /media/dlawrence/SpinningIron/reference/VEP/annotation_data/GRCh37/CosmicCodingMuts.normal.grch37.vcf.gz">
-#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
-1	69098	.	C	G	.	.	variant_id=13629760;CSQ=G|missense_variant|MODERATE|OR4F5|79501|Transcript|NM_001005484.1|protein_coding|1/1||NM_001005484.1:c.8C>G|NP_001005484.1:p.Thr3Ser|8|8|3|T/S|aCt/aGt|||1||1|SNV|EntrezGene||YES|NP_001005484.1||||rseq_mrna_match|||tolerated(1)||||||||58||||||||13.33|1.049236|.&T|2.31|.&.|.&N|N|.&B|0.052|||-5|35|-5|17|0.00|0.15|0.27|0.08|OR4F5|||||||||||||||0.0020000000949949|0.966000020503998|0.894999980926514|1.23199999332428||||||||
-1	69589	.	G	A	.	.	variant_id=13629761;CSQ=A|missense_variant|MODERATE|OR4F5|79501|Transcript|NM_001005484.1|protein_coding|1/1||NM_001005484.1:c.499G>A|NP_001005484.1:p.Val167Ile|499|499|167|V/I|Gtc/Atc|COSV58736794||1||1|SNV|EntrezGene||YES|NP_001005484.1||||rseq_mrna_match|||tolerated(0.44)|||||1|1||29|||OR4F5:V167I|0&1&1||||14.31|1.192033|.&T|0.138|GPCR&_rhodopsin-like&_7TM&GPCR&_rhodopsin-like&_7TM|.&N|N|.&B|0.043|||29|-46|-12|-1|0.00|0.00|0.00|0.11|OR4F5|||||||||||||||0|0.814000010490417|-1.23099994659424|-0.108000002801418||||||COSV58736794|1|COSM6847255
-13	95839002	.	C	T	.	.	variant_id=13629762;CSQ=T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001105515.3|protein_coding|11/21||NM_001105515.3:c.1498G>A|NP_001098985.1:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106||-1|||SNV|EntrezGene|||NP_001098985.1||||rseq_mrna_match|||tolerated(0.11)|||0.0004|||||56||0.0441||||||22.3|2.646731|.&D&.&D&.&.|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|N&N&N&.&N&.|D&D&D&D|B&B&B&.&.&.|0.377|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|rs145886106|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|0|0|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224&COSV65320224&COSV65320224&COSV65320224&COSV65320224|1&1&1&1&1|COSM7286401&COSM7286401&COSM7286401&COSM7286401&COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001301829.2|protein_coding|11/30||NM_001301829.2:c.1498G>A|NP_001288758.1:p.Glu500Lys|1635|1498|500|E/K|Gaa/Aaa|rs145886106||-1|||SNV|EntrezGene|||NP_001288758.1||||rseq_mrna_match|||tolerated(0.06)|||0.0004|||||56||0.0441||||||22.3|2.646731|.&D&.&D&.&.|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|N&N&N&.&N&.|D&D&D&D|B&B&B&.&.&.|0.377|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|rs145886106|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|0|0|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224&COSV65320224&COSV65320224&COSV65320224&COSV65320224|1&1&1&1&1|COSM7286401&COSM7286401&COSM7286401&COSM7286401&COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_001301830.2|protein_coding|10/2
0||NM_001301830.2:c.1273G>A|NP_001288759.1:p.Glu425Lys|1410|1273|425|E/K|Gaa/Aaa|rs145886106||-1|||SNV|EntrezGene|||NP_001288759.1||||rseq_mrna_match|||tolerated(0.11)|||0.0004|||||56||0.0441||||||22.3|2.646731|.&D&.&D&.&.|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|N&N&N&.&N&.|D&D&D&D|B&B&B&.&.&.|0.377|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|rs145886106|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|0|0|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224&COSV65320224&COSV65320224&COSV65320224&COSV65320224|1&1&1&1&1|COSM7286401&COSM7286401&COSM7286401&COSM7286401&COSM7286401,T|missense_variant|MODERATE|ABCC4|10257|Transcript|NM_005845.4|protein_coding|11/31||NM_005845.4:c.1498G>A|NP_005836.2:p.Glu500Lys|1630|1498|500|E/K|Gaa/Aaa|rs145886106||-1||1|SNV|EntrezGene||YES|NP_005836.2||||rseq_mrna_nonmatch&rseq_3p_mismatch|||tolerated(0.06)|||0.0004|||||56||0.0441||||||22.3|2.646731|.&D&.&D&.&.|5.54|ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain&.&.&.&ABC_transporter-like&ABC_transporter-like&AAA+_ATPase_domain|N&N&N&.&N&.|D&D&D&D|B&B&B&.&.&.|0.377|||-6|6|27|-47|0.00|0.00|0.00|0.00|ABCC4|rs145886106|0.000145|0.001443|0.000141|0.000000|0.000000|0.000000|0.000000|0.000000|0.0014425388684083987|0.000000|0|0|afr|0.84799998998642|0.998000025749207|1.37699997425079|2.61299991607666||rs145886106|0.000927261|||COSV65320224&COSV65320224&COSV65320224&COSV65320224&COSV65320224|1&1&1&1&1|COSM7286401&COSM7286401&COSM7286401&COSM7286401&COSM7286401
-15	32928050	.	C	T	.	.	variant_id=13638004;CSQ=T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_001286479.2|protein_coding|11/12||NM_001286479.2:c.850C>T|NP_001273408.1:p.Arg284Ter|1398|850|284|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_001273408.1||||rseq_mrna_match||||||||0&1|0&1||||0.426||||||34|4.790614|.&.&.&.&.|2.5|.&.&.&.&.|.&.&.&.&.|A&A&A&D&D|.&.&.&.&.||||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_001286480.3|protein_coding|12/13||NM_001286480.3:c.850C>T|NP_001273409.1:p.Arg284Ter|1506|850|284|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_001273409.1||||rseq_mrna_match||||||||0&1|0&1||||0.426||||||34|4.790614|.&.&.&.&.|2.5|.&.&.&.&.|.&.&.&.&.|A&A&A&D&D|.&.&.&.&.||||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114,T|intron_variant|MODIFIER|ARHGAP11A-SCG5|114118903|Transcript|NM_001368319.1|protein_coding||9/13|NM_001368319.1:c.1235+2741C>T|||||||rs776172390&COSV64380835||1|||SNV|EntrezGene||YES|NP_001355248.1||||rseq_mrna_match||||||||0&1|0&1|||||||||||||||||||||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.18700000
6437302||rs776172390|3.4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_014783.6|protein_coding|11/12||NM_014783.6:c.1417C>T|NP_055598.1:p.Arg473Ter|2125|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1||1|SNV|EntrezGene||YES|NP_055598.1||||rseq_mrna_match||||||||0&1|0&1||||0.426||||||34|4.790614|.&.&.&.&.|2.5|.&.&.&.&.|.&.&.&.&.|A&A&A&D&D|.&.&.&.&.||||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114,T|stop_gained|HIGH|ARHGAP11A|9824|Transcript|NM_199357.2|protein_coding|11/11||NM_199357.2:c.1417C>T|NP_955389.1:p.Arg473Ter|2153|1417|473|R/*|Cga/Tga|rs776172390&COSV64380835||1|||SNV|EntrezGene|||NP_955389.1||||rseq_mrna_match||||||||0&1|0&1||||0.426||||||34|4.790614|.&.&.&.&.|2.5|.&.&.&.&.|.&.&.&.&.|A&A&A&D&D|.&.&.&.&.||||22|-49|-40|21|0.00|0.00|0.00|0.06|ARHGAP11A|rs776172390|0.000016|0.000000|0.000000|0.000000|0.000000|0.000000|0.000027|0.000000|3.359537727608681e-05|0.000034|0|0|sas|0.00700000021606684|0.32600000500679|0.425999999046326|0.187000006437302||rs776172390|3.4343e-005|||COSV64380835&COSV64380835&COSV64380835&COSV64380835&COSV64380835|3&3&3&3&3|COSM185114&COSM185114&COSM185114&COSM185114&COSM185114
diff --git a/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py b/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py
index 446f10646..6845acb88 100644
--- a/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py
+++ b/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py
@@ -188,13 +188,16 @@ def _add_vep_field_handlers(self):
             "topmed_af": format_pick_highest_float,
             "variant_class": get_choice_formatter_func(VariantClass.choices),
         }
-        if self.genome_build == GenomeBuild.grch38():
+
+        vc = VEPConfig(self.genome_build)
+        # gnomad3 wasn't combined using gnomad_data.py so just uses FILTER
+        # while combined exome/genomes use "gnomad_filtered=1" (which should auto-convert to bool)
+        if self.genome_build == GenomeBuild.grch38() and vc.columns_version <= 2:
             self.field_formatters["gnomad_filtered"] = gnomad_filtered_func
 
         self.source_field_to_columns = defaultdict(set)
         self.ignored_vep_fields = self.VEP_NOT_COPIED_FIELDS.copy()
 
-        vc = VEPConfig(self.genome_build)
         cvf_filters = [ColumnVEPField.get_columns_version_q(vc.columns_version)]
         if self.annotation_run.pipeline_type == VariantAnnotationPipelineType.CNV:
             cvf_filters.extend([
@@ -582,7 +585,10 @@ def empty_to_none(it):
 
 # Field formatters
 def gnomad_filtered_func(raw_value):
-    """ We use FILTER in Gnomad3 (GRCh38 only) - need to convert back to bool """
+    """ We use FILTER in Gnomad3 (GRCh38 only) - need to convert back to bool
+        In the combined exomes/genomes (gnomad2, gnomad4) we use gnomad_filtered=1
+        so those builds don't need this formatter.
+    """
     return raw_value not in (None, "PASS")
 
 

From 68459da14a1650ff07759fbc8dbc4284f0b3422f Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Sun, 3 Dec 2023 20:58:24 +1030
Subject: [PATCH 15/29] update to latest cdot data

---
 annotation/annotation_data/cdot_update.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/annotation/annotation_data/cdot_update.sh b/annotation/annotation_data/cdot_update.sh
index 5ea4563f4..ab3ce9485 100755
--- a/annotation/annotation_data/cdot_update.sh
+++ b/annotation/annotation_data/cdot_update.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CDOT_VERSION=0.2.21
+CDOT_VERSION=0.2.22
 THIS_DIR=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
 VG_DIR=${THIS_DIR}/../..
 DOWNLOAD_DIR=/tmp

From dc33997ac3e1938d9927941f3c1b5239ef70a9b7 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Sun, 3 Dec 2023 21:47:09 +1030
Subject: [PATCH 16/29] cdot v0.2.22 (data)

---
 annotation/annotation_data/cdot_update.sh | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/annotation/annotation_data/cdot_update.sh b/annotation/annotation_data/cdot_update.sh
index ab3ce9485..8cccb389c 100755
--- a/annotation/annotation_data/cdot_update.sh
+++ b/annotation/annotation_data/cdot_update.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+set -e
+
 CDOT_VERSION=0.2.22
 THIS_DIR=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
 VG_DIR=${THIS_DIR}/../..
@@ -9,10 +11,10 @@ echo "Downloading data in ${DOWNLOAD_DIR}"
 cd ${DOWNLOAD_DIR}
 
 wget \
-  https://github.com/SACGF/cdot/releases/download/v${CDOT_VERSION}/cdot-${CDOT_VERSION}.ensembl.grch37.json.gz \
-  https://github.com/SACGF/cdot/releases/download/v${CDOT_VERSION}/cdot-${CDOT_VERSION}.ensembl.grch38.json.gz \
-  https://github.com/SACGF/cdot/releases/download/v${CDOT_VERSION}/cdot-${CDOT_VERSION}.refseq.grch37.json.gz \
-  https://github.com/SACGF/cdot/releases/download/v${CDOT_VERSION}/cdot-${CDOT_VERSION}.refseq.grch38.json.gz
+  https://github.com/SACGF/cdot/releases/download/data_v${CDOT_VERSION}/cdot-${CDOT_VERSION}.ensembl.grch37.json.gz \
+  https://github.com/SACGF/cdot/releases/download/data_v${CDOT_VERSION}/cdot-${CDOT_VERSION}.ensembl.grch38.json.gz \
+  https://github.com/SACGF/cdot/releases/download/data_v${CDOT_VERSION}/cdot-${CDOT_VERSION}.refseq.grch37.json.gz \
+  https://github.com/SACGF/cdot/releases/download/data_v${CDOT_VERSION}/cdot-${CDOT_VERSION}.refseq.grch38.json.gz
 
 cd ${VG_DIR}
 

From 21450197166f7b6b09fb61d830b904d81d12471c Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Mon, 4 Dec 2023 15:38:37 +1030
Subject: [PATCH 17/29] #850 - Upgrade dbnsfp to 4.5 (and include Alpha
 Missense)

---
 .../dbnsfp_grch37_strip.sh                    | 32 ++++++++++--------
 .../dbnsfp_grch38_strip.sh                    | 33 ++++++++++---------
 2 files changed, 35 insertions(+), 30 deletions(-)

diff --git a/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh b/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh
index 076cd4042..4a02d506c 100755
--- a/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh
+++ b/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh
@@ -2,33 +2,37 @@
 
 set -e
 
-# All of this python is just to get the columns used in cut and tabix args
+# Download 4.5 from https://sites.google.com/site/jpopgen/dbNSFP
+
+# https://m.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#dbnsfp
+# zcat dbNSFP4.5a_variant.chr1.gz | head -n1 > header.txt
+# mkdir /tmp/dbsnp37
+# zgrep -h -v ^#chr dbNSFP4.5a_variant.chr* | awk '$8 != "." ' | sort -T /tmp/dbsnp37 -k8,8 -k9,9n - | cat header.txt - | bgzip -c > dbNSFP4.5a.grch37.gz
+# tabix -s 8 -b 9 -e 9 dbNSFP4.5a.grch37.gz
+
+
+# All of this python is just to get the columns used in cut and tabix args at bottom of this file
 
 # Get dbNSFP fields used by VariantGrid - run python3 manage.py shell
 # In [12]: ",".join(ColumnVEPField.get_source_fields(vep_plugin='d'))                                                                                                                                     
 
 # Get column names from dbNSFP data file
-# df = pd.read_csv("./dbNSFP4.3a.grch37.gz", sep='\t', index_col=None, nrows=0)
-# vep_fields = 'GERP++_RS,Interpro_domain,CADD_raw_rankscore,REVEL_rankscore,BayesDel_noAF_rankscore,ClinPred_rankscore,VEST4_rankscore,MetaLR_rankscore,Aloft_prob_Tolerant,Aloft_prob_Recessive,Aloft_prob_Dominant,Aloft_pred,Aloft_Confidence'
+# import pandas as pd
+# df = pd.read_csv("header.txt", sep='\t', index_col=None, nrows=0)
+# vep_fields = 'GERP++_RS,Interpro_domain,CADD_raw_rankscore,REVEL_rankscore,BayesDel_noAF_rankscore,ClinPred_rankscore,VEST4_rankscore,MetaLR_rankscore,Aloft_prob_Tolerant,Aloft_prob_Recessive,Aloft_prob_Dominant,Aloft_pred,Aloft_Confidence,AlphaMissense_rankscore,AlphaMissense_pred'
 # columns = ['ref', 'alt', 'aaref', 'aaalt', 'hg19_chr', 'hg19_pos(1-based)', 'Ensembl_transcriptid'] + vep_fields.split(",")
 # cols = []
 # for i in columns:
 #    cols.append(list(df.columns).index(i) + 1)
-# ",".join([str(c) for c in sorted(cols)])
-# columns are: '3,4,5,6,8,9,15,69,74,84,104,107,113,114,115,116,117,119,156,640'
-
+# print(",".join([str(c) for c in sorted(cols)]))
+# columns are: '3,4,5,6,8,9,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705'
 
-# Download 4.3 from https://sites.google.com/site/jpopgen/dbNSFP
 
-# https://m.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#dbnsfp
-# zcat dbNSFP4.3a_variant.chr1.gz | head -n1 > h
-# zgrep -h -v ^#chr dbNSFP4.3a_variant.chr* | awk '$8 != "." ' | sort -T /path/to/tmp_folder -k8,8 -k9,9n - | cat h - | bgzip -c > dbNSFP4.3a_grch37.gz
-# tabix -s 8 -b 9 -e 9 dbNSFP4.3a.grch37.gz
 
-IN_FILE=dbNSFP4.3a.grch37.gz
-OUT_FILE=dbNSFP4.3a.grch37.stripped.gz
+IN_FILE=dbNSFP4.5a.grch37.gz
+OUT_FILE=dbNSFP4.5a.grch37.stripped.gz
 
 # Header needs to start with #
-(echo -n "#" ; zcat ${IN_FILE} | cut -f 3,4,5,6,8,9,15,69,74,84,104,107,113,114,115,116,117,119,156,640 ) | bgzip > ${OUT_FILE}
+(echo -n "#" ; zcat ${IN_FILE} | cut -f 3,4,5,6,8,9,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705 ) | bgzip > ${OUT_FILE}
 tabix -s 5 -b 6 -e 6 ${OUT_FILE} # cols are: 1=ref, 2=alt, 3=chr, 4=pos
 
diff --git a/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh b/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh
index a2666fc7c..1fcd065b4 100755
--- a/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh
+++ b/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh
@@ -2,34 +2,35 @@
 
 set -e
 
+# Download 4.5 from https://sites.google.com/site/jpopgen/dbNSFP
+
+# https://m.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#dbnsfp
+
+# zcat dbNSFP4.5a_variant.chr1.gz | head -n1 > header.txt
+# mkdir /tmp/dbsnp38
+# zgrep -h -v ^#chr dbNSFP4.5a_variant.chr* | sort -T /tmp/dbsnp38 -k1,1 -k2,2n - | cat header.txt - | bgzip -c > dbNSFP4.5a_grch38.gz
+# tabix -s 1 -b 2 -e 2 dbNSFP4.5a_grch38.gz
+
+
 # All of this python is just to get the columns used in cut and tabix args
 
 # Get dbNSFP fields used by VariantGrid - run python3 manage.py shell
 # In [12]: ",".join(ColumnVEPField.get_source_fields(vep_plugin='d'))                                                                                                                                     
 
 # Get column names from dbNSFP data file
-# df = pd.read_csv("./dbNSFP4.3a.grch38.gz", sep='\t', index_col=None, nrows=0)
-# vep_fields = 'GERP++_RS,Interpro_domain,CADD_raw_rankscore,REVEL_rankscore,BayesDel_noAF_rankscore,ClinPred_rankscore,VEST4_rankscore,MetaLR_rankscore,Aloft_prob_Tolerant,Aloft_prob_Recessive,Aloft_prob_Dominant,Aloft_pred,Aloft_Confidence'
+# df = pd.read_csv("header.txt", sep='\t', index_col=None, nrows=0)
+# vep_fields = 'GERP++_RS,Interpro_domain,CADD_raw_rankscore,REVEL_rankscore,BayesDel_noAF_rankscore,ClinPred_rankscore,VEST4_rankscore,MetaLR_rankscore,Aloft_prob_Tolerant,Aloft_prob_Recessive,Aloft_prob_Dominant,Aloft_pred,Aloft_Confidence,AlphaMissense_rankscore,AlphaMissense_pred'
 # columns = ['#chr', 'pos(1-based)', 'ref', 'alt', 'aaref', 'aaalt', 'Ensembl_transcriptid'] + vep_fields.split(",")
 # cols = []
 # for i in columns:
 #    cols.append(list(df.columns).index(i) + 1)
-# ",".join([str(c) for c in sorted(cols)])
-# columns are: '1,2,3,4,5,6,15,69,74,84,104,107,113,114,115,116,117,119,156,640'
-
-# Download 4.3 from https://sites.google.com/site/jpopgen/dbNSFP
-
-# https://m.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#dbnsfp
-
-# zcat dbNSFP4.3a_variant.chr1.gz | head -n1 > h
-# zgrep -h -v ^#chr dbNSFP4.3a_variant.chr* | sort -T /path/to/tmp_folder -k1,1 -k2,2n - | cat h - | bgzip -c > dbNSFP4.3a_grch38.gz
-# tabix -s 1 -b 2 -e 2 dbNSFP4.3a_grch38.gz
-
+# print(",".join([str(c) for c in sorted(cols)]))
+# columns are: '1,2,3,4,5,6,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705'
 
-IN_FILE=dbNSFP4.3a.grch38.gz
-OUT_FILE=dbNSFP4.3a.grch38.stripped.gz
+IN_FILE=dbNSFP4.5a.grch38.gz
+OUT_FILE=dbNSFP4.5a.grch38.stripped.gz
 
 # Header needs to start with #
-(echo -n "#" ; zcat ${IN_FILE} | cut -f 1,2,3,4,5,6,15,69,74,84,104,107,113,114,115,116,117,119,156,640 ) | bgzip > ${OUT_FILE}
+(echo -n "#" ; zcat ${IN_FILE} | cut -f 1,2,3,4,5,6,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705 ) | bgzip > ${OUT_FILE}
 tabix -s 1 -b 2 -e 2 ${OUT_FILE} # cols are: 1=chr, 2=pos
 

From 56dab0d54b85448c88e5517a4eb4460bcd93b593 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Tue, 5 Dec 2023 12:13:37 +1030
Subject: [PATCH 18/29] #850 - do per-chrom to make it faster

---
 .../generate_annotation/dbnsfp_grch37_strip.sh    | 15 +++++++++------
 .../generate_annotation/dbnsfp_grch38_strip.sh    | 13 +++++++++----
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh b/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh
index 4a02d506c..966afd597 100755
--- a/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh
+++ b/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh
@@ -27,12 +27,15 @@ set -e
 # print(",".join([str(c) for c in sorted(cols)]))
 # columns are: '3,4,5,6,8,9,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705'
 
-
-
-IN_FILE=dbNSFP4.5a.grch37.gz
+CUT_COLUMNS="3,4,5,6,8,9,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705"
 OUT_FILE=dbNSFP4.5a.grch37.stripped.gz
+TMP_DIR=/tmp # /hpcfs/groups/phoenix-hpc-sacgf/scratch/dbnsfp_GRCh37
+mkdir -p ${TMP_DIR}
 
-# Header needs to start with #
-(echo -n "#" ; zcat ${IN_FILE} | cut -f 3,4,5,6,8,9,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705 ) | bgzip > ${OUT_FILE}
-tabix -s 5 -b 6 -e 6 ${OUT_FILE} # cols are: 1=ref, 2=alt, 3=chr, 4=pos
+# Sort chromosomes individually as that's much more efficient
+cat header.txt | cut -f ${CUT_COLUMNS} | bgzip > ${OUT_FILE}
+for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
+    zgrep -h -v ^#chr dbNSFP4.5a_variant.chr${chrom}.gz | awk '$8 != "." ' | cut -f ${CUT_COLUMNS} | sort -T ${TMP_DIR} -k8,8 -k9,9n - | bgzip >> ${OUT_FILE}
+done
 
+tabix -s 5 -b 6 -e 6 ${OUT_FILE} # cols are: 1=ref, 2=alt, 3=chr, 4=pos
diff --git a/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh b/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh
index 1fcd065b4..262b2ac79 100755
--- a/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh
+++ b/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh
@@ -27,10 +27,15 @@ set -e
 # print(",".join([str(c) for c in sorted(cols)]))
 # columns are: '1,2,3,4,5,6,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705'
 
-IN_FILE=dbNSFP4.5a.grch38.gz
+CUT_COLUMNS="1,2,3,4,5,6,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705"
 OUT_FILE=dbNSFP4.5a.grch38.stripped.gz
+TMP_DIR=/tmp # /hpcfs/groups/phoenix-hpc-sacgf/scratch/dbnsfp4.5_GRCh38
+mkdir -p ${TMP_DIR}
 
-# Header needs to start with #
-(echo -n "#" ; zcat ${IN_FILE} | cut -f 1,2,3,4,5,6,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705 ) | bgzip > ${OUT_FILE}
-tabix -s 1 -b 2 -e 2 ${OUT_FILE} # cols are: 1=chr, 2=pos
+# Sort chromosomes individually as that's much more efficient
+cat header.txt | cut -f ${CUT_COLUMNS} | bgzip > ${OUT_FILE}
+for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
+    zgrep -h -v ^#chr dbNSFP4.5a_variant.chr${chrom}.gz | cut -f ${CUT_COLUMNS} | sort -T ${TMP_DIR} -k1,1 -k2,2n - >> ${OUT_FILE}
+done
 
+tabix -s 1 -b 2 -e 2 ${OUT_FILE} # cols are: 1=chr, 2=pos

From 9339bcb01c010294740e20f9ac2dbb63c5bb846f Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Tue, 5 Dec 2023 14:25:40 +1030
Subject: [PATCH 19/29] #850 - Need to bgzip data

---
 .../annotation_data/generate_annotation/dbnsfp_grch38_strip.sh  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh b/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh
index 262b2ac79..0a9199067 100755
--- a/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh
+++ b/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh
@@ -35,7 +35,7 @@ mkdir -p ${TMP_DIR}
 # Sort chromosomes individually as that's much more efficient
 cat header.txt | cut -f ${CUT_COLUMNS} | bgzip > ${OUT_FILE}
 for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
-    zgrep -h -v ^#chr dbNSFP4.5a_variant.chr${chrom}.gz | cut -f ${CUT_COLUMNS} | sort -T ${TMP_DIR} -k1,1 -k2,2n - >> ${OUT_FILE}
+    zgrep -h -v ^#chr dbNSFP4.5a_variant.chr${chrom}.gz | cut -f ${CUT_COLUMNS} | sort -T ${TMP_DIR} -k1,1 -k2,2n - | bgzip >> ${OUT_FILE}
 done
 
 tabix -s 1 -b 2 -e 2 ${OUT_FILE} # cols are: 1=chr, 2=pos

From 101c84584a2bd6846ef49d6c37ffd8a8c108c067 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Tue, 5 Dec 2023 15:53:53 +1030
Subject: [PATCH 20/29] #850 - Do bgzip afterwards, 37 need to shift sort out
 as columns were cut first

---
 .../generate_annotation/dbnsfp_grch37_strip.sh       | 12 ++++++++----
 .../generate_annotation/dbnsfp_grch38_strip.sh       |  9 +++++----
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh b/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh
index 966afd597..de6851765 100755
--- a/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh
+++ b/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh
@@ -28,14 +28,18 @@ set -e
 # columns are: '3,4,5,6,8,9,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705'
 
 CUT_COLUMNS="3,4,5,6,8,9,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705"
-OUT_FILE=dbNSFP4.5a.grch37.stripped.gz
+SEQ_COL=5 # hg19_chr
+POS_COL=6 # hg19_pos(1-based)
+OUT_FILE=dbNSFP4.5a.grch37.stripped
 TMP_DIR=/tmp # /hpcfs/groups/phoenix-hpc-sacgf/scratch/dbnsfp_GRCh37
 mkdir -p ${TMP_DIR}
 
 # Sort chromosomes individually as that's much more efficient
-cat header.txt | cut -f ${CUT_COLUMNS} | bgzip > ${OUT_FILE}
+cat header.txt | cut -f ${CUT_COLUMNS} > ${OUT_FILE}
 for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
-    zgrep -h -v ^#chr dbNSFP4.5a_variant.chr${chrom}.gz | awk '$8 != "." ' | cut -f ${CUT_COLUMNS} | sort -T ${TMP_DIR} -k8,8 -k9,9n - | bgzip >> ${OUT_FILE}
+    zgrep -h -v ^#chr dbNSFP4.5a_variant.chr${chrom}.gz | awk '$8 != "." ' | cut -f ${CUT_COLUMNS} | sort -T ${TMP_DIR} -k${SEQ_COL},${SEQ_COL} -k${POS_COL},${POS_COL}n - >> ${OUT_FILE}
 done
 
-tabix -s 5 -b 6 -e 6 ${OUT_FILE} # cols are: 1=ref, 2=alt, 3=chr, 4=pos
+bgzip ${OUT_FILE}
+tabix -s ${SEQ_COL} -b ${POS_COL} -e ${POS_COL} ${OUT_FILE}.gz
+
diff --git a/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh b/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh
index 0a9199067..af197a4d2 100755
--- a/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh
+++ b/annotation/annotation_data/generate_annotation/dbnsfp_grch38_strip.sh
@@ -28,14 +28,15 @@ set -e
 # columns are: '1,2,3,4,5,6,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705'
 
 CUT_COLUMNS="1,2,3,4,5,6,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705"
-OUT_FILE=dbNSFP4.5a.grch38.stripped.gz
+OUT_FILE=dbNSFP4.5a.grch38.stripped
 TMP_DIR=/tmp # /hpcfs/groups/phoenix-hpc-sacgf/scratch/dbnsfp4.5_GRCh38
 mkdir -p ${TMP_DIR}
 
 # Sort chromosomes individually as that's much more efficient
-cat header.txt | cut -f ${CUT_COLUMNS} | bgzip > ${OUT_FILE}
+cat header.txt | cut -f ${CUT_COLUMNS} > ${OUT_FILE}
 for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
-    zgrep -h -v ^#chr dbNSFP4.5a_variant.chr${chrom}.gz | cut -f ${CUT_COLUMNS} | sort -T ${TMP_DIR} -k1,1 -k2,2n - | bgzip >> ${OUT_FILE}
+    zgrep -h -v ^#chr dbNSFP4.5a_variant.chr${chrom}.gz | cut -f ${CUT_COLUMNS} | sort -T ${TMP_DIR} -k1,1 -k2,2n - >> ${OUT_FILE}
 done
 
-tabix -s 1 -b 2 -e 2 ${OUT_FILE} # cols are: 1=chr, 2=pos
+bgzip ${OUT_FILE}
+tabix -s 1 -b 2 -e 2 ${OUT_FILE}.gz # cols are: 1=chr, 2=pos

From 31999dd033a1dfd528b5b5fb4b2e954044d22007 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Tue, 5 Dec 2023 16:46:38 +1030
Subject: [PATCH 21/29] Move alpha missense to dbNSFP

---
 .../0083_one_off_move_alphamissense_dbnsfp.py | 31 +++++++++++++++
 ...tation_alphamissense_rankscore_and_more.py | 35 +++++++++++++++++
 annotation/models/damage_enums.py             | 16 ++------
 annotation/models/models.py                   | 11 +++---
 annotation/models/models_enums.py             |  1 -
 .../0108_one_off_move_alphamissense_dbnsfp.py | 39 +++++++++++++++++++
 6 files changed, 114 insertions(+), 19 deletions(-)
 create mode 100644 annotation/migrations/0083_one_off_move_alphamissense_dbnsfp.py
 create mode 100644 annotation/migrations/0084_rename_alphamissense_pathogenicity_variantannotation_alphamissense_rankscore_and_more.py
 create mode 100644 snpdb/migrations/0108_one_off_move_alphamissense_dbnsfp.py

diff --git a/annotation/migrations/0083_one_off_move_alphamissense_dbnsfp.py b/annotation/migrations/0083_one_off_move_alphamissense_dbnsfp.py
new file mode 100644
index 000000000..03dad05f9
--- /dev/null
+++ b/annotation/migrations/0083_one_off_move_alphamissense_dbnsfp.py
@@ -0,0 +1,31 @@
+# Generated by Django 4.2.2 on 2023-12-05 05:51
+
+from django.db import migrations
+
+def _one_off_move_alphamissense_dbnsfp(apps, _schema_editor):
+    """ Create the dbNSFP-sourced ColumnVEPField for the AlphaMissense rankscore (VEP columns version 3+) """
+    pathogenicity_predictions = 'P'
+    vep_plugin_dbnsfp = 'd'
+
+    # The old alphamissense VEPFields cascade deleted from 0108_one_off_move_alphamissense_dbnsfp
+    column_vep_field_model = apps.get_model("annotation", "ColumnVEPField")
+    column_vep_field_model.objects.create(
+        column='alphamissense_pathogenicity',  # NOTE(review): 0084 renames this model field to alphamissense_rankscore - confirm
+        min_vep_columns_version=3,
+        variant_grid_column_id='alphamissense_rankscore',
+        vep_plugin=vep_plugin_dbnsfp,
+        source_field='AlphaMissense_rankscore',
+        category=pathogenicity_predictions,
+    )
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('annotation', '0082_new_vep_110_columns_v3'),
+        ("snpdb", "0108_one_off_move_alphamissense_dbnsfp"),
+    ]
+
+    operations = [
+        migrations.RunPython(_one_off_move_alphamissense_dbnsfp)
+    ]
diff --git a/annotation/migrations/0084_rename_alphamissense_pathogenicity_variantannotation_alphamissense_rankscore_and_more.py b/annotation/migrations/0084_rename_alphamissense_pathogenicity_variantannotation_alphamissense_rankscore_and_more.py
new file mode 100644
index 000000000..6c30cca77
--- /dev/null
+++ b/annotation/migrations/0084_rename_alphamissense_pathogenicity_variantannotation_alphamissense_rankscore_and_more.py
@@ -0,0 +1,35 @@
+# Generated by Django 4.2.2 on 2023-12-05 06:16
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('annotation', '0083_one_off_move_alphamissense_dbnsfp'),
+    ]
+
+    operations = [
+        migrations.RenameField(
+            model_name='variantannotation',
+            old_name='alphamissense_pathogenicity',
+            new_name='alphamissense_rankscore',
+        ),
+        migrations.RemoveField(
+            model_name='variantannotation',
+            name='alphamissense_class',
+        ),
+        migrations.RemoveField(
+            model_name='varianttranscriptannotation',
+            name='alphamissense_class',
+        ),
+        migrations.RemoveField(
+            model_name='varianttranscriptannotation',
+            name='alphamissense_pathogenicity',
+        ),
+        migrations.AlterField(
+            model_name='columnvepfield',
+            name='vep_plugin',
+            field=models.CharField(choices=[('d', 'dbNSFP'), ('v', 'dbscSNV'), ('g', 'Grantham'), ('l', 'LoFtool'), ('n', 'Mastermind'), ('V', 'MaveDb'), ('m', 'MaxEntScan'), ('N', 'NMD'), ('a', 'SpliceAI'), ('s', 'SpliceRegion'), ('o', 'StructuralVariantOverlap')], max_length=1, null=True),
+        ),
+    ]
diff --git a/annotation/models/damage_enums.py b/annotation/models/damage_enums.py
index 16c4aaa1e..fdc588643 100644
--- a/annotation/models/damage_enums.py
+++ b/annotation/models/damage_enums.py
@@ -156,16 +156,8 @@ class ALoFTPrediction(models.TextChoices):
     DOMINANT = "d", "Dominant"
 
 
-class AlphaMissensePrediction(AbstractPathogenicity):
+class AlphaMissensePrediction(models.TextChoices):
     """ @see https://asia.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#alphamissense """
-    LIKELY_BENIGN = 'b'
-    AMBIGUOUS = "a"
-    LIKELY_PATHOGENIC = "p"
-
-    CHOICES = [
-        (LIKELY_BENIGN, 'likely_benign'),
-        (AMBIGUOUS, 'ambiguous'),
-        (LIKELY_PATHOGENIC, 'likely_pathogenic'),
-    ]
-    MINIMUM_FLAG_DAMAGE_LEVEL = LIKELY_PATHOGENIC
-    VARIANT_PATH = "variantannotation__alphamissense_class"
+    LIKELY_BENIGN = 'b', 'likely_benign'
+    AMBIGUOUS = "a", 'ambiguous'
+    LIKELY_PATHOGENIC = "p", 'likely_pathogenic'
diff --git a/annotation/models/models.py b/annotation/models/models.py
index b99d3fa45..79cea1783 100644
--- a/annotation/models/models.py
+++ b/annotation/models/models.py
@@ -557,10 +557,10 @@ def get_pathogenic_prediction_funcs(self) -> Dict[str, Callable]:
             pathogenic_rankscore = settings.ANNOTATION_MIN_PATHOGENIC_RANKSCORE
             pathogenic_prediction_columns = ['bayesdel_noaf_rankscore', 'cadd_raw_rankscore', 'clinpred_rankscore',
                                              'revel_rankscore', 'metalr_rankscore', 'vest4_rankscore']
-            pp_funcs = {c: lambda d: float(d) >= pathogenic_rankscore for c in pathogenic_prediction_columns}
             if self.columns_version == 3:
-                pp_funcs["alphamissense_class"] = lambda d: d in AlphaMissensePrediction.get_damage_or_greater_levels()
-            return pp_funcs
+                pathogenic_prediction_columns.append("alphamissense_rankscore")
+
+            return {c: lambda d: float(d) >= pathogenic_rankscore for c in pathogenic_prediction_columns}
 
         raise ValueError(f"Don't know fields for {self.columns_version=}")
 
@@ -842,9 +842,6 @@ class AbstractVariantAnnotation(models.Model):
     splice_region = models.TextField(null=True, blank=True)
     symbol = models.TextField(null=True, blank=True)
 
-    alphamissense_class  = models.CharField(max_length=1, choices=AlphaMissensePrediction.CHOICES, null=True, blank=True)
-    alphamissense_pathogenicity = models.FloatField(null=True, blank=True)
-
     mavedb_score = models.FloatField(null=True, blank=True)
     mavedb_urn = models.TextField(null=True, blank=True)
 
@@ -955,6 +952,8 @@ class VariantAnnotation(AbstractVariantAnnotation):
     clinpred_rankscore = models.FloatField(null=True, blank=True)
     vest4_rankscore = models.FloatField(null=True, blank=True)
     metalr_rankscore = models.FloatField(null=True, blank=True)
+    alphamissense_rankscore = models.FloatField(null=True, blank=True)
+
     # ALoFT (from dbNSFP)
     aloft_prob_tolerant = models.FloatField(null=True, blank=True)
     aloft_prob_recessive = models.FloatField(null=True, blank=True)
diff --git a/annotation/models/models_enums.py b/annotation/models/models_enums.py
index bda376d97..92a24ee0c 100644
--- a/annotation/models/models_enums.py
+++ b/annotation/models/models_enums.py
@@ -125,7 +125,6 @@ class ColumnAnnotationCategory(models.TextChoices):
 
 
 class VEPPlugin(models.TextChoices):
-    ALPHAMISSENSE = 'A', 'AlphaMissense'
     DBNSFP = 'd', 'dbNSFP'
     DBSCSNV = 'v', 'dbscSNV'
     GRANTHAM = 'g', 'Grantham'
diff --git a/snpdb/migrations/0108_one_off_move_alphamissense_dbnsfp.py b/snpdb/migrations/0108_one_off_move_alphamissense_dbnsfp.py
new file mode 100644
index 000000000..21f4c7837
--- /dev/null
+++ b/snpdb/migrations/0108_one_off_move_alphamissense_dbnsfp.py
@@ -0,0 +1,39 @@
+# Generated by Django 4.2.2 on 2023-12-05 05:55
+
+from django.db import migrations
+
+
+def _one_off_move_alphamissense_dbnsfp(apps, _schema_editor):
+    """ Swap the AlphaMissense pathogenicity/class grid columns for a single rankscore column """
+    transcript_level = 'T'
+    obsolete_column_ids = ['alphamissense_pathogenicity', 'alphamissense_class']
+
+    grid_column_model = apps.get_model("snpdb", "VariantGridColumn")
+    vcf_info_model = apps.get_model("snpdb", "ColumnVCFInfo")
+
+    # Getting rid of alphamissense_pathogenicity to replace with alphamissense_rankscore
+    grid_column_model.objects.filter(pk__in=obsolete_column_ids).delete()
+
+    rankscore_column_kwargs = {
+        "grid_column_name": 'alphamissense_rankscore',
+        "variant_column": 'variantannotation__alphamissense_rankscore',
+        "annotation_level": transcript_level,
+        "width": None,
+        "label": 'AlphaMissense RankScore',
+        "model_field": True,
+        "queryset_field": True,
+    }
+    rankscore_column = grid_column_model.objects.create(**rankscore_column_kwargs)
+    vcf_info_model.objects.create(info_id='ALPHAMISSENSE_rankscore', column=rankscore_column, number=1, type='F',
+                                  description='AlphaMissense pathogenicity rank score')
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ('annotation', '0082_new_vep_110_columns_v3'),
+        ('snpdb', '0107_new_vep_110_columns_v3'),
+    ]
+
+    operations = [
+        migrations.RunPython(_one_off_move_alphamissense_dbnsfp)
+    ]

From 5a04c47cafa7ca355d7d341615c97820eab6522b Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Tue, 5 Dec 2023 23:14:13 +1030
Subject: [PATCH 22/29] #850 - annotation

---
 ...pgeneannotationversion_options_and_more.py | 28 +++++++++++++++++++
 annotation/models/models.py                   |  8 ++++--
 .../bulk_vep_vcf_annotation_inserter.py       |  2 +-
 annotation/vep_annotation.py                  |  1 -
 4 files changed, 35 insertions(+), 4 deletions(-)
 create mode 100644 annotation/migrations/0085_alter_dbnsfpgeneannotationversion_options_and_more.py

diff --git a/annotation/migrations/0085_alter_dbnsfpgeneannotationversion_options_and_more.py b/annotation/migrations/0085_alter_dbnsfpgeneannotationversion_options_and_more.py
new file mode 100644
index 000000000..a72e77fa1
--- /dev/null
+++ b/annotation/migrations/0085_alter_dbnsfpgeneannotationversion_options_and_more.py
@@ -0,0 +1,28 @@
+# Generated by Django 4.1.4 on 2023-12-05 12:00
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        (
+            "annotation",
+            "0084_rename_alphamissense_pathogenicity_variantannotation_alphamissense_rankscore_and_more",
+        ),
+    ]
+
+    operations = [
+        migrations.AlterModelOptions(
+            name="dbnsfpgeneannotationversion",
+            options={},
+        ),
+        migrations.AlterField(
+            model_name="dbnsfpgeneannotationversion",
+            name="md5_hash",
+            field=models.CharField(max_length=32),
+        ),
+        migrations.AlterUniqueTogether(
+            name="dbnsfpgeneannotationversion",
+            unique_together={("version", "md5_hash")},
+        ),
+    ]
diff --git a/annotation/models/models.py b/annotation/models/models.py
index 79cea1783..fd9f41773 100644
--- a/annotation/models/models.py
+++ b/annotation/models/models.py
@@ -281,9 +281,13 @@ class ClinVarCitation(models.Model):
 
 
 class DBNSFPGeneAnnotationVersion(TimeStampedModel):
-    """ @see https://sites.google.com/site/jpopgen/dbNSFP """
+    """ @see https://sites.google.com/site/jpopgen/dbNSFP
+        This isn't updated every release, so the same hash can appear across different versions """
     version = models.TextField(primary_key=True)
-    md5_hash = models.CharField(max_length=32, unique=True)
+    md5_hash = models.CharField(max_length=32)
+
+    class Meta:
+        unique_together = ('version', 'md5_hash')
 
     def save(self, **kwargs):
         created = not self.pk
diff --git a/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py b/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py
index 6845acb88..4eb015447 100644
--- a/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py
+++ b/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py
@@ -609,7 +609,7 @@ def get_format_alphamissense_class_func():
     """ GRCh37 has 'benign' while GRCh38 has 'likely_benign'
         @see https://github.com/Ensembl/VEP_plugins/issues/668
     """
-    cff = get_choice_formatter_func(AlphaMissensePrediction.CHOICES)
+    cff = get_choice_formatter_func(AlphaMissensePrediction.choices)
     def _format_alphamissense_class(alphamissense_class):
         if alphamissense_class == "benign":
             alphamissense_class = "likely_benign"
diff --git a/annotation/vep_annotation.py b/annotation/vep_annotation.py
index ee43f94b1..d8b0a11fc 100644
--- a/annotation/vep_annotation.py
+++ b/annotation/vep_annotation.py
@@ -144,7 +144,6 @@ def get_vep_command(vcf_filename, output_filename, genome_build: GenomeBuild, an
 
         if vc.columns_version >= 3:
             plugin_data_func.update({
-                VEPPlugin.ALPHAMISSENSE: lambda: f"AlphaMissense,file={vc['alphamissense']}",
                 VEPPlugin.MAVEDB: lambda: f"MaveDB,file={vc['mave']},single_aminoacid_changes=0,transcript_match=0 ",
             })
 

From 14176f0846fb1cedb53a9e0c7909ad306ccd0e6c Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Wed, 6 Dec 2023 14:35:45 +1030
Subject: [PATCH 23/29] MAVE format. Be able to load page if huge logs

---
 annotation/templates/annotation/view_annotation_run.html | 4 ++--
 annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/annotation/templates/annotation/view_annotation_run.html b/annotation/templates/annotation/view_annotation_run.html
index 9b7883ac7..3dd3eeecb 100644
--- a/annotation/templates/annotation/view_annotation_run.html
+++ b/annotation/templates/annotation/view_annotation_run.html
@@ -74,8 +74,8 @@
             {% labelled hint="chunky" label="VCF Dump Filename" %}{% code_shell annotation_run.vcf_dump_filename %}{% endlabelled %}
             {% labelled hint="chunky" label="VCF Annotated Filename" %}{% code_shell annotation_run.vcf_annotated_filename %}{% endlabelled %}
             {% labelled hint="chunky" label="Pipeline Command" %}{% code_shell annotation_run.pipeline_command %}{% endlabelled %}
-            {% labelled hint="chunky" label="Pipeline StdOut" %}{% code_shell annotation_run.pipeline_stdout %}{% endlabelled %}
-            {% labelled hint="chunky" label="Pipeline StdErr" %}{% code_shell annotation_run.pipeline_stderr %}{% endlabelled %}
+            {% labelled hint="chunky" label="Pipeline StdOut" %}{% code_shell annotation_run.pipeline_stdout|truncatechars:10000 %}{% endlabelled %}
+            {% labelled hint="chunky" label="Pipeline StdErr" %}{% code_shell annotation_run.pipeline_stderr|truncatechars:10000 %}{% endlabelled %}
             {% labelled hint="chunky" label="Error Exception" %}{% code_shell annotation_run.error_exceptionr %}{% endlabelled %}
         </div>
     </div>
diff --git a/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py b/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py
index 4eb015447..4eade20b7 100644
--- a/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py
+++ b/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py
@@ -136,6 +136,7 @@ def _get_vep_columns_from_csq(infos):
 
     def _add_vep_field_handlers(self):
         # TOPMED and 1k genomes can return multiple values - take highest
+        format_pick_lowest_float = get_clean_and_pick_single_value_func(min, float)
         format_pick_highest_float = get_clean_and_pick_single_value_func(max, float)
         format_pick_highest_int = get_clean_and_pick_single_value_func(max, int)
         remove_empty_multiples = get_clean_and_pick_single_value_func(join_uniq)
@@ -173,6 +174,7 @@ def _add_vep_field_handlers(self):
             "mastermind_count_3_aa_change": get_clean_and_pick_single_value_func(operator.itemgetter(2), int),
             "mutation_assessor_pred_most_damaging": get_most_damaging_func(MutationAssessorPrediction),
             "mutation_taster_pred_most_damaging": get_most_damaging_func(MutationTasterPrediction),
+            "mavedb_score": format_pick_lowest_float,
             "nmd_escaping_variant": format_nmd_escaping_variant,
             # conservation fields are from BigWig, which can return multiple entries
             # for deletions. Higher = more conserved, so for rare disease filtering taking max makes sense

From 3f825270f31ff93151a7c3272b6868583e3698b6 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Wed, 6 Dec 2023 16:50:11 +1030
Subject: [PATCH 24/29] Be able to reload annotation runs (was broken after we
 split standard/CNV)

---
 annotation/views.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/annotation/views.py b/annotation/views.py
index 7d4b816c5..02387a6fd 100644
--- a/annotation/views.py
+++ b/annotation/views.py
@@ -394,8 +394,10 @@ def view_annotation_run(request, annotation_run_id):
     can_retry_annotation_run = False
     can_retry_annotation_run_upload = False
     if annotation_run.status == AnnotationStatus.ERROR:
+        # There may be other runs of different types (don't care about them)
         other_annotation_runs_qs = AnnotationRun.objects.filter(
-            annotation_range_lock=annotation_run.annotation_range_lock)
+            annotation_range_lock=annotation_run.annotation_range_lock,
+            pipeline_type=annotation_run.pipeline_type)
         other_annotation_runs_qs = other_annotation_runs_qs.exclude(status=AnnotationStatus.ERROR)
         can_retry_annotation_run = not other_annotation_runs_qs.exists()
         can_retry_annotation_run_upload = can_retry_annotation_run and annotation_run.vcf_annotated_filename

From fc80a54d5447b546ca5c8e9fa2f1e960590daf50 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Wed, 6 Dec 2023 16:50:48 +1030
Subject: [PATCH 25/29] #938 - SV data processing scripts

---
 .../generate_annotation/gnomad4_process_sv.sh | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100755 annotation/annotation_data/generate_annotation/gnomad4_process_sv.sh

diff --git a/annotation/annotation_data/generate_annotation/gnomad4_process_sv.sh b/annotation/annotation_data/generate_annotation/gnomad4_process_sv.sh
new file mode 100755
index 000000000..a84704a19
--- /dev/null
+++ b/annotation/annotation_data/generate_annotation/gnomad4_process_sv.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+# Strip the per-chromosome gnomAD v4.0 SV VCFs down to KEEP_COLUMNS, rename contigs, then merge + index them
+export PATH=${PATH}:/hpcfs/groups/phoenix-hpc-sacgf/tools/tabix-0.2.6:/hpcfs/groups/phoenix-hpc-sacgf/tools/bcftools/current/bcftools
+set -e  # Stop at the first failing command (eg a failed cd) rather than carrying on with partial output
+# THIS_DIR=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
+THIS_DIR=/hpcfs/groups/phoenix-hpc-sacgf/reference/hg38/Misce/gnomAD4/sv
+cd ${THIS_DIR}
+
+# Structural variants
+SV_COLUMNS=INFO/SVLEN,INFO/SVTYPE,INFO/END
+COLS=INFO/AC,INFO/AN,INFO/AF
+OTHER_COUNTS=INFO/N_HOMREF,INFO/N_HET,INFO/N_HOMALT,INFO/POPMAX_AF,INFO/PAR
+SUBPOPS=INFO/afr_AF,INFO/amr_AF,INFO/asj_AF,INFO/eas_AF,INFO/fin_AF,INFO/mid_AF,INFO/nfe_AF,INFO/oth_AF,INFO/sas_AF
+
+KEEP_COLUMNS=${SV_COLUMNS},${COLS},${OTHER_COUNTS},${SUBPOPS}
+MAPPING_DIR=$(dirname ${THIS_DIR})
+CHROM_MAPPING_FILE=${MAPPING_DIR}/chrom_mapping_GRCh38.map
+MERGE_VCF=gnomad.v4.0.sv.merged.vcf
+
+# gnomad v4
+merge_args=()
+for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
+  GNOMAD_VCF=gnomad.v4.0.sv.chr${chrom}.vcf.gz
+  #wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/genome_sv/${GNOMAD_VCF}
+  #wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/genome_sv/${GNOMAD_VCF}.tbi
+
+  OUTPUT_VCF=gnomad.v4.0.sv.chr${chrom}.converted.vcf.gz
+  echo "Going from ${GNOMAD_VCF} -> ${OUTPUT_VCF}"
+
+  # Don't normalize as refs are mostly "N". Ask for bgzipped VCF explicitly so the content matches the .vcf.gz name
+  bcftools annotate --exclude 'AC=0' --remove "^${KEEP_COLUMNS}" --rename-chrs=${CHROM_MAPPING_FILE} ${GNOMAD_VCF} --output-type z -o ${OUTPUT_VCF}
+  merge_args+=(${OUTPUT_VCF})
+done
+# Concat as uncompressed VCF (not BCF): the bgzip + "tabix -p vcf" steps below need VCF text
+bcftools concat --output-type v --output ${MERGE_VCF} "${merge_args[@]}"
+bgzip ${MERGE_VCF}
+tabix -p vcf ${MERGE_VCF}.gz
\ No newline at end of file

From 98611aee01371d879ba76e42a0b51ecfebe17ecf Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Wed, 6 Dec 2023 17:03:11 +1030
Subject: [PATCH 26/29] dbnsfp 4.5 processing scripts

---
 .../dbnsfp_grch37_strip.sh                    | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh b/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh
index de6851765..e6d3ef889 100755
--- a/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh
+++ b/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh
@@ -27,19 +27,19 @@ set -e
 # print(",".join([str(c) for c in sorted(cols)]))
 # columns are: '3,4,5,6,8,9,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705'
 
+# Note: We can't do this per-contig then join them, as some variants switch contigs between builds
 CUT_COLUMNS="3,4,5,6,8,9,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705"
-SEQ_COL=5 # hg19_chr
-POS_COL=6 # hg19_pos(1-based)
-OUT_FILE=dbNSFP4.5a.grch37.stripped
-TMP_DIR=/tmp # /hpcfs/groups/phoenix-hpc-sacgf/scratch/dbnsfp_GRCh37
-mkdir -p ${TMP_DIR}
+SEQ_COL=3  # hg19_chr was col 5 (but 3rd after cut)
+POS_COL=4  # hg19_pos(1-based) was 6 (but 4th after cut)
 
-# Sort chromosomes individually as that's much more efficient
-cat header.txt | cut -f ${CUT_COLUMNS} > ${OUT_FILE}
-for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
-    zgrep -h -v ^#chr dbNSFP4.5a_variant.chr${chrom}.gz | awk '$8 != "." ' | cut -f ${CUT_COLUMNS} | sort -T ${TMP_DIR} -k${SEQ_COL},${SEQ_COL} -k${POS_COL},${POS_COL}n - >> ${OUT_FILE}
-done
+version=4.5a
+out_vcf=dbNSFP${version}_grch37.gz
 
-bgzip ${OUT_FILE}
-tabix -s ${SEQ_COL} -b ${POS_COL} -e ${POS_COL} ${OUT_FILE}.gz
+# cd /hpcfs/groups/phoenix-hpc-sacgf/reference/annotation/dbnsfp/dbnsfp4.5
 
+zcat dbNSFP${version}_variant.chr1.gz | head -n1 > h
+zgrep -h -v ^#chr dbNSFP${version}_variant.chr* | awk '$8 != "." ' | sort -T ${TMP_DIR} -k8,8 -k9,9n - | cat h - | bgzip -c > ${out_vcf}
+zcat ${out_vcf} | cut -f  ${CUT_COLUMNS}  > dbNSFP${version}_grch37.stripped
+bgzip dbNSFP${version}_grch37.stripped
+
+tabix -s ${SEQ_COL} -b ${POS_COL} -e ${POS_COL} dbNSFP${version}_grch37.stripped.gz

From 49340d777b762281f547842f3d98b9c2d7816b6b Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Wed, 6 Dec 2023 17:06:29 +1030
Subject: [PATCH 27/29] Add missing info fields to get rid of warning

---
 annotation/annotation_data/generate_annotation/gnomad_data.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/annotation/annotation_data/generate_annotation/gnomad_data.py b/annotation/annotation_data/generate_annotation/gnomad_data.py
index 2d461b990..7a2622a44 100755
--- a/annotation/annotation_data/generate_annotation/gnomad_data.py
+++ b/annotation/annotation_data/generate_annotation/gnomad_data.py
@@ -289,7 +289,9 @@ def write_vcf_header(version, info_fields, popmax_fields, sub_pops):
         else:
             af_desc = ""
         af_desc += f" made from (exomes_{ac_name} + genomes_{ac_name}) / (exomes_{an_name} + genomes_{an_name})"
-        meta += f'##INFO=<ID={info_id},Number=1,Type=Float,Description="Allele Frequency {af_desc}">\n'
+        meta += f'##INFO=<ID={info_id},Number=1,Type=Float,Description="Allele Frequency for {af_desc}">\n'
+        meta += f'##INFO=<ID={ac_name},Number=1,Type=Integer,Description="Allele Count for {af_desc}">\n'
+        meta += f'##INFO=<ID={an_name},Number=1,Type=Integer,Description="Allele Number for {af_desc}">\n'
 
     vcf_header = f"gnomad_{version}_vcf_header.txt.gz"
     with gzip.open(vcf_header, "wt") as f:

From a5cd0990d52b3ae7ff853e630466f7be6e0860bc Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Thu, 7 Dec 2023 10:33:21 +1030
Subject: [PATCH 28/29] dbnsfp 4.5 processing scripts - left off "#" for
 GRCh37, get index cols right

---
 .../generate_annotation/dbnsfp_grch37_strip.sh            | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh b/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh
index e6d3ef889..cfeada043 100755
--- a/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh
+++ b/annotation/annotation_data/generate_annotation/dbnsfp_grch37_strip.sh
@@ -29,8 +29,8 @@ set -e
 
 # Note: We can't do this per-contig then join them, as some variants switch contigs between builds
 CUT_COLUMNS="3,4,5,6,8,9,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705"
-SEQ_COL=3  # hg19_chr was col 5 (but 3rd after cut)
-POS_COL=4  # hg19_pos(1-based) was 6 (but 4th after cut)
+SEQ_COL=5  # hg19_chr (after cut)
+POS_COL=6  # hg19_pos(1-based) (after cut)
 
 version=4.5a
 out_vcf=dbNSFP${version}_grch37.gz
@@ -39,7 +39,7 @@ out_vcf=dbNSFP${version}_grch37.gz
 
 zcat dbNSFP${version}_variant.chr1.gz | head -n1 > h
 zgrep -h -v ^#chr dbNSFP${version}_variant.chr* | awk '$8 != "." ' | sort -T ${TMP_DIR} -k8,8 -k9,9n - | cat h - | bgzip -c > ${out_vcf}
-zcat ${out_vcf} | cut -f  ${CUT_COLUMNS}  > dbNSFP${version}_grch37.stripped
-bgzip dbNSFP${version}_grch37.stripped
+# Needs a '#' header
+(echo -n "#" ; zcat ${out_vcf} | cut -f  ${CUT_COLUMNS}) | bgzip > dbNSFP${version}_grch37.stripped.gz
 
 tabix -s ${SEQ_COL} -b ${POS_COL} -e ${POS_COL} dbNSFP${version}_grch37.stripped.gz

From 263e3d3909164a652c6d9402864e990c1dca8d91 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Thu, 7 Dec 2023 16:48:54 +1030
Subject: [PATCH 29/29] #850 - Allow for custom MAVE "NA" results

---
 .../vcf_files/bulk_vep_vcf_annotation_inserter.py   | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py b/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py
index 4eade20b7..03e3ba757 100644
--- a/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py
+++ b/annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py
@@ -136,7 +136,9 @@ def _get_vep_columns_from_csq(infos):
 
     def _add_vep_field_handlers(self):
         # TOPMED and 1k genomes can return multiple values - take highest
-        format_pick_lowest_float = get_clean_and_pick_single_value_func(min, float)
+        empty_mave_float_values = EMPTY_VALUES | {"NA"}
+        format_pick_lowest_float = get_clean_and_pick_single_value_func(min, float,
+                                                                        empty_values=empty_mave_float_values)
         format_pick_highest_float = get_clean_and_pick_single_value_func(max, float)
         format_pick_highest_int = get_clean_and_pick_single_value_func(max, int)
         remove_empty_multiples = get_clean_and_pick_single_value_func(join_uniq)
@@ -643,17 +645,20 @@ def format_choice(raw_value):
     return format_choice
 
 
-def get_clean_and_pick_single_value_func(pick_single_value_func, cast_func=None):
+def get_clean_and_pick_single_value_func(pick_single_value_func, cast_func=None, empty_values=None):
     """ Returns a function to clean and pick single value.
         casting is performed before calling pick_single_value_func so you can call min/max """
 
+    if empty_values is None:
+        empty_values = EMPTY_VALUES
+
     def _clean_and_pick_single_value_func(raw_value):
         it = (tm for tm in raw_value.split(VEP_SEPARATOR) if tm != '')
         # Handle '.'
         if cast_func:
-            values = [cast_func(v) for v in it if v not in EMPTY_VALUES]
+            values = [cast_func(v) for v in it if v not in empty_values]
         else:
-            values = [v for v in it if v not in EMPTY_VALUES]
+            values = [v for v in it if v not in empty_values]
         value = None
         if values:
             value = pick_single_value_func(values)