Skip to content

Commit

Permalink
Merge branch 'new_vep110_fields'
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed Dec 7, 2023
2 parents f2adb15 + 263e3d3 commit 6e6407c
Show file tree
Hide file tree
Showing 42 changed files with 1,727 additions and 968 deletions.
2 changes: 1 addition & 1 deletion analysis/tests/test_urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def setUpTestData(cls):
father=father_cs,
father_affected=False,
proband=proband_cs)
vcf_filename = os.path.join(settings.BASE_DIR, "annotation/tests/test_data/test_grch37.vep_annotated.vcf")
vcf_filename = os.path.join(settings.BASE_DIR, "annotation/tests/test_data/test_columns_version1_grch37.vep_annotated.vcf")
slowly_create_loci_and_variants_for_vcf(grch37, vcf_filename, get_variant_id_from_info=True)
variant = Variant.objects.filter(Variant.get_no_reference_q()).first()
CohortGenotype.objects.create(collection=collection,
Expand Down
12 changes: 7 additions & 5 deletions annotation/annotation_data/cdot_update.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#!/bin/bash

CDOT_VERSION=0.2.21
set -e

CDOT_VERSION=0.2.22
THIS_DIR=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
VG_DIR=${THIS_DIR}/../..
DOWNLOAD_DIR=/tmp
Expand All @@ -9,10 +11,10 @@ echo "Downloading data in ${DOWNLOAD_DIR}"
cd ${DOWNLOAD_DIR}

# cdot data releases are tagged "data_v<version>" (not "v<version>") on GitHub;
# the stale plain-v URLs 404 and have been removed.
wget \
  "https://github.com/SACGF/cdot/releases/download/data_v${CDOT_VERSION}/cdot-${CDOT_VERSION}.ensembl.grch37.json.gz" \
  "https://github.com/SACGF/cdot/releases/download/data_v${CDOT_VERSION}/cdot-${CDOT_VERSION}.ensembl.grch38.json.gz" \
  "https://github.com/SACGF/cdot/releases/download/data_v${CDOT_VERSION}/cdot-${CDOT_VERSION}.refseq.grch37.json.gz" \
  "https://github.com/SACGF/cdot/releases/download/data_v${CDOT_VERSION}/cdot-${CDOT_VERSION}.refseq.grch38.json.gz"

cd ${VG_DIR}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,33 +2,44 @@

set -e

# All of this python is just to get the columns used in cut and tabix args
# Download 4.5 from https://sites.google.com/site/jpopgen/dbNSFP

# https://m.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#dbnsfp
# zcat dbNSFP4.5a_variant.chr1.gz | head -n1 > header.txt
# mkdir /tmp/dbsnp37
# zgrep -h -v ^#chr dbNSFP4.5a_variant.chr* | awk '$8 != "." ' | sort -T /tmp/dbsnp37 -k8,8 -k9,9n - | cat header.txt - | bgzip -c > dbNSFP4.5a_grch37.gz
# tabix -s 8 -b 9 -e 9 dbNSFP4.5a_grch37.gz


# All of this python is just to get the columns used in cut and tabix args at bottom of this file

# Get dbNSFP fields used by VariantGrid - run python3 manage.py shell
# In [12]: ",".join(ColumnVEPField.get_source_fields(vep_plugin='d'))

# Get column names from dbNSFP data file
# df = pd.read_csv("./dbNSFP4.3a.grch37.gz", sep='\t', index_col=None, nrows=0)
# vep_fields = 'GERP++_RS,Interpro_domain,CADD_raw_rankscore,REVEL_rankscore,BayesDel_noAF_rankscore,ClinPred_rankscore,VEST4_rankscore,MetaLR_rankscore,Aloft_prob_Tolerant,Aloft_prob_Recessive,Aloft_prob_Dominant,Aloft_pred,Aloft_Confidence'
# import pandas as pd
# df = pd.read_csv("header.txt", sep='\t', index_col=None, nrows=0)
# vep_fields = 'GERP++_RS,Interpro_domain,CADD_raw_rankscore,REVEL_rankscore,BayesDel_noAF_rankscore,ClinPred_rankscore,VEST4_rankscore,MetaLR_rankscore,Aloft_prob_Tolerant,Aloft_prob_Recessive,Aloft_prob_Dominant,Aloft_pred,Aloft_Confidence,AlphaMissense_rankscore,AlphaMissense_pred'
# columns = ['ref', 'alt', 'aaref', 'aaalt', 'hg19_chr', 'hg19_pos(1-based)', 'Ensembl_transcriptid'] + vep_fields.split(",")
# cols = []
# for i in columns:
# cols.append(list(df.columns).index(i) + 1)
# ",".join([str(c) for c in sorted(cols)])
# columns are: '3,4,5,6,8,9,15,69,74,84,104,107,113,114,115,116,117,119,156,640'

# print(",".join([str(c) for c in sorted(cols)]))
# columns are: '3,4,5,6,8,9,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705'

# Download 4.3 from https://sites.google.com/site/jpopgen/dbNSFP
# Note: We can't do this per-contig then join them, as some variants switch contigs between builds
CUT_COLUMNS="3,4,5,6,8,9,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705"
SEQ_COL=5 # hg19_chr (after cut)
POS_COL=6 # hg19_pos(1-based) (after cut)

# https://m.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#dbnsfp
# zcat dbNSFP4.3a_variant.chr1.gz | head -n1 > h
# zgrep -h -v ^#chr dbNSFP4.3a_variant.chr* | awk '$8 != "." ' | sort -T /path/to/tmp_folder -k8,8 -k9,9n - | cat h - | bgzip -c > dbNSFP4.3a_grch37.gz
# tabix -s 8 -b 9 -e 9 dbNSFP4.3a.grch37.gz
version=4.5a
out_vcf=dbNSFP${version}_grch37.gz

# Scratch space for the big sort; override TMP_DIR for large runs, e.g.
# /hpcfs/groups/phoenix-hpc-sacgf/scratch — previously this was unset, so
# `sort -T` received an empty argument.
TMP_DIR=${TMP_DIR:-/tmp}
mkdir -p "${TMP_DIR}"

# Header line comes from any one per-chromosome file.
zcat dbNSFP${version}_variant.chr1.gz | head -n1 > h

# Merge all chromosomes, keep rows with an hg19 position (col 8), and sort by
# hg19 chrom/pos (cols 8/9) so the file can be tabix-indexed on those columns.
zgrep -h -v '^#chr' dbNSFP${version}_variant.chr* | awk '$8 != "." ' | sort -T "${TMP_DIR}" -k8,8 -k9,9n - | cat h - | bgzip -c > "${out_vcf}"

# tabix requires the header line to start with '#'.
(echo -n "#" ; zcat "${out_vcf}" | cut -f "${CUT_COLUMNS}") | bgzip > dbNSFP${version}_grch37.stripped.gz

tabix -s "${SEQ_COL}" -b "${POS_COL}" -e "${POS_COL}" dbNSFP${version}_grch37.stripped.gz
Original file line number Diff line number Diff line change
Expand Up @@ -2,34 +2,41 @@

set -e

# Download 4.5 from https://sites.google.com/site/jpopgen/dbNSFP

# https://m.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#dbnsfp

# zcat dbNSFP4.5a_variant.chr1.gz | head -n1 > header.txt
# mkdir /tmp/dbsnp38
# zgrep -h -v ^#chr dbNSFP4.5a_variant.chr* | sort -T /tmp/dbsnp38 -k1,1 -k2,2n - | cat header.txt - | bgzip -c > dbNSFP4.5a_grch38.gz
# tabix -s 1 -b 2 -e 2 dbNSFP4.5a_grch38.gz


# All of this python is just to get the columns used in cut and tabix args

# Get dbNSFP fields used by VariantGrid - run python3 manage.py shell
# In [12]: ",".join(ColumnVEPField.get_source_fields(vep_plugin='d'))

# Get column names from dbNSFP data file
# df = pd.read_csv("./dbNSFP4.3a.grch38.gz", sep='\t', index_col=None, nrows=0)
# vep_fields = 'GERP++_RS,Interpro_domain,CADD_raw_rankscore,REVEL_rankscore,BayesDel_noAF_rankscore,ClinPred_rankscore,VEST4_rankscore,MetaLR_rankscore,Aloft_prob_Tolerant,Aloft_prob_Recessive,Aloft_prob_Dominant,Aloft_pred,Aloft_Confidence'
# df = pd.read_csv("header.txt", sep='\t', index_col=None, nrows=0)
# vep_fields = 'GERP++_RS,Interpro_domain,CADD_raw_rankscore,REVEL_rankscore,BayesDel_noAF_rankscore,ClinPred_rankscore,VEST4_rankscore,MetaLR_rankscore,Aloft_prob_Tolerant,Aloft_prob_Recessive,Aloft_prob_Dominant,Aloft_pred,Aloft_Confidence,AlphaMissense_rankscore,AlphaMissense_pred'
# columns = ['#chr', 'pos(1-based)', 'ref', 'alt', 'aaref', 'aaalt', 'Ensembl_transcriptid'] + vep_fields.split(",")
# cols = []
# for i in columns:
# cols.append(list(df.columns).index(i) + 1)
# ",".join([str(c) for c in sorted(cols)])
# columns are: '1,2,3,4,5,6,15,69,74,84,104,107,113,114,115,116,117,119,156,640'

# Download 4.3 from https://sites.google.com/site/jpopgen/dbNSFP

# https://m.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#dbnsfp

# zcat dbNSFP4.3a_variant.chr1.gz | head -n1 > h
# zgrep -h -v ^#chr dbNSFP4.3a_variant.chr* | sort -T /path/to/tmp_folder -k1,1 -k2,2n - | cat h - | bgzip -c > dbNSFP4.3a_grch38.gz
# tabix -s 1 -b 2 -e 2 dbNSFP4.3a_grch38.gz


IN_FILE=dbNSFP4.3a.grch38.gz
OUT_FILE=dbNSFP4.3a.grch38.stripped.gz

# Header needs to start with #
(echo -n "#" ; zcat ${IN_FILE} | cut -f 1,2,3,4,5,6,15,69,74,84,104,107,113,114,115,116,117,119,156,640 ) | bgzip > ${OUT_FILE}
tabix -s 1 -b 2 -e 2 ${OUT_FILE} # cols are: 1=chr, 2=pos

# print(",".join([str(c) for c in sorted(cols)]))
# columns are: '1,2,3,4,5,6,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705'

CUT_COLUMNS="1,2,3,4,5,6,15,69,74,84,106,109,139,140,142,143,144,145,146,148,185,705"
OUT_FILE=dbNSFP4.5a.grch38.stripped
TMP_DIR=${TMP_DIR:-/tmp}  # override for big runs, e.g. /hpcfs/groups/phoenix-hpc-sacgf/scratch/dbnsfp4.5_GRCh38
mkdir -p "${TMP_DIR}"

# header.txt must have been extracted first (see comments above); fail early
# rather than producing a headerless output file.
[[ -f header.txt ]] || { echo "header.txt not found - extract it from a dbNSFP variant file first" >&2; exit 1; }

# Sort chromosomes individually as that's much more efficient
cut -f "${CUT_COLUMNS}" header.txt > "${OUT_FILE}"
for chrom in {1..22} X Y; do
  zgrep -h -v '^#chr' dbNSFP4.5a_variant.chr${chrom}.gz | cut -f "${CUT_COLUMNS}" | sort -T "${TMP_DIR}" -k1,1 -k2,2n >> "${OUT_FILE}"
done

bgzip "${OUT_FILE}"
tabix -s 1 -b 2 -e 2 "${OUT_FILE}.gz"  # cols are: 1=chr, 2=pos
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash

set -e

# Downloads gnomAD sites VCFs (+ tabix indexes).
# NOTE: the original header said "gnomad v4.0" but every URL below is from the
# 2.1.1 release — corrected to avoid misleading future readers.

BASE_URL=https://gnomad-public-us-east-1.s3.amazonaws.com

# Exomes and genomes: one sites VCF (+ .tbi) per chromosome.
for subset in exomes genomes; do
  for chrom in {1..22} X Y; do
    wget "${BASE_URL}/release/2.1.1/vcf/${subset}/gnomad.${subset}.r2.1.1.sites.${chrom}.vcf.bgz"
    wget "${BASE_URL}/release/2.1.1/vcf/${subset}/gnomad.${subset}.r2.1.1.sites.${chrom}.vcf.bgz.tbi"
  done
done

# Structural variants (single genome-wide file)
wget "${BASE_URL}/papers/2019-sv/gnomad_v2.1_sv.sites.vcf.gz"
wget "${BASE_URL}/papers/2019-sv/gnomad_v2.1_sv.sites.vcf.gz.tbi"
Loading

0 comments on commit 6e6407c

Please sign in to comment.