diff --git a/annotation/annotation_data/generate_annotation/gnomad3_create_genome_scripts.py b/annotation/annotation_data/generate_annotation/gnomad3_create_genome_scripts.py
index 054e77c4e..822013d1f 100755
--- a/annotation/annotation_data/generate_annotation/gnomad3_create_genome_scripts.py
+++ b/annotation/annotation_data/generate_annotation/gnomad3_create_genome_scripts.py
@@ -1,12 +1,6 @@
 #!/usr/bin/env python3
 """
-We want to do this per-chrom so we can process in parallel
-
-Steps are:
-    1. Download exomes.vcf + genome.vcf, removing most INFO fields before writing to disk (to reduce disk space)
-    2. Merge exome + genome, summing counts
-    3. Run through this script with --af to calculate allele frequency, write TSV (more efficient than VCF)
-    4. Cat them all together again
+    The gnomAD v3.1.2 data set contains 76,156 whole genomes (and no exomes), all mapped to the GRCh38 reference sequence.
 """
 
 from argparse import ArgumentParser
@@ -51,6 +45,7 @@ def main(args):
         chrom_scripts.append(chrom_script)
         with open(chrom_script, "w") as cs:
             cs.write(bash_header)
-            url = f"https://storage.googleapis.com/gcp-public-data--gnomad/release/3.1.2/vcf/genomes/gnomad.genomes.v3.1.3.sites.chr{chrom}.vcf.bgz"
+            # gnomAD3.1 only has genomes, no exomes
+            url = f"https://storage.googleapis.com/gcp-public-data--gnomad/release/3.1.2/vcf/genomes/gnomad.genomes.v3.1.2.sites.chr{chrom}.vcf.bgz"
             output_vcf = f"{prefix}.filtered_info.vcf.gz"
             if args.chrom_mapping_file:
diff --git a/annotation/annotation_data/generate_annotation/gnomad4_download_exomes.sh b/annotation/annotation_data/generate_annotation/gnomad4_download_exomes.sh
new file mode 100644
index 000000000..1375d380b
--- /dev/null
+++ b/annotation/annotation_data/generate_annotation/gnomad4_download_exomes.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+
+# Exome sites VCFs and their tabix indexes, one pair per chromosome
+
+# gnomad v4
+for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
+    wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr${chrom}.vcf.bgz
+    wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr${chrom}.vcf.bgz.tbi
+done
\ No newline at end of file
diff --git a/annotation/annotation_data/generate_annotation/gnomad4_download_genomes.sh b/annotation/annotation_data/generate_annotation/gnomad4_download_genomes.sh
new file mode 100644
index 000000000..dcb8b239d
--- /dev/null
+++ b/annotation/annotation_data/generate_annotation/gnomad4_download_genomes.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+
+# Genome sites VCFs and their tabix indexes, one pair per chromosome
+
+# gnomad v4
+for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
+    wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr${chrom}.vcf.bgz
+    wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr${chrom}.vcf.bgz.tbi
+done
\ No newline at end of file
diff --git a/annotation/annotation_data/generate_annotation/gnomad4_download_structural.sh b/annotation/annotation_data/generate_annotation/gnomad4_download_structural.sh
new file mode 100644
index 000000000..ea9a7381c
--- /dev/null
+++ b/annotation/annotation_data/generate_annotation/gnomad4_download_structural.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+THIS_DIR=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
+
+# Structural variants
+SV_COLUMNS=SVLEN,SVTYPE,END
+COLS=AC,AN,AF
+OTHER_COUNTS=N_HOMREF,N_HET,N_HOMALT
+SUBPOPS=afr_AF,amr_AF,asj_AF,eas_AF,fin_AF,mid_AF,nfe_AF,oth_AF,sas_AF
+
+KEEP_COLUMNS=${SV_COLUMNS},${COLS},${OTHER_COUNTS},${SUBPOPS}
+CHROM_MAPPING_FILE=${THIS_DIR}/../../../snpdb/genome/chrom_mapping_GRCh38.map
+GENOME_FASTA=/data/annotation/fasta/GCF_000001405.40_GRCh38.p14_genomic.fna.gz
+
+# gnomad v4
+for chrom in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y; do
+    GNOMAD_VCF=gnomad.v4.0.sv.chr${chrom}.vcf.gz
+    wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/genome_sv/${GNOMAD_VCF}
+    wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.0/genome_sv/${GNOMAD_VCF}.tbi
+
+    OUTPUT_VCF=
+    # TODO (not enabled yet, OUTPUT_VCF is still unset): bcftools annotate --exclude 'AC=0' --remove '^${KEEP_COLUMNS}' --rename-chrs=${CHROM_MAPPING_FILE} | vt normalize - -r ${GENOME_FASTA} -o + | vt uniq + -o ${OUTPUT_VCF}
+
+done
+
+
+# OTHER_INFOS = ["AC_popmax", "AN_popmax", "AF_popmax", "popmax", "nhomalt", "nhomalt_popmax", "nonpar"]
+# GNOMAD_SUB_POPS = ["afr", "amr", "asj", "eas", "fin", "mid", "nfe", "oth", "sas"]  # Will get AF for each
+
+# These have been removed in v4 - "AC_popmax", "AN_popmax", "AF_popmax"
+# nonpar is now "par"
+
+
diff --git a/snpdb/genome/chrom_mapping_GRCh37.map b/snpdb/genome/chrom_mapping_GRCh37.map
new file mode 100644
index 000000000..519a97085
--- /dev/null
+++ b/snpdb/genome/chrom_mapping_GRCh37.map
@@ -0,0 +1,25 @@
+1 NC_000001.10
+2 NC_000002.11
+3 NC_000003.11
+4 NC_000004.11
+5 NC_000005.9
+6 NC_000006.11
+7 NC_000007.13
+8 NC_000008.10
+9 NC_000009.11
+10 NC_000010.10
+11 NC_000011.9
+12 NC_000012.11
+13 NC_000013.10
+14 NC_000014.8
+15 NC_000015.9
+16 NC_000016.9
+17 NC_000017.10
+18 NC_000018.9
+19 NC_000019.9
+20 NC_000020.10
+21 NC_000021.8
+22 NC_000022.10
+X NC_000023.10
+Y NC_000024.9
+MT NC_012920.1
diff --git a/snpdb/genome/chrom_mapping_GRCh38.map b/snpdb/genome/chrom_mapping_GRCh38.map
new file mode 100644
index 000000000..4fd274f76
--- /dev/null
+++ b/snpdb/genome/chrom_mapping_GRCh38.map
@@ -0,0 +1,25 @@
+1 NC_000001.11
+2 NC_000002.12
+3 NC_000003.12
+4 NC_000004.12
+5 NC_000005.10
+6 NC_000006.12
+7 NC_000007.14
+8 NC_000008.11
+9 NC_000009.12
+10 NC_000010.11
+11 NC_000011.10
+12 NC_000012.12
+13 NC_000013.11
+14 NC_000014.9
+15 NC_000015.10
+16 NC_000016.10
+17 NC_000017.11
+18 NC_000018.10
+19 NC_000019.10
+20 NC_000020.11
+21 NC_000021.9
+22 NC_000022.11
+X NC_000023.11
+Y NC_000024.10
+MT NC_012920.1