Skip to content

Commit

Permalink
Merge pull request #3 from davemcg/Development
Browse files Browse the repository at this point in the history
Peddy QC added, SeeGEM reports generated, SigmaAF added
  • Loading branch information
davemcg authored Jul 12, 2018
2 parents cc0c5cd + aa956b0 commit d692fc4
Show file tree
Hide file tree
Showing 5 changed files with 121 additions and 21 deletions.
11 changes: 6 additions & 5 deletions src/GEMINI_db_to_SeeGEM.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ family_name <- args[2]
output_html <- args[3]
peddy_path <- args[4]

cur_dir <- getwd()

library(SeeGEM)

Expand Down Expand Up @@ -48,7 +49,7 @@ GEMINI_list$xlr <- gemini_test_wrapper(gemini_db,
(is_coding=1 OR is_splicing=1 OR impact_severity='HIGH')
AND filter IS NULL",
families = family_name)
writeLines('XLRecessive test done')
writeLines('XL Recessive test done')
GEMINI_list$xld <- gemini_test_wrapper(gemini_db,
test = 'x_linked_dominant',
min_gq = 20,
Expand All @@ -57,7 +58,7 @@ GEMINI_list$xld <- gemini_test_wrapper(gemini_db,
(is_coding=1 OR is_splicing=1 OR impact_severity='HIGH')
AND filter IS NULL",
families = family_name)
writeLines('XLDominant test done')
writeLines('XL Dominant test done')
GEMINI_list$xldn <- gemini_test_wrapper(gemini_db,
test = 'x_linked_de_novo',
min_gq = 20,
Expand Down Expand Up @@ -107,7 +108,7 @@ writeLines('ACMG test done')
# data.table rbindlist will collapse each element of the list into one data frame
# gemini_query_wrapper() and gemini_test_wrapper() will add the test name
# to each query, so you can distinguish them later (via the 'test' column)
my_GEMINI_data <- rbindlist(GEMINI_list, fill = TRUE)
my_GEMINI_data <- data.table::rbindlist(GEMINI_list, fill = TRUE)

# now that you've created the core data, you can create the reactive document
# I'm assuming you've already run peddy on the same vcf you used to make the GEMINI
Expand All @@ -122,8 +123,8 @@ sample_ped <- gemini_query_wrapper(gemini_db,
family_name, "' \""))

knit_see_gem(GEMINI_data = my_GEMINI_data,
output_file = output_html,
peddy_path_prefix = peddy_path,
output_file = paste0(cur_dir, '/', output_html),
peddy_path_prefix = paste0(cur_dir,'/',peddy_path),
peddy_id = sample_ped$name,
sample_name = family_name)

Expand Down
41 changes: 30 additions & 11 deletions src/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,14 @@ wildcard_constraints:
family_name=family_name_con


rule all:
input:
#expand('temp/{sample}.SORTED.VT.VEP.VCFANNO.vcf.gz', sample=SAMPLE)
#expand('{sample}.PED_{ped}.gemini.db', sample=SAMPLE, ped=PED),
expand('{sample}.{family_name}.PED_{ped}.excel.report.xlsx', sample=SAMPLE, ped=PEDfile, family_name=config['family_name'])
if config['family_name'] == '':
rule all:
input:
expand('{sample}.PED_{ped}.gemini.db', sample=SAMPLE, ped=PEDfile)
else:
rule all:
input:
expand('sample_reports/{sample}.{family_name}.PED_{ped}.SeeGEM.report.html', sample=SAMPLE, ped=PEDfile, family_name=config['family_name'])

rule n_split_vcf:
input:
Expand Down Expand Up @@ -182,6 +185,21 @@ rule merge_vcf:
tabix -f -p vcf {output.vcf}
"""

rule peddy_QC:
input:
'temp/{sample}.SORTED.VT.VEP.VCFANNO.vcf.gz'
output:
ped = '{sample}_PEDDY.ped_check.csv',
het = '{sample}_PEDDY.het_check.csv',
sex = '{sample}_PEDDY.sex_check.csv'
threads: 4
shell:
"""
module load {config[peddy_version]}
peddy -p {threads} {input} {config[ped]} --prefix {wildcards.sample}_PEDDY
"""


# create gemini database
rule make_gemini_db:
input:
Expand All @@ -199,13 +217,14 @@ rule make_gemini_db:
# now write report for each family given in the yaml family_name section
rule query_gemini:
input:
db = '{sample}.PED_{ped}.gemini.db'
params:
aaf = config['aaf_change']
db = '{sample}.PED_{ped}.gemini.db',
peddy_ped = '{sample}_PEDDY.ped_check.csv',
peddy_het = '{sample}_PEDDY.het_check.csv',
peddy_sex = '{sample}_PEDDY.sex_check.csv'
output:
report_name = '{sample}.{family_name}.PED_{ped}.excel.report.xlsx'
report_name = 'sample_reports/{sample}.{family_name}.PED_{ped}.SeeGEM.report.html'
shell:
"""
module load {config[gemini_version]}
bash /home/mcgaugheyd/git/variant_prioritization/query_gemini_wrapper.sh {input} {wildcards.family_name} {output} {params.aaf}
module load {config[R_version]}
Rscript {config[SeeGEM_script]} {input.db} {wildcards.family_name} {output} {wildcards.sample}_PEDDY
"""
71 changes: 71 additions & 0 deletions src/build_sigmaAF.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# python3

import os
import subprocess
import gzip

os.chdir('/data/mcgaugheyd/genomes/1000G_phase2_GRCh37/gemini_annotation/')
subprocess.call('rsync -av 165.112.66.100:/Volumes/Arges/2016-2018_John_Bryan_Archive/macbook_archive/Documents/git/rob_exac_gnomad/data/gnomad_genome_level_SumStatAnnotated.csv .', shell = True)
#subprocess.call('wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_28/GRCh37_mapping/gencode.v28lift37.annotation.gtf.gz ../', shell = True)

gtf = gzip.open('/data/mcgaugheyd/genomes/1000G_phase2_GRCh37/gencode.v28lift37.annotation.gtf.gz')

# build gene <-> coord dictionary
gtf_dict = {}
for line in gtf:
line = line.decode('utf-8')
if line[0] == '#':
continue
if line.split()[2] == 'gene':
gene_name = [x for x in line.split(';') if 'gene_name' in x][0].split('"')[1].upper()
chrom = line.split()[0]
start = line.split()[3]
end = line.split()[4]
gtf_dict[gene_name] = chrom + '_' + start + '_' + end

# read in sigmaAF csv (John Bryan)
LoF_0001 = {}
LoF_01 = {}
missense_0001 = {}
missense_01 = {}

gnomad_sigmaAF = open('/data/mcgaugheyd/genomes/1000G_phase2_GRCh37/gemini_annotation/gnomad_genome_level_SumStatAnnotated.csv')
for line in gnomad_sigmaAF:
#line = line.decode('utf-8')
line = line.split(',')
gene = line[2].replace('"', '').upper()
AF = line[1]
sumAF = line[4]
variation = line[3].replace('"', '')
if AF == '1e-04' and variation == 'LoF':
LoF_0001[gene] = sumAF
if AF == '0.01' and variation == 'LoF':
LoF_01[gene] = sumAF
if AF == '1e-04' and variation == 'missense_variant':
missense_0001[gene] = sumAF
if AF == '0.01' and variation == 'missense_variant':
missense_01[gene] = sumAF

output = []
for k, v in LoF_01.items():
try:
line = '\t'.join(gtf_dict[k].split('_')) + '\t' + \
k + '\t' + \
LoF_0001[k] + '\t' + \
LoF_01[k] + '\t' + \
missense_0001[k] + '\t' + \
missense_01[k]
output.append(line)
except:
print(k + ' is missing in gtf')

out_file = open('/data/mcgaugheyd/genomes/1000G_phase2_GRCh37/gemini_annotation/sigmaAF.bed', 'w')
for line in output:
out_file.write(line)
out_file.write('\n')

subprocess.call('python3 ~/git/ChromosomeMappings/convert_notation.py -c ~/git/ChromosomeMappings/GRCh37_UCSC2ensembl.txt -f /data/mcgaugheyd/genomes/1000G_phase2_GRCh37/gemini_annotation/sigmaAF.bed \
| sort -k1,1 -k2,2n | bgzip -f > /data/mcgaugheyd/genomes/1000G_phase2_GRCh37/gemini_annotation/sigmaAF.ensembl.bed.gz', shell = True)
subprocess.call('tabix -f -s 1 -b 2 -e 3 -0 /data/mcgaugheyd/genomes/1000G_phase2_GRCh37/gemini_annotation/sigmaAF.ensembl.bed.gz', shell = True)


14 changes: 9 additions & 5 deletions src/config_variant_prioritization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,19 @@ input_vcf: 'VCFs.GATK.vcf.gz'
aaf_change: '0.1'
family_name: ['COL036','COL076','COL091','COL114','COL130','COL158','COL159','DDL012','DDL013','DDL014','DDL015','DDL016','DDL017','DDL018','DDL019','DDL019','DDL020','DDL021','dor_yeshorim','dor_yeshorim','ambry','W89','W90','W97','W100','W107','W103','ZEIN_15-29-31','CCGO_FAM_800532','CCGO_FAM_800594','CCGO_FAM_800643','CCGO_FAM_800666','CCGO_FAM_800778','CCGO_FAM_800016and18','CCGO_FAM_800016and18','CCGO_FAM_800044','CCGO_FAM_800062','CCGO_FAM_800067and71','CCGO_FAM_800067and71','CCGO_FAM_800118','CCGO_FAM_800157','CCGO_FAM_800160','CCGO_FAM_800160','CCGO_FAM_800188','CCGO_FAM_800298','CCGO_FAM_800308','CCGO_FAM_800347','CCGO_FAM_800412','CCGO_FAM_800513','CCGO_FAM_800520','CCGO_FAM_800524','W23','DDL001','DDL002','DDL003','DDL004','DDL004','DDL004','DDL005','DDL005','DDL006','DDL007','DDL008','DDL009','DDL010','DDL011','DDL012','fam5','fam5','fam5','fam34','fam34']
ped: '/home/mcgaugheyd/git/NGS_db/master.ped'
pick: 'canonical'
gemini_db_name: 'gemini_db/MVL_2017-12-14.gemini.db'
samtools_version: 'samtools/1.6'
ref_genome: '/data/OGVFB/resources/human_g1k_v37_decoy.fasta'
VEP_version: 'VEP/89'
vcfanno_version: 'vcfanno/0.1.1'
VEP_version: 'VEP/92'
vcfanno_version: 'vcfanno/0.2.9'
vcfanno_lua: '/home/$USER/git/casey_to_gemini/src/vcfanno_custom.lua'
vcfanno_conf: '/home/$USER/git/casey_to_gemini/src/vcfanno_exomes.conf'
vcf2db_version: 'vcf2db/7dfc48a'
vcf2db_version: 'vcf2db/2017.12.11'
gemini_version: 'gemini/0.20.1'
R_version: 'R/3.4.0_gcc-6.2.0'
pandoc_version: 'pandoc/1.15.0.6'
peddy_version: 'peddy/0.3.1'
R_version: 'R/3.5'
pandoc_version: 'pandoc/2.1.1'
gemini_lenient: 'Yes'
SeeGEM_script: '/home/$USER/git/variant_prioritization/src/GEMINI_db_to_SeeGEM.R'
regions: '/home/mcgaugheyd/git/variant_prioritization/src/vcf_region_split_100_coords.txt'
5 changes: 5 additions & 0 deletions src/vcfanno_v2.conf
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,11 @@ columns=[2]
ops=["flag"]
names=["cse-hiseq"]

[[annotation]]
file="/data/mcgaugheyd/genomes/1000G_phase2_GRCh37/gemini_annotation/sigmaAF.ensembl.bed.gz"
columns=[5,6,7,8]
ops=["max","max","max","max"]
names=["SigmaAF_LoF_0001", "SigmaAF_LoF_01", "SigmaAF_Missense_0001", "SigmaAF_Missense_01"]

[[postannotation]]
fields=["ac_exac_all", "an_exac_all"]
Expand Down

0 comments on commit d692fc4

Please sign in to comment.