Merge pull request #3 from davemcg/Development

Peddy QC added, SeeGEM reports generated, SigmaAF added
davemcg · Jul 12, 2018 · d692fc4 · d692fc4
2 parents cc0c5cd + aa956b0
commit d692fc4
Show file tree

Hide file tree

Showing 5 changed files with 121 additions and 21 deletions.
diff --git a/src/GEMINI_db_to_SeeGEM.R b/src/GEMINI_db_to_SeeGEM.R
@@ -6,6 +6,7 @@ family_name <- args[2]
 output_html <- args[3]
 peddy_path <- args[4]
 
+cur_dir <- getwd()
 
 library(SeeGEM)
 
@@ -48,7 +49,7 @@ GEMINI_list$xlr <- gemini_test_wrapper(gemini_db,
                                        (is_coding=1 OR is_splicing=1 OR impact_severity='HIGH') 
                                        AND filter IS NULL",
                                        families = family_name)
-writeLines('XLRecessive test done')
+writeLines('XL Recessive test done')
 GEMINI_list$xld <- gemini_test_wrapper(gemini_db, 
                                        test = 'x_linked_dominant', 
                                        min_gq = 20, 
@@ -57,7 +58,7 @@ GEMINI_list$xld <- gemini_test_wrapper(gemini_db,
                                        (is_coding=1 OR is_splicing=1 OR impact_severity='HIGH') 
                                        AND filter IS NULL",
                                        families = family_name)
-writeLines('XLDominant test done')
+writeLines('XL Dominant test done')
 GEMINI_list$xldn <- gemini_test_wrapper(gemini_db, 
                                         test = 'x_linked_de_novo', 
                                         min_gq = 20, 
@@ -107,7 +108,7 @@ writeLines('ACMG test done')
 # data.table rbindlist will collapse each element of the list into one data frame
 # gemini_query_wrapper() and gemini_test_wrapper() will add the test name
 # to each query, so you can distinguish them later (via the 'test' column)
-my_GEMINI_data <- rbindlist(GEMINI_list, fill = TRUE)
+my_GEMINI_data <- data.table::rbindlist(GEMINI_list, fill = TRUE)
 
 # now that you've created the core data, you can create the reactive document
 # I'm assuming you've already run peddy on the same vcf you used to make the GEMINI
@@ -122,8 +123,8 @@ sample_ped <- gemini_query_wrapper(gemini_db,
                                                 family_name, "' \""))
 
 knit_see_gem(GEMINI_data = my_GEMINI_data, 
-             output_file = output_html, 
-             peddy_path_prefix = peddy_path, 
+             output_file = paste0(cur_dir, '/', output_html), 
+             peddy_path_prefix = paste0(cur_dir,'/',peddy_path), 
              peddy_id = sample_ped$name, 
              sample_name = family_name)
 

diff --git a/src/Snakefile b/src/Snakefile
@@ -39,11 +39,14 @@ wildcard_constraints:
 	family_name=family_name_con
 
 
-rule all:
-	input:
-		#expand('temp/{sample}.SORTED.VT.VEP.VCFANNO.vcf.gz', sample=SAMPLE)
-		#expand('{sample}.PED_{ped}.gemini.db', sample=SAMPLE, ped=PED),
-		expand('{sample}.{family_name}.PED_{ped}.excel.report.xlsx', sample=SAMPLE, ped=PEDfile, family_name=config['family_name'])
+if config['family_name'] == '':
+	rule all:
+		input:
+			expand('{sample}.PED_{ped}.gemini.db', sample=SAMPLE, ped=PEDfile)
+else:
+	rule all:
+		input:
+			expand('sample_reports/{sample}.{family_name}.PED_{ped}.SeeGEM.report.html', sample=SAMPLE, ped=PEDfile, family_name=config['family_name'])
 
 rule n_split_vcf:
 	input:
@@ -182,6 +185,21 @@ rule merge_vcf:
 		tabix -f -p vcf {output.vcf}
 		"""
 
+rule peddy_QC:
+	input:
+		'temp/{sample}.SORTED.VT.VEP.VCFANNO.vcf.gz'
+	output:
+		ped = '{sample}_PEDDY.ped_check.csv',
+		het = '{sample}_PEDDY.het_check.csv',
+		sex = '{sample}_PEDDY.sex_check.csv'
+	threads: 4
+	shell:
+		"""
+		module load {config[peddy_version]}
+		peddy -p {threads} {input} {config[ped]} --prefix {wildcards.sample}_PEDDY
+		"""
+
+
 # create gemini database
 rule make_gemini_db:
 	input:
@@ -199,13 +217,14 @@ rule make_gemini_db:
 # now write report for each family given in the yaml family_name section
 rule query_gemini:
 	input:
-		db = '{sample}.PED_{ped}.gemini.db'
-	params:
-		aaf = config['aaf_change']
+		db = '{sample}.PED_{ped}.gemini.db',
+		peddy_ped = '{sample}_PEDDY.ped_check.csv',
+		peddy_het = '{sample}_PEDDY.het_check.csv',
+		peddy_sex = '{sample}_PEDDY.sex_check.csv'
 	output:
-		report_name = '{sample}.{family_name}.PED_{ped}.excel.report.xlsx'
+		report_name = 'sample_reports/{sample}.{family_name}.PED_{ped}.SeeGEM.report.html'
 	shell:
 		"""
-		module load {config[gemini_version]}
-		bash /home/mcgaugheyd/git/variant_prioritization/query_gemini_wrapper.sh {input} {wildcards.family_name} {output} {params.aaf}
+		module load {config[R_version]}
+		Rscript {config[SeeGEM_script]} {input.db} {wildcards.family_name} {output} {wildcards.sample}_PEDDY
 		"""
diff --git a/src/build_sigmaAF.py b/src/build_sigmaAF.py
@@ -0,0 +1,71 @@
+# python3
+
+import os
+import subprocess
+import gzip
+
+os.chdir('/data/mcgaugheyd/genomes/1000G_phase2_GRCh37/gemini_annotation/')
+subprocess.call('rsync -av 165.112.66.100:/Volumes/Arges/2016-2018_John_Bryan_Archive/macbook_archive/Documents/git/rob_exac_gnomad/data/gnomad_genome_level_SumStatAnnotated.csv .', shell = True)
+#subprocess.call('wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_28/GRCh37_mapping/gencode.v28lift37.annotation.gtf.gz ../', shell = True)
+
+gtf = gzip.open('/data/mcgaugheyd/genomes/1000G_phase2_GRCh37/gencode.v28lift37.annotation.gtf.gz')
+
+# build gene <-> coord dictionary
+gtf_dict = {}
+for line in gtf:
+	line = line.decode('utf-8')
+	if line[0] == '#':
+		continue
+	if line.split()[2] == 'gene':
+		gene_name = [x for x in line.split(';') if 'gene_name' in x][0].split('"')[1].upper()
+		chrom = line.split()[0]
+		start = line.split()[3]
+		end = line.split()[4]
+		gtf_dict[gene_name] = chrom + '_' + start + '_' + end
+
+# read in sigmaAF csv (John Bryan)
+LoF_0001 = {}
+LoF_01 = {}
+missense_0001 = {}
+missense_01 = {}
+
+gnomad_sigmaAF = open('/data/mcgaugheyd/genomes/1000G_phase2_GRCh37/gemini_annotation/gnomad_genome_level_SumStatAnnotated.csv')
+for line in gnomad_sigmaAF:
+	#line = line.decode('utf-8')
+	line = line.split(',')
+	gene = line[2].replace('"', '').upper()
+	AF = line[1]
+	sumAF = line[4]
+	variation = line[3].replace('"', '')
+	if AF == '1e-04' and variation == 'LoF':
+		LoF_0001[gene] = sumAF
+	if AF == '0.01' and variation == 'LoF':
+		LoF_01[gene] = sumAF
+	if AF == '1e-04' and variation == 'missense_variant':
+		missense_0001[gene] = sumAF
+	if AF == '0.01' and variation == 'missense_variant':
+		missense_01[gene] = sumAF
+
+output = []
+for k, v in LoF_01.items():
+	try:
+		line = '\t'.join(gtf_dict[k].split('_')) + '\t' + \
+			k + '\t' + \
+			LoF_0001[k] + '\t' + \
+			LoF_01[k] +  '\t' + \
+			missense_0001[k] + '\t' + \
+			missense_01[k] 
+		output.append(line)
+	except:
+		print(k + ' is missing in gtf')
+
+out_file = open('/data/mcgaugheyd/genomes/1000G_phase2_GRCh37/gemini_annotation/sigmaAF.bed', 'w')
+for line in output:
+	out_file.write(line)
+	out_file.write('\n')
+
+subprocess.call('python3 ~/git/ChromosomeMappings/convert_notation.py -c ~/git/ChromosomeMappings/GRCh37_UCSC2ensembl.txt -f /data/mcgaugheyd/genomes/1000G_phase2_GRCh37/gemini_annotation/sigmaAF.bed \
+	| sort -k1,1 -k2,2n | bgzip -f > /data/mcgaugheyd/genomes/1000G_phase2_GRCh37/gemini_annotation/sigmaAF.ensembl.bed.gz', shell = True)
+subprocess.call('tabix -f -s 1 -b 2 -e 3 -0 /data/mcgaugheyd/genomes/1000G_phase2_GRCh37/gemini_annotation/sigmaAF.ensembl.bed.gz', shell = True)
+
+
diff --git a/src/config_variant_prioritization.yaml b/src/config_variant_prioritization.yaml
@@ -2,15 +2,19 @@ input_vcf: 'VCFs.GATK.vcf.gz'
 aaf_change: '0.1'
 family_name: ['COL036','COL076','COL091','COL114','COL130','COL158','COL159','DDL012','DDL013','DDL014','DDL015','DDL016','DDL017','DDL018','DDL019','DDL019','DDL020','DDL021','dor_yeshorim','dor_yeshorim','ambry','W89','W90','W97','W100','W107','W103','ZEIN_15-29-31','CCGO_FAM_800532','CCGO_FAM_800594','CCGO_FAM_800643','CCGO_FAM_800666','CCGO_FAM_800778','CCGO_FAM_800016and18','CCGO_FAM_800016and18','CCGO_FAM_800044','CCGO_FAM_800062','CCGO_FAM_800067and71','CCGO_FAM_800067and71','CCGO_FAM_800118','CCGO_FAM_800157','CCGO_FAM_800160','CCGO_FAM_800160','CCGO_FAM_800188','CCGO_FAM_800298','CCGO_FAM_800308','CCGO_FAM_800347','CCGO_FAM_800412','CCGO_FAM_800513','CCGO_FAM_800520','CCGO_FAM_800524','W23','DDL001','DDL002','DDL003','DDL004','DDL004','DDL004','DDL005','DDL005','DDL006','DDL007','DDL008','DDL009','DDL010','DDL011','DDL012','fam5','fam5','fam5','fam34','fam34']
 ped: '/home/mcgaugheyd/git/NGS_db/master.ped'
+pick: 'canonical'
 gemini_db_name: 'gemini_db/MVL_2017-12-14.gemini.db'
 samtools_version: 'samtools/1.6'
 ref_genome: '/data/OGVFB/resources/human_g1k_v37_decoy.fasta'
-VEP_version: 'VEP/89'
-vcfanno_version: 'vcfanno/0.1.1'
+VEP_version: 'VEP/92'
+vcfanno_version: 'vcfanno/0.2.9'
 vcfanno_lua: '/home/$USER/git/casey_to_gemini/src/vcfanno_custom.lua'
 vcfanno_conf: '/home/$USER/git/casey_to_gemini/src/vcfanno_exomes.conf'
-vcf2db_version: 'vcf2db/7dfc48a'
+vcf2db_version: 'vcf2db/2017.12.11'
 gemini_version: 'gemini/0.20.1'
-R_version: 'R/3.4.0_gcc-6.2.0'
-pandoc_version: 'pandoc/1.15.0.6'
+peddy_version: 'peddy/0.3.1'
+R_version: 'R/3.5'
+pandoc_version: 'pandoc/2.1.1'
+gemini_lenient: 'Yes'
+SeeGEM_script: '/home/$USER/git/variant_prioritization/src/GEMINI_db_to_SeeGEM.R'
 regions: '/home/mcgaugheyd/git/variant_prioritization/src/vcf_region_split_100_coords.txt'
diff --git a/src/vcfanno_v2.conf b/src/vcfanno_v2.conf
@@ -251,6 +251,11 @@ columns=[2]
 ops=["flag"]
 names=["cse-hiseq"]
 
+[[annotation]]
+file="/data/mcgaugheyd/genomes/1000G_phase2_GRCh37/gemini_annotation/sigmaAF.ensembl.bed.gz"
+columns=[5,6,7,8]
+ops=["max","max","max","max"]
+names=["SigmaAF_LoF_0001", "SigmaAF_LoF_01",  "SigmaAF_Missense_0001", "SigmaAF_Missense_01"]
 
 [[postannotation]]
 fields=["ac_exac_all", "an_exac_all"]