Skip to content

Commit

Permalink
Comphet dn (#18)
Browse files Browse the repository at this point in the history
* support compound het with 1 end is a de novo

* tests for compound hets

* no underscore
  • Loading branch information
brentp authored Mar 18, 2019
1 parent d817143 commit eb9b99a
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 11 deletions.
3 changes: 1 addition & 2 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
v0.0.7 (dev)
============
+ [expr] allow --region to be a bed file
+ new command for finding compound-hets
+ [compound-hets] new command for finding compound-hets including where 1 side is a de novo (see: https://github.com/brentp/slivar/wiki/rare-disease#compound-heterozygotes)
+ default to send output to stdout for streaming
+ fix bug in some -g gnotate files

2 changes: 1 addition & 1 deletion slivar.nimble
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ license = "MIT"

# Dependencies

requires "hts >= 0.2.7", "nimgen", "binaryheap", "zip", "https://github.com/brentp/duktape-nim#dev", "https://github.com/brentp/bpbio", "https://github.com/brentp/nim-minizip", "argparse"
requires "hts >= 0.2.9", "nimgen", "binaryheap", "zip", "https://github.com/brentp/duktape-nim#dev", "https://github.com/brentp/bpbio", "https://github.com/brentp/nim-minizip", "argparse"
srcDir = "src"
installExt = @["nim"]

Expand Down
48 changes: 40 additions & 8 deletions src/slivarpkg/comphet.nim
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,37 @@ import argparse
import tables
import sets

template inherited(kid: pedfile.Sample, a:openarray[int8]): bool =
# this avoids checks as they are already done in is_compound het
a[kid.i] == 1 and (a[kid.mom.i] == 1 or a[kid.dad.i] == 1)

template denovo(kid: pedfile.Sample, a:openarray[int8]): bool =
# this avoids checks as they are already done in is_compound het
a[kid.i] == 1 and (a[kid.mom.i] == 0 and a[kid.dad.i] == 0)

proc compound_denovo(kid: pedfile.Sample, a:openarray[int8], b:openarray[int8]): bool {.inline.} =
## this should only be callled from is_compount_het as it avoids re-checking stuff already in there.
## it checks if there is transmitted het and 1
if kid.denovo(a) and kid.inherited(b): return true
if kid.denovo(b) and kid.inherited(a): return true
return false

proc is_compound_het*(kid: pedfile.Sample, a:openarray[int8], b:openarray[int8]): bool {.inline.} =
# check if the kid (with mom and dad) has either a classic compound het
# where each parent transmits 1 het to the kid or where 1 het is transmitted
# and the other is de novo.
doAssert kid.dad != nil and kid.mom != nil
if a[kid.i] != 1 or b[kid.i] != 1: return false

if a[kid.dad.i] == -1 or a[kid.mom.i] == -1: return false
if b[kid.dad.i] == -1 or b[kid.mom.i] == -1: return false

# if only a single het between the parents, only possiblity is a de novo.
if a[kid.dad.i] + a[kid.mom.i] + b[kid.dad.i] + b[kid.mom.i] == 1:
return compound_denovo(kid, a, b)

if a[kid.dad.i] + a[kid.mom.i] != 1: return false

if b[kid.dad.i] == -1 or b[kid.mom.i] == -1: return false
if b[kid.dad.i] + b[kid.mom.i] != 1: return false
# now we have 1 het parent at each site and kid het at both sites.
# check that different parents have the different alleles.
Expand Down Expand Up @@ -87,6 +110,7 @@ proc main*(dropfirst:bool=false) =
option("-p", "--ped", default="", help="required ped file describing the trios in the VCF")
option("-f", "--field", default="BCSQ", help="INFO field containing the gene name")
option("-i", "--index", default="2", help="(1-based) index of the gene-name in the field after splitting on '|'")
option("-o", "--out-vcf", default="/dev/stdout", help="path to output VCF/BCF")

let opts = p.parse()
if opts.ped == "":
Expand All @@ -106,7 +130,7 @@ proc main*(dropfirst:bool=false) =
if kids.len == 0:
quit &"[slivar] no trios found for {opts.ped} with {opts.vcf}"

if not open(ovcf, "/dev/stdout", mode="w"):
if not open(ovcf, opts.out_vcf, mode="w"):
quit "couldn't open output vcf"

if ivcf.header.add_info("slivar_comphet", ".", "String", "compound hets called by slivar. format is chrom/pos/ref/alt/gene/sample") != Status.OK:
Expand Down Expand Up @@ -153,16 +177,24 @@ when isMainModule:
import unittest
suite "compound hets":
test "simple":
var kid = Sample(id:"kid")
kid.mom = Sample(id:"mom")
kid.dad = Sample(id:"dad")
kid.i = 0
kid.dad.i = 1
kid.mom.i = 2
var kid = Sample(id:"kid", i:0)
kid.dad = Sample(id:"dad", i: 1)
kid.mom = Sample(id:"mom", i: 2)

check is_compound_het(kid, [1'i8, 0, 1], [1'i8, 1, 0])

check not is_compound_het(kid, [1'i8, 0, 1], [1'i8, 0, 1])
check not is_compound_het(kid, [1'i8, 1, 1], [1'i8, 0, 1])

check not is_compound_het(kid, [0'i8, 0, 1], [1'i8, 1, 0])

test "with denovo":
var kid = Sample(id:"kid", i:0)
kid.dad = Sample(id:"dad", i: 1)
kid.mom = Sample(id:"mom", i: 2)

# first is de novo, 2nd is inherited
check is_compound_het(kid, [1'i8, 0, 0], [1'i8, 1, 0])
check not is_compound_het(kid, [1'i8, 1, 0], [1'i8, 1, 0])

check is_compound_het(kid, [1'i8, 0, 1], [1'i8, 0, 0])
61 changes: 61 additions & 0 deletions tests/comphet.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
##fileformat=VCFv4.1
##ALT=<ID=*:DEL,Description="Represents any possible spanning deletion allele at this location">
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FILTER=<ID=LowQual,Description="Low quality">
##FILTER=<ID=VQSRTrancheSNP99.00to99.90,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -3.6527 <= x < 0.2363">
##FILTER=<ID=VQSRTrancheSNP99.90to100.00+,Description="Truth sensitivity tranche level for SNP model at VQS Lod < -2841.3422">
##FILTER=<ID=VQSRTrancheSNP99.90to100.00,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -2841.3422 <= x < -3.6527">
##FILTER=<ID=gatkRecommendIndelFilter,Description="QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=ClippingRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=BCSQ,Number=.,Type=String,Description="fake bcftools CSQ">
##INFO=<ID=expect,Number=.,Type=String,Description="should this be a compound het">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=NEGATIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the negative training set of bad variants">
##INFO=<ID=POSITIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the positive training set of good variants">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##INFO=<ID=SOR,Number=1,Type=Float,Description="Symmetric Odds Ratio of 2x2 contingency table to detect strand bias">
##INFO=<ID=VQSLOD,Number=1,Type=Float,Description="Log odds ratio of being a true variant versus being false under the trained gaussian mixture model">
##INFO=<ID=culprit,Number=1,Type=String,Description="The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out">
##INFO=<ID=set,Number=1,Type=String,Description="Source VCF for the merged record in CombineVariants">
##contig=<ID=1,length=249250621,assembly=b37>
##reference=file:///data/diag/diagInternal/tmp/temp/GATK_bundle_2.8/b37/human_g1k_v37_decoy.fasta
##source=SelectVariants
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG002 HG003 HG004
1 65797 . T C 32.92 PASS MQ=57.00;BCSQ=missense|geneA;expect=yes GT:AD:DP:GQ 0/1:9,9:18:20 0/1:9,9:18:20 0/0:11,0:11:20
1 65799 . T C 32.92 PASS MQ=57.00;BCSQ=missense|geneA;expect=yes GT:AD:DP:GQ 0/1:9,9:18:20 0/0:9,9:18:20 0/1:11,0:11:20
1 65800 . T C 32.92 PASS MQ=57.00;BCSQ=missense|geneB;expect=no,2DNs GT:AD:DP:GQ 0/1:9,9:18:20 0/0:9,9:18:20 0/0:11,0:11:20
1 65801 . T C 32.92 PASS MQ=57.00;BCSQ=missense|geneB;expect=no,2DNs GT:AD:DP:GQ 0/1:9,9:18:20 0/0:9,9:18:20 0/0:11,0:11:20
1 65802 . T C 32.92 PASS MQ=57.00;BCSQ=missense|geneC;expect=yes,1DN GT:AD:DP:GQ 0/1:9,9:18:20 0/0:9,9:18:20 0/0:11,0:11:20
1 65803 . T C 32.92 PASS MQ=57.00;BCSQ=missense|geneC;expect=yes,1DN GT:AD:DP:GQ 0/1:9,9:18:20 0/1:9,9:18:20 0/0:11,0:11:20
1 65804 . T C 32.92 PASS MQ=57.00;BCSQ=missense|geneD;expect=no,same-parent-with-het GT:AD:DP:GQ 0/1:9,9:18:20 0/1:9,9:18:20 0/0:11,0:11:20
1 65805 . T C 32.92 PASS MQ=57.00;BCSQ=missense|geneD;expect=no,same-parent-with-het GT:AD:DP:GQ 0/1:9,9:18:20 0/1:9,9:18:20 0/0:11,0:11:20
1 65806 . T C 32.92 PASS MQ=57.00;BCSQ=missense|geneE;expect=yes,multiple-matches GT:AD:DP:GQ 0/1:9,9:18:20 0/1:9,9:18:20 0/0:11,0:11:20
1 65807 . T C 32.92 PASS MQ=57.00;BCSQ=missense|geneE;expect=yes,multiple-matches GT:AD:DP:GQ 0/1:9,9:18:20 0/0:9,9:18:20 0/1:11,0:11:20
1 65808 . T C 32.92 PASS MQ=57.00;BCSQ=missense|geneE;expect=yes,multiple-matches GT:AD:DP:GQ 0/1:9,9:18:20 0/0:9,9:18:20 0/1:11,0:11:20
1 65810 . T C 32.92 PASS MQ=57.00;BCSQ=missense|geneF;expect=yes,with-extra-non-matching GT:AD:DP:GQ 0/1:9,9:18:20 0/1:9,9:18:20 0/0:11,0:11:20
1 65811 . T C 32.92 PASS MQ=57.00;BCSQ=missense|geneF;expect=yes,with-extra-non-matching GT:AD:DP:GQ 0/1:9,9:18:20 0/0:9,9:18:20 0/1:11,0:11:20
1 65812 . T C 32.92 PASS MQ=57.00;BCSQ=missense|geneF;expect=no,extra-non-matching GT:AD:DP:GQ 0/1:9,9:18:20 0/1:9,9:18:20 0/1:11,0:11:20
6 changes: 6 additions & 0 deletions tests/functional-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,9 @@ rm -f xx.bcf
run check_slivar_gnotate_load $exe gnotate --js tests/test-functions.js --expr "call_rate(variant)" -o xx.bcf tests/ashk-trio.vcf.gz
assert_equal 9834 $(bcftools view -H xx.bcf | wc -l)
rm -f xx.bcf

run check_compound_hets $exe compound-hets -v tests/comphet.vcf --ped tests/ashk-trio.ped -o ixx.vcf
assert_exit_code 0
assert_equal $(grep -c expect=yes tests/comphet.vcf) $(grep -c expect=yes ixx.vcf)
assert_equal $(grep -c expect=no ixx.vcf) 0
rm -f ixx.vcf

0 comments on commit eb9b99a

Please sign in to comment.