-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvcf-consensus.py
137 lines (92 loc) · 6.1 KB
/
vcf-consensus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/env python
Usage = """
vcf-consensus.py - Becca Love, 2 June 2016
Take a VCF file and a reference genome, and construct a consensus sequence from a particular sample with sensitivity to read counts. For example, a SNP called as a heterozygote with reference/alternate read counts of 6/4 is more likely to be incorporated into the reference sequence than a SNP called as 10/3.
Importantly, this is a stochastic process and the same consensus sequence will not be constructed each time from the same sample and reference genome. It may also change the masking of certain loci because the VCF input is masking-blind.
This was designed for use with VCFs from pooled data, where read counts may be much higher.
Optionally, the SNPs may be filtered during this process by passing the argument -q with a minimum site quality, corresponding to field 6 of the VCF.
Currently, this script only works on biallelic SNPs.
Usage:
vcf-consensus.py -v vcf -o outprefix -s sample -r reference -q min_SNP_quality
Depends:
Biopython https://biopython.org/DIST/docs/tutorial/Tutorial.html
vcf/pysam https://pysam.readthedocs.io/en/latest/api.html
"""
##to-do: implement support for multiple samples in a population
import argparse
import copy
import random
from Bio import SeqIO
import vcf
parser = argparse.ArgumentParser(description="Generate a consensus sequence from one or more samples in a VCF file, taking into account minor allele frequencies.")
parser.add_argument('-v','--vcf', action="store", dest="vcfname", type=str, help='VCF file used to generate consensus sequence. Required.', required=True)
parser.add_argument('-o','--output', action="store", dest="outbase", type=str, help='Prefix for output files. If no prefix is provided, the sample name specified is used.')##note below, default is sample name
parser.add_argument('-s','--sample', action="store", dest="sample", type=str, help='1 sample name for which to generate a consensus. Required.', required=True)
parser.add_argument('-r','--reference', action="store", dest="reference", type=str, help='The reference sequence against which the VCF file was generated. Required.', required=True)
parser.add_argument('-q','--quality',action="store",dest="quality", default=0, type=float, help='Minimum quality for considering a locus. Default: 0.')
parser.add_argument('-x', '--verbose', action="store_true", help='Run the program with progress messages.')
args = parser.parse_args()
if not args.outbase:
args.outbase = args.sample
def analyze_locus(record,sample,verbose=False):
altFreq = 0
cumAlts = 0
cumReads = 0
##generate a random number to which to compare the alternate allele frequency
testBound = random.uniform(0.0,1.0)##should I use scipy.random.rand() here again?
if record.genotype(sample)["GT"] == "./.":
if verbose is True:
print "Warning: missing genotype in " + sample + " at " + record.CHROM + ":" + str(record.POS) + "; using reference allele"
return record.REF
elif record.genotype(sample)["AD"] and record.genotype(sample)["GT"] != "./.":
if len(record.genotype(sample)["AD"]) == 2:
##for a two-allele locus, the # of reads for the alternate allele will always be 2nd in the list generated by ["AD"].
cumAlts += float(record.genotype(sample)["AD"][1]) ##get # reads supporting alternate allele
cumReads += float(sum(record.genotype(sample)["AD"])) ##get total # reads
elif len(record.genotype(sample)["AD"]) > 2: ##only biallelic loci are supported for now
##to-do: accommodate multiallelic loci
if verbose is True:
print "Warning: skipping multi-allelic locus at " + record.CHROM + ":" + str(record.POS) + "; using reference allele"
return record.REF
altFreq = float(cumAlts) / float(cumReads)
if altFreq > testBound:
print altFreq, testBound
print "altFreq > testBound at " + record.CHROM + ":" + str(record.POS)
return str(record.ALT[0])
elif altFreq <= testBound:
return record.REF
vcf_reader = vcf.Reader(open(args.vcfname,'r'))
reference = SeqIO.to_dict(SeqIO.parse(args.reference, "fasta"))
try:
##make sure that the reference genome and the supplied VCF file have the same contigs
assert set(vcf_reader.contigs.keys()) == set(reference.keys())
except:
print "Error: contigs of reference and VCF do not match"
raise SystemExit
##copy the reference contigs to the contigs that will be edited, and make a dictionary to hold them
outContigs = {}
for contig in vcf_reader.contigs.keys():
outContigs[contig] = copy.copy(reference[contig])
outContigs[contig].seq = outContigs[contig].seq.tomutable()
##keep track of number of variants for troubleshooting and checking
numberSeen = 0
numberChanged = 0
for record in vcf_reader:
##make sure the reference alleles match in the reference genome and the VCF
assert record.REF == reference[record.CHROM].seq[record.POS-1].upper(), "Error: reference and VCF do not match" ##check this for off-by-one error
##ignore filtered sites, indels, and sites not passing the quality threshold
if record.QUAL >= args.quality and record.is_snp == True and not record.FILTER:
outContigs[record.CHROM].seq[record.POS-1] = analyze_locus(record,args.sample,args.verbose)
if outContigs[record.CHROM][record.POS-1] != reference[record.CHROM].seq[record.POS-1].upper():
numberChanged += 1
else:
if args.verbose:
print "Skipping poor-quality site at " + record.CHROM + ":" + str(record.POS) + "; using reference allele"
if args.verbose:
print "Finished processing " + record.CHROM + ":" + str(record.POS)
numberSeen += 1
outFile = args.outbase + ".consensus.fasta"
SeqIO.write(list(outContigs.values()), outFile, "fasta")
if args.verbose:
print "Saw " + str(numberSeen) + " variants"
print "Changed " + str(numberChanged) + " sites from reference"