-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathvcf2dadi.py
114 lines (79 loc) · 3.22 KB
/
vcf2dadi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/python
# Simple conversion tool from VCF to dadi's SNP input format
import vcf
import argparse
from collections import OrderedDict
parser = argparse.ArgumentParser(description="Conversion tool from VCF to SNP"
" input format from dadi")
parser.add_argument("-vcf", dest="vcf_file", help="Path to vcf file")
parser.add_argument("-p", dest="population_file", help="Populations file. There"
" should be one line per population, with the name of the"
" population separated from the samples by a colon (':') "
" and each sample separated by whitespace")
parser.add_argument("-o", dest="output_file", help="Name of the output file")
arg = parser.parse_args()
def parse_populations(pop_file):
"""
Parses a population file and returns an orderedDict object with the
population name as keys and the corresponding list of samples as values
"""
pops = OrderedDict()
fh = open(pop_file)
for line in fh:
fields = line.split(":")
pops[fields[0]] = [x for x in fields[1].split() if x]
return pops
def get_data_from_vcf(vcf_file, pops_dic):
"""
Uses the vcf module to parse and retrieve data from a VCF file. The data
is returned as a list of lists
"""
storage = [["Reference", "Alternative", "Allele1"] + list(pops_dic.keys()) + ["\tAllele2"] + list(pops_dic.keys()) + ["\tChr", "Pos"]]
# Get vcf handle
vcf_handle = vcf.Reader(open(vcf_file))
for record in vcf_handle:
# Ignore if more than two alleles
if len(record.alleles) > 2:
continue
# Get major and minor alleles for first two columns
ref_allele = record.alleles[0]
alt_allele = str(record.alleles[1])
# Get allele counts for populations
pop_ref_counts = dict((x, 0) for x in pops_dic)
pop_alt_counts = dict((x, 0) for x in pops_dic)
for pop, samples in pops_dic.items():
for ind in samples:
# Count ref alleles
try:
pop_ref_counts[pop] += \
record.genotype(ind).gt_bases.count(ref_allele)
except AttributeError:
pass
try:
pop_alt_counts[pop] += \
record.genotype(ind).gt_bases.count(alt_allele)
except AttributeError:
pass
# Create line record
ref_snippet = "-{}-".format(ref_allele)
alt_snippet = "-{}-".format(alt_allele)
line_record = [ref_snippet, alt_snippet, ref_allele] + list([str(x) for x in pop_ref_counts.values()]) + [alt_allele] + list([str(x) for x in pop_alt_counts.values()]) + [str(record.CHROM), str(record.POS)]
storage.append(line_record)
return storage
def write_snp_file(records, output_file):
"""
Writes the records into a SNP file format
"""
fh = open(output_file, "w")
for rec in records:
fh.write("\t".join(rec) + "\n")
fh.close()
def main():
# Get arguments
vcf_file = arg.vcf_file
output_file = arg.output_file
pop_file = arg.population_file
pop_dic = parse_populations(pop_file)
records = get_data_from_vcf(vcf_file, pop_dic)
write_snp_file(records, output_file)
main()