-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy patholdloci2phy.py
108 lines (83 loc) · 3.22 KB
/
oldloci2phy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/python3
# Copyright 2015 Francisco Pina Martins <[email protected]>
# This file is part of loci2phy.
# loci2phy is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# loci2phy is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with loci2phy. If not, see <http://www.gnu.org/licenses/>.
# Usage: python3 loci2phy.py file.vcf file.loci file.phy
def vcf_parser(vcf_filename):
"""Parses a VCF file and returns a sorted list with loci names and a list
with taxa names."""
vcf = open(vcf_filename, 'r')
loci = []
seqnames = {}
for line in vcf:
if line.startswith("##"):
pass
elif line.startswith("#CHROM"):
for names in line.split()[9:]:
seqnames[names] = ""
else:
loci.append(line.split()[0])
vcf.close()
loci = sorted(list(set(loci)))
return loci, seqnames
def loci_parser(loci_filename, loci, seqnames):
"""Gets a loci list, and a loci file and sequence names and filters the loci
file according to the loci list. Returns a dict {seqname: sequence}"""
loci_file = open(loci_filename, 'r')
if loci[0] == "0":
estouinteressado = 1
else:
estouinteressado = 0
c = 0
seqlen = 0
totlen = 0
vcfseqs = set(seqnames.keys())
#taxaset = set(vcfseqs)
taxaset = set()
seqlines= " "
for lines in loci_file:
if estouinteressado == 1 and lines.startswith("//") == False:
seqlines = lines.strip(">\n").split()
seqnames[seqlines[0]] += seqlines[1]
taxaset.add(seqlines[0])
elif lines.startswith("//") and estouinteressado == 1:
seqlen = len(seqlines[1])
totlen += seqlen
difset = vcfseqs.difference(taxaset)
for t in difset:
seqnames[t] += "N" * seqlen
taxaset = set()
c += 1
estouinteressado = 0
if str(c) in loci:
estouinteressado = 1
elif estouinteressado == 0 and lines.startswith("//"):
c += 1
if str(c) in loci:
estouinteressado = 1
loci_file.close()
return seqnames
def phy_writer(phy_filename, seqnames):
"""Writes the output ready to submit to RAxML or other phylogeny
program. Based on seqnames dict {seqname: sequence}"""
phy = open(phy_filename, 'w')
seqnum = len(seqnames)
bpnum = len(list(seqnames.values())[0])
phy.write(str(seqnum) + " " + str(bpnum) + "\n")
for k, v in seqnames.items():
phy.write(k + "\t" + v + "\n")
phy.close()
if __name__ == "__main__":
from sys import argv
loci, seqnames = vcf_parser(argv[1])
seqnames = loci_parser(argv[2], loci, seqnames)
phy_writer(argv[3], seqnames)