-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsplitCons.py
102 lines (79 loc) · 3.33 KB
/
splitCons.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# Curated Hepatitis B Virus Alignments from GenBank Data
# Copyright (C) 2017 University of the Witwatersrand, Johannesburg, South Africa
# Author: Dr Trevor G. Bell, [email protected]
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
# This file is part of the pipeline described in the following publication:
# Bell, T. G., Yousif, M. and Kramvis, A. 2016
# Bioinformatic curation and alignment of genotyped hepatitis B virus (HBV)
# sequence data from the GenBank public database.
# SpringerPlus 5: 1896.
# https://springerplus.springeropen.com/articles/10.1186/s40064-016-3312-0
#!/usr/bin/python
import climb
import sys
import re
S = climb.Sequence()
S.load(sys.argv[1])
threshold = int(sys.argv[2])
length = int(sys.argv[3])
cons = ''
for i in S.baseDistribution('1-%i' % (S.seqLength()[0][1])):
highNum = 0
cc = 0
for j in i:
if (j != climb.GAP) and (i[j] > highNum):
highNum = i[j]
highBase = j
cons += highBase
print 'ID,Mismatches,Bases,Percentage'
mismatchList = []
overLength = 0
for i in S.seq:
pos = 0
bases = 0
mismatches = 0
for j in i['seq']:
if j != climb.GAP:
bases += 1
if pos <= len(cons)-1: # some sequences are longer than the consensus
if j != cons[pos]:
mismatches += 1
pos += 1
if mismatches/float(bases)*100 > threshold:
mismatchList.append(i['id'])
f = open('_%s.fasta' % (i['id'][:8]), 'w')
# f.write('>Cons\n') # to include consensus with query in the file (for muscle)
# f.write('%s\n' % cons)
f.write('>%s\n' % (i['id'][:8]))
f.write('%s\n' % (i['seq']))
f.close()
print '%s,%04i,%04i,%3.2f' % (i['id'][:8], mismatches, bases, mismatches/float(bases)*100)
f = open('%s_cons.fasta' % (sys.argv[1][0]), 'w') # first character is the genotype
f.write('>Cons\n')
f.write('%s\n' % cons)
f.close()
f = open('%s_log.txt' % (sys.argv[1][0]), 'a') # first character is the genotype
f.write('splitcons:\n')
f.write('Threshold: %i\n' % threshold)
f.write('\tMismatch List:\n')
for i in mismatchList:
f.write('\t%s\n' % i)
f.write('Number excluded: %i\n' % len(mismatchList))
f.write('of which overlengths: %i\n' % overLength)
f.write('Sequence count before: %i\n' % S.seqCount())
S.seqRemoveByID(mismatchList, escape=True)
f.write('Sequence count after: %i\n' % S.seqCount())
f.close()
S.save(S.fileName, overWrite=True)
S.unload()
# Ends