forked from chenmy33/IntronGetting
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRMexon.py
57 lines (45 loc) · 1.61 KB
/
RMexon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import sys
def RmExon(ref_bed, multialignment):
'''this function is used to remove exon matches from exon-intron-exon alignment
Note: the input of this function are listed below
1) BED file of the reference species
2) multialignemnt output of SATe'''
# Parse BED file to acquire the pos info
input1 = open(ref_bed, 'r')
dist_lst = []
for line in input1:
parsed_line = line.strip().strip('>').split('\t')
dist_lst.append(int(parsed_line[2]) - int(parsed_line[1]) + 1)
input1.close()
# print(dist_lst)
# Read multialignment
seqid_lst = []
seq_lst = []
tmp_str = ''
with open(multialignment, 'r') as input2:
for index, line in enumerate(input2):
if ">" in line:
seqid_lst.append(line.strip('\n'))
seq_lst.append(tmp_str)
tmp_str = ''
else:
tmp_str += line
seq_lst.pop(0)
seq_lst.append(tmp_str)
input2.close()
# for x in seq_lst:
# print('seq')
# print(x)
# print(len(seq_lst))
# Using reference species to conduct the analysis
# Wrtie the after-cut multialignment
# output = open('tmp_data/test_out.aln', 'w')
# for i in range(len(seqid_lst)):
# join_seq = seqid_lst[i] + '\n' + seq_lst[i][dist_lst[0]:-dist_lst[1]] + '\n'
# output.write(join_seq)
# output.close()
for i in range(len(seqid_lst)):
join_seq = seqid_lst[i] + '\n' + seq_lst[i][dist_lst[0]:-dist_lst[1]] + '\n'
print(join_seq)
if __name__ == "__main__":
RmExon(sys.argv[1], sys.argv[2])