-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathThreadHomoloCAP3.py
executable file
·185 lines (169 loc) · 6.87 KB
/
ThreadHomoloCAP3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#!/usr/bin/python3
from sys import argv, stdout
from os.path import getsize, abspath
from os import makedirs, chdir, listdir, remove
from Bio import SeqIO
import subprocess
import threading
import time
if (argv[1] == "-h" or argv[1] == "--help"):
print("USAGE: PKG-name.py cluster-file.clstr CDS-file.cds num-threads percent-identity-cap3 MODE[end/int]")
exit(0)
file_homology = abspath(argv[1])
file_cds = abspath(argv[2])
THREADS = int(argv[3])
IDENT = int(argv[4])
MODE = argv[5]
prefix=file_homology.split("/")[-1].split(".clstr")[0]
print("Processing %s file" % file_homology)
cmd_cap = "/home/Tools/CAP3/cap3 %s -r 0 -p %s -o %s -h 30 -y %s -f %s -t 500 -s 251 -i 32 -j 42 -x o%s > %s.o%s.log"
lib_prot2cdsid = {}
file = open(file_homology, "r")
count = 0
set_cds_id = set()
set_tmp = set()
for line in file:
if ">Cluster" in line:
if len( set_tmp ) > 1:
lib_prot2cdsid[ "%s_Cluster%s" % (prefix, count) ] = set_tmp
lib_prot2cdsid[ "%s_Cluster%s" % (prefix, count) ] = list( lib_prot2cdsid[ "%s_Cluster%s" % (prefix, count) ] )
set_cds_id.update( set_tmp )
set_tmp = set()
count += 1
if "... " in line:
set_tmp.add( line.split(">")[1].split("... ")[0] )
file.close()
print("Complete processing %s file" % file_homology)
print("Amount clusters: %s" % len(lib_prot2cdsid) )
print("Processing %s file" % file_cds)
lib_id2seq = {}
seq_singlets = []
seq_contigs = []
for seq in SeqIO.parse(file_cds, "fasta"):
if seq.id in set_cds_id:
lib_id2seq[ seq.id ] = seq
else:
seq_singlets.append( seq )
print("Complete processing %s file" % file_cds)
print("Amount sequences: %s" % len(lib_id2seq) )
print("Amount singlets: %s" % len(seq_singlets) )
makedirs("workdir_thread")
chdir("workdir_thread")
set_assembled_id = set()
completed = 0
set_processing_prot = set()
LOCK = threading.RLock()
def ASSEMBLE(lib_prot2cdsid, lib_id2seq):
global seq_singlets, seq_contigs, set_assembled_id, set_processing_prot, completed
for prot in lib_prot2cdsid:
LOCK.acquire()
if prot in set_processing_prot:
LOCK.release()
continue
set_processing_prot.add( prot )
print("\r COMPLETED/ALL: %s/%s; Assembled: %s" % (completed, len(lib_prot2cdsid), len(seq_contigs) ), end='')
stdout.flush()
LOCK.release()
seq_homology_cds = []
list_lens = []
file_seq = "%s.cds" % prot
for cds_id in lib_prot2cdsid[ prot ]:
seq_homology_cds.append( lib_id2seq[ cds_id ] )
list_lens.append( len(lib_id2seq[ cds_id ]) )
SeqIO.write( seq_homology_cds, file_seq, "fasta")
list_lens.sort()
overlap = int(sum( list_lens )/ len( list_lens ))
if MODE == "int":
max_gap = int(overlap*0.2)
if max_gap > 500: max_gap = 500
max_clip = list_lens[0]
if max_clip > 120: max_clip = 120
min_overlap = int(list_lens[0]*0.5)
if min_overlap < 100: min_overlap = 100
else:
max_gap = 10
max_clip = 20
overlap = 500
min_overlap = 100
while overlap >= min_overlap:
if MODE == "int":
max_gap = int(overlap*0.2)
if max_gap > 500: max_gap = 500
max_clip = list_lens[0]
if max_clip > 120: max_clip = 120
PIPE=subprocess.PIPE
#cmd_cap = "/home/Tools/CAP3/cap3 %s -r 0 -p %s -o %s -h 30 -y %s -f %s -t 500 -s 300 -i 32 -j 42 -x o%s > %s.o%s.log"
process = subprocess.Popen(cmd_cap % (file_seq, IDENT, overlap, max_clip, max_gap, overlap, file_seq, overlap), shell=True, stdout=PIPE)
process.wait()
if getsize("%s.o%s.contigs" % (file_seq, overlap)) != 0:
count_i = 0
seq_homology_cds = []
for seq in SeqIO.parse("%s.o%s.contigs" % (file_seq, overlap), "fasta"):
seq.id = "Homolog_%s_i%s_length_%s" % (prot, count_i, len(seq) )
seq.name = ""
seq.description = ""
seq_homology_cds.append( seq )
count_i += 1
if getsize("%s.o%s.singlets" % (file_seq, overlap)) == 0:
LOCK.acquire()
seq_contigs.extend( seq_homology_cds )
set_assembled_id.update( set(lib_prot2cdsid[ prot ]) )
completed += 1
LOCK.release()
for filename in listdir():
if file_seq in filename:
remove(filename)
break
else:
for seq in SeqIO.parse("%s.o%s.singlets" % (file_seq, overlap), "fasta"):
if "Homolog" in seq.id:
seq.id = "Homolog_%s_i%s_length_%s" % (prot, count_i, len(seq) )
seq.name = ""
seq.description = ""
seq_homology_cds.append( seq )
count_i += 1
else:
seq_homology_cds.append( seq )
SeqIO.write( seq_homology_cds, file_seq, "fasta")
for filename in listdir():
if "%s." % file_seq in filename:
remove(filename)
if MODE == "int":
overlap -= 200
else:
overlap -= 100
else:
LOCK.acquire()
for seq in seq_homology_cds:
if "Homolog" in seq.id:
seq_contigs.append( seq )
else:
if seq.id in set_assembled_id:
continue
seq_singlets.append( seq )
lib_prot2cdsid[ prot ].remove( seq.id )
set_assembled_id.update( set(lib_prot2cdsid[ prot ]) )
completed += 1
LOCK.release()
for filename in listdir():
if file_seq in filename:
remove(filename)
for _ in range(THREADS):
thread_ = threading.Thread(target=ASSEMBLE, args=(lib_prot2cdsid, lib_id2seq))
thread_.start()
while threading.active_count() >1:
time.sleep(1)
print("\nAssembling homology CDS finished")
seq_singlets_finish = []
set_singlets_writed = set()
for seq in seq_singlets:
if seq.id in set_singlets_writed:
continue
if seq.id not in set_assembled_id:
seq_singlets_finish.append( seq )
set_singlets_writed.add( seq.id )
print("Count contigs = %s; count singlets = %s" % ( len(seq_contigs), len(seq_singlets_finish) ))
print("Writing singlets and contigs into files")
SeqIO.write(seq_singlets_finish, "%s.ThreadHomoloCAP3.singlets" % file_cds, "fasta")
SeqIO.write(seq_contigs, "%s.ThreadHomoloCAP3.contigs" % file_cds, "fasta")
print("Finish")