13
13
# See the License for the specific language governing permissions and
14
14
# limitations under the License.
15
15
16
+ from ast import Num
16
17
import os .path
17
18
import glob
18
19
import shutil
19
20
20
21
from math import inf
22
+ from multiprocessing import Pool
21
23
from argparse import ArgumentParser
22
24
23
25
from Bio import SeqIO , SeqRecord
@@ -83,21 +85,34 @@ def process_command(small_rna, adapter, front, anywhere, cutoff, quiet):
83
85
84
86
do_log (quiet , '==> Completed command Process' )
85
87
86
- def sort_command (genome , small_rna , cds , min_length , max_length , quiet ):
88
+ def sort_command (genome , small_rna , cds , min_length , max_length , num_mismatches , disable_align , threads , quiet ):
87
89
'''
88
90
Code to run when the user chooses the sort command
89
91
'''
90
92
91
- if not validate_file (genome , 'fasta' ):
93
+ if not disable_align and not validate_file (genome , 'fasta' ):
92
94
print (f'Error: expected a genome in FASTA format, got { genome } ' )
95
+
96
+ # Add note if genome positional argument has been missed
97
+ if genome is None :
98
+ print ('It looks like you missed the Genome FASTA argument, try adding one to the end of your command' )
99
+
93
100
return False
94
101
95
- if not validate_file (small_rna , 'fastq' ):
96
- print (f'Error: expected a small RNA FASTQ with at least one sequence, got { small_rna } ' )
102
+ if validate_file (small_rna , 'fastq' ):
103
+ small_rna_filetype = 'fastq'
104
+ elif validate_file (small_rna , 'fasta' ):
105
+ small_rna_filetype = 'fasta'
106
+ else :
107
+ print (f'Error: expected a small RNA FASTQ or FASTA with at least one sequence, got { small_rna } ' )
97
108
return False
98
109
99
110
do_log (quiet , '==> Starting command Sort' )
100
- new_fastq = align_to_genome (genome , small_rna , cds , quiet = quiet )
111
+ if not disable_align :
112
+ new_fastq = align_to_genome (genome , small_rna , cds , threads = threads , small_rna_filetype = small_rna_filetype , mismatches = num_mismatches , quiet = quiet )
113
+ else :
114
+ new_fastq = small_rna
115
+
101
116
table_file = bin_rna_size (new_fastq , min_length , max_length , quiet = quiet )
102
117
103
118
graph_length (table_file )
@@ -106,7 +121,7 @@ def sort_command(genome, small_rna, cds, min_length, max_length, quiet):
106
121
107
122
def extractnc_command (genome , gff , quiet ):
108
123
'''
109
- Code to run when the user chooses to extract the noncoding mRNA reigon
124
+ Code to run when the user chooses to extract the noncoding mRNA region
110
125
'''
111
126
112
127
if not validate_file (genome , 'fasta' ):
@@ -127,7 +142,23 @@ def extractnc_command(genome, gff, quiet):
127
142
128
143
do_log (quiet , '==> Completed command extractNC' )
129
144
130
- def unitas_command (small_rna_path , species_name , ref_seqs , cds , unspliced_transcriptome , quiet ):
145
+ # Parallelism bits for unitas
146
+ # initialize worker processes
147
def init_worker(a, b, c, d):
    '''
    Initializer for the unitas worker pool

    Copies the four positional arguments into module-level globals of the
    current process so that unitas_threads can read them later without
    receiving them as parameters. unitas_command hands this function to
    Pool as its initializer, with
    initargs = (species_name, ref_seqs, quiet, UNITAS_OUTPUT).

    a -> species_name
    b -> ref_seqs
    c -> quiet
    d -> UNITAS_OUTPUT
    '''
    # These names must be rebound at module scope, not as locals
    global species_name, ref_seqs, quiet, UNITAS_OUTPUT

    # Single unpacking assignment, positions match the order of initargs
    species_name, ref_seqs, quiet, UNITAS_OUTPUT = a, b, c, d
155
+
156
+
157
+ # easiest way to implement this quickly
158
def unitas_threads(small_rna):
    '''
    Run the unitas annotation for one small RNA file

    Single-argument wrapper so it can be mapped over a list of files by
    the worker pool in unitas_command. The remaining settings
    (species_name, ref_seqs, quiet, UNITAS_OUTPUT) are read from the
    module-level globals that init_worker sets.
    '''
    # Keyword options forwarded unchanged to the annotation step
    annotation_options = {'quiet': quiet, 'unitas_output': UNITAS_OUTPUT}
    run_unitas_annotation(small_rna, species_name, ref_seqs, **annotation_options)
160
+
161
+ def unitas_command (small_rna_path , species_name , ref_seqs , cds , unspliced_transcriptome , threads , quiet ):
131
162
'''
132
163
Code to run when the user chooses the unitas command
133
164
'''
@@ -159,14 +190,16 @@ def unitas_command(small_rna_path, species_name, ref_seqs, cds, unspliced_transc
159
190
160
191
mkdir_if_not_exists (UNITAS_OUTPUT )
161
192
162
- for small_rna in glob .glob (os .path .join (small_rna_path , '*.fastq' )):
163
- run_unitas_annotation (small_rna , species_name , ref_seqs , quiet = quiet , unitas_output = UNITAS_OUTPUT )
193
+ small_rna_list = glob .glob (os .path .join (small_rna_path , '*.fastq' ))
194
+
195
+ with Pool (threads , initializer = init_worker , initargs = (species_name , ref_seqs , quiet , UNITAS_OUTPUT ,)) as p :
196
+ p .map (unitas_threads , small_rna_list )
164
197
165
198
table_path = merge_summary ()
166
199
graph_unitas_classification_type (table_path )
167
200
do_log (quiet , '==> Completed command Unitas' )
168
201
169
- def targetid_command (small_rna , targets , min_seq_length , mismatches_allowed , quiet ):
202
+ def targetid_command (small_rna , targets , min_seq_length , mismatches_allowed , threads , quiet ):
170
203
'''
171
204
Code to run when the user chooses the targetid command
172
205
'''
@@ -185,7 +218,7 @@ def targetid_command(small_rna, targets, min_seq_length, mismatches_allowed, qui
185
218
print ('Error: You need to supply at least one target file with -t' )
186
219
187
220
revcomp_file = revcomp_input_file (small_rna , quiet = quiet )
188
- sam_files = find_targets (revcomp_file , targets , min_seq_length = min_seq_length , mismatches_allowed = mismatches_allowed , quiet = quiet )
221
+ sam_files = find_targets (revcomp_file , targets , threads = threads , min_seq_length = min_seq_length , mismatches_allowed = mismatches_allowed , quiet = quiet )
189
222
build_summary_files (sam_files , quiet = quiet )
190
223
191
224
do_log (quiet , '==> Ending TargetID command' )
@@ -205,15 +238,17 @@ def main():
205
238
parser_process .add_argument ('small_rna' , help = 'Path to FASTQ containing the small RNA' )
206
239
207
240
parser_sort = subparsers .add_parser ('sort' , help = 'Find RNAs that align to a genome and sort them by length' )
208
- parser_sort .add_argument ('-d' , '--cds' , help = 'Optional CDS region, also align this to the CDS reigon as well as the genome' )
241
+ parser_sort .add_argument ('-d' , '--cds' , help = 'Optional CDS region, also align this to the CDS region as well as the genome' )
209
242
parser_sort .add_argument ('-l' , '--min-length' , help = 'Minimum length to bin' , type = int , default = - inf )
210
243
parser_sort .add_argument ('-x' , '--max-length' , help = 'Maximum length to bin' , type = int , default = inf )
244
+ parser_sort .add_argument ('-m' , '--ref-mismatches' , type = int , default = None , help = 'Number of mismatches to use in bowtie2, None for default behaviour' )
245
+ parser_sort .add_argument ('--disable-alignment' , action = 'store_true' , help = 'Skip the alignment to the reference genome step' )
211
246
parser_sort .add_argument ('small_rna' , help = 'Path to FASTQ containing the small RNA' )
212
- parser_sort .add_argument ('genome' , help = 'Genome to align against' )
247
+ parser_sort .add_argument ('genome' , nargs = '?' , default = None , help = 'Genome to align against' )
213
248
214
- parser_extractnc = subparsers .add_parser ('extractnc' , help = 'Extarct the noncoding reigon from a fasta with a GFF file' )
249
+ parser_extractnc = subparsers .add_parser ('extractnc' , help = 'Extarct the noncoding region from a fasta with a GFF file' )
215
250
parser_extractnc .add_argument ('genome' , help = 'FASTA containing the genome to extract from' )
216
- parser_extractnc .add_argument ('gff_file' , help = 'GFF file containing annotations of CDS and mRNA reigons ' )
251
+ parser_extractnc .add_argument ('gff_file' , help = 'GFF file containing annotations of CDS and mRNA regions ' )
217
252
218
253
parser_unitas = subparsers .add_parser ('unitas' , help = 'Run unitas on split files and merge results' )
219
254
parser_unitas .add_argument ('-d' , '--cds' , help = 'Optional CDS region, passed to unitas' )
@@ -230,7 +265,7 @@ def main():
230
265
231
266
parser_all = subparsers .add_parser ('all' , help = 'Run process, sort and unitas one after the other' )
232
267
parser_all .add_argument ('-a' , '--adapter' , help = 'Sequence of the adapter to remove from the 3\' end' )
233
- parser_all .add_argument ('-d' , '--cds' , help = 'Optional CDS region, also align this to the CDS reigon as well as the genome' )
268
+ parser_all .add_argument ('-d' , '--cds' , help = 'Optional CDS region, also align this to the CDS region as well as the genome' )
234
269
parser_all .add_argument ('-g' , '--front' , help = 'Sequence of the adapter to remove from the 5\' end' )
235
270
parser_all .add_argument ('-b' , '--anywhere' , help = 'Sequence of the adapters to remove from both ends' )
236
271
parser_all .add_argument ('-c' , '--cutoff' , help = 'Quality cutoff to trin RNA sequences at' , default = 20 , type = int )
@@ -239,8 +274,10 @@ def main():
239
274
parser_all .add_argument ('-r' , '--refseq' , help = 'References for use with unitas' , nargs = '*' , default = None )
240
275
parser_all .add_argument ('-s' , '--species' , help = 'Species to set in unitas arguments' , default = 'x' )
241
276
parser_all .add_argument ('-u' , '--unspliced-transcriptome' , help = 'Optional, unspliced transcriptome, passed to unitas' )
277
+ parser_all .add_argument ('-m' , '--ref-mismatches' , type = int , default = None , help = 'Number of mismatches to use in bowtie2 when aligning to the genome, None for default behaviour' )
278
+ parser_all .add_argument ('--disable-alignment' , action = 'store_true' , help = 'Skip the alignment to the reference genome step' )
242
279
parser_all .add_argument ('small_rna' , help = 'Path to FASTQ containing the small RNA' )
243
- parser_all .add_argument ('genome' , help = 'Genome to align against' )
280
+ parser_all .add_argument ('genome' , nargs = '?' , default = None , help = 'Genome to align against' )
244
281
245
282
args = parser .parse_args ()
246
283
@@ -262,6 +299,7 @@ def get_command_args(name):
262
299
return None
263
300
264
301
mkdir_if_not_exists (get_config_key ('general' , 'output_directory' ))
302
+ num_threads = get_config_key ('general' , 'threads' )
265
303
266
304
if args .command == 'process' :
267
305
process_command (
@@ -280,6 +318,9 @@ def get_command_args(name):
280
318
get_command_args ('cds' ),
281
319
get_command_args ('min_length' ),
282
320
get_command_args ('max_length' ),
321
+ get_command_args ('ref_mismatches' ),
322
+ get_command_args ('disable_alignment' ),
323
+ num_threads ,
283
324
get_command_args ('quiet' )
284
325
)
285
326
@@ -297,6 +338,7 @@ def get_command_args(name):
297
338
get_command_args ('refseq' ),
298
339
get_command_args ('cds' ),
299
340
get_command_args ('unspliced_transcriptome' ),
341
+ num_threads ,
300
342
get_command_args ('quiet' )
301
343
)
302
344
@@ -306,6 +348,7 @@ def get_command_args(name):
306
348
get_command_args ('target_files' ),
307
349
get_command_args ('min_seq_length' ),
308
350
get_command_args ('num_mismatches' ),
351
+ num_threads ,
309
352
get_command_args ('quiet' )
310
353
)
311
354
@@ -328,16 +371,22 @@ def get_command_args(name):
328
371
get_command_args ('cds' ),
329
372
get_command_args ('min_length' ),
330
373
get_command_args ('max_length' ),
374
+ get_command_args ('ref_mismatches' ),
375
+ get_command_args ('disable_alignment' ),
376
+ num_threads ,
331
377
get_command_args ('quiet' )
332
378
)
333
379
334
380
if out_code is not None :
335
381
return
336
-
382
+
337
383
unitas_command (
338
384
os .path .join (get_config_key ('general' , 'output_directory' ), 'binned_rna' ),
339
385
get_command_args ('species' ),
340
386
get_command_args ('refseq' ),
387
+ get_command_args ('cds' ),
388
+ get_command_args ('unspliced_transcriptome' ),
389
+ num_threads ,
341
390
get_command_args ('quiet' )
342
391
)
343
392
0 commit comments