4
4
from sequence_processing_pipeline .NuQCJob import NuQCJob
5
5
from sequence_processing_pipeline .FastQCJob import FastQCJob
6
6
from sequence_processing_pipeline .GenPrepFileJob import GenPrepFileJob
7
+ from sequence_processing_pipeline .MultiQCJob import MultiQCJob
7
8
import pandas as pd
8
9
from json import dumps
9
10
from collections import defaultdict
11
+ import re
10
12
11
13
12
14
ASSAY_NAME_NONE = "Assay"
@@ -163,37 +165,6 @@ def post_process_raw_fastq_output(self):
163
165
projects = [x ['project_name' ] for x in projects ]
164
166
165
167
for project_name in projects :
166
- # copy the files from ConvertJob output to faked NuQCJob output
167
- # folder: $WKDIR/$RUN_ID/NuQCJob/$PROJ_NAME/amplicon
168
- output_folder = join (self .pipeline .output_path ,
169
- 'NuQCJob' ,
170
- project_name ,
171
- # for legacy purposes, output folders are
172
- # either 'trimmed_sequences', 'amplicon', or
173
- # 'filtered_sequences'. Hence, this folder
174
- # is not defined using AMPLICON_TYPE as that
175
- # value may or may not equal the needed value.
176
- 'amplicon' )
177
-
178
- makedirs (output_folder )
179
-
180
- # get list of all raw output files to be copied.
181
- job_output = [join (self .raw_fastq_files_path , x ) for x in
182
- listdir (self .raw_fastq_files_path )]
183
-
184
- job_output = [x for x in job_output if isfile (x )]
185
- job_output = [x for x in job_output if x .endswith ('fastq.gz' )]
186
-
187
- # NB: In this case, ensure the ONLY files that get copied are
188
- # Undetermined files, and this is what we expect for 16S runs.
189
- job_output = [x for x in job_output if
190
- basename (x ).startswith ('Undetermined' )]
191
-
192
- # copy the file
193
- for fastq_file in job_output :
194
- new_path = join (output_folder , basename (fastq_file ))
195
- copyfile (fastq_file , new_path )
196
-
197
168
# FastQC expects the ConvertJob output to also be organized by
198
169
# project. Since this would entail running the same ConvertJob
199
170
# multiple times on the same input with just a name-change in
@@ -212,28 +183,66 @@ def post_process_raw_fastq_output(self):
212
183
new_path = join (output_folder , basename (raw_fastq_file ))
213
184
copyfile (raw_fastq_file , new_path )
214
185
186
+ # copy the files from ConvertJob output to faked NuQCJob output
187
+ # folder: $WKDIR/$RUN_ID/NuQCJob/$PROJ_NAME/amplicon
188
+ output_folder = join (self .pipeline .output_path ,
189
+ 'NuQCJob' ,
190
+ project_name ,
191
+ # for legacy purposes, output folders are
192
+ # either 'trimmed_sequences', 'amplicon', or
193
+ # 'filtered_sequences'. Hence, this folder
194
+ # is not defined using AMPLICON_TYPE as that
195
+ # value may or may not equal the needed value.
196
+ 'amplicon' )
197
+ makedirs (output_folder )
198
+
199
+ # copy the file
200
+ for fastq_file in job_output :
201
+ new_path = join (output_folder , basename (fastq_file ))
202
+ copyfile (fastq_file , new_path )
203
+
215
204
def generate_reports (self ):
216
205
config = self .pipeline .get_software_configuration ('fastqc' )
217
- job = FastQCJob (self .pipeline .run_dir ,
218
- self .pipeline .output_path ,
219
- self .raw_fastq_files_path ,
220
- join (self .pipeline .output_path , 'NuQCJob' ),
221
- config ['nprocs' ],
222
- config ['nthreads' ],
223
- config ['fastqc_executable_path' ],
224
- config ['modules_to_load' ],
225
- self .master_qiita_job_id ,
226
- config ['queue' ],
227
- config ['nodes' ],
228
- config ['wallclock_time_in_minutes' ],
229
- config ['job_total_memory_limit' ],
230
- config ['job_pool_size' ],
231
- config ['multiqc_config_file_path' ],
232
- config ['job_max_array_length' ],
233
- True )
206
+ fcjob = FastQCJob (self .pipeline .run_dir ,
207
+ self .pipeline .output_path ,
208
+ self .raw_fastq_files_path ,
209
+ join (self .pipeline .output_path , 'NuQCJob' ),
210
+ config ['nprocs' ],
211
+ config ['nthreads' ],
212
+ config ['fastqc_executable_path' ],
213
+ config ['modules_to_load' ],
214
+ self .master_qiita_job_id ,
215
+ config ['queue' ],
216
+ config ['nodes' ],
217
+ config ['wallclock_time_in_minutes' ],
218
+ config ['job_total_memory_limit' ],
219
+ config ['job_pool_size' ],
220
+ config ['job_max_array_length' ],
221
+ True )
222
+ mqcjob = MultiQCJob (self .pipeline .run_dir ,
223
+ self .pipeline .output_path ,
224
+ self .raw_fastq_files_path ,
225
+ join (self .pipeline .output_path , 'NuQCJob' ),
226
+ config ['nprocs' ],
227
+ config ['nthreads' ],
228
+ config ['multiqc_executable_path' ],
229
+ config ['modules_to_load' ],
230
+ self .master_qiita_job_id ,
231
+ config ['queue' ],
232
+ config ['nodes' ],
233
+ config ['wallclock_time_in_minutes' ],
234
+ config ['job_total_memory_limit' ],
235
+ config ['job_pool_size' ],
236
+ join (self .pipeline .output_path , 'FastQCJob' ),
237
+ config ['job_max_array_length' ],
238
+ config ['multiqc_config_file_path' ],
239
+ True )
234
240
235
241
if 'FastQCJob' not in self .skip_steps :
236
- job .run (callback = self .job_callback )
242
+ fcjob .run (callback = self .job_callback )
243
+
244
+ if 'MultiQCJob' not in self .skip_steps :
245
+ mqcjob .run (callback = self .job_callback )
237
246
238
247
def generate_prep_file (self ):
239
248
config = self .pipeline .get_software_configuration ('seqpro' )
@@ -386,30 +395,49 @@ def quality_control(self):
386
395
387
396
def generate_reports (self ):
388
397
config = self .pipeline .get_software_configuration ('fastqc' )
389
- job = FastQCJob (self .pipeline .run_dir ,
390
- self .pipeline .output_path ,
391
- self .raw_fastq_files_path ,
392
- join (self .pipeline .output_path , 'NuQCJob' ),
393
- config ['nprocs' ],
394
- config ['nthreads' ],
395
- config ['fastqc_executable_path' ],
396
- config ['modules_to_load' ],
397
- self .master_qiita_job_id ,
398
- config ['queue' ],
399
- config ['nodes' ],
400
- config ['wallclock_time_in_minutes' ],
401
- config ['job_total_memory_limit' ],
402
- config ['job_pool_size' ],
403
- config ['multiqc_config_file_path' ],
404
- config ['job_max_array_length' ],
405
- False )
398
+ fqjob = FastQCJob (self .pipeline .run_dir ,
399
+ self .pipeline .output_path ,
400
+ self .raw_fastq_files_path ,
401
+ join (self .pipeline .output_path , 'NuQCJob' ),
402
+ config ['nprocs' ],
403
+ config ['nthreads' ],
404
+ config ['fastqc_executable_path' ],
405
+ config ['modules_to_load' ],
406
+ self .master_qiita_job_id ,
407
+ config ['queue' ],
408
+ config ['nodes' ],
409
+ config ['wallclock_time_in_minutes' ],
410
+ config ['job_total_memory_limit' ],
411
+ config ['job_pool_size' ],
412
+ config ['job_max_array_length' ],
413
+ False )
414
+ mqcjob = MultiQCJob (self .pipeline .run_dir ,
415
+ self .pipeline .output_path ,
416
+ self .raw_fastq_files_path ,
417
+ join (self .pipeline .output_path , 'NuQCJob' ),
418
+ config ['nprocs' ],
419
+ config ['nthreads' ],
420
+ config ['multiqc_executable_path' ],
421
+ config ['modules_to_load' ],
422
+ self .master_qiita_job_id ,
423
+ config ['queue' ],
424
+ config ['nodes' ],
425
+ config ['wallclock_time_in_minutes' ],
426
+ config ['job_total_memory_limit' ],
427
+ config ['job_pool_size' ],
428
+ join (self .pipeline .output_path , 'FastQCJob' ),
429
+ config ['job_max_array_length' ],
430
+ config ['multiqc_config_file_path' ],
431
+ False )
406
432
407
433
if 'FastQCJob' not in self .skip_steps :
408
- job .run (callback = self .job_callback )
434
+ fqjob .run (callback = self .job_callback )
435
+ if 'MultiQCJob' not in self .skip_steps :
436
+ mqcjob .run (callback = self .job_callback )
409
437
410
- failed_samples = job .audit (self .pipeline .get_sample_ids ())
438
+ failed_samples = fqjob .audit (self .pipeline .get_sample_ids ())
411
439
if hasattr (self , 'fsr' ):
412
- self .fsr .write (failed_samples , job .__class__ .__name__ )
440
+ self .fsr .write (failed_samples , fqjob .__class__ .__name__ )
413
441
return failed_samples
414
442
415
443
def generate_prep_file (self ):
@@ -534,12 +562,20 @@ def execute_pipeline(self):
534
562
prep_paths = []
535
563
self .prep_file_paths = {}
536
564
565
+ rematch = re .compile (
566
+ r"(?P<runid>[a-zA-Z0-9_-]+)\.(?P<qname>[a-zA-Z0-9_]+)"
567
+ r"(?P<qid>[0-9]{5,6})\..\.tsv" )
568
+
537
569
for root , dirs , files in walk (tmp ):
538
570
for _file in files :
539
571
# breakup the prep-info-file into segments
540
572
# (run-id, project_qid, other) and cleave
541
573
# the qiita-id from the project_name.
542
- qid = _file .split ('.' )[1 ].split ('_' )[- 1 ]
574
+ rer = rematch .match (_file )
575
+ if rer is None :
576
+ continue
577
+
578
+ _ , _ , qid = rer .groups ()
543
579
544
580
if qid not in self .prep_file_paths :
545
581
self .prep_file_paths [qid ] = []
0 commit comments