|
8 | 8 | import pandas as pd
|
9 | 9 | from json import dumps
|
10 | 10 | from collections import defaultdict
|
11 |
| -import re |
12 | 11 |
|
13 | 12 |
|
14 | 13 | ASSAY_NAME_NONE = "Assay"
|
@@ -144,13 +143,133 @@ def _generate_artifact_name(self, prep_file_path):
|
144 | 143 | # this is a normal pre-prep or sample-sheet.
|
145 | 144 | return (a_name, False)
|
146 | 145 |
|
| 146 | + def execute_pipeline(self): |
| 147 | + ''' |
| 148 | + Executes steps of pipeline in proper sequence. |
| 149 | + :return: None |
| 150 | + ''' |
| 151 | + if not self.is_restart: |
| 152 | + self.pre_check() |
| 153 | + |
| 154 | + # this is performed even in the event of a restart. |
| 155 | + self.generate_special_map() |
| 156 | + |
| 157 | + # even if a job is being skipped, it's being skipped because it was |
| 158 | + # determined that it already completed successfully. Hence, |
| 159 | + # increment the status because we are still iterating through them. |
| 160 | + |
| 161 | + self.update_status("Converting data", 1, 9) |
| 162 | + if "ConvertJob" not in self.skip_steps: |
| 163 | + # converting raw data to fastq depends heavily on the instrument |
| 164 | + # used to generate the run_directory. Hence this method is |
| 165 | + # supplied by the instrument mixin. |
| 166 | + self.convert_raw_to_fastq() |
| 167 | + self.integrate_results() |
| 168 | + self.generate_sequence_counts() |
| 169 | + |
| 170 | + self.update_status("QC-ing reads", 2, 9) |
| 171 | + if "NuQCJob" not in self.skip_steps: |
| 172 | + self.qc_reads() |
| 173 | + |
| 174 | + self.update_status("Generating reports", 3, 9) |
| 175 | + if "FastQCJob" not in self.skip_steps: |
| 176 | + # reports are currently implemented by the assay mixin. This is |
| 177 | + # only because metagenomic runs currently require a failed-samples |
| 178 | + # report to be generated. This is not done for amplicon runs since |
| 179 | + # demultiplexing occurs downstream of SPP. |
| 180 | + self.generate_reports() |
| 181 | + |
| 182 | + self.update_status("Generating preps", 4, 9) |
| 183 | + if "GenPrepFileJob" not in self.skip_steps: |
| 184 | + self.generate_prep_file() |
| 185 | + |
| 186 | + # moved final component of genprepfilejob outside of object. |
| 187 | + # obtain the paths to the prep-files generated by GenPrepFileJob |
| 188 | + # w/out having to recover full state. |
| 189 | + tmp = join(self.pipeline.output_path, 'GenPrepFileJob', 'PrepFiles') |
| 190 | + |
| 191 | + self.has_replicates = False |
| 192 | + |
| 193 | + prep_paths = [] |
| 194 | + self.prep_file_paths = {} |
| 195 | + |
| 196 | + for root, dirs, files in walk(tmp): |
| 197 | + for _file in files: |
| 198 | + # we are looing for .tsv files and we are only interested |
| 199 | + # in the string after the last _, which is the study_id |
| 200 | + if not _file.endswith('.tsv'): |
| 201 | + continue |
| 202 | + # continue if no underscore |
| 203 | + chunks = _file.rsplit('_', 1) |
| 204 | + if len(chunks) <= 1: |
| 205 | + continue |
| 206 | + # continue if no int after . |
| 207 | + qid = chunks[-1].split('.')[0] |
| 208 | + if not qid.isnumeric(): |
| 209 | + continue |
| 210 | + if qid not in self.prep_file_paths: |
| 211 | + self.prep_file_paths[qid] = [] |
| 212 | + |
| 213 | + _path = abspath(join(root, _file)) |
| 214 | + prep_paths.append(_path) |
| 215 | + self.prep_file_paths[qid].append(_path) |
| 216 | + |
| 217 | + for _dir in dirs: |
| 218 | + if _dir == '1': |
| 219 | + # if PrepFiles contains the '1' directory, then it's a |
| 220 | + # given that this sample-sheet contains replicates. |
| 221 | + self.has_replicates = True |
| 222 | + |
| 223 | + # currently imported from Assay although it is a base method. it |
| 224 | + # could be imported into Workflows potentially, since it is a post- |
| 225 | + # processing step. All pairings of assay and instrument type need to |
| 226 | + # generate prep-info files in the same format. |
| 227 | + self.overwrite_prep_files(prep_paths) |
| 228 | + |
| 229 | + # for now, simply re-run any line below as if it was a new job, even |
| 230 | + # for a restart. functionality is idempotent, except for the |
| 231 | + # registration of new preps in Qiita. These will simply be removed |
| 232 | + # manually. |
| 233 | + |
| 234 | + # post-processing steps are by default associated with the Workflow |
| 235 | + # class, since they deal with fastq files and Qiita, and don't depend |
| 236 | + # on assay or instrument type. |
| 237 | + self.update_status("Generating sample information", 5, 9) |
| 238 | + self.sifs = self.generate_sifs() |
| 239 | + |
| 240 | + # post-processing step. |
| 241 | + self.update_status("Registering blanks in Qiita", 6, 9) |
| 242 | + if self.update: |
| 243 | + self.update_blanks_in_qiita() |
| 244 | + |
| 245 | + self.update_status("Loading preps into Qiita", 7, 9) |
| 246 | + if self.update: |
| 247 | + self.update_prep_templates() |
| 248 | + |
| 249 | + # before we load preps into Qiita we need to copy the fastq |
| 250 | + # files n times for n preps and correct the file-paths each |
| 251 | + # prep is pointing to. |
| 252 | + self.load_preps_into_qiita() |
| 253 | + |
| 254 | + # before we pack the results, we need to generate the human-readable |
| 255 | + # report of samples lost in each step. The tracking is being done |
| 256 | + # within fsr (FailedSamplesRecord), in conjuction with Job.audit. |
| 257 | + self.fsr.generate_report() |
| 258 | + |
| 259 | + self.update_status("Generating packaging commands", 8, 9) |
| 260 | + self.generate_commands() |
| 261 | + |
| 262 | + self.update_status("Packaging results", 9, 9) |
| 263 | + if self.update: |
| 264 | + self.execute_commands() |
| 265 | + |
147 | 266 |
|
148 | 267 | class Amplicon(Assay):
|
149 | 268 | AMPLICON_TYPE = 'Amplicon'
|
150 | 269 | AMPLICON_SUB_TYPES = {'16S', '18S', 'ITS'}
|
151 | 270 | assay_type = ASSAY_NAME_AMPLICON
|
152 | 271 |
|
153 |
| - def post_process_raw_fastq_output(self): |
| 272 | + def qc_reads(self): |
154 | 273 | """
|
155 | 274 | Post-process ConvertJob output into correct form for FastQCJob.
|
156 | 275 | """
|
@@ -353,7 +472,7 @@ class MetaOmic(Assay):
|
353 | 472 | # MetaOmic does not have an assay_type of its own. It is defined by its
|
354 | 473 | # children.
|
355 | 474 |
|
356 |
| - def quality_control(self): |
| 475 | + def qc_reads(self): |
357 | 476 | # because this is a mixin, assume containing object will contain
|
358 | 477 | # a pipeline object.
|
359 | 478 | config = self.pipeline.get_software_configuration('nu-qc')
|
@@ -521,111 +640,6 @@ def load_preps_into_qiita(self):
|
521 | 640 |
|
522 | 641 | return df
|
523 | 642 |
|
524 |
| - def execute_pipeline(self): |
525 |
| - ''' |
526 |
| - Executes steps of pipeline in proper sequence. |
527 |
| - :return: None |
528 |
| - ''' |
529 |
| - self.pre_check() |
530 |
| - |
531 |
| - self.generate_special_map() |
532 |
| - |
533 |
| - self.update_status("Converting data", 1, 9) |
534 |
| - |
535 |
| - self.convert_raw_to_fastq() |
536 |
| - |
537 |
| - self.integrate_results() |
538 |
| - |
539 |
| - self.generate_sequence_counts() |
540 |
| - |
541 |
| - self.update_status("Performing quality control", 2, 9) |
542 |
| - self.quality_control() |
543 |
| - |
544 |
| - self.update_status("Generating reports", 3, 9) |
545 |
| - self.generate_reports() |
546 |
| - |
547 |
| - self.update_status("Generating preps", 4, 9) |
548 |
| - self.generate_prep_file() |
549 |
| - |
550 |
| - # moved final component of genprepfilejob outside of object. |
551 |
| - # obtain the paths to the prep-files generated by GenPrepFileJob |
552 |
| - # w/out having to recover full state. |
553 |
| - tmp = join(self.pipeline.output_path, 'GenPrepFileJob', 'PrepFiles') |
554 |
| - |
555 |
| - self.has_replicates = False |
556 |
| - |
557 |
| - prep_paths = [] |
558 |
| - self.prep_file_paths = {} |
559 |
| - |
560 |
| - rematch = re.compile( |
561 |
| - r"(?P<runid>[a-zA-Z0-9_-]+)\.(?P<qname>[a-zA-Z0-9_]+)" |
562 |
| - r"(?P<qid>[0-9]{5,6})\..\.tsv") |
563 |
| - |
564 |
| - for root, dirs, files in walk(tmp): |
565 |
| - for _file in files: |
566 |
| - # breakup the prep-info-file into segments |
567 |
| - # (run-id, project_qid, other) and cleave |
568 |
| - # the qiita-id from the project_name. |
569 |
| - rer = rematch.match(_file) |
570 |
| - if rer is None: |
571 |
| - continue |
572 |
| - |
573 |
| - _, _, qid = rer.groups() |
574 |
| - |
575 |
| - if qid not in self.prep_file_paths: |
576 |
| - self.prep_file_paths[qid] = [] |
577 |
| - |
578 |
| - _path = abspath(join(root, _file)) |
579 |
| - if _path.endswith('.tsv'): |
580 |
| - prep_paths.append(_path) |
581 |
| - self.prep_file_paths[qid].append(_path) |
582 |
| - |
583 |
| - for _dir in dirs: |
584 |
| - if _dir == '1': |
585 |
| - # if PrepFiles contains the '1' directory, then it's a |
586 |
| - # given that this sample-sheet contains replicates. |
587 |
| - self.has_replicates = True |
588 |
| - |
589 |
| - # currently imported from Assay although it is a base method. it |
590 |
| - # could be imported into Workflows potentially, since it is a post- |
591 |
| - # processing step. All pairings of assay and instrument type need to |
592 |
| - # generate prep-info files in the same format. |
593 |
| - self.overwrite_prep_files(prep_paths) |
594 |
| - |
595 |
| - # for now, simply re-run any line below as if it was a new job, even |
596 |
| - # for a restart. functionality is idempotent, except for the |
597 |
| - # registration of new preps in Qiita. These will simply be removed |
598 |
| - # manually. |
599 |
| - |
600 |
| - # post-processing steps are by default associated with the Workflow |
601 |
| - # class, since they deal with fastq files and Qiita, and don't depend |
602 |
| - # on assay or instrument type. |
603 |
| - self.update_status("Generating sample information", 5, 9) |
604 |
| - self.sifs = self.generate_sifs() |
605 |
| - |
606 |
| - # post-processing step. |
607 |
| - self.update_status("Registering blanks in Qiita", 6, 9) |
608 |
| - if self.update: |
609 |
| - self.update_blanks_in_qiita() |
610 |
| - |
611 |
| - self.update_status("Loading preps into Qiita", 7, 9) |
612 |
| - if self.update: |
613 |
| - self.update_prep_templates() |
614 |
| - |
615 |
| - # before we load preps into Qiita we need to copy the fastq |
616 |
| - # files n times for n preps and correct the file-paths each |
617 |
| - # prep is pointing to. |
618 |
| - self.load_preps_into_qiita() |
619 |
| - |
620 |
| - self.fsr.generate_report() |
621 |
| - |
622 |
| - self.update_status("Generating packaging commands", 8, 9) |
623 |
| - self.generate_commands() |
624 |
| - |
625 |
| - self.update_status("Packaging results", 9, 9) |
626 |
| - if self.update: |
627 |
| - self.execute_commands() |
628 |
| - |
629 | 643 |
|
630 | 644 | class Metagenomic(MetaOmic):
|
631 | 645 | METAGENOMIC_TYPE = 'Metagenomic'
|
|
0 commit comments