test full runs in WorkflowFactory.py #131

Merged · 34 commits · May 6, 2025

Commits
c66c80e
inserting data to qiita for testing
antgonza Apr 22, 2025
3443a82
testing creation of study in qiit
antgonza Apr 22, 2025
1ea19c6
add sbatch
antgonza Apr 22, 2025
c505773
shopt
antgonza Apr 22, 2025
db3258d
echo .profile
antgonza Apr 22, 2025
23d428d
slurm
antgonza Apr 23, 2025
026da91
partition
antgonza Apr 23, 2025
e0bb95c
scontrol create partition
antgonza Apr 23, 2025
cff7adb
mv scontrol
antgonza Apr 23, 2025
7b71a72
partitionname
antgonza Apr 23, 2025
a886761
sudo scontrol
antgonza Apr 23, 2025
fb190ab
add some prints
antgonza Apr 23, 2025
e2d236b
/usr/bin/sbatch
antgonza Apr 23, 2025
980d35e
sudo
antgonza Apr 23, 2025
1ce4e3d
env
antgonza Apr 23, 2025
ea9b962
.local/bin/sbatch
antgonza Apr 23, 2025
e77048c
ls
antgonza Apr 23, 2025
2faf214
sbatch in conda
antgonza Apr 23, 2025
2af7794
squeue
antgonza Apr 23, 2025
43e0395
improve error display and running tests
antgonza Apr 24, 2025
6c32c70
sbatch
antgonza Apr 24, 2025
15a84b0
GITHUB_PATH
antgonza Apr 24, 2025
88e3b70
adding files to tests/bin
antgonza Apr 24, 2025
dc5b1d0
test_metagenomic_workflow_creation
antgonza Apr 30, 2025
3842dc6
adding _inject_data
antgonza Apr 30, 2025
68ecebc
merging main
antgonza May 1, 2025
5d89746
fixing some tests
antgonza May 5, 2025
9d34625
fixing other tests
antgonza May 5, 2025
b368754
copyfile -> touch()
antgonza May 5, 2025
62f3ab8
more copyfile -> touch()
antgonza May 5, 2025
c0d2820
fix test_tellseq_workflow_creation
antgonza May 5, 2025
3ee3e49
addressing @AmandaBirmingham comments
antgonza May 6, 2025
7f2b47a
flake8
antgonza May 6, 2025
110aeed
addressing @AmandaBirmingham note about comments
antgonza May 6, 2025
230 changes: 122 additions & 108 deletions src/qp_klp/Assays.py
@@ -8,7 +8,6 @@
import pandas as pd
from json import dumps
from collections import defaultdict
import re


ASSAY_NAME_NONE = "Assay"
@@ -144,13 +143,133 @@ def _generate_artifact_name(self, prep_file_path):
# this is a normal pre-prep or sample-sheet.
return (a_name, False)

def execute_pipeline(self):
'''
Executes steps of pipeline in proper sequence.
:return: None
'''
if not self.is_restart:
self.pre_check()

# this is performed even in the event of a restart.
self.generate_special_map()

# even if a job is being skipped, it's being skipped because it was
# determined that it already completed successfully. Hence,
# increment the status because we are still iterating through them.

self.update_status("Converting data", 1, 9)
if "ConvertJob" not in self.skip_steps:
# converting raw data to fastq depends heavily on the instrument
# used to generate the run_directory. Hence this method is
# supplied by the instrument mixin.
self.convert_raw_to_fastq()
self.integrate_results()
self.generate_sequence_counts()

self.update_status("QC-ing reads", 2, 9)
if "NuQCJob" not in self.skip_steps:
self.qc_reads()

self.update_status("Generating reports", 3, 9)
if "FastQCJob" not in self.skip_steps:
# reports are currently implemented by the assay mixin. This is
# only because metagenomic runs currently require a failed-samples
# report to be generated. This is not done for amplicon runs since
# demultiplexing occurs downstream of SPP.
self.generate_reports()

self.update_status("Generating preps", 4, 9)
if "GenPrepFileJob" not in self.skip_steps:
self.generate_prep_file()

# moved final component of genprepfilejob outside of object.
# obtain the paths to the prep-files generated by GenPrepFileJob
# w/out having to recover full state.
tmp = join(self.pipeline.output_path, 'GenPrepFileJob', 'PrepFiles')

self.has_replicates = False

prep_paths = []
self.prep_file_paths = {}

for root, dirs, files in walk(tmp):
for _file in files:
# we are looking for .tsv files, and we are only interested
# in the string after the last '_', which is the study_id
if not _file.endswith('.tsv'):
continue
# continue if no underscore
chunks = _file.rsplit('_', 1)
if len(chunks) <= 1:
continue
# skip if the segment before the first '.' is not an integer
qid = chunks[-1].split('.')[0]
if not qid.isnumeric():
continue
if qid not in self.prep_file_paths:
self.prep_file_paths[qid] = []

_path = abspath(join(root, _file))
prep_paths.append(_path)
self.prep_file_paths[qid].append(_path)

for _dir in dirs:
if _dir == '1':
# if PrepFiles contains the '1' directory, then it's a
# given that this sample-sheet contains replicates.
self.has_replicates = True

# currently imported from Assay although it is a base method. it
# could be imported into Workflows potentially, since it is a post-
# processing step. All pairings of assay and instrument type need to
# generate prep-info files in the same format.
self.overwrite_prep_files(prep_paths)

# for now, simply re-run any line below as if it was a new job, even
# for a restart. functionality is idempotent, except for the
# registration of new preps in Qiita. These will simply be removed
# manually.

# post-processing steps are by default associated with the Workflow
# class, since they deal with fastq files and Qiita, and don't depend
# on assay or instrument type.
self.update_status("Generating sample information", 5, 9)
self.sifs = self.generate_sifs()

# post-processing step.
self.update_status("Registering blanks in Qiita", 6, 9)
if self.update:
self.update_blanks_in_qiita()

self.update_status("Loading preps into Qiita", 7, 9)
if self.update:
self.update_prep_templates()

# before we load preps into Qiita we need to copy the fastq
# files n times for n preps and correct the file-paths each
# prep is pointing to.
self.load_preps_into_qiita()

# before we pack the results, we need to generate the human-readable
# report of samples lost in each step. The tracking is being done
# within fsr (FailedSamplesRecord), in conjunction with Job.audit.
self.fsr.generate_report()
Contributor
Could we toss this line a comment, too? I see that it was in MetaOmic.execute_pipeline but not in StandardAmpliconWorkflow.execute_pipeline, so now I'm curious about why ... :D. I also notice that at one point in MetaOmic the code checks if hasattr(self, 'fsr'): before calling self.fsr.<something>() ... do we need to worry that not every assay type or every individual assay instance will have an fsr property?

Member Author

fsr is an instance of FailedSamplesRecord, which is its own object and, as far as I can tell, is used with Job.audit to keep track of the samples lost at each step of the pipeline. Here we call FailedSamplesRecord.generate_report to output the report so it's moved to the final output and the user can access it. Adding this info to the code.
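
For readers following the thread, a minimal sketch of the pattern under discussion; the class shape and signatures below are illustrative, not the actual sequence_processing_pipeline API:

class FailedSamplesRecord:
    # illustrative sketch only; the real class lives in
    # sequence_processing_pipeline and its signatures may differ
    def __init__(self, output_path):
        self.output_path = output_path
        self.failed = {}  # sample-id -> first job in which it failed

    def update(self, failed_ids, job_name):
        # fed by Job.audit after each step; keep only the first failure
        for sample_id in failed_ids:
            self.failed.setdefault(sample_id, job_name)

    def generate_report(self):
        # write a human-readable summary into the final output directory
        with open(f"{self.output_path}/failed_samples.tsv", "w") as fh:
            for sample_id, job_name in self.failed.items():
                fh.write(f"{sample_id}\tfailed in {job_name}\n")

def finish(workflow):
    # the defensive guard the reviewer mentions: amplicon workflows do
    # not create an 'fsr' attribute, so the report step is skipped there
    if hasattr(workflow, 'fsr'):
        workflow.fsr.generate_report()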


self.update_status("Generating packaging commands", 8, 9)
self.generate_commands()

self.update_status("Packaging results", 9, 9)
if self.update:
self.execute_commands()


class Amplicon(Assay):
AMPLICON_TYPE = 'Amplicon'
AMPLICON_SUB_TYPES = {'16S', '18S', 'ITS'}
assay_type = ASSAY_NAME_AMPLICON

def post_process_raw_fastq_output(self):
def qc_reads(self):
"""
Post-process ConvertJob output into correct form for FastQCJob.
"""
@@ -353,7 +472,7 @@ class MetaOmic(Assay):
# MetaOmic does not have an assay_type of its own. It is defined by its
# children.

def quality_control(self):
def qc_reads(self):
# because this is a mixin, assume containing object will contain
# a pipeline object.
config = self.pipeline.get_software_configuration('nu-qc')
@@ -521,111 +640,6 @@ def load_preps_into_qiita(self):

return df

def execute_pipeline(self):
'''
Executes steps of pipeline in proper sequence.
:return: None
'''
self.pre_check()

self.generate_special_map()

self.update_status("Converting data", 1, 9)

self.convert_raw_to_fastq()

self.integrate_results()

self.generate_sequence_counts()

self.update_status("Performing quality control", 2, 9)
self.quality_control()

self.update_status("Generating reports", 3, 9)
self.generate_reports()

self.update_status("Generating preps", 4, 9)
self.generate_prep_file()

# moved final component of genprepfilejob outside of object.
# obtain the paths to the prep-files generated by GenPrepFileJob
# w/out having to recover full state.
tmp = join(self.pipeline.output_path, 'GenPrepFileJob', 'PrepFiles')

self.has_replicates = False

prep_paths = []
self.prep_file_paths = {}

rematch = re.compile(
r"(?P<runid>[a-zA-Z0-9_-]+)\.(?P<qname>[a-zA-Z0-9_]+)"
r"(?P<qid>[0-9]{5,6})\..\.tsv")

for root, dirs, files in walk(tmp):
for _file in files:
# breakup the prep-info-file into segments
# (run-id, project_qid, other) and cleave
# the qiita-id from the project_name.
rer = rematch.match(_file)
if rer is None:
continue

_, _, qid = rer.groups()

if qid not in self.prep_file_paths:
self.prep_file_paths[qid] = []

_path = abspath(join(root, _file))
if _path.endswith('.tsv'):
prep_paths.append(_path)
self.prep_file_paths[qid].append(_path)

for _dir in dirs:
if _dir == '1':
# if PrepFiles contains the '1' directory, then it's a
# given that this sample-sheet contains replicates.
self.has_replicates = True

# currently imported from Assay although it is a base method. it
# could be imported into Workflows potentially, since it is a post-
# processing step. All pairings of assay and instrument type need to
# generate prep-info files in the same format.
self.overwrite_prep_files(prep_paths)

# for now, simply re-run any line below as if it was a new job, even
# for a restart. functionality is idempotent, except for the
# registration of new preps in Qiita. These will simply be removed
# manually.

# post-processing steps are by default associated with the Workflow
# class, since they deal with fastq files and Qiita, and don't depend
# on assay or instrument type.
self.update_status("Generating sample information", 5, 9)
self.sifs = self.generate_sifs()

# post-processing step.
self.update_status("Registering blanks in Qiita", 6, 9)
if self.update:
self.update_blanks_in_qiita()

self.update_status("Loading preps into Qiita", 7, 9)
if self.update:
self.update_prep_templates()

# before we load preps into Qiita we need to copy the fastq
# files n times for n preps and correct the file-paths each
# prep is pointing to.
self.load_preps_into_qiita()

self.fsr.generate_report()

self.update_status("Generating packaging commands", 8, 9)
self.generate_commands()

self.update_status("Packaging results", 9, 9)
if self.update:
self.execute_commands()


class Metagenomic(MetaOmic):
METAGENOMIC_TYPE = 'Metagenomic'
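For context on the parsing change: the removed execute_pipeline code (above, and in StandardAmpliconWorkflow.py below) matched prep-file names with a regex, while the new version in Assays.py splits on the last underscore and checks that the token before the first '.' of the final chunk is numeric. A standalone sketch of both approaches, using a hypothetical filename:

import re

# old approach (removed): anchor a regex over the whole prep-file name
rematch = re.compile(
    r"(?P<runid>[a-zA-Z0-9_-]+)\.(?P<qname>[a-zA-Z0-9_]+)"
    r"(?P<qid>[0-9]{5,6})\..\.tsv")

# new approach: take the chunk after the last '_' and keep the file
# only if the text before its first '.' is numeric (the qiita id)
def extract_qid(filename):
    if not filename.endswith('.tsv'):
        return None
    chunks = filename.rsplit('_', 1)
    if len(chunks) <= 1:
        return None
    qid = chunks[-1].split('.')[0]
    return qid if qid.isnumeric() else None

# hypothetical prep-file name, for illustration only
name = '250421_A01535_0001_BH3FKJDSX7.MyProject_Analysis_12986.1.tsv'
assert rematch.match(name).group('qid') == '12986'
assert extract_qid(name) == '12986'

# the new parsing is looser: it also accepts names the regex rejected,
# e.g. qiita ids shorter than five digits
assert extract_qid('someproject_999.tsv') == '999'
assert rematch.match('someproject_999.tsv') is None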
122 changes: 0 additions & 122 deletions src/qp_klp/StandardAmpliconWorkflow.py
@@ -1,11 +1,8 @@
from .Protocol import Illumina
from os.path import join, abspath
from os import walk
from sequence_processing_pipeline.Pipeline import Pipeline
from .Assays import Amplicon
from .Assays import ASSAY_NAME_AMPLICON
from .Workflows import Workflow
import re


class StandardAmpliconWorkflow(Workflow, Amplicon, Illumina):
@@ -52,122 +49,3 @@ def __init__(self, **kwargs):
"type bool")

self.update = kwargs['update_qiita']

def execute_pipeline(self):
'''
Executes steps of pipeline in proper sequence.
:return: None
'''
if not self.is_restart:
self.pre_check()

# this is performed even in the event of a restart.
self.generate_special_map()

# even if a job is being skipped, it's being skipped because it was
# determined that it already completed successfully. Hence,
# increment the status because we are still iterating through them.

self.update_status("Converting data", 1, 9)
if "ConvertJob" not in self.skip_steps:
# converting raw data to fastq depends heavily on the instrument
# used to generate the run_directory. Hence this method is
# supplied by the instrument mixin.
self.convert_raw_to_fastq()

self.update_status("Post-processing raw fasq output", 2, 9)
if "NuQCJob" not in self.skip_steps:
# there is no failed samples reporting for amplicon runs.
self.post_process_raw_fastq_output()

self.update_status("Generating reports", 3, 9)
if "FastQCJob" not in self.skip_steps:
# reports are currently implemented by the assay mixin. This is
# only because metagenomic runs currently require a failed-samples
# report to be generated. This is not done for amplicon runs since
# demultiplexing occurs downstream of SPP.
self.generate_reports()

self.update_status("Generating preps", 4, 9)
if "GenPrepFileJob" not in self.skip_steps:
# preps are currently associated with array mixin, but only
# because there are currently some slight differences in how
# FastQCJob gets instantiated(). This could get moved into a
# shared method, but probably still in Assay.
self.generate_prep_file()

# moved final component of genprepfilejob outside of object.
# obtain the paths to the prep-files generated by GenPrepFileJob
# w/out having to recover full state.
tmp = join(self.pipeline.output_path, 'GenPrepFileJob', 'PrepFiles')

self.has_replicates = False

prep_paths = []
self.prep_file_paths = {}
rematch = re.compile(
r"(?P<runid>[a-zA-Z0-9_-]+)\.(?P<qname>[a-zA-Z0-9_]+)"
r"(?P<qid>[0-9]{5,6})\..\.tsv")

for root, dirs, files in walk(tmp):
for _file in files:
# breakup the prep-info-file into segments
# (run-id, project_qid, other) and cleave
# the qiita-id from the project_name.
rer = rematch.match(_file)
if rer is None:
continue

_, _, qid = rer.groups()

if qid not in self.prep_file_paths:
self.prep_file_paths[qid] = []

_path = abspath(join(root, _file))
if _path.endswith('.tsv'):
prep_paths.append(_path)
self.prep_file_paths[qid].append(_path)

for _dir in dirs:
if _dir == '1':
# if PrepFiles contains the '1' directory, then it's a
# given that this sample-sheet contains replicates.
self.has_replicates = True

# currently imported from Assay although it is a base method. it
# could be imported into Workflows potentially, since it is a post-
# processing step. All pairings of assay and instrument type need to
# generate prep-info files in the same format.
self.overwrite_prep_files(prep_paths)

# for now, simply re-run any line below as if it was a new job, even
# for a restart. functionality is idempotent, except for the
# registration of new preps in Qiita. These will simply be removed
# manually.

# post-processing steps are by default associated with the Workflow
# class, since they deal with fastq files and Qiita, and don't depend
# on assay or instrument type.
self.update_status("Generating sample information", 5, 9)
self.sifs = self.generate_sifs()

# post-processing step.
self.update_status("Registering blanks in Qiita", 6, 9)
if self.update:
self.update_blanks_in_qiita()

self.update_status("Loading preps into Qiita", 7, 9)
if self.update:
self.update_prep_templates()

# before we load preps into Qiita we need to copy the fastq
# files n times for n preps and correct the file-paths each
# prep is pointing to.
self.load_preps_into_qiita()

self.update_status("Generating packaging commands", 8, 9)
self.generate_commands()

self.update_status("Packaging results", 9, 9)
if self.update:
self.execute_commands()
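
To make the restart behavior concrete, a minimal sketch, assuming a simplified runner, of the skip_steps pattern used in the new execute_pipeline: jobs that already completed are skipped on a restart, but the status counter still advances so progress reporting stays consistent.

class PipelineRunner:
    # illustrative stand-in for the Workflow/Assay classes above
    def __init__(self, skip_steps=None):
        # on a restart, skip_steps holds jobs that already completed
        self.skip_steps = set(skip_steps or [])

    def update_status(self, msg, step, total):
        print(f"[{step}/{total}] {msg}")

    def run(self, steps):
        total = len(steps)
        for i, (job_name, action) in enumerate(steps, start=1):
            # increment the status even when skipping, because we are
            # still iterating through the steps in order
            self.update_status(f"Running {job_name}", i, total)
            if job_name in self.skip_steps:
                continue
            action()

# a restart where raw-to-fastq conversion already succeeded
runner = PipelineRunner(skip_steps=['ConvertJob'])
runner.run([
    ('ConvertJob', lambda: print('convert raw to fastq')),
    ('NuQCJob', lambda: print('qc reads')),
    ('FastQCJob', lambda: print('generate reports')),
    ('GenPrepFileJob', lambda: print('generate prep files')),
])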