add subsample_reads (#138)

antgonza · web-flow · commit 2b7832e6e0b4 · 2025-07-13T07:29:02.000-06:00
* add subsample_reads * support the 2 different seq counts * index_col * self.assay_type == 'Amplicon' * pack warnings * so -> _ * self.warnings -> self.assay_warnings * "w" * addressing @AmandaBirmingham comments * self.warnings -> self.assay_warnings * flake 8
diff --git a/src/qp_klp/Assays.py b/src/qp_klp/Assays.py
@@ -35,6 +35,7 @@ class Assay():
     w/'_'.
     """
     assay_type = ASSAY_NAME_NONE
+    assay_warnings = []
 
     @classmethod
     def _replace_tube_ids_w_sample_names(cls, prep_file_path, tube_id_map):
@@ -166,6 +167,7 @@ def execute_pipeline(self):
             self.convert_raw_to_fastq()
             self.integrate_results()
             self.generate_sequence_counts()
+            self.subsample_reads()
 
         self.update_status("QC-ing reads", 2, 9)
         if "NuQCJob" not in self.skip_steps:
@@ -260,6 +262,13 @@ def execute_pipeline(self):
         self.update_status("Generating packaging commands", 8, 9)
         self.generate_commands()
 
+        # store the warnings, if any exist, so they are packed with the
+        # final results
+        if self.assay_warnings:
+            wfp = f'{self.pipeline.output_path}/final_results/WARNINGS.txt'
+            with open(wfp, 'w') as f:
+                f.write('\n'.join(self.assay_warnings))
+
         self.update_status("Packaging results", 9, 9)
         if self.update:
             self.execute_commands()
diff --git a/src/qp_klp/Protocol.py b/src/qp_klp/Protocol.py
@@ -4,11 +4,14 @@
 from sequence_processing_pipeline.TRIntegrateJob import TRIntegrateJob
 from sequence_processing_pipeline.PipelineError import PipelineError
 from sequence_processing_pipeline.util import determine_orientation
-from os.path import join, split
+from os.path import join, split, basename, dirname
 from re import match
 from os import makedirs, rename, walk
 from metapool import load_sample_sheet
 from metapool.sample_sheet import PROTOCOL_NAME_ILLUMINA, PROTOCOL_NAME_TELLSEQ
+import pandas as pd
+from glob import glob
+from qiita_client.util import system_call
 
 
 PROTOCOL_NAME_NONE = "None"
@@ -22,6 +25,56 @@ class Protocol():
      initialization.
     """
     protocol_type = PROTOCOL_NAME_NONE
+    # this value was selected by looking at all the successful NuQC/SPP jobs,
+    # the max sequences were: 712,497,596
+    MAX_READS = 720000000
+
+    def subsample_reads(self):
+        if self.assay_type == 'Amplicon':
+            return
+
+        df = pd.read_csv(self.reports_path)
+        if 'raw_reads_r1r2' in df.columns:
+            # this is a TellSeq run: SeqCounts.csv
+            read_col = 'raw_reads_r1r2'
+            index_col = 'Sample_ID'
+        elif '# Reads' in df.columns:
+            # this is an Illumina run: Demultiplex_Stats.csv
+            read_col = '# Reads'
+            index_col = 'SampleID'
+        else:
+            raise ValueError(
+                'Not sure how to check for seq counts to subsample, '
+                'please let an admin know.')
+        # df will keep only the rows/samples with more than self.MAX_READS
+        df = df[df[read_col] > self.MAX_READS]
+        if df.shape[0]:
+            for _, row in df.iterrows():
+                sn = row[index_col]
+                # look for any files (fwd/rev pairs) that have the sample_name
+                # as the prefix of their filename
+                files = glob(f'{self.raw_fastq_files_path}/*/{sn}*.fastq.gz')
+                # for each file let's get its folder (dn) and filename (bn),
+                # then create a full path with dn and bn where we are
+                # changing the filename from fastq.gz to full.gz; then
+                # subsample this full.gz to a new file with the original
+                # fastq.gz name via seqtk
+                for f in files:
+                    dn = dirname(f)
+                    bn = basename(f)
+                    nbn = join(dn, bn.replace('fastq.gz', 'full.gz'))
+                    cmd = f'mv {f} {nbn}'
+                    _, se, rv = system_call(cmd)
+                    if rv != 0 or se:
+                        raise ValueError(f'Error during mv: {cmd}. {se}')
+                    cmd = (f'seqtk sample -s 42 {nbn} {self.MAX_READS} '
+                           f'| gzip > {f}')
+                    _, se, rv = system_call(cmd)
+                    if rv != 0 or se:
+                        raise ValueError(f'Error during seqtk: {cmd}. {se}')
+                    self.assay_warnings.append(
+                        f'{sn} ({bn}) had {row[read_col]} sequences, '
+                        f'subsampling to {self.MAX_READS}')
 
 
 class Illumina(Protocol):