Cleared up some bugs, and POD5 files now have more fulsome names.

EdinburghGenomics · Feb 7, 2024 · 48b401d · 48b401d
1 parent 0971407
commit 48b401d
Show file tree

Hide file tree

Showing 8 changed files with 122 additions and 13 deletions.
diff --git a/Snakefile.rundata b/Snakefile.rundata
@@ -1,7 +1,7 @@
 # vim: ft=python
 
 from hesiod import ( load_final_summary, find_sequencing_summary, find_summary,
-                     dump_yaml, load_yaml )
+                     get_common_prefix, dump_yaml, load_yaml )
 
 # Rules to filter, compress and combine the original files.
 # These rules are designed to be included in Snakefile.main and will not run standalone.
@@ -66,12 +66,15 @@ def i_merge_md5sum_pod5_batch(wildcards):
 
 rule merge_md5sum_pod5_batch:
     output:
-        pod5 = "{cell}/pod5_{barcode}_{pfs}/batch{bsize}_{batch}.pod5",
-        md5  = "md5sums/{cell}/pod5_{barcode}_{pfs}/batch{bsize}_{batch}.pod5.md5"
+        pod5 = "{cell}/pod5_{barcode}_{pfs}/{cp}_batch{bsize}_{batch}.pod5",
+        md5  = "md5sums/{cell}/pod5_{barcode}_{pfs}/{cp}_batch{bsize}_{batch}.pod5.md5"
     input:
         i_merge_md5sum_pod5_batch
     params:
-        pod5_base = "pod5_{barcode}_{pfs}/batch{bsize}_{batch}.pod5"
+        pod5_base = "pod5_{barcode}_{pfs}/{cp}_batch{bsize}_{batch}.pod5"
+    wildcard_constraints:
+        bsize = r"\d+",
+        batch = r"\d+",
     shell:
        r"""pod5 merge -o {output.pod5} {input}
            ( cd {wildcards.cell} && md5sum -- {params.pod5_base} ) > {output.md5}.tmp
@@ -83,22 +86,24 @@ localrules: merge_pod5_md5sums
 # The copy_pod5 rule in Snakefile.main drives execution of the rule below which runs per barcode
 # (each time for pass+fail) and drives the rule above which actually merges/sums the batches of
 # files.
-
 def i_merge_pod5_md5sums(wildcards):
     """Available POD5 files within a single directory are divided into
        batches of, say, 80
     """
     pod5_dir = f"{wildcards.cell}/pod5_{wildcards.bc}_{wildcards.pf}"
 
     # How many files in total for this barcode? And thus how many merged files?
-    total_files = len(SC[wildcards.cell][wildcards.bc][f'pod5_{wildcards.pf}'])
-    output_pod5_needed = ((total_files - 1) // POD5_BATCH_SIZE) + 1
+    all_files = SC[wildcards.cell][wildcards.bc][f'pod5_{wildcards.pf}']
+    output_pod5_needed = ((len(all_files) - 1) // POD5_BATCH_SIZE) + 1
+
+    # If the files have a common prefix then keep it
+    cp = get_common_prefix(all_files, extn=r"_\d+\.pod5") or 'combined'
 
     # List of md5 files we thus need to generate.
     res = []
     for b in range(output_pod5_needed):
         # FIXME - probably {b:08d} is wrong now?!
-        res.append(f"md5sums/{pod5_dir}/batch{POD5_BATCH_SIZE}_{b:08d}.pod5.md5")
+        res.append(f"md5sums/{pod5_dir}/{cp}_batch{POD5_BATCH_SIZE}_{b:08d}.pod5.md5")
     return res
 
 rule merge_pod5_md5sums:

diff --git a/doc/pod5_remake.txt b/doc/pod5_remake.txt
@@ -19,3 +19,20 @@ I need to fix the runs I made, so:
 20240125_EGS2_27971RLpool01_Run2
 20240126_EGS2_27971RLpool01_Run3
 20240125_EGS2_29490KG
+
+---
+
+OK this is fine, and I confirmed that running Dorado on the batched-up POD5 files produces the same result.
+However, I should really modify my code so that the original prefix of the POD5 files gets included in the
+batched file names. So for example files like:
+
+PAS24884_fail_barcode11_64b522fc_6e8790f5_1.pod5
+
+should be batched into:
+
+PAS24884_fail_barcode11_64b522fc_6e8790f5_batch100_00000000.pod5
+
+Not simply:
+batch100_00000000.pod5
+
+I'll sort this out later, once auto-delivery is back.
diff --git a/doc/sample_names.txt b/doc/sample_names.txt
@@ -37,3 +37,19 @@ like to have this come from Ragic (or wherever).
 If you put the file in after the fact you will have to re-run Hesiod, by removing the
 `report.done` file from the `pipeline` directory. This will be quick because it only needs
 to redo the report. Processing of the actual data is unaffected.
+
+---
+
+With run 20240125_EGS2_29490KG I did this but the sample names were not picked up.
+Did I just make a typo, or is there some deeper issue? What's the quickest way to debug
+the sample names lookup?
+
+$ env ENVIRON_SH=~pipeline/hesiod/current/environ.sh ./sample_names_check.sh 20240125_EGS2_29490KG
+
+So this helper script is helpful. In this case the error is:
+
+error: Unable to parse line 2
+
+So what did I do wrong in /lustre-gseg/promethion/sample_names/29490_sample_names.tsv?
+
+Oh, I misspelt "barcode" as "barocde". Well, I'm just going to make the script tolerate that.
diff --git a/hesiod/__init__.py b/hesiod/__init__.py
@@ -246,4 +246,27 @@ def slurp_file(filename):
     with open(filename) as fh:
         return [ l.rstrip("\n") for l in fh ]
 
+def get_common_prefix(list_of_filenames, extn=r"\..+", base_only=True):
+    """Used by the Snakefiles. Chop extn (which is a regex) off every file
+       in the list and see if that leaves a common prefix. If so, return it.
+       If not, return None.
+    """
+    extn_regex = re.compile(extn + "$")
+    if not list_of_filenames:
+        return None
+
+    if base_only:
+        list_of_filenames = (os.path.basename(f) for f in list_of_filenames)
+    else:
+        list_of_filenames = iter(list_of_filenames)
+
+    common_prefix = re.sub(extn_regex, '', next(list_of_filenames))
+
+    for f in list_of_filenames:
+        p = re.sub(extn_regex, '', f)
+        if p != common_prefix:
+            return None
+
+    return common_prefix
+
 hesiod_version = _determine_version()
diff --git a/scan_cells.py b/scan_cells.py
@@ -15,7 +15,7 @@
 from pprint import pprint, pformat
 
 from hesiod import ( glob, groupby, parse_cell_name, load_final_summary,
-                     fast5_out, find_summary, dump_yaml )
+                     fast5_out, find_summary, dump_yaml, get_common_prefix )
 
 DEFAULT_FILETYPES_TO_SCAN = ["fastq", "fastq.gz", "fast5", "pod5", "bam"]
 
@@ -263,11 +263,14 @@ def find_representative_pod5(cell, infiles, batch_size, try_glob=False):
     pod5_pass = [ bc for bc, bcparts in infiles.items() for plist in bcparts['pod5_pass']  ]
     pod5_fail = [ bc for bc, bcparts in infiles.items() for flist in bcparts['pod5_fail']  ]
 
+    pod5_files = []
     if pod5_pass:
         bc = pod5_pass[0]
+        pod5_files = infiles[bc]['pod5_pass']
         pf = "pass"
     elif pod5_fail:
         bc = pod5_fail[0] # We have no passing reads, but we have fails and can still report
+        pod5_files = infiles[bc]['pod5_fail']
         pf = "fail"
     elif try_glob:
         # In this case, just look for any output file
@@ -277,8 +280,11 @@ def find_representative_pod5(cell, infiles, batch_size, try_glob=False):
         # We really have nothing
         return None
 
+    # The Snakefile.rundata will maintain the common prefix, so get that
+    cp = get_common_prefix(pod5_files, extn=r"_\d+\.pod5") or 'combined'
+
     batch = 0  # This will always be padded to 8 digits
-    return f"{cell}/pod5_{bc}_{pf}/batch{batch_size}_{batch:08d}.pod"
+    return f"{cell}/pod5_{bc}_{pf}/{cp}_batch{batch_size}_{batch:08d}.pod5"
 
 def parse_args(*args):
     description = """Scan the input files for all cells, to provide a work plan for Snakemake"""

diff --git a/test/examples/runs/20231107_MIN2_26171SS/sc_data.yaml b/test/examples/runs/20231107_MIN2_26171SS/sc_data.yaml
@@ -23,7 +23,7 @@ printable_counts: |-
 representative_fast5:
   26171SS0001L02/20231108_1411_MN32284_AOZ898_35873099: null
 representative_pod5:
-  26171SS0001L02/20231108_1411_MN32284_AOZ898_35873099: 26171SS0001L02/20231108_1411_MN32284_AOZ898_35873099/pod5_._pass/batch100_00000000.pod
+  26171SS0001L02/20231108_1411_MN32284_AOZ898_35873099: 26171SS0001L02/20231108_1411_MN32284_AOZ898_35873099/pod5_._pass/AOZ898_pass_35873099_e8dfad65_batch100_00000000.pod5
 scanned_cells:
   26171SS0001L02/20231108_1411_MN32284_AOZ898_35873099:
     .:

diff --git a/test/test_hesiod_init_funcs.py b/test/test_hesiod_init_funcs.py
@@ -16,7 +16,7 @@
 
 from hesiod import ( parse_cell_name, load_final_summary, abspath, groupby, glob,
                      find_sequencing_summary, find_summary, load_yaml, dump_yaml,
-                     empty_sc_data, od_key_replace )
+                     empty_sc_data, od_key_replace, get_common_prefix )
 
 class T(unittest.TestCase):
 
@@ -247,5 +247,31 @@ def test_empty_sc_data(self):
         self.assertEqual( type(empty_sc_data()), dict )
         self.assertEqual( len(empty_sc_data()), 7 )
 
+    def test_common_prefix(self):
+        """Used to help batching up the pod5 files
+        """
+        self.assertEqual( get_common_prefix([]), None )
+
+        self.assertEqual( get_common_prefix([ "foo.txt",
+                                              "foo.dat",
+                                              "foo.bam" ]), "foo" )
+
+        self.assertEqual( get_common_prefix([ "biz/foo.txt",
+                                              "boz/foo.dat",
+                                              "boo/foo.bam",
+                                              "foo.txt.md5" ]), "foo" )
+
+        self.assertEqual( get_common_prefix([ "foo.txt",
+                                              "foo.dat",
+                                              "foo.bam",
+                                              "md5/foo.md5" ],
+                                            extn = r"\.[^.]+"), "foo" )
+
+        self.assertEqual( get_common_prefix([ "foo.txt",
+                                              "foo.dat",
+                                              "foo.bam",
+                                              "md5/foo.txt.md5" ],
+                                            extn = r"\.[^.]+"), None )
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/test_scan_cells.py b/test/test_scan_cells.py
@@ -99,6 +99,22 @@ def test_scan_main_pod5(self):
         # And just to be sure:
         self.assertEqual(sc, expected)
 
+    def test_find_representative_pod5(self):
+        """This is rather redundant given the test above, but technically we can
+           change the batch size, and ther may be no pasing reads.
+        """
+        cell_name = "26171SS0001L02/20231108_1411_MN32284_AOZ898_35873099"
+        sc = scan_cells( f"{DATA_DIR}/runs/20231107_MIN2_26171SS",
+                         cellsready=[cell_name] )['scanned_cells']
+
+        sc[cell_name]['.']['pod5_pass'] = []
+
+        self.assertEqual( find_representative_pod5( cell_name,
+                                                    sc[cell_name],
+                                                    batch_size = 123,
+                                                    try_glob = False),
+                          cell_name + "/pod5_._fail/AOZ898_fail_35873099_e8dfad65_batch123_00000000.pod5" )
+
     def test_scan_error(self):
         """ A missing fast5 file should raise an exception
         """
@@ -142,7 +158,7 @@ def test_find_representative_fast5_bc(self):
     def test_find_representative_fast5_nobc(self):
         """ And for the barcodeless version
         """
-        cell_name = 'testlib/20190710_1723_2-A5-D5_PAD38578_c6ded78b'
+        cell_name = "testlib/20190710_1723_2-A5-D5_PAD38578_c6ded78b"
         sc = scan_cells( f"{DATA_DIR}/runs/20190710_LOCALTEST_00newrun",
                          cellsready=[cell_name] )['scanned_cells']