Fix pod5 naming screw-up

EdinburghGenomics · Feb 6, 2024 · 85a45db · 85a45db
1 parent 65f3232
commit 85a45db
Show file tree

Hide file tree

Showing 4 changed files with 34 additions and 5 deletions.
diff --git a/Snakefile.rundata b/Snakefile.rundata
@@ -66,12 +66,12 @@ def i_merge_md5sum_pod5_batch(wildcards):
 
 rule merge_md5sum_pod5_batch:
     output:
-        pod5 = "{cell}/pod5_{barcode}_{pfs}/batch{bsize}_{batch}.pod",
-        md5  = "md5sums/{cell}/pod5_{barcode}_{pfs}/batch{bsize}_{batch}.pod.md5"
+        pod5 = "{cell}/pod5_{barcode}_{pfs}/batch{bsize}_{batch}.pod5",
+        md5  = "md5sums/{cell}/pod5_{barcode}_{pfs}/batch{bsize}_{batch}.pod5.md5"
     input:
         i_merge_md5sum_pod5_batch
     params:
-        pod5_base = "pod5_{barcode}_{pfs}/batch{bsize}_{batch}.pod"
+        pod5_base = "pod5_{barcode}_{pfs}/batch{bsize}_{batch}.pod5"
     shell:
        r"""pod5 merge -o {output.pod5} {input}
            ( cd {wildcards.cell} && md5sum -- {params.pod5_base} ) > {output.md5}.tmp
@@ -98,7 +98,7 @@ def i_merge_pod5_md5sums(wildcards):
     res = []
     for b in range(output_pod5_needed):
         # FIXME - probably {b:08d} is wrong now?!
-        res.append(f"md5sums/{pod5_dir}/batch{POD5_BATCH_SIZE}_{b:08d}.pod.md5")
+        res.append(f"md5sums/{pod5_dir}/batch{POD5_BATCH_SIZE}_{b:08d}.pod5.md5")
     return res
 
 rule merge_pod5_md5sums:

diff --git a/doc/pod5_remake.txt b/doc/pod5_remake.txt
@@ -0,0 +1,21 @@
+My mistake: I made the Hesiod pipeline create batched pod5 files with the .pod extension
+instead of .pod5. Of course, Dorado ignores them.
+
+I need to fix the runs I made, so:
+
+1) Test on ~/test_promethion/fastqdata/20231107_MIN2_26171SS
+
+* Check the md5sums
+* Remove the existing pod5_._fail and pod5_._pass directories
+* And remove the corresponding pod5_._fail and pod5_._pass directories under md5sums too
+* Re-run the pipeline
+* Is any further manual intervention needed after the re-run? No
+* Check the md5sums again - ok
+* Check that Dorado will read the files now - seems so
+
+2) And then release new Hesiod and repeat for the actual runs:
+
+20240124_EGS2_27971RLpool01
+20240125_EGS2_27971RLpool01_Run2
+20240126_EGS2_27971RLpool01_Run3
+20240125_EGS2_29490KG
diff --git a/get_pod5_metadata.py b/get_pod5_metadata.py
@@ -94,10 +94,15 @@ def read_pod5(p5_filename):
         # read, so just get the first one, and dict-ify it.
         read0 = vars(next(p5_handle.reads()).run_info)
 
-        # Run ID (should be in the filename anyway!)
+        # Run ID used to be in the filename, but not now with the batched pod5 files
         res['RunID'] = read0['acquisition_id']
         res['Software'] = read0['software']
 
+        # Redundant but still useful to extract
+        res['FlowcellId'] = read0['flow_cell_id']
+        res['FlowcellType'] = read0['flow_cell_product_code']
+        res['Sample'] = read0['sample_id']
+
         # Stuff from 'context_tags'
         context_tags = dict(read0['context_tags'])
         res['ExperimentType'] = context_tags.get('experiment_type', 'unknown')

diff --git a/test/test_get_pod5_metadata.py b/test/test_get_pod5_metadata.py
@@ -50,7 +50,10 @@ def test_converted_pod5(self):
                          RunID             = 'b7f7032d28779ac6666af1b4fd724bf2ec41ec25',
                          SamplingFrequency = '4.0 kHz',
                          ExperimentType    = 'genomic_dna',
+                         FlowcellId        = 'PAK00002',
+                         FlowcellType      = 'FLO-PRO002',
                          SequencingKit     = 'sqk-lsk109',
+                         Sample            = '14211AT0082',
                          BasecallConfig    = 'dna_r9.4.1_450bps_hac_prom.cfg' )
 
         self.assertEqual(dict(md), expected)