Skip to content

Commit

Permalink
Cleared up some bugs, and POD5 files now have more fulsome names.
Browse files Browse the repository at this point in the history
  • Loading branch information
tbooth committed Feb 7, 2024
1 parent 0971407 commit 48b401d
Show file tree
Hide file tree
Showing 8 changed files with 122 additions and 13 deletions.
21 changes: 13 additions & 8 deletions Snakefile.rundata
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# vim: ft=python

from hesiod import ( load_final_summary, find_sequencing_summary, find_summary,
dump_yaml, load_yaml )
get_common_prefix, dump_yaml, load_yaml )

# Rules to filter, compress and combine the original files.
# These rules are designed to be included in Snakefile.main and will not run standalone.
Expand Down Expand Up @@ -66,12 +66,15 @@ def i_merge_md5sum_pod5_batch(wildcards):

rule merge_md5sum_pod5_batch:
output:
pod5 = "{cell}/pod5_{barcode}_{pfs}/batch{bsize}_{batch}.pod5",
md5 = "md5sums/{cell}/pod5_{barcode}_{pfs}/batch{bsize}_{batch}.pod5.md5"
pod5 = "{cell}/pod5_{barcode}_{pfs}/{cp}_batch{bsize}_{batch}.pod5",
md5 = "md5sums/{cell}/pod5_{barcode}_{pfs}/{cp}_batch{bsize}_{batch}.pod5.md5"
input:
i_merge_md5sum_pod5_batch
params:
pod5_base = "pod5_{barcode}_{pfs}/batch{bsize}_{batch}.pod5"
pod5_base = "pod5_{barcode}_{pfs}/{cp}_batch{bsize}_{batch}.pod5"
wildcard_constraints:
bsize = r"\d+",
batch = r"\d+",
shell:
r"""pod5 merge -o {output.pod5} {input}
( cd {wildcards.cell} && md5sum -- {params.pod5_base} ) > {output.md5}.tmp
Expand All @@ -83,22 +86,24 @@ localrules: merge_pod5_md5sums
# The copy_pod5 rule in Snakefile.main drives execution of the rule below which runs per barcode
# (each time for pass+fail) and drives the rule above which actually merges/sums the batches of
# files.

def i_merge_pod5_md5sums(wildcards):
"""Available POD5 files within a single directory are divided into
batches of, say, 80
"""
pod5_dir = f"{wildcards.cell}/pod5_{wildcards.bc}_{wildcards.pf}"

# How many files in total for this barcode? And thus how many merged files?
total_files = len(SC[wildcards.cell][wildcards.bc][f'pod5_{wildcards.pf}'])
output_pod5_needed = ((total_files - 1) // POD5_BATCH_SIZE) + 1
all_files = SC[wildcards.cell][wildcards.bc][f'pod5_{wildcards.pf}']
output_pod5_needed = ((len(all_files) - 1) // POD5_BATCH_SIZE) + 1

# If the files have a common prefix then keep it
cp = get_common_prefix(all_files, extn=r"_\d+\.pod5") or 'combined'

# List of md5 files we thus need to generate.
res = []
for b in range(output_pod5_needed):
# FIXME - probably {b:08d} is wrong now?!
res.append(f"md5sums/{pod5_dir}/batch{POD5_BATCH_SIZE}_{b:08d}.pod5.md5")
res.append(f"md5sums/{pod5_dir}/{cp}_batch{POD5_BATCH_SIZE}_{b:08d}.pod5.md5")
return res

rule merge_pod5_md5sums:
Expand Down
17 changes: 17 additions & 0 deletions doc/pod5_remake.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,20 @@ I need to fix the runs I made, so:
20240125_EGS2_27971RLpool01_Run2
20240126_EGS2_27971RLpool01_Run3
20240125_EGS2_29490KG

---

OK this is fine, and I confirmed that running Dorado on the batched-up POD5 files produces the same result.
However, I should really modify my code so that the original prefix of the POD5 files gets included in the
batched file names. So for example files like:

PAS24884_fail_barcode11_64b522fc_6e8790f5_1.pod5

should be batched into:

PAS24884_fail_barcode11_64b522fc_6e8790f5_batch100_00000000.pod5

Not simply:
batch100_00000000.pod5

I'll sort this out later, once auto-delivery is back.
16 changes: 16 additions & 0 deletions doc/sample_names.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,19 @@ like to have this come from Ragic (or wherever).
If you put the file in after the fact you will have to re-run Hesiod, by removing the
`report.done` file from the `pipeline` directory. This will be quick because it only needs
to redo the report. Processing of the actual data is unaffected.

---

With run 20240125_EGS2_29490KG I did this but the sample names were not picked up.
Did I just make a typo, or is there some deeper issue? What's the quickest way to debug
the sample names lookup?

$ env ENVIRON_SH=~pipeline/hesiod/current/environ.sh ./sample_names_check.sh 20240125_EGS2_29490KG

So this helper script is helpful. In this case the error is:

error: Unable to parse line 2

So what did I do wrong in /lustre-gseg/promethion/sample_names/29490_sample_names.tsv?

Oh, I misspelt "barcode" as "barocde". Well, I'm just going to make the script tolerate that.
23 changes: 23 additions & 0 deletions hesiod/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,4 +246,27 @@ def slurp_file(filename):
with open(filename) as fh:
return [ l.rstrip("\n") for l in fh ]

def get_common_prefix(list_of_filenames, extn=r"\..+", base_only=True):
"""Used by the Snakefiles. Chop extn (which is a regex) off every file
in the list and see if that leaves a common prefix. If so, return it.
If not, return None.
"""
extn_regex = re.compile(extn + "$")
if not list_of_filenames:
return None

if base_only:
list_of_filenames = (os.path.basename(f) for f in list_of_filenames)
else:
list_of_filenames = iter(list_of_filenames)

common_prefix = re.sub(extn_regex, '', next(list_of_filenames))

for f in list_of_filenames:
p = re.sub(extn_regex, '', f)
if p != common_prefix:
return None

return common_prefix

hesiod_version = _determine_version()
10 changes: 8 additions & 2 deletions scan_cells.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from pprint import pprint, pformat

from hesiod import ( glob, groupby, parse_cell_name, load_final_summary,
fast5_out, find_summary, dump_yaml )
fast5_out, find_summary, dump_yaml, get_common_prefix )

DEFAULT_FILETYPES_TO_SCAN = ["fastq", "fastq.gz", "fast5", "pod5", "bam"]

Expand Down Expand Up @@ -263,11 +263,14 @@ def find_representative_pod5(cell, infiles, batch_size, try_glob=False):
pod5_pass = [ bc for bc, bcparts in infiles.items() for plist in bcparts['pod5_pass'] ]
pod5_fail = [ bc for bc, bcparts in infiles.items() for flist in bcparts['pod5_fail'] ]

pod5_files = []
if pod5_pass:
bc = pod5_pass[0]
pod5_files = infiles[bc]['pod5_pass']
pf = "pass"
elif pod5_fail:
bc = pod5_fail[0] # We have no passing reads, but we have fails and can still report
pod5_files = infiles[bc]['pod5_fail']
pf = "fail"
elif try_glob:
# In this case, just look for any output file
Expand All @@ -277,8 +280,11 @@ def find_representative_pod5(cell, infiles, batch_size, try_glob=False):
# We really have nothing
return None

# The Snakefile.rundata will maintain the common prefix, so get that
cp = get_common_prefix(pod5_files, extn=r"_\d+\.pod5") or 'combined'

batch = 0 # This will always be padded to 8 digits
return f"{cell}/pod5_{bc}_{pf}/batch{batch_size}_{batch:08d}.pod"
return f"{cell}/pod5_{bc}_{pf}/{cp}_batch{batch_size}_{batch:08d}.pod5"

def parse_args(*args):
description = """Scan the input files for all cells, to provide a work plan for Snakemake"""
Expand Down
2 changes: 1 addition & 1 deletion test/examples/runs/20231107_MIN2_26171SS/sc_data.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ printable_counts: |-
representative_fast5:
26171SS0001L02/20231108_1411_MN32284_AOZ898_35873099: null
representative_pod5:
26171SS0001L02/20231108_1411_MN32284_AOZ898_35873099: 26171SS0001L02/20231108_1411_MN32284_AOZ898_35873099/pod5_._pass/batch100_00000000.pod
26171SS0001L02/20231108_1411_MN32284_AOZ898_35873099: 26171SS0001L02/20231108_1411_MN32284_AOZ898_35873099/pod5_._pass/AOZ898_pass_35873099_e8dfad65_batch100_00000000.pod5
scanned_cells:
26171SS0001L02/20231108_1411_MN32284_AOZ898_35873099:
.:
Expand Down
28 changes: 27 additions & 1 deletion test/test_hesiod_init_funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from hesiod import ( parse_cell_name, load_final_summary, abspath, groupby, glob,
find_sequencing_summary, find_summary, load_yaml, dump_yaml,
empty_sc_data, od_key_replace )
empty_sc_data, od_key_replace, get_common_prefix )

class T(unittest.TestCase):

Expand Down Expand Up @@ -247,5 +247,31 @@ def test_empty_sc_data(self):
self.assertEqual( type(empty_sc_data()), dict )
self.assertEqual( len(empty_sc_data()), 7 )

def test_common_prefix(self):
"""Used to help batching up the pod5 files
"""
self.assertEqual( get_common_prefix([]), None )

self.assertEqual( get_common_prefix([ "foo.txt",
"foo.dat",
"foo.bam" ]), "foo" )

self.assertEqual( get_common_prefix([ "biz/foo.txt",
"boz/foo.dat",
"boo/foo.bam",
"foo.txt.md5" ]), "foo" )

self.assertEqual( get_common_prefix([ "foo.txt",
"foo.dat",
"foo.bam",
"md5/foo.md5" ],
extn = r"\.[^.]+"), "foo" )

self.assertEqual( get_common_prefix([ "foo.txt",
"foo.dat",
"foo.bam",
"md5/foo.txt.md5" ],
extn = r"\.[^.]+"), None )

if __name__ == '__main__':
unittest.main()
18 changes: 17 additions & 1 deletion test/test_scan_cells.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,22 @@ def test_scan_main_pod5(self):
# And just to be sure:
self.assertEqual(sc, expected)

def test_find_representative_pod5(self):
"""This is rather redundant given the test above, but technically we can
change the batch size, and ther may be no pasing reads.
"""
cell_name = "26171SS0001L02/20231108_1411_MN32284_AOZ898_35873099"
sc = scan_cells( f"{DATA_DIR}/runs/20231107_MIN2_26171SS",
cellsready=[cell_name] )['scanned_cells']

sc[cell_name]['.']['pod5_pass'] = []

self.assertEqual( find_representative_pod5( cell_name,
sc[cell_name],
batch_size = 123,
try_glob = False),
cell_name + "/pod5_._fail/AOZ898_fail_35873099_e8dfad65_batch123_00000000.pod5" )

def test_scan_error(self):
""" A missing fast5 file should raise an exception
"""
Expand Down Expand Up @@ -142,7 +158,7 @@ def test_find_representative_fast5_bc(self):
def test_find_representative_fast5_nobc(self):
""" And for the barcodeless version
"""
cell_name = 'testlib/20190710_1723_2-A5-D5_PAD38578_c6ded78b'
cell_name = "testlib/20190710_1723_2-A5-D5_PAD38578_c6ded78b"
sc = scan_cells( f"{DATA_DIR}/runs/20190710_LOCALTEST_00newrun",
cellsready=[cell_name] )['scanned_cells']

Expand Down

0 comments on commit 48b401d

Please sign in to comment.