Skip to content

Commit

Permalink
Various updates. There are still test failures but it's the weekend
Browse files Browse the repository at this point in the history
so I'll come back to it.
  • Loading branch information
tbooth committed Feb 23, 2024
1 parent 6388b3e commit 6f5dbe0
Show file tree
Hide file tree
Showing 10 changed files with 335 additions and 23 deletions.
45 changes: 33 additions & 12 deletions Snakefile.main
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ def get_cell_info( experiment, cell, cell_content, counts, fin_summary,
blobs = None,
nanoplot = None,
pod5_meta = None,
fastq_meta = None,
duplex = None,
minknow_report = None ):
"""Compiles the content of cell_info.yaml from various bits and pieces.
Expand Down Expand Up @@ -213,6 +214,10 @@ def get_cell_info( experiment, cell, cell_content, counts, fin_summary,
if pod5_meta:
ci.update(pod5_meta)

# And the fastq_metadata. For some reason we can only find the basecall model here.
if fastq_meta:
ci['_fastq_metadata'] = fastq_meta

# Add in the barcode-based filter. Here we can choose how to deal with internal
# and external names, but remember that external names in particular must be
# robustly quoted.
Expand Down Expand Up @@ -413,6 +418,7 @@ def one_cell_inputs(wc):
duplex_list = f"duplex_scan/{cell}/pair_ids_filtered.txt",

pod5_meta = f"{cell}/cell_pod5_metadata.yaml",
fastq_meta = f"{cell}/cell_fastq_metadata.yaml",
fin_summary = f"{cell}/cell_final_summary.yaml",

sample_names = f"{cell}/sample_names.yaml",
Expand Down Expand Up @@ -467,6 +473,7 @@ rule one_cell:
fin_summary = fin_summary,
blobs = input.blobs,
pod5_meta = load_yaml(input.pod5_meta),
fastq_meta = load_yaml(input.fastq_meta),
nanoplot = input.nanoplot_r[0],
minknow_report = minknow_report,
sample_names = load_yaml(input.sample_names) )
Expand All @@ -480,27 +487,41 @@ rule sample_names:
shell:
"sample_names_fetch.py {wildcards.cell}"

def find_representative_file(ftype):
    """Returns a function that finds files of type ftype for a cell.

       ftype is 'pod5' or 'fastq', matching the 'representative_{ftype}'
       keys in SC_DATA. The returned function is suitable for use as a
       Snakemake input function (it takes the rule wildcards).
    """
    def _find_representative(wildcards):
        """Looks in SC_DATA for a representative POD5/FASTQ for this cell.
           If there is none, raises an exception.
        """
        rep_file = SC_DATA[f'representative_{ftype}'][wildcards.cell]
        if not rep_file:
            # Must be an f-string, or the {placeholders} are printed literally
            raise RuntimeError(f"No representative {ftype} for {wildcards.cell} in SC_DATA")

        return rep_file

    return _find_representative
localrules: pod5_metadata, fastq_metadata

# This should get the same metadata from the batched or un-batched POD5 meaning it can be run
# even after the originals are deleted.
# The weird input definition tries to handle this. Whichever we use we only need a single file,
# so return the first.
# This is now always run on the output (batched) pod5.
# The scan_cells.py script predicts what the name of a suitable file will be.
# We only need one, as every read in every file has the same metadata.
rule pod5_metadata:
    # Summarise one representative POD5 file (chosen by scan_cells.py via
    # SC_DATA) into a per-cell YAML. Every read in every file carries the
    # same run metadata, so a single file suffices.
    output: "{cell}/cell_pod5_metadata.yaml"
    input:
        pod5 = find_representative_file("pod5")
    shell:
        "get_pod5_metadata.py -v {input.pod5} > {output}"

# Similar shtick for fastq_metadata.
rule fastq_metadata:
output: "{cell}/cell_fastq_metadata.yaml"
input:
fastq = find_representative_file("fastq")
shell:
"get_pod5_metadata.py {input.fastq} > {output}"

# Only if the run directory is in place, load the rules that filter, compress and combine the
# original files.
if EXPDIR:
Expand Down
46 changes: 46 additions & 0 deletions doc/dorado_metadata.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
After releasing Hesiod 3.x, which is the Dorado+POD5 update, I got Rob to do a test run:

https://egcloud.bio.ed.ac.uk/hesiod/20240222_EGS2_Is_PromethION_Working/

The run itself is a bit funky (used barcodes but the samples look to be un-barcoded) but
Hesiod processed it fine (note I had to manually set it as an internal run to trigger
getting a report).

A few issues:

We're making a load of empty fastq.gz files. In other pipelines we avoid this. I'm not sure
if this is by accident or design here? Anyway, it works so I'll leave it for now.

The Metadata reports "Guppy Version" and "Guppy Config". Of course we're now using Dorado.

get_pod5_metadata.py now gives me:

Software: MinKNOW 23.11.7 (Bream 7.8.2, Core 5.8.6, Dorado 7.2.13+fba8e8925)

Which seems rather more informative so probably I should use that. Note that even for slightly
older runs (eg. 20240124_EGS2_27971RLpool01) I see:

Software: MinKNOW 23.04.5 (Bream 7.5.9, Core 5.5.3, Guppy 6.5.7+ca6d6af)

So I should deffo be using this in the Hesiod reports now, not Guppy Version.
But note that this gets used for the deliveries, so I need to modify
disseminate_results.py and the template too!!

Also "Guppy Config" does not reveal the model version, which in this case is:

[email protected]
^^^^^

So I need to get this from the POD5 (or from the FASTQ header even??).

OK.

Hmm. The "Guppy Config" item in the report is coming from:

res['BasecallConfig'] = context_tags.get('basecall_config_filename', 'unknown')

But this doesn't capture the model number. It doesn't seem to be in the POD5 metadata
at all. Or anywhere else! I think I'm going to need to get this from the actual FASTQ
file header. What a PITA!

OK, done. Let's incorporate this.
1 change: 0 additions & 1 deletion get_fast5_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import gzip
from io import BytesIO
import shutil
import math
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from collections import OrderedDict
from contextlib import suppress
Expand Down
110 changes: 110 additions & 0 deletions get_fastq_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#!/usr/bin/env python3

"""Given a (directory of) .fastq(.gz) file(s), extract some metadata from the first line:
1) runid
2) Start time of the run
3) flow_cell_id
4) barcode
5) basecall_model (apparently we can't get this from elsewhere)
Inputs:
A directory where .fastq(.gz) files may be found, or a file
"""

import os, sys, re
import logging
import gzip
import shutil
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from collections import OrderedDict
from contextlib import suppress

# For parsing of ISO/RFC format dates (note that newer Python has datetime.datetime.fromisoformat
# but we're using dateutil.parser.isoparse from python-dateutil 2.8)
# Actually, the time in the header lines here is per-read, and not useful to us anyway.
# The POD5 file knows the cell start time.
#from dateutil.parser import isoparse

from hesiod import dump_yaml, glob

def main(args):
    """Entry point: set up logging, extract the metadata from the named
       file (or from a directory of files), and print it as YAML on stdout.
    """
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig( level = log_level,
                         format = "{levelname}:{message}",
                         style = '{')

    # A directory gets scanned for fastq files; anything else is treated
    # as a single file.
    if os.path.isdir(args.fastq):
        logging.debug(f"Scanning .fastq[.gz] files in {args.fastq!r}")
        extractor = md_from_fastq_dir
    else:
        logging.debug(f"Reading from single file {args.fastq!r}")
        extractor = md_from_fastq_file

    print(dump_yaml(extractor(args.fastq)), end='')

def md_from_fastq_dir(fq_dir):
    """Read from the directory of fastq files and return a dict of metadata
       from the first header of the first file.
    """
    gz_hits = glob(os.path.join(fq_dir, '*.fastq.gz'))

    if gz_hits:
        fq_files = gz_hits
    else:
        # No compressed files, so fall back to plain .fastq
        logging.debug("No .fastq.gz files, maybe .fastq?")
        fq_files = glob(os.path.join(fq_dir, "*.fastq"))

    logging.debug(f"Found {len(fq_files)} files")

    if not fq_files:
        raise RuntimeError("No fastq[.gz] files found.")

    # Every read carries the same metadata, so the first file will do
    return md_from_fastq_file(fq_files[0])

def md_from_fastq_file(fq_file):
    """Read from a specified fastq file and return a dict of metadata.

       Only the first header line is examined - every read in the file
       is assumed to carry the same metadata.

       Raises RuntimeError if the file is empty or does not look like FASTQ.
    """
    # Transparently handle gzipped and plain files
    _open = gzip.open if fq_file.endswith('.gz') else open
    with _open(fq_file, 'rt') as fh:
        # next(fh, None) rather than next(fh), so that an empty file gives
        # a proper error message instead of a bare StopIteration.
        first_line = next(fh, None)

    if first_line is None:
        raise RuntimeError(f"No reads found in {fq_file}")

    if not first_line.startswith("@"):
        raise RuntimeError(f"Not a FASTQ header line:\n{first_line}")

    return md_from_header_line(first_line.rstrip("\n"))

def md_from_header_line(hline):
"""Extract the goodies from the header line.
"""
hdict = dict([p.split("=", 1) for p in hline.split() if "=" in p])

res = OrderedDict()

for k, v in dict( runid = None,
flowcell = "flow_cell_id",
experiment = "protocol_group_id",
sample = "sample_id",
barcode = None,
basecall_model = "basecall_model_version_id" ).items():
if hdict.get(v or k):
res[k] = hdict.get(v or k)

# Add this if missing
for x in ['basecall_model']:
res.setdefault(x, 'unknown')

return res

def parse_args(*args):
    """Build the command-line parser and parse the supplied args
       (or sys.argv if none are given).
    """
    parser = ArgumentParser(
                description = """Extract various bits of metadata from the first read in a FASTQ file.""",
                formatter_class = ArgumentDefaultsHelpFormatter )

    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Print progress to stderr")
    parser.add_argument("fastq", default='.', nargs='?',
                        help="A file, or a directory to scan for .fastq[.gz] files")

    return parser.parse_args(*args)

# Script entry point - parse the command line and run
if __name__=="__main__":
    main(parse_args())

10 changes: 7 additions & 3 deletions get_pod5_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def read_pod5(p5_filename):
"""Gets the metadata from the first read record in a single pod5 file
"""
res = OrderedDict()
for x in ['POD5Version', 'StartTime', 'GuppyVersion']:
for x in ['POD5Version', 'StartTime', 'Software']:
res[x] = 'unknown'

p5_handle = pod5.Reader(Path(p5_filename))
Expand All @@ -94,7 +94,7 @@ def read_pod5(p5_filename):
# read, so just get the first one, and dict-ify it.
read0 = vars(next(p5_handle.reads()).run_info)

# Run ID used to be in the filename, but not now with the batched pod5 files
# Run ID used to be in the filename, but to be sure get it here
res['RunID'] = read0['acquisition_id']
res['Software'] = read0['software']

Expand All @@ -118,8 +118,12 @@ def read_pod5(p5_filename):

# Stuff from 'tracking_id'
tracking_id = dict(read0['tracking_id'])

# This is useful, being the experiment start time not the read start time
res['StartTime'] = tracking_id['exp_start_time']
res['GuppyVersion'] = tracking_id['guppy_version']
# Note that 'guppy_version' will contain the Dorado version if Dorado is used
# We'll rely on read0['software'] from now on.
#res['GuppyVersion'] = tracking_id['guppy_version']

finally:
p5_handle.close()
Expand Down
11 changes: 9 additions & 2 deletions make_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,22 @@ def get_cell_metadata(ci):
('Files in pass',),
('Files in fail',),
('SequencingKit', 'Sequencing Kit'),
('GuppyVersion', 'Guppy Version'),
('BasecallConfig', 'Guppy Config'),
('Software', 'Software'),
('BasecallConfig', 'Basecaller Config'),
(None, 'Basecall Model'),
('SamplingFrequency', 'Sampling Freq'),
('ExperimentType', 'Input Type'),
('Slot',),
('CellID', 'Cell ID'),
('RunID', 'Run UUID') ]:
if len(x) == 1:
# Get that key
res[x[0]] = ci.get(x[0], 'unknown')
elif x[0]:
# Get that key but rename it
res[x[1]] = ci.get(x[0], 'unknown')
else:
# Add a placeholder but don't set it yet
res[x[1]] = 'unknown'

if '_final_summary' in ci:
Expand All @@ -46,6 +50,9 @@ def get_cell_metadata(ci):
res['End Time'] = strftime(fs['processing_stopped'])
res['Run Time'] = fs['run_time']

if '_fastq_metadata' in ci:
res['Basecall Model'] = ci['_fastq_metadata']['basecall_model']

return res

def format_counts_per_cells(cells, heading="Read summary"):
Expand Down
Loading

0 comments on commit 6f5dbe0

Please sign in to comment.