Skip to content

Commit

Permalink
Various updates. There are still test failures but it's the weekend
Browse files Browse the repository at this point in the history
so I'll come back to it.
  • Loading branch information
tbooth committed Feb 23, 2024
1 parent 6388b3e commit 6f5dbe0
Show file tree
Hide file tree
Showing 10 changed files with 335 additions and 23 deletions.
45 changes: 33 additions & 12 deletions Snakefile.main
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ def get_cell_info( experiment, cell, cell_content, counts, fin_summary,
blobs = None,
nanoplot = None,
pod5_meta = None,
fastq_meta = None,
duplex = None,
minknow_report = None ):
"""Compiles the content of cell_info.yaml from various bits and pieces.
Expand Down Expand Up @@ -213,6 +214,10 @@ def get_cell_info( experiment, cell, cell_content, counts, fin_summary,
if pod5_meta:
ci.update(pod5_meta)

# And the fastq_metadata. For some reason we can only find the basecall model here.
if fastq_meta:
ci['_fastq_metadata'] = fastq_meta

# Add in the barcode-based filter. Here we can choose how to deal with internal
# and external names, but remember that external names in particular must be
# robustly quoted.
Expand Down Expand Up @@ -413,6 +418,7 @@ def one_cell_inputs(wc):
duplex_list = f"duplex_scan/{cell}/pair_ids_filtered.txt",

pod5_meta = f"{cell}/cell_pod5_metadata.yaml",
fastq_meta = f"{cell}/cell_fastq_metadata.yaml",
fin_summary = f"{cell}/cell_final_summary.yaml",

sample_names = f"{cell}/sample_names.yaml",
Expand Down Expand Up @@ -467,6 +473,7 @@ rule one_cell:
fin_summary = fin_summary,
blobs = input.blobs,
pod5_meta = load_yaml(input.pod5_meta),
fastq_meta = load_yaml(input.fastq_meta),
nanoplot = input.nanoplot_r[0],
minknow_report = minknow_report,
sample_names = load_yaml(input.sample_names) )
Expand All @@ -480,27 +487,41 @@ rule sample_names:
shell:
"sample_names_fetch.py {wildcards.cell}"

def find_representative_file(ftype):
    """Returns a function that finds files of type ftype for a cell.

       ftype is 'pod5' or 'fastq', matching the 'representative_{ftype}'
       keys in SC_DATA. The returned function is suitable for use as a
       Snakemake input function (it takes the rule wildcards).
    """
    def _find_representative(wildcards):
        """Looks in SC_DATA for a representative POD5/FASTQ for this cell.
           If there is none, raises an exception.
        """
        rep_file = SC_DATA[f'representative_{ftype}'][wildcards.cell]
        if not rep_file:
            # Must be an f-string, or the {placeholders} are printed literally
            raise RuntimeError(f"No representative {ftype} for {wildcards.cell} in SC_DATA")

        return rep_file

    return _find_representative
localrules: pod5_metadata, fastq_metadata

# This should get the same metadata from the batched or un-batched POD5 meaning it can be run
# even after the originals are deleted.
# The weird input definition tries to handle this. Whichever we use we only need a single file,
# so return the first.
# This is now always run on the output (batched) pod5.
# The scan_cells.py script predicts what the name of a suitable file will be.
# We only need one, as every read in every file has the same metadata.
rule pod5_metadata:
    # Summarise one representative POD5 file (chosen by scan_cells.py via
    # SC_DATA) into a per-cell YAML. Every read in every file carries the
    # same run metadata, so a single file suffices.
    output: "{cell}/cell_pod5_metadata.yaml"
    input:
        pod5 = find_representative_file("pod5")
    shell:
        "get_pod5_metadata.py -v {input.pod5} > {output}"

# Similar shtick for fastq_metadata.
rule fastq_metadata:
output: "{cell}/cell_fastq_metadata.yaml"
input:
fastq = find_representative_file("fastq")
shell:
"get_pod5_metadata.py {input.fastq} > {output}"

# Only if the run directory is in place, load the rules that filter, compress and combine the
# original files.
if EXPDIR:
Expand Down
46 changes: 46 additions & 0 deletions doc/dorado_metadata.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
After releasing Hesiod 3.x, which is the Dorado+POD5 update, I got Rob to do a test run:

https://egcloud.bio.ed.ac.uk/hesiod/20240222_EGS2_Is_PromethION_Working/

The run itself is a bit funky (used barcodes but the samples look to be un-barcoded) but
Hesiod processed it fine (note I had to manually set it as an internal run to trigger
getting a report).

A few issues:

We're making a load of empty fastq.gz files. In other pipelines we avoid this. I'm not sure
if this is by accident or design here? Anyway, it works so I'll leave it for now.

The Metadata reports "Guppy Version" and "Guppy Config". Of course we're now using Dorado.

get_pod5_metadata.py now gives me:

Software: MinKNOW 23.11.7 (Bream 7.8.2, Core 5.8.6, Dorado 7.2.13+fba8e8925)

Which seems rather more informative so probably I should use that. Note that even for slightly
older runs (eg. 20240124_EGS2_27971RLpool01) I see:

Software: MinKNOW 23.04.5 (Bream 7.5.9, Core 5.5.3, Guppy 6.5.7+ca6d6af)

So I should deffo be using this in the Hesiod reports now, not Guppy Version.
But note that this gets used for the deliveries, so I need to modify
disseminate_results.py and the template too!!

Also "Guppy Config" does not reveal the model version, which in this case is:

[email protected]
^^^^^

So I need to get this from the POD5 (or from the FASTQ header even??).

OK.

Hmm. The "Guppy Config" item in the report is coming from:

res['BasecallConfig'] = context_tags.get('basecall_config_filename', 'unknown')

But this doesn't capture the model number. It doesn't seem to be in the POD5 metadata
at all. Or anywhere else! I think I'm going to need to get this from the actual FASTQ
file header. What a PITA!

OK, done. Let's incorporate this.
1 change: 0 additions & 1 deletion get_fast5_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import gzip
from io import BytesIO
import shutil
import math
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from collections import OrderedDict
from contextlib import suppress
Expand Down
110 changes: 110 additions & 0 deletions get_fastq_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#!/usr/bin/env python3

"""Given a (directory of) .fastq(.gz) file(s), extract some metadata from the first line:
1) runid
2) Start time of the run
3) flow_cell_id
4) barcode
5) basecall_model (apparently we can't get this from elsewhere)
Inputs:
A directory where .fastq(.gz) files may be found, or a file
"""

import os, sys, re
import logging
import gzip
import shutil
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from collections import OrderedDict
from contextlib import suppress

# For parsing of ISO/RFC format dates (note that newer Python has datetime.datetime.fromisoformat
# but we're using dateutil.parser.isoparse from python-dateutil 2.8)
# Actually, the time in the header lines here is per-read, and not useful to us anyway.
# The POD5 file knows the cell start time.
#from dateutil.parser import isoparse

from hesiod import dump_yaml, glob

def main(args):
    """Entry point: set up logging, extract the metadata from the named
       file (or from a directory of files), and print it as YAML on stdout.
    """
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig( level = log_level,
                         format = "{levelname}:{message}",
                         style = '{')

    # A directory gets scanned for fastq files; anything else is treated
    # as a single file.
    if os.path.isdir(args.fastq):
        logging.debug(f"Scanning .fastq[.gz] files in {args.fastq!r}")
        extractor = md_from_fastq_dir
    else:
        logging.debug(f"Reading from single file {args.fastq!r}")
        extractor = md_from_fastq_file

    print(dump_yaml(extractor(args.fastq)), end='')

def md_from_fastq_dir(fq_dir):
    """Read from the directory of fastq files and return a dict of metadata
       from the first header of the first file.
    """
    gz_hits = glob(os.path.join(fq_dir, '*.fastq.gz'))

    if gz_hits:
        fq_files = gz_hits
    else:
        # No compressed files, so fall back to plain .fastq
        logging.debug("No .fastq.gz files, maybe .fastq?")
        fq_files = glob(os.path.join(fq_dir, "*.fastq"))

    logging.debug(f"Found {len(fq_files)} files")

    if not fq_files:
        raise RuntimeError("No fastq[.gz] files found.")

    # Every read carries the same metadata, so the first file will do
    return md_from_fastq_file(fq_files[0])

def md_from_fastq_file(fq_file):
    """Read from a specified fastq file and return a dict of metadata.

       Only the first header line is examined - every read in the file
       is assumed to carry the same metadata.

       Raises RuntimeError if the file is empty or does not look like FASTQ.
    """
    # Transparently handle gzipped and plain files
    _open = gzip.open if fq_file.endswith('.gz') else open
    with _open(fq_file, 'rt') as fh:
        # next(fh, None) rather than next(fh), so that an empty file gives
        # a proper error message instead of a bare StopIteration.
        first_line = next(fh, None)

    if first_line is None:
        raise RuntimeError(f"No reads found in {fq_file}")

    if not first_line.startswith("@"):
        raise RuntimeError(f"Not a FASTQ header line:\n{first_line}")

    return md_from_header_line(first_line.rstrip("\n"))

def md_from_header_line(hline):
"""Extract the goodies from the header line.
"""
hdict = dict([p.split("=", 1) for p in hline.split() if "=" in p])

res = OrderedDict()

for k, v in dict( runid = None,
flowcell = "flow_cell_id",
experiment = "protocol_group_id",
sample = "sample_id",
barcode = None,
basecall_model = "basecall_model_version_id" ).items():
if hdict.get(v or k):
res[k] = hdict.get(v or k)

# Add this if missing
for x in ['basecall_model']:
res.setdefault(x, 'unknown')

return res

def parse_args(*args):
    """Build the command-line parser and parse the supplied args
       (or sys.argv if none are given).
    """
    parser = ArgumentParser(
                description = """Extract various bits of metadata from the first read in a FASTQ file.""",
                formatter_class = ArgumentDefaultsHelpFormatter )

    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Print progress to stderr")
    parser.add_argument("fastq", default='.', nargs='?',
                        help="A file, or a directory to scan for .fastq[.gz] files")

    return parser.parse_args(*args)

# Script entry point - parse the command line and run
if __name__=="__main__":
    main(parse_args())

10 changes: 7 additions & 3 deletions get_pod5_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def read_pod5(p5_filename):
"""Gets the metadata from the first read record in a single pod5 file
"""
res = OrderedDict()
for x in ['POD5Version', 'StartTime', 'GuppyVersion']:
for x in ['POD5Version', 'StartTime', 'Software']:
res[x] = 'unknown'

p5_handle = pod5.Reader(Path(p5_filename))
Expand All @@ -94,7 +94,7 @@ def read_pod5(p5_filename):
# read, so just get the first one, and dict-ify it.
read0 = vars(next(p5_handle.reads()).run_info)

# Run ID used to be in the filename, but not now with the batched pod5 files
# Run ID used to be in the filename, but to be sure get it here
res['RunID'] = read0['acquisition_id']
res['Software'] = read0['software']

Expand All @@ -118,8 +118,12 @@ def read_pod5(p5_filename):

# Stuff from 'tracking_id'
tracking_id = dict(read0['tracking_id'])

# This is useful, being the experiment start time not the read start time
res['StartTime'] = tracking_id['exp_start_time']
res['GuppyVersion'] = tracking_id['guppy_version']
# Note that 'guppy_version' will contain the Dorado version if Dorado is used
# We'll rely on read0['software'] from now on.
#res['GuppyVersion'] = tracking_id['guppy_version']

finally:
p5_handle.close()
Expand Down
11 changes: 9 additions & 2 deletions make_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,22 @@ def get_cell_metadata(ci):
('Files in pass',),
('Files in fail',),
('SequencingKit', 'Sequencing Kit'),
('GuppyVersion', 'Guppy Version'),
('BasecallConfig', 'Guppy Config'),
('Software', 'Software'),
('BasecallConfig', 'Basecaller Config'),
(None, 'Basecall Model'),
('SamplingFrequency', 'Sampling Freq'),
('ExperimentType', 'Input Type'),
('Slot',),
('CellID', 'Cell ID'),
('RunID', 'Run UUID') ]:
if len(x) == 1:
# Get that key
res[x[0]] = ci.get(x[0], 'unknown')
elif x[0]:
# Get that key but rename it
res[x[1]] = ci.get(x[0], 'unknown')
else:
# Add a placeholder but don't set it yet
res[x[1]] = 'unknown'

if '_final_summary' in ci:
Expand All @@ -46,6 +50,9 @@ def get_cell_metadata(ci):
res['End Time'] = strftime(fs['processing_stopped'])
res['Run Time'] = fs['run_time']

if '_fastq_metadata' in ci:
res['Basecall Model'] = ci['_fastq_metadata']['basecall_model']

return res

def format_counts_per_cells(cells, heading="Read summary"):
Expand Down
Loading

0 comments on commit 6f5dbe0

Please sign in to comment.