Working on adding the rRNA scan. We're getting there, but the

cluster is now down.
EdinburghGenomics · Nov 19, 2024 · c039f85 · c039f85
1 parent 61161be
commit c039f85
Show file tree

Hide file tree

Showing 7 changed files with 236 additions and 17 deletions.
diff --git a/Snakefile.blob b/Snakefile.blob
@@ -35,7 +35,7 @@ rule list_blob_plots:
                                  taxlevel   = BLOB_LEVELS,
                                  extn       = "cov0 read_cov.cov0".split(),
                                  thumb      = ['.__thumb', ''] ),
-        subs = "blob/{cell}.{part}.{bc_and_mas}+sub" + str(BLOB_SUBSAMPLE) + ".fasta",
+        subs = "subsampled_fasta/{cell}.{part}.{bc_and_mas}+sub" + str(BLOB_SUBSAMPLE) + ".fasta",
     run:
         # Un-silence sys.stderr in sub-jobs:
         logger.quiet.discard('all')
@@ -67,17 +67,6 @@ rule get_blob_species:
     shell:
         "blobplot_stats_to_species.py {input.txt} > {output}"
 
-# Convert to FASTA and subsample and munge the headers
-rule bam_to_subsampled_fasta:
-    output: "blob/{cell}.{part}.{barcode}{_mas}+sub{n}.fasta"
-    input:  "{cell}/{barcode}/{cell}.{part}.{barcode}{_mas}.bam"
-    threads: 8
-    resources:
-        mem_mb = 44000,
-        n_cpus = 8,
-    shell:
-        "{TOOLBOX} samtools fasta -@ {threads} {input} | {TOOLBOX} seqtk sample - {wildcards.n} | sed 's,/,_,g' > {output}"
-
 
 # Makes a .complexity file for our FASTA file
 # {foo} is blob/{cell}.subreads or blob/{cell}.scraps
@@ -137,7 +126,7 @@ checkpoint split_fasta_in_chunks:
     output:
         list = "blob/{foo}.fasta_parts_list",
         parts = temp(directory("blob/{foo}.fasta_parts")),
-    input: "blob/{foo}.fasta"
+    input: "subsampled_fasta/{foo}.fasta"
     params:
         chunksize = BLOB_SUBSAMPLE // BLOB_CHUNKS
     shell:

diff --git a/Snakefile.process_cells b/Snakefile.process_cells
@@ -168,6 +168,7 @@ def i_one_barcode_info(wc):
                 cou = [],
                 xml = [],
                 blobs = [],
+                rrna = [],
                 taxon = [] )
 
     # We need md5sums and contig stats for hifi_reads and fail_reads
@@ -192,6 +193,9 @@ def i_one_barcode_info(wc):
         res['blobs'].append(f"blob/{cell}.hifi_reads.{bc_and_mas}.plots.yaml")
         res['taxon'].append(f"blob/{cell}.hifi_reads.{bc_and_mas}.species.txt")
 
+    # Likewise for the rRNA scan
+    res['rrna'].append(f"rRNA_scan/{cell}.hifi_reads.{bc_and_mas}.results.yaml")
+
     # And the actual Kinnex file too
     res['kinnex'] = f"{cell}.hifi_reads.{barcode}.kinnex_scan.yaml"
 
@@ -236,7 +240,7 @@ rule one_barcode_info:
              --taxon {input.taxon} \
              --kinnex {input.kinnex} \
              --binning <( binned_or_not.py <{input.bamhead} ) \
-             --plots  {input.blobs} \
+             --plots {input.rrna} {input.blobs} \
              > {output}
         """
 
@@ -374,6 +378,17 @@ rule bam_to_fastq:
            {TOOLBOX} samtools fastq -@ $sam_threads {input} | pigz -c -n -p $pigz_threads > {output}
         """
 
+# Convert to FASTA and subsample and munge the headers
+rule bam_to_subsampled_fasta:
+    output: "subsampled_fasta/{cell}.{part}.{barcode}{_mas}+sub{n}.fasta"
+    input:  "{cell}/{barcode}/{cell}.{part}.{barcode}{_mas}.bam"
+    threads: 4
+    resources:
+        mem_mb = 22000,
+        n_cpus = 4,
+    shell:
+        "{TOOLBOX} samtools fasta -@ {threads} {input} | {TOOLBOX} seqtk sample - {wildcards.n} | sed 's,/,_,g' > {output}"
+
 # Make a .count file for the FASTQ file
 rule count_fastq:
     output: "{foo}.fastq.count"
@@ -386,5 +401,6 @@ rule md5sum_file:
     input: "{foo}"
     shell: '( cd "$(dirname {input:q})" && md5sum -- "$(basename {input:q})" ) > {output:q}'
 
-## BLOB plotter rules ##
+## BLOB plotter and rRNA scanner rules ##
 include: "Snakefile.blob"
+include: "Snakefile.rrnascan"
diff --git a/Snakefile.rrnascan b/Snakefile.rrnascan
@@ -0,0 +1,56 @@
+## rRNA detection with SILVA rules ##
+# vim: ft=python
+
+from smrtino import load_yaml, dump_yaml
+
+# Default is that there will be 10000 sequences subsampled.
+# Best to keep this the same as blob_subsample so that the subsampling
+# only needs to happen once.
+# To override the things below set EXTRA_SNAKE_CONFIG
+#  eg. EXTRA_SNAKE_CONFIG="blob_subsample=1234"
+
+RRNA_SUBSAMPLE = int(config.get('rrna_subsample', config.get('blob_subsample', 10000)))
+
+# Compile the info from the flagstst summary into a YAML format suitable to be
+# added into the reports.
+rule count_alignments:
+    output: "rRNA_scan/{cell}.{part}.{bc_and_mas}.results.yaml"
+    input:  "rRNA_scan/{cell}.{part}.{bc_and_mas}.silva_aligned.bam.stat"
+    run:
+        # Un-silence sys.stderr in sub-jobs:
+        logger.quiet.discard('all')
+
+        # Parse the stat file. We only care about these lines:
+        # 10000 + 0 primary
+        # 9856 + 0 primary mapped (98.56% : N/A) # or...
+        # 0 + 0 primary mapped (N/A : N/A)       # if there are 0 reads total
+        total_reads = 0
+        mapped_txt = "0.00%"
+        mapped_perc = 0.0
+        with open(str(input)) as fh:
+            for aline in fh:
+                aline = aline.strip()
+                mo = re.fullmatch(r"(\d+) \+ 0 primary", aline)
+                if mo:
+                    total_reads = mo.group(1)
+                mo = re.fullmatch(r"\d+ \+ 0 primary mapped \((N/A|([0-9.]+)%) : N/A\)", aline)
+                if mo:
+                    mapped_txt = mo.group(1)
+                    if mo.group(2):
+                        mapped_perc = float(mo.group(2))
+
+        res = dict(title = f"Percentage of rRNA found in subsample ({total_reads} sequences)",
+                   label = mapped_txt,
+                   fraction = [mapped_perc, 100])
+
+        dump_yaml([res], filename=str(output))
+
+# Align the reads to SILVA and then get the samtools flagstat summary.
+rule align_to_silva:
+    output: "rRNA_scan/{cell}.{part}.{bc_and_mas}.silva_aligned.bam.stat"
+    input:  "subsampled_fasta/{cell}.{part}.{bc_and_mas}+sub" + str(RRNA_SUBSAMPLE) + ".fasta"
+    shell:
+       r"""{TOOLBOX} SNAKEJOB_THREADS={threads} align_to_silva.sh {input} | \
+            samtools flagstat - > {output}
+        """
+
diff --git a/doc/kinnex_notes_from_pacbio_scripts.txt b/doc/kinnex_notes_from_pacbio_scripts.txt
@@ -0,0 +1,135 @@
+This is going to be a big old topic.
+
+With Kinnex reads, we currently have this:
+
+1) HiFi generation on Revio
+2) Possible primary demultiplexing on Revio or SMRTLink
+3) Desegmentation in SMRTLink
+4) Secondary demultiplexing on SMRTLink
+
+The good news is that in each case, SMRTLink knows the name of the samples,
+so it should just be a matter of scanning for the SMRTLink outputs as well
+as just the files transferred from the instrument. Most of the code should
+continue to work, as the XML files describing the files are very similar.
+
+As a start, I've made a small modification so that get_revio_data.py takes the
+data_dir setting from the pbcell.yml file. This means we can auto-link data
+from whichever location.
+
+That's the easy bit, now we have some considerations:
+
+1) How do we determine what is the correct thing to deliver? My best answer is
+we look for runs as now, but then for each run we look for relevant analyses
+in SMRTLink. The latest valid analysis is what we deliver.
+
+We could go to SMRTLink directly, of course.
+
+2) The get_revio_yml.py script relies on SMRTino having interpreted the XML, but
+we're not running SMRTino on the SMRTLink stuff so we'll have to parse the XML directly.
+Of course, I can re-use the SMRTino code but then I end up with two copies, one
+which is applied by SMRTino and one which is used for grabbing processed results
+from SMRTLink.
+
+I'm sure I can make this neater. I think right now there is still untidiness with
+ws_name_to_lib in get_revio_yml.py trying to work out the library name from the info
+gleaned from SMRTino. It's not really clear how much of the job each part is meant to
+do. Currently SMRTino resolves the project but this script resolves the library which
+is clearly whacky.
+
+Hmmm.
+
+3) Heleen pointed out the Kinnex reads probably always want to be delivered in fastq.gz
+format not BAM. Is this right? If so, confusingly SMRTLink leaves the unassigned in BAM
+format so do we deliver this or not deliver it or convert it to FASTQ?
+
+Well, the first BAM file contains all the desgmented reads, and then it is this desegmented
+BAM that is run through Lima. The same Lima that is used for non-Kinnex CCS reads. Which
+is why the results go into cromwell-executions/pb_demux_ccs right. I may need to differentiate
+between Kinnex demultiplexing and non-Kinnex demultiplexing. I assume there will be no Kinnex
+desegmenting without subsequent Lima demultiplexing?
+
+Hmmm again.
+
+The project of interest is 32620_Clark_Emily. Heleen found the outputs from SMRTLink under
+/mnt/lustre/gseg/smrtlink/software/userdata/jobs_root/0000/0000001/0000001454/outputs/fastx_files
+
+The file that tells me what barcode matches what sample is:
+/mnt/lustre/gseg/smrtlink/software/userdata/jobs_root/0000/0000001/0000001454/outputs/m84140_240806_121846_s3.hifi_reads.consensusreadset.xml
+Note that this uses "biosample", not "wellsample", which makes sense.
+
+---
+
+How am I currently getting the name of the sample?
+
+SMRTino has compile_bc_info.py which reads the XML. This gets 'bs_name' and 'ws_name', and copies
+them to the YAML. It has a special case for the unassigned reads, but it does not try and determine
+which is the EdGe sample name (in the case of unbarcoded samples it could be bs_name or ws_name).
+It does however look to extract the project number (in smrtino.ParseXML.get_readset_info).
+
+We also have make_summary.py which calls smrtino.ParseXML.get_metadata_summary and this only gets
+'ws_name' because at this point we're not worrying about 'bs_name'. At this point we set 'ws_project'
+based on 'ws_name' which is fine but later we'll get 'bs_project' (per sample) and decide which is
+the true project number and put it into 'project' in the YAML.
+
+So the logic in SMRTino gets me a definitive project number, which is used to work out which files
+belong to which project. But it does not yield a definitive sample name and it's up to get_revio_yml.py
+(in scan_for_smrt_cells()) to work this out. But here it uses bs_name and ws_name which are directly
+copied from the XML so that's OK.
+
+So, it's complex but actually not as messy as I feared. I'll not plan to re-write this. I should
+be able to add a scan_for_desegmentation() function to get_revio_yml.py which finds the info, reads the
+XML directly, and writes pbcell files which are compatible with what I already have. But they will be
+FASTQ only. I think we need:
+
+readset_type: Revio (Kinnex)
+parts: [segmented_reads]
+
+Now we don't get the total bases, guessed taxon, and other _cstats, and counting these is expensive.
+So what to do? I think we keep the barcode info in the pbcell file as it is, and keep adding
+this to the summary tables. If we need more summary info then we'll need a proper Kinnex QC and this
+will have to be a pipeline which will take time to run.
+
+But if I keep these barcodes then I'll need to mark them to be skipped by get_revio_data.py, so let's
+add 'skip_data: True' to each existing barcode. This will do for now.
+
+So the next question is, given a cell_id, how do I locate any desgmentation post-processing in
+SMRTLink. Do I have to call the API or is there a way I can easily probe the file system? Well
+the files will be under /mnt/lustre/gseg/smrtlink/data/cromwell-executions/pb_demux_ccs/ and
+we can save time by only looking at directories newer than the cell. Then we can see the cell
+right in the filename. Yeah OK this is easy. We should search back from the newest directory so
+if the demultiplexing is repeated we'll pick up the latest. Cool.
+
+Let's write.
+
+---
+
+So on 14th Aug we had further discussion, and agreed that yes there are two rounds of barcoding
+with Kinnex and so these should be reflected in the filenames, so for regular PacBio we have:
+
+m84140_240719_161514_s4.hifi_reads.all.bam
+
+But for Kinnex we should have:
+
+m84140_240719_161514_s4.hifi_reads.all.segmented.bc101.bam
+
+And it's quite possible that both the barcodes will be "all" or that both will be actual barcodes.
+But we'll keep the file names in this format.
+
+Also Heleen did more digging and discovered that we DO already get BAM files for the desegmented
+and demultiplexed reads if we look properly. Also not all Kinnex reads are demultiplexed so it's no
+good scanning just for "pb_demux_ccs" jobs. I also need to look for "pb_segment_reads" jobs, but
+this is trickier since I can't just scan for the file name. Also as an aside I tried exporting the
+reads as FASTQ within SMRTLink and it fails grrr:
+
+https://edgen-smrtlink.epcc.ed.ac.uk:8243/sl/analysis/results/1465?show=status
+
+So I think maybe I'll focus on the "pb_demux_ccs" part for now and come back to the
+segmented-but-not-barcoded later. That part is harder to automate but easier to do manually.
+
+Frances has put a demo delivery for this at:
+
+/lustre-gseg/transfer/example_kinnex_deliveries/multiplexed_samples/ which is based upon
+32620_Clark_Emily so we should use that for testing. We should also try:
+
+m84140_240802_160841_s2 has 18 biosamples plus controls, project 32790SR
+m84140_240802_140933_s1 has 34 biosamples plus controls, project 31760NS
diff --git a/doc/screen_for_rRNA.txt b/doc/screen_for_rRNA.txt
@@ -8,3 +8,5 @@ Kinnex. So, to be useful we'd have to auto-run skera. Is this even possible?
 And now we think about Kinnex. See kinnex.txt
 
 Come back here once that is solved.
+
+It is. And we have an rRNA scan. See Snakefile.rrnascan.
diff --git a/make_report.py b/make_report.py
@@ -361,15 +361,24 @@ def format_per_barcode(bc, aggr, title, md_items=None):
             rep("", f"\n### {escape_md(plot_group['title'])}\n")
 
             # plot_group['files'] will be a list of lists, so plot
-            # each list a s a row.
-            for plot_row in plot_group['files']:
+            # each list as a row.
+            for plot_row in plot_group.get('files',[]):
                 rep("<div class='flex'>")
                 rep(" ".join(
                         f"[plot](img/{p}){{.thumbnail}}"
                         for p in plot_row
                     ))
                 rep("</div>")
 
+            # if there is a label and fraction we do some funky plotting
+            if 'fraction' in plot_group:
+                rep(("<div style='"
+                     " background: linear-gradient(to right, salmon 0%,salmon {}%,#00000000 {}%,#00000000 100%);"
+                     " width: 60%; height: 20pt; font-weight: bold; border-style: solid;"
+                     " border-style: solid; border-width: 2px; border-color: bluegray"
+                     ">{l}</div>").format(
+                        f=plot_group['fraction'], l=escape_md(plot_group.get('label','')) ) )
+
     # Return val is redundant since the lines will be added to the report.
     return rep
 

diff --git a/toolbox/align_to_silva.sh b/toolbox/align_to_silva.sh
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+# This shell script needs to take a single argument (a FASTQ file) and
+# align the reads within to the SILVA database, writing SAM to STDOUT
+
+# It should respect the SNAKEJOB_THREADS setting.
+/lustre-gseg/software/minimap2/minimap2-2.26_x64-linux/minimap2 \
+    -a -t ${SNAKEJOB_THREADS:-3} -x asm10 \
+    /lustre-gseg/references/silva/silva138/SILVA_138.1_LSU_SSU_NR99.fasta \
+    "$1"
+
+# Note - I came up with the above database and threads combo through guesswork!
Original file line number	Diff line number	Diff line change
Expand Up		@@ -8,3 +8,5 @@ Kinnex. So, to be useful we'd have to auto-run skera. Is this even possible?
		And now we think about Kinnex. See kinnex.txt

		Come back here once that is solved.

		It is. And we have an rRNA scan. See Snakefile.rrnascan.