Skip to content

Commit

Permalink
Working on adding the rRNA scan. We're getting there, but the
Browse files Browse the repository at this point in the history
cluster is now down.
  • Loading branch information
tbooth committed Nov 19, 2024
1 parent 61161be commit c039f85
Show file tree
Hide file tree
Showing 7 changed files with 236 additions and 17 deletions.
15 changes: 2 additions & 13 deletions Snakefile.blob
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ rule list_blob_plots:
taxlevel = BLOB_LEVELS,
extn = "cov0 read_cov.cov0".split(),
thumb = ['.__thumb', ''] ),
subs = "blob/{cell}.{part}.{bc_and_mas}+sub" + str(BLOB_SUBSAMPLE) + ".fasta",
subs = "subsampled_fasta/{cell}.{part}.{bc_and_mas}+sub" + str(BLOB_SUBSAMPLE) + ".fasta",
run:
# Un-silence sys.stderr in sub-jobs:
logger.quiet.discard('all')
Expand Down Expand Up @@ -67,17 +67,6 @@ rule get_blob_species:
shell:
"blobplot_stats_to_species.py {input.txt} > {output}"

# Convert to FASTA and subsample and munge the headers
rule bam_to_subsampled_fasta:
output: "blob/{cell}.{part}.{barcode}{_mas}+sub{n}.fasta"
input: "{cell}/{barcode}/{cell}.{part}.{barcode}{_mas}.bam"
threads: 8
resources:
mem_mb = 44000,
n_cpus = 8,
shell:
"{TOOLBOX} samtools fasta -@ {threads} {input} | {TOOLBOX} seqtk sample - {wildcards.n} | sed 's,/,_,g' > {output}"


# Makes a .complexity file for our FASTA file
# {foo} is blob/{cell}.subreads or blob/{cell}.scraps
Expand Down Expand Up @@ -137,7 +126,7 @@ checkpoint split_fasta_in_chunks:
output:
list = "blob/{foo}.fasta_parts_list",
parts = temp(directory("blob/{foo}.fasta_parts")),
input: "blob/{foo}.fasta"
input: "subsampled_fasta/{foo}.fasta"
params:
chunksize = BLOB_SUBSAMPLE // BLOB_CHUNKS
shell:
Expand Down
20 changes: 18 additions & 2 deletions Snakefile.process_cells
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ def i_one_barcode_info(wc):
cou = [],
xml = [],
blobs = [],
rrna = [],
taxon = [] )

# We need md5sums and contig stats for hifi_reads and fail_reads
Expand All @@ -192,6 +193,9 @@ def i_one_barcode_info(wc):
res['blobs'].append(f"blob/{cell}.hifi_reads.{bc_and_mas}.plots.yaml")
res['taxon'].append(f"blob/{cell}.hifi_reads.{bc_and_mas}.species.txt")

# Likewise for the rRNA scan
res['rrna'].append(f"rRNA_scan/{cell}.hifi_reads.{bc_and_mas}.results.yaml")

# And the actual Kinnex file too
res['kinnex'] = f"{cell}.hifi_reads.{barcode}.kinnex_scan.yaml"

Expand Down Expand Up @@ -236,7 +240,7 @@ rule one_barcode_info:
--taxon {input.taxon} \
--kinnex {input.kinnex} \
--binning <( binned_or_not.py <{input.bamhead} ) \
--plots {input.blobs} \
--plots {input.rrna} {input.blobs} \
> {output}
"""

Expand Down Expand Up @@ -374,6 +378,17 @@ rule bam_to_fastq:
{TOOLBOX} samtools fastq -@ $sam_threads {input} | pigz -c -n -p $pigz_threads > {output}
"""

# Convert to FASTA and subsample and munge the headers
rule bam_to_subsampled_fasta:
output: "subsampled_fasta/{cell}.{part}.{barcode}{_mas}+sub{n}.fasta"
input: "{cell}/{barcode}/{cell}.{part}.{barcode}{_mas}.bam"
threads: 4
resources:
mem_mb = 22000,
n_cpus = 4,
shell:
"{TOOLBOX} samtools fasta -@ {threads} {input} | {TOOLBOX} seqtk sample - {wildcards.n} | sed 's,/,_,g' > {output}"

# Make a .count file for the FASTQ file
rule count_fastq:
output: "{foo}.fastq.count"
Expand All @@ -386,5 +401,6 @@ rule md5sum_file:
input: "{foo}"
shell: '( cd "$(dirname {input:q})" && md5sum -- "$(basename {input:q})" ) > {output:q}'

## BLOB plotter rules ##
## BLOB plotter and rRNA scanner rules ##
include: "Snakefile.blob"
include: "Snakefile.rrnascan"
56 changes: 56 additions & 0 deletions Snakefile.rrnascan
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
## rRNA detection with SILVA rules ##
# vim: ft=python

from smrtino import load_yaml, dump_yaml

# Default is that there will be 10000 sequences subsampled.
# Best to keep this the same as blob_subsample so that the subsampling
# only needs to happen once.
# To override the things below set EXTRA_SNAKE_CONFIG
# eg. EXTRA_SNAKE_CONFIG="blob_subsample=1234"

RRNA_SUBSAMPLE = int(config.get('rrna_subsample', config.get('blob_subsample', 10000)))

# Compile the info from the flagstst summary into a YAML format suitable to be
# added into the reports.
rule count_alignments:
output: "rRNA_scan/{cell}.{part}.{bc_and_mas}.results.yaml"
input: "rRNA_scan/{cell}.{part}.{bc_and_mas}.silva_aligned.bam.stat"
run:
# Un-silence sys.stderr in sub-jobs:
logger.quiet.discard('all')

# Parse the stat file. We only care about these lines:
# 10000 + 0 primary
# 9856 + 0 primary mapped (98.56% : N/A) # or...
# 0 + 0 primary mapped (N/A : N/A) # if there are 0 reads total
total_reads = 0
mapped_txt = "0.00%"
mapped_perc = 0.0
with open(str(input)) as fh:
for aline in fh:
aline = aline.strip()
mo = re.fullmatch(r"(\d+) \+ 0 primary", aline)
if mo:
total_reads = mo.group(1)
mo = re.fullmatch(r"\d+ \+ 0 primary mapped \((N/A|([0-9.]+)%) : N/A\)", aline)
if mo:
mapped_txt = mo.group(1)
if mo.group(2):
mapped_perc = float(mo.group(2))

res = dict(title = f"Percentage of rRNA found in subsample ({total_reads} sequences)",
label = mapped_txt,
fraction = [mapped_perc, 100])

dump_yaml([res], filename=str(output))

# Align the reads to SILVA and then get the samtools flagstat summary.
rule align_to_silva:
output: "rRNA_scan/{cell}.{part}.{bc_and_mas}.silva_aligned.bam.stat"
input: "subsampled_fasta/{cell}.{part}.{bc_and_mas}+sub" + str(RRNA_SUBSAMPLE) + ".fasta"
shell:
r"""{TOOLBOX} SNAKEJOB_THREADS={threads} align_to_silva.sh {input} | \
samtools flagstat - > {output}
"""

135 changes: 135 additions & 0 deletions doc/kinnex_notes_from_pacbio_scripts.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
This is going to be a big old topic.

With Kinnex reads, we currently have this:

1) HiFi generation on Revio
2) Possible primary demultiplexing on Revio or SMRTLink
3) Desegmentation in SMRTLink
4) Secondary demultiplexing on SMRTLink

The good news is that in each case, SMRTLink knows the name of the samples,
so it should just be a matter of scanning for the SMRTLink outputs as well
as just the files transferred from the instrument. Most of the code should
continue to work, as the XML files describing the files are very similar.

As a start, I've made a small modification so that get_revio_data.py takes the
data_dir setting from the pbcell.yml file. This means we can auto-link data
from whichever location.

That's the easy bit, now we have some considerations:

1) How do we determine what is the correct thing to deliver? My best answer is
we look for runs as now, but then for each run we look for relevant analyses
in SMRTLink. The latest valid analysis is what we deliver.

We could go to SMRTLink directly, of course.

2) The get_revio_yml.py script relies on SMRTino having interpreted the XML, but
we're not running SMRTino on the SMRTLink stuff so we'll have to parse the XML directly.
Of course, I can re-use the SMRTino code but then I end up with two copies, one
which is applied by SMRTino and one which is used for grabbing processed results
from SMRTLink.

I'm sure I can make this neater. I think right now there is still untidiness with
ws_name_to_lib in get_revio_yml.py trying to work out the library name from the info
gleaned from SMRTino. It's not really clear how much of the job each part is meant to
do. Currently SMRTino resolves the project but this script resolves the library which
is clearly whacky.

Hmmm.

3) Heleen pointed out the Kinnex reads probably always want to be delivered in fastq.gz
format not BAM. Is this right? If so, confusingly SMRTLink leaves the unassigned in BAM
format so do we deliver this or not deliver it or convert it to FASTQ?

Well, the first BAM file contains all the desgmented reads, and then it is this desegmented
BAM that is run through Lima. The same Lima that is used for non-Kinnex CCS reads. Which
is why the results go into cromwell-executions/pb_demux_ccs right. I may need to differentiate
between Kinnex demultiplexing and non-Kinnex demultiplexing. I assume there will be no Kinnex
desegmenting without subsequent Lima demultiplexing?

Hmmm again.

The project of interest is 32620_Clark_Emily. Heleen found the outputs from SMRTLink under
/mnt/lustre/gseg/smrtlink/software/userdata/jobs_root/0000/0000001/0000001454/outputs/fastx_files

The file that tells me what barcode matches what sample is:
/mnt/lustre/gseg/smrtlink/software/userdata/jobs_root/0000/0000001/0000001454/outputs/m84140_240806_121846_s3.hifi_reads.consensusreadset.xml
Note that this uses "biosample", not "wellsample", which makes sense.

---

How am I currently getting the name of the sample?

SMRTino has compile_bc_info.py which reads the XML. This gets 'bs_name' and 'ws_name', and copies
them to the YAML. It has a special case for the unassigned reads, but it does not try and determine
which is the EdGe sample name (in the case of unbarcoded samples it could be bs_name or ws_name).
It does however look to extract the project number (in smrtino.ParseXML.get_readset_info).

We also have make_summary.py which calls smrtino.ParseXML.get_metadata_summary and this only gets
'ws_name' because at this point we're not worrying about 'bs_name'. At this point we set 'ws_project'
based on 'ws_name' which is fine but later we'll get 'bs_project' (per sample) and decide which is
the true project number and put it into 'project' in the YAML.

So the logic in SMRTino gets me a definitive project number, which is used to work out which files
belong to which project. But it does not yield a definitive sample name and it's up to get_revio_yml.py
(in scan_for_smrt_cells()) to work this out. But here it uses bs_name and ws_name which are directly
copied from the XML so that's OK.

So, it's complex but actually not as messy as I feared. I'll not plan to re-write this. I should
be able to add a scan_for_desegmentation() function to get_revio_yml.py which finds the info, reads the
XML directly, and writes pbcell files which are compatible with what I already have. But they will be
FASTQ only. I think we need:

readset_type: Revio (Kinnex)
parts: [segmented_reads]

Now we don't get the total bases, guessed taxon, and other _cstats, and counting these is expensive.
So what to do? I think we keep the barcode info in the pbcell file as it is, and keep adding
this to the summary tables. If we need more summary info then we'll need a proper Kinnex QC and this
will have to be a pipeline which will take time to run.

But if I keep these barcodes then I'll need to mark them to be skipped by get_revio_data.py, so let's
add 'skip_data: True' to each existing barcode. This will do for now.

So the next question is, given a cell_id, how do I locate any desgmentation post-processing in
SMRTLink. Do I have to call the API or is there a way I can easily probe the file system? Well
the files will be under /mnt/lustre/gseg/smrtlink/data/cromwell-executions/pb_demux_ccs/ and
we can save time by only looking at directories newer than the cell. Then we can see the cell
right in the filename. Yeah OK this is easy. We should search back from the newest directory so
if the demultiplexing is repeated we'll pick up the latest. Cool.

Let's write.

---

So on 14th Aug we had further discussion, and agreed that yes there are two rounds of barcoding
with Kinnex and so these should be reflected in the filenames, so for regular PacBio we have:

m84140_240719_161514_s4.hifi_reads.all.bam

But for Kinnex we should have:

m84140_240719_161514_s4.hifi_reads.all.segmented.bc101.bam

And it's quite possible that both the barcodes will be "all" or that both will be actual barcodes.
But we'll keep the file names in this format.

Also Heleen did more digging and discovered that we DO already get BAM files for the desegmented
and demultiplexed reads if we look properly. Also not all Kinnex reads are demultiplexed so it's no
good scanning just for "pb_demux_ccs" jobs. I also need to look for "pb_segment_reads" jobs, but
this is trickier since I can't just scan for the file name. Also as an aside I tried exporting the
reads as FASTQ within SMRTLink and it fails grrr:

https://edgen-smrtlink.epcc.ed.ac.uk:8243/sl/analysis/results/1465?show=status

So I think maybe I'll focus on the "pb_demux_ccs" part for now and come back to the
segmented-but-not-barcoded later. That part is harder to automate but easier to do manually.

Frances has put a demo delivery for this at:

/lustre-gseg/transfer/example_kinnex_deliveries/multiplexed_samples/ which is based upon
32620_Clark_Emily so we should use that for testing. We should also try:

m84140_240802_160841_s2 has 18 biosamples plus controls, project 32790SR
m84140_240802_140933_s1 has 34 biosamples plus controls, project 31760NS
2 changes: 2 additions & 0 deletions doc/screen_for_rRNA.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ Kinnex. So, to be useful we'd have to auto-run skera. Is this even possible?
And now we think about Kinnex. See kinnex.txt

Come back here once that is solved.

It is. And we have an rRNA scan. See Snakefile.rrnascan.
13 changes: 11 additions & 2 deletions make_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,15 +361,24 @@ def format_per_barcode(bc, aggr, title, md_items=None):
rep("", f"\n### {escape_md(plot_group['title'])}\n")

# plot_group['files'] will be a list of lists, so plot
# each list a s a row.
for plot_row in plot_group['files']:
# each list as a row.
for plot_row in plot_group.get('files',[]):
rep("<div class='flex'>")
rep(" ".join(
f"[plot](img/{p}){{.thumbnail}}"
for p in plot_row
))
rep("</div>")

# if there is a label and fraction we do some funky plotting
if 'fraction' in plot_group:
rep(("<div style='"
" background: linear-gradient(to right, salmon 0%,salmon {}%,#00000000 {}%,#00000000 100%);"
" width: 60%; height: 20pt; font-weight: bold; border-style: solid;"
" border-style: solid; border-width: 2px; border-color: bluegray"
">{l}</div>").format(
f=plot_group['fraction'], l=escape_md(plot_group.get('label','')) ) )

# Return val is redundant since the lines will be added to the report.
return rep

Expand Down
12 changes: 12 additions & 0 deletions toolbox/align_to_silva.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/sh

# This shell script needs to take a single argument (a FASTQ file) and
# align the reads within to the SILVA database, writing SAM to STDOUT

# It should respect the SNAKEJOB_THREADS setting.
/lustre-gseg/software/minimap2/minimap2-2.26_x64-linux/minimap2 \
-a -t ${SNAKEJOB_THREADS:-3} -x asm10 \
/lustre-gseg/references/silva/silva138/SILVA_138.1_LSU_SSU_NR99.fasta \
"$1"

# Note - I came up with the above database and threads combo through guesswork!

0 comments on commit c039f85

Please sign in to comment.