Skip to content

Commit

Permalink
Merge pull request #1 from vadimzalunin/master
Browse files Browse the repository at this point in the history
pipeline for 3k contigs taxonomy using mash
  • Loading branch information
ArghSee authored Nov 26, 2019
2 parents 57daefd + 8e2bc32 commit 451ecf4
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 0 deletions.
16 changes: 16 additions & 0 deletions src/taxonomy/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Image for the contig-taxonomy pipeline: Anaconda + Snakemake + the Google
# Cloud Storage client, the bloom_filter/bloom_check helper binaries and Mash.
# NOTE(review): debian:stretch is an end-of-life release; confirm its package
# mirrors are still reachable before rebuilding, or move to a supported release.
FROM debian:stretch

# update + install share one layer so a cached, stale package index is never
# reused. ca-certificates is listed explicitly because it is only a
# "recommended" dependency of wget and the downloads below use https.
RUN apt-get update \
    && apt-get install -qy --no-install-recommends \
        ca-certificates \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Install Anaconda in batch mode (-b) under /root/anaconda and delete the
# installer in the same layer so it does not stay in the image.
# NOTE(review): the download is not checksum-verified.
RUN wget -q https://repo.anaconda.com/archive/Anaconda3-2019.10-Linux-x86_64.sh \
    && bash Anaconda3-2019.10-Linux-x86_64.sh -b -p /root/anaconda \
    && rm -f Anaconda3-2019.10-Linux-x86_64.sh
ENV PATH="/root/anaconda/bin:${PATH}"
RUN eval $(conda shell.bash hook) && conda init

RUN conda install -y -c bioconda -c conda-forge snakemake

# -y makes the non-interactive install explicit, matching the snakemake
# install above, instead of depending on how conda answers its own prompt.
RUN conda install -y google-cloud-storage

COPY bloom_filter bloom_check /usr/bin/

# Fetch the Mash v2.2 release, keep only the binary, and remove the tarball and
# the unpacked directory in the same layer.
# NOTE(review): the download is not checksum-verified.
RUN wget -q https://github.com/marbl/Mash/releases/download/v2.2/mash-Linux64-v2.2.tar \
    && tar xf mash-Linux64-v2.2.tar \
    && mv mash-Linux64-v2.2/mash /usr/bin/ \
    && rm -rf mash-Linux64-v2.2 mash-Linux64-v2.2.tar
23 changes: 23 additions & 0 deletions src/taxonomy/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from snakemake.remote.GS import RemoteProvider as GSRemoteProvider
# Remote provider used for every remote lookup in this workflow.
GS = GSRemoteProvider()

# Supplies config['bucket'], config['dir'] and config['ref_sketch'].
configfile: 'config.yaml'

# Discover accessions by matching remote paths of the form
# <bucket>/<dir>/<acc>.realign.local.fa; only the first wildcard list is kept.
contig_pattern = '/'.join((config['bucket'], config['dir'], '{acc}.realign.local.fa'))
SAMPLES, *_ = GS.glob_wildcards(contig_pattern)

# Remote handle for the reference sketch; rule dist takes it as an input.
msh = GS.remote(config["ref_sketch"])

# Run `mash sketch -i` on one accession's contig FASTA.
# No output path is passed to mash, so the declared .msh output presumably
# relies on mash's default output naming -- confirm against the Mash docs.
rule sketch:
    input:
        fasta="3k/{acc}.realign.local.fa"
    output:
        msh="3k/{acc}.realign.local.fa.msh"
    shell:
        "mash sketch -i {input.fasta}"

# Compare one accession's sketch against the reference sketch with `mash dist`
# and write a filtered, sorted table to the .dist output.
#
# Post-processing of the mash output:
#   * sed prints only lines whose last tab-separated field has the form N/M and
#     rewrites that field as two tab-separated columns (N and M). The
#     backslashes are doubled because the command is a non-raw Python string.
#     NOTE(review): `[^0][0-9]+` needs at least two characters, so lines with a
#     single-digit N are dropped along with N = 0 -- confirm this is intended.
#   * sort orders the remaining lines numerically, descending, on column 5.
#
# mash is started with {threads} rather than a literal 6, so it follows the
# thread count Snakemake actually grants the job (at most the 6 declared here).
rule dist:
    input:
        sketch="3k/{acc}.realign.local.fa.msh",
        msh=msh
    output:
        dist="3k/{acc}.realign.local.fa.dist"
    threads: 6
    shell:
        '''
        mash dist -v 0.05 -p {threads} {input.msh} {input.sketch} | sed -rn 's/\\t([^0][0-9]+)\\/([0-9]+)$/\\t\\1\\t\\2/p' | sort -nrk5 > {output.dist}
        '''

# Aggregate target: requests the .dist table of every accession in SAMPLES.
rule check_all:
    input:
        expand(
            "3k/{acc}.realign.local.fa.dist",
            acc=SAMPLES,
        )
Binary file added src/taxonomy/bloom_check
Binary file not shown.
Binary file added src/taxonomy/bloom_filter
Binary file not shown.
3 changes: 3 additions & 0 deletions src/taxonomy/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Settings loaded by the Snakefile via `configfile: 'config.yaml'`.

# Bucket searched for input contig FASTA files (<bucket>/<dir>/<acc>.realign.local.fa).
bucket: viral2_test_bucket
# Directory inside the bucket that holds the contigs.
# NOTE(review): the Snakefile rules also hard-code the "3k/" prefix, so changing
# this value alone looks insufficient -- confirm before editing.
dir: 3k
# Path of the reference Mash sketch; the Snakefile passes it to GS.remote().
ref_sketch: viral2_test_bucket/ref_viruses_rep_genomes_v5.fasta.msh
4 changes: 4 additions & 0 deletions src/taxonomy/mash_tax.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
# Launch the taxonomy workflow (target rule: check_all) on Kubernetes.
#
# Usage: mash_tax.sh <bucket>
#   <bucket>  value handed to snakemake as --default-remote-prefix

# Set here rather than on the shebang so the options also apply when the
# script is started as `bash mash_tax.sh ...`.
set -eu

# Abort with a usage message when the bucket argument is missing or empty.
bucket=${1:?usage: mash_tax.sh <bucket>}

# "$bucket" is quoted so the value always reaches snakemake as a single argument.
snakemake \
    --rerun-incomplete \
    -p \
    -j 750 \
    --kubernetes \
    --container-image us.gcr.io/strides-sra-hackathon-data/test_pipeline:v0.4 \
    --default-remote-provider GS \
    --default-remote-prefix "$bucket" \
    --latency-wait 60 \
    --keep-going \
    --restart-times 3 \
    --nolock \
    check_all

0 comments on commit 451ecf4

Please sign in to comment.