Skip to content

Commit

Permalink
Auto scaling in the pipeline without having to create a leader and rs…
Browse files Browse the repository at this point in the history
…ync files (#139)

* add Toil extras to container for autoscaling support

add Toil extras to container for autoscaling support

* added commands to launch cluster and copy files

* add leader incorporation into pipeline; leader is node on which pipeline is running

add autoscaling option
add autoscaling staging path option
inherit FROM Toil in Dockerfile
install rnaseq pipeline script in virtual env

* support for refactored RNA-Seq

support for refactored RNA-Seq

* add hera option and accommodate files on s3 etc.

add hera option and accommodate files on s3 etc.

* add max_sample_size to config

add max_sample_size to config

* comment out questionable image version manipulation code

comment out questionable image version manipulation code

* fixes to CWL and the wrapper script to support Dockstore and autoscaling

fixes to CWL and the wrapper script to support Dockstore and autoscaling

* fixes to support autoscaling; unique id for each sample

fixes to support autoscaling; unique id for each sample

* add support for auto scaling inputs

add support for auto scaling inputs

* fix misplaced doc string

fix misplaced doc string

* remove comments and clean up

remove comments and clean up

* remove blank line

remove blank line

* remove another blank line

remove another blank line

* insert blank line

insert blank line

* fix comments, replace command string with list

fix comments, replace command string with list

* mount root folder on host for Toil AWS provisioner

Mount the root folder to an anonymous directory on the host file system.
This is done to make it writable in case Dockstore (which calls cwl-runner)
is used to launch the container, because cwl-runner makes the container
file system read only. We need to do this because the Toil AWS provisioner
will try to create a key pair in /root/.ssh and create a file called
.sshSuccess in /root.
  • Loading branch information
Walter Shands authored and jvivian committed Mar 28, 2018
1 parent b97dbed commit a32acc3
Show file tree
Hide file tree
Showing 3 changed files with 373 additions and 105 deletions.
27 changes: 22 additions & 5 deletions docker/Dockerfile.template
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM ubuntu:14.04
FROM quay.io/ucsc_cgl/toil:3.14.0

# File Author / Maintainer
MAINTAINER John Vivian <[email protected]>
Expand All @@ -16,16 +16,33 @@ RUN apt-get update && apt-get install -y \
RUN curl https://get.docker.com/builds/Linux/x86_64/docker-DOCKERVER.tgz \
| tar -xvzf - --transform='s,[^/]*/,,g' -C /usr/local/bin/ \
&& chmod u+x /usr/local/bin/docker
# Install Toil
RUN pip install toil==3.3.5

# Set up a virtual environment with the system site package option so Toil
# can zip up this virtual environment and place it on the worker nodes
# Any Toil script that is pip installed must be installed in the virtual
# environment; this is how the pipeline is placed on the worker nodes
RUN pip install virtualenv
RUN virtualenv --system-site-packages /opt/rnaseq-pipeline/toil_venv

# Install toil-rnaseq
COPY toil-rnaseq.tar.gz .
RUN pip install toil-rnaseq.tar.gz && rm toil-rnaseq.tar.gz

RUN bash -c 'source /opt/rnaseq-pipeline/toil_venv/bin/activate && pip install toil-rnaseq.tar.gz && rm toil-rnaseq.tar.gz'

COPY wrapper.py /opt/rnaseq-pipeline/
COPY README.md /opt/rnaseq-pipeline/

# Mesos communicates on port 5050 so make sure this port is open
EXPOSE 5050

# Mount the root folder to an anonymous directory on the host file system.
# This is done to make it writable in case Dockstore (which calls cwl-runner)
# is used to launch the container, because cwl-runner makes the container
# file system read only. We need to do this because the Toil AWS provisioner
# will try to create a key pair in /root/.ssh and create a file called
# .sshSuccess in /root.
# Be sure to run the container with the --rm option so that the volume
# is removed when the container exits.
VOLUME /root

ENTRYPOINT ["python", "/opt/rnaseq-pipeline/wrapper.py"]
CMD ["--help"]
157 changes: 141 additions & 16 deletions docker/rnaseq-cgl-pipeline.cwl
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,127 @@ hints:
description: "The process requires at least 16G of RAM and we recommend 500GB of storage."
inputs:
auto-scale:
type: boolean?
default: false
doc: "If this flag is true, the pipeline will use auto-scaling and will be the leader."
inputBinding:
prefix: --auto-scale
cluster-name:
type: string?
doc: "Name of the Toil cluster. Usually the security group name"
inputBinding:
prefix: --cluster-name
output-location:
type: string?
doc: "Directory in cloud where output files will be put; e.g. s3://toil-rnaseq-cloud-output-bucket."
inputBinding:
prefix: --output-location
provisioner:
type: string?
doc: "Cloud provisioner to use. E.g. aws"
inputBinding:
prefix: --provisioner
job-store:
type: string?
doc: "Directory in cloud where working files will be put; e.g. aws:us-west-2:autoscaling-toil-rnaseq-jobstore"
inputBinding:
prefix: --job-store
max-nodes:
type: int?
doc: "Maximum worker nodes to launch in auto scaling. E.g. 2"
inputBinding:
prefix: --max-nodes
node-type:
type: string?
doc: "Cloud worker VM type; e.g. c3.8xlarge"
inputBinding:
prefix: --node-type
credentials-id:
type: string?
doc: "Credentials id"
inputBinding:
prefix: --credentials-id
credentials-secret-key:
type: string?
doc: "Credentials secret key"
inputBinding:
prefix: --credentials-secret-key
#This input format is needed for Dockstore when autoscaling so Dockstore does not download the input files
#Instead they will be downloaded by the Toil pipeline from a common location
sample-tar-paths:
doc: "Absolute path to sample tarball"
type:
- "null" #means that sample-tar-paths is optional, but if present must be an array of strings
- type: array
items: string
inputBinding:
prefix: --sample-tar
#This input format is needed for Dockstore when autoscaling so Dockstore does not download the input files
#Instead they will be downloaded by the Toil pipeline from a common location
sample-single-paths:
doc: "Absolute path(s) to unpaired FASTQ files. FASTQ files are comma delimited. Ex: sample1,sample2,sample3,sample4"
type:
- "null" #means that sample-single-paths is optional, but if present must be an array of strings
- type: array
items: string
inputBinding:
prefix: --sample-single
#This input format is needed for Dockstore when autoscaling so Dockstore does not download the input files
#Instead they will be downloaded by the Toil pipeline from a common location
sample-paired-paths:
doc: "Absolute path(s) to paired FASTQ files. FASTQ pairs are comma delimited and each pair is in the order R1,R2,R1,R2.... Ex: sample1,sample2,sample3,sample4"
type:
- "null" #means that sample-paired-paths is optional, but if present must be an array of strings
- type: array
items: string
inputBinding:
prefix: --sample-paired
output-basenames:
doc: "Array of Unique names to use for naming the output files"
type:
- type: array
items: string
inputBinding:
prefix: --output-basenames
star-path:
type: string?
doc: "Absolute path to STAR index tarball."
inputBinding:
prefix: --star
rsem-path:
type: string?
doc: "Absolute path to rsem reference tarball."
inputBinding:
prefix: --rsem
kallisto-path:
type: string?
doc: "Absolute path to kallisto index (.idx) file."
inputBinding:
prefix: --kallisto
hera-path:
type: string?
doc: "Absolute path to Hera index (.idx) file."
inputBinding:
prefix: --hera
sample-tar:
doc: "Absolute path to sample tarball"
type: File[]?
Expand All @@ -74,23 +195,29 @@ inputs:
itemSeparator: ","
star:
type: File
type: File?
doc: "Absolute path to STAR index tarball."
inputBinding:
prefix: --star
rsem:
type: File
type: File?
doc: "Absolute path to rsem reference tarball."
inputBinding:
prefix: --rsem
kallisto:
type: File
type: File?
doc: "Absolute path to kallisto index (.idx) file."
inputBinding:
prefix: --kallisto
hera:
type: File?
doc: "Absolute path to Hera index (.idx) file."
inputBinding:
prefix: --hera
disable-cutadapt:
type: boolean?
default: false
Expand Down Expand Up @@ -138,17 +265,23 @@ inputs:
inputBinding:
prefix: --resume
max-sample-size:
type: string?
doc: "Maximum size of sample file using Toil resource requirements syntax, e.g '20G'. Standard suffixes like K, Ki, M, Mi, G or Gi are supported."
inputBinding:
prefix: --max-sample-size
cores:
type: int?
doc: "Will set a cap on number of cores to use, default is all available cores."
inputBinding:
prefix: --cores
output-basename:
type: string?
doc: "Basename to use for naming the output files"
inputBinding:
prefix: --output-basename
# credentials-file:
# type: File?
# doc: "<path/file_name> with access credentials. E.g /root/.aws/credentials"
# inputBinding:
# prefix: --credentials-file
outputs:
output_files:
Expand All @@ -162,10 +295,6 @@ outputs:
wiggle_files:
type:
type: array
#is this needed if there are no wiggle file
#outputs, i.e. save-wiggle is false?
#similar to sample-tar above?
# items: ["null", File]
items: File
outputBinding:
glob: '*.wiggle.bg'
Expand All @@ -174,10 +303,6 @@ outputs:
bam_files:
type:
type: array
#is this needed if there are no BAM file
#outputs, i.e. save-bam is false?
#similar to sample-tar above?
# items: ["null", File]
items: File
outputBinding:
glob: '*.bam'
Expand Down
Loading

0 comments on commit a32acc3

Please sign in to comment.