
Commit

Merge pull request #4 from uclahs-cds/nwiltsie-bootstrap
Bootstrap repository with minimal working pipeline
nwiltsie authored Jul 24, 2024
2 parents b582ed9 + 4a9e218 commit be6926f
Showing 30 changed files with 1,395 additions and 270 deletions.
2 changes: 1 addition & 1 deletion .github/CODEOWNERS
@@ -1,2 +1,2 @@
# Default owner(s)
* @tyamaguchi-ucla @yashpatel6 @zhuchcn @uclahs-cds/software-wg
* @uclahs-cds/nextflow-wg
33 changes: 33 additions & 0 deletions .github/workflows/docker-build-release.yaml
@@ -0,0 +1,33 @@
---
name: Update image in GHCR

run-name: >
${{
github.event_name == 'delete' && format(
'Delete `{0}{1}`',
github.event.ref_type == 'branch' && 'branch-' || '',
github.event.ref
)
|| github.ref == 'refs/heads/main' && 'Update `dev`'
|| format(
'Update `{0}{1}`',
!startsWith(github.ref, 'refs/tags') && 'branch-' || '',
github.ref_name
)
}} docker tag
on:
push:
branches-ignore: ['gh-pages']
tags: ['v*']
delete:

jobs:
push-or-delete-image:
runs-on: ubuntu-latest
name: Update GitHub Container Registry
permissions:
contents: read
packages: write
steps:
- uses: uclahs-cds/[email protected]
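For reference, the run-name expression above resolves to names like "Update `dev` docker tag" for a push to main, "Update `branch-foo` docker tag" for a push to any other branch, "Update `v1.0.0` docker tag" for a tag push, and "Delete `branch-foo` docker tag" when a branch is deleted (the branch and tag names here are hypothetical examples).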
2 changes: 1 addition & 1 deletion .github/workflows/static-analysis.yml
@@ -10,7 +10,7 @@ on:
- main

jobs:
CICD-base:
static-analysis:
runs-on: ubuntu-latest

steps:
32 changes: 32 additions & 0 deletions Dockerfile
@@ -0,0 +1,32 @@
ARG R_VERSION=4.3.1

FROM rocker/r-ver:${R_VERSION} AS build

COPY docker/install-stablelift.R /tmp
RUN Rscript /tmp/install-stablelift.R

FROM rocker/r-ver:${R_VERSION}

# Overwrite the site library with just the desired packages. By default rocker
# only bundles docopt and littler in that directory.
COPY --from=build /tmp/userlib /usr/local/lib/R/site-library

# Install python (required for argparse). The version is not important, but
# let's pin it for stability.
ARG PYTHON_VERSION=3.10.6-1~22.04

RUN apt-get update \
&& apt-get install -y --no-install-recommends \
python3=${PYTHON_VERSION} \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# Add a new user/group called bldocker
RUN groupadd -g 500001 bldocker && \
useradd -l -r -u 500001 -g bldocker bldocker

# Change the default user to bldocker from root
USER bldocker

LABEL maintainer="Nicholas Wiltsie <[email protected]>" \
org.opencontainers.image.source=https://github.com/uclahs-cds/pipeline-StableLift
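This image is pushed to GHCR by the workflow above and referenced through params.docker_image_stablelift in config/default.config below. A minimal sketch of attaching it to a process follows; the process name run_StableLift is a hypothetical placeholder, not one defined in this commit:

process {
    withName: 'run_StableLift' {
        // Hypothetical process; the container is the image built from this Dockerfile
        container = params.docker_image_stablelift
    }
}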
120 changes: 79 additions & 41 deletions README.md

Large diffs are not rendered by default.

Empty file added config/F16.config
Empty file.
Empty file added config/F2.config
Empty file.
Empty file added config/F32.config
Empty file.
23 changes: 0 additions & 23 deletions config/F72.config
@@ -1,23 +0,0 @@
// Static process resource allocation here
// Specific for each node type - F72 here
process {
withName: process_name {
cpus = <allocated cpus>
memory = <allocated memory>
// Other process-specific allocations here
}
withName: process_name_2 {
cpus = <allocated cpus>
memory = <allocated memory>
// Other process-specific allocations here
}
withName: process_name_3 {
cpus = <allocated cpus>
memory = <allocated memory>
// Other process-specific allocations here
}
withName: example_process {
cpus = 2
memory = 5.GB
}
}
49 changes: 49 additions & 0 deletions config/custom_schema_types.config
@@ -0,0 +1,49 @@
import nextflow.Nextflow

/**
* This custom schema namespace implements a custom type for Funcotator data sources.
*/
custom_schema_types {

/**
* Check that input refers to a properly configured Funcotator data source
* directory
*/
check_funcotator_data_source = { Map options, String name, Map properties ->
if (!(options[name] in Map)) {
throw new Exception("${name} should be a Map, not ${options[name].getClass()}.")
}

options[name].each { entry ->
def entry_as_map = [:]
entry_as_map[entry.key] = entry.value
schema.validate_parameter(entry_as_map, entry.key, properties.elements[entry.key])
}

/*
Confirm that the destination reference sequence ID is a valid subfolder
in at least _one_ of the data sources. A reference-specific data source
directory requires a .config file at a path like:
dataSourcesFolder/<sourcename>/hg19/<name>.config
dataSourcesFolder/<sourcename>/hg38/<name>.config
There can be multiple <sourcename> folders, but there should be only
one config per reference-specific subfolder.
*/
config_glob = [
options[name].data_source,
"*",
options[name].dest_reference_id,
"*.config"
].join("/")

if (!Nextflow.file(config_glob)) {
throw new Exception("${name} is improperly configured - no files found matching '${config_glob}'")
}
}

types = [
'FuncotatorDataSource': custom_schema_types.check_funcotator_data_source
]
}
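As a worked example, a params entry like the following would pass check_funcotator_data_source, provided at least one file on disk matches the glob <data_source>/*/hg38/*.config (the path below is a hypothetical placeholder):

params {
    funcotator_data = [
        data_source: '/hot/ref/funcotator_dataSources',  // hypothetical path
        src_reference_id: 'hg19',
        dest_reference_id: 'hg38'
    ]
}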
38 changes: 30 additions & 8 deletions config/default.config
@@ -1,4 +1,5 @@
import nextflow.util.SysHelper
import nextflow.Nextflow

// Default inputs/parameters of the pipeline
params {
@@ -7,29 +8,50 @@ params {
min_cpus = 1
min_memory = 1.MB

save_intermediate_files = false

dataset_id = ''
blcds_registered_dataset = false

ucla_cds = true
docker_container_registry = "ghcr.io/uclahs-cds"

// Docker images
// REPLACE WITH TOOLS USED
tool_a_version = 'x.x.x' // Docker version for tool a
docker_image_tool_a = "${-> params.docker_container_registry}/tool_a:${params.tool_a_version}"
bcftools_version = '1.20_score-1.20-20240505'
bedtools_version = '2.31.0'
gatk_version = '4.2.4.1'
pipeval_version = '5.0.0-rc.3'
samtools_version = '1.20'
stablelift_version = 'branch-nwiltsie-bootstrap' // FIXME

docker_image_bcftools = "${-> params.docker_container_registry}/bcftools-score:${params.bcftools_version}"
docker_image_bedtools = "${-> params.docker_container_registry}/bedtools:${params.bedtools_version}"
docker_image_gatk = "broadinstitute/gatk:${params.gatk_version}"
docker_image_pipeval = "${-> params.docker_container_registry}/pipeval:${params.pipeval_version}"
docker_image_samtools = "${-> params.docker_container_registry}/samtools:${params.samtools_version}"
docker_image_stablelift = "${-> params.docker_container_registry}/stablelift:${params.stablelift_version}"

// These are the index files associated with the source and destination
// reference sequences
src_fasta_fai = "${ -> params.src_fasta_ref}.fai"
dest_fasta_fai = "${ -> params.dest_fasta_ref}.fai"

src_fasta_dict = "${ -> Nextflow.file(params.src_fasta_ref).resolveSibling(Nextflow.file(params.src_fasta_ref).getBaseName() + '.dict') }"
dest_fasta_dict = "${ -> Nextflow.file(params.dest_fasta_ref).resolveSibling(Nextflow.file(params.dest_fasta_ref).getBaseName() + '.dict') }"
}
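The ${ -> ... } syntax creates lazily evaluated GStrings: the registry, version, and reference paths are resolved each time the value is read rather than when this file is parsed, so parameters like src_fasta_ref that are only defined in a later user config still resolve correctly. A minimal illustration (the override value is hypothetical):

// In a user config loaded after default.config:
params.docker_container_registry = 'docker.io/my-org'  // hypothetical registry
// Reading params.docker_image_samtools now yields 'docker.io/my-org/samtools:1.20'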

// Process specific scope
process {
// Process results are stored to local cache.
// If pipeline is launched with the 'resume' option, existing cache results will be used when available
// rather than re-executing processes
// If pipeline is launched with the 'resume' option, existing cache results
// will be used when available rather than re-executing processes
cache = true

// Forward process 'stdout' to shell terminal and, consequently, the log file
echo = true
executor = 'local'

// Other directives or options that should apply for every process

// total amount of resources avaible to the pipeline
// Total amount of resources available to the pipeline
cpus = params.max_cpus
memory = params.max_memory
}
51 changes: 13 additions & 38 deletions config/methods.config
@@ -2,45 +2,15 @@
includeConfig "${projectDir}/external/pipeline-Nextflow-config/config/methods/common_methods.config"
includeConfig "${projectDir}/external/pipeline-Nextflow-config/config/schema/schema.config"
includeConfig "${projectDir}/external/pipeline-Nextflow-config/config/bam/bam_parser.config"
includeConfig "${projectDir}/external/pipeline-Nextflow-config/config/store_object_as_json/store_object_as_json.config"

methods {
get_ids_from_bams = {
params.samples_to_process = [] as Set
params.input.BAM.each { k, v ->
v.each { bam_path ->
def bam_header = bam_parser.parse_bam_header(bam_path)
def sm_tags = bam_header['read_group'].collect{ it['SM'] }.unique()
if (sm_tags.size() != 1) {
throw new Exception("${bam_path} contains multiple samples! Please run pipeline with single sample BAMs.")
}
if (alreadyExists == params.samples_to_process.any { it.orig_id == sm_tags[0] }) {
throw new Exception("Sample ${sm_tags[0]} was found in multiple BAMs. Please provide only one BAM per sample")
}
new_sm_tag = methods.sanitize_string(sm_tags[0])
params.samples_to_process.add(['orig_id': sm_tags[0], 'id': new_sm_tag, 'path': bam_path, 'sample_type': k])
}
}
}
// Set the output and log output dirs here.
// Set the output and log output dirs here.
set_output_dir = {

tz = TimeZone.getTimeZone('UTC')
def date = new Date().format("yyyyMMdd'T'HHmmss'Z'", tz)

params.dataset_registry_prefix = '/hot/data'
def date = new Date().format("yyyyMMdd'T'HHmmss'Z'", TimeZone.getTimeZone('UTC'))

if (params.blcds_registered_dataset == true) {
if ("${params.dataset_id.length()}" != 11) {
throw new Exception("Dataset id must be eleven characters long")
}
def disease = "${params.dataset_id.substring(0,4)}"
// Need to fill in analyte, technology, raw_od_aligned, genome, pipeline-name
params.output_log_directory = "${params.dataset_registry_prefix}/$disease/${params.dataset_id}/${patient}/${sample}/analyte/technology,raw_or_aligned/genome/log/pipeline-name/$date"
params.disease = "${disease}"
} else {
params.output_dir_base = "${params.output_dir}/${manifest.name}-${manifest.version}/${params.sample_name.replace(' ', '_')}"
params.log_output_dir = "${params.output_dir_base}/log-${manifest.name}-${manifest.version}-${date}"
}
params.output_dir_base = "${params.output_dir}/${manifest.name}-${manifest.version}/${params.sample_id.replace(' ', '_')}"
params.log_output_dir = "${params.output_dir_base}/log-${manifest.name}-${manifest.version}-${date}"
}
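With these defaults, a run with sample_id 'NA12878' and output_dir '/hot/output' (both hypothetical) under a manifest named pipeline-StableLift at version 1.0.0 (also hypothetical) would write outputs to /hot/output/pipeline-StableLift-1.0.0/NA12878 and logs to log-pipeline-StableLift-1.0.0-<UTC timestamp> beneath it.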

set_pipeline_logs = {
@@ -68,15 +38,20 @@ methods {
}

setup = {
// add this file and uncomment if needed
// schema.load_custom_types("${projectDir}/config/custom_schema_types.config")
schema.load_custom_types("${projectDir}/config/custom_schema_types.config")
schema.validate()

methods.set_output_dir()
methods.set_resources_allocation()
methods.modify_base_allocations()
methods.set_pipeline_logs()
methods.set_env()
methods.get_ids_from_bams()
methods.setup_docker_cpus()
methods.setup_process_afterscript()

json_extractor.store_object_as_json(
params,
new File("${params.log_output_dir}/nextflow-log/params.json")
)
}
}
90 changes: 90 additions & 0 deletions config/schema.yaml
@@ -0,0 +1,90 @@
---
sample_id:
type: 'String'
required: true
help: 'sample id supplied from input yaml'
save_intermediate_files:
type: 'Bool'
required: true
default: false
help: 'Enable to store intermediate files'
output_dir:
type: 'Path'
mode: 'w'
required: true
help: 'absolute path to directory to store output'
src_fasta_ref:
type: 'Path'
mode: 'r'
required: true
help: 'Source reference sequence (FASTA)'
src_fasta_fai:
type: 'Path'
mode: 'r'
required: true
help: 'Source reference sequence index file'
src_fasta_dict:
type: 'Path'
mode: 'r'
required: true
help: 'Source reference sequence dictionary'
dest_fasta_ref:
type: 'Path'
mode: 'r'
required: true
help: 'Destination reference sequence (FASTA)'
dest_fasta_fai:
type: 'Path'
mode: 'r'
required: true
help: 'Destination reference sequence index file'
dest_fasta_dict:
type: 'Path'
mode: 'r'
required: true
help: 'Destination reference sequence dictionary'
chain_file:
type: 'Path'
mode: 'r'
required: true
help: 'FIXME ???'
repeat_bed:
type: 'Path'
mode: 'r'
required: true
help: 'FIXME ???'
funcotator_data:
type: 'FuncotatorDataSource'
required: true
help: 'Funcotator data source and reference sample IDs'
elements:
data_source:
type: 'Path'
mode: 'r'
required: true
help: 'Root data source folder for Funcotator'
src_reference_id:
type: 'String'
mode: 'r'
required: true
help: 'Reference build ID for the source sequence'
dest_reference_id:
type: 'String'
mode: 'r'
required: true
help: 'Reference build ID for the destination sequence'
rf_model:
type: 'Path'
mode: 'r'
required: true
help: 'FIXME ???'
input:
type: 'Namespace'
required: true
help: 'Input sample'
elements:
vcf:
type: 'Path'
mode: 'r'
required: true
help: 'Input dataset supplied by input yaml'
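Taken together, a user config satisfying this schema might look like the following sketch; every path and ID below is a hypothetical placeholder. The src_fasta_fai, dest_fasta_fai, and *_dict parameters are derived automatically in config/default.config, and save_intermediate_files defaults to false, so none of them need to be supplied explicitly.

params {
    sample_id = 'NA12878'                                // hypothetical sample
    output_dir = '/hot/output'                           // hypothetical output root
    src_fasta_ref = '/hot/ref/GRCh37/genome.fa'          // hypothetical FASTA
    dest_fasta_ref = '/hot/ref/GRCh38/genome.fa'         // hypothetical FASTA
    chain_file = '/hot/ref/hg19ToHg38.over.chain'        // hypothetical chain file
    repeat_bed = '/hot/ref/GRCh38_repeats.bed'           // hypothetical BED
    rf_model = '/hot/ref/stablelift-model.Rds'           // hypothetical model file
    funcotator_data = [
        data_source: '/hot/ref/funcotator_dataSources',  // hypothetical path
        src_reference_id: 'hg19',
        dest_reference_id: 'hg38'
    ]
    input = [
        vcf: '/hot/data/sample.vcf.gz'                   // hypothetical input VCF
    ]
}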