
Commit

Merge pull request #4 from uclahs-cds/nwiltsie-bootstrap
Bootstrap repository with minimal working pipeline
nwiltsie authored Jul 24, 2024
2 parents b582ed9 + 4a9e218 commit be6926f
Showing 30 changed files with 1,395 additions and 270 deletions.
2 changes: 1 addition & 1 deletion .github/CODEOWNERS
@@ -1,2 +1,2 @@
# Default owner(s)
* @tyamaguchi-ucla @yashpatel6 @zhuchcn @uclahs-cds/software-wg
* @uclahs-cds/nextflow-wg
33 changes: 33 additions & 0 deletions .github/workflows/docker-build-release.yaml
@@ -0,0 +1,33 @@
---
name: Update image in GHCR

run-name: >
${{
github.event_name == 'delete' && format(
'Delete `{0}{1}`',
github.event.ref_type == 'branch' && 'branch-' || '',
github.event.ref
)
|| github.ref == 'refs/heads/main' && 'Update `dev`'
|| format(
'Update `{0}{1}`',
!startsWith(github.ref, 'refs/tags') && 'branch-' || '',
github.ref_name
)
}} docker tag
on:
push:
branches-ignore: ['gh-pages']
tags: ['v*']
delete:

jobs:
push-or-delete-image:
runs-on: ubuntu-latest
name: Update GitHub Container Registry
permissions:
contents: read
packages: write
steps:
- uses: uclahs-cds/[email protected]
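For reference, the run-name expression above resolves to names like "Update `dev` docker tag" for a push to main, "Update `branch-foo` docker tag" for a push to any other branch, "Update `v1.0.0` docker tag" for a tag push, and "Delete `branch-foo` docker tag" when a branch is deleted (the branch and tag names here are hypothetical examples).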
2 changes: 1 addition & 1 deletion .github/workflows/static-analysis.yml
@@ -10,7 +10,7 @@ on:
- main

jobs:
CICD-base:
static-analysis:
runs-on: ubuntu-latest

steps:
32 changes: 32 additions & 0 deletions Dockerfile
@@ -0,0 +1,32 @@
ARG R_VERSION=4.3.1

FROM rocker/r-ver:${R_VERSION} AS build

COPY docker/install-stablelift.R /tmp
RUN Rscript /tmp/install-stablelift.R

FROM rocker/r-ver:${R_VERSION}

# Overwrite the site library with just the desired packages. By default rocker
# only bundles docopt and littler in that directory.
COPY --from=build /tmp/userlib /usr/local/lib/R/site-library

# Install python (required for argparse). The version is not important, but
# let's pin it for stability.
ARG PYTHON_VERSION=3.10.6-1~22.04

RUN apt-get update \
&& apt-get install -y --no-install-recommends \
python3=${PYTHON_VERSION} \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# Add a new user/group called bldocker
RUN groupadd -g 500001 bldocker && \
useradd -l -r -u 500001 -g bldocker bldocker

# Change the default user to bldocker from root
USER bldocker

LABEL maintainer="Nicholas Wiltsie <[email protected]>" \
org.opencontainers.image.source=https://github.com/uclahs-cds/pipeline-StableLift
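This image is pushed to GHCR by the workflow above and referenced through params.docker_image_stablelift in config/default.config below. A minimal sketch of attaching it to a process follows; the process name run_StableLift is a hypothetical placeholder, not one defined in this commit:

process {
    withName: 'run_StableLift' {
        // Hypothetical process; the container is the image built from this Dockerfile
        container = params.docker_image_stablelift
    }
}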
120 changes: 79 additions & 41 deletions README.md

Large diffs are not rendered by default.

Empty file added config/F16.config
Empty file.
Empty file added config/F2.config
Empty file.
Empty file added config/F32.config
Empty file.
23 changes: 0 additions & 23 deletions config/F72.config
@@ -1,23 +0,0 @@
// Static process resource allocation here
// Specific for each node type - F72 here
process {
withName: process_name {
cpus = <allocated cpus>
memory = <allocated memory>
// Other process-specific allocations here
}
withName: process_name_2 {
cpus = <allocated cpus>
memory = <allocated memory>
// Other process-specific allocations here
}
withName: process_name_3 {
cpus = <allocated cpus>
memory = <allocated memory>
// Other process-specific allocations here
}
withName: example_process {
cpus = 2
memory = 5.GB
}
}
49 changes: 49 additions & 0 deletions config/custom_schema_types.config
@@ -0,0 +1,49 @@
import nextflow.Nextflow

/**
* This custom schema namespace implements a custom type for Funcotator data sources.
*/
custom_schema_types {

/**
* Check that input refers to a properly configured Funcotator data source
* directory
*/
check_funcotator_data_source = { Map options, String name, Map properties ->
if (!(options[name] in Map)) {
throw new Exception("${name} should be a Map, not ${options[name].getClass()}.")
}

options[name].each { entry ->
def entry_as_map = [:]
entry_as_map[entry.key] = entry.value
schema.validate_parameter(entry_as_map, entry.key, properties.elements[entry.key])
}

/*
Confirm that the destination reference sequence ID is a valid subfolder
in at least _one_ of the data sources. A reference-specific data source
directory requires a .config file at a path like:
dataSourcesFolder/<sourcename>/hg19/<name>.config
dataSourcesFolder/<sourcename>/hg38/<name>.config
There can be multiple <sourcename> folders, but there should be only
one config per reference-specific subfolder.
*/
config_glob = [
options[name].data_source,
"*",
options[name].dest_reference_id,
"*.config"
].join("/")

if (!Nextflow.file(config_glob)) {
throw new Exception("${name} is improperly configured - no files found matching '${config_glob}'")
}
}

types = [
'FuncotatorDataSource': custom_schema_types.check_funcotator_data_source
]
}
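As a worked example, a params entry like the following would pass check_funcotator_data_source, provided at least one file on disk matches the glob <data_source>/*/hg38/*.config (the path below is a hypothetical placeholder):

params {
    funcotator_data = [
        data_source: '/hot/ref/funcotator_dataSources',  // hypothetical path
        src_reference_id: 'hg19',
        dest_reference_id: 'hg38'
    ]
}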
38 changes: 30 additions & 8 deletions config/default.config
@@ -1,4 +1,5 @@
import nextflow.util.SysHelper
import nextflow.Nextflow

// Default inputs/parameters of the pipeline
params {
@@ -7,29 +8,50 @@ params {
min_cpus = 1
min_memory = 1.MB

save_intermediate_files = false

dataset_id = ''
blcds_registered_dataset = false

ucla_cds = true
docker_container_registry = "ghcr.io/uclahs-cds"

// Docker images
// REPLACE WITH TOOLS USED
tool_a_version = 'x.x.x' // Docker version for tool a
docker_image_tool_a = "${-> params.docker_container_registry}/tool_a:${params.tool_a_version}"
bcftools_version = '1.20_score-1.20-20240505'
bedtools_version = '2.31.0'
gatk_version = '4.2.4.1'
pipeval_version = '5.0.0-rc.3'
samtools_version = '1.20'
stablelift_version = 'branch-nwiltsie-bootstrap' // FIXME

docker_image_bcftools = "${-> params.docker_container_registry}/bcftools-score:${params.bcftools_version}"
docker_image_bedtools = "${-> params.docker_container_registry}/bedtools:${params.bedtools_version}"
docker_image_gatk = "broadinstitute/gatk:${params.gatk_version}"
docker_image_pipeval = "${-> params.docker_container_registry}/pipeval:${params.pipeval_version}"
docker_image_samtools = "${-> params.docker_container_registry}/samtools:${params.samtools_version}"
docker_image_stablelift = "${-> params.docker_container_registry}/stablelift:${params.stablelift_version}"

// These are the index files associated with the source and destination
// reference sequences
src_fasta_fai = "${ -> params.src_fasta_ref}.fai"
dest_fasta_fai = "${ -> params.dest_fasta_ref}.fai"

src_fasta_dict = "${ -> Nextflow.file(params.src_fasta_ref).resolveSibling(Nextflow.file(params.src_fasta_ref).getBaseName() + '.dict') }"
dest_fasta_dict = "${ -> Nextflow.file(params.dest_fasta_ref).resolveSibling(Nextflow.file(params.dest_fasta_ref).getBaseName() + '.dict') }"
}
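The ${ -> ... } syntax creates lazily evaluated GStrings: the registry, version, and reference paths are resolved each time the value is read rather than when this file is parsed, so parameters like src_fasta_ref that are only defined in a later user config still resolve correctly. A minimal illustration (the override value is hypothetical):

// In a user config loaded after default.config:
params.docker_container_registry = 'docker.io/my-org'  // hypothetical registry
// Reading params.docker_image_samtools now yields 'docker.io/my-org/samtools:1.20'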

// Process specific scope
process {
// Process results are stored to local cache.
// If pipeline is launched with the 'resume' option, existing cache results will be used when available
// rather than re-executing processes
// If pipeline is launched with the 'resume' option, existing cache results
// will be used when available rather than re-executing processes
cache = true

// Forward process 'stdout' to shell terminal and, consequently, the log file
echo = true
executor = 'local'

// Other directives or options that should apply for every process

// total amount of resources avaible to the pipeline
// Total amount of resources available to the pipeline
cpus = params.max_cpus
memory = params.max_memory
}
51 changes: 13 additions & 38 deletions config/methods.config
@@ -2,45 +2,15 @@
includeConfig "${projectDir}/external/pipeline-Nextflow-config/config/methods/common_methods.config"
includeConfig "${projectDir}/external/pipeline-Nextflow-config/config/schema/schema.config"
includeConfig "${projectDir}/external/pipeline-Nextflow-config/config/bam/bam_parser.config"
includeConfig "${projectDir}/external/pipeline-Nextflow-config/config/store_object_as_json/store_object_as_json.config"

methods {
get_ids_from_bams = {
params.samples_to_process = [] as Set
params.input.BAM.each { k, v ->
v.each { bam_path ->
def bam_header = bam_parser.parse_bam_header(bam_path)
def sm_tags = bam_header['read_group'].collect{ it['SM'] }.unique()
if (sm_tags.size() != 1) {
throw new Exception("${bam_path} contains multiple samples! Please run pipeline with single sample BAMs.")
}
if (alreadyExists == params.samples_to_process.any { it.orig_id == sm_tags[0] }) {
throw new Exception("Sample ${sm_tags[0]} was found in multiple BAMs. Please provide only one BAM per sample")
}
new_sm_tag = methods.sanitize_string(sm_tags[0])
params.samples_to_process.add(['orig_id': sm_tags[0], 'id': new_sm_tag, 'path': bam_path, 'sample_type': k])
}
}
}
// Set the output and log output dirs here.
// Set the output and log output dirs here.
set_output_dir = {

tz = TimeZone.getTimeZone('UTC')
def date = new Date().format("yyyyMMdd'T'HHmmss'Z'", tz)

params.dataset_registry_prefix = '/hot/data'
def date = new Date().format("yyyyMMdd'T'HHmmss'Z'", TimeZone.getTimeZone('UTC'))

if (params.blcds_registered_dataset == true) {
if ("${params.dataset_id.length()}" != 11) {
throw new Exception("Dataset id must be eleven characters long")
}
def disease = "${params.dataset_id.substring(0,4)}"
// Need to fill in analyte, technology, raw_od_aligned, genome, pipeline-name
params.output_log_directory = "${params.dataset_registry_prefix}/$disease/${params.dataset_id}/${patient}/${sample}/analyte/technology,raw_or_aligned/genome/log/pipeline-name/$date"
params.disease = "${disease}"
} else {
params.output_dir_base = "${params.output_dir}/${manifest.name}-${manifest.version}/${params.sample_name.replace(' ', '_')}"
params.log_output_dir = "${params.output_dir_base}/log-${manifest.name}-${manifest.version}-${date}"
}
params.output_dir_base = "${params.output_dir}/${manifest.name}-${manifest.version}/${params.sample_id.replace(' ', '_')}"
params.log_output_dir = "${params.output_dir_base}/log-${manifest.name}-${manifest.version}-${date}"
}
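With these defaults, a run with sample_id 'NA12878' and output_dir '/hot/output' (both hypothetical) under a manifest named pipeline-StableLift at version 1.0.0 (also hypothetical) would write outputs to /hot/output/pipeline-StableLift-1.0.0/NA12878 and logs to log-pipeline-StableLift-1.0.0-<UTC timestamp> beneath it.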

set_pipeline_logs = {
@@ -68,15 +38,20 @@ methods {
}

setup = {
// add this file and uncomment if needed
// schema.load_custom_types("${projectDir}/config/custom_schema_types.config")
schema.load_custom_types("${projectDir}/config/custom_schema_types.config")
schema.validate()

methods.set_output_dir()
methods.set_resources_allocation()
methods.modify_base_allocations()
methods.set_pipeline_logs()
methods.set_env()
methods.get_ids_from_bams()
methods.setup_docker_cpus()
methods.setup_process_afterscript()

json_extractor.store_object_as_json(
params,
new File("${params.log_output_dir}/nextflow-log/params.json")
)
}
}
90 changes: 90 additions & 0 deletions config/schema.yaml
@@ -0,0 +1,90 @@
---
sample_id:
type: 'String'
required: true
help: 'sample id supplied from input yaml'
save_intermediate_files:
type: 'Bool'
required: true
default: false
help: 'Enable to store intermediate files'
output_dir:
type: 'Path'
mode: 'w'
required: true
help: 'absolute path to directory to store output'
src_fasta_ref:
type: 'Path'
mode: 'r'
required: true
help: 'Source reference sequence (FASTA)'
src_fasta_fai:
type: 'Path'
mode: 'r'
required: true
help: 'Source reference sequence index file'
src_fasta_dict:
type: 'Path'
mode: 'r'
required: true
help: 'Source reference sequence dictionary'
dest_fasta_ref:
type: 'Path'
mode: 'r'
required: true
help: 'Destination reference sequence (FASTA)'
dest_fasta_fai:
type: 'Path'
mode: 'r'
required: true
help: 'Destination reference sequence index file'
dest_fasta_dict:
type: 'Path'
mode: 'r'
required: true
help: 'Destination reference sequence dictionary'
chain_file:
type: 'Path'
mode: 'r'
required: true
help: 'FIXME ???'
repeat_bed:
type: 'Path'
mode: 'r'
required: true
help: 'FIXME ???'
funcotator_data:
type: 'FuncotatorDataSource'
required: true
help: 'Funcotator data source and reference sample IDs'
elements:
data_source:
type: 'Path'
mode: 'r'
required: true
help: 'Root data source folder for Funcotator'
src_reference_id:
type: 'String'
mode: 'r'
required: true
help: 'Reference build ID for the source sequence'
dest_reference_id:
type: 'String'
mode: 'r'
required: true
help: 'Reference build ID for the destination sequence'
rf_model:
type: 'Path'
mode: 'r'
required: true
help: 'FIXME ???'
input:
type: 'Namespace'
required: true
help: 'Input sample'
elements:
vcf:
type: 'Path'
mode: 'r'
required: true
help: 'Input dataset supplied by input yaml'
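Taken together, a user config satisfying this schema might look like the following sketch; every path and ID below is a hypothetical placeholder. The src_fasta_fai, dest_fasta_fai, and *_dict parameters are derived automatically in config/default.config, and save_intermediate_files defaults to false, so none of them need to be supplied explicitly.

params {
    sample_id = 'NA12878'                                // hypothetical sample
    output_dir = '/hot/output'                           // hypothetical output root
    src_fasta_ref = '/hot/ref/GRCh37/genome.fa'          // hypothetical FASTA
    dest_fasta_ref = '/hot/ref/GRCh38/genome.fa'         // hypothetical FASTA
    chain_file = '/hot/ref/hg19ToHg38.over.chain'        // hypothetical chain file
    repeat_bed = '/hot/ref/GRCh38_repeats.bed'           // hypothetical BED
    rf_model = '/hot/ref/stablelift-model.Rds'           // hypothetical model file
    funcotator_data = [
        data_source: '/hot/ref/funcotator_dataSources',  // hypothetical path
        src_reference_id: 'hg19',
        dest_reference_id: 'hg38'
    ]
    input = [
        vcf: '/hot/data/sample.vcf.gz'                   // hypothetical input VCF
    ]
}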