Updating output directives and adding software information

marchoeppner · marchoeppner · commit 4f31bd83de66 · 2024-03-11T09:53:27.000+01:00
diff --git a/.groovylintrc.json b/.groovylintrc.json
@@ -18,7 +18,7 @@
         "design.ImplementationAsType": "off",
         "unnecessary.UnnecessaryPublicModifier": "off",
         "unnecessary.DuplicateStringLiteral": "off",
-        "formatting.LineLength": "off",
-	"convention.ImplicitClosureParameter": "off"
+	"basic.DeadCode": "off",
+	"formatting.LineLength": "off"
     }
 }
diff --git a/conf/modules.config b/conf/modules.config
@@ -0,0 +1,45 @@
+process {
+
+    publishDir = [
+        path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
+        mode: params.publish_dir_mode,
+        enabled: true,
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+    ]
+
+    withName: BWAMEM2_MEM {
+        publishDir = [
+            path: { "${params.outdir}/bwamem2" },
+            mode: params.publish_dir_mode,
+            enabled: false
+        ]
+    }
+    withName: CUSTOM_DUMPSOFTWAREVERSIONS {
+        publishDir = [
+            path: { "${params.outdir}/custom" },
+            mode: params.publish_dir_mode,
+            enabled: false
+        ]
+    }
+    withName: FASTP {
+        publishDir = [
+            path: { "${params.outdir}/FASTP" },
+            mode: params.publish_dir_mode,
+            enabled: false
+        ]
+    }
+    withName: PTRIMMER {
+        publishDir = [
+            path: { "${params.outdir}/ptrimmer" },
+            mode: params.publish_dir_mode,
+            enabled: false
+        ]
+    }
+    withName: BIOBLOOMTOOLS_CATEGORIZER {
+        publishDir = [
+            path: { "${params.outdir}/biobloom" },
+            mode: params.publish_dir_mode,
+            enabled: false
+        ]  
+    }
+}
diff --git a/conf/resources.config b/conf/resources.config
@@ -4,9 +4,9 @@ params {
        
        genomes {
           tomato {
-            fasta    = "${params.reference_base}/tomato/Solanum_lycopersicum.SL3.0.dna.toplevel.fa"
-            fai      = "${params.reference_base}/tomato/Solanum_lycopersicum.SL3.0.dna.toplevel.fa.fai"
-            dict     = "${params.reference_base}/tomato/sSolanum_lycopersicum.SL3.0.dna.toplevel.dict"
+            fasta    = "${params.reference_base}/gmo-check/tomato/Solanum_lycopersicum.SL3.0.dna.toplevel.fa"
+            fai      = "${params.reference_base}/gmo-check/tomato/Solanum_lycopersicum.SL3.0.dna.toplevel.fa.fai"
+            dict     = "${params.reference_base}/gmo-check/tomato/sSolanum_lycopersicum.SL3.0.dna.toplevel.dict"
             amplicon_txt = "${baseDir}/assets/genomes/tomato/amplicon.txt"
             bed      = "${baseDir}/assets/genomes/tomato/primers.bed"
             target_bed = "${baseDir}/assets/genomes/tomato/targets.bed"
diff --git a/docs/installation.md b/docs/installation.md
@@ -16,40 +16,14 @@ This pipeline requires locally stored genomes in fasta format. To build these, d
 nextflow run marchoeppner/gmo-check -profile standard,singularity --build_references --run_name build_refs --outdir /path/to/references
 ```
 
-If you do not have singularity on your system, you can also specify docker, podman or conda for software provisioning - see the [usage information](usage.md).
+where `/path/to/references` could be something like `/data/pipelines/references` or whatever is most appropriate on your system. 
 
-The path specified with `--outdir` can then be given to the pipeline during normal execution as `--reference_base`.
+If you do not have singularity on your system, you can also specify docker, podman or conda for software provisioning - see the [usage information](usage.md).
 
+The path specified with `--outdir` can then be given to the pipeline during normal execution as `--reference_base`. Please note that the build process will create a pipeline-specific subfolder (`gmo-check`) that must not be included in the `--outdir` argument (nor, later, in `--reference_base`). Gmo-check is part of a collection of pipelines that use a shared reference directory, and it will choose the appropriate subfolder by itself.
 
 ## Site-specific config file
 
-This pipeline requires a site-specific configuration file to be able to talk to your local cluster or compute infrastructure. Nextflow supports a wide
-range of such infrastructures, including Slurm, LSF and SGE - but also Kubernetes and AWS. For more information, see [here](https://www.nextflow.io/docs/latest/executor.html).
-
-Please see conf/lsh.config for an example of how to configure this pipeline for a Slurm queue.
-
-All software is provided through either Conda environments or Docker containers. Consider a Docker-compatible container engine if at all possible (Docker, Singularity, Podman). Conda environments are built on the fly during pipeline execution and only for a given pipeline run, which tends to slow things down quite a bit. Details on how to specify singularity as your container engine are provided in the config file for our lsh system (lsh.config).
-
-With this information in place, you will next have to create an new site-specific profile for your local environment in `nextflow.config` using the following format:
-
-```
-
-profiles {
-	
-	your_profile {
-		includeConfig 'conf/base.config'
-		includeConfig 'conf/your_cluster.config'
-		includeConfig 'conf/resources.config'
-	}
-}
-
-```
-
-This would add a new profile, called `your_profile` which uses (and expects) conda to provide all software. 
-
-`base.config` Basic settings about resource usage for the individual pipeline stages. 
-
-`resources.config` Gives information about the files that are to be used during analysis for the individual human genome assemblies. 
-
-`your_cluster.config` Specifies which sort of resource manager to use and where to find e.g. local resources cluster file system (see below).
+If you run on anything other than a local system, this pipeline requires a site-specific configuration file to be able to talk to your cluster or compute infrastructure. Nextflow supports a wide range of such infrastructures, including Slurm, LSF and SGE - but also Kubernetes and AWS. For more information, see [here](https://www.nextflow.io/docs/latest/executor.html).
 
+Site-specific config files for our pipeline ecosystem are stored centrally on [GitHub](https://github.com/marchoeppner/configs). Please talk to us if you want to add your system.
diff --git a/docs/pipeline.md b/docs/pipeline.md
@@ -1 +1,3 @@
 # Pipeline structure
+
+![](images/pipeline_dag.png)
diff --git a/docs/software.md b/docs/software.md
@@ -7,4 +7,22 @@ Version 0.24, doi: 10.1093/bioinformatics/bty560, [PubMed](https://pubmed.ncbi.n
 Version 1.19, doi: 10.1093/bioinformatics/btw354, [PubMed](https://pubmed.ncbi.nlm.nih.gov/27312411/) [github](https://github.com/MultiQC/MultiQC)
 
 **Samtools**
-Version 1.19, doi: 10.1093/bioinformatics/btp352, [PubMed](https://pubmed.ncbi.nlm.nih.gov/19505943/) [github](https://github.com/samtools/samtools)
+Version 1.19, doi: 10.1093/bioinformatics/btp352, [PubMed](https://pubmed.ncbi.nlm.nih.gov/19505943/) [github](https://github.com/samtools/samtools)
+
+**Vsearch**
+Version 2.27.0, doi: 10.7717/peerj.2584, [PubMed](https://pubmed.ncbi.nlm.nih.gov/27781170/) [github](https://github.com/torognes/vsearch)
+
+**Ptrimmer**
+Version 1.3.3, doi: 10.1186/s12859-019-2854-x, [PubMed](https://pubmed.ncbi.nlm.nih.gov/31077131/) [github](https://github.com/DMU-lilab/pTrimmer)
+
+**Bwa-mem2**
+Version 2.2.1, doi: 10.1109/IPDPS.2019.00041, [IEEE Explore](https://ieeexplore.ieee.org/document/8820962) [github](https://github.com/bwa-mem2/bwa-mem2)
+
+**Freebayes**
+Version 1.3.6, [ArXiv](http://arxiv.org/abs/1207.3907) [github](https://github.com/freebayes/freebayes)
+
+**Blast**
+Version 2.15, doi: 10.1016/S0022-2836(05)80360-2, [PubMed](https://pubmed.ncbi.nlm.nih.gov/2231712/) [NCBI](https://blast.ncbi.nlm.nih.gov/doc/blast-help/downloadblastdata.html)
+
+**Bedtools**
+Version 2.31.1, doi: 10.1093/bioinformatics/btq033, [PubMed](https://pubmed.ncbi.nlm.nih.gov/20110278/) [github](https://github.com/arq5x/bedtools2)
diff --git a/images/pipeline_dag.png b/images/pipeline_dag.png
diff --git a/lib/WorkflowPipeline.groovy b/lib/WorkflowPipeline.groovy
@@ -16,6 +16,9 @@ class WorkflowPipeline {
             log.info 'Cannot run the alignment workflow without genome references (--reference_base). Please check the documentation!'
             System.exit(1)
         }
+        if ( !params.input && !params.build_references) {
+            log.info "This pipeline requires a sample sheet as input (--input)"
+            System.exit(1)
+        }
     }
-
 }
diff --git a/main.nf b/main.nf
@@ -2,16 +2,15 @@
 
 nextflow.enable.dsl = 2
 
-// TODO: Update this block with a description and the name of the pipeline
 /**
 ===============================
-Pipeline
+GMO-check Pipeline
 ===============================
 
-This Pipeline performs ....
+This Pipeline performs detection of genetic events in food and seed material(s) (GMO analysis).
 
 ### Homepage / git
-git@github.com:marchoeppner/pipeline.git
+git@github.com:marchoeppner/gmo-check.git
 
 **/
 
@@ -24,7 +23,6 @@ run_name = (params.run_name == false) ? "${workflow.sessionId}" : "${params.run_
 
 WorkflowMain.initialise(workflow, params, log)
 
-// TODO: Rename this and the file under lib/ to something matching this pipeline (e.g. WorkflowAmplicons)
 WorkflowPipeline.initialise(params, log)
 
 include { GMO }                 from './workflows/gmo'
@@ -48,9 +46,9 @@ workflow.onComplete {
     log.info "Duration: $workflow.duration"
     log.info hline
 
-    summary["BlastDB"]                  = params.blastdb
-    summary["Freebayes_min_alt_frac"]   = params.freebayes_min_alternate_frac
-    summary["Freebayes_min_alt_count"]  = params.freebayes_min_alternate_count
+    summary['BlastDB']                  = params.blastdb
+    summary['Freebayes_min_alt_frac']   = params.freebayes_min_alternate_frac
+    summary['Freebayes_min_alt_count']  = params.freebayes_min_alternate_count
 
     emailFields = [:]
     emailFields['version'] = workflow.manifest.version
diff --git a/modules/bedtools/coverage/main.nf b/modules/bedtools/coverage/main.nf
@@ -1,5 +1,4 @@
 process BEDTOOLS_COVERAGE {
-    publishDir "${params.outdir}/${meta.sample_id}/BEDTOOLS", mode: 'copy'
 
     label 'short_parallel'
 
@@ -11,16 +10,16 @@ process BEDTOOLS_COVERAGE {
         'quay.io/biocontainers/bedtools:2.31.1--hf5e1c6e_0' }"
 
     input:
-    tuple val(meta),path(bam),path(bai)
+    tuple val(meta), path(bam), path(bai)
     path(bed)
 
     output:
     tuple val(meta), path(coverage), emit: report
     path('versions.yml'), emit: versions
 
     script:
-    coverage = meta.sample_id + ".bedcov.txt"
-    
+    coverage = meta.sample_id + '.bedcov.txt'
+
     """
     coverageBed -a $bed -b $bam > $coverage
 
diff --git a/modules/biobloomtools/categorizer/main.nf b/modules/biobloomtools/categorizer/main.nf
@@ -1,5 +1,4 @@
 process BIOBLOOMTOOLS_CATEGORIZER {
-    publishDir "${params.outdir}/Processing/Bloomfilter", mode: 'copy'
 
     label 'short_parallel'
 
diff --git a/modules/blast/blastn/main.nf b/modules/blast/blastn/main.nf
@@ -1,5 +1,4 @@
 process BLAST_BLASTN {
-    publishDir "${params.outdir}/Processing/BlastN", mode: 'copy'
 
     label 'short_parallel'
 
diff --git a/modules/blast/makeblastdb/main.nf b/modules/blast/makeblastdb/main.nf
@@ -1,8 +1,6 @@
 process BLAST_MAKEBLASTDB {
     tag "$fasta"
 
-    publishDir "${params.outdir}/Processing/BlastDB", mode: 'copy'
-
     label 'short_parallel'
 
     conda 'bioconda::blast=2.15'
diff --git a/modules/bwamem2/index/main.nf b/modules/bwamem2/index/main.nf
@@ -6,7 +6,7 @@ process BWAMEM2_INDEX {
     conda 'bioconda::samtools=1.19.2 bioconda::bwa-mem2=2.2.1'
     container 'quay.io/biocontainers/mulled-v2-e5d375990341c5aef3c9aff74f96f66f65375ef6:2cdf6bf1e92acbeb9b2834b1c58754167173a410-0'
 
-    publishDir "${params.outdir}/${meta.id}", mode: 'copy'
+    publishDir "${params.outdir}/gmo-check/${meta.id}", mode: 'copy'
 
     input:
     tuple val(meta), path(fasta)
diff --git a/modules/freebayes/main.nf b/modules/freebayes/main.nf
@@ -1,8 +1,6 @@
 process FREEBAYES {
     tag "${meta.sample_id}"
 
-    publishDir "${params.outdir}/Processing/Freebayes", mode: 'copy'
-
     label 'medium_serial'
 
     conda 'bioconda::freebayes=1.3.6'
diff --git a/modules/gunzip/main.nf b/modules/gunzip/main.nf
@@ -3,7 +3,7 @@ process GUNZIP {
 
     label 'medium_serial'
 
-    publishDir "${params.outdir}/${meta.id}", mode: 'copy'
+    publishDir "${params.outdir}/gmo-check/${meta.id}", mode: 'copy'
 
     conda 'sed=4.7'
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
diff --git a/modules/helper/rules_to_bed.nf b/modules/helper/rules_to_bed.nf
@@ -1,16 +1,14 @@
 process RULES_TO_BED {
-
     input:
     path(json)
 
     output:
     path(bed), emit: bed
 
     script:
-    bed = "rules.txt"
+    bed = 'rules.txt'
 
     """
     rules_to_bed.rb --json $json > $bed
     """
-
-}
+}
diff --git a/modules/helper/vcf_to_report.nf b/modules/helper/vcf_to_report.nf
@@ -4,7 +4,7 @@ process VCF_TO_REPORT {
     publishDir "${params.outdir}/Reports/JSON", mode: 'copy'
 
     input:
-    tuple val(meta),path(vcf),path(coverage)
+    tuple val(meta), path(vcf), path(coverage)
     path(rules)
 
     output:
diff --git a/modules/samtools/ampliconclip/main.nf b/modules/samtools/ampliconclip/main.nf
@@ -6,8 +6,6 @@ process SAMTOOLS_AMPLICONCLIP {
 
     tag "${meta.sample_id}"
 
-    publishDir "${params.outdir}/${meta.sample_id}/BWA2", mode: 'copy'
-
     input:
     tuple val(meta), path(bam), path(bai)
     path(bed)
diff --git a/modules/samtools/dict/main.nf b/modules/samtools/dict/main.nf
@@ -6,7 +6,7 @@ process SAMTOOLS_DICT {
         'https://depot.galaxyproject.org/singularity/samtools:1.19.2--h50ea8bc_0' :
         'quay.io/biocontainers/samtools:1.19.2--h50ea8bc_0' }"
 
-    publishDir "${params.outdir}/${meta.id}", mode: 'copy'
+    publishDir "${params.outdir}/gmo-check/${meta.id}", mode: 'copy'
 
     input:
     tuple val(meta), path(fasta)
diff --git a/modules/samtools/faidx/main.nf b/modules/samtools/faidx/main.nf
@@ -8,7 +8,7 @@ process SAMTOOLS_FAIDX {
         'https://depot.galaxyproject.org/singularity/samtools:1.19.2--h50ea8bc_0' :
         'quay.io/biocontainers/samtools:1.19.2--h50ea8bc_0' }"
 
-    publishDir "${params.outdir}/${meta.id}", mode: 'copy'
+    publishDir "${params.outdir}/gmo-check/${meta.id}", mode: 'copy'
 
     input:
     tuple val(meta), path(fasta)
diff --git a/modules/samtools/index/main.nf b/modules/samtools/index/main.nf
@@ -1,5 +1,4 @@
 process SAMTOOLS_INDEX {
-    publishDir "${params.outdir}/${meta.sample_id}/BWA2", mode: 'copy'
 
     conda 'bioconda::samtools=1.19.2'
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
diff --git a/modules/samtools/markdup/main.nf b/modules/samtools/markdup/main.nf
@@ -8,8 +8,6 @@ process SAMTOOLS_MARKDUP {
 
     tag "${meta.sample_id}"
 
-    publishDir "${params.outdir}/${meta.sample_id}/", mode: 'copy'
-
     input:
     tuple val(meta), path(merged_bam), path(merged_bam_index)
 
diff --git a/modules/vsearch/fastqfilter/main.nf b/modules/vsearch/fastqfilter/main.nf
@@ -1,8 +1,6 @@
 process VSEARCH_FASTQFILTER {
     tag "${meta.sample_id}"
 
-    publishDir "${params.outdir}/${meta.sample_id}/VSEARCH", mode: 'copy'
-
     label 'short_serial'
 
     conda 'bioconda::vsearch=2.27.0'
diff --git a/modules/vsearch/fastqmerge/main.nf b/modules/vsearch/fastqmerge/main.nf
@@ -1,8 +1,6 @@
 process VSEARCH_FASTQMERGE {
     tag "${meta.sample_id}"
 
-    publishDir "${params.outdir}/${meta.sample_id}/VSEARCH", mode: 'copy'
-
     label 'short_serial'
 
     conda 'bioconda::vsearch=2.27.0'
diff --git a/modules/vsearch/fastxuniques/main.nf b/modules/vsearch/fastxuniques/main.nf
@@ -1,8 +1,6 @@
 process VSEARCH_FASTXUNIQUES {
     tag "${meta.sample_id}"
 
-    publishDir "${params.outdir}/${meta.sample_id}/VSEARCH", mode: 'copy'
-
     label 'short_serial'
 
     conda 'bioconda::vsearch=2.27.0'
diff --git a/nextflow.config b/nextflow.config
diff --git a/subworkflows/bwamem2/main.nf b/subworkflows/bwamem2/main.nf
diff --git a/workflows/build_references.nf b/workflows/build_references.nf

Original file line number	Diff line number	Diff line change
`@@ -18,7 +18,7 @@`
`18`	`18`	`"design.ImplementationAsType": "off",`
`19`	`19`	`"unnecessary.UnnecessaryPublicModifier": "off",`
`20`	`20`	`"unnecessary.DuplicateStringLiteral": "off",`
`21`		`- "formatting.LineLength": "off",`
`22`		`- "convention.ImplicitClosureParameter": "off"`
	`21`	`+ "basic.DeadCode": "off",`
	`22`	`+ "formatting.LineLength": "off"`
`23`	`23`	`}`
`24`	`24`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1 +1,3 @@`
`1`	`1`	`# Pipeline structure`
	`2`	`+`
	`3`	`+![](images/pipeline_dag.png)`
Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,9 @@ class WorkflowPipeline {`
`16`	`16`	`log.info 'Cannot run the alignment workflow without genome references (--reference_base). Please check the documentation!'`
`17`	`17`	`System.exit(1)`
`18`	`18`	`}`
	`19`	`+ if ( !params.input && !params.build_references) {`
	`20`	`+ log.info "This pipeline requires a sample sheet as input (--input)"`
	`21`	`+ System.exit(1)`
	`22`	`+ }`
`19`	`23`	`}`
`20`		`-`
`21`	`24`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,4 @@`
`1`	`1`	`process BIOBLOOMTOOLS_CATEGORIZER {`
`2`		`- publishDir "${params.outdir}/Processing/Bloomfilter", mode: 'copy'`
`3`	`2`
`4`	`3`	`label 'short_parallel'`
`5`	`4`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,4 @@`
`1`	`1`	`process BLAST_BLASTN {`
`2`		`- publishDir "${params.outdir}/Processing/BlastN", mode: 'copy'`
`3`	`2`
`4`	`3`	`label 'short_parallel'`
`5`	`4`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,4 @@`
`1`	`1`	`process SAMTOOLS_INDEX {`
`2`		`- publishDir "${params.outdir}/${meta.sample_id}/BWA2", mode: 'copy'`
`3`	`2`
`4`	`3`	`conda 'bioconda::samtools=1.19.2'`
`5`	`4`	`container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?`