
Commit 673fa2b

Adding initial documentation

1 parent c97275b commit 673fa2b

File tree: 6 files changed, +111 / -11 lines changed


docs/installation.md (+12)

@@ -1,5 +1,17 @@
 # Installation
 
+## Installing the references
+
+This pipeline requires locally stored genomes in FASTA format. To build these, run:
+
+```
+nextflow run marchoeppner/gmo-check -profile standard,singularity --run_name build_refs --outdir /path/to/references
+```
+
+If you do not have singularity on your system, you can also specify docker, podman or conda for software provisioning - see the [usage information](usage.md).
+
+The path specified with `--outdir` can then be given to the pipeline during normal execution as `--reference_base`.
+
 ## Site-specific config file
 
 This pipeline requires a site-specific configuration file to be able to talk to your local cluster or compute infrastructure. Nextflow supports a wide
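The commit only references such a site-specific file without showing one. As an illustrative sketch (the executor, queue and path values here are hypothetical, not part of this commit), a minimal site config could look like:

```
// Hypothetical site-specific Nextflow config
params {
    // where the pipeline references were installed
    reference_base = "/path/to/references"
}

process {
    // submit jobs to the local scheduler instead of running them in-process
    executor = "slurm"
    queue    = "normal"
}
```

Such a file would typically be wired up as a named profile (like the `lsh` profile used in the usage examples) so it can be selected with `-profile`.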

docs/usage.md (+66 -1)

@@ -1,4 +1,69 @@
 # Usage information
 
-## Basic execution
+# Running the pipeline
 
+A basic execution of the pipeline looks as follows:
+
+a) Without a site-specific config file
+
+```
+nextflow run marchoeppner/gmo-check -profile standard,singularity --input samples.csv --genome tomato --reference_base /path/to/references --run_name pipeline-test
+```
+
+where `--reference_base` corresponds to the location in which you have [installed](installation.md) the pipeline references.
+
+In this example, the pipeline will assume it runs on a single computer with the singularity container engine available. Other options to provision software are:
+
+`-profile standard,docker`
+
+`-profile standard,podman`
+
+`-profile standard,conda`
+
+b) With a site-specific config file
+
+```
+nextflow run marchoeppner/gmo-check -profile lsh --input samples.csv --genome tomato --run_name pipeline-test
+```
+
+# Options
+
+## `--input samplesheet.csv` [default = null]
+
+This pipeline expects a CSV-formatted sample sheet to properly pull various metadata through the processes. The required format looks as follows:
+
+```
+sample_id,library_id,readgroup_id,R1,R2
+S100,S100,AACYTCLM5.1.S100,/home/marc/projects/gaba/data/S100_R1.fastq.gz,/home/marc/projects/gaba/data/S100_R2.fastq.gz
+```
+
+If you are unsure about the read group ID, just make sure that it is unique for the combination of library, flowcell and lane. Typically it is constructed from these components - and the easiest way to get it is from the FastQ file itself (the header of read 1, for example).
+
+## `--genome tomato` [default = tomato]
+
+The name of the pre-configured genome to analyze against. This parameter controls not only the mapping reference (if you use a mapping-based analysis), but also which internally pre-configured configuration files are used. Currently, only one genome can be analyzed per pipeline run.
+
+Available options:
+
+- tomato
+
+## `--run_name Fubar` [default = null]
+
+A mandatory name for this run, to be included with the result files.
+
+## `--email [email protected]` [default = null]
+
+An email address to which the MultiQC report is sent after pipeline completion. This requires the executing system to have `sendmail` configured.
+
+## `--tools vsearch` [default = vsearch]
+
+This pipeline supports two completely independent tool chains:
+
+- `vsearch` uses a simple "metagenomics-like" amplicon processing workflow to produce dereplicated sequences from the short reads, which are then searched for pre-defined patterns against a built-in BLAST database.
+
+- `bwa2` uses a classical variant calling approach, with parameters similar to what one would find in cancer analysis, to detect low-frequency SNPs in mixed samples.
+
+You can specify either one, or both: `--tools 'vsearch,bwa2'`
+
+## `--reference_base` [default = null]
+
+The location where the pipeline references are installed on your system. This will typically be pre-set in your site-specific config file and is only needed when you run without one.
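The read group construction described under `--input` above can be sketched in shell. The header below is made up for illustration, following the common Illumina read name scheme (`instrument:run:flowcell:lane:tile:x:y`); only the resulting ID `AACYTCLM5.1.S100` appears in the commit's sample sheet example:

```shell
# First header line of R1 for a hypothetical sample S100
# (in practice: zcat S100_R1.fastq.gz | head -n 1)
header="@A01234:12:AACYTCLM5:1:1101:1000:2000 1:N:0:ACGTACGT"

# Fields 3 and 4 of the colon-separated read name are flowcell and lane
flowcell=$(echo "$header" | cut -d: -f3)
lane=$(echo "$header" | cut -d: -f4)

# Combine flowcell, lane and sample ID into a unique read group ID
echo "${flowcell}.${lane}.S100"
# → AACYTCLM5.1.S100
```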

modules/bwamem2/index/main.nf (+6 -3)

@@ -1,16 +1,19 @@
 process BWAMEM2_INDEX {
 
-tag "${meta.genome}"
+tag "${meta.id}"
 
 label 'medium_serial'
 
-publishDir "${params.outdir}/${meta.genome}", mode 'copy'
+conda 'bioconda::samtools=1.19.2 bioconda::bwa-mem2=2.2.1'
+container 'quay.io/biocontainers/mulled-v2-e5d375990341c5aef3c9aff74f96f66f65375ef6:2cdf6bf1e92acbeb9b2834b1c58754167173a410-0'
+
+publishDir "${params.outdir}/${meta.id}", mode: 'copy'
 
 input:
 tuple val(meta),path(fasta)
 
 output:
-//path('*'), emit: bwa_index
+path('*'), emit: bwa_index
 path("versions.yml"), emit: versions
 
 script:

modules/samtools/dict/main.nf (+6 -5)

@@ -1,16 +1,17 @@
 process SAMTOOLS_DICT {
+
+tag "${meta.id}"
+
 conda 'bioconda::samtools=1.19.2'
 container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
 'https://depot.galaxyproject.org/singularity/samtools:1.19.2--h50ea8bc_0' :
 'quay.io/biocontainers/samtools:1.19.2--h50ea8bc_0' }"
 
-publishDir "${params.outdir}/${meta.genome}", mode 'copy'
-
-tag "${fasta}"
+publishDir "${params.outdir}/${meta.id}", mode: 'copy'
 
 input:
-tuple val(meta), path(fasta)
-
+tuple val(meta),path(fasta)
+
 output:
 tuple val(meta), path(dict), emit: dict
 path("versions.yml"), emit: versions

modules/samtools/faidx/main.nf (+2 -2)

@@ -4,13 +4,13 @@ process SAMTOOLS_FAIDX {
 
 label 'short_serial'
 
-publishDir "${params.outdir}/${meta.id}", mode 'copy'
-
 conda 'bioconda::samtools=1.19.2'
 container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
 'https://depot.galaxyproject.org/singularity/samtools:1.19.2--h50ea8bc_0' :
 'quay.io/biocontainers/samtools:1.19.2--h50ea8bc_0' }"
 
+publishDir "${params.outdir}/${meta.id}", mode: 'copy'
+
 input:
 tuple val(meta),path(fasta)

workflows/build_references.nf (+19)

@@ -1,10 +1,13 @@
 include { GUNZIP } from "./../modules/gunzip"
 include { SAMTOOLS_FAIDX } from "./../modules/samtools/faidx"
+include { SAMTOOLS_DICT } from "./../modules/samtools/dict"
+include { BWAMEM2_INDEX } from "./../modules/bwamem2/index"
 
 genomes = params.references.genomes.keySet()
 
 genome_list = []
 
+// Get all the configured genomes
 genomes.each { genome ->
 def meta = [:]
 meta.id = genome.toString()
@@ -14,22 +17,38 @@ genomes.each { genome ->
 
 ch_genomes = Channel.fromList(genome_list)
 
+// Workflow starts here
 workflow BUILD_REFERENCES {
 
 main:
 
+// Check if any of the fasta files are gzipped
 ch_genomes.branch {
 compressed: it[1].toString().contains(".gz")
 uncompressed: !it[1].toString().contains(".gz")
 }.set { ch_genomes_branched }
 
+// unzip all the compressed fasta files
 GUNZIP(
 ch_genomes_branched.compressed
 )
 
+// merge all fasta files back into one channel
 ch_fasta = ch_genomes_branched.uncompressed.mix(GUNZIP.out.gunzip)
 
+// Index the fasta file(s)
 SAMTOOLS_FAIDX(
 ch_fasta
 )
+
+// Create a sequence dictionary for the fasta file(s)
+SAMTOOLS_DICT(
+ch_fasta
+)
+
+// Create the BWA2 index for the fasta file(s)
+BWAMEM2_INDEX(
+ch_fasta
+)
+
 }
