Commit c18a300

Refactoring of modules
1 parent 6c2b504 commit c18a300

Note: large commits have some of their content hidden by default, so not all of the changed files are shown below.
55 files changed (+399, -162 lines)

conf/lsh.config

-29
This file was deleted.

conf/modules.config

+16 -1

@@ -49,5 +49,20 @@ process {
             enabled: false
         ]
     }
-
+    withName: BLAST_MAKEBLASTDB {
+        publishDir = [
+            path: { "${params.outdir}/blastdb" },
+            mode: params.publish_dir_mode,
+            enabled: false
+        ]
+    }
+
+    withName: 'GUNZIP|BWAMEM2_INDEX|SAMTOOLS_FAIDX|SAMTOOLS_DICT' {
+        publishDir = [
+            path: { "${params.outdir}/gmo-check/1.0/${meta.id}" },
+            mode: params.publish_dir_mode,
+            enabled: true,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
 }

conf/resources.config

+35 -26

@@ -1,30 +1,39 @@
 params {

-    references {
-
-        genomes {
-            tomato {
-                fasta = "${params.reference_base}/gmo-check/tomato/Solanum_lycopersicum.SL3.0.dna.toplevel.fa"
-                fai = "${params.reference_base}/gmo-check/tomato/Solanum_lycopersicum.SL3.0.dna.toplevel.fa.fai"
-                dict = "${params.reference_base}/gmo-check/tomato/sSolanum_lycopersicum.SL3.0.dna.toplevel.dict"
-                amplicon_txt = "${baseDir}/assets/genomes/tomato/amplicon.txt"
-                bed = "${baseDir}/assets/genomes/tomato/primers.bed"
-                target_bed = "${baseDir}/assets/genomes/tomato/targets.bed"
-                rules = "${baseDir}/assets/genomes/tomato/rules.json"
-                url = "https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-58/fasta/solanum_lycopersicum/dna/Solanum_lycopersicum.SL3.0.dna.toplevel.fa.gz"
-            }
-            soybean {
-                fasta = "${params.reference_base}/soybean/Glycine_max.Glycine_max_v2.1.dna.toplevel.fa"
-                fai = "${params.reference_base}/soybean/Glycine_max.Glycine_max_v2.1.dna.toplevel.fa.fai"
-                dict = "${params.reference_base}/soybean/Glycine_max.Glycine_max_v2.1.dna.toplevel.fa.dict"
-                amplicon_txt = null
-                bed = null
-                target_bed = null
-                rules = null
-                url = "https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-58/fasta/glycine_max/dna/Glycine_max.Glycine_max_v2.1.dna.toplevel.fa.gz"
-            }
-
-        }
+    references {
+
+        genomes {
+            tomato {
+                fasta = "${params.reference_base}/gmo-check/1.0/tomato/Solanum_lycopersicum.SL3.0.dna.toplevel.fa"
+                fai = "${params.reference_base}/gmo-check/1.0/tomato/Solanum_lycopersicum.SL3.0.dna.toplevel.fa.fai"
+                dict = "${params.reference_base}/gmo-check/1.0/tomato/sSolanum_lycopersicum.SL3.0.dna.toplevel.dict"
+                amplicon_txt = "${baseDir}/assets/genomes/tomato/amplicon.txt"
+                bed = "${baseDir}/assets/genomes/tomato/primers.bed"
+                target_bed = "${baseDir}/assets/genomes/tomato/targets.bed"
+                rules = "${baseDir}/assets/genomes/tomato/rules.json"
+                url = "https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-58/fasta/solanum_lycopersicum/dna/Solanum_lycopersicum.SL3.0.dna.toplevel.fa.gz"
+            }
+            soybean {
+                fasta = "${params.reference_base}/gmo-check/1.0/soybean/Glycine_max.Glycine_max_v2.1.dna.toplevel.fa"
+                fai = "${params.reference_base}/gmo-check/1.0/soybean/Glycine_max.Glycine_max_v2.1.dna.toplevel.fa.fai"
+                dict = "${params.reference_base}/gmo-check/1.0/soybean/Glycine_max.Glycine_max_v2.1.dna.toplevel.fa.dict"
+                amplicon_txt = null
+                bed = null
+                target_bed = null
+                rules = null
+                url = "https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-58/fasta/glycine_max/dna/Glycine_max.Glycine_max_v2.1.dna.toplevel.fa.gz"
+            }
+            test {
+                fasta = "${params.reference_base}/gmo-check/1.0/test/test.fa"
+                fai = "${params.reference_base}/gmo-check/1.0/test/test.fa.fai"
+                dict = "${params.reference_base}/gmo-check/1.0/test/test.dict"
+                amplicon_txt = "${baseDir}/assets/genomes/test/amplicon.txt"
+                bed = "${baseDir}/assets/genomes/test/primers.bed"
+                target_bed = "${baseDir}/assets/genomes/test/targets.bed"
+                rules = "${baseDir}/assets/genomes/test/rules.json"
+                url = "https://raw.githubusercontent.com/marchoeppner/nf-testdata/main/gmo-check/test.fa.gz"
+            }
         }
-
+    }
+
 }

conf/test.config

+1 -15

@@ -5,20 +5,6 @@ params {
     run_name = "pipeline_test"
     max_cpus = 6
     genome = "test"
+    input = "https://raw.githubusercontent.com/marchoeppner/nf-testdata/main/gmo-check/samples.csv"

-    references {
-
-        genomes {
-            test {
-                fasta = "${params.reference_base}/test/test.fa"
-                fai = "${params.reference_base}/test/test.fa.fai"
-                dict = "${params.reference_base}/test/test.dict"
-                amplicon_txt = "${baseDir}/assets/genomes/test/amplicon.txt"
-                bed = "${baseDir}/assets/genomes/test/primers.bed"
-                target_bed = "${baseDir}/assets/genomes/test/targets.bed"
-                rules = "${baseDir}/assets/genomes/test/rules.json"
-                url = "${baseDir}/assets/genomes/test/genome.fa"
-            }
-        }
-    }
 }

docs/developer.md

+58 -1

@@ -144,4 +144,61 @@ You'll note that some obvious errors/warnings are omitted. This behavior is cont

 ## Sending report emails

-This template is set up to send the final QC report via Email (--email [email protected]). This requires for sendmail to be configured on the executing node/computer.
+This template is set up to send the final QC report via Email (--email [email protected]). This requires for sendmail to be configured on the executing node/computer.
+
+## Adding new genomes and targets
+
+This pipeline uses a JSON-formatted config file to keep track of the supported analyses. The most basic form looks as follows:
+
+```JSON
+{
+    "rules": {
+        "vsearch-blast": {
+            "payload": [
+                {
+                    "format": "JSON",
+                    "name": "GABA Mutation in SIGAD3",
+                    "target": "SiGAD3|NM_001246898.2",
+                    "matcher": "AAAG-TGGA",
+                    "positive_report": "Diese Probe enthält eine GABA Mutation in SIGAD3. Nachweis erbraucht über: Amplicon Analyse.",
+                    "negative_report": "Für diese Probe konnte keine GABA Mutation in SIGAD3 nachgewiesen werden."
+                }
+            ]
+
+        },
+        "bwa-freebayes": {
+            "payload": [
+                {
+                    "format": "VCF",
+                    "target": "1:14834-14836",
+                    "name": "GABA Mutation in SIGAD3",
+                    "matcher": "1\t14834\t.\tGTG\tGTTG",
+                    "positive_report": "Diese Probe enthält eine GABA Mutation in SIGAD3. Nachweis erbracht über: Varianten Analyse.",
+                    "negative_report": "Für diese Probe konnte keine GABA Mutation in SIGAD3 nachgewiesen werden."
+                }
+            ]
+        }
+    }
+}
+```
+
+This file is reference-genome specific and lives in `assets/genome/NAME_OF_SPECIES/rules.json` [example](../assets/genomes/tomato/rules.json)
+
+The rule set knows two types of rules:
+
+- `vsearch-blast` - for analyses that use assembled and clustered amplicons to find patterns in a BLAST database
+
+- `bwa-freebayes` - for analyses that use read alignment and variant calling against a reference genome.
+
+To add new targets to an already established reference genome:
+
+- Add new elements to the appropriate payload block in the rules.json manifest, following the example structure above
+- If you want to enable the vsearch-blast tool chain, make sure that the built-in [Blast Database](../assets/blastdb.fasta.gz) contains the required target motif(s) (usually a gene of interest).
+- Add the necessary primer information to the Ptrimmer config (amplicon.txt)
+- Add the primer sequences to the cutadapt fasta file (primers.fa)
+
+To add new reference genomes and matching target rules:
+
+- Add the necessary information about the new reference genome into the [resources.config](../conf/resources.config) file, including a download link.
+- Create a new species folder under /assets/genome
+- Add the relevant files as described for above for adding individual assets
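
For illustration, the "To add new targets" steps in the developer.md changes above amount to appending another object to the relevant payload array in rules.json. Below is a minimal sketch, not part of the commit: the first entry abbreviates the committed example (report strings shortened to "..."), while the second entry uses purely hypothetical placeholder values for `name`, `target`, `matcher` and the report texts.

```JSON
{
    "rules": {
        "vsearch-blast": {
            "payload": [
                {
                    "format": "JSON",
                    "name": "GABA Mutation in SIGAD3",
                    "target": "SiGAD3|NM_001246898.2",
                    "matcher": "AAAG-TGGA",
                    "positive_report": "...",
                    "negative_report": "..."
                },
                {
                    "format": "JSON",
                    "name": "My new target (hypothetical)",
                    "target": "MyGene|NM_000000.0",
                    "matcher": "ACGT-TTGA",
                    "positive_report": "Report text for a positive match of the new target.",
                    "negative_report": "Report text when the new target is not detected."
                }
            ]
        }
    }
}
```

The sketch only mirrors the field structure shown in the committed example; it does not introduce any fields beyond those present in the commit.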

docs/usage.md

+43 -16

@@ -2,14 +2,25 @@

 This is not a full release. Please note that some things may not work as intended yet.

-# Running the pipeline
+[Running the pipeline](#running-the-pipeline)
+
+[Options](#options)
+
+[Resources](#resources)
+
+
+## Running the pipeline

 A basic execution of the pipeline looks as follows:

 a) Without a site-specific config file

-```
-nextflow run marchoeppner/gmo-check -profile standard,singularity --input samples.csv --genome tomato --reference_base /path/to/references --run_name pipeline-test
+```bash
+nextflow run marchoeppner/gmo-check -profile standard,singularity \\
+    --input samples.csv \\
+    --genome tomato \\
+    --reference_base /path/to/references \\
+    --run_name pipeline-test
 ```
 where `path_to_references` corresponds to the location in which you have [installed](installation.md) the pipeline references.

@@ -27,15 +38,15 @@ b) with a site-specific config file
 nextflow run marchoeppner/gmo-check -profile lsh --input samples.csv --genome tomato --run_name pipeline-text
 ```

-In this example, both `--reference_base` and the choice of software provisioning are already set in your local configuration and don't have to provided as command line argument.
+In this example, both `--reference_base` and the choice of software provisioning are already set in your site-specific configuration and don't have to provided as command line argument.

-# Options
+## Options

-## `--input samples.csv` [default = null]
+### `--input samples.csv` [default = null]

 This pipeline expects a CSV-formatted sample sheet to properly pull various meta data through the processes. The required format looks as follows:

-```
+```CSV
 sample_id,library_id,readgroup_id,single_end,R1,R2
 S100,S100,AACYTCLM5.1.S100,false,/home/marc/projects/gaba/data/S100_R1.fastq.gz,/home/marc/projects/gaba/data/S100_R2.fastq.gz
 ```
@@ -52,23 +63,23 @@ The `single_end` column is prospectively included to enable support for non-pair

 `R1` and `R2` designate the full path(s) to the read data. This can either be a local path on your (shared) file system or data in the cloud which you access via e.g., S3, google buckets or FTP.

-## `--genome tomato` [default = tomato]
+### `--genome tomato` [default = tomato]

 The name of the pre-configured genome to analyze against. This parameter controls not only the mapping reference (if you use a mapping-based analysis), but also which internally pre-configured configuration files are used. Currently, only one genome can be analyzed per pipeline run.

 Available options:

 - tomato

-## `--run_name Fubar` [default = null]
+### `--run_name Fubar` [default = null]

 A mandatory name for this run, to be included with the result files.

-## `--email [email protected]` [ default = null]
+### `--email [email protected]` [ default = null]

 An email address to which the MultiQC report is send after pipeline completion. This requires for the executing system to have [sendmail](https://rimuhosting.com/support/settingupemail.jsp?mta=sendmail) configured.

-## `--tools vsearch` [default = vsearch]
+### `--tools vsearch` [default = vsearch]

 This pipeline supports two completely independent tool chains:

@@ -80,16 +91,32 @@ You can specify either one, or both: `--tools 'vsearch,bwa2'`

 Which tool chain is the best choice? Well, technically both options give near-identical results. So in this case `vsearch` would be the better option since it runs significantly faster. However, this pipeline is designed to (theoretically) handle many more types of genetic variants, not all of which are necessarily detectable without a proper variant calling. This is why the `bwa2` option exists - future proofing.

-## `--reference_base` [default = null ]
+### `--reference_base` [default = null ]

 The location of where the pipeline references are installed on your system. This will typically be pre-set in your site-specific config file and is only needed when you run without one.

-## `--outdir results` [default = results]
+### `--outdir results` [default = results]

 The location where the results are stored. Usually this will be `results`in the location from where you run the nextflow process. However, this option also accepts any other path in your file system(s).

-## `--freebayes_min_alternate_count 50` [ default = 50]
+### `--freebayes_min_alternate_count 50` [ default = 50]
 The minimum number of reads to support a given SNP. Since we are working with amplicon data, this value can be fairly high.

-## `--freebayes_min_alternate_frac 0.01` [ default = 0.01]
-The minimum percentage of reads supporting a SNP at a given site for the SNP to be considered. The default of 1% is chosen to be able to detect low levels of contribution but may need some tweaking depending on your exact sequencing setup and coverage.
+### `--freebayes_min_alternate_frac 0.01` [ default = 0.01]
+The minimum percentage of reads supporting a SNP at a given site for the SNP to be considered. The default of 1% is chosen to be able to detect low levels of contribution but may need some tweaking depending on your exact sequencing setup and coverage.
+
+## Resources
+
+The following options can be set to control resource usage outside of a site-specific [config](https://github.com/marchoeppner/nf-configs) file.
+
+### `--max_cpus` [ default = 16]
+
+The maximum number of cpus a single job can request. This is typically the maximum number of cores available on a compute node or your local (development) machine.
+
+### `--max_memory` [ default = 128.GB ]
+
+The maximum amount of memory a single job can request. This is typically the maximum amount of RAM available on a compute node or your local (development) machine, minus a few percent to prevent the machine from running out of memory while running basic background tasks.
+
+### `--max_time`[ default = 240.h ]
+
+The maximum allowed run/wall time a single job can request. This is mostly relevant for environments where run time is restricted, such as in a computing cluster with active resource manager or possibly some cloud environments.
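
As an illustration of how the new resource options documented above combine with the run examples earlier in usage.md, a resource-capped invocation might look like the sketch below. The cap values (8 CPUs, 32 GB, 12 hours) and the run name are arbitrary placeholders, not recommendations from the commit:

```bash
# Same basic invocation as documented in usage.md, with explicit resource ceilings.
# The three --max_* values are placeholders; pick values that match your hardware.
nextflow run marchoeppner/gmo-check -profile standard,singularity \
    --input samples.csv \
    --genome tomato \
    --reference_base /path/to/references \
    --run_name resource-capped-test \
    --max_cpus 8 \
    --max_memory '32.GB' \
    --max_time '12.h'
```

On a shared cluster these caps would normally come from the site-specific config rather than the command line, as noted in the Resources section.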

lib/WorkflowMain.groovy

+2 -1

@@ -19,7 +19,6 @@ class WorkflowMain {
         }
     }

-    // TODO: Change name of the pipeline below
     public static String header(workflow) {
         def headr = ''
         def infoLine = "${workflow.manifest.description} | version ${workflow.manifest.version}"
@@ -42,7 +41,9 @@ class WorkflowMain {
    Required parameters:
    --input             The primary pipeline input (typically a CSV file)
    --email             Email address to send reports to (enclosed in '')
+   --tools             A comma-separated list of tool chains to run (vsearch, bwa2)
    Optional parameters:
+   --genome            Use this reference genome when requesting 'bwa2' tool chain
    --run_name          A descriptive name for this pipeline run
    Output:
    --outdir            Local directory to which all output is written (default: results)

modules/bedtools/coverage/environment.yml

+7

@@ -0,0 +1,7 @@
+name: beedtools_coverage
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::bedtools=2.31.1

modules/bedtools/coverage/main.nf

+1 -1

@@ -4,7 +4,7 @@ process BEDTOOLS_COVERAGE {

     tag "${meta.sample_id}"

-    conda 'bioconda::bedtools=2.31.1'
+    conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' :
         'quay.io/biocontainers/bedtools:2.31.1--hf5e1c6e_0' }"

modules/biobloomtools/categorizer/environment.yml

+7

@@ -0,0 +1,7 @@
+name: biobloomtools_categorizer
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::biobloomtools=2.3.5

modules/biobloomtools/categorizer/main.nf

+1 -1

@@ -4,7 +4,7 @@ process BIOBLOOMTOOLS_CATEGORIZER {

     tag "${meta.sample_id}|${meta.library_id}|${meta.readgroup_id}"

-    conda 'bioconda::biobloomtools=2.3.5'
+    conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/biobloomtools:2.3.5--h4056dc3_2' :
         'quay.io/biocontainers/biobloomtools:2.3.5--h4056dc3_2' }"

modules/blast/blastn/environment.yml

+7

@@ -0,0 +1,7 @@
+name: blast_blastn
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::blast=2.15

modules/blast/blastn/main.nf

+1 -1

@@ -2,7 +2,7 @@ process BLAST_BLASTN {

     label 'short_parallel'

-    conda 'bioconda::blast=2.15'
+    conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/blast:2.15.0--pl5321h6f7f691_1' :
         'quay.io/biocontainers/blast:2.15.0--pl5321h6f7f691_1' }"

modules/blast/makeblastdb/environment.yml

+7

@@ -0,0 +1,7 @@
+name: blast_makeblastdb
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::blast=2.15
