Commit c18a300

Refactoring of modules
1 parent 6c2b504 commit c18a300

Note: large commits have some of their content hidden by default, so not all of the changed files are shown below.
55 files changed (+399, -162 lines)

conf/lsh.config

-29
This file was deleted.

conf/modules.config

+16 -1

@@ -49,5 +49,20 @@ process {
             enabled: false
         ]
     }
-
+    withName: BLAST_MAKEBLASTDB {
+        publishDir = [
+            path: { "${params.outdir}/blastdb" },
+            mode: params.publish_dir_mode,
+            enabled: false
+        ]
+    }
+
+    withName: 'GUNZIP|BWAMEM2_INDEX|SAMTOOLS_FAIDX|SAMTOOLS_DICT' {
+        publishDir = [
+            path: { "${params.outdir}/gmo-check/1.0/${meta.id}" },
+            mode: params.publish_dir_mode,
+            enabled: true,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
 }

conf/resources.config

+35 -26

@@ -1,30 +1,39 @@
 params {

-    references {
-
-        genomes {
-            tomato {
-                fasta = "${params.reference_base}/gmo-check/tomato/Solanum_lycopersicum.SL3.0.dna.toplevel.fa"
-                fai = "${params.reference_base}/gmo-check/tomato/Solanum_lycopersicum.SL3.0.dna.toplevel.fa.fai"
-                dict = "${params.reference_base}/gmo-check/tomato/sSolanum_lycopersicum.SL3.0.dna.toplevel.dict"
-                amplicon_txt = "${baseDir}/assets/genomes/tomato/amplicon.txt"
-                bed = "${baseDir}/assets/genomes/tomato/primers.bed"
-                target_bed = "${baseDir}/assets/genomes/tomato/targets.bed"
-                rules = "${baseDir}/assets/genomes/tomato/rules.json"
-                url = "https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-58/fasta/solanum_lycopersicum/dna/Solanum_lycopersicum.SL3.0.dna.toplevel.fa.gz"
-            }
-            soybean {
-                fasta = "${params.reference_base}/soybean/Glycine_max.Glycine_max_v2.1.dna.toplevel.fa"
-                fai = "${params.reference_base}/soybean/Glycine_max.Glycine_max_v2.1.dna.toplevel.fa.fai"
-                dict = "${params.reference_base}/soybean/Glycine_max.Glycine_max_v2.1.dna.toplevel.fa.dict"
-                amplicon_txt = null
-                bed = null
-                target_bed = null
-                rules = null
-                url = "https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-58/fasta/glycine_max/dna/Glycine_max.Glycine_max_v2.1.dna.toplevel.fa.gz"
-            }
-
-        }
+    references {
+
+        genomes {
+            tomato {
+                fasta = "${params.reference_base}/gmo-check/1.0/tomato/Solanum_lycopersicum.SL3.0.dna.toplevel.fa"
+                fai = "${params.reference_base}/gmo-check/1.0/tomato/Solanum_lycopersicum.SL3.0.dna.toplevel.fa.fai"
+                dict = "${params.reference_base}/gmo-check/1.0/tomato/sSolanum_lycopersicum.SL3.0.dna.toplevel.dict"
+                amplicon_txt = "${baseDir}/assets/genomes/tomato/amplicon.txt"
+                bed = "${baseDir}/assets/genomes/tomato/primers.bed"
+                target_bed = "${baseDir}/assets/genomes/tomato/targets.bed"
+                rules = "${baseDir}/assets/genomes/tomato/rules.json"
+                url = "https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-58/fasta/solanum_lycopersicum/dna/Solanum_lycopersicum.SL3.0.dna.toplevel.fa.gz"
+            }
+            soybean {
+                fasta = "${params.reference_base}/gmo-check/1.0/soybean/Glycine_max.Glycine_max_v2.1.dna.toplevel.fa"
+                fai = "${params.reference_base}/gmo-check/1.0/soybean/Glycine_max.Glycine_max_v2.1.dna.toplevel.fa.fai"
+                dict = "${params.reference_base}/gmo-check/1.0/soybean/Glycine_max.Glycine_max_v2.1.dna.toplevel.fa.dict"
+                amplicon_txt = null
+                bed = null
+                target_bed = null
+                rules = null
+                url = "https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-58/fasta/glycine_max/dna/Glycine_max.Glycine_max_v2.1.dna.toplevel.fa.gz"
+            }
+            test {
+                fasta = "${params.reference_base}/gmo-check/1.0/test/test.fa"
+                fai = "${params.reference_base}/gmo-check/1.0/test/test.fa.fai"
+                dict = "${params.reference_base}/gmo-check/1.0/test/test.dict"
+                amplicon_txt = "${baseDir}/assets/genomes/test/amplicon.txt"
+                bed = "${baseDir}/assets/genomes/test/primers.bed"
+                target_bed = "${baseDir}/assets/genomes/test/targets.bed"
+                rules = "${baseDir}/assets/genomes/test/rules.json"
+                url = "https://raw.githubusercontent.com/marchoeppner/nf-testdata/main/gmo-check/test.fa.gz"
+            }
         }
-
+    }
+
 }

conf/test.config

+1 -15

@@ -5,20 +5,6 @@ params {
     run_name = "pipeline_test"
     max_cpus = 6
     genome = "test"
+    input = "https://raw.githubusercontent.com/marchoeppner/nf-testdata/main/gmo-check/samples.csv"

-    references {
-
-        genomes {
-            test {
-                fasta = "${params.reference_base}/test/test.fa"
-                fai = "${params.reference_base}/test/test.fa.fai"
-                dict = "${params.reference_base}/test/test.dict"
-                amplicon_txt = "${baseDir}/assets/genomes/test/amplicon.txt"
-                bed = "${baseDir}/assets/genomes/test/primers.bed"
-                target_bed = "${baseDir}/assets/genomes/test/targets.bed"
-                rules = "${baseDir}/assets/genomes/test/rules.json"
-                url = "${baseDir}/assets/genomes/test/genome.fa"
-            }
-        }
-    }
 }

docs/developer.md

+58 -1

@@ -144,4 +144,61 @@ You'll note that some obvious errors/warnings are omitted. This behavior is cont

 ## Sending report emails

-This template is set up to send the final QC report via Email (--email [email protected]). This requires for sendmail to be configured on the executing node/computer.
+This template is set up to send the final QC report via Email (--email [email protected]). This requires for sendmail to be configured on the executing node/computer.
+
+## Adding new genomes and targets
+
+This pipeline uses a JSON-formatted config file to keep track of the supported analyses. The most basic form looks as follows:
+
+```JSON
+{
+    "rules": {
+        "vsearch-blast": {
+            "payload": [
+                {
+                    "format": "JSON",
+                    "name": "GABA Mutation in SIGAD3",
+                    "target": "SiGAD3|NM_001246898.2",
+                    "matcher": "AAAG-TGGA",
+                    "positive_report": "Diese Probe enthält eine GABA Mutation in SIGAD3. Nachweis erbraucht über: Amplicon Analyse.",
+                    "negative_report": "Für diese Probe konnte keine GABA Mutation in SIGAD3 nachgewiesen werden."
+                }
+            ]
+
+        },
+        "bwa-freebayes": {
+            "payload": [
+                {
+                    "format": "VCF",
+                    "target": "1:14834-14836",
+                    "name": "GABA Mutation in SIGAD3",
+                    "matcher": "1\t14834\t.\tGTG\tGTTG",
+                    "positive_report": "Diese Probe enthält eine GABA Mutation in SIGAD3. Nachweis erbracht über: Varianten Analyse.",
+                    "negative_report": "Für diese Probe konnte keine GABA Mutation in SIGAD3 nachgewiesen werden."
+                }
+            ]
+        }
+    }
+}
+```
+
+This file is reference-genome specific and lives in `assets/genome/NAME_OF_SPECIES/rules.json` [example](../assets/genomes/tomato/rules.json)
+
+The rule set knows two types of rules:
+
+- `vsearch-blast` - for analyses that use assembled and clustered amplicons to find patterns in a BLAST database
+
+- `bwa-freebayes` - for analyses that use read alignment and variant calling against a reference genome.
+
+To add new targets to an already established reference genome:
+
+- Add new elements to the appropriate payload block in the rules.json manifest, following the example structure above
+- If you want to enable the vsearch-blast tool chain, make sure that the built-in [Blast Database](../assets/blastdb.fasta.gz) contains the required target motif(s) (usually a gene of interest).
+- Add the necessary primer information to the Ptrimmer config (amplicon.txt)
+- Add the primer sequences to the cutadapt fasta file (primers.fa)
+
+To add new reference genomes and matching target rules:
+
+- Add the necessary information about the new reference genome into the [resources.config](../conf/resources.config) file, including a download link.
+- Create a new species folder under /assets/genome
+- Add the relevant files as described for above for adding individual assets
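
For illustration, the "To add new targets" steps in the developer.md changes above amount to appending another object to the relevant payload array in rules.json. Below is a minimal sketch, not part of the commit: the first entry abbreviates the committed example (report strings shortened to "..."), while the second entry uses purely hypothetical placeholder values for `name`, `target`, `matcher` and the report texts.

```JSON
{
    "rules": {
        "vsearch-blast": {
            "payload": [
                {
                    "format": "JSON",
                    "name": "GABA Mutation in SIGAD3",
                    "target": "SiGAD3|NM_001246898.2",
                    "matcher": "AAAG-TGGA",
                    "positive_report": "...",
                    "negative_report": "..."
                },
                {
                    "format": "JSON",
                    "name": "My new target (hypothetical)",
                    "target": "MyGene|NM_000000.0",
                    "matcher": "ACGT-TTGA",
                    "positive_report": "Report text for a positive match of the new target.",
                    "negative_report": "Report text when the new target is not detected."
                }
            ]
        }
    }
}
```

The sketch only mirrors the field structure shown in the committed example; it does not introduce any fields beyond those present in the commit.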

docs/usage.md

+43 -16

@@ -2,14 +2,25 @@

 This is not a full release. Please note that some things may not work as intended yet.

-# Running the pipeline
+[Running the pipeline](#running-the-pipeline)
+
+[Options](#options)
+
+[Resources](#resources)
+
+
+## Running the pipeline

 A basic execution of the pipeline looks as follows:

 a) Without a site-specific config file

-```
-nextflow run marchoeppner/gmo-check -profile standard,singularity --input samples.csv --genome tomato --reference_base /path/to/references --run_name pipeline-test
+```bash
+nextflow run marchoeppner/gmo-check -profile standard,singularity \\
+    --input samples.csv \\
+    --genome tomato \\
+    --reference_base /path/to/references \\
+    --run_name pipeline-test
 ```
 where `path_to_references` corresponds to the location in which you have [installed](installation.md) the pipeline references.

@@ -27,15 +38,15 @@ b) with a site-specific config file
 nextflow run marchoeppner/gmo-check -profile lsh --input samples.csv --genome tomato --run_name pipeline-text
 ```

-In this example, both `--reference_base` and the choice of software provisioning are already set in your local configuration and don't have to provided as command line argument.
+In this example, both `--reference_base` and the choice of software provisioning are already set in your site-specific configuration and don't have to provided as command line argument.

-# Options
+## Options

-## `--input samples.csv` [default = null]
+### `--input samples.csv` [default = null]

 This pipeline expects a CSV-formatted sample sheet to properly pull various meta data through the processes. The required format looks as follows:

-```
+```CSV
 sample_id,library_id,readgroup_id,single_end,R1,R2
 S100,S100,AACYTCLM5.1.S100,false,/home/marc/projects/gaba/data/S100_R1.fastq.gz,/home/marc/projects/gaba/data/S100_R2.fastq.gz
 ```
@@ -52,23 +63,23 @@ The `single_end` column is prospectively included to enable support for non-pair

 `R1` and `R2` designate the full path(s) to the read data. This can either be a local path on your (shared) file system or data in the cloud which you access via e.g., S3, google buckets or FTP.

-## `--genome tomato` [default = tomato]
+### `--genome tomato` [default = tomato]

 The name of the pre-configured genome to analyze against. This parameter controls not only the mapping reference (if you use a mapping-based analysis), but also which internally pre-configured configuration files are used. Currently, only one genome can be analyzed per pipeline run.

 Available options:

 - tomato

-## `--run_name Fubar` [default = null]
+### `--run_name Fubar` [default = null]

 A mandatory name for this run, to be included with the result files.

-## `--email [email protected]` [ default = null]
+### `--email [email protected]` [ default = null]

 An email address to which the MultiQC report is send after pipeline completion. This requires for the executing system to have [sendmail](https://rimuhosting.com/support/settingupemail.jsp?mta=sendmail) configured.

-## `--tools vsearch` [default = vsearch]
+### `--tools vsearch` [default = vsearch]

 This pipeline supports two completely independent tool chains:

@@ -80,16 +91,32 @@ You can specify either one, or both: `--tools 'vsearch,bwa2'`

 Which tool chain is the best choice? Well, technically both options give near-identical results. So in this case `vsearch` would be the better option since it runs significantly faster. However, this pipeline is designed to (theoretically) handle many more types of genetic variants, not all of which are necessarily detectable without a proper variant calling. This is why the `bwa2` option exists - future proofing.

-## `--reference_base` [default = null ]
+### `--reference_base` [default = null ]

 The location of where the pipeline references are installed on your system. This will typically be pre-set in your site-specific config file and is only needed when you run without one.

-## `--outdir results` [default = results]
+### `--outdir results` [default = results]

 The location where the results are stored. Usually this will be `results`in the location from where you run the nextflow process. However, this option also accepts any other path in your file system(s).

-## `--freebayes_min_alternate_count 50` [ default = 50]
+### `--freebayes_min_alternate_count 50` [ default = 50]
 The minimum number of reads to support a given SNP. Since we are working with amplicon data, this value can be fairly high.

-## `--freebayes_min_alternate_frac 0.01` [ default = 0.01]
-The minimum percentage of reads supporting a SNP at a given site for the SNP to be considered. The default of 1% is chosen to be able to detect low levels of contribution but may need some tweaking depending on your exact sequencing setup and coverage.
+### `--freebayes_min_alternate_frac 0.01` [ default = 0.01]
+The minimum percentage of reads supporting a SNP at a given site for the SNP to be considered. The default of 1% is chosen to be able to detect low levels of contribution but may need some tweaking depending on your exact sequencing setup and coverage.
+
+## Resources
+
+The following options can be set to control resource usage outside of a site-specific [config](https://github.com/marchoeppner/nf-configs) file.
+
+### `--max_cpus` [ default = 16]
+
+The maximum number of cpus a single job can request. This is typically the maximum number of cores available on a compute node or your local (development) machine.
+
+### `--max_memory` [ default = 128.GB ]
+
+The maximum amount of memory a single job can request. This is typically the maximum amount of RAM available on a compute node or your local (development) machine, minus a few percent to prevent the machine from running out of memory while running basic background tasks.
+
+### `--max_time`[ default = 240.h ]
+
+The maximum allowed run/wall time a single job can request. This is mostly relevant for environments where run time is restricted, such as in a computing cluster with active resource manager or possibly some cloud environments.
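
As an illustration of how the new resource options documented above combine with the run examples earlier in usage.md, a resource-capped invocation might look like the sketch below. The cap values (8 CPUs, 32 GB, 12 hours) and the run name are arbitrary placeholders, not recommendations from the commit:

```bash
# Same basic invocation as documented in usage.md, with explicit resource ceilings.
# The three --max_* values are placeholders; pick values that match your hardware.
nextflow run marchoeppner/gmo-check -profile standard,singularity \
    --input samples.csv \
    --genome tomato \
    --reference_base /path/to/references \
    --run_name resource-capped-test \
    --max_cpus 8 \
    --max_memory '32.GB' \
    --max_time '12.h'
```

On a shared cluster these caps would normally come from the site-specific config rather than the command line, as noted in the Resources section.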

lib/WorkflowMain.groovy

+2 -1

@@ -19,7 +19,6 @@ class WorkflowMain {
         }
     }

-    // TODO: Change name of the pipeline below
     public static String header(workflow) {
         def headr = ''
         def infoLine = "${workflow.manifest.description} | version ${workflow.manifest.version}"
@@ -42,7 +41,9 @@ class WorkflowMain {
    Required parameters:
    --input             The primary pipeline input (typically a CSV file)
    --email             Email address to send reports to (enclosed in '')
+   --tools             A comma-separated list of tool chains to run (vsearch, bwa2)
    Optional parameters:
+   --genome            Use this reference genome when requesting 'bwa2' tool chain
    --run_name          A descriptive name for this pipeline run
    Output:
    --outdir            Local directory to which all output is written (default: results)

modules/bedtools/coverage/environment.yml

+7

@@ -0,0 +1,7 @@
+name: beedtools_coverage
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::bedtools=2.31.1

modules/bedtools/coverage/main.nf

+1 -1

@@ -4,7 +4,7 @@ process BEDTOOLS_COVERAGE {

     tag "${meta.sample_id}"

-    conda 'bioconda::bedtools=2.31.1'
+    conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' :
         'quay.io/biocontainers/bedtools:2.31.1--hf5e1c6e_0' }"

modules/biobloomtools/categorizer/environment.yml

+7

@@ -0,0 +1,7 @@
+name: biobloomtools_categorizer
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::biobloomtools=2.3.5

modules/biobloomtools/categorizer/main.nf

+1 -1

@@ -4,7 +4,7 @@ process BIOBLOOMTOOLS_CATEGORIZER {

     tag "${meta.sample_id}|${meta.library_id}|${meta.readgroup_id}"

-    conda 'bioconda::biobloomtools=2.3.5'
+    conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/biobloomtools:2.3.5--h4056dc3_2' :
         'quay.io/biocontainers/biobloomtools:2.3.5--h4056dc3_2' }"

modules/blast/blastn/environment.yml

+7

@@ -0,0 +1,7 @@
+name: blast_blastn
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::blast=2.15

modules/blast/blastn/main.nf

+1 -1

@@ -2,7 +2,7 @@ process BLAST_BLASTN {

     label 'short_parallel'

-    conda 'bioconda::blast=2.15'
+    conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/blast:2.15.0--pl5321h6f7f691_1' :
         'quay.io/biocontainers/blast:2.15.0--pl5321h6f7f691_1' }"

modules/blast/makeblastdb/environment.yml

+7

@@ -0,0 +1,7 @@
+name: blast_makeblastdb
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::blast=2.15
