
Commit 6c2b504 (1 parent: 4f31bd8)

Upgrading meta construct to also allow single-end data types to be passed through the pipeline
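The core of this change is a `single_end` flag carried in each sample's meta map, which lets downstream modules branch between single-end and paired-end command lines. A minimal sketch of what the parsed samplesheet channel now emits, based on `modules/input_check.nf` below (sample names and paths are illustrative):

```groovy
// Paired-end sample: meta map plus a two-element read list
[ [sample_id:'S100', single_end:false, library_id:'S100', readgroup_id:'AACYTCLM5.1.S100'],
  [ file('S100_R1.fastq.gz'), file('S100_R2.fastq.gz') ] ]

// Single-end sample: same meta shape, one-element read list
[ [sample_id:'S200', single_end:true, library_id:'S200', readgroup_id:'AACYTCLM5.1.S200'],
  [ file('S200_R1.fastq.gz') ] ]
```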

14 files changed (+213, -75 lines)

conf/modules.config (+8)

```diff
@@ -42,4 +42,12 @@ process {
             enabled: false
         ]
     }
+    withName: 'VSEARCH_FASTQFILTER|VSEARCH_FASTQMERGE' {
+        publishDir = [
+            path: { "${params.outdir}/biobloom" },
+            mode: params.publish_dir_mode,
+            enabled: false
+        ]
+    }
+
 }
```
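Both VSEARCH processes are registered with `enabled: false`, so their intermediate FastQ output is not copied to the results directory by default. If you do want those files published, a user config passed via `-c` could override the block — a hypothetical override sketch (the target path `vsearch` is illustrative):

```groovy
// my.config, loaded with: nextflow run ... -c my.config
process {
    withName: 'VSEARCH_FASTQFILTER|VSEARCH_FASTQMERGE' {
        publishDir = [
            path: { "${params.outdir}/vsearch" },
            mode: params.publish_dir_mode,
            enabled: true
        ]
    }
}
```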

docs/installation.md (+1, -1)

```diff
@@ -26,4 +26,4 @@ The path specified with `--outdir` can then be given to the pipeline during norm
 
 If you run on anything other than a local system, this pipeline requires a site-specific configuration file to be able to talk to your cluster or compute infrastructure. Nextflow supports a wide range of such infrastructures, including Slurm, LSF and SGE - but also Kubernetes and AWS. For more information, see [here](https://www.nextflow.io/docs/latest/executor.html).
 
-Site-specific config-files for our pipeline ecosystem are stored centrally on [github](https://github.com/marchoeppner/configs). Please talk to us if you want to add your system
+Site-specific config-files for our pipeline ecosystem are stored centrally on [github](https://github.com/marchoeppner/nf-configs). Please talk to us if you want to add your system
```

docs/usage.md (+14, -4)

````diff
@@ -31,16 +31,26 @@ In this example, both `--reference_base` and the choice of software provisioning
 
 # Options
 
-## `--input samplesheet.csv` [default = null]
+## `--input samples.csv` [default = null]
 
 This pipeline expects a CSV-formatted sample sheet to properly pull various meta data through the processes. The required format looks as follows:
 
 ```
-sample_id,library_id,readgroup_id,R1,R2
-S100,S100,AACYTCLM5.1.S100,/home/marc/projects/gaba/data/S100_R1.fastq.gz,/home/marc/projects/gaba/data/S100_R2.fastq.gz
+sample_id,library_id,readgroup_id,single_end,R1,R2
+S100,S100,AACYTCLM5.1.S100,false,/home/marc/projects/gaba/data/S100_R1.fastq.gz,/home/marc/projects/gaba/data/S100_R2.fastq.gz
 ```
+The columns `sample_id` and `library_id` should be self-explanatory.
 
-If you are unsure about the readgroup ID, just make sure that it is unique for the combination of library, flowcell and lane. Typically it would be constructed from these components - and the easiest way to get it is from the FastQ file itself (header of read 1, for example).
+If you are uncertain about `readgroup_id`, just make sure that it is unique for the combination of library, flowcell and lane. Typically it would be constructed from these components - and the easiest way to get it is from the FastQ file itself (header of read 1, for example).
+
+```
+@VL00316:70:AACYTCLM5:1:1101:18686:1038 1:N:0:AAGCGGTGAA+AACCTAGACG
+```
+For a hypothetical library called "LIB100", this can be turned into the readgroup ID `AACYTCLM5.1.LIB100` - where `AACYTCLM5` is the ID of the flowcell, `1` is the lane on that flowcell and `LIB100` is the identifier of the library.
+
+The `single_end` column is included prospectively, to enable support for non-paired-end sequencing technologies such as Ion Torrent or PacBio/ONT (TBD). For the moment, you can simply put "false" here.
+
+`R1` and `R2` designate the full path(s) to the read data. This can either be a local path on your (shared) file system or data in the cloud, accessed via e.g. S3, Google buckets or FTP.
 
 ## `--genome tomato` [default = tomato]
 
````
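For reference, once single-end inputs are accepted end to end, a single-end row would plausibly set the flag to `true` and leave `R2` empty — a hypothetical example, not yet covered by the docs above, although the `input_check.nf` parser in this commit never reads `R2` when `single_end` is true:

```
sample_id,library_id,readgroup_id,single_end,R1,R2
S200,S200,AACYTCLM5.1.S200,true,/home/marc/projects/gaba/data/S200_R1.fastq.gz,
```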

modules/biobloomtools/categorizer/main.nf (+3, -3)

```diff
@@ -10,10 +10,10 @@ process BIOBLOOMTOOLS_CATEGORIZER {
         'quay.io/biocontainers/biobloomtools:2.3.5--h4056dc3_2' }"
 
     input:
-    tuple val(meta), path(r1), path(r2)
+    tuple val(meta), path(reads)
 
     output:
-    tuple val(meta), path(r1_trim), path(r2_trim), emit: reads
+    tuple val(meta), path('*noMatch*.fq.gz'), emit: reads
     path('versions.yml'), emit: versions
     path("*summary.tsv"), emit: results
 
@@ -23,7 +23,7 @@ process BIOBLOOMTOOLS_CATEGORIZER {
     r2_trim = filtered + '_noMatch_2.fq.gz'
 
     """
-    biobloomcategorizer -p $filtered -t ${task.cpus} -n --fq --gz_out -i -e -f "${params.bloomfilter}" $r1 $r2
+    biobloomcategorizer -p $filtered -t ${task.cpus} -n --fq --gz_out -i -e -f "${params.bloomfilter}" $reads
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
```

modules/cat_fastq/main.nf (+36)

```diff
@@ -0,0 +1,36 @@
+process CAT_FASTQ {
+    tag "$meta.sample_id"
+    label 'process_single'
+
+    conda 'conda-forge::sed=4.7'
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/ubuntu:20.04' :
+        'ubuntu:20.04' }"
+
+    input:
+    tuple val(meta), path(reads, stageAs: 'input*/*')
+
+    output:
+    tuple val(meta), path('*.merged.fastq.gz'), emit: reads
+    path 'versions.yml', emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def prefix = meta.sample_id
+    def readList = reads instanceof List ? reads.collect { r -> r.toString() } : [reads.toString()]
+
+    def read1 = []
+    def read2 = []
+    readList.eachWithIndex { v, ix -> (ix & 1 ? read2 : read1) << v }
+    """
+    cat ${read1.join(' ')} > ${prefix}_1.merged.fastq.gz
+    cat ${read2.join(' ')} > ${prefix}_2.merged.fastq.gz
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//')
+    END_VERSIONS
+    """
+}
```

Note: the concatenation uses `cat` rather than `zcat` — gzip streams are concatenable as-is, whereas `zcat` would write decompressed text into a `.gz`-named file.
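`CAT_FASTQ` splits the staged read list on index parity, which assumes the files arrive ordered R1, R2, R1, R2, … across lanes. A standalone Groovy sketch of that split logic, using made-up file names:

```groovy
def readList = ['input1/a_R1.fastq.gz', 'input1/a_R2.fastq.gz',
                'input2/b_R1.fastq.gz', 'input2/b_R2.fastq.gz']
def read1 = []
def read2 = []
// even indices (0, 2, ...) collect forward reads, odd indices reverse reads
readList.eachWithIndex { v, ix -> (ix & 1 ? read2 : read1) << v }
assert read1 == ['input1/a_R1.fastq.gz', 'input2/b_R1.fastq.gz']
assert read2 == ['input1/a_R2.fastq.gz', 'input2/b_R2.fastq.gz']
```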

modules/fastp/main.nf (+48, -26)

```diff
@@ -1,44 +1,66 @@
 process FASTP {
-    publishDir "${params.outdir}/Processing/FastP", mode: 'copy'
+    tag "${meta.sample_id}"
 
     label 'short_parallel'
 
-    tag "${meta.sample_id}|${meta.library_id}|${meta.readgroup_id}"
-
     conda 'bioconda::fastp=0.23.4'
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/fastp:0.23.4--hadf994f_2' :
         'quay.io/biocontainers/fastp:0.23.4--hadf994f_2' }"
 
     input:
-    tuple val(meta), path(r1), path(r2)
+    tuple val(meta), path(reads)
 
     output:
-    tuple val(meta), path(r1_trim), path(r2_trim), emit: reads
+    tuple val(meta), path('*trimmed.fastq.gz'), emit: reads
     path("*.json"), emit: json
     path('versions.yml'), emit: versions
 
     script:
+
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: reads[0].getBaseName()
+
+    r1 = reads[0]
+
     suffix = '_trimmed.fastq.gz'
-    r1_trim = file(r1).getBaseName() + suffix
-    r2_trim = file(r2).getBaseName() + suffix
-    json = file(r1).getBaseName() + '.fastp.json'
-    html = file(r2).getBaseName() + '.fastp.html'
-
-    """
-    fastp -c --in1 $r1 --in2 $r2 \
-        --out1 $r1_trim \
-        --out2 $r2_trim \
-        --detect_adapter_for_pe \
-        -w ${task.cpus} \
-        -j $json \
-        -h $html \
-        --length_required 35
-
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
-    END_VERSIONS
-
-    """
+
+    json = prefix + '.fastp.json'
+    html = prefix + '.fastp.html'
+
+    if (meta.single_end) {
+        r1_trim = r1.getBaseName() + suffix
+        """
+        fastp --in1 ${r1} \
+            --out1 $r1_trim \
+            -w ${task.cpus} \
+            -j $json \
+            -h $html $args
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
+        END_VERSIONS
+        """
+    } else {
+        r2 = reads[1]
+        r1_trim = r1.getBaseName() + suffix
+        r2_trim = r2.getBaseName() + suffix
+        """
+        fastp --in1 ${r1} --in2 ${r2} \
+            --out1 $r1_trim \
+            --out2 $r2_trim \
+            --detect_adapter_for_pe \
+            -w ${task.cpus} \
+            -j $json \
+            -h $html \
+            $args
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
+        END_VERSIONS
+
+        """
+    }
 }
```
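With this refactor, the previously hard-coded `--length_required 35` disappears from the command line and extra flags flow in through `task.ext.args` instead. A sketch of how that option could be restored, assuming `ext.args` is wired up in `conf/modules.config` in the usual nf-core style (not shown in this commit):

```groovy
process {
    withName: 'FASTP' {
        ext.args = '--length_required 35'
    }
}
```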

modules/input_check.nf (+19, -14)

```diff
@@ -8,30 +8,35 @@ workflow INPUT_CHECK {
 
     main:
     samplesheet
-        .splitCsv(header:true, sep: ',')
-        .map { fastq_channel(it) }
+        .splitCsv(header:true, sep:',')
+        .map { row -> fastq_channel(row) }
         .set { reads }
 
     emit:
     reads // channel: [ val(meta), [ reads ] ]
 }
 
+// Function to get list of [ meta, [ fastq_1, fastq_2 ] ]
 def fastq_channel(LinkedHashMap row) {
-    // create meta map
-    def meta = [:]
-    meta.sample_id = row.sample_id
-    meta.readgroup_id = row.readgroup_id
-    meta.library_id = row.library_id
 
-    // add path(s) of the fastq file(s) to the meta map
-    def fastqMeta = []
+    def meta = [:]
+    meta.sample_id = row.sample_id
+    meta.single_end = row.single_end.toBoolean() // CSV fields are strings; "false" would otherwise be truthy
+    meta.library_id = row.library_id
+    meta.readgroup_id = row.readgroup_id
+
+    def array = []
     if (!file(row.R1).exists()) {
         exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.R1}"
     }
-    if (!file(row.R2).exists()) {
-        exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.R2}"
+    if (meta.single_end) {
+        array = [ meta, [ file(row.R1) ] ]
+    } else {
+        if (!file(row.R2).exists()) {
+            exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.R2}"
+        }
+        array = [ meta, [ file(row.R1), file(row.R2) ] ]
     }
-    fastqMeta = [ meta, file(row.R1), file(row.R2) ]
-
-    return fastqMeta
+
+    return array
 }
```

Note: `row.single_end` is converted with `toBoolean()` and the `def` declarations are kept, so the flag is a real boolean and the function does not leak variables into the script binding.
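A quick way to confirm the new meta map during development is to tap the channel right after parsing — a hypothetical debugging snippet, assuming `INPUT_CHECK` is fed the samplesheet as a file channel:

```groovy
INPUT_CHECK( Channel.fromPath(params.input) )
INPUT_CHECK.out.reads.view { meta, reads ->
    "${meta.sample_id}: single_end=${meta.single_end}, files=${reads.size()}"
}
```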

modules/ptrimmer/main.nf (+36, -16)

```diff
@@ -1,33 +1,53 @@
 process PTRIMMER {
-    publishDir "${params.outdir}/${meta.sample_id}/VSEARCH/PTRIMMER", mode: 'copy'
-
     label 'short_serial'
 
-    tag "${meta.sample_id}|${meta.library_id}|${meta.readgroup_id}"
+    tag "${meta.sample_id}"
 
     conda 'bioconda::ptrimmer=1.3.3'
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/ptrimmer:1.3.3--h50ea8bc_5' :
         'quay.io/biocontainers/ptrimmer:1.3.3--h50ea8bc_5' }"
 
     input:
-    tuple val(meta), path(r1), path(r2)
+    tuple val(meta), path(reads)
     path(amplicon_txt)
 
     output:
-    tuple val(meta), path(r1_trimmed), path(r2_trimmed), emit: reads
+    tuple val(meta), path('*ptrimmed.fastq.gz'), emit: reads
     path('versions.yml'), emit: versions
 
     script:
-    r1_trimmed = r1.getBaseName() + '_ptrimmed.fastq'
-    r2_trimmed = r2.getBaseName() + '_ptrimmed.fastq'
-
-    """
-    ptrimmer -t pair -a $amplicon_txt -f $r1 -d $r1_trimmed -r $r2 -e $r2_trimmed
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        Ptrimmer: \$(ptrimmer --help 2>&1 | grep Version | sed -e "s/Version: //g")
-    END_VERSIONS
-
-    """
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: meta.sample_id
+
+    r1 = reads[0]
+    r1_trimmed = prefix + '_1.ptrimmed.fastq'
+    r1_trimmed_gz = r1_trimmed + '.gz'
+
+    if (meta.single_end) {
+        """
+        ptrimmer $args -t single -a $amplicon_txt -f $r1 -d $r1_trimmed
+        gzip $r1_trimmed
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            Ptrimmer: \$(ptrimmer --help 2>&1 | grep Version | sed -e "s/Version: //g")
+        END_VERSIONS
+        """
+    } else {
+        r2 = reads[1]
+        r2_trimmed = prefix + '_2.ptrimmed.fastq'
+        r2_trimmed_gz = r2_trimmed + '.gz'
+
+        """
+        ptrimmer $args -t pair -a $amplicon_txt -f $r1 -d $r1_trimmed -r $r2 -e $r2_trimmed
+        gzip $r1_trimmed
+        gzip $r2_trimmed
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            Ptrimmer: \$(ptrimmer --help 2>&1 | grep Version | sed -e "s/Version: //g")
+        END_VERSIONS
+        """
+    }
 }
```

Note: the conda spec reads `ptrimmer=1.3.3` (the stray trailing dot was a typo). PTrimmer writes uncompressed FastQ, hence the explicit `gzip` calls so the output matches the `*ptrimmed.fastq.gz` glob.

modules/vsearch/fastqfilter/main.nf (+6, -2)

```diff
@@ -19,11 +19,15 @@ process VSEARCH_FASTQFILTER {
     filtered = fq.getBaseName() + '.filtered.fasta'
 
     """
-    vsearch -fastq_filter $fq -fastq_maxee 0.5 -relabel Filtered -fastaout $filtered
+    vsearch -fastq_filter $fq \
+        -fastq_maxee 0.5 \
+        --threads ${task.cpus} \
+        -relabel Filtered \
+        -fastaout $filtered
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        vsearch: \$(vsearch --version 2>&1 | head -n1 | sed -e "s/vsearch //g" -e "s/,.*//")
+        vsearch: \$(vsearch --version 2>&1 | head -n 1 | sed 's/vsearch //g' | sed 's/,.*//g' | sed 's/^v//' | sed 's/_.*//')
     END_VERSIONS
     """
 }
```

modules/vsearch/fastqmerge/main.nf (+2, -1)

```diff
@@ -20,12 +20,13 @@ process VSEARCH_FASTQMERGE {
 
     """
     vsearch --fastq_merge $fwd --reverse $rev \
+        --threads ${task.cpus} \
         --fastqout $merged \
         --fastq_eeout
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        vsearch: \$(vsearch --version 2>&1 | head -n1 | sed -e "s/vsearch //g" -e "s/,.*//")
+        vsearch: \$(vsearch --version 2>&1 | head -n 1 | sed 's/vsearch //g' | sed 's/,.*//g' | sed 's/^v//' | sed 's/_.*//')
     END_VERSIONS
     """
 }
```

modules/vsearch/fastxuniques/main.nf (+5, -2)

```diff
@@ -19,11 +19,14 @@ process VSEARCH_FASTXUNIQUES {
     derep = fa.getBaseName() + '.unique.fasta'
 
     """
-    vsearch -fastx_uniques $fa -sizeout -relabel ${meta.sample_id}_Unique -fastaout $derep
+    vsearch -fastx_uniques $fa \
+        -sizeout -relabel ${meta.sample_id}_Unique \
+        -fastaout $derep \
+        --threads ${task.cpus}
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        vsearch: \$(vsearch --version 2>&1 | head -n1 | sed -e "s/vsearch //g" -e "s/,.*//")
+        vsearch: \$(vsearch --version 2>&1 | head -n 1 | sed 's/vsearch //g' | sed 's/,.*//g' | sed 's/^v//' | sed 's/_.*//')
     END_VERSIONS
     """
 }
```

Note: the command ends without a trailing backslash, so the line continuation cannot accidentally swallow the following `cat` heredoc.

subworkflows/bwamem2/main.nf (+1, -2)

```diff
@@ -33,7 +33,6 @@ workflow BWAMEM2_WORKFLOW {
         reads,
         fasta
     )
-
     ch_versions = ch_versions.mix(BWAMEM2_MEM.out.versions)
 
     // Group BAM files by sample, in case of multi-lane setup
@@ -95,4 +94,4 @@ workflow BWAMEM2_WORKFLOW {
     vcf = FREEBAYES.out.vcf
     reports = ch_reports
     bam = SAMTOOLS_AMPLICONCLIP.out.bam
-}
\ No newline at end of file
+}
```
