diff --git a/.github/.dockstore.yml b/.github/.dockstore.yml index 030138a0..191fabd2 100644 --- a/.github/.dockstore.yml +++ b/.github/.dockstore.yml @@ -3,3 +3,4 @@ version: 1.2 workflows: - subclass: nfl primaryDescriptorPath: /nextflow.config + publish: True diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index eda0dd40..9a119e85 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -69,7 +69,7 @@ If you wish to contribute a new step, please use the following coding standards: 2. Write the process block (see below). 3. Define the output channel if needed (see below). 4. Add any new flags/options to `nextflow.config` with a default (see below). -5. Add any new flags/options to `nextflow_schema.json` with help text (with `nf-core schema build .`) +5. Add any new flags/options to `nextflow_schema.json` with help text (with `nf-core schema build .`). 6. Add any new flags/options to the help message (for integer/text parameters, print to help the corresponding `nextflow.config` parameter). 7. Add sanity checks for all relevant parameters. 8. Add any new software to the `scrape_software_versions.py` script in `bin/` and the version command to the `scrape_software_versions` process in `main.nf`. @@ -87,7 +87,7 @@ Once there, use `nf-core schema build .` to add to `nextflow_schema.json`. ### Default processes resource requirements -Sensible defaults for process resource requirements (CPUs / memory / time) for a process should be defined in `conf/base.config`. These should generally be specified generic with `withLabel:` selectors so they can be shared across multiple processes/steps of the pipeline. A nf-core standard set of labels that should be followed where possible can be seen in the [nf-core pipeline template](https://github.com/nf-core/tools/blob/master/nf_core/pipeline-template/%7B%7Bcookiecutter.name_noslash%7D%7D/conf/base.config), which has the default process as a single core-process, and then different levels of multi-core configurations for increasingly large memory requirements defined with standardised labels. +Sensible defaults for process resource requirements (CPUs / memory / time) for a process should be defined in `conf/base.config`. These should generally be specified generically with `withLabel:` selectors so they can be shared across multiple processes/steps of the pipeline. An nf-core standard set of labels that should be followed where possible can be seen in the [nf-core pipeline template](https://github.com/nf-core/tools/blob/master/nf_core/pipeline-template/conf/base.config), which has the default process as a single-core process, and then different levels of multi-core configurations for increasingly large memory requirements defined with standardised labels. The process resources can be passed on to the tool dynamically within the process with the `${task.cpus}` and `${task.memory}` variables in the `script:` block.
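As a concrete illustration of the paragraph above, here is a minimal, hypothetical sketch of how label-based defaults in `conf/base.config` connect to `${task.cpus}` and `${task.memory}` inside a process `script:` block. The label follows the nf-core convention (`process_low`), but the process name, tool and flags are invented for illustration and are not code from this pipeline:

```groovy
// conf/base.config -- illustrative defaults, shared via withLabel: selectors
process {
    cpus   = 1
    memory = 7.GB
    time   = 4.h

    withLabel:process_low {
        cpus   = 2
        memory = 14.GB
        time   = 6.h
    }
}
```

```groovy
// main.nf -- hypothetical DSL2 process forwarding its resources to the tool
nextflow.enable.dsl = 2

process EXAMPLE_TOOL {
    label 'process_low'

    input:
    path reads

    output:
    path "*.out"

    script:
    """
    example_tool --threads ${task.cpus} --max-mem ${task.memory.toGiga()}G ${reads} > ${reads}.out
    """
}

workflow {
    EXAMPLE_TOOL(Channel.fromPath(params.input))
}
```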
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index e15489cb..7499b0e0 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -55,7 +55,7 @@ Have you provided the following extra information/files: ## Container engine -- Engine: +- Engine: - version: - Image tag: diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index fdb17e5e..b151f6e7 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -1,6 +1,6 @@ --- name: Feature request -about: Suggest an idea for the nf-core website +about: Suggest an idea for the nf-core/ampliseq pipeline labels: enhancement --- diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 28ecbbc2..8226e8df 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -10,14 +10,15 @@ Remember that PRs should be made against the dev branch, unless you're preparing Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/ampliseq/tree/master/.github/CONTRIBUTING.md) --> + ## PR checklist - [ ] This comment contains a description of changes (with reason). - [ ] If you've fixed a bug or added code that should be tested, add tests! - - [ ] If you've added a new tool - add to the software_versions process and a regex to `scrape_software_versions.py` - - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/ampliseq/tree/master/.github/CONTRIBUTING.md) - - [ ] If necessary, also make a PR on the nf-core/ampliseq _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. + - [ ] If you've added a new tool - add to the software_versions process and a regex to `scrape_software_versions.py` + - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/ampliseq/tree/master/.github/CONTRIBUTING.md) + - [ ] If necessary, also make a PR on the nf-core/ampliseq _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. - [ ] Make sure your code lints (`nf-core lint .`). - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). - [ ] Usage Documentation in `docs/usage.md` is updated.
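For the checklist item about the software_versions process: in nf-core pipelines each tool's version is typically captured once so it can be reported by MultiQC. The sketch below is only an illustration of that pattern, not this pipeline's actual code; the process layout in `main.nf` may differ, and `newtool` is a placeholder for whatever tool is being added. A matching regex entry would also be added to `bin/scrape_software_versions.py`.

```groovy
// Illustrative software_versions-style process; the existing command shown
// (cutadapt) and the output file name are examples, not verbatim pipeline code.
process get_software_versions {
    publishDir "${params.outdir}/pipeline_info", mode: 'copy'

    output:
    path 'software_versions_mqc.yaml'

    script:
    """
    cutadapt --version > v_cutadapt.txt
    newtool --version > v_newtool.txt    # version command for the newly added (hypothetical) tool
    scrape_software_versions.py > software_versions_mqc.yaml
    """
}
```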
diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index 2ead208a..e8a8ab02 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -9,6 +9,16 @@ on: types: [completed] workflow_dispatch: + +env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + TOWER_ACCESS_TOKEN: ${{ secrets.AWS_TOWER_TOKEN }} + AWS_JOB_DEFINITION: ${{ secrets.AWS_JOB_DEFINITION }} + AWS_JOB_QUEUE: ${{ secrets.AWS_JOB_QUEUE }} + AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }} + + jobs: run-awstest: name: Run AWS full tests @@ -23,21 +33,14 @@ jobs: - name: Install awscli run: conda install -c conda-forge awscli - name: Start AWS batch job - # TODO nf-core: You can customise AWS full pipeline tests as required + # nf-core: You can customise AWS full pipeline tests as required # Add full size test data (but still relatively small datasets for few samples) # on the `test_full.config` test runs with only one set of parameters # Then specify `-profile test_full` instead of `-profile test` on the AWS batch command - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - TOWER_ACCESS_TOKEN: ${{ secrets.AWS_TOWER_TOKEN }} - AWS_JOB_DEFINITION: ${{ secrets.AWS_JOB_DEFINITION }} - AWS_JOB_QUEUE: ${{ secrets.AWS_JOB_QUEUE }} - AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }} run: | aws batch submit-job \ --region eu-west-1 \ --job-name nf-core-ampliseq \ --job-queue $AWS_JOB_QUEUE \ --job-definition $AWS_JOB_DEFINITION \ - --container-overrides '{"command": ["nf-core/ampliseq", "-r '"${GITHUB_SHA}"' -profile test --outdir s3://'"${AWS_S3_BUCKET}"'/ampliseq/results-'"${GITHUB_SHA}"' -w s3://'"${AWS_S3_BUCKET}"'/ampliseq/work-'"${GITHUB_SHA}"' -with-tower"], "environment": [{"name": "TOWER_ACCESS_TOKEN", "value": "'"$TOWER_ACCESS_TOKEN"'"}]}' + --container-overrides '{"command": ["nf-core/ampliseq", "-r '"${GITHUB_SHA}"' -profile test_full --outdir s3://'"${AWS_S3_BUCKET}"'/ampliseq/results-'"${GITHUB_SHA}"' -w s3://'"${AWS_S3_BUCKET}"'/ampliseq/work-'"${GITHUB_SHA}"' -with-tower"], "environment": [{"name": "TOWER_ACCESS_TOKEN", "value": "'"$TOWER_ACCESS_TOKEN"'"}]}' diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index 87f03f0d..6834183b 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -6,6 +6,16 @@ name: nf-core AWS test on: workflow_dispatch: + +env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + TOWER_ACCESS_TOKEN: ${{ secrets.AWS_TOWER_TOKEN }} + AWS_JOB_DEFINITION: ${{ secrets.AWS_JOB_DEFINITION }} + AWS_JOB_QUEUE: ${{ secrets.AWS_JOB_QUEUE }} + AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }} + + jobs: run-awstest: name: Run AWS tests @@ -20,16 +30,9 @@ jobs: - name: Install awscli run: conda install -c conda-forge awscli - name: Start AWS batch job - # TODO nf-core: You can customise CI pipeline run tests as required + # nf-core: You can customise CI pipeline run tests as required # For example: adding multiple test runs with different parameters # Remember that you can parallelise this by using strategy.matrix - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - TOWER_ACCESS_TOKEN: ${{ secrets.AWS_TOWER_TOKEN }} - AWS_JOB_DEFINITION: ${{ secrets.AWS_JOB_DEFINITION }} - AWS_JOB_QUEUE: ${{ secrets.AWS_JOB_QUEUE }} - AWS_S3_BUCKET: ${{ 
secrets.AWS_S3_BUCKET }} run: | aws batch submit-job \ --region eu-west-1 \ diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index 4c5957a1..0cd01373 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -13,7 +13,7 @@ jobs: - name: Check PRs if: github.repository == 'nf-core/ampliseq' run: | - { [[ ${{github.event.pull_request.head.repo.full_name}} == nf-core/ampliseq ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] + { [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/ampliseq ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] # If the above check failed, post a comment on the PR explaining the failure @@ -23,13 +23,22 @@ jobs: uses: mshick/add-pr-comment@v1 with: message: | + ## This PR is against the `master` branch :x: + + * Do not close this PR + * Click _Edit_ and change the `base` to `dev` + * This CI test will remain failed until you push a new commit + + --- + Hi @${{ github.event.pull_request.user.login }}, - It looks like this pull-request is has been made against the ${{github.event.pull_request.head.repo.full_name}} `master` branch. + It looks like this pull-request has been made against the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `master` branch. The `master` branch on nf-core repositories should always contain code from the latest release. - Because of this, PRs to `master` are only allowed if they come from the ${{github.event.pull_request.head.repo.full_name}} `dev` branch. + Because of this, PRs to `master` are only allowed if they come from the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `dev` branch. You do not need to close this PR, you can change the target branch to `dev` by clicking the _"Edit"_ button at the top of this page. + Note that even after this, the test will continue to show as failing until you push a new commit. Thanks again for your contribution! repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 861e3d42..db2f00db 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,6 +8,9 @@ on: release: types: [published] +# Uncomment if we need an edge release of Nextflow again +# env: NXF_EDGE: 1 + jobs: test: name: Run workflow tests @@ -20,28 +23,11 @@ jobs: strategy: matrix: # Nextflow versions: check pipeline minimum and current latest - nxf_ver: ['20.04.0', ''] + nxf_ver: ['21.04.0', ''] steps: - name: Check out pipeline code uses: actions/checkout@v2 - - name: Check if Dockerfile or Conda environment changed - uses: technote-space/get-diff-action@v4 - with: - FILES: | - Dockerfile - environment.yml - - - name: Build new docker image - if: env.MATCHED_FILES - run: docker build --no-cache .
-t nfcore/ampliseq:1.2.0 - - - name: Pull docker image - if: ${{ !env.MATCHED_FILES }} - run: | - docker pull nfcore/ampliseq:dev - docker tag nfcore/ampliseq:dev nfcore/ampliseq:1.2.0 - - name: Install Nextflow env: CAPSULE_LOG: none @@ -52,15 +38,29 @@ jobs: - name: Run pipeline with test data run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker - - name: Run pipeline with multi test data - run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_multi,docker - - name: Run pipeline with manifest input - run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_manifest,docker - - name: Run pipeline with ITS PacBio reads and Unite database - run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_pacbio_its,docker - - name: Run pipeline with double cutadapt steps on test data with double primers + + profiles: + name: Run workflow profile + # Only run on push if this is the nf-core dev branch (merged PRs) + if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/ampliseq') }} + runs-on: ubuntu-latest + env: + NXF_VER: '21.04.0' + NXF_ANSI_LOG: false + strategy: + matrix: + # Run remaining test profiles with minimum nextflow version + profile: [test_multi, test_pacbio_its, test_doubleprimers, test_iontorrent] + steps: + - name: Check out pipeline code + uses: actions/checkout@v2 + + - name: Install Nextflow + env: + CAPSULE_LOG: none + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + - name: Run pipeline with ${{ matrix.profile }} test profile run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_doubleprimers,docker + nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.profile }},docker diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 48ac0e19..fcde400c 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -19,6 +19,34 @@ jobs: run: npm install -g markdownlint-cli - name: Run Markdownlint run: markdownlint ${GITHUB_WORKSPACE} -c ${GITHUB_WORKSPACE}/.github/markdownlint.yml + + # If the above check failed, post a comment on the PR explaining the failure + - name: Post PR comment + if: failure() + uses: mshick/add-pr-comment@v1 + with: + message: | + ## Markdown linting is failing + + To keep the code consistent with lots of contributors, we run automated code consistency checks. + To fix this CI test, please run: + + * Install `markdownlint-cli` + * On Mac: `brew install markdownlint-cli` + * Everything else: [Install `npm`](https://www.npmjs.com/get-npm) then [install `markdownlint-cli`](https://www.npmjs.com/package/markdownlint-cli) (`npm install -g markdownlint-cli`) + * Fix the markdown errors + * Automatically: `markdownlint . --config .github/markdownlint.yml --fix` + * Manually resolve anything left from `markdownlint . --config .github/markdownlint.yml` + + Once you push these changes the test should pass, and you can hide this comment :+1: + + We highly recommend setting up markdownlint in your code editor so that this formatting is done automatically on save. Ask about it on Slack for help! + + Thanks again for your contribution! 
+ repo-token: ${{ secrets.GITHUB_TOKEN }} + allow-repeats: false + + YAML: runs-on: ubuntu-latest steps: @@ -29,7 +57,34 @@ jobs: - name: Install yaml-lint run: npm install -g yaml-lint - name: Run yaml-lint - run: yamllint $(find ${GITHUB_WORKSPACE} -type f -name "*.yml") + run: yamllint $(find ${GITHUB_WORKSPACE} -type f -name "*.yml" -o -name "*.yaml") + + # If the above check failed, post a comment on the PR explaining the failure + - name: Post PR comment + if: failure() + uses: mshick/add-pr-comment@v1 + with: + message: | + ## YAML linting is failing + + To keep the code consistent with lots of contributors, we run automated code consistency checks. + To fix this CI test, please run: + + * Install `yaml-lint` + * [Install `npm`](https://www.npmjs.com/get-npm) then [install `yaml-lint`](https://www.npmjs.com/package/yaml-lint) (`npm install -g yaml-lint`) + * Fix the YAML errors + * Run the test locally: `yamllint $(find . -type f -name "*.yml" -o -name "*.yaml")` + * Fix any reported errors in your YAML files + + Once you push these changes the test should pass, and you can hide this comment :+1: + + We highly recommend setting up yaml-lint in your code editor so that this formatting is done automatically on save. Ask about it on Slack for help! + + Thanks again for your contribution! + repo-token: ${{ secrets.GITHUB_TOKEN }} + allow-repeats: false + + nf-core: runs-on: ubuntu-latest steps: @@ -43,6 +98,7 @@ jobs: run: | wget -qO- get.nextflow.io | bash sudo mv nextflow /usr/local/bin/ + - uses: actions/setup-python@v1 with: python-version: '3.6' @@ -68,7 +124,7 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@v2 with: - name: linting-log-file + name: linting-logs path: | lint_log.txt lint_results.md diff --git a/.github/workflows/push_dockerhub_dev.yml b/.github/workflows/push_dockerhub_dev.yml deleted file mode 100644 index 6f213cb7..00000000 --- a/.github/workflows/push_dockerhub_dev.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: nf-core Docker push (dev) -# This builds the docker image and pushes it to DockerHub -# Runs on nf-core repo releases and push event to 'dev' branch (PR merges) -on: - push: - branches: - - dev - -jobs: - push_dockerhub: - name: Push new Docker image to Docker Hub (dev) - runs-on: ubuntu-latest - # Only run for the nf-core repo, for releases and merged PRs - if: ${{ github.repository == 'nf-core/ampliseq' }} - env: - DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} - DOCKERHUB_PASS: ${{ secrets.DOCKERHUB_PASS }} - steps: - - name: Check out pipeline code - uses: actions/checkout@v2 - - - name: Build new docker image - run: docker build --no-cache .
-t nfcore/ampliseq:dev - - - name: Push Docker image to DockerHub (dev) - run: | - echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin - docker push nfcore/ampliseq:dev diff --git a/.github/workflows/push_dockerhub_release.yml b/.github/workflows/push_dockerhub_release.yml deleted file mode 100644 index 2100b740..00000000 --- a/.github/workflows/push_dockerhub_release.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: nf-core Docker push (release) -# This builds the docker image and pushes it to DockerHub -# Runs on nf-core repo releases and push event to 'dev' branch (PR merges) -on: - release: - types: [published] - -jobs: - push_dockerhub: - name: Push new Docker image to Docker Hub (release) - runs-on: ubuntu-latest - # Only run for the nf-core repo, for releases and merged PRs - if: ${{ github.repository == 'nf-core/ampliseq' }} - env: - DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} - DOCKERHUB_PASS: ${{ secrets.DOCKERHUB_PASS }} - steps: - - name: Check out pipeline code - uses: actions/checkout@v2 - - - name: Build new docker image - run: docker build --no-cache . -t nfcore/ampliseq:latest - - - name: Push Docker image to DockerHub (release) - run: | - echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin - docker push nfcore/ampliseq:latest - docker tag nfcore/ampliseq:latest nfcore/ampliseq:${{ github.event.release.tag_name }} - docker push nfcore/ampliseq:${{ github.event.release.tag_name }} diff --git a/.gitignore b/.gitignore index aa4bb5b3..c90b6055 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,9 @@ tests/ testing/ testing* *.pyc +.*.sw? +.Rproj.user +.Rhistory +.screenrc +ampliseq.Rproj +results_test/* diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d0ad3e6..63a888f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,57 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
-## nf-core/ampliseq version 1.2.0 "Teal Bronze Lion" - 2021 +## nf-core/ampliseq version 2.0.0 "Blue Copper Kangaroo" - 2021-06-29 + +Re-wrote the whole pipeline in Nextflow [DSL2](https://www.nextflow.io/docs/latest/dsl2.html) instead of DSL1 + +### `Added` + +* [#229](https://github.com/nf-core/ampliseq/pull/229) - `--single_end` for single-ended Illumina data +* [#229](https://github.com/nf-core/ampliseq/pull/229), [#245](https://github.com/nf-core/ampliseq/pull/245), [#267](https://github.com/nf-core/ampliseq/pull/267) - Taxonomic classification with DADA2 +* [#229](https://github.com/nf-core/ampliseq/pull/229) - `--dada_ref_taxonomy` for taxonomic classification with DADA2's assignTaxonomy and addSpecies functions +* [#278](https://github.com/nf-core/ampliseq/pull/278) - `--qiime_ref_taxonomy` for taxonomic classification with QIIME2 +* [#239](https://github.com/nf-core/ampliseq/pull/239) - Support of RDP database for DADA2 classification +* [#237](https://github.com/nf-core/ampliseq/pull/237) - Support of UNITE database for DADA2 classification +* [#229](https://github.com/nf-core/ampliseq/pull/229) - `--input` may point (1) at a fasta file ending with `.fasta`/`.fna`/`.fa` that will be taxonomically classified, (2) at a sample sheet ending with `.tsv` that allows analysis of multiple sequencing runs by reading the optional column `run`, or (3) at a folder input +* [#229](https://github.com/nf-core/ampliseq/pull/229) - `--sample_inference`, `--concatenate_reads`, `--illumina_pe_its`; please check the documentation for their function +* [#275](https://github.com/nf-core/ampliseq/pull/275) - Read count summary +* [#274](https://github.com/nf-core/ampliseq/pull/274) - `--skip_qiime` to prevent any steps that are executed with QIIME2 +* [#272](https://github.com/nf-core/ampliseq/pull/272) - `--cut_its` to cut ASV sequence to ITS region before performing taxonomic classification with DADA2 +* [#280](https://github.com/nf-core/ampliseq/pull/280) - Added support for IonTorrent data +* [#283](https://github.com/nf-core/ampliseq/pull/283) - `--cut_dada_ref_taxonomy` allows extracting expected amplicons from DADA2 reference taxonomy database + +### `Changed` + +* [#254](https://github.com/nf-core/ampliseq/pull/254) - Updated CamelCase parameters to be lower_case_snake_case: + * `multipleSequencingRuns` to `multiple_sequencing_runs` + * `minLen` to `min_len` + * `maxLen` to `max_len` + * `maxEE` to `max_ee` +* [#277](https://github.com/nf-core/ampliseq/pull/277) - Requires nextflow version `>= 21.04.0` + +### `Fixed` + +* [#273](https://github.com/nf-core/ampliseq/pull/273) - Template update for nf-core/tools version 1.14 + +### `Dependencies` + +* [#272](https://github.com/nf-core/ampliseq/pull/272) - New dependency ITSx v1.1.3 +* [#229](https://github.com/nf-core/ampliseq/pull/229) - Updated from cutadapt v2.8 to v3.2 +* [#229](https://github.com/nf-core/ampliseq/pull/229) - Updated DADA2 from v1.10 to v1.18.0, now not using QIIME2 for ASV generation any more +* [#229](https://github.com/nf-core/ampliseq/pull/229) - Updated QIIME2 to v2021.2 + +### `Removed` + +* [#229](https://github.com/nf-core/ampliseq/pull/229) - `--manifest` is superseded by `--input` that can now also handle a sample sheet file input (required extension: `.tsv`) +* [#229](https://github.com/nf-core/ampliseq/pull/229) - `--Q2imported` and `untilQ2import` are removed because pausing at that point is not necessary +* [#229](https://github.com/nf-core/ampliseq/pull/229) - `--split` is no longer supported, therefore all
sample IDs have to be unique +* [#229](https://github.com/nf-core/ampliseq/pull/229) - `--classifier_removeHash` and `--qiime_timezone` became unnecessary +* [#229](https://github.com/nf-core/ampliseq/pull/229) - `--onlyDenoising` is deprecated in favour of `--skip_taxonomy` (which does the exact same thing) +* `--taxon_reference` became unnecessary +* [#229](https://github.com/nf-core/ampliseq/pull/229) - `--reference_database` and `--dereplication` are not supported any more. `--qiime_ref_taxonomy` now allows choosing a taxonomic reference + +## nf-core/ampliseq version 1.2.0 "Teal Bronze Lion" - 2021-02-04 ### `Added` @@ -28,7 +78,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Deprecated` -## nf-core/ampliseq version 1.1.3 - 2020 +## nf-core/ampliseq version 1.1.3 - 2020-11-02 ### `Added` @@ -52,12 +102,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * `--reads` is replaced by `--input` due to nf-core/tools v1.10.2 -## nf-core/ampliseq version 1.1.2 - 2019 +## nf-core/ampliseq version 1.1.2 - 2019-12-19 * No further changes, except a bugfix for the [timezone](https://github.com/nf-core/ampliseq/issues/114) issue found by @marchoeppner * Specification of `--qiime_timezone` might be required to run the analysis appropriately -## nf-core/ampliseq version 1.1.1 - 2019 +## nf-core/ampliseq version 1.1.1 - 2019-12-09 ### Pipeline Updates @@ -68,7 +118,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * [#78](https://github.com/nf-core/ampliseq/issues/78) - All sequenced classifed to the same species -## nf-core/ampliseq version 1.1.0 "Silver Lime Bee" - 2019 +## nf-core/ampliseq version 1.1.0 "Silver Lime Bee" - 2019-07-15 ### Pipeline updates diff --git a/CITATIONS.md b/CITATIONS.md new file mode 100644 index 00000000..5ceed36f --- /dev/null +++ b/CITATIONS.md @@ -0,0 +1,87 @@ +# nf-core/ampliseq: Citations + +## [nf-core/ampliseq](https://pubmed.ncbi.nlm.nih.gov/33193131/) + +> Straub D, Blackwell N, Langarica-Fuentes A, Peltzer A, Nahnsen S, Kleindienst S. Interpretations of Environmental Microbial Community Studies Are Biased by the Selected 16S rRNA (Gene) Amplicon Sequencing Pipeline. Front Microbiol. 2020 Oct 23;11:550420. doi: 10.3389/fmicb.2020.550420. PMID: 33193131; PMCID: PMC7645116. + +## [nf-core](https://pubmed.ncbi.nlm.nih.gov/32055031/) + +> Ewels PA, Peltzer A, Fillinger S, Patel H, Alneberg J, Wilm A, Garcia MU, Di Tommaso P, Nahnsen S. The nf-core framework for community-curated bioinformatics pipelines. Nat Biotechnol. 2020 Mar;38(3):276-278. doi: 10.1038/s41587-020-0439-x. PubMed PMID: 32055031. + +## [Nextflow](https://pubmed.ncbi.nlm.nih.gov/28398311/) + +> Di Tommaso P, Chatzou M, Floden EW, Barja PP, Palumbo E, Notredame C. Nextflow enables reproducible computational workflows. Nat Biotechnol. 2017 Apr 11;35(4):316-319. doi: 10.1038/nbt.3820. PubMed PMID: 28398311. + +## Pipeline tools + +### Core tools + +* [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) + +* [Cutadapt](https://journal.embnet.org/index.php/embnetjournal/article/view/200/479) + > Martin, M. Cutadapt removes adapter sequences from high-throughput sequencing reads. EMBnet. journal 17.1 (2011): pp-10. doi: 10.14806/ej.17.1.200. + +* [DADA2](https://pubmed.ncbi.nlm.nih.gov/27214047/) + > Callahan BJ, McMurdie PJ, Rosen MJ, Han AW, Johnson AJ, Holmes SP. DADA2: High-resolution sample inference from Illumina amplicon data. Nat Methods. 2016 Jul;13(7):581-3.
doi: 10.1038/nmeth.3869. Epub 2016 May 23. PMID: 27214047; PMCID: PMC4927377. + +### Taxonomic classification and database (only one database) + +* Classification by [QIIME2 classifier](https://pubmed.ncbi.nlm.nih.gov/29773078/) + > Bokulich NA, Kaehler BD, Rideout JR, Dillon M, Bolyen E, Knight R, Huttley GA, Gregory Caporaso J. Optimizing taxonomic classification of marker-gene amplicon sequences with QIIME 2's q2-feature-classifier plugin. Microbiome. 2018 May 17;6(1):90. doi: 10.1186/s40168-018-0470-z. PMID: 29773078; PMCID: PMC5956843. + +* default: [SILVA](https://pubmed.ncbi.nlm.nih.gov/23193283/) + > Quast C, Pruesse E, Yilmaz P, Gerken J, Schweer T, Yarza P, Peplies J, Glöckner FO. The SILVA ribosomal RNA gene database project: improved data processing and web-based tools. Nucleic Acids Res. 2013 Jan;41(Database issue):D590-6. doi: 10.1093/nar/gks1219. Epub 2012 Nov 28. PMID: 23193283; PMCID: PMC3531112. + +* [PR2 - Protist Reference Ribosomal Database](https://pubmed.ncbi.nlm.nih.gov/23193267/) + > Guillou L, Bachar D, Audic S, Bass D, Berney C, Bittner L, Boutte C, Burgaud G, de Vargas C, Decelle J, Del Campo J, Dolan JR, Dunthorn M, Edvardsen B, Holzmann M, Kooistra WH, Lara E, Le Bescot N, Logares R, Mahé F, Massana R, Montresor M, Morard R, Not F, Pawlowski J, Probert I, Sauvadet AL, Siano R, Stoeck T, Vaulot D, Zimmermann P, Christen R. The Protist Ribosomal Reference database (PR2): a catalog of unicellular eukaryote small sub-unit rRNA sequences with curated taxonomy. Nucleic Acids Res. 2013 Jan;41(Database issue):D597-604. doi: 10.1093/nar/gks1160. Epub 2012 Nov 27. PMID: 23193267; PMCID: PMC3531120. + +* [GTDB - Genome Taxonomy Database](https://pubmed.ncbi.nlm.nih.gov/30148503/) + > Parks DH, Chuvochina M, Waite DW, Rinke C, Skarshewski A, Chaumeil PA, Hugenholtz P. A standardized bacterial taxonomy based on genome phylogeny substantially revises the tree of life. Nat Biotechnol. 2018 Nov;36(10):996-1004. doi: 10.1038/nbt.4229. Epub 2018 Aug 27. PMID: 30148503. + +* [RDP - Ribosomal Database Project](https://pubmed.ncbi.nlm.nih.gov/24288368/) + > Cole JR, Wang Q, Fish JA, Chai B, McGarrell DM, Sun Y, Brown CT, Porras-Alfaro A, Kuske CR, Tiedje JM. Ribosomal Database Project: data and tools for high throughput rRNA analysis. Nucleic Acids Res. 2014 Jan;42(Database issue):D633-42. doi: 10.1093/nar/gkt1244. Epub 2013 Nov 27. PMID: 24288368; PMCID: PMC3965039. + +* [UNITE - eukaryotic nuclear ribosomal ITS region](https://pubmed.ncbi.nlm.nih.gov/15869663/) + > Kõljalg U, Larsson KH, Abarenkov K, Nilsson RH, Alexander IJ, Eberhardt U, Erland S, Høiland K, Kjøller R, Larsson E, Pennanen T, Sen R, Taylor AF, Tedersoo L, Vrålstad T, Ursing BM. UNITE: a database providing web-based methods for the molecular identification of ectomycorrhizal fungi. New Phytol. 2005 Jun;166(3):1063-8. doi: 10.1111/j.1469-8137.2005.01376.x. PMID: 15869663. 
+ +### Downstream analysis + +* [QIIME2](https://pubmed.ncbi.nlm.nih.gov/31341288/) + > Bolyen E, Rideout JR, Dillon MR, Bokulich NA, Abnet CC, Al-Ghalith GA, Alexander H, Alm EJ, Arumugam M, Asnicar F, Bai Y, Bisanz JE, Bittinger K, Brejnrod A, Brislawn CJ, Brown CT, Callahan BJ, Caraballo-Rodríguez AM, Chase J, Cope EK, Da Silva R, Diener C, Dorrestein PC, Douglas GM, Durall DM, Duvallet C, Edwardson CF, Ernst M, Estaki M, Fouquier J, Gauglitz JM, Gibbons SM, Gibson DL, Gonzalez A, Gorlick K, Guo J, Hillmann B, Holmes S, Holste H, Huttenhower C, Huttley GA, Janssen S, Jarmusch AK, Jiang L, Kaehler BD, Kang KB, Keefe CR, Keim P, Kelley ST, Knights D, Koester I, Kosciolek T, Kreps J, Langille MGI, Lee J, Ley R, Liu YX, Loftfield E, Lozupone C, Maher M, Marotz C, Martin BD, McDonald D, McIver LJ, Melnik AV, Metcalf JL, Morgan SC, Morton JT, Naimey AT, Navas-Molina JA, Nothias LF, Orchanian SB, Pearson T, Peoples SL, Petras D, Preuss ML, Pruesse E, Rasmussen LB, Rivers A, Robeson MS 2nd, Rosenthal P, Segata N, Shaffer M, Shiffer A, Sinha R, Song SJ, Spear JR, Swafford AD, Thompson LR, Torres PJ, Trinh P, Tripathi A, Turnbaugh PJ, Ul-Hasan S, van der Hooft JJJ, Vargas F, Vázquez-Baeza Y, Vogtmann E, von Hippel M, Walters W, Wan Y, Wang M, Warren J, Weber KC, Williamson CHD, Willis AD, Xu ZZ, Zaneveld JR, Zhang Y, Zhu Q, Knight R, Caporaso JG. Reproducible, interactive, scalable and extensible microbiome data science using QIIME 2. Nat Biotechnol. 2019 Aug;37(8):852-857. doi: 10.1038/s41587-019-0209-9. Erratum in: Nat Biotechnol. 2019 Sep;37(9):1091. PMID: 31341288; PMCID: PMC7015180. + +* [MAFFT](https://pubmed.ncbi.nlm.nih.gov/23329690/) + > Katoh K, Standley DM. MAFFT multiple sequence alignment software version 7: improvements in performance and usability. Mol Biol Evol. 2013 Apr;30(4):772-80. doi: 10.1093/molbev/mst010. Epub 2013 Jan 16. PMID: 23329690; PMCID: PMC3603318. + +* [ANCOM](https://pubmed.ncbi.nlm.nih.gov/26028277/) + > Mandal S, Van Treuren W, White RA, Eggesbø M, Knight R, Peddada SD. Analysis of composition of microbiomes: a novel method for studying microbial composition. Microb Ecol Health Dis. 2015 May 29;26:27663. doi: 10.3402/mehd.v26.27663. PMID: 26028277; PMCID: PMC4450248. + +### Non-default tools + +* [ITSx](https://besjournals.onlinelibrary.wiley.com/doi/10.1111/2041-210X.12073) + > Bengtsson-Palme, J., Ryberg, M., Hartmann, M., Branco, S., Wang, Z., Godhe, A., De Wit, P., Sánchez-García, M., Ebersberger, I., de Sousa, F., Amend, A., Jumpponen, A., Unterseher, M., Kristiansson, E., Abarenkov, K., Bertrand, Y.J.K., Sanli, K., Eriksson, K.M., Vik, U., Veldre, V. and Nilsson, R.H.. Improved software detection and extraction of ITS1 and ITS2 from ribosomal ITS sequences of fungi and other eukaryotes for analysis of environmental sequencing data. Methods Ecol Evol 2013, 4: 914-919. doi: 10.1111/2041-210X.12073. + +### Summarizing software + +* [MultiQC](https://doi.org/10.1093/bioinformatics/btw354) + > Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. doi: doi.org/10.1093/bioinformatics/btw354. + +## Data + +* [Full-size test data](https://doi.org/10.3389/fmicb.2020.550420) + > Straub D, Blackwell N, Langarica-Fuentes A, Peltzer A, Nahnsen S, Kleindienst S. Interpretations of Environmental Microbial Community Studies Are Biased by the Selected 16S rRNA (Gene) Amplicon Sequencing Pipeline. Front Microbiol. 2020 Oct 23;11:550420. 
doi: 10.3389/fmicb.2020.550420. PMID: 33193131; PMCID: PMC7645116. + +## Software packaging/containerisation tools + +* [Anaconda](https://anaconda.com) + > Anaconda Software Distribution. Computer software. Vers. 2-2.4.0. Anaconda, Nov. 2016. Web. + +* [Bioconda](https://pubmed.ncbi.nlm.nih.gov/29967506/) + > Grüning B, Dale R, Sjödin A, Chapman BA, Rowe J, Tomkins-Tinch CH, Valieris R, Köster J; Bioconda Team. Bioconda: sustainable and comprehensive software distribution for the life sciences. Nat Methods. 2018 Jul;15(7):475-476. doi: 10.1038/s41592-018-0046-7. PubMed PMID: 29967506. + +* [BioContainers](https://pubmed.ncbi.nlm.nih.gov/28379341/) + > da Veiga Leprevost F, Grüning B, Aflitos SA, Röst HL, Uszkoreit J, Barsnes H, Vaudel M, Moreno P, Gatto L, Weber J, Bai M, Jimenez RC, Sachsenberg T, Pfeuffer J, Alvarez RV, Griss J, Nesvizhskii AI, Perez-Riverol Y. BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics. 2017 Aug 15;33(16):2580-2582. doi: 10.1093/bioinformatics/btx192. PubMed PMID: 28379341; PubMed Central PMCID: PMC5870671. + +* [Docker](https://dl.acm.org/doi/10.5555/2600239.2600241) + +* [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/) + > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675. diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 405fb1bf..f4fd052f 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,46 +1,111 @@ -# Contributor Covenant Code of Conduct +# Code of Conduct at nf-core (v1.0) ## Our Pledge -In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. +In the interest of fostering an open, collaborative, and welcoming environment, we as contributors and maintainers of nf-core, pledge to making participation in our projects and community a harassment-free experience for everyone, regardless of: -## Our Standards +- Age +- Body size +- Familial status +- Gender identity and expression +- Geographical location +- Level of experience +- Nationality and national origins +- Native language +- Physical and neurological ability +- Race or ethnicity +- Religion +- Sexual identity and orientation +- Socioeconomic status -Examples of behavior that contributes to creating a positive environment include: +Please note that the list above is alphabetised and is therefore not ranked in any order of preference or importance. -* Using welcoming and inclusive language -* Being respectful of differing viewpoints and experiences -* Gracefully accepting constructive criticism -* Focusing on what is best for the community -* Showing empathy towards other community members +## Preamble -Examples of unacceptable behavior by participants include: +> Note: This Code of Conduct (CoC) has been drafted by the nf-core Safety Officer and been edited after input from members of the nf-core team and others. 
"We", in this document, refers to the Safety Officer and members of the nf-core core team, both of whom are deemed to be members of the nf-core community and are therefore required to abide by this Code of Conduct. This document will amended periodically to keep it up-to-date, and in case of any dispute, the most current version will apply. -* The use of sexualized language or imagery and unwelcome sexual attention or advances -* Trolling, insulting/derogatory comments, and personal or political attacks -* Public or private harassment -* Publishing others' private information, such as a physical or electronic address, without explicit permission -* Other conduct which could reasonably be considered inappropriate in a professional setting +An up-to-date list of members of the nf-core core team can be found [here](https://nf-co.re/about). Our current safety officer is Renuka Kudva. + +nf-core is a young and growing community that welcomes contributions from anyone with a shared vision for [Open Science Policies](https://www.fosteropenscience.eu/taxonomy/term/8). Open science policies encompass inclusive behaviours and we strive to build and maintain a safe and inclusive environment for all individuals. + +We have therefore adopted this code of conduct (CoC), which we require all members of our community and attendees in nf-core events to adhere to in all our workspaces at all times. Workspaces include but are not limited to Slack, meetings on Zoom, Jitsi, YouTube live etc. + +Our CoC will be strictly enforced and the nf-core team reserve the right to exclude participants who do not comply with our guidelines from our workspaces and future nf-core activities. + +We ask all members of our community to help maintain a supportive and productive workspace and to avoid behaviours that can make individuals feel unsafe or unwelcome. Please help us maintain and uphold this CoC. + +Questions, concerns or ideas on what we can include? Contact safety [at] nf-co [dot] re ## Our Responsibilities -Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. +The safety officer is responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behaviour. + +The safety officer in consultation with the nf-core core team have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. + +Members of the core team or the safety officer who violate the CoC will be required to recuse themselves pending investigation. They will not have access to any reports of the violations and be subject to the same actions as others in violation of the CoC. + +## When are where does this Code of Conduct apply? + +Participation in the nf-core community is contingent on following these guidelines in all our workspaces and events. This includes but is not limited to the following listed alphabetically and therefore in no order of preference: + +- Communicating with an official project email address. +- Communicating with community members within the nf-core Slack channel. 
+- Participating in hackathons organised by nf-core (both online and in-person events). +- Participating in collaborative work on GitHub, Google Suite, community calls, mentorship meetings, email correspondence. +- Participating in workshops, training, and seminar series organised by nf-core (both online and in-person events). This applies to events hosted on web-based platforms such as Zoom, Jitsi, YouTube live etc. +- Representing nf-core on social media. This includes both official and personal accounts. + +## nf-core cares 😊 + +nf-core's CoC and expectations of respectful behaviours for all participants (including organisers and the nf-core team) include but are not limited to the following (listed in alphabetical order): + +- Ask for consent before sharing another community member’s personal information (including photographs) on social media. +- Be respectful of differing viewpoints and experiences. We are all here to learn from one another and a difference in opinion can present a good learning opportunity. +- Celebrate your accomplishments at events! (Get creative with your use of emojis 🎉 🥳 💯 🙌 !) +- Demonstrate empathy towards other community members. (We don’t all have the same amount of time to dedicate to nf-core. If tasks are pending, don’t hesitate to gently remind members of your team. If you are leading a task, ask for help if you feel overwhelmed.) +- Engage with and enquire after others. (This is especially important given the geographically remote nature of the nf-core community, so let’s do this the best we can) +- Focus on what is best for the team and the community. (When in doubt, ask) +- Graciously accept constructive criticism, yet be unafraid to question, deliberate, and learn. +- Introduce yourself to members of the community. (We’ve all been outsiders and we know that talking to strangers can be hard for some, but remember we’re interested in getting to know you and your visions for open science!) +- Show appreciation and **provide clear feedback**. (This is especially important because we don’t see each other in person and it can be harder to interpret subtleties. Also remember that not everyone understands a certain language to the same extent as you do, so **be clear in your communications to be kind.**) +- Take breaks when you feel like you need them. +- Using welcoming and inclusive language. (Participants are encouraged to display their chosen pronouns on Zoom or in communication on Slack.) + +## nf-core frowns on 😕 + +The following behaviours from any participants within the nf-core community (including the organisers) will be considered unacceptable under this code of conduct. Engaging or advocating for any of the following could result in expulsion from nf-core workspaces. + +- Deliberate intimidation, stalking or following and sustained disruption of communication among participants of the community. This includes hijacking shared screens through actions such as using the annotate tool in conferencing software such as Zoom. +- “Doxing” i.e. posting (or threatening to post) another person’s personal identifying information online. +- Spamming or trolling of individuals on social media. +- Use of sexual or discriminatory imagery, comments, or jokes and unwelcome sexual attention. +- Verbal and text comments that reinforce social structures of domination related to gender, gender identity and expression, sexual orientation, ability, physical appearance, body size, race, age, religion or work experience. 
+ +### Online Trolling + +The majority of nf-core interactions and events are held online. Unfortunately, holding events online comes with the added issue of online trolling. This is unacceptable, reports of such behaviour will be taken very seriously, and perpetrators will be excluded from activities immediately. + +All community members are required to ask members of the group they are working within for explicit consent prior to taking screenshots of individuals during video calls. + +## Procedures for Reporting CoC violations -Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. +If someone makes you feel uncomfortable through their behaviours or actions, report it as soon as possible. -## Scope +You can reach out to members of the [nf-core core team](https://nf-co.re/about) and they will forward your concerns to the safety officer(s). -This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. +Issues directly concerning members of the core team will be dealt with by other members of the core team and the safety manager, and possible conflicts of interest will be taken into account. nf-core is also in discussions about having an ombudsperson, and details will be shared in due course. -## Enforcement +All reports will be handled with utmost discretion and confidentiality. -Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team on [Slack](https://nf-co.re/join/slack). The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. +## Attribution and Acknowledgements -Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
+- The [Contributor Covenant, version 1.4](http://contributor-covenant.org/version/1/4) +- The [OpenCon 2017 Code of Conduct](http://www.opencon2017.org/code_of_conduct) (CC BY 4.0 OpenCon organisers, SPARC and Right to Research Coalition) +- The [eLife innovation sprint 2020 Code of Conduct](https://sprint.elifesciences.org/code-of-conduct/) +- The [Mozilla Community Participation Guidelines v3.1](https://www.mozilla.org/en-US/about/governance/policies/participation/) (version 3.1, CC BY-SA 3.0 Mozilla) -## Attribution +## Changelog -This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [https://www.contributor-covenant.org/version/1/4/code-of-conduct/][version] +### v1.0 - March 12th, 2021 -[homepage]: https://contributor-covenant.org -[version]: https://www.contributor-covenant.org/version/1/4/code-of-conduct/ +- Complete rewrite from original [Contributor Covenant](http://contributor-covenant.org/) CoC. diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 26fe0b84..00000000 --- a/Dockerfile +++ /dev/null @@ -1,20 +0,0 @@ -FROM nfcore/base:1.12.1 -LABEL authors="Daniel Straub, Alexander Peltzer" \ - description="Docker image containing all software requirements for the nf-core/ampliseq pipeline" - -# Install the conda environment -COPY environment.yml / -RUN conda env create --quiet -f /environment.yml && conda clean -a - -# Add conda installation dir to PATH (instead of doing 'conda activate') -ENV PATH /opt/conda/envs/nf-core-ampliseq-1.2.0/bin:$PATH - -# Dump the details of the installed packages to a file for posterity -RUN conda env export --name nf-core-ampliseq-1.2.0 > nf-core-ampliseq-1.2.0.yml - -# Instruct R processes to use these empty files instead of clashing with a local version -RUN touch .Rprofile -RUN touch .Renviron - -## Don't recache on each execution, do that once per build process -RUN qiime dev refresh-cache diff --git a/README.md b/README.md index cecb48d4..9f3ffc3c 100644 --- a/README.md +++ b/README.md @@ -1,35 +1,39 @@ # ![nf-core/ampliseq](docs/images/nf-core-ampliseq_logo.png) -**16S rRNA amplicon sequencing analysis workflow using QIIME2**. +**Amplicon sequencing analysis workflow using DADA2 and QIIME2**. 
-[![nf-core](https://img.shields.io/badge/nf--core-pipeline-brightgreen.svg)](https://nf-co.re/) [![DOI](https://zenodo.org/badge/150448201.svg)](https://zenodo.org/badge/latestdoi/150448201) -[![Cite Preprint](https://img.shields.io/badge/Cite%20Us!-Cite%20Publication-important)](https://doi.org/10.3389/fmicb.2020.550420) +[![Cite Publication](https://img.shields.io/badge/Cite%20Us!-Cite%20Publication-important)](https://doi.org/10.3389/fmicb.2020.550420) [![GitHub Actions CI Status](https://github.com/nf-core/ampliseq/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/ampliseq/actions) [![GitHub Actions Linting Status](https://github.com/nf-core/ampliseq/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/ampliseq/actions) -[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A520.04.0-brightgreen.svg)](https://www.nextflow.io/) +[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/ampliseq/results) + +[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.04.0-23aa62.svg?labelColor=000000)](https://www.nextflow.io/) +[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) +[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) +[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) -[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg)](https://bioconda.github.io/) -[![Docker](https://img.shields.io/docker/automated/nfcore/ampliseq.svg)](https://hub.docker.com/r/nfcore/ampliseq) [![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23ampliseq-4A154B?logo=slack)](https://nfcore.slack.com/channels/ampliseq) +[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core) +[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) ## Introduction -**nfcore/ampliseq** is a bioinformatics analysis pipeline used for 16S rRNA or ITS amplicon sequencing data (currently supported is Illumina paired end or PacBio). +**nf-core/ampliseq** is a bioinformatics analysis pipeline used for amplicon sequencing, supporting denoising of any amplicon and, currently, taxonomic assignment of 16S, ITS and 18S amplicons. Supported are paired-end Illumina or single-end Illumina, PacBio and IonTorrent data. Default is the analysis of 16S rRNA gene amplicons sequenced paired-end with Illumina. The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with docker containers making installation trivial and results highly reproducible. ## Quick Start -1. Install [`nextflow`](https://nf-co.re/usage/installation) +1. Install [`nextflow`](https://nf-co.re/usage/installation) (`>=21.04.0`) -2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) or [`Podman`](https://podman.io/) for full pipeline reproducibility _(please only use [`Conda`](https://conda.io/miniconda.html) as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_ +2.
Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/), [`Podman`](https://podman.io/), [`Shifter`](https://nersc.gitlab.io/development/shifter/how-to-use/) or [`Charliecloud`](https://hpc.github.io/charliecloud/) for full pipeline reproducibility _(please only use [`Conda`](https://conda.io/miniconda.html) as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_ 3. Download the pipeline and test it on a minimal dataset with a single command: ```bash - nextflow run nf-core/ampliseq -profile test, + nextflow run nf-core/ampliseq -profile test,<docker/singularity/podman/shifter/charliecloud/conda/institute> ``` > Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile <institute>` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. @@ -37,7 +41,8 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool 4. Start running your own analysis! ```bash - nextflow run nf-core/ampliseq -profile --input "data" --FW_primer GTGYCAGCMGCCGCGGTAA --RV_primer GGACTACNVGGGTWTCTAAT --metadata "data/Metadata.tsv" + #16S rRNA gene amplicon analysis of Illumina paired-end data + nextflow run nf-core/ampliseq -profile <docker/singularity/podman/shifter/charliecloud/conda/institute> --input "data" --FW_primer "GTGYCAGCMGCCGCGGTAA" --RV_primer "GGACTACNVGGGTWTCTAAT" --metadata "data/Metadata.tsv" ``` See [usage docs](https://nf-co.re/ampliseq/usage) and [parameter docs](https://nf-co.re/ampliseq/parameters) for all of the available options when running the pipeline. @@ -48,10 +53,9 @@ By default, the pipeline currently performs the following: * Sequencing quality control ([FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) * Trimming of reads ([Cutadapt](https://journal.embnet.org/index.php/embnetjournal/article/view/200)) -* Illumina read processing with [QIIME2](https://www.nature.com/articles/s41587-019-0209-9) * Infer Amplicon Sequence Variants (ASVs) ([DADA2](https://doi.org/10.1038/nmeth.3869)) -* Taxonomical classification based on [SILVA](https://www.arb-silva.de/) [v132](https://www.arb-silva.de/documentation/release-132/) or [UNITE](https://unite.ut.ee/) database -* excludes unwanted taxa, produces absolute and relative feature/taxa count tables and plots, plots alpha rarefaction curves, computes alpha and beta diversity indices and plots thereof ([QIIME2](https://www.nature.com/articles/s41587-019-0209-9)) +* Taxonomic classification using DADA2 or [QIIME2](https://www.nature.com/articles/s41587-019-0209-9) +* Excludes unwanted taxa, produces absolute and relative feature/taxa count tables and plots, plots alpha rarefaction curves, computes alpha and beta diversity indices and plots thereof ([QIIME2](https://www.nature.com/articles/s41587-019-0209-9)) * Calls differentially abundant taxa ([ANCOM](https://www.ncbi.nlm.nih.gov/pubmed/26028277)) * Overall pipeline run summaries ([MultiQC](https://multiqc.info/)) @@ -84,6 +88,8 @@ If you use `nf-core/ampliseq` for your analysis, please cite the `ampliseq` arti You can cite the `nf-core/ampliseq` zenodo record for a specific version using the following [doi: 10.5281/zenodo.1493841](https://zenodo.org/badge/latestdoi/150448201) +An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.
+ You can cite the `nf-core` publication as follows: > **The nf-core framework for community-curated bioinformatics pipelines.** @@ -91,8 +97,3 @@ You can cite the `nf-core` publication as follows: > Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. > > _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x). -> ReadCube: [Full Access Link](https://rdcu.be/b1GjZ) - - - - diff --git a/assets/email_template.html b/assets/email_template.html index a77d1a3e..505409d4 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -1,11 +1,10 @@ - - + nf-core/ampliseq Pipeline Report diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index 58ecf8d1..601ba5c1 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -7,6 +7,5 @@ report_section_order: order: -1000 nf-core-ampliseq-summary: order: -1001 - -#export_plots clashes with current MatPlotLib -#export_plots: true + +export_plots: true diff --git a/assets/nf-core-ampliseq_logo.png b/assets/nf-core-ampliseq_logo.png index 4fdf6ae0..3d9df7e9 100644 Binary files a/assets/nf-core-ampliseq_logo.png and b/assets/nf-core-ampliseq_logo.png differ diff --git a/assets/nf-core-ampliseq_social_preview.png b/assets/nf-core-ampliseq_social_preview.png deleted file mode 100644 index 206dfd43..00000000 Binary files a/assets/nf-core-ampliseq_social_preview.png and /dev/null differ diff --git a/assets/nf-core-ampliseq_social_preview.svg b/assets/nf-core-ampliseq_social_preview.svg deleted file mode 100644 index 38bcfcb4..00000000 --- a/assets/nf-core-ampliseq_social_preview.svg +++ /dev/null @@ -1,448 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - 16S rRNA amplicon sequencing analysis workflow using QIIME2 - ampliseq - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/bin/add_full_sequence_to_taxfile.py b/bin/add_full_sequence_to_taxfile.py new file mode 100755 index 00000000..e104ae4b --- /dev/null +++ b/bin/add_full_sequence_to_taxfile.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +#@author Jeanette Tangrot +# Takes one TSV taxonomy file from DADA2 and a sequence fasta file, +# adds sequence to taxonomy based on ASV_ID + +import pandas as pd +import sys, os + +# Argument check +if len(sys.argv) != 3: + exit("Usage: add_full_sequence_to_taxfile.py ") + +# Read tsv and remove sequence column +taxfile = sys.argv[1] +tax = pd.read_csv(taxfile, sep='\t', header=0) +tax.drop(columns='sequence', inplace=True) + +# Read fasta file and store as data frame +seqs = pd.DataFrame(columns=["id","sequence"]) +seq = "" +name = "" +with open(sys.argv[2], 'r') as reader: + for line in reader: + if line.startswith('>'): + if (seq != "" and name != ""): + seqs = seqs.append({'id':name, 'sequence': seq}, ignore_index=True) + seq = "" + name = line.lstrip('>').rstrip('\s+*\n') + else: + seq = seq + line.rstrip('\n') +if (seq != "" and name != ""): + seqs = seqs.append({'id':name, 'sequence': seq}, ignore_index=True) + +# Create name of results file +outfile = taxfile.replace("ASV_ITS_", "ASV_") + +# Join taxonomy and full sequence, write to file +tax = tax.set_index('ASV_ID').join(seqs.set_index('id'), how='outer') +tax.to_csv(outfile, sep='\t',na_rep="NA", index_label="ASV_ID") + diff --git 
a/bin/combineTable.r b/bin/combine_table.r similarity index 100% rename from bin/combineTable.r rename to bin/combine_table.r diff --git a/bin/cutadapt_summary.py b/bin/cutadapt_summary.py new file mode 100755 index 00000000..3989d1d3 --- /dev/null +++ b/bin/cutadapt_summary.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 + +#--- Import libraries, do initializations ---# +import re, sys +from sys import argv +usage = "Usage: cutadapt_summary.py cutadapt_log_*.txt" + +#--- Check and read arguments ---# +if len(argv) < 3: + exit(usage) +if argv[1] != "single_end" and argv[1] != "paired_end": + exit(usage) + +regexes = [r" -o (\S+) ", + r"Total (?:read pairs|reads) processed:\s+([0-9,,]+)", + r"Reverse-complemented:\s+([0-9,,]+)", + r"(?:Pairs|Reads) written .+?:\s+([0-9,,]+)", + r"(?:Pairs|Reads) written .+?:.*?\(([^)]+)"] + +columns = ["sample", "cutadapt_total_processed", "cutadapt_reverse_complemented", "cutadapt_passing_filters", "cutadapt_passing_filters_percent"] + +#--- Search each file using regex ---# +print("\t".join(columns)) +for FILE in argv[2:]: + with open(FILE) as x: + results = [] + TEXT = x.read() + for REGEX in regexes: + match = re.search(REGEX, TEXT) + if match: + results.append(match.group(1)) + else: + results.append("") + + #modify sample names + if argv[1] == "single_end": + results[0] = results[0].replace(".double-primer.trim.fastq.gz","") + results[0] = results[0].replace(".trim.fastq.gz","") + if argv[1] == "paired_end": + results[0] = results[0].replace(".double-primer_1.trim.fastq.gz","") + results[0] = results[0].replace("_1.trim.fastq.gz","") + + #output per file + print("\t".join(results)) \ No newline at end of file diff --git a/bin/dada2_chimrem.r b/bin/dada2_chimrem.r deleted file mode 100755 index 2d3599b9..00000000 --- a/bin/dada2_chimrem.r +++ /dev/null @@ -1,151 +0,0 @@ -#!/usr/bin/env Rscript -######################################### -# dada2_chimrem -# -# Run DADA2 chimera removal on DADA2 object given in input. -# Also, produce different output and stats files: -# * Fasta file with representative sequences -# * Counts table -# * Relative counts table -# * Denoising stats for each sample; number of reads before/after -# denoising and chimera removal -# -# Author: Jeanette Tångrot (jeanette.tangrot@nbis.se), Daniel Lundin - -suppressPackageStartupMessages(library(optparse)) - -VERSION = 1.0 - -# Get arguments -option_list = list( - make_option( - c('--dadaObj'), type='character', default='dd.rds', - help='R RDS file with DADA2 object containing denoised reads. Default: "dd.rds"' - ), - make_option( - c('--manifest'), type='character', default='', - help='Manifest file listing sample names and paths to sequence files. No default.' - ), - make_option( - c('--method'), type='character', default='pooled', - help='Method for bimera identification. Valid options are "pooled" (all samples are pooled), "consensus" (samples independently checked, consensus decision on each sequence), and "per-sample" (samples are treated independently). Default: "pooled"' - ), - make_option( - c('--allowOneOff'), action="store_true", default=TRUE, - help='Also flag sequences that have one mismatch or indel to an exact bimera as bimeric. Default: "TRUE"' - ), - make_option( - c('--minab'), type='integer', default=8, - help='Minimum parent abundance, default %default. See DADA2 R documentation for isBimeraDenovo.' - ), - make_option( - c('--overab'), type='integer', default=2, - help='Parent overabundance multiplier, default %default. See DADA2 R documentation for isBimeraDenovo.' 
- ), - make_option( - c('--stats'), type='character', default='denoise_stats.tsv', - help='File for writing some stats from denoising and chimera removal. Default: "denoise_stats.tsv"' - ), - make_option( - c('--table'), type='character', default='feature-table.tsv', - help='File for writing counts per sample and ASV. Default: "feature-table.tsv"' - ), - make_option( - c('--reltable'), type='character', default='rel-feature-table.tsv', - help='File for writing relative abundances of the ASVs. Default: "rel-feature-table.tsv"' - ), - make_option( - c('--repseqs'), type='character', default='sequences.fasta', - help='File for writing ASV sequences in fasta format. Default: "sequences.fasta"' - ), - make_option( - c('-v', '--verbose'), action="store_true", default=FALSE, - help="Print progress messages." - ), - make_option( - c('--version'), action="store_true", default=FALSE, - help="Print version of script and DADA2 library." - ) -) -opt = parse_args(OptionParser(option_list=option_list)) - -if ( opt$version ) { - write(sprintf("dada2_chimrem.r version %s, DADA2 version %s", VERSION, packageVersion('dada2')), stderr()) - q('no', 0) -} - -# Check options -if ( ! file.exists(opt$dada) ) { - stop(sprintf("Cannot find %s. See help (-h).\n",opt$dada)) -} - -# Function for log messages -logmsg = function(msg, llevel='INFO') { - if ( opt$verbose ) { - write( - sprintf("%s: %s: %s", llevel, format(Sys.time(), "%Y-%m-%d %H:%M:%S"), msg), - stderr() - ) - } -} - -logmsg( sprintf( "Chimera removal with DADA2." ) ) - -# Load DADA2 library here, to avoid --help and --version taking so long -suppressPackageStartupMessages(library(dada2)) -suppressPackageStartupMessages(library(ShortRead)) - -dd = readRDS(opt$dadaObj) - -# Make sequence table -seqtab <- makeSequenceTable(dd) - -# Remove chimeras -nochim <- removeBimeraDenovo( - seqtab, - method=opt$method, - allowOneOff=opt$allowOneOff, - minFoldParentOverAbundance=opt$overab, - minParentAbundance = opt$minab, - multithread=T, - verbose=opt$verbose -) - -# Store stats; track reads through filtering/denoising/chimera removal -getN <- function(x) sum(getUniques(x)) -track <- cbind(sapply(dd, getN), rowSums(nochim)) -track <- cbind(rownames(track), track) -colnames(track) <- c("file", "denoised", "nonchim") - -# Write stats to file opt$stats -write.table( track, file = opt$stats, sep = "\t", row.names = FALSE, quote = FALSE) - -logmsg( sprintf( "Creating count tables and generating sequence file." 
) ) - -# Create counts table, write to file opt$table -metadata <- read.table(opt$manifest, header = TRUE, sep = ",", colClasses = "character") -metadata["file"] <- basename(metadata$absolute.filepath) -nochim2 <- base:::as.data.frame(t(nochim)) -sample_ids <- metadata$sample.id[match(colnames(nochim2),metadata$file)] -colnames(nochim2) <- sample_ids -nochim2$seq <- row.names(nochim2) -row.names(nochim2) <- paste0("ASV_", seq(nrow(nochim2))) -nochim2 <- cbind(ASV_ID=row.names(nochim2), nochim2) - -write("# Generated by script dada2_chimrem.r from dada2 objects", file = opt$table) -suppressWarnings(write.table(nochim2[,1:(length(nochim2)-1)], file = opt$table, sep = "\t", row.names = F, quote = F, col.names = c("#ASV_ID", colnames(nochim2[2:(length(nochim2)-1)])), append=TRUE)) - -# Write fasta file with ASV sequences to file opt$seqfile -fasta.tab <- nochim2[,c("ASV_ID","seq")] -fasta.tab$ASV_ID <- gsub("ASV_",">ASV_",fasta.tab$ASV_ID) -fasta.tab.join <- c(rbind( fasta.tab$ASV_ID, fasta.tab$seq )) -write( fasta.tab.join, file = opt$repseqs ) - -# Calculate relative abundances and write to file opt$reltable -nochim2$seq <- NULL -nochim2[,2:length(nochim2)] <- nochim2[,2:length(nochim2)]/colSums(nochim2[,2:length(nochim2)])[col(nochim2[,2:length(nochim2)])] -write("# Generated by script dada2_chimrem.r", file = opt$reltable) -suppressWarnings(write.table(nochim2, file = opt$reltable, sep = "\t", row.names = F, quote = F, col.names = c("#ASV_ID", colnames(nochim2[2:length(nochim2)])), append=TRUE)) - - -logmsg(sprintf("Finished chimera removal")) diff --git a/bin/dada2_denoise_pacbio.r b/bin/dada2_denoise_pacbio.r deleted file mode 100755 index 4b917e29..00000000 --- a/bin/dada2_denoise_pacbio.r +++ /dev/null @@ -1,96 +0,0 @@ -#!/usr/bin/env Rscript -######################################### -# dada2_denoise_pacbio -# -# Run DADA2 denoising, using sequence files in a given folder -# and reading error model from given rds file. -# -# Author: Jeanette Tångrot (jeanette.tangrot@nbis.se), Daniel Lundin - -suppressPackageStartupMessages(library(optparse)) - -VERSION = 1.0 - -# Get arguments -option_list = list( - make_option( - c('--filterDir'), type='character', default='dada2_filtered', - help='Directory containing quality filtered reads to estimate sequence errors from, default "dada2_filtered".' - ), - make_option( - c('--errModel'), type='character', default='err.rds', - help='R RDS file with calculated error model, as generated by DADA2 learnErrors. Default: "err.rds".' - ), -make_option( - c('--prefix'), type='character', default='./', - help='Prefix for names of generated files. Default: "./"' - ), -make_option( - c('--pool'), type='character', default='TRUE', - help='Whether to pool together all samples prior to sample inference. Possible options: TRUE, pseudo, FALSE. Default: TRUE' - ), - make_option( - c("-v", "--verbose"), action="store_true", default=FALSE, - help="Print progress messages." - ), - make_option( - c("--version"), action="store_true", default=FALSE, - help="Print version of script and DADA2 library." - ) -) -opt = parse_args(OptionParser(option_list=option_list)) - -if ( opt$version ) { - write(sprintf("dada2_denoise_pacbio.r version %s, DADA2 version %s", VERSION, packageVersion('dada2')), stderr()) - q('no', 0) -} - -# Check options -if ( ! file_test("-d", opt$filterDir) ) { - stop( sprintf("Cannot find folder with filtered files: %s. See help (-h)\n",opt$filterDir) ) -} - -if ( ! file.exists(opt$errModel) ) { - stop(sprintf("Cannot find %s. 
See help (-h).\n",opt$errModel)) -} - -if ( opt$pool == "TRUE" || opt$pool == "T") { - opt$pool = TRUE -} else if - ( opt$pool == "FALSE" || opt$pool == "F") { - opt$pool = FALSE -} else if ( is.character(opt$pool) && opt$pool != "pseudo" ) { - stop(sprintf("Invalid pool argument for dada2 denoising. See help (-h).\n",opt$errModel)) -} - -# Function for log messages -logmsg = function(msg, llevel='INFO') { - if ( opt$verbose ) { - write( - sprintf("%s: %s: %s", llevel, format(Sys.time(), "%Y-%m-%d %H:%M:%S"), msg), - stderr() - ) - } -} - -logmsg( sprintf( "Sequence denoising with DADA2." ) ) - -# Load DADA2 library here, to avoid --help and --version taking so long -suppressPackageStartupMessages(library(dada2)) -suppressPackageStartupMessages(library(ShortRead)) - -# Write versions used -write(sprintf("%s\nDADA2: %s / Rcpp: %s / RcppParallel: %s", R.version.string, packageVersion('dada2'), packageVersion('Rcpp'), packageVersion('RcppParallel') ),file="") - -# Read error model from file -err = readRDS(opt$errModel) - -# Dereplicate identical reads -files=list.files(opt$filterDir,full.names=T) -derep <- derepFastq(files, verbose = opt$verbose) - -# Denoising, save rds for dada2 object -dd <- dada(derep, err=err, multithread=T, pool=opt$pool) -saveRDS(dd,sprintf('%sdd.rds', opt$prefix)) - -logmsg(sprintf("Finished denoising")) diff --git a/bin/dada2_errmodels_pacbio.r b/bin/dada2_errmodels_pacbio.r deleted file mode 100755 index 56c13954..00000000 --- a/bin/dada2_errmodels_pacbio.r +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env Rscript -######################################### -# dada2_errmodels_pacbio -# -# DADA2 sequence error estimation based on files in a given folder -# Assumes PacBio reads, i.e. uses errorEstimationFunction = PacBioErrfun -# -# Author: Jeanette Tångrot (jeanette.tangrot@nbis.se), Daniel Lundin - -suppressPackageStartupMessages(library(optparse)) - -VERSION = 1.0 - -# Get arguments -option_list = list( - make_option( - c('--filterDir'), type='character', default='dada2_filtered', - help='Directory containing quality filtered reads to estimate sequence errors from, default "dada2_filtered".' - ), - make_option( - c('--prefix'), type='character', default='./', - help='Prefix for name of rds file with DADA2 error model. Can include a path to another folder. Default: "./"' - ), - make_option( - c('--nbases'), type='character', default=1e8, - help='Minimum number of total bases to use for error estimation, please see DADA2 documentation for details. Default: 1e8' - ), - make_option( - c("-v", "--verbose"), action="store_true", default=FALSE, - help="Print progress messages." - ), - make_option( - c("--version"), action="store_true", default=FALSE, - help="Print version of script and DADA2 library." - ) -) -opt = parse_args(OptionParser(option_list=option_list)) - -if ( opt$version ) { - write(sprintf("dada2_errmodels_pacbio.r version %s, DADA2 version %s", VERSION, packageVersion('dada2')), stderr()) - q('no', 0) -} - -# Check options -if ( ! file_test("-d", opt$filterDir) ) { - stop( sprintf("Cannot find folder with filtered files: %s. See help (-h)\n",opt$filterDir) ) -} - -# Function for log messages -logmsg = function(msg, llevel='INFO') { - if ( opt$verbose ) { - write( - sprintf("%s: %s: %s", llevel, format(Sys.time(), "%Y-%m-%d %H:%M:%S"), msg), - stderr() - ) - } -} - -logmsg( sprintf( "Sequence error estimation with DADA2 learnErrors and PacBioErrFun." 
) ) - -# Load DADA2 library here, to avoid --help and --version taking so long -suppressPackageStartupMessages(library(dada2)) -suppressPackageStartupMessages(library(ShortRead)) - -# Do the error estimation, save rds for error profile -files=list.files(opt$filterDir,full.names=T) -logmsg( sprintf("Using files: %s", files)) -err <- learnErrors(files, errorEstimationFunction=PacBioErrfun, multithread=TRUE, randomize=FALSE, verbose=opt$verbose, nbases=as.double(opt$nbases)) -saveRDS(err,sprintf('%serr.rds', opt$prefix)) - -logmsg(sprintf("Finished error estimation")) diff --git a/bin/dada2_filter_pacbio.r b/bin/dada2_filter_pacbio.r deleted file mode 100755 index 32cedeed..00000000 --- a/bin/dada2_filter_pacbio.r +++ /dev/null @@ -1,103 +0,0 @@ -#!/usr/bin/env Rscript -######################################### -# dada2_filter_pacbio -# -# Uses the filterAndTrim function in the dada2 package -# to filter long amplicon sequences generated with PacBio. -# -# Author: Jeanette Tångrot (jeanette.tangrot@nbis.se), Daniel Lundin - -suppressPackageStartupMessages(library(optparse)) - -VERSION = 1.0 - -# Get arguments -option_list = list( - make_option( - c('--infile'), type='character', default='', - help='Manifest file listing sample names and paths to files to filter and trim. No default.' - ), - make_option( - c('--filterDir'), type='character', default='dada2_filtered', - help='Directory for quality filtered reads, default "dada2_filtered". Will be created if it does not exist.' - ), - make_option( - c('--stats'), type='character', default='filter_stats.tsv', - help='File for writing filtering information. Default: "filter_stats.tsv".' - ), - make_option( - c('--maxEE'), type='integer', default=-1, - help='Maximum number of expected errors in sequence, default Inf.' - ), - make_option( - c('--truncLen'), type='integer', default=0, - help='Truncate sequence after truncLen bases, reads shorter than this are discarded. Default: 0 (no truncation).' - ), - make_option( - c('--truncQ'), type='integer', default=2, - help='truncQ option in filterAndTrim(). Default 2.' - ), - make_option( - c('--minLen'), type='integer', default=20, - help='Remove reads shorter than minLen, after trimming and truncation. Default: 20.' - ), - make_option( - c('--maxLen'), type='integer', default=-1, - help='Remove reads longer than maxLen, before trimming and truncation.Default: Inf.' - ), - make_option( - c("-v", "--verbose"), action="store_true", default=FALSE, - help="Print progress messages." - ), - make_option( - c("--version"), action="store_true", default=FALSE, - help="Print version of this script and of DADA2 library." - ) -) -opt = parse_args(OptionParser(option_list=option_list)) - -if ( opt$version ) { - write(sprintf("dada2_filter_pacbio.r version %s, DADA2 version %s", VERSION, packageVersion('dada2')), stderr()) - q('no', 0) -} - -# Check options -if ( ! file.exists(opt$infile) ) { -stop(sprintf("Cannot find %s. See help (-h).\n",opt$infile)) -} -if ( ! 
file_test("-d", opt$filterDir) ) { dir.create(opt$filterDir) } - -if ( opt$maxEE < 0 ) { opt$maxEE = Inf } -if ( opt$maxLen < 0 ) { opt$maxLen = Inf } - -# Function for log messages -logmsg = function(msg, llevel='INFO') { - if ( opt$verbose ) { - write( - sprintf("%s: %s: %s", llevel, format(Sys.time(), "%Y-%m-%d %H:%M:%S"), msg), - stderr() - ) - } -} - -# Load DADA2 library here, to avoid --help and --version taking so long -suppressPackageStartupMessages(library(dada2)) -suppressPackageStartupMessages(library(ShortRead)) - -logmsg( sprintf( "Read quality filtering with filterAndTrim. Options used:\n maxN: %d, maxEE: %s, truncQ: %d, minLen: %d, maxLen: %s", - 0, opt$maxEE, opt$truncQ, opt$minLen, opt$maxLen ) -) - -# Do the filtering/trimming -input <- read.table(opt$infile, header = TRUE, sep = ",", colClasses = "character") - -filt <- file.path(opt$filterDir, basename(input$absolute.filepath)) -res_filt <- filterAndTrim(input$absolute.filepath, filt, maxN = 0, maxEE = opt$maxEE, truncQ = opt$truncQ, truncLen = opt$truncLen, minLen = opt$minLen, maxLen = opt$maxLen, compress = T, multithread = T, verbose = opt$verbose) - -input["file"] <- basename(input$absolute.filepath) -output <- merge(input,res_filt, by.x="file", by.y ="row.names") - -# Write filtering stats to file opt$stats -write.table( output, file = opt$stats, sep = "\t", row.names = FALSE, quote = FALSE) - -logmsg(sprintf("Done filtering, output in %s", opt$filterDir)) diff --git a/bin/dada_quality.r b/bin/dada_quality.r new file mode 100755 index 00000000..bf8f7f92 --- /dev/null +++ b/bin/dada_quality.r @@ -0,0 +1,38 @@ +#!/usr/bin/env Rscript + +args = commandArgs(trailingOnly=TRUE) + +if(length(args) != 2){ + stop("Usage: dada_quality.r ") +} + +OUT <- args[1] +number_of_records <- as.integer(args[2]) + +print(OUT) +print(number_of_records) + +suppressPackageStartupMessages(library(dada2)) + +readfiles <- sort(list.files(".", pattern = ".fastq.gz", full.names = TRUE)) +plot <- plotQualityProfile(readfiles, n = number_of_records, aggregate = TRUE) +data <- plot$data + +df <- data.frame(Cycle=character(), Count=character(), Median=character(), stringsAsFactors=FALSE) +cycles <- sort(unique(data$Cycle)) + +#aggregate data for each sequencing cycle +for (cycle in cycles) { + subdata <- data[data[, "Cycle"] == cycle, ] + score <- list() + #convert to list to calculate median + for (j in 1:nrow(subdata)) {score <- unlist(c(score, rep(subdata$Score[j], subdata$Count[j])))} + temp = data.frame(Cycle=cycle, Count=sum(subdata$Count), Median=median(score), stringsAsFactors=FALSE) + df <- rbind(df, temp) +} + +#write output +write.table( t(df), file = paste0(OUT,".tsv"), sep = "\t", row.names = TRUE, col.names = FALSE, quote = FALSE) +pdf(paste0(OUT,".pdf")) +plot +dev.off() \ No newline at end of file diff --git a/bin/dada_trunc_parameter.py b/bin/dada_trunc_parameter.py deleted file mode 100755 index bd6351c4..00000000 --- a/bin/dada_trunc_parameter.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -#@author Daniel Straub -# Takes two CSV files from QIIME2 demux output, a quality threshold and a cutoff for the retained read fraction -# to generate a tuple of index locations that resemble the cutoff value used for DADA2 in QIIME2. 
- -import pandas as pd -import sys - -#argument check -if len(sys.argv) != 5: - exit("Usage: dada_trunc_parameter.py ") - -#parameters -data_fw = pd.read_csv(sys.argv[1]) #quality values forward reads -data_rv = pd.read_csv(sys.argv[2]) #quality values reverse reads -qmin = float(sys.argv[3]) #quality threshold -rmin = float(sys.argv[4]) #read count threshold (fraction) - -#select row with median values (file row 6, starting with "50%") and drop first row -median_fw = data_fw.iloc[4][1:].values.tolist() -median_rv = data_rv.iloc[4][1:].values.tolist() - -#select row with count numbers (file row name "count") -reads_fw = data_fw.iloc[0][1:].values.tolist() -reads_rv = data_rv.iloc[0][1:].values.tolist() -#extract maximum read count -fraction_reads = int(max(reads_fw)*rmin) - -#iterate through values and find first value that falls below threshold -def function(values, cutoff): - trunc = len(values) - for value in values: - if value < cutoff: - trunc = values.index(value) - break - return trunc - -#find quality threshold -trunc_median_fw = function(median_fw, qmin) -trunc_median_rv = function(median_rv, qmin) - -#find read threshold -trunc_reads_fw = function(reads_fw, fraction_reads) -trunc_reads_rv = function(reads_rv, fraction_reads) - -#final threshold -trunc_fw = min(trunc_median_fw,trunc_reads_fw) -trunc_rv = min(trunc_median_rv,trunc_reads_rv) - -#print values -print(trunc_fw, trunc_rv, sep=',', end='') \ No newline at end of file diff --git a/bin/count_table_filter_stats.py b/bin/filter_stats.py similarity index 64% rename from bin/count_table_filter_stats.py rename to bin/filter_stats.py index 358e7d51..8b84955d 100755 --- a/bin/count_table_filter_stats.py +++ b/bin/filter_stats.py @@ -11,7 +11,7 @@ exit("Usage: count_table_max_reads.py ") #read tsv and skip first two rows -data_unfiltered = pd.read_csv(sys.argv[1], sep='\t', skiprows=[0]) #count table +data_unfiltered = pd.read_csv(sys.argv[1], sep='\t', skiprows=None) #count table data_filtered = pd.read_csv(sys.argv[2], sep='\t', skiprows=[0]) #count table #drop feature ids @@ -25,8 +25,14 @@ #merge dataframes out = sums_unfiltered.to_frame(name = 'unfiltered').join(sums_filtered.to_frame(name = 'filtered')) out['lost'] = out['unfiltered'] - out['filtered'] -out['retained [%]'] = out['filtered'] / out['unfiltered'] *100 -out['lost [%]'] = (100 - out['retained [%]']) +out['retained_percent'] = out['filtered'] / out['unfiltered'] *100 +out['lost_percent'] = (100 - out['retained_percent']) + +#add column with sample names at beginning +out = out.rename_axis('sample').reset_index() + +#rename columns +out = out.rename(columns={'unfiltered': 'input_tax_filter', 'filtered': 'filtered_tax_filter'}) #write file -out.to_csv('count_table_filter_stats.tsv', sep='\t') \ No newline at end of file +out.to_csv('count_table_filter_stats.tsv', sep='\t', index=False) \ No newline at end of file diff --git a/bin/make_dada2_report_pacbio.py b/bin/make_dada2_report_pacbio.py deleted file mode 100755 index 7f3614f3..00000000 --- a/bin/make_dada2_report_pacbio.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python3 -# -# Uses the output from dada2_errmodels_pacbio.r and -# dada2_denoise_pacbio.r to create a report file for -# the DADA2 denoising process, mimicking the report -# generated by QIIME2. 
-# Results are written to file "dada_report.txt" -# -# Jeanette Tångrot - -#-- Import libraries, do initializations --# -import sys - -file_out = "dada_report.txt" -err = "" -ver = "" -info = "" - -#-- Check arguments --# -if len( sys.argv ) != 4: - exit( "Usage: make_dada2_report_pacbio.py " ) - -#-- Read information from files --# -with open( sys.argv[1] ) as ef: - for line in ef: - err = err + line - -with open( sys.argv[2] ) as df: - for line in df: - if line.startswith('R version') or line.startswith('DADA2:'): - ver = ver + line - else: - info = info + line - -#-- Write to file --# -f = open( file_out, "w") -f.write( ver + "1) Filtering\n2) Learning Error Rates\n" + err + - "3) Denoising\n" + info + "4) Remove chimeras (method = " + sys.argv[3] + - ")\n5) Write output (absolute and relative abundances, ASV sequence file, summary statistics)" ) -f.close() diff --git a/bin/make_dada2_stats_pacbio.py b/bin/make_dada2_stats_pacbio.py deleted file mode 100755 index e0b7bc23..00000000 --- a/bin/make_dada2_stats_pacbio.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -# -# Takes two TSV files as input; stats from dada2 filtering and stats -# from dada2 denoising, and reports these numbers together with -# fractions of input sequences that are filtered/non-chimeric. -# Results are written to file "dada_stats.tsv" -# -# Jeanette Tångrot - -#-- Import libraries, do initializations --# -import pandas as pd -import sys - -file_out = "dada_stats.tsv" - -#-- Check arguments --# -if len( sys.argv ) != 3: - exit( "Usage: make_dada2_stats_pacbio.py " ) - -#-- Reas TSVs --# -filt = pd.read_csv( sys.argv[1], sep = '\t', usecols = ['sample.id', 'file', 'reads.in', 'reads.out'] ) -denoise = pd.read_csv( sys.argv[2], sep = '\t' ) - -#-- Count number of input sequences --# -num_input = filt[ 'reads.in' ] - -#-- Create results table --# -res = filt.join( denoise.set_index( 'file' ), on = 'file' ) -res.pop( 'file' ) -res['perc_filt'] = res['reads.out'] / num_input * 100 -res['perc_denoise'] = res['denoised'] / num_input * 100 -res['perc_nonchim'] = res['nonchim'] / num_input * 100 -res = res[ ['sample.id', 'reads.in', 'reads.out', 'perc_filt', 'denoised', 'perc_denoise', 'nonchim', 'perc_nonchim'] ] -res.to_csv( file_out, sep = '\t', header = [ 'sample-id', 'input', 'filtered', 'percentage of input passed filter', 'denoised', 'percentage of input denoised', 'non-chimeric', 'percentage of input non-chimeric' ], index=False ) - diff --git a/bin/metadataCategory.r b/bin/metadataCategory.r deleted file mode 100755 index 5dde6d17..00000000 --- a/bin/metadataCategory.r +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env Rscript -args = commandArgs(trailingOnly=TRUE) - -if(length(args) < 1){ - stop("Usage: metadataCategory.r ") -} - -metadata <- args[1] - -data = read.delim(metadata) - -#remove all numeric columns -nums <- unlist(lapply(data, is.numeric)) -data <- data[ , !nums] - -vector <- character() -for (i in 2:ncol(data)) { - -#remove blanks or NA -cleandata <- data[!(is.na(data[i]) | data[i]==""),] - -#select only columns with multiple different values but not all unique -if (nrow(unique(cleandata[i])) > 1 & nrow(unique(cleandata[i])) < nrow(cleandata[i])) { - vector <- c(vector, colnames(cleandata[i])) -} -} -vector <- paste(vector, collapse=",") -cat(vector) - diff --git a/bin/metadataCategoryPairwise.r b/bin/metadataCategoryPairwise.r deleted file mode 100755 index d86f64ef..00000000 --- a/bin/metadataCategoryPairwise.r +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env Rscript -args = 
commandArgs(trailingOnly=TRUE) - -if(length(args) < 1){ - stop("Usage: metadataCategory.r ") -} - -metadata <- args[1] - -data = read.delim(metadata) - -#remove all numeric columns -nums <- unlist(lapply(data, is.numeric)) -data <- data[ , !nums] - -vector <- character() -for (i in 1:ncol(data)) { - -#remove blanks or NA -cleandata <- data[!(is.na(data[i]) | data[i]==""), ] - -#select only columns that have at least 2 of each value so that it can be used for pairwise comparisons -noccur <- data.frame(table(cleandata[i])) -if (nrow(unique(cleandata[i])) > 1 & nrow(unique(cleandata[i])) < nrow(cleandata[i])) { -if ( nrow(noccur[noccur$Freq != 1,]) == nrow(noccur) ) { - vector <- c(vector, colnames(cleandata[i])) -} -} -} -vector <- paste(vector, collapse=",") -cat(vector) diff --git a/bin/metadata_all.r b/bin/metadata_all.r new file mode 100755 index 00000000..d21d5a76 --- /dev/null +++ b/bin/metadata_all.r @@ -0,0 +1,29 @@ +#!/usr/bin/env Rscript +args = commandArgs(trailingOnly=TRUE) + +if(length(args) < 1){ + stop("Usage: metadata_all.r ") +} + +metadata <- args[1] + +data = read.delim(metadata) + +#remove all numeric columns +nums <- unlist(lapply(data, is.numeric)) +data <- data[ , !nums] + +vector <- character() +for (i in 2:ncol(data)) { + + #remove blanks or NA + cleandata <- data[!(is.na(data[i]) | data[i]==""),] + + #select only columns with multiple different values but not all unique + if (nrow(unique(cleandata[i])) > 1 & nrow(unique(cleandata[i])) < nrow(cleandata[i])) { + vector <- c(vector, colnames(cleandata[i])) + } +} +vector <- paste(vector, collapse=",") +cat(vector) + diff --git a/bin/metadata_pairwise.r b/bin/metadata_pairwise.r new file mode 100755 index 00000000..58e7e7bf --- /dev/null +++ b/bin/metadata_pairwise.r @@ -0,0 +1,31 @@ +#!/usr/bin/env Rscript +args = commandArgs(trailingOnly=TRUE) + +if(length(args) < 1){ + stop("Usage: metadataCategory.r ") +} + +metadata <- args[1] + +data = read.delim(metadata) + +#remove all numeric columns +nums <- unlist(lapply(data, is.numeric)) +data <- data[ , !nums] + +vector <- character() +for (i in 1:ncol(data)) { + + #remove blanks or NA + cleandata <- data[!(is.na(data[i]) | data[i]==""), ] + + #select only columns that have at least 2 of each value so that it can be used for pairwise comparisons + noccur <- data.frame(table(cleandata[i])) + if (nrow(unique(cleandata[i])) > 1 & nrow(unique(cleandata[i])) < nrow(cleandata[i])) { + if ( nrow(noccur[noccur$Freq != 1,]) == nrow(noccur) ) { + vector <- c(vector, colnames(cleandata[i])) + } + } +} +vector <- paste(vector, collapse=",") +cat(vector) diff --git a/bin/parse_dada2_taxonomy.r b/bin/parse_dada2_taxonomy.r new file mode 100755 index 00000000..54568472 --- /dev/null +++ b/bin/parse_dada2_taxonomy.r @@ -0,0 +1,22 @@ +#!/usr/bin/env Rscript + +args = commandArgs(trailingOnly=TRUE) + +if(length(args) != 1){ + stop("Usage: parse_dada2_taxonomy.r ") +} + +tax_file <- args[1] + +OUT="tax.tsv" + +# read required files +tax = read.table(tax_file, header = TRUE, sep = "\t", stringsAsFactors = FALSE) + +# Join columns 2:ncol(.) 
- 1, the taxonomy ranks (sequence is the last) +r <- colnames(tax)[!colnames(tax) %in% c('ASV_ID', 'sequence')] +tax$taxonomy <- do.call(paste, c(tax[r], sep = ';')) + +#write +print (paste("write",OUT)) +write.table(tax[,c('ASV_ID', 'taxonomy')], file = OUT, quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t") diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index c05ed13d..2a8b74ac 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -1,44 +1,21 @@ #!/usr/bin/env python from __future__ import print_function -from collections import OrderedDict -import re +import os -# nf-core: Add additional regexes for new tools in process get_software_versions -regexes = { - 'nf-core/ampliseq': ['v_pipeline.txt', r"(\S+)"], - 'Nextflow': ['v_nextflow.txt', r"(\S+)"], - 'FastQC': ['v_fastqc.txt', r"FastQC v(\S+)"], - 'MultiQC': ['v_multiqc.txt', r"multiqc, version (\S+)"], - 'Cutadapt': ['v_cutadapt.txt', r"(\S+)"], - 'QIIME2': ['v_qiime.txt', r"q2cli version (\S+)"] -} -results = OrderedDict() -results['nf-core/ampliseq'] = 'N/A' -results['Nextflow'] = 'N/A' -results['FastQC'] = 'N/A' -results['MultiQC'] = 'N/A' -results['Cutadapt'] = 'N/A' -results['QIIME2'] = 'N/A' +results = {} +version_files = [x for x in os.listdir('.') if x.endswith('.version.txt')] +for version_file in version_files: -# Search each file using its regex -for k, v in regexes.items(): - try: - with open(v[0]) as x: - versions = x.read() - match = re.search(v[1], versions) - if match: - results[k] = "v{}".format(match.group(1)) - except IOError: - results[k] = False + software = version_file.replace('.version.txt','') + if software == 'pipeline': + software = 'nf-core/ampliseq' -# Remove software set to false in results -for k in list(results): - if not results[k]: - del results[k] + with open(version_file) as fin: + version = fin.read().strip() + results[software] = version # Dump to YAML -print( - """ +print (''' id: 'software_versions' section_name: 'nf-core/ampliseq Software Versions' section_href: 'https://github.com/nf-core/ampliseq' @@ -46,13 +23,12 @@ description: 'are collected at run time from the software output.' data: |
-""" -) -for k, v in results.items(): - print("
{}
{}
".format(k, v)) -print("
") +''') +for k,v in sorted(results.items()): + print("
{}
{}
".format(k,v)) +print (" ") # Write out regexes as csv file: -with open("software_versions.csv", "w") as f: - for k, v in results.items(): - f.write("{}\t{}\n".format(k, v)) +with open('software_versions.csv', 'w') as f: + for k,v in sorted(results.items()): + f.write("{}\t{}\n".format(k,v)) diff --git a/bin/taxref_reformat_gtdb.sh b/bin/taxref_reformat_gtdb.sh new file mode 100755 index 00000000..5f85d4be --- /dev/null +++ b/bin/taxref_reformat_gtdb.sh @@ -0,0 +1,16 @@ +#!/bin/sh + +# Reads the ar122 and bac120 SSU fasta files from GTDB (after first untarring) +# and outputs two new fasta files, one suitable for DADA2's assignTaxonomy() +# and addSpecies() functions. + +# Untar any tar file in the working directory +for f in *.tar.gz; do + tar xzf $f +done + +# Write the assignTaxonomy() fasta file: assignTaxonomy.fna +cat ar122*.fna bac120*.fna | sed '/^>/s/>\([^ ]\+\) \([^[]\+\) \[.*/>\2(\1\)/' | sed '/^>/s/;s__.*//' | sed 's/[a-z]__//g' | sed 's/ /_/g' | sed '/^>/s/\(Archaea\)\|\(Bacteria\)/&;&/' > assignTaxonomy.fna + +# Write the addSpecies() fasta file: addSpecies.fna +cat ar122*.fna bac120*.fna | sed '/^>/s/>\([^ ]\+\) .*;s__\([^[]\+\) \[.*/>\1 \2/' > addSpecies.fna diff --git a/bin/taxref_reformat_pr2.sh b/bin/taxref_reformat_pr2.sh new file mode 100755 index 00000000..8a33f51f --- /dev/null +++ b/bin/taxref_reformat_pr2.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +# Handles the PR2 database. + +# There's a preformatted DADA2 file for assignTaxonomy() -- this is just ungzipped +gunzip -c *dada2.fasta.gz > assignTaxonomy.fna + +# For addSpecies(), the UTAX file is downloaded and reformated to only contain the id and species. +# The second two sed calls are to replace "_" with space only in the species name and not the last part of the id (overdoing it a bit, as I don't the id actually matters as long as it's unique). 
+gunzip -c *UTAX.fasta.gz | sed '/^>/s/>\([^;]*\);.*,s:\(.*\)/>\1 \2/' | sed 's/_/ /g' | sed 's/ \([A-Z]\) /_\1 /' > addSpecies.fna diff --git a/bin/taxref_reformat_qiime_greengenes85.sh b/bin/taxref_reformat_qiime_greengenes85.sh new file mode 100755 index 00000000..5c49d262 --- /dev/null +++ b/bin/taxref_reformat_qiime_greengenes85.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +# Select and rename files +mv *.fasta greengenes85.fna +mv *.txt greengenes85.tax diff --git a/bin/taxref_reformat_qiime_silva138.sh b/bin/taxref_reformat_qiime_silva138.sh new file mode 100755 index 00000000..ac5f85c0 --- /dev/null +++ b/bin/taxref_reformat_qiime_silva138.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +# Unzip the qza files +unzip *seqs.qza +unzip *tax.qza + +# Select and rename dynamic files +cat */data/taxonomy.tsv > silva.tax +cat */data/*sequences.fasta > silva.fna diff --git a/bin/taxref_reformat_qiime_unite.sh b/bin/taxref_reformat_qiime_unite.sh new file mode 100755 index 00000000..6fd80ac9 --- /dev/null +++ b/bin/taxref_reformat_qiime_unite.sh @@ -0,0 +1,8 @@ +#!/bin/sh + +# Untar the Unite file +tar xzf *.gz + +# Select and rename dynamic files +cat */*_dynamic_*.fasta > unite.fna +cat */*_dynamic_*.txt > unite.tax diff --git a/bin/taxref_reformat_standard.sh b/bin/taxref_reformat_standard.sh new file mode 100755 index 00000000..cbe3b2ca --- /dev/null +++ b/bin/taxref_reformat_standard.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +# Uses preformatted databases from DADA2 (https://benjjneb.github.io/dada2/training.html) +# The file for taxonomy assignment, identified by containing "train" in the name, +# gets the first field duplicated: +gunzip -c *train*gz | sed 's/>\([^;]*\)/>\1;\1/' > assignTaxonomy.fna + +# and the file for add species, identified by containing "species" in the name, is renamed +mv *species*gz addSpecies.fna.gz diff --git a/bin/taxref_reformat_unite.sh b/bin/taxref_reformat_unite.sh new file mode 100755 index 00000000..c6d514db --- /dev/null +++ b/bin/taxref_reformat_unite.sh @@ -0,0 +1,14 @@ +#!/bin/sh + +# Untars a file ending with .gz downloaded from Unite, reformats a bit to +# assignTaxonomy.fna and reformats a copy to addSpecies.fna. + +# Untar the Unite file +tar xzf *.gz + +# Remove leading "k__" and the like, remove ranks classified as "unknown", +# and replace space with underscore to create assignTaxonomy.fna +cat */*.fasta | sed '/^>/s/;[ks]__.*//' | sed '/^>/s/[a-z]__unidentified//g' | sed '/^>/s/[a-z]__//g' | sed '/^>/s/ /_/g' | sed 's/>.*|/&Eukaryota;/' > assignTaxonomy.fna + +# Reformat to addSpecies format +sed 's/>\([^|]\+\)|\([^|]\+|[^|]\+\)|.*/>\2 \1/' assignTaxonomy.fna | sed '/^>/s/_/ /g' > addSpecies.fna diff --git a/bin/trunclen.py b/bin/trunclen.py new file mode 100755 index 00000000..12430cd4 --- /dev/null +++ b/bin/trunclen.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 +#@author Daniel Straub +# Takes two CSV files from QIIME2 demux output, a quality threshold and a cutoff for the retained read fraction +# to generate a tuple of index locations that resemble the cutoff value used for DADA2 in QIIME2. 
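The comment block above (carried over from the removed dada_trunc_parameter.py) describes the cutoff rule that the new bin/trunclen.py implements on a single quality-stats TSV: truncate at the first cycle whose median quality drops below the quality threshold, or at the first cycle whose read count drops below the given fraction of the maximum count, whichever comes first. A minimal stand-alone sketch of that rule follows; function names and the thresholds 25 and 0.75 are illustrative assumptions, not pipeline defaults:

# Sketch of the truncation-position rule used by trunclen.py (names and thresholds are illustrative).
def first_below(values, cutoff):
    """Return the index of the first value below cutoff, or len(values) if none is."""
    for i, value in enumerate(values):
        if value < cutoff:
            return i
    return len(values)

def trunc_position(median_quality, read_counts, qmin=25, rmin=0.75):
    """Truncate where median quality or retained read count first falls below its threshold."""
    q_cut = first_below(median_quality, qmin)                      # quality criterion
    r_cut = first_below(read_counts, rmin * max(read_counts))      # read-retention criterion
    return min(q_cut, r_cut)

# Example: medians fall below 25 at cycle index 8, counts never drop below 75 % of the maximum.
print(trunc_position([38, 37, 36, 35, 34, 30, 28, 26, 20, 15],
                     [100, 100, 100, 100, 100, 99, 99, 98, 98, 97]))   # prints 8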
+ +import pandas as pd +import sys + +#argument check +if len(sys.argv) != 4: + exit("Usage: dada_trunc_parameter.py <*_qual_stats.tsv> ") + +#parameters +data = pd.read_csv(sys.argv[1], delimiter="\t") #quality values forward reads +qmin = float(sys.argv[2]) #quality threshold +rmin = float(sys.argv[3]) #read count threshold (fraction) + +#select row with median values (file row 6, starting with "50%") and drop first row +median = data.iloc[1][1:].values.tolist() + +#select row with count numbers (file row name "count") +reads = data.iloc[0][1:].values.tolist() +#extract maximum read count +fraction_reads = int(max(reads)*rmin) + +#iterate through values and find first value that falls below threshold +def function(values, cutoff): + trunc = len(values) + for value in values: + if value < cutoff: + trunc = values.index(value) + break + return trunc + +#find quality threshold +trunc_median = function(median, qmin) + +#find read threshold +trunc_reads = function(reads, fraction_reads) + +#final threshold +trunc = min(trunc_median,trunc_reads) + +#print values +print(trunc, end='') \ No newline at end of file diff --git a/conf/base.config b/conf/base.config index cc2127ca..b90dcee7 100644 --- a/conf/base.config +++ b/conf/base.config @@ -10,66 +10,36 @@ */ process { - cpus = { check_max( 1 * task.attempt, 'cpus' ) } - memory = { check_max( 7.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 7.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } - errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } - maxRetries = 3 - maxErrors = '-1' + errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } + maxRetries = 3 + maxErrors = '-1' - withName: classifier_extract_seq { - cpus = { check_max (1 * task.attempt, 'cpus' ) } - memory = { check_max (42.GB * task.attempt, 'memory' ) } - time = { check_max (12.h * task.attempt, 'time' ) } - } - - withName: classifier_train { - cpus = { check_max (1 * task.attempt, 'cpus' ) } - memory = { check_max (42.GB * task.attempt, 'memory' ) } - time = { check_max (12.h * task.attempt, 'time' ) } - } - - //you can limit memory by typing --max_memory in the command stream, e.g. --max_memory 63.GB - withName: classifier { - cpus = { check_max (20 * task.attempt, 'cpus' ) } - memory = { check_max (120.GB * task.attempt, 'memory' ) } - time = { check_max (36.h * task.attempt, 'time' ) } - } - - withName: dada_single { - cpus = { check_max (8 * task.attempt, 'cpus' ) } - memory = { check_max (28.GB * task.attempt, 'memory' ) } - time = { check_max (18.h * task.attempt, 'time' ) } - } - withName: dada_multi { - cpus = { check_max (8 * task.attempt, 'cpus' ) } - memory = { check_max (28.GB * task.attempt, 'memory' ) } - time = { check_max (18.h * task.attempt, 'time' ) } - } - withName: dada_pacBio { - cpus = { check_max (8 * task.attempt, 'cpus' ) } - memory = { check_max (28.GB * task.attempt, 'memory' ) } - time = { check_max (18.h * task.attempt, 'time' ) } - } - withName: tree { - cpus = { check_max (8 * task.attempt, 'cpus' ) } - memory = { check_max (24.GB * task.attempt, 'memory' ) } - time = { check_max (4.h * task.attempt, 'time' ) } - } - withName: ancom_asv { - errorStrategy = { task.exitStatus in [143,137] ? 
'retry' : 'ignore' } - cpus = { check_max (8 * task.attempt, 'cpus' ) } - memory = { check_max (16.GB * task.attempt, 'memory' ) } - time = { check_max (8.h * task.attempt, 'time' ) } - } - withName: ancom_tax { - cpus = { check_max (8 * task.attempt, 'cpus' ) } - memory = { check_max (16.GB * task.attempt, 'memory' ) } - time = { check_max (2.h * task.attempt, 'time' ) } - } - withName: get_software_versions { - cache = false - } - + withLabel:process_low { + cpus = { check_max( 2 * task.attempt, 'cpus' ) } + memory = { check_max( 12.GB * task.attempt, 'memory' ) } + time = { check_max( 6.h * task.attempt, 'time' ) } + } + withLabel:process_medium { + cpus = { check_max( 6 * task.attempt, 'cpus' ) } + memory = { check_max( 42.GB * task.attempt, 'memory' ) } + time = { check_max( 12.h * task.attempt, 'time' ) } + } + withLabel:process_high { + cpus = { check_max( 20 * task.attempt, 'cpus' ) } + memory = { check_max( 120.GB * task.attempt, 'memory' ) } + time = { check_max( 36.h * task.attempt, 'time' ) } + } + withLabel:process_long { + time = { check_max( 48.h * task.attempt, 'time' ) } + } + withLabel:single_cpu { + cpus = { check_max( 1, 'cpus' ) } + } + withLabel:error_ignore { + errorStrategy = 'ignore' + } } diff --git a/conf/modules.config b/conf/modules.config new file mode 100644 index 00000000..69b294e2 --- /dev/null +++ b/conf/modules.config @@ -0,0 +1,203 @@ +/* + * -------------------------------------------------- + * Config file for defining DSL2 per module options + * -------------------------------------------------- + * + * Available keys to override module options: + * args = Additional arguments appended to command in module. + * args2 = Second set of arguments appended to command in module (multi-tool modules). + * publish_dir = Directory to publish results. + * publish_by_id = Publish results in separate folders by meta.id value. + * publish_files = Groovy map where key = "file_ext" and value = "directory" to publish results for that file extension + * The value of "directory" is appended to the standard "publish_dir" path as defined above. + * If publish_files == null (unspecified) - All files are published. + * If publish_files == false - No files are published. + * suffix = File name suffix for output files. 
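+ *     Example (illustrative values only, not an entry from this config): with
+ *         publish_files = ['tsv':'abundance_tables', 'log':'']
+ *     a module's *.tsv outputs would be copied to "<publish_dir>/abundance_tables/",
+ *     its *.log outputs to "<publish_dir>/" itself, and other output files would not be published.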
+ * + */ + +params { + modules { + 'fastqc' { + args = "--quiet" + publish_files = ['html':''] + } + 'multiqc' { + args = "" + publish_dir = "multiqc" + } + 'cutadapt' { + args = "--minimum-length 1" + publish_files = ['log':''] + } + 'cutadapt_readthrough' { + args = "--minimum-length 1" + suffix = ".read-through" + publish_files = ['log':''] + } + 'cutadapt_doubleprimer' { + args = "--discard-trimmed --minimum-length 1" + suffix = ".double-primer" + publish_files = ['log':''] + } + 'cutadapt_summary' { + publish_files = false + } + 'cutadapt_summary_merge' { + publish_files = null + } + 'cutadapt_taxonomy' { + args = "--discard-untrimmed --minimum-length 1" + publish_files = ['log':''] + } + 'dada2_quality' { + args = "5e+06" + publish_files = ['pdf':'QC','.args.txt':'args'] + } + 'dada2_filtntrim' { + args = 'maxN = 0, truncQ = 2, trimRight = 0, minQ = 0, rm.lowcomplex = 0, orient.fwd = NULL, matchIDs = FALSE, id.sep = "\\\\s", id.field = NULL, n = 1e+05, OMP = TRUE, qualityType = "Auto"' + publish_files = ['.args.txt':'args'] + } + 'dada2_err' { + args = 'nbases = 1e8, nreads = NULL, randomize = FALSE, MAX_CONSIST = 10, OMEGA_C = 0, qualityType = "Auto"' + publish_files = ['pdf':'QC','.args.txt':'args','convergence.txt':'QC','log':'log'] + } + 'dada2_dereplicate' { + args = "qualityType = \"Auto\"" + publish_files = false + } + 'dada2_denoising' { + // standard setting can be inspected with getDadaOpt(option = NULL) + args = 'selfConsist = FALSE, priors = character(0), DETECT_SINGLETONS = FALSE, GAPLESS = TRUE, GAP_PENALTY = -8, GREEDY = TRUE, KDIST_CUTOFF = 0.42, MATCH = 5, MAX_CLUST = 0, MAX_CONSIST = 10, MIN_ABUNDANCE = 1, MIN_FOLD = 1, MIN_HAMMING = 1, MISMATCH = -4, OMEGA_A = 1e-40, OMEGA_C = 1e-40, OMEGA_P = 1e-4, PSEUDO_ABUNDANCE = Inf, PSEUDO_PREVALENCE = 2, SSE = 2, USE_KMERS = TRUE, USE_QUALS = TRUE, VECTORIZED_ALIGNMENT = TRUE' + // setting from https://rdrr.io/bioc/dada2/man/mergePairs.html & https://rdrr.io/bioc/dada2/man/nwalign.html & match = getDadaOpt("MATCH"), mismatch = getDadaOpt("MISMATCH"), gap = getDadaOpt("GAP_PENALTY"), missing from the list below is: 'band = -1' + args2 = 'minOverlap = 12, maxMismatch = 0, returnRejects = FALSE, propagateCol = character(0), trimOverhang = FALSE, match = 1, mismatch = -64, gap = -64, homo_gap = NULL, endsfree = TRUE, vec = FALSE' + publish_files = ['.args.txt':'args','log':'log'] + } + 'dada2_rmchimera' { + args = 'method="consensus", minSampleFraction = 0.9, ignoreNNegatives = 1, minFoldParentOverAbundance = 2, minParentAbundance = 8, allowOneOff = FALSE, minOneOffParentDistance = 4, maxShift = 16' + publish_files = ['.args.txt':'args'] + } + 'dada2_stats' { + publish_files = false + } + 'dada2_merge' { + publish_files = null + } + 'merge_stats' { + publish_dir = './' + publish_files = null + } + 'itsx_cutasv' { + args = '-t all --preserve T --date F --positions F --graphical F --save_regions none' + } + 'dada2_taxonomy' { + args = 'minBoot = 50' + publish_files = ['.args.txt':'args','tsv':''] + } + 'dada2_addspecies' { + args = 'allowMultiple = FALSE, n = 1e5' + publish_files = ['.args.txt':'args','tsv':''] + } + 'qiime2_preptax' { + args = "" + publish_dir = 'qiime2/taxonomy' + publish_files = ['-classifier.qza':'','tsv':''] + } + 'qiime2_taxonomy' { + args = "" + publish_dir = 'qiime2/taxonomy' + publish_files = ['tsv':''] + } + 'qiime2_inasv' { + args = "" + publish_files = false + } + 'qiime2_inseq' { + args = "" + publish_files = false + } + 'qiime2_filtertaxa' { + args = "" + publish_files = false + } + 'filter_stats' { 
+ args = "" + publish_dir = 'qiime2/abundance_tables' + publish_files = ['tsv':''] + } + 'qiime2_barplot' { + args = "" + publish_dir = 'qiime2' + publish_files = null + } + 'qiime2_export_absolute' { + args = "" + publish_dir = 'qiime2' + publish_files = ['descriptive_stats.tsv':'representative_sequences','seven_number_summary.tsv':'representative_sequences','tsv':'abundance_tables','biom':'abundance_tables','rep-seq.fasta':'representative_sequences'] + } + 'qiime2_export_relasv' { + args = "" + publish_dir = 'qiime2/rel_abundance_tables' + publish_files = ['tsv':''] + } + 'qiime2_export_reltax' { + args = "" + publish_dir = 'qiime2/rel_abundance_tables' + publish_files = ['tsv':''] + } + 'combine_table' { + args = "" + publish_dir = 'qiime2/rel_abundance_tables' + publish_files = ['tsv':''] + } + 'qiime2_tree' { + args = "" + publish_dir = 'qiime2/phylogenetic_tree' + publish_files = null + } + 'qiime2_alphararefaction' { + args = "" + publish_dir = 'qiime2' + publish_files = null + } + 'qiime2_diversity_core' { + args = "" + publish_dir = 'qiime2/diversity' + publish_files = ['rarefaction.txt':''] + } + 'qiime2_diversity_alpha' { + args = "" + publish_dir = 'qiime2/diversity' + publish_files = null + } + 'qiime2_diversity_beta' { + args = "" + publish_dir = 'qiime2/diversity' + publish_files = null + } + 'qiime2_diversity_betaord' { + args = "" + publish_dir = 'qiime2/diversity' + publish_files = null + } + 'qiime2_filterasv' { + args = "" + publish_dir = 'qiime2' + publish_files = false + } + 'qiime2_ancom_tax' { + args = "" + publish_dir = 'qiime2' + publish_files = null + } + 'qiime2_ancom_asv' { + args = "" + publish_dir = 'qiime2' + publish_files = null + } + 'qiime2_intax' { + args = "" + publish_files = false + } + } +} diff --git a/conf/ref_databases.config b/conf/ref_databases.config index 5a1a55f4..64aed7a0 100644 --- a/conf/ref_databases.config +++ b/conf/ref_databases.config @@ -1,26 +1,106 @@ /* * ----------------------------------------------------------- - * Nextflow config file for 16S rRNA gene reference databases + * Nextflow config file for reference databases * ----------------------------------------------------------- * Defines sources and files for reference databases + * Please also reflect all changes in 'nextflow_schema.json' */ params { - genomes { - 'silva132' { - db_zip = "https://www.arb-silva.de/fileadmin/silva_databases/qiime/Silva_132_release.zip" - fasta = "SILVA_132_QIIME_release/rep_set/rep_set_16S_only/${params.dereplication}/silva_132_${params.dereplication}_16S.fna" - taxonomy = "SILVA_132_QIIME_release/taxonomy/16S_only/${params.dereplication}/consensus_taxonomy_7_levels.txt" + dada_ref_databases { + 'gtdb=05-RS95' { + file = [ "https://data.ace.uq.edu.au/public/gtdb/data/releases/release95/95.0/genomic_files_reps/bac120_ssu_reps_r95.tar.gz", "https://data.ace.uq.edu.au/public/gtdb/data/releases/release95/95.0/genomic_files_reps/ar122_ssu_reps_r95.tar.gz" ] + fmtscript = "taxref_reformat_gtdb.sh" + } + 'gtdb' { + file = [ "https://data.gtdb.ecogenomic.org/releases/latest/genomic_files_reps/bac120_ssu_reps.tar.gz", "https://data.gtdb.ecogenomic.org/releases/latest/genomic_files_reps/ar122_ssu_reps.tar.gz" ] + fmtscript = "taxref_reformat_gtdb.sh" + } + 'pr2=4.13.0' { + file = [ "https://github.com/pr2database/pr2database/releases/download/v4.13.0/pr2_version_4.13.0_18S_dada2.fasta.gz", "https://github.com/pr2database/pr2database/releases/download/v4.13.0/pr2_version_4.13.0_18S_UTAX.fasta.gz" ] + fmtscript = "taxref_reformat_pr2.sh" + } + 'pr2' { + 
file = [ "https://github.com/pr2database/pr2database/releases/download/v4.13.0/pr2_version_4.13.0_18S_dada2.fasta.gz", "https://github.com/pr2database/pr2database/releases/download/v4.13.0/pr2_version_4.13.0_18S_UTAX.fasta.gz" ] + fmtscript = "taxref_reformat_pr2.sh" + } + 'rdp=18' { + file = [ "https://zenodo.org/record/4310151/files/rdp_train_set_18.fa.gz", "https://zenodo.org/record/4310151/files/rdp_species_assignment_18.fa.gz" ] + fmtscript = "taxref_reformat_standard.sh" + } + 'rdp' { + file = [ "https://zenodo.org/record/4310151/files/rdp_train_set_18.fa.gz", "https://zenodo.org/record/4310151/files/rdp_species_assignment_18.fa.gz" ] + fmtscript = "taxref_reformat_standard.sh" + } + 'silva=132' { + file = [ "https://zenodo.org/record/1172783/files/silva_nr_v132_train_set.fa.gz", "https://zenodo.org/record/1172783/files/silva_species_assignment_v132.fa.gz" ] + fmtscript = "taxref_reformat_standard.sh" + } + 'silva=138' { + file = [ "https://zenodo.org/record/4587955/files/silva_nr99_v138.1_train_set.fa.gz", "https://zenodo.org/record/4587955/files/silva_species_assignment_v138.1.fa.gz" ] + fmtscript = "taxref_reformat_standard.sh" + } + 'silva' { + file = [ "https://zenodo.org/record/4587955/files/silva_nr99_v138.1_train_set.fa.gz", "https://zenodo.org/record/4587955/files/silva_species_assignment_v138.1.fa.gz" ] + fmtscript = "taxref_reformat_standard.sh" + } + 'unite-fungi=8.2' { + file = [ "https://files.plutof.ut.ee/public/orig/E7/28/E728E2CAB797C90A01CD271118F574B8B7D0DAEAB7E81193EB89A2AC769A0896.gz" ] + fmtscript = "taxref_reformat_unite.sh" + } + 'unite-fungi' { + file = [ "https://files.plutof.ut.ee/public/orig/E7/28/E728E2CAB797C90A01CD271118F574B8B7D0DAEAB7E81193EB89A2AC769A0896.gz" ] + fmtscript = "taxref_reformat_unite.sh" + } + 'unite-alleuk=8.2' { + file = [ "https://files.plutof.ut.ee/public/orig/F9/ED/F9EDE36E5209F469056675EBD672425BC06EACB7FE0C0D18F5A13E4CA632DCFA.gz" ] + fmtscript = "taxref_reformat_unite.sh" + } + 'unite-alleuk' { + file = [ "https://files.plutof.ut.ee/public/orig/F9/ED/F9EDE36E5209F469056675EBD672425BC06EACB7FE0C0D18F5A13E4CA632DCFA.gz" ] + fmtscript = "taxref_reformat_unite.sh" + } } - 'silva128' { - db_zip = "https://www.arb-silva.de/fileadmin/silva_databases/qiime/Silva_128_release.tgz" - fasta = "SILVA_128_QIIME_release/rep_set/rep_set_16S_only/${params.dereplication}/${params.dereplication}_otus_16S.fasta" - taxonomy = "SILVA_128_QIIME_release/taxonomy/16S_only/${params.dereplication}/consensus_taxonomy_7_levels.txt" + //QIIME2 taxonomic reference databases + qiime_ref_databases { + //SILVA for QIIME2 v2021.2, see https://docs.qiime2.org/2021.2/data-resources/#silva-16s-18s-rrna + 'silva=138' { + file = [ "https://data.qiime2.org/2021.2/common/silva-138-99-seqs.qza", "https://data.qiime2.org/2021.2/common/silva-138-99-tax.qza" ] + citation = "https://www.arb-silva.de/; Bokulich, N.A., Robeson, M., Dillon, M.R. bokulich-lab/RESCRIPt. Zenodo. http://doi.org/10.5281/zenodo.3891931" + license = "https://www.arb-silva.de/silva-license-information/" + fmtscript = "taxref_reformat_qiime_silva138.sh" + } + 'silva' { + file = [ "https://data.qiime2.org/2021.2/common/silva-138-99-seqs.qza", "https://data.qiime2.org/2021.2/common/silva-138-99-tax.qza" ] + citation = "https://www.arb-silva.de/; Bokulich, N.A., Robeson, M., Dillon, M.R. bokulich-lab/RESCRIPt. Zenodo. 
http://doi.org/10.5281/zenodo.3891931" + license = "https://www.arb-silva.de/silva-license-information/" + fmtscript = "taxref_reformat_qiime_silva138.sh" + } + //UNITE for QIIME2, see https://unite.ut.ee/repository.php + 'unite-fungi=8.2' { + file = [ "https://files.plutof.ut.ee/public/orig/98/AE/98AE96C6593FC9C52D1C46B96C2D9064291F4DBA625EF189FEC1CCAFCF4A1691.gz" ] + citation = "Abarenkov, Kessy; Zirk, Allan; Piirmann, Timo; Pöhönen, Raivo; Ivanov, Filipp; Nilsson, R. Henrik; Kõljalg, Urmas (2020): UNITE QIIME release for Fungi. Version 04.02.2020. UNITE Community. https://doi.org/10.15156/BIO/786385" + fmtscript = "taxref_reformat_qiime_unite.sh" + } + 'unite-fungi' { + file = [ "https://files.plutof.ut.ee/public/orig/98/AE/98AE96C6593FC9C52D1C46B96C2D9064291F4DBA625EF189FEC1CCAFCF4A1691.gz" ] + citation = "Abarenkov, Kessy; Zirk, Allan; Piirmann, Timo; Pöhönen, Raivo; Ivanov, Filipp; Nilsson, R. Henrik; Kõljalg, Urmas (2020): UNITE QIIME release for Fungi. Version 04.02.2020. UNITE Community. https://doi.org/10.15156/BIO/786385" + fmtscript = "taxref_reformat_qiime_unite.sh" + } + 'unite-alleuk=8.2' { + file = [ "https://files.plutof.ut.ee/public/orig/6E/0E/6E0EDD5592003B47C70A1B384C3C784AA32B726AC861CD7E2BD22AEB0278675E.gz" ] + citation = "Abarenkov, Kessy; Zirk, Allan; Piirmann, Timo; Pöhönen, Raivo; Ivanov, Filipp; Nilsson, R. Henrik; Kõljalg, Urmas (2020): UNITE QIIME release for eukaryotes. Version 04.02.2020. UNITE Community. https://doi.org/10.15156/BIO/786386" + fmtscript = "taxref_reformat_qiime_unite.sh" + } + 'unite-alleuk' { + file = [ "https://files.plutof.ut.ee/public/orig/6E/0E/6E0EDD5592003B47C70A1B384C3C784AA32B726AC861CD7E2BD22AEB0278675E.gz" ] + citation = "Abarenkov, Kessy; Zirk, Allan; Piirmann, Timo; Pöhönen, Raivo; Ivanov, Filipp; Nilsson, R. Henrik; Kõljalg, Urmas (2020): UNITE QIIME release for eukaryotes. Version 04.02.2020. UNITE Community. https://doi.org/10.15156/BIO/786386" + fmtscript = "taxref_reformat_qiime_unite.sh" + } + 'greengenes85' { + file = [ "https://data.qiime2.org/2021.2/tutorials/training-feature-classifiers/85_otus.fasta", "https://data.qiime2.org/2021.2/tutorials/training-feature-classifiers/85_otu_taxonomy.txt" ] + citation = "McDonald, D., Price, M., Goodrich, J. et al. An improved Greengenes taxonomy with explicit ranks for ecological and evolutionary analyses of bacteria and archaea. ISME J 6, 610–618 (2012). 
https://doi.org/10.1038/ismej.2011.139" + fmtscript = "taxref_reformat_qiime_greengenes85.sh" + } } - 'greengenes' { - db_zip = "ftp://greengenes.microbio.me/greengenes_release/gg_13_5/gg_13_8_otus.tar.gz" - fasta = "gg_13_8_otus/rep_set/${params.dereplication}_otus.fasta" - taxonomy = "gg_13_8_otus/taxonomy/${params.dereplication}_otu_taxonomy.txt" - } - } } diff --git a/conf/test.config b/conf/test.config index 521c3cd1..41254cc8 100644 --- a/conf/test.config +++ b/conf/test.config @@ -8,24 +8,24 @@ */ params { - config_profile_name = 'Test profile' - config_profile_description = 'Minimal test dataset to check pipeline function' - // Limit resources so that this can run on GitHub Actions - max_cpus = 2 - max_memory = 6.GB - max_time = 48.h + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h - // Input data - FW_primer = "GTGYCAGCMGCCGCGGTAA" - RV_primer = "GGACTACNVGGGTWTCTAAT" - classifier = "https://github.com/nf-core/test-datasets/raw/ampliseq/testdata/GTGYCAGCMGCCGCGGTAA-GGACTACNVGGGTWTCTAAT-gg_13_8-85-qiime2_2019.7-classifier.qza" - metadata = "https://github.com/nf-core/test-datasets/raw/ampliseq/testdata/Metadata.tsv" - outdir = "./results" - temp_dir = "./results/tmp_dir" - readPaths = [ - ['1_S103', ['https://github.com/nf-core/test-datasets/raw/ampliseq/testdata/1_S103_L001_R1_001.fastq.gz', 'https://github.com/nf-core/test-datasets/raw/ampliseq/testdata/1_S103_L001_R2_001.fastq.gz']], - ['1a_S103', ['https://github.com/nf-core/test-datasets/raw/ampliseq/testdata/1a_S103_L001_R1_001.fastq.gz', 'https://github.com/nf-core/test-datasets/raw/ampliseq/testdata/1a_S103_L001_R2_001.fastq.gz']], - ['2_S115', ['https://github.com/nf-core/test-datasets/raw/ampliseq/testdata/2_S115_L001_R1_001.fastq.gz', 'https://github.com/nf-core/test-datasets/raw/ampliseq/testdata/2_S115_L001_R2_001.fastq.gz']], - ['2a_S115', ['https://github.com/nf-core/test-datasets/raw/ampliseq/testdata/2a_S115_L001_R1_001.fastq.gz', 'https://github.com/nf-core/test-datasets/raw/ampliseq/testdata/2a_S115_L001_R2_001.fastq.gz']] - ] + // Input data + FW_primer = "GTGYCAGCMGCCGCGGTAA" + RV_primer = "GGACTACNVGGGTWTCTAAT" + input = "https://github.com/nf-core/test-datasets/raw/ampliseq/samplesheets/Samplesheet.tsv" + metadata = "https://github.com/nf-core/test-datasets/raw/ampliseq/samplesheets/Metadata.tsv" + dada_ref_taxonomy = "rdp=18" + cut_dada_ref_taxonomy = true + qiime_ref_taxonomy = "greengenes85" + + //this is to remove low abundance ASVs to reduce runtime of downstream processes + min_samples = 2 + min_frequency = 10 } diff --git a/conf/test_doubleprimers.config b/conf/test_doubleprimers.config index 4f25e592..544da204 100644 --- a/conf/test_doubleprimers.config +++ b/conf/test_doubleprimers.config @@ -8,23 +8,18 @@ */ params { - config_profile_name = 'Test doubleprimers profile' - config_profile_description = 'Minimal test dataset to check pipeline function when removing double primers' - // Limit resources so that this can run on GitHub Actions - max_cpus = 2 - max_memory = 6.GB - max_time = 48.h + config_profile_name = 'Test doubleprimers profile' + config_profile_description = 'Minimal test dataset to check pipeline function when removing double primers' + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h - // Input data - FW_primer = "NNNNCCTAHGGGRBGCAGCAG" - 
RV_primer = "GACTACHVGGGTATCTAATCC" - double_primer = true - classifier = "https://github.com/nf-core/test-datasets/raw/ampliseq/testdata/GTGYCAGCMGCCGCGGTAA-GGACTACNVGGGTWTCTAAT-gg_13_8-85-qiime2_2019.7-classifier.qza" - metadata = false - outdir = "./results" - temp_dir = "./results/tmp_dir" - readPaths = [ - ['testdoubleprimers-1003', ['https://github.com/nf-core/test-datasets/raw/ampliseq/testdata/testdoubleprimers-1003_L001_R1_001.fastq.gz', 'https://github.com/nf-core/test-datasets/raw/ampliseq/testdata/testdoubleprimers-1003_L001_R2_001.fastq.gz']], - ['testdoubleprimers-1006', ['https://github.com/nf-core/test-datasets/raw/ampliseq/testdata/testdoubleprimers-1006_L001_R1_001.fastq.gz', 'https://github.com/nf-core/test-datasets/raw/ampliseq/testdata/testdoubleprimers-1006_L001_R2_001.fastq.gz']] - ] + // Input data + FW_primer = "NNNNCCTAHGGGRBGCAGCAG" + RV_primer = "GACTACHVGGGTATCTAATCC" + double_primer = true + dada_ref_taxonomy = false + input = "https://github.com/nf-core/test-datasets/raw/ampliseq/samplesheets/Samplesheet_double_primer.tsv" + trunc_qmin = 30 } diff --git a/conf/test_full.config b/conf/test_full.config index 8c606cfb..1e736c71 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -8,14 +8,19 @@ */ params { - config_profile_name = 'Full test profile' - config_profile_description = 'Full test dataset to check pipeline function' + config_profile_name = 'Full test profile' + config_profile_description = 'Full test dataset to check pipeline function' - // Input data for full size test - // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. SRA) - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input_paths = [ - ['Testdata', ['https://github.com/nf-core/test-datasets/raw/exoseq/testdata/Testdata_R1.tiny.fastq.gz', 'https://github.com/nf-core/test-datasets/raw/exoseq/testdata/Testdata_R2.tiny.fastq.gz']], - ['SRR389222', ['https://github.com/nf-core/test-datasets/raw/methylseq/testdata/SRR389222_sub1.fastq.gz', 'https://github.com/nf-core/test-datasets/raw/methylseq/testdata/SRR389222_sub2.fastq.gz']] - ] + // Input data for full size test + FW_primer = "GTGYCAGCMGCCGCGGTAA" + RV_primer = "GGACTACNVGGGTWTCTAAT" + input = "https://github.com/nf-core/test-datasets/raw/ampliseq/samplesheets/Samplesheet_full.tsv" + metadata = "https://github.com/nf-core/test-datasets/raw/ampliseq/samplesheets/Metadata_full.tsv" + dada_ref_taxonomy = "rdp" + qiime_ref_taxonomy = "greengenes85" + trunc_qmin = 35 + + //this is to remove very low abundance and low prevalence ASVs to reduce runtime of downstream processes + min_samples = 3 + min_frequency = 30 } diff --git a/conf/test_iontorrent.config b/conf/test_iontorrent.config new file mode 100644 index 00000000..ce5ed698 --- /dev/null +++ b/conf/test_iontorrent.config @@ -0,0 +1,27 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test. 
Use as follows: + * nextflow run nf-core/ampliseq -profile test_iontorrent + */ + +params { + config_profile_name = 'Test profile single-end ionTorrent reads' + config_profile_description = 'Minimal test dataset to check pipeline function with single-end ionTorrent sequences' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h + + // Input data + FW_primer = "GTGARTCATCGARTCTTTG" + RV_primer = "GCATATCAATAAGSSGAGGA" + dada_ref_taxonomy = "unite-fungi" + input = "https://github.com/nf-core/test-datasets/raw/ampliseq/samplesheets/Samplesheet_it_SE_ITS.tsv" + iontorrent = true + max_ee = 5 + skip_qiime = true +} diff --git a/conf/test_manifest.config b/conf/test_manifest.config deleted file mode 100644 index e8bc4b2f..00000000 --- a/conf/test_manifest.config +++ /dev/null @@ -1,25 +0,0 @@ -/* - * ------------------------------------------------- - * Nextflow config file for running tests - * ------------------------------------------------- - * Defines bundled input files and everything required - * to run a fast and simple test. Use as follows: - * nextflow run nf-core/ampliseq -profile test_manifest - */ - -params { - config_profile_name = 'Test profile manifest' - config_profile_description = 'Minimal test dataset to check pipeline function with input QIIME2 manifest file' - // Limit resources so that this can run on Travis - max_cpus = 2 - max_memory = 6.GB - max_time = 48.h - // Input data - FW_primer = "GTGYCAGCMGCCGCGGTAA" - RV_primer = "GGACTACNVGGGTWTCTAAT" - classifier = "https://github.com/nf-core/test-datasets/raw/ampliseq/testdata/GTGYCAGCMGCCGCGGTAA-GGACTACNVGGGTWTCTAAT-gg_13_8-85-qiime2_2019.7-classifier.qza" - metadata = "https://github.com/nf-core/test-datasets/raw/ampliseq/testdata/Metadata_manifest.tsv" - outdir = "./results" - temp_dir = "./results/tmp_dir" - manifest = "https://github.com/nf-core/test-datasets/raw/ampliseq/testdata/Manifest.tsv" -} diff --git a/conf/test_multi.config b/conf/test_multi.config index 4c2d6816..af40663c 100644 --- a/conf/test_multi.config +++ b/conf/test_multi.config @@ -4,30 +4,21 @@ * ------------------------------------------------- * Defines bundled input files and everything required * to run a fast and simple test. 
Use as follows: - * nextflow run nf-core/ampliseq -profile test + * nextflow run nf-core/ampliseq -profile test_multi */ params { - config_profile_name = 'Test_multi profile' - config_profile_description = 'Test dataset for multiple sequencing runs to check pipeline function' - // Limit resources so that this can run on Travis - max_cpus = 2 - max_memory = 6.GB - max_time = 48.h - // Input data - FW_primer = "GTGYCAGCMGCCGCGGTAA" - RV_primer = "GGACTACNVGGGTWTCTAAT" - classifier = "https://github.com/nf-core/test-datasets/raw/ampliseq/testdata/GTGYCAGCMGCCGCGGTAA-GGACTACNVGGGTWTCTAAT-gg_13_8-85-qiime2_2019.7-classifier.qza" - metadata = "https://github.com/nf-core/test-datasets/raw/ampliseq/testdata/Metadata_multi.tsv" - outdir = "./results" - temp_dir = "./results/tmp_dir" + config_profile_name = 'Test profile for multiple sequencing runs' + config_profile_description = 'Test dataset for multiple sequencing runs to check pipeline function' - - readPaths = [ - ['1_S103', ['https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/testdata/run1/1_S103_L001_R1_001.fastq.gz', 'https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/testdata/run1/1_S103_L001_R2_001.fastq.gz'], 'run1'], - ['1a_S103', ['https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/testdata/run1/1a_S103_L001_R1_001.fastq.gz', 'https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/testdata/run1/1a_S103_L001_R2_001.fastq.gz'], 'run1'], - ['2_S115', ['https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/testdata/run2/2_S115_L001_R1_001.fastq.gz', 'https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/testdata/run2/2_S115_L001_R2_001.fastq.gz'], 'run2'], - ['2a_S115', ['https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/testdata/run2/2a_S115_L001_R1_001.fastq.gz', 'https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/testdata/run2/2a_S115_L001_R2_001.fastq.gz'], 'run2'] - ] - multipleSequencingRuns = true + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h + + // Input data + FW_primer = "GTGYCAGCMGCCGCGGTAA" + RV_primer = "GGACTACNVGGGTWTCTAAT" + dada_ref_taxonomy = false + input = "https://github.com/nf-core/test-datasets/raw/ampliseq/samplesheets/Samplesheet_multi.tsv" } diff --git a/conf/test_pacbio_its.config b/conf/test_pacbio_its.config index 3d7536a3..e25d1c16 100644 --- a/conf/test_pacbio_its.config +++ b/conf/test_pacbio_its.config @@ -8,24 +8,21 @@ */ params { - config_profile_name = 'Test profile PacBio ITS' - config_profile_description = 'Minimal test dataset to check pipeline function with PacBio ITS sequences' - // Limit resources so that this can run on Travis - max_cpus = 2 - max_memory = 6.GB - max_time = 48.h + config_profile_name = 'Test profile PacBio ITS' + config_profile_description = 'Minimal test dataset to check pipeline function with PacBio ITS sequences and option --cut_its enabled' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h - // Input data - FW_primer = "CTTGGTCATTTAGAGGAAGTAA" - RV_primer = "CGAAGTTTCCCTCAGGA" - classifier = "https://github.com/nf-core/test-datasets/raw/ampliseq/testdata/CTTGGTCATTTAGAGGAAGTAA-CGAAGTTTCCCTCAGGA-UNITE-classifier.qza" - metadata = "https://github.com/nf-core/test-datasets/raw/ampliseq/testdata/Metadata_pacbio_ITS.tsv" - outdir = "./results" - temp_dir = "./results/tmp_dir" - manifest = 
"https://github.com/nf-core/test-datasets/raw/ampliseq/testdata/Manifest_pacbio_ITS.tsv" - single_end = true - pacbio = true - trunclenf = 0 - trunclenr = 0 - maxEE = 12 + // Input data + FW_primer = "CTTGGTCATTTAGAGGAAGTAA" + RV_primer = "CGAAGTTTCCCTCAGGA" + dada_ref_taxonomy = "unite-fungi" + input = "https://github.com/nf-core/test-datasets/raw/ampliseq/samplesheets/Samplesheet_pacbio_ITS.tsv" + pacbio = true + max_ee = 12 + cut_its = true + skip_qiime = true } diff --git a/docs/README.md b/docs/README.md index af948e68..e1700973 100644 --- a/docs/README.md +++ b/docs/README.md @@ -6,10 +6,5 @@ The nf-core/ampliseq documentation is split into the following pages: * An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. * [Output](output.md) * An overview of the different results produced by the pipeline and how to interpret them. -* [Troubleshooting](https://nf-co.re/usage/troubleshooting) You can find a lot more documentation about installing, configuring and running nf-core pipelines on the website: [https://nf-co.re](https://nf-co.re) - -## Credits - -These scripts were originally written for use at the [Quantitative Biology Center (QBiC)](http://www.qbic.life) and [Microbial Ecology, Center for Applied Geosciences](http://www.uni-tuebingen.de/de/104325), part of Eberhard Karls Universität Tübingen (Germany) by Daniel Straub ([@d4straub](https://github.com/d4straub)) and Alexander Peltzer ([@apeltzer](https://github.com/apeltzer)). diff --git a/docs/images/nf-core-ampliseq_logo.png b/docs/images/nf-core-ampliseq_logo.png index b5e1cdac..f0e4dd71 100644 Binary files a/docs/images/nf-core-ampliseq_logo.png and b/docs/images/nf-core-ampliseq_logo.png differ diff --git a/docs/output.md b/docs/output.md index 3369b79c..4d407eeb 100644 --- a/docs/output.md +++ b/docs/output.md @@ -1,9 +1,5 @@ # nf-core/ampliseq: Output -## :warning: Please read this documentation on the nf-core website: [https://nf-co.re/ampliseq/output](https://nf-co.re/ampliseq/output) - -> _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._ - ## Introduction This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. @@ -20,18 +16,20 @@ and processes data using the following steps: * [FastQC](#fastqc) - Read quality control * [Cutadapt](#cutadapt) - Primer trimming * [MultiQC](#multiqc) - Aggregate report describing results - * [QIIME2](#qiime2) - Import & quality control - * [DADA2](#dada2) - Infer Amplicon Sequence Variants (ASVs) - * [Taxonomic classification](#taxonomic-classification) - Taxonomical classification of ASVs - * [Exclude taxa](#exclude-taxa) - Remove unwanted ASV based on taxonomy - * [Relative abundance tables](#relative-abundance-tables) - Exported relative abundance tables - * [Barplot](#barplot) - Interactive barplot - * [Alpha diversity rarefaction curves](#alpha-diversity-rarefaction-curves) - Rarefaction curves for quality control - * [Alpha diversity indices](#alpha-diversity-indices) - Diversity within samples - * [Beta diversity indices](#beta-diversity-indices) - Diversity between samples (e.g. 
PCoA plots) - * [ANCOM](#ancom) - Differential abundance analysis + * [DADA2](#dada2) - Infer Amplicon Sequence Variants (ASVs) and taxonomic classification + * [ITSx](#itsx) - Optionally, taxonomic classification can be performed on ITS region only + * [QIIME2](#qiime2) - Secondary analysis + * [Taxonomic classification](#taxonomic-classification) - Taxonomical classification of ASVs + * [Exclude taxa](#exclude-taxa) - Remove unwanted ASV based on taxonomy + * [Relative abundance tables](#relative-abundance-tables) - Exported relative abundance tables + * [Barplot](#barplot) - Interactive barplot + * [Alpha diversity rarefaction curves](#alpha-diversity-rarefaction-curves) - Rarefaction curves for quality control + * [Diversity analysis](#diversity-analysis) - High level overview with different diversity indices + * [Alpha diversity indices](#alpha-diversity-indices) - Diversity within samples + * [Beta diversity indices](#beta-diversity-indices) - Diversity between samples (e.g. PCoA plots) + * [ANCOM](#ancom) - Differential abundance analysis + * [Read count report](#Read-count-report) - Report of read counts during various steps of the pipeline * [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution - * [More help](#more-help) * [Citations](#citations) ### FastQC @@ -44,8 +42,6 @@ For further reading and documentation see the [FastQC help pages](http://www.bio * `fastqc/` * `*_fastqc.html`: FastQC report containing quality metrics for your untrimmed raw fastq files. -* `fastqc/zips/` - * `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. > **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. @@ -55,7 +51,9 @@ For further reading and documentation see the [FastQC help pages](http://www.bio **Output files:** -* `trimmed/logs/`: directory containing log files with retained reads, trimming percentage, etc. for each sample. +* `cutadapt/`: directory containing log files with retained reads, trimming percentage, etc. for each sample. + * `cutadapt_summary.tsv`: Summary of read numbers that pass cutadapt. + * `assignTaxonomy.cutadapt.log`: Contains how many expected amplified sequences were extracted from the DADA2 reference taxonomy database. Optional. ### MultiQC @@ -70,68 +68,75 @@ For more information about how to use MultiQC reports, see [https://multiqc.info * `multiqc/` * `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. * `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. - * `multiqc_plots/`: directory containing static images from the report in various formats. -### QIIME2 +### DADA2 -**Quantitative Insights Into Microbial Ecology 2** ([QIIME2](https://qiime2.org/)) is a next-generation microbiome bioinformatics platform and the successor of the widely used [QIIME1](https://www.nature.com/articles/nmeth.f.303). QIIME2 is currently **under heavy development** and often updated, this version of ampliseq uses QIIME2 2019.10. QIIME2 has a wide variety of analysis tools available and has excellent support in its [forum](https://docs.qiime2.org/2019.10/). +[DADA2](https://www.nature.com/articles/nmeth.3869) performs fast and accurate sample inference from amplicon data with single-nucleotide resolution. 
It infers exact amplicon sequence variants (ASVs) from amplicon data with fewer false positives than many other methods while maintaining high sensitivity. -At this point of the analysis the trimmed reads are imported into QIIME2 and an interactive quality plot is made. +DADA2 computes an error model on the sequencing reads (forward and reverse independently), therefore quality filtering or paired read merging may not be performed before. Each sequencing run varies in their error profile and it is recommended that DADA2 runs separately on data from each run individually. It is recommended to use the ampliseq option `--multiple_sequencing_runs` to analyse such data. -**Output files:** +DADA2 reduces sequence errors and dereplicates sequences by quality filtering, denoising, read pair merging (for paired end Illumina reads only) and PCR chimera removal. -* `demux/` - * `index.html`: Quality plots that can be viewed in your web browser. - * `demux.qza` (only when --untilQ2import is true): QIIME2 artefact for imported reads. +Additionally, DADA2 taxonomically classifies the ASVs using a choice of supplied databases (specified with `--dada_ref_taxonomy`). -All following analysis steps are performed in QIIME2, except DADA2 in the case of pacbio data. +**Output files:** -### DADA2 +* `dada2/` + * `ASV_seqs.fasta`: Fasta file with ASV sequences. + * `ASV_table.tsv`: Counts for each ASV sequence. + * `ASV_tax.tsv`: Taxonomic classification for each ASV sequence. + * `ASV_tax_species.tsv`: Species classification for each ASV sequence. + * `DADA2_stats.tsv`: Tracking read numbers through DADA2 processing steps, for each sample. + * `DADA2_table.rds`: DADA2 ASV table as R object. + * `DADA2_tables.tsv`: DADA2 ASV table. +* `dada2/args/`: Directory containing files with all parameters for DADA2 steps. +* `dada2/log/`: Directory containing log files for DADA2 steps. +* `dada2/QC/` + * `*.err.convergence.txt`: Convergence values for DADA2's dada command, should reduce over several magnitudes and approaching 0. + * `*.err.pdf`: Estimated error rates for each possible transition. The black line shows the estimated error rates after convergence of the machine-learning algorithm. The red line shows the error rates expected under the nominal definition of the Q-score. The estimated error rates (black line) should be a good fit to the observed rates (points), and the error rates should drop with increased quality. + * `*_qual_stats.pdf`: Overall read quality profiles: heat map of the frequency of each quality score at each base position. The mean quality score at each position is shown by the green line, and the quartiles of the quality score distribution by the orange lines. The red line shows the scaled proportion of reads that extend to at least that position. + +#### ITSx + +Optionally, the ITS region can be extracted from each ASV sequence using ITSx, and taxonomic classification is performed based on the ITS sequence. -[DADA2](https://www.nature.com/articles/nmeth.3869) performs fast and accurate sample inference from amplicon data with single-nucleotide resolution. It infers exact amplicon sequence variants (ASVs) from amplicon data with fewer false positives than many other methods while maintaining high sensitivity. +**Output files:** -DADA2 computes an error model on the sequencing reads (forward and reverse independently), therefore quality filtering or paired read merging may not be performed before. 
Each sequencing run varies in their error profile and it is recommended that DADA2 runs separately on data from each run individually. It is recommended to use the ampliseq option `--multipleSequencingRuns` to analyse such data. +* `itsx/` + * `ASV_ITS_seqs.full.fasta`: Fasta file with ITS region from each ASV sequence. +* `dada2/` + * `ASV_ITS_tax.tsv`: Taxonomic classification with ITS region of each ASV sequence. + * `ASV_ITS_tax_species.tsv`: Species classification with ITS region of each ASV sequence. -DADA2 reduces sequence errors and dereplicates sequences by quality filtering, denoising, read pair merging (for paired end Illumina reads only) and PCR chimera removal. +### QIIME2 -**Output files:** +**Quantitative Insights Into Microbial Ecology 2** ([QIIME2](https://qiime2.org/)) is a next-generation microbiome bioinformatics platform and the successor of the widely used [QIIME1](https://www.nature.com/articles/nmeth.f.303). -* `representative_sequences/unfiltered/` - * `sequences.fasta`: Fasta file with ASV sequences. - * `index.html`: ASV IDs, sequences and blast results in an interactive table that can be viewed in your web browser. - * `rep-seqs.qza`: QIIME2 data artefact. -* `abundance-table/unfiltered/` - * `dada_report.txt`: DADA2 verbose output. - * `dada_stats.tsv`: Tab-separated table of DADA2 statistics. - * `feature-table.biom`: Abundance table in biom format for importing into downstream analysis tools. - * `feature-table.tsv`: Tab-separated abundance table for each ASV and each sample. - * `rel-feature-table.biom`: Relative abundance table in biom format for importing into downstream analysis tools. - * `rel-feature-table.tsv`: Tab-separated relative abundance table for each ASV and each sample. - * `table.qza`: QIIME2 data artefact. +ASV sequences, counts, and taxonomic classification as produced before with DADA2 are imported into QIIME2 and further analysed. Optionally, ASVs can be taxonomically classified also with QIIME2 against a database chosen with `--qiime_ref_taxonomy` (but DADA2 taxonomic classification takes precedence). Next, ASVs are filtered (`--exclude_taxa`, `--min_frequency`, `--min_samples`), and abundance tables are exported. Following, diversity indices are calculated and testing for differential abundant features between sample groups is performed. -### Taxonomic classification +#### Taxonomic classification -ASV abundance and sequences inferred in DADA2 are informative but routinely taxonomic classifications such as family or genus annotation is desireable. ASV sequences are classified by default against the [SILVA](https://www.arb-silva.de/) [v132](https://www.arb-silva.de/documentation/release-132/) database to add taxonomic information, but a custom database is used if provided. In particular, a [UNITE](https://unite.ut.ee/repository.php) fasta file can be provided to classify fungal ITS sequences. +Taxonomic classification with QIIME2 is typically similar to DADA2 classifications. However, both options are available. When taxonomic classification with DADA2 and QIIME2 is performed, DADA2 classification takes precedence over QIIME2 classifications for all downstream analysis. **Output files:** * `taxonomy/` * `taxonomy.tsv`: Tab-separated table with taxonomic classification for each ASV - * `index.html`: ASV IDs with taxonomic classification in an interactive table that can be viewed in your web browser + * `*-classifier.qza`: QIIME2 artefact of the trained classifier. 
Can be supplied to other pipeline runs with `--classifier` -### Exclude taxa +#### Exclude taxa -Removes unwanted taxa in DADA2 output sequences and abundance tables by taxonomic classification. Unwanted taxa are often off-targets generated in PCR with primers that are not perfectly specific for the target DNA. For example, PCR with commonly used primers also amplifyies mitrochindrial or chloroplast rRNA genes and therefore leads to non-bacteria products. These mitrochondria or chloroplast amplicons are removed in this step. +Removes unwanted taxa in DADA2 output sequences and abundance tables by taxonomic classification. Unwanted taxa are often off-targets generated in PCR with primers that are not perfectly specific for the target DNA. For example, PCR with commonly used primers also amplifies mitochondrial or chloroplast rRNA genes and therefore leads to non-bacteria products. These mitochondria or chloroplast amplicons are removed in this step by default (`--exclude_taxa`). All following analysis is based on these filtered tables. **Output files:** -* `representative_sequences/filtered/` - * `sequences.fasta`: Fasta file with ASV sequences. - * `index.html`: ASV IDs, sequences and blast results in an interactive table that can be viewed in your web browser. - * `rep-seqs.qza`: QIIME2 data artefact. -* `abundance-table/filtered/` +* `qiime2/representative_sequences/` + * `rep-seq.fasta`: Fasta file with ASV sequences. + * `descriptive_stats.tsv`: Length, mean, etc. of ASV sequences. + * `seven_number_summary.tsv`: Length of ASV sequences in different quantiles. +* `qiime2/abundance_tables/` * `abs-abund-table-2.tsv`: Tab-separated absolute abundance table at phylum level. * `abs-abund-table-3.tsv`: Tab-separated absolute abundance table at class level. * `abs-abund-table-4.tsv`: Tab-separated absolute abundance table at order level. @@ -141,15 +146,14 @@ All following analysis is based on these filtered tables. * `count_table_filter_stats.tsv`: Tab-separated table with information on how much counts were filtered for each sample. * `feature-table.biom`: Abundance table in biom format for importing into downstream analysis tools. * `feature-table.tsv`: Tab-separated abundance table for each ASV and each sample. - * `table.qza`: QIIME2 data artefact. -### Relative abundance tables +#### Relative abundance tables Absolute abundance tables produced by the previous steps contain count data, but the compositional nature of 16S rRNA amplicon sequencing requires sequencing depth normalisation. This step computes relative abundance tables for various taxonomic levels and a detailed table for all ASVs with taxonomic classification, sequence and relative abundance for each sample. Typically used for in depth investigation of taxa abundances. **Output files:** -* `rel_abundance_tables/` +* `qiime2/rel_abundance_tables/` * `rel-table-2.tsv`: Tab-separated relative abundance table at phylum level. * `rel-table-3.tsv`: Tab-separated relative abundance table at class level. * `rel-table-4.tsv`: Tab-separated relative abundance table at order level. @@ -157,41 +161,53 @@ Absolute abundance tables produced by the previous steps contain count data, but * `rel-table-6.tsv`: Tab-separated relative abundance table at genus level. * `rel-table-7.tsv`: Tab-separated relative abundance table at species level. * `rel-table-ASV.tsv`: Tab-separated relative abundance table for all ASVs. - * `qiime2_ASV_table.tsv`: Tab-separated table for all ASVs with taxonomic classification, sequence and relative abundance.
+ * `qiime2_ASV_table.tsv`: Tab-separated table for all ASVs with taxonomic classification, sequence and relative abundance. *NOTE: This file is based on QIIME2 taxonomic classifications, contrary to all other files that are based on DADA2 classification, if available.* -### Barplot +#### Barplot Produces an interactive abundance plot count tables that aids exploratory browsing the discovered taxa and their abundance in samples and allows sorting for associated meta data. **Output files:** -* `barplot/` +* `qiime2/barplot/` * `index.html`: Interactive barplot for taxa abundance per sample that can be viewed in your web browser. -### Alpha diversity rarefaction curves +#### Alpha diversity rarefaction curves Produces rarefaction plots for several alpha diversity indices, and is primarily used to determine if the richness of the samples has been fully observed or sequenced. If the slope of the curves does not level out and the lines do not becomes horizontal, this might be because the sequencing depth was too low to observe all diversity or that sequencing error artificially increases sequence diversity and causes false discoveries. **Output files:** -* `alpha-rarefaction/` +* `qiime2/alpha-rarefaction/` * `index.html`: Interactive alphararefaction curve for taxa abundance per sample that can be viewed in your web browser. -### Alpha diversity indices +#### Diversity analysis + +Diversity measures summarize important sample features (alpha diversity) or differences between samples (beta diversity). To do so, sample data is first rarefied to the minimum number of counts per sample. Also, a phylogenetic tree of all ASVs is computed to provide phylogenetic information. + +**Output files:** + +* `qiime2/diversity/` + * `Use the sampling depth of * for rarefaction.txt`: File that reports the rarefaction depth in the file name and file content. +* `qiime2/phylogenetic_tree/` + * `tree.nwk`: Phylogenetic tree in newick format. + * `rooted-tree.qza`: Phylogenetic tree in QIIME2 format. + +##### Alpha diversity indices -Alpha diversity measures the species diversity within samples. This step calculates alpha diversity using various methods and performs pairwise comparisons of groups of samples. +Alpha diversity measures the species diversity within samples. Diversity calculations are based on sub-sampled data rarefied to the minimum read count of all samples. This step calculates alpha diversity using various methods and performs pairwise comparisons of groups of samples. It is based on a phylogenetic tree of all ASV sequences. **Output files:** -* `alpha-diversity` +* `qiime2/diversity/alpha_diversity/` * `evenness_vector/index.html`: Pielou’s Evenness. * `faith_pd_vector/index.html`: Faith’s Phylogenetic Diversity (qualitiative, phylogenetic). * `observed_otus_vector/index.html`: Observed OTUs (qualitative). * `shannon_vector/index.html`: Shannon’s diversity index (quantitative). -### Beta diversity indices +##### Beta diversity indices -Beta diversity measures the species community differences between samples. This step calculates beta diversity distances using various methods and performs pairwise comparisons of groups of samples. Additionally principle coordinates analysis (PCoA) plots are produced that can be visualized with [Emperor](https://biocore.github.io/emperor/build/html/index.html) in your default browser without the need for installation. +Beta diversity measures the species community differences between samples. 
Diversity calculations are based on sub-sampled data rarefied to the minimum read count of all samples. This step calculates beta diversity distances using various methods and performs pairwise comparisons of groups of samples. Additionally, principal coordinates analysis (PCoA) plots are produced that can be visualized with [Emperor](https://biocore.github.io/emperor/build/html/index.html) in your default browser without the need for installation. These calculations are based on a phylogenetic tree of all ASV sequences. **The following methods are used to calculate community dissimilarities:** @@ -202,25 +218,33 @@ Beta diversity measures the species community differences between samples. This **Output files:** -* `beta-diversity/` - * `_distance_matrix-/index.html` - * `_pcoa_results-PCoA/index.html` +* `qiime2/diversity/beta_diversity/` + * `_distance_matrix-/index.html`: Box plots and significance analysis (PERMANOVA). + * `_pcoa_results-PCoA/index.html`: Interactive PCoA plot. * method: bray_curtis, jaccard, unweighted_unifrac, weighted_unifrac * treatment: depends on your metadata sheet or what metadata categories you have specified -### ANCOM +#### ANCOM Analysis of Composition of Microbiomes ([ANCOM](https://www.ncbi.nlm.nih.gov/pubmed/26028277)) is applied to identify features that are differentially abundant across sample groups. A key assumption made by ANCOM is that few taxa (less than about 25%) will be differentially abundant between groups otherwise the method will be inaccurate. -ANCOM is applied to each suitable or specified metadata column for 6 taxonomic levels. +ANCOM is applied to each suitable or specified metadata column for 5 taxonomic levels (2-6). **Output files:** -* `ancom/` - * `Category--/index.html` +* `qiime2/ancom/` + * `Category--/index.html`: Statistical results and interactive Volcano plot. * treatment: depends on your metadata sheet or what metadata categories you have specified * taxonomic level: level-2 (phylum), level-3 (class), level-4 (order), level-5 (family), level-6 (genus), ASV +## Read count report + +This report includes information on how many reads per sample passed each pipeline step in which a loss can occur. Specifically, how many read pairs entered cutadapt, were reverse complemented, passed trimming; how many read pairs entered DADA2, were denoised, merged and non-chimeric; and how many counts were lost while excluding unwanted taxa and removing low abundance/prevalence sequences in QIIME2. + +**Output files:** + +* `overall_summary.tsv`: Tab-separated file with count summary. + ## Pipeline information [Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. @@ -232,21 +256,6 @@ ANCOM is applied to each suitable or specified metadata column for 6 taxonomic l * Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.csv`. * Documentation for interpretation of results in HTML format: `results_description.html`. -## More help - -QIIME2 is currently **under heavy development** and often updated, this version of ampliseq uses QIIME2 2019.10. QIIME2 has excellent support in its [forum](https://docs.qiime2.org/2019.10/).
- ## Citations -Besides citing the [pipeline](https://doi.org/10.5281/zenodo.3568091) and its [publication](https://doi.org/10.3389/fmicb.2020.550420), all tools that were used inside the pipeline have to be cited in a publication properly: - -* FastQC, "Andrews, Simon. "FastQC: a quality control tool for high throughput sequence data." (2010)." -* Cutadapt "Martin, Marcel. "Cutadapt removes adapter sequences from high-throughput sequencing reads." EMBnet. journal 17.1 (2011): pp-10." -* MultiQC, "Ewels, Philip, et al. "MultiQC: summarize analysis results for multiple tools and samples in a single report." Bioinformatics 32.19 (2016): 3047-3048." -* QIIME2, "Bolyen, Evan, et al. "Reproducible, interactive, scalable and extensible microbiome data science using QIIME 2." Nature Biotechnology 37 (2019): 852–857." -* DADA2, "Callahan, Benjamin J., et al. "DADA2: high-resolution sample inference from Illumina amplicon data." Nature methods 13.7 (2016): 581." -* Matplotlib, "Hunter, John D. "Matplotlib: A 2D graphics environment." Computing in science & engineering 9.3 (2007): 90-95." -* Feature-classifier, "Bokulich, Kaehler, et al. "Optimizing taxonomic classification of marker-gene amplicon sequences with QIIME 2's q2-feature-classifier plugin." Microbiome 6 (2018): 90. -* SILVA database, "Quast, Pruesse, et al. 2013. 'The SILVA ribosomal RNA gene database project: improved data processing and web-based tools', Nucleic Acids Research, 41: D590-D96." -* Mafft, "Katoh, Kazutaka and Standley, Daron M. "MAFFT multiple sequence alignment software version 7: improvements in performance and usability. Molecular biology and evolution 4 (2013): 772-780" -* ANCOM, "Mandal, Siddhartha et al. “Analysis of composition of microbiomes: a novel method for studying microbial composition” Microbial ecology in health and disease vol. 26 27663. 29 May. 2015, doi:10.3402/mehd.v26.27663" +An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. diff --git a/docs/usage.md b/docs/usage.md index 3cce7a09..6aeb1b19 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -30,7 +30,7 @@ results # Finished results (configurable, see below) # Other nextflow hidden files, eg. history of pipeline runs and old logs. ``` -See the [nf-core/ampliseq website documentation](https://nf-co.re/ampliseq/usage#usage) for more information about pipeline specific parameters. +See the [nf-core/ampliseq website documentation](https://nf-co.re/ampliseq/parameters) for more information about pipeline specific parameters. ### Updating the pipeline @@ -56,7 +56,7 @@ This version number will be logged in reports when you run the pipeline, so that Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. -Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Conda) - see below. +Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Conda) - see below. > We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. 
@@ -76,11 +76,17 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof * `podman` * A generic configuration profile to be used with [Podman](https://podman.io/) * Pulls software from Docker Hub: [`nfcore/ampliseq`](https://hub.docker.com/r/nfcore/ampliseq/) +* `shifter` + * A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) + * Pulls software from Docker Hub: [`nfcore/ampliseq`](https://hub.docker.com/r/nfcore/ampliseq/) +* `charliecloud` + * A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) + * Pulls software from Docker Hub: [`nfcore/ampliseq`](https://hub.docker.com/r/nfcore/ampliseq/) * `conda` - * Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity or Podman. + * Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter or Charliecloud. * A generic configuration profile to be used with [Conda](https://conda.io/docs/) * Pulls most software from [Bioconda](https://bioconda.github.io/) -* `test`, `test_multi`, `test_manifest`, `test_full`, `test_pacbio_its`, `test_doubleprimers` +* `test`, `test_multi`, `test_full`, `test_pacbio_its`, `test_iontorrent`, `test_doubleprimers` * Profiles with a complete configuration for automated testing * Includes links to test data so needs no other parameters @@ -108,6 +114,8 @@ process { } ``` +To find the exact name of a process you wish to modify the compute resources, check the live-status of a nextflow run displayed on your terminal or check the nextflow error for a line like so: `Error executing process > 'bwa'`. In this case the name to specify in the custom config file is `bwa`. + See the main [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for more information. If you are likely to be running `nf-core` pipelines regularly it may be a good idea to request that your custom config file is uploaded to the `nf-core/configs` git repository. Before you do this please can you test that the config file works with your pipeline of choice using the `-c` parameter (see definition above). You can then create a pull request to the `nf-core/configs` repository with the addition of your config file, associated documentation file (see examples in [`nf-core/configs/docs`](https://github.com/nf-core/configs/tree/master/docs)), and amending [`nfcore_custom.config`](https://github.com/nf-core/configs/blob/master/nfcore_custom.config) to include your custom profile. 
diff --git a/environment.yml b/environment.yml deleted file mode 100644 index ba1f0867..00000000 --- a/environment.yml +++ /dev/null @@ -1,356 +0,0 @@ -# You can use this file to create a conda environment for this pipeline: -# conda env create -f environment.yml -name: nf-core-ampliseq-1.2.0 -channels: -- qiime2 -- qiime2/label/r2019.10 -- conda-forge -- bioconda -- defaults -dependencies: -# nf-core -- conda-forge::markdown=3.1.1 -- conda-forge::pymdown-extensions=6.0 -# Pipeline -- fastqc=0.11.8 -- font-ttf-dejavu-sans-mono=2.37 #for fastqc -- multiqc=1.9 -- unzip=6.0 #required for silva unzippping -- r-rmarkdown=1.18 -- r-optparse=1.6.4 #for pacbio R scripts -- ncurses=6.1 -- biopython=1.77 #for database conversion (UNITE) -# QIIME2 2019/10 -- _libgcc_mutex=0.1 -- _r-mutex=1.0.1 -- alsa-lib=1.1.5 -- arb-bio-tools=6.0.6 -- asn1crypto=1.2.0 -- atomicwrites=1.3.0 -- attrs=19.3.0 -- backcall=0.1.0 -- bibtexparser=1.1.0 -- binutils_impl_linux-64=2.31.1 -- binutils_linux-64=2.31.1 -- bioconductor-biobase=2.42.0 -- bioconductor-biocgenerics=0.28.0 -- bioconductor-biocparallel=1.16.6 -- bioconductor-biostrings=2.50.2 -- bioconductor-dada2=1.10.0 -- bioconductor-delayedarray=0.8.0 -- bioconductor-genomeinfodb=1.18.1 -- bioconductor-genomeinfodbdata=1.2.1 -- bioconductor-genomicalignments=1.18.1 -- bioconductor-genomicranges=1.34.0 -- bioconductor-iranges=2.16.0 -- bioconductor-rsamtools=1.34.0 -- bioconductor-s4vectors=0.20.1 -- bioconductor-shortread=1.40.0 -- bioconductor-summarizedexperiment=1.12.0 -- bioconductor-xvector=0.22.0 -- bioconductor-zlibbioc=1.28.0 -- conda-forge::biom-format=2.1.7 -- blas=2.14 -- blast=2.9.0 -- bleach=3.1.0 -- bokeh=1.3.4 -- boost=1.68.0 -- boost-cpp=1.68.0 -- bwidget=1.9.13 -- bzip2=1.0.8 -- ca-certificates=2019.9.11 -- cachecontrol=0.12.5 -- cairo=1.16.0 -- certifi=2019.9.11 -- cffi=1.13.1 -- chardet=3.0.4 -- click=7.0 -- cryptography=2.7 -- curl=7.65.3 -- cutadapt=2.8 -- cycler=0.10.0 -- cython=0.29.13 -- dbus=1.13.6 -- bioconda::deblur=1.1.0 -- decorator=4.4.1 -- defusedxml=0.6.0 -- dendropy=4.4.0 -- dnaio=0.4 -- emperor=1.0.0b20 -- entrypoints=0.3 -- expat=2.2.5 -- fastcluster=1.1.25 -- fasttree=2.1.10 -- fontconfig=2.13.1 -- freetype=2.10.0 -- future=0.18.2 -- gcc_impl_linux-64=7.3.0 -- gcc_linux-64=7.3.0 -- gettext=0.19.8.1 -- gfortran_impl_linux-64=7.3.0 -- gfortran_linux-64=7.3.0 -- giflib=5.1.7 -- glib=2.58.3 -- gmp=6.1.2 -- gneiss=0.4.6 -- gnutls=3.6.5 -- graphite2=1.3.13 -- gsl=2.5 -- gst-plugins-base=1.14.5 -- gstreamer=1.14.5 -- gxx_impl_linux-64=7.3.0 -- gxx_linux-64=7.3.0 -- h5py=2.10.0 -- harfbuzz=2.4.0 -- hdf5=1.10.5 -- hdmedians=0.13 -- hmmer=3.1b2 -- icu=58.2 -- idna=2.8 -- conda-forge::ijson=2.5 -- importlib_metadata=0.23 -- ipykernel=5.1.3 -- ipython=7.9.0 -- ipython_genutils=0.2.0 -- ipywidgets=7.5.1 -- iqtree=1.6.12 -- jedi=0.15.1 -- jinja2=2.10.3 -- joblib=0.14.0 -- jpeg=9c -- jsonschema=3.1.1 -- jupyter_client=5.3.3 -- jupyter_core=4.5.0 -- kiwisolver=1.1.0 -- krb5=1.16.3 -- lcms2=2.9 -- libarbdb=6.0.6 -- libblas=3.8.0 -- libcblas=3.8.0 -- libcurl=7.65.3 -- libedit=3.1.20170329 -- libffi=3.2.1 -- libgcc=7.2.0 -- anaconda::libgcc-ng=9.1.0 -- libgfortran-ng=7.3.0 -- libiconv=1.15 -- liblapack=3.8.0 -- liblapacke=3.8.0 -- libopenblas=0.3.7 -- libpng=1.6.37 -- libsodium=1.0.17 -- libssh2=1.8.2 -- anaconda::libstdcxx-ng=9.1.0 -- libtiff=4.0.10 -- libuuid=2.32.1 -- libxcb=1.13 -- libxml2=2.9.9 -- lockfile=0.12.2 -- lz4-c=1.8.3 -- mafft=7.310 -- make=4.2.1 -- markupsafe=1.1.1 -- matplotlib=3.1.0 -- matplotlib-base=3.1.0 -- mistune=0.8.4 -- 
more-itertools=7.2.0 -- msgpack-python=0.6.2 -- natsort=6.0.0 -- nbconvert=5.6.1 -- nbformat=4.4.0 -- ncurses=6.1 -- nettle=3.4.1 -- networkx=2.4 -- nose=1.3.7 -- notebook=6.0.1 -- numpy=1.17.3 -- olefile=0.46 -- openjdk=11.0.1 -- openssl=1.1.1c -- packaging=19.2 -- pandas=0.25.2 -- pandoc=2.7.3 -- pandocfilters=1.4.2 -- pango=1.40.14 -- parso=0.5.1 -- patsy=0.5.1 -- pcre=8.43 -- perl=5.26.2 -- perl-archive-tar=2.32 -- perl-carp=1.38 -- perl-common-sense=3.74 -- perl-compress-raw-bzip2=2.087 -- perl-compress-raw-zlib=2.087 -- perl-exporter=5.72 -- perl-exporter-tiny=1.002001 -- perl-extutils-makemaker=7.36 -- perl-io-compress=2.087 -- perl-io-zlib=1.10 -- perl-json=4.02 -- perl-json-xs=2.34 -- perl-list-moreutils=0.428 -- perl-list-moreutils-xs=0.428 -- perl-pathtools=3.75 -- perl-scalar-list-utils=1.52 -- perl-types-serialiser=1.0 -- perl-xsloader=0.24 -- pexpect=4.7.0 -- pickleshare=0.7.5 -- pigz=2.3.4 -- pillow=6.2.1 -- pip=19.3.1 -- pixman=0.38.0 -- pluggy=0.12.0 -- prometheus_client=0.7.1 -- prompt_toolkit=2.0.10 -- psutil=5.6.3 -- pthread-stubs=0.4 -- ptyprocess=0.6.0 -- py=1.8.0 -- pycparser=2.19 -- pygments=2.4.2 -- pyopenssl=19.0.0 -- pyparsing=2.4.2 -- pyqt=5.9.2 -- pyrsistent=0.15.5 -- pysocks=1.7.1 -- pytest=5.2.2 -- python=3.6.7 -- python-dateutil=2.8.0 -- pytz=2019.3 -- pyyaml=5.1.2 -- pyzmq=18.1.0 -- q2-alignment=2019.10.0 -- q2-composition=2019.10.0 -- q2-cutadapt=2019.10.0 -- q2-dada2=2019.10.0 -- q2-deblur=2019.10.0 -- q2-demux=2019.10.0 -- q2-diversity=2019.10.0 -- q2-emperor=2019.10.0 -- q2-feature-classifier=2019.10.0 -- q2-feature-table=2019.10.0 -- q2-fragment-insertion=2019.10.0 -- q2-gneiss=2019.10.0 -- q2-longitudinal=2019.10.0 -- q2-metadata=2019.10.0 -- q2-phylogeny=2019.10.0 -- q2-quality-control=2019.10.0 -- q2-quality-filter=2019.10.0 -- q2-sample-classifier=2019.10.0 -- q2-taxa=2019.10.0 -- q2-types=2019.10.0 -- q2-vsearch=2019.10.0 -- q2cli=2019.10.0 -- q2templates=2019.10.0 -- qiime2=2019.10.0 -- qt=5.9.7 -- r-assertthat=0.2.1 -- r-backports=1.1.5 -- r-base=3.5.1 -- r-bh=1.69.0_1 -- r-bitops=1.0_6 -- r-cli=1.1.0 -- r-cluster=2.1.0 -- r-colorspace=1.4_1 -- r-crayon=1.3.4 -- r-data.table=1.12.6 -- r-digest=0.6.22 -- r-ellipsis=0.3.0 -- r-fansi=0.4.0 -- r-formatr=1.7 -- r-futile.logger=1.4.3 -- r-futile.options=1.0.1 -- r-ggplot2=3.2.1 -- r-glue=1.3.1 -- r-gtable=0.3.0 -- r-hwriter=1.3.2 -- r-labeling=0.3 -- r-lambda.r=1.2.4 -- r-lattice=0.20_38 -- r-latticeextra=0.6_28 -- r-lazyeval=0.2.2 -- r-magrittr=1.5 -- r-mass=7.3_51.4 -- r-matrix=1.2_17 -- r-matrixstats=0.55.0 -- r-mgcv=1.8_29 -- r-munsell=0.5.0 -- r-nlme=3.1_141 -- r-permute=0.9_5 -- r-pillar=1.4.2 -- r-pkgconfig=2.0.3 -- r-plyr=1.8.4 -- r-r6=2.4.0 -- r-rcolorbrewer=1.1_2 -- r-rcpp=1.0.2 -- r-rcppparallel=4.4.4 -- r-rcurl=1.95_4.12 -- r-reshape2=1.4.3 -- r-rlang=0.4.1 -- r-scales=1.0.0 -- r-snow=0.4_3 -- r-stringi=1.4.3 -- r-stringr=1.4.0 -- r-tibble=2.1.3 -- r-utf8=1.1.4 -- r-vctrs=0.2.0 -- r-vegan=2.5_6 -- r-viridislite=0.3.0 -- r-withr=2.1.2 -- r-zeallot=0.1.0 -- raxml=8.2.12 -- readline=8.0 -- requests=2.22.0 -- conda-forge::scikit-bio=0.5.5 -- scikit-learn=0.21.2 -- scipy=1.3.1 -- seaborn=0.9.0 -- send2trash=1.5.0 -- sepp=4.3.10 -- setuptools=41.6.0 -- sina=1.6.0 -- sip=4.19.8 -- six=1.12.0 -- sortmerna=2.0 -- sqlite=3.30.1 -- statsmodels=0.10.1 -- tbb=2019.9 -- terminado=0.8.2 -- testpath=0.4.2 -- tk=8.6.9 -- tktable=2.10 -- tornado=6.0.3 -- traitlets=4.3.3 -- conda-forge::tzlocal=2.0.0 -- unifrac=0.10.0 -- urllib3=1.25.6 -- vsearch=2.7.0 -- wcwidth=0.1.7 -- webencodings=0.5.1 -- wheel=0.33.6 
-- widgetsnbextension=3.5.1 -- xopen=0.8.4 -- xorg-fixesproto=5.0 -- xorg-inputproto=2.3.2 -- xorg-kbproto=1.0.7 -- xorg-libice=1.0.10 -- xorg-libsm=1.2.3 -- xorg-libx11=1.6.9 -- xorg-libxau=1.0.9 -- xorg-libxdmcp=1.1.3 -- xorg-libxext=1.3.4 -- xorg-libxfixes=5.0.3 -- xorg-libxi=1.7.10 -- xorg-libxrender=0.9.10 -- xorg-libxtst=1.2.3 -- xorg-recordproto=1.14.2 -- xorg-renderproto=0.11.1 -- xorg-xextproto=7.3.0 -- xorg-xproto=7.0.31 -- xz=5.2.4 -- yaml=0.1.7 -- zeromq=4.3.2 -- zipp=0.6.0 -- zlib=1.2.11 -- zstd=1.4.3 diff --git a/lib/Checks.groovy b/lib/Checks.groovy new file mode 100644 index 00000000..3b23aabc --- /dev/null +++ b/lib/Checks.groovy @@ -0,0 +1,80 @@ +import org.yaml.snakeyaml.Yaml + +/* + * This file holds several functions used to perform standard checks for the nf-core pipeline template. + */ + +class Checks { + + static void check_conda_channels(log) { + Yaml parser = new Yaml() + def channels = [] + try { + def config = parser.load("conda config --show channels".execute().text) + channels = config.channels + } catch(NullPointerException | IOException e) { + log.warn "Could not verify conda channel configuration." + return + } + + // Check that all channels are present + def required_channels = ['conda-forge', 'bioconda', 'defaults'] + def conda_check_failed = !required_channels.every { ch -> ch in channels } + + // Check that they are in the right order + conda_check_failed |= !(channels.indexOf('conda-forge') < channels.indexOf('bioconda')) + conda_check_failed |= !(channels.indexOf('bioconda') < channels.indexOf('defaults')) + + if (conda_check_failed) { + log.warn "=============================================================================\n" + + " There is a problem with your Conda configuration!\n\n" + + " You will need to set-up the conda-forge and bioconda channels correctly.\n" + + " Please refer to https://bioconda.github.io/user/install.html#set-up-channels\n" + + " NB: The order of the channels matters!\n" + + "===================================================================================" + } + } + + static void aws_batch(workflow, params) { + if (workflow.profile.contains('awsbatch')) { + assert (params.awsqueue && params.awsregion) : "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" + // Check outdir paths to be S3 buckets if running on AWSBatch + // related: https://github.com/nextflow-io/nextflow/issues/813 + assert params.outdir.startsWith('s3:') : "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!" + // Prevent trace files to be stored on S3 since S3 does not support rolling files. + assert !params.tracedir.startsWith('s3:') : "Specify a local tracedir or run without trace! S3 cannot be used for tracefiles." 
+ } + } + + static void hostname(workflow, params, log) { + Map colors = Headers.log_colours(params.monochrome_logs) + if (params.hostnames) { + def hostname = "hostname".execute().text.trim() + params.hostnames.each { prof, hnames -> + hnames.each { hname -> + if (hostname.contains(hname) && !workflow.profile.contains(prof)) { + log.info "=${colors.yellow}====================================================${colors.reset}=\n" + + "${colors.yellow}WARN: You are running with `-profile $workflow.profile`\n" + + " but your machine hostname is ${colors.white}'$hostname'${colors.reset}.\n" + + " ${colors.yellow_bold}Please use `-profile $prof${colors.reset}`\n" + + "=${colors.yellow}====================================================${colors.reset}=" + } + } + } + } + } + + // Citation string + private static String citation(workflow) { + return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + + "* The pipeline\n" + + " https://doi.org/10.5281/zenodo.1493841\n\n" + + "* The pipeline publication\n" + + " https://doi.org/10.3389/fmicb.2020.550420\n\n" + + "* The nf-core framework\n" + + " https://dx.doi.org/10.1038/s41587-020-0439-x\n" + + " https://rdcu.be/b1GjZ\n\n" + + "* Software dependencies\n" + + " https://github.com/${workflow.manifest.name}/blob/master/CITATIONS.md" + } +} \ No newline at end of file diff --git a/lib/Completion.groovy b/lib/Completion.groovy new file mode 100644 index 00000000..6b188a7c --- /dev/null +++ b/lib/Completion.groovy @@ -0,0 +1,132 @@ +/* + * Functions to be run on completion of pipeline + */ + +class Completion { + static void email(workflow, params, summary_params, projectDir, log, multiqc_report=[], fail_percent_mapped=[:]) { + + // Set up the e-mail variables + def subject = "[$workflow.manifest.name] Successful: $workflow.runName" + if (fail_percent_mapped.size() > 0) { + subject = "[$workflow.manifest.name] Partially successful (${fail_percent_mapped.size()} skipped): $workflow.runName" + } + if (!workflow.success) { + subject = "[$workflow.manifest.name] FAILED: $workflow.runName" + } + + def summary = [:] + for (group in summary_params.keySet()) { + summary << summary_params[group] + } + + def misc_fields = [:] + misc_fields['Date Started'] = workflow.start + misc_fields['Date Completed'] = workflow.complete + misc_fields['Pipeline script file path'] = workflow.scriptFile + misc_fields['Pipeline script hash ID'] = workflow.scriptId + if (workflow.repository) misc_fields['Pipeline repository Git URL'] = workflow.repository + if (workflow.commitId) misc_fields['Pipeline repository Git Commit'] = workflow.commitId + if (workflow.revision) misc_fields['Pipeline Git branch/tag'] = workflow.revision + misc_fields['Nextflow Version'] = workflow.nextflow.version + misc_fields['Nextflow Build'] = workflow.nextflow.build + misc_fields['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp + + def email_fields = [:] + email_fields['version'] = workflow.manifest.version + email_fields['runName'] = workflow.runName + email_fields['success'] = workflow.success + email_fields['dateComplete'] = workflow.complete + email_fields['duration'] = workflow.duration + email_fields['exitStatus'] = workflow.exitStatus + email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') + email_fields['errorReport'] = (workflow.errorReport ?: 'None') + email_fields['commandLine'] = workflow.commandLine + email_fields['projectDir'] = workflow.projectDir + email_fields['summary'] = summary << misc_fields + email_fields['fail_percent_mapped'] = 
fail_percent_mapped.keySet() + + // On success try attach the multiqc report + def mqc_report = null + try { + if (workflow.success && !params.skip_multiqc) { + mqc_report = multiqc_report.getVal() + if (mqc_report.getClass() == ArrayList && mqc_report.size() >= 1) { + if (mqc_report.size() > 1) { + log.warn "[$workflow.manifest.name] Found multiple reports from process 'MULTIQC', will use only one" + } + mqc_report = mqc_report[0] + } + } + } catch (all) { + log.warn "[$workflow.manifest.name] Could not attach MultiQC report to summary email" + } + + // Check if we are only sending emails on failure + def email_address = params.email + if (!params.email && params.email_on_fail && !workflow.success) { + email_address = params.email_on_fail + } + + // Render the TXT template + def engine = new groovy.text.GStringTemplateEngine() + def tf = new File("$projectDir/assets/email_template.txt") + def txt_template = engine.createTemplate(tf).make(email_fields) + def email_txt = txt_template.toString() + + // Render the HTML template + def hf = new File("$projectDir/assets/email_template.html") + def html_template = engine.createTemplate(hf).make(email_fields) + def email_html = html_template.toString() + + // Render the sendmail template + def max_multiqc_email_size = params.max_multiqc_email_size as nextflow.util.MemoryUnit + def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: max_multiqc_email_size.toBytes()] + def sf = new File("$projectDir/assets/sendmail_template.txt") + def sendmail_template = engine.createTemplate(sf).make(smail_fields) + def sendmail_html = sendmail_template.toString() + + // Send the HTML e-mail + Map colors = Headers.log_colours(params.monochrome_logs) + if (email_address) { + try { + if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } + // Try to send HTML e-mail using sendmail + [ 'sendmail', '-t' ].execute() << sendmail_html + log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (sendmail)-" + } catch (all) { + // Catch failures and try with plaintext + def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] + if ( mqc_report.size() <= max_multiqc_email_size.toBytes() ) { + mail_cmd += [ '-A', mqc_report ] + } + mail_cmd.execute() << email_html + log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (mail)-" + } + } + + // Write summary e-mail HTML to a file + def output_d = new File("${params.outdir}/pipeline_info/") + if (!output_d.exists()) { + output_d.mkdirs() + } + def output_hf = new File(output_d, "pipeline_report.html") + output_hf.withWriter { w -> w << email_html } + def output_tf = new File(output_d, "pipeline_report.txt") + output_tf.withWriter { w -> w << email_txt } + } + + static void summary(workflow, params, log) { + Map colors = Headers.log_colours(params.monochrome_logs) + + if (workflow.success) { + if (workflow.stats.ignoredCount == 0) { + log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Pipeline completed successfully${colors.reset}-" + } else { + log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed successfully, but with errored process(es) ${colors.reset}-" + } + } else { + Checks.hostname(workflow, params, log) + log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed with 
errors${colors.reset}-" + } + } +} diff --git a/lib/Headers.groovy b/lib/Headers.groovy new file mode 100644 index 00000000..15d1d388 --- /dev/null +++ b/lib/Headers.groovy @@ -0,0 +1,43 @@ +/* + * This file holds several functions used to render the nf-core ANSI header. + */ + +class Headers { + + private static Map log_colours(Boolean monochrome_logs) { + Map colorcodes = [:] + colorcodes['reset'] = monochrome_logs ? '' : "\033[0m" + colorcodes['dim'] = monochrome_logs ? '' : "\033[2m" + colorcodes['black'] = monochrome_logs ? '' : "\033[0;30m" + colorcodes['green'] = monochrome_logs ? '' : "\033[0;32m" + colorcodes['yellow'] = monochrome_logs ? '' : "\033[0;33m" + colorcodes['yellow_bold'] = monochrome_logs ? '' : "\033[1;93m" + colorcodes['blue'] = monochrome_logs ? '' : "\033[0;34m" + colorcodes['purple'] = monochrome_logs ? '' : "\033[0;35m" + colorcodes['cyan'] = monochrome_logs ? '' : "\033[0;36m" + colorcodes['white'] = monochrome_logs ? '' : "\033[0;37m" + colorcodes['red'] = monochrome_logs ? '' : "\033[1;91m" + return colorcodes + } + + static String dashed_line(monochrome_logs) { + Map colors = log_colours(monochrome_logs) + return "-${colors.dim}----------------------------------------------------${colors.reset}-" + } + + static String nf_core(workflow, monochrome_logs) { + Map colors = log_colours(monochrome_logs) + String.format( + """\n + ${dashed_line(monochrome_logs)} + ${colors.green},--.${colors.black}/${colors.green},-.${colors.reset} + ${colors.blue} ___ __ __ __ ___ ${colors.green}/,-._.--~\'${colors.reset} + ${colors.blue} |\\ | |__ __ / ` / \\ |__) |__ ${colors.yellow}} {${colors.reset} + ${colors.blue} | \\| | \\__, \\__/ | \\ |___ ${colors.green}\\`-._,-`-,${colors.reset} + ${colors.green}`._,._,\'${colors.reset} + ${colors.purple} ${workflow.manifest.name} v${workflow.manifest.version}${colors.reset} + ${dashed_line(monochrome_logs)} + """.stripIndent() + ) + } +} diff --git a/lib/MultiqcSchema.groovy b/lib/MultiqcSchema.groovy new file mode 100644 index 00000000..6a32c5cd --- /dev/null +++ b/lib/MultiqcSchema.groovy @@ -0,0 +1,31 @@ +/* + * This file holds several functions used to perform JSON parameter validation, help and summary rendering for the nf-core pipeline template. + */ + +import groovy.json.JsonSlurper + +class MultiqcSchema { + static String params_summary_multiqc(workflow, summary) { + String summary_section = '' + for (group in summary.keySet()) { + def group_params = summary.get(group) // This gets the parameters of that particular group + if (group_params) { + summary_section += "

<p style=\"font-size:110%\"><b>$group</b></p>\n" + summary_section += "<dl class=\"dl-horizontal\">\n" + for (param in group_params.keySet()) { + summary_section += "<dt>$param</dt><dd><samp>${group_params.get(param) ?: 'N/A'}</samp></dd>\n" + } + summary_section += "</dl>
\n" + } + } + + String yaml_file_text = "id: '${workflow.manifest.name.replace('/','-')}-summary'\n" + yaml_file_text += "description: ' - this information is collected when the pipeline is started.'\n" + yaml_file_text += "section_name: '${workflow.manifest.name} Workflow Summary'\n" + yaml_file_text += "section_href: 'https://github.com/${workflow.manifest.name}'\n" + yaml_file_text += "plot_type: 'html'\n" + yaml_file_text += "data: |\n" + yaml_file_text += "${summary_section}" + return yaml_file_text + } +} \ No newline at end of file diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy new file mode 100644 index 00000000..52ee7304 --- /dev/null +++ b/lib/NfcoreSchema.groovy @@ -0,0 +1,573 @@ +/* + * This file holds several functions used to perform JSON parameter validation, help and summary rendering for the nf-core pipeline template. + */ + +import org.everit.json.schema.Schema +import org.everit.json.schema.loader.SchemaLoader +import org.everit.json.schema.ValidationException +import org.json.JSONObject +import org.json.JSONTokener +import org.json.JSONArray +import groovy.json.JsonSlurper +import groovy.json.JsonBuilder + +class NfcoreSchema { + + /* + * Function to loop over all parameters defined in schema and check + * whether the given paremeters adhere to the specificiations + */ + /* groovylint-disable-next-line UnusedPrivateMethodParameter */ + private static void validateParameters(params, jsonSchema, log) { + def has_error = false + //=====================================================================// + // Check for nextflow core params and unexpected params + def json = new File(jsonSchema).text + def Map schemaParams = (Map) new JsonSlurper().parseText(json).get('definitions') + def nf_params = [ + // Options for base `nextflow` command + 'bg', + 'c', + 'C', + 'config', + 'd', + 'D', + 'dockerize', + 'h', + 'log', + 'q', + 'quiet', + 'syslog', + 'v', + 'version', + + // Options for `nextflow run` command + 'ansi', + 'ansi-log', + 'bg', + 'bucket-dir', + 'c', + 'cache', + 'config', + 'dsl2', + 'dump-channels', + 'dump-hashes', + 'E', + 'entry', + 'latest', + 'lib', + 'main-script', + 'N', + 'name', + 'offline', + 'params-file', + 'pi', + 'plugins', + 'poll-interval', + 'pool-size', + 'profile', + 'ps', + 'qs', + 'queue-size', + 'r', + 'resume', + 'revision', + 'stdin', + 'stub', + 'stub-run', + 'test', + 'w', + 'with-charliecloud', + 'with-conda', + 'with-dag', + 'with-docker', + 'with-mpi', + 'with-notification', + 'with-podman', + 'with-report', + 'with-singularity', + 'with-timeline', + 'with-tower', + 'with-trace', + 'with-weblog', + 'without-docker', + 'without-podman', + 'work-dir' + ] + def unexpectedParams = [] + + // Collect expected parameters from the schema + def expectedParams = [] + for (group in schemaParams) { + for (p in group.value['properties']) { + expectedParams.push(p.key) + } + } + + for (specifiedParam in params.keySet()) { + // nextflow params + if (nf_params.contains(specifiedParam)) { + log.error "ERROR: You used a core Nextflow option with two hyphens: '--${specifiedParam}'. 
Please resubmit with '-${specifiedParam}'" + has_error = true + } + // unexpected params + def params_ignore = params.schema_ignore_params.split(',') + 'schema_ignore_params' + def expectedParamsLowerCase = expectedParams.collect{ it.replace("-", "").toLowerCase() } + def specifiedParamLowerCase = specifiedParam.replace("-", "").toLowerCase() + if (!expectedParams.contains(specifiedParam) && !params_ignore.contains(specifiedParam) && !expectedParamsLowerCase.contains(specifiedParamLowerCase)) { + // Temporarily remove camelCase/camel-case params #1035 + def unexpectedParamsLowerCase = unexpectedParams.collect{ it.replace("-", "").toLowerCase()} + if (!unexpectedParamsLowerCase.contains(specifiedParamLowerCase)){ + unexpectedParams.push(specifiedParam) + } + } + } + + //=====================================================================// + // Validate parameters against the schema + InputStream inputStream = new File(jsonSchema).newInputStream() + JSONObject rawSchema = new JSONObject(new JSONTokener(inputStream)) + + // Remove anything that's in params.schema_ignore_params + rawSchema = removeIgnoredParams(rawSchema, params) + + Schema schema = SchemaLoader.load(rawSchema) + + // Clean the parameters + def cleanedParams = cleanParameters(params) + + // Convert to JSONObject + def jsonParams = new JsonBuilder(cleanedParams) + JSONObject paramsJSON = new JSONObject(jsonParams.toString()) + + // Validate + try { + schema.validate(paramsJSON) + } catch (ValidationException e) { + println '' + log.error 'ERROR: Validation of pipeline parameters failed!' + JSONObject exceptionJSON = e.toJSON() + printExceptions(exceptionJSON, paramsJSON, log) + println '' + has_error = true + } + + // Check for unexpected parameters + if (unexpectedParams.size() > 0) { + Map colors = log_colours(params.monochrome_logs) + println '' + def warn_msg = 'Found unexpected parameters:' + for (unexpectedParam in unexpectedParams) { + warn_msg = warn_msg + "\n* --${unexpectedParam}: ${params[unexpectedParam].toString()}" + } + log.warn warn_msg + log.info "- ${colors.dim}Ignore this warning: params.schema_ignore_params = \"${unexpectedParams.join(',')}\" ${colors.reset}" + println '' + } + + if (has_error) { + System.exit(1) + } + } + + // Loop over nested exceptions and print the causingException + private static void printExceptions(exJSON, paramsJSON, log) { + def causingExceptions = exJSON['causingExceptions'] + if (causingExceptions.length() == 0) { + def m = exJSON['message'] =~ /required key \[([^\]]+)\] not found/ + // Missing required param + if (m.matches()) { + log.error "* Missing required parameter: --${m[0][1]}" + } + // Other base-level error + else if (exJSON['pointerToViolation'] == '#') { + log.error "* ${exJSON['message']}" + } + // Error with specific param + else { + def param = exJSON['pointerToViolation'] - ~/^#\// + def param_val = paramsJSON[param].toString() + log.error "* --${param}: ${exJSON['message']} (${param_val})" + } + } + for (ex in causingExceptions) { + printExceptions(ex, paramsJSON, log) + } + } + + // Remove an element from a JSONArray + private static JSONArray removeElement(jsonArray, element){ + def list = [] + int len = jsonArray.length() + for (int i=0;i<len;i++){ + list.add(jsonArray.get(i).toString()) + } + list.remove(element) + JSONArray jsArray = new JSONArray(list) + return jsArray + } + + private static JSONObject removeIgnoredParams(rawSchema, params){ + // Remove anything that's in params.schema_ignore_params + params.schema_ignore_params.split(',').each{ ignore_param -> + if(rawSchema.keySet().contains('definitions')){ + rawSchema.definitions.each { definition -> + for (key in definition.keySet()){ + if (definition[key].get("properties").keySet().contains(ignore_param)){ + // Remove the param to ignore + definition[key].get("properties").remove(ignore_param) + // If the param was required,
change this + if (definition[key].has("required")) { + def cleaned_required = removeElement(definition[key].required, ignore_param) + definition[key].put("required", cleaned_required) + } + } + } + } + } + if(rawSchema.keySet().contains('properties') && rawSchema.get('properties').keySet().contains(ignore_param)) { + rawSchema.get("properties").remove(ignore_param) + } + if(rawSchema.keySet().contains('required') && rawSchema.required.contains(ignore_param)) { + def cleaned_required = removeElement(rawSchema.required, ignore_param) + rawSchema.put("required", cleaned_required) + } + } + return rawSchema + } + + private static Map cleanParameters(params) { + def new_params = params.getClass().newInstance(params) + for (p in params) { + // remove anything evaluating to false + if (!p['value']) { + new_params.remove(p.key) + } + // Cast MemoryUnit to String + if (p['value'].getClass() == nextflow.util.MemoryUnit) { + new_params.replace(p.key, p['value'].toString()) + } + // Cast Duration to String + if (p['value'].getClass() == nextflow.util.Duration) { + new_params.replace(p.key, p['value'].toString().replaceFirst(/d(?!\S)/, "day")) + } + // Cast LinkedHashMap to String + if (p['value'].getClass() == LinkedHashMap) { + new_params.replace(p.key, p['value'].toString()) + } + } + return new_params + } + + /* + * This method tries to read a JSON params file + */ + private static LinkedHashMap params_load(String json_schema) { + def params_map = new LinkedHashMap() + try { + params_map = params_read(json_schema) + } catch (Exception e) { + println "Could not read parameters settings from JSON. $e" + params_map = new LinkedHashMap() + } + return params_map + } + + private static Map log_colours(Boolean monochrome_logs) { + Map colorcodes = [:] + + // Reset / Meta + colorcodes['reset'] = monochrome_logs ? '' : "\033[0m" + colorcodes['bold'] = monochrome_logs ? '' : "\033[1m" + colorcodes['dim'] = monochrome_logs ? '' : "\033[2m" + colorcodes['underlined'] = monochrome_logs ? '' : "\033[4m" + colorcodes['blink'] = monochrome_logs ? '' : "\033[5m" + colorcodes['reverse'] = monochrome_logs ? '' : "\033[7m" + colorcodes['hidden'] = monochrome_logs ? '' : "\033[8m" + + // Regular Colors + colorcodes['black'] = monochrome_logs ? '' : "\033[0;30m" + colorcodes['red'] = monochrome_logs ? '' : "\033[0;31m" + colorcodes['green'] = monochrome_logs ? '' : "\033[0;32m" + colorcodes['yellow'] = monochrome_logs ? '' : "\033[0;33m" + colorcodes['blue'] = monochrome_logs ? '' : "\033[0;34m" + colorcodes['purple'] = monochrome_logs ? '' : "\033[0;35m" + colorcodes['cyan'] = monochrome_logs ? '' : "\033[0;36m" + colorcodes['white'] = monochrome_logs ? '' : "\033[0;37m" + + // Bold + colorcodes['bblack'] = monochrome_logs ? '' : "\033[1;30m" + colorcodes['bred'] = monochrome_logs ? '' : "\033[1;31m" + colorcodes['bgreen'] = monochrome_logs ? '' : "\033[1;32m" + colorcodes['byellow'] = monochrome_logs ? '' : "\033[1;33m" + colorcodes['bblue'] = monochrome_logs ? '' : "\033[1;34m" + colorcodes['bpurple'] = monochrome_logs ? '' : "\033[1;35m" + colorcodes['bcyan'] = monochrome_logs ? '' : "\033[1;36m" + colorcodes['bwhite'] = monochrome_logs ? '' : "\033[1;37m" + + // Underline + colorcodes['ublack'] = monochrome_logs ? '' : "\033[4;30m" + colorcodes['ured'] = monochrome_logs ? '' : "\033[4;31m" + colorcodes['ugreen'] = monochrome_logs ? '' : "\033[4;32m" + colorcodes['uyellow'] = monochrome_logs ? '' : "\033[4;33m" + colorcodes['ublue'] = monochrome_logs ? '' : "\033[4;34m" + colorcodes['upurple'] = monochrome_logs ? 
'' : "\033[4;35m" + colorcodes['ucyan'] = monochrome_logs ? '' : "\033[4;36m" + colorcodes['uwhite'] = monochrome_logs ? '' : "\033[4;37m" + + // High Intensity + colorcodes['iblack'] = monochrome_logs ? '' : "\033[0;90m" + colorcodes['ired'] = monochrome_logs ? '' : "\033[0;91m" + colorcodes['igreen'] = monochrome_logs ? '' : "\033[0;92m" + colorcodes['iyellow'] = monochrome_logs ? '' : "\033[0;93m" + colorcodes['iblue'] = monochrome_logs ? '' : "\033[0;94m" + colorcodes['ipurple'] = monochrome_logs ? '' : "\033[0;95m" + colorcodes['icyan'] = monochrome_logs ? '' : "\033[0;96m" + colorcodes['iwhite'] = monochrome_logs ? '' : "\033[0;97m" + + // Bold High Intensity + colorcodes['biblack'] = monochrome_logs ? '' : "\033[1;90m" + colorcodes['bired'] = monochrome_logs ? '' : "\033[1;91m" + colorcodes['bigreen'] = monochrome_logs ? '' : "\033[1;92m" + colorcodes['biyellow'] = monochrome_logs ? '' : "\033[1;93m" + colorcodes['biblue'] = monochrome_logs ? '' : "\033[1;94m" + colorcodes['bipurple'] = monochrome_logs ? '' : "\033[1;95m" + colorcodes['bicyan'] = monochrome_logs ? '' : "\033[1;96m" + colorcodes['biwhite'] = monochrome_logs ? '' : "\033[1;97m" + + return colorcodes + } + + static String dashed_line(monochrome_logs) { + Map colors = log_colours(monochrome_logs) + return "-${colors.dim}----------------------------------------------------${colors.reset}-" + } + + /* + Method to actually read in JSON file using Groovy. + Group (as Key), values are all parameters + - Parameter1 as Key, Description as Value + - Parameter2 as Key, Description as Value + .... + Group + - + */ + private static LinkedHashMap params_read(String json_schema) throws Exception { + def json = new File(json_schema).text + def Map schema_definitions = (Map) new JsonSlurper().parseText(json).get('definitions') + def Map schema_properties = (Map) new JsonSlurper().parseText(json).get('properties') + /* Tree looks like this in nf-core schema + * definitions <- this is what the first get('definitions') gets us + group 1 + title + description + properties + parameter 1 + type + description + parameter 2 + type + description + group 2 + title + description + properties + parameter 1 + type + description + * properties <- parameters can also be ungrouped, outside of definitions + parameter 1 + type + description + */ + + // Grouped params + def params_map = new LinkedHashMap() + schema_definitions.each { key, val -> + def Map group = schema_definitions."$key".properties // Gets the property object of the group + def title = schema_definitions."$key".title + def sub_params = new LinkedHashMap() + group.each { innerkey, value -> + sub_params.put(innerkey, value) + } + params_map.put(title, sub_params) + } + + // Ungrouped params + def ungrouped_params = new LinkedHashMap() + schema_properties.each { innerkey, value -> + ungrouped_params.put(innerkey, value) + } + params_map.put("Other parameters", ungrouped_params) + + return params_map + } + + /* + * Get maximum number of characters across all parameter names + */ + private static Integer params_max_chars(params_map) { + Integer max_chars = 0 + for (group in params_map.keySet()) { + def group_params = params_map.get(group) // This gets the parameters of that particular group + for (param in group_params.keySet()) { + if (param.size() > max_chars) { + max_chars = param.size() + } + } + } + return max_chars + } + + /* + * Beautify parameters for --help + */ + private static String params_help(workflow, params, json_schema, command) { + Map colors = 
log_colours(params.monochrome_logs) + Integer num_hidden = 0 + String output = '' + output += 'Typical pipeline command:\n\n' + output += " ${colors.cyan}${command}${colors.reset}\n\n" + Map params_map = params_load(json_schema) + Integer max_chars = params_max_chars(params_map) + 1 + Integer desc_indent = max_chars + 14 + Integer dec_linewidth = 160 - desc_indent + for (group in params_map.keySet()) { + Integer num_params = 0 + String group_output = colors.underlined + colors.bold + group + colors.reset + '\n' + def group_params = params_map.get(group) // This gets the parameters of that particular group + for (param in group_params.keySet()) { + if (group_params.get(param).hidden && !params.show_hidden_params) { + num_hidden += 1 + continue; + } + def type = '[' + group_params.get(param).type + ']' + def description = group_params.get(param).description + def defaultValue = group_params.get(param).default ? " [default: " + group_params.get(param).default.toString() + "]" : '' + def description_default = description + colors.dim + defaultValue + colors.reset + // Wrap long description texts + // Loosely based on https://dzone.com/articles/groovy-plain-text-word-wrap + if (description_default.length() > dec_linewidth){ + List olines = [] + String oline = "" // " " * indent + description_default.split(" ").each() { wrd -> + if ((oline.size() + wrd.size()) <= dec_linewidth) { + oline += wrd + " " + } else { + olines += oline + oline = wrd + " " + } + } + olines += oline + description_default = olines.join("\n" + " " * desc_indent) + } + group_output += " --" + param.padRight(max_chars) + colors.dim + type.padRight(10) + colors.reset + description_default + '\n' + num_params += 1 + } + group_output += '\n' + if (num_params > 0){ + output += group_output + } + } + output += dashed_line(params.monochrome_logs) + if (num_hidden > 0){ + output += colors.dim + "\n Hiding $num_hidden params, use --show_hidden_params to show.\n" + colors.reset + output += dashed_line(params.monochrome_logs) + } + return output + } + + /* + * Groovy Map summarising parameters/workflow options used by the pipeline + */ + private static LinkedHashMap params_summary_map(workflow, params, json_schema) { + // Get a selection of core Nextflow workflow options + def Map workflow_summary = [:] + if (workflow.revision) { + workflow_summary['revision'] = workflow.revision + } + workflow_summary['runName'] = workflow.runName + if (workflow.containerEngine) { + workflow_summary['containerEngine'] = workflow.containerEngine + } + if (workflow.container) { + workflow_summary['container'] = workflow.container + } + workflow_summary['launchDir'] = workflow.launchDir + workflow_summary['workDir'] = workflow.workDir + workflow_summary['projectDir'] = workflow.projectDir + workflow_summary['userName'] = workflow.userName + workflow_summary['profile'] = workflow.profile + workflow_summary['configFiles'] = workflow.configFiles.join(', ') + + // Get pipeline parameters defined in JSON Schema + def Map params_summary = [:] + def blacklist = ['hostnames'] + def params_map = params_load(json_schema) + for (group in params_map.keySet()) { + def sub_params = new LinkedHashMap() + def group_params = params_map.get(group) // This gets the parameters of that particular group + for (param in group_params.keySet()) { + if (params.containsKey(param) && !blacklist.contains(param)) { + def params_value = params.get(param) + def schema_value = group_params.get(param).default + def param_type = group_params.get(param).type + if (schema_value != null) { 
+ if (param_type == 'string') { + if (schema_value.contains('$projectDir') || schema_value.contains('${projectDir}')) { + def sub_string = schema_value.replace('\$projectDir', '') + sub_string = sub_string.replace('\${projectDir}', '') + if (params_value.contains(sub_string)) { + schema_value = params_value + } + } + if (schema_value.contains('$params.outdir') || schema_value.contains('${params.outdir}')) { + def sub_string = schema_value.replace('\$params.outdir', '') + sub_string = sub_string.replace('\${params.outdir}', '') + if ("${params.outdir}${sub_string}" == params_value) { + schema_value = params_value + } + } + } + } + + // We have a default in the schema, and this isn't it + if (schema_value != null && params_value != schema_value) { + sub_params.put(param, params_value) + } + // No default in the schema, and this isn't empty + else if (schema_value == null && params_value != "" && params_value != null && params_value != false) { + sub_params.put(param, params_value) + } + } + } + params_summary.put(group, sub_params) + } + return [ 'Core Nextflow options' : workflow_summary ] << params_summary + } + + /* + * Beautify parameters for summary and return as string + */ + private static String params_summary_log(workflow, params, json_schema) { + Map colors = log_colours(params.monochrome_logs) + String output = '' + def params_map = params_summary_map(workflow, params, json_schema) + def max_chars = params_max_chars(params_map) + for (group in params_map.keySet()) { + def group_params = params_map.get(group) // This gets the parameters of that particular group + if (group_params) { + output += colors.bold + group + colors.reset + '\n' + for (param in group_params.keySet()) { + output += " " + colors.blue + param.padRight(max_chars) + ": " + colors.green + group_params.get(param) + colors.reset + '\n' + } + output += '\n' + } + } + output += dashed_line(params.monochrome_logs) + output += colors.dim + "\n Only displaying parameters that differ from defaults.\n" + colors.reset + output += dashed_line(params.monochrome_logs) + return output + } + +} diff --git a/lib/nfcore_external_java_deps.jar b/lib/nfcore_external_java_deps.jar new file mode 100644 index 00000000..805c8bb5 Binary files /dev/null and b/lib/nfcore_external_java_deps.jar differ diff --git a/main.nf b/main.nf index 251caa55..960d6873 100644 --- a/main.nf +++ b/main.nf @@ -9,2224 +9,71 @@ ---------------------------------------------------------------------------------------- */ -def helpMessage() { - log.info nfcoreHeader() - log.info""" - Usage: +nextflow.enable.dsl = 2 - The minimal command for running the pipeline is as follows: - nextflow run nf-core/ampliseq -profile singularity --input "data" --FW_primer GTGYCAGCMGCCGCGGTAA --RV_primer GGACTACNVGGGTWTCTAAT +log.info Headers.nf_core(workflow, params.monochrome_logs) - In case of a timezone error, please specify "--qiime_timezone", e.g. --qiime_timezone 'Europe/Berlin'! +//////////////////////////////////////////////////// +/* -- PRINT HELP -- */ +//////////////////////////////////////////////////// - Main arguments: - -profile [strings] Use this parameter to choose a configuration profile. If not specified, runs locally and expects all software - to be installed and available on the `PATH`. Otherwise specify a container engine, "docker" or "singularity" - and a specialized profile such as "binac". 
- --input [path/to/folder] Folder containing paired-end demultiplexed fastq files - Note: All samples have to be sequenced in one run, otherwise also specifiy "--multipleSequencingRuns" - --FW_primer [str] Forward primer sequence - --RV_primer [str] Reverse primer sequence - --metadata [path/to/file] Path to metadata sheet, when missing most downstream analysis are skipped (barplots, PCoA plots, ...). - File extension is not relevant. Must have a comma separated list of metadata column headers. - --manifest [path/to/file] Path to manifest.tsv table with the following labels in this exact order: sampleID, forwardReads, reverseReads. In case of single end reads, the labels should be: sampleID, Reads. - Tab ('\t') must be the table separator. Multiple sequencing runs not supported by manifest at this stage. - Default is FALSE. - --qiime_timezone [str] Needs to be specified to resolve a timezone error (default: 'Europe/Berlin') - - Other input options: - --extension [str] Naming of sequencing files (default: "/*_R{1,2}_001.fastq.gz"). - The prepended "/" is required, also one "*" is required for sample names and "{1,2}" indicates read orientation - --multipleSequencingRuns If samples were sequenced in multiple sequencing runs. Expects one subfolder per sequencing run - in the folder specified by "--input" containing sequencing data of the specific run. These folders - may not contain underscores. - --split [str] A string that will be used between the prepended run/folder name and the sample name. (default: "-") - May not be present in run/folder names and no underscore(s) allowed. Only used with "--multipleSequencingRuns" - --pacbio If PacBio data. Use this option together with --manifest. - --phred64 If the sequencing data has PHRED 64 encoded quality scores (default: PHRED 33) - - Filters: - --exclude_taxa [str] Comma separated list of unwanted taxa (default: "mitochondria,chloroplast") - To skip taxa filtering use "none" - --min_frequency [int] Remove entries from the feature table below an absolute abundance threshold (default: 1) - --min_samples [int] Filtering low prevalent features from the feature table (default: 1) - - Cutoffs: - --double_primer Cutdapt will be run twice, first to remove reads without primers (default), then a second time to remove reads that erroneously contain a second set of primers, not to be used with "--retain_untrimmed" - --retain_untrimmed Cutadapt will retain untrimmed reads - --maxEE [number] DADA2 read filtering option. After truncation, reads with higher than ‘maxEE’ "expected errors" will be discarded. 
We recommend (to start with) a value corresponding to approximately 1 expected error per 100-200 bp (default: 2) - --maxLen [int] DADA2 read filtering option [PacBio only], remove reads with length greater than maxLen after trimming and truncation (default: 2999) - --minLen [int] DADA2 read filtering option [PacBio only], remove reads with length less than minLen after trimming and truncation (default: 50) - --trunclenf [int] DADA2 read truncation value for forward strand and single end reads, set this to 0 for no truncation - --trunclenr [int] DADA2 read truncation value for reverse strand, set this to 0 for no truncation - --trunc_qmin [int] If --trunclenf and --trunclenr are not set, these values will be automatically determined using this mean quality score (not preferred) (default: 25) - --trunc_rmin [float] Assures that values chosen with --trunc_qmin will retain a fraction of reads (default: 0.75) - - References: If you have trained a compatible classifier before, or want to use a custom database - --classifier [path/to/file] Path to QIIME2 classifier file (typically *-classifier.qza) - --classifier_removeHash Remove all hash signs from taxonomy strings, resolves a rare ValueError during classification (process classifier) - --reference_database Path to file with reference database with taxonomies, currently either a qiime compatible file Silva_132_release.zip, or a UNITE fasta file (default: "https://www.arb-silva.de/fileadmin/silva_databases/qiime/Silva_132_release.zip") - --taxon_reference Specify which database to use for taxonomic assignment. Either 'silva' or 'unite' (default: 'silva') - - Statistics: - --metadata_category [str] Comma separated list of metadata column headers for statistics (default: false) - If not specified, all suitable columns in the metadata sheet will be used. - Suitable are columns which are categorical (not numerical) and have multiple - different values that are not all unique. - - Other options: - --untilQ2import Skip all steps after importing into QIIME2, used for visually choosing DADA2 parameter - --Q2imported [path/to/file] Path to imported reads (e.g. "demux.qza"), used after visually choosing DADA2 parameter - --onlyDenoising Skip all steps after denoising, produce only sequences and abundance tables on ASV level - --keepIntermediates Keep additional intermediate files, such as trimmed reads or various QIIME2 archives - --outdir [file] The output directory where the results will be saved - --publish_dir_mode [str] Mode for publishing results in the output directory. Available: symlink, rellink, link, copy, copyNoFollow, move (Default: copy) - --email [email] Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits - --email_on_fail [email] Same as --email, except only send mail if the workflow is not successful - --max_multiqc_email_size [str] Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) - -name [str] Name for the pipeline run. 
If not specified, Nextflow will automatically generate a random mnemonic - - Skipping steps: - --skip_fastqc Skip FastQC - --skip_alpha_rarefaction Skip alpha rarefaction - --skip_taxonomy Skip taxonomic classification - --skip_barplot Skip producing barplot - --skip_abundance_tables Skip producing any relative abundance tables - --skip_diversity_indices Skip alpha and beta diversity analysis - --skip_ancom Skip differential abundance testing - - AWSBatch options: - --awsqueue [str] The AWSBatch JobQueue that needs to be set when running on AWSBatch - --awsregion [str] The AWS Region for your AWS Batch job to run on - --awscli [str] Path to the AWS CLI tool - """.stripIndent() -} - -/* - * SET UP CONFIGURATION VARIABLES - */ - -// Show help message -if (params.help){ - helpMessage() - exit 0 -} - -// Configurable variables -params.name = false -params.email = false -params.plaintext_email = false - -ch_output_docs = Channel.fromPath("$projectDir/docs/output.md") -Channel.fromPath("$projectDir/assets/matplotlibrc") - .into { ch_mpl_for_classifier_extract_seq; ch_mpl_for_classifier_train; ch_mpl_for_qiime_import; ch_mpl_for_ancom_asv; ch_mpl_for_ancom_tax; ch_mpl_for_ancom; ch_mpl_for_beta_diversity_ord; ch_mpl_for_beta_diversity; ch_mpl_for_alpha_diversity; ch_mpl_for_metadata_pair; ch_mpl_for_metadata_cat; ch_mpl_for_diversity_core; ch_mpl_for_alpha_rare; ch_mpl_for_tree; ch_mpl_for_barcode; ch_mpl_for_relreducetaxa; ch_mpl_for_relasv; ch_mpl_for_export_dada_output; ch_mpl_filter_taxa; ch_mpl_classifier; ch_mpl_dada; ch_mpl_dada_merge; ch_mpl_for_demux_visualize; ch_mpl_for_classifier } - - -/* - * Define pipeline steps - */ -params.untilQ2import = false - -params.Q2imported = false -if (params.Q2imported) { - params.skip_fastqc = true - params.skip_multiqc = true -} else { - params.skip_multiqc = false -} - -params.onlyDenoising = false -if (params.onlyDenoising || params.untilQ2import) { - params.skip_abundance_tables = true - params.skip_barplot = true - params.skip_taxonomy = true - params.skip_alpha_rarefaction = true - params.skip_diversity_indices = true - params.skip_ancom = true -} else { - params.skip_abundance_tables = false - params.skip_barplot = false - params.skip_taxonomy = false - params.skip_alpha_rarefaction = false - params.skip_diversity_indices = false - params.skip_ancom = false -} - -params.manifest = false - -/* - * Import input files - */ -if (params.metadata) { - Channel.fromPath("${params.metadata}", checkIfExists: true) - .into { ch_metadata_for_barplot; ch_metadata_for_alphararefaction; ch_metadata_for_diversity_core; ch_metadata_for_alpha_diversity; ch_metadata_for_metadata_category_all; ch_metadata_for_metadata_category_pairwise; ch_metadata_for_beta_diversity; ch_metadata_for_beta_diversity_ordination; ch_metadata_for_ancom; ch_metadata_for_ancom_tax; ch_metadata_for_ancom_asv } -} else { - Channel.from() - .into { ch_metadata_for_barplot; ch_metadata_for_alphararefaction; ch_metadata_for_diversity_core; ch_metadata_for_alpha_diversity; ch_metadata_for_metadata_category_all; ch_metadata_for_metadata_category_pairwise; ch_metadata_for_beta_diversity; ch_metadata_for_beta_diversity_ordination; ch_metadata_for_ancom; ch_metadata_for_ancom_tax; ch_metadata_for_ancom_asv } +def json_schema = "$projectDir/nextflow_schema.json" +if (params.help) { + def command = "nextflow run nf-core/ampliseq --input 'samplesheet.tsv' -profile docker" + log.info NfcoreSchema.params_help(workflow, params, json_schema, command) + exit 0 } -if (params.Q2imported) { - 
Channel.fromPath("${params.Q2imported}", checkIfExists: true) - .into { ch_qiime_demux_import; ch_qiime_demux_vis; ch_qiime_demux_dada } -} +//////////////////////////////////////////////////// +/* -- PRINT PARAMETER SUMMARY -- */ +//////////////////////////////////////////////////// -if (params.classifier) { - Channel.fromPath("${params.classifier}", checkIfExists: true) - .set { ch_qiime_classifier } -} +def summary_params = NfcoreSchema.params_summary_map(workflow, params, json_schema) +log.info NfcoreSchema.params_summary_log(workflow, params, json_schema) -/* - * Sanity check input values - */ -if (!params.Q2imported) { - if (!params.FW_primer) { exit 1, "Option --FW_primer missing" } - if (!params.RV_primer) { exit 1, "Option --RV_primer missing" } - if (!params.input) { exit 1, "Option --input missing" } +//////////////////////////////////////////////////// +/* -- VALIDATE PARAMETERS -- */ +//////////////////////////////////////////////////// +if (params.validate_params) { + NfcoreSchema.validateParameters(params, json_schema, log) } -if (params.Q2imported && params.untilQ2import) { - exit 1, "Choose either to import data into a QIIME2 artefact and quit with --untilQ2import or use an already existing QIIME2 data artefact with --Q2imported." -} +//////////////////////////////////////////////////// +/* -- PARAMETER CHECKS -- */ +//////////////////////////////////////////////////// -if ("${params.split}".indexOf("_") > -1 ) { - exit 1, "Underscore is not allowed in --split, please review your input." -} - -if (params.multipleSequencingRuns && params.manifest) { - exit 1, "The manifest file does not support multiple sequencing runs at this point." -} - -single_end = false -if (params.pacbio) { - single_end = true -} - -if (single_end && !params.manifest) { - exit 1, "A manifest file is needed for single end reads such as PacBio data." -} - -if (params.double_primer && params.retain_untrimmed) { - exit 1, "Incompatible parameters --double_primer and --retain_untrimmed cannot be set at the same time." -} - -if (!params.classifier){ - if (!(params.taxon_reference == 'silva' || params.taxon_reference == 'unite')) exit 1, "--taxon_reference need to be set to either 'silva' or 'unite'" -} - -// AWSBatch sanity checking -if(workflow.profile == 'awsbatch'){ - if (!params.awsqueue || !params.awsregion) exit 1, "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" - if (!workflow.workDir.startsWith('s3') || !params.outdir.startsWith('s3')) exit 1, "Specify S3 URLs for workDir and outdir parameters on AWSBatch!" -} - -// Has the run name been specified by the user? -// this has the bonus effect of catching both -name and --name -custom_runName = params.name -if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { - custom_runName = workflow.runName +// Check that conda channels are set-up correctly +if (params.enable_conda) { + Checks.check_conda_channels(log) } // Check AWS batch settings -if (workflow.profile.contains('awsbatch')) { - // AWSBatch sanity checking - if (!params.awsqueue || !params.awsregion) exit 1, "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" - // Check outdir paths to be S3 buckets if running on AWSBatch - // related: https://github.com/nextflow-io/nextflow/issues/813 - if (!params.outdir.startsWith('s3:')) exit 1, "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!" - // Prevent trace files to be stored on S3 since S3 does not support rolling files. 
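For reference, the inline AWS Batch checks in this removed block are what the new Checks.aws_batch(workflow, params) call centralises. A minimal sketch of such a helper, with a hypothetical class name (the actual lib/Checks.groovy is not shown in this hunk):

    class ChecksSketch {
        // Hypothetical stand-in for Checks.aws_batch(): on AWS Batch the output
        // directory must be an S3 URL, while the trace directory must stay local,
        // because S3 objects cannot be appended to (no rolling trace files).
        static void aws_batch(workflow, params) {
            if (workflow.profile.contains('awsbatch')) {
                assert params.outdir.startsWith('s3:') : 'Outdir not on S3 - specify S3 Bucket to run on AWSBatch!'
                assert !params.tracedir.startsWith('s3:') : 'Specify a local tracedir or run without trace! S3 cannot be used for tracefiles.'
            }
        }
    }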
- if (params.tracedir.startsWith('s3:')) exit 1, "Specify a local tracedir or run without trace! S3 cannot be used for tracefiles." -} +Checks.aws_batch(workflow, params) -// Stage config files -ch_multiqc_config = file("$projectDir/assets/multiqc_config.yaml", checkIfExists: true) -ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() -ch_output_docs = file("$projectDir/docs/output.md", checkIfExists: true) -ch_output_docs_images = file("$projectDir/docs/images/", checkIfExists: true) - - -// Header log info -log.info nfcoreHeader() -def summary = [:] -summary['Pipeline Name'] = 'nf-core/ampliseq' -if(workflow.revision) summary['Pipeline Release'] = workflow.revision -summary['Run Name'] = custom_runName ?: workflow.runName -summary['Input'] = params.manifest ?: params.input -summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" -if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" -summary['Output dir'] = params.outdir -summary['Launch dir'] = workflow.launchDir -summary['Working dir'] = workflow.workDir -summary['Script dir'] = workflow.projectDir -summary['User'] = workflow.userName -if (workflow.profile.contains('awsbatch')) { - summary['AWS Region'] = params.awsregion - summary['AWS Queue'] = params.awsqueue - summary['AWS CLI'] = params.awscli -} -summary['Config Profile'] = workflow.profile -if (params.config_profile_description) summary['Config Profile Description'] = params.config_profile_description -if (params.config_profile_contact) summary['Config Profile Contact'] = params.config_profile_contact -if (params.config_profile_url) summary['Config Profile URL'] = params.config_profile_url -summary['Config Files'] = workflow.configFiles.join(', ') -if (params.email || params.email_on_fail) { - summary['E-mail Address'] = params.email - summary['E-mail on failure'] = params.email_on_fail - summary['MultiQC maxsize'] = params.max_multiqc_email_size -} -log.info summary.collect { k,v -> "${k.padRight(18)}: $v" }.join("\n") -log.info "-\033[2m--------------------------------------------------\033[0m-" - -if( params.trunclenf == false || params.trunclenr == false ){ - if ( !params.untilQ2import ) log.info "\n######## WARNING: No DADA2 cutoffs were specified, therefore reads will be truncated where median quality drops below ${params.trunc_qmin} but at least a fraction of ${params.trunc_rmin} of the reads will be retained.\nThe chosen cutoffs do not account for required overlap for merging, therefore DADA2 might have poor merging efficiency or even fail.\n" -} // Check the hostnames against configured profiles -checkHostname() +Checks.hostname(workflow, params, log) -Channel.from(summary.collect{ [it.key, it.value] }) - .map { k,v -> "
<dt>$k</dt><dd><samp>${v ?: 'N/A'}</samp></dd>" } - .reduce { a, b -> return [a, b].join("\n ") } - .map { x -> """ - id: 'nf-core-ampliseq-summary' - description: " - this information is collected when the pipeline is started." - section_name: 'nf-core/ampliseq Workflow Summary' - section_href: 'https://github.com/nf-core/ampliseq' - plot_type: 'html' - data: | - <dl class=\"dl-horizontal\"> - $x - </dl>
- """.stripIndent() } - .set { ch_workflow_summary } +/////////////////////////////////////////////////// +/* -- RUN MAIN WORKFLOW -- */ +//////////////////////////////////////////////////// -/* - * Parse software version numbers - */ -process get_software_versions { - publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode, - saveAs: { filename -> - if (filename.indexOf(".csv") > 0) filename - else null - } - - output: - file 'software_versions_mqc.yaml' into ch_software_versions_yaml - file "software_versions.csv" - - script: - """ - echo $workflow.manifest.version > v_pipeline.txt - echo $workflow.nextflow.version > v_nextflow.txt - fastqc --version > v_fastqc.txt - multiqc --version > v_multiqc.txt - cutadapt --version > v_cutadapt.txt - qiime --version > v_qiime.txt - scrape_software_versions.py &> software_versions_mqc.yaml - """ +workflow { + /* + * SUBWORKFLOW: Run main nf-core/ampliseq analysis pipeline + */ + include { AMPLISEQ } from './workflows/ampliseq' addParams( summary_params: summary_params ) + AMPLISEQ () } +//////////////////////////////////////////////////// +/* -- CHECK PARAMETER ON ERROR -- */ +//////////////////////////////////////////////////// -if (!params.Q2imported){ - /* - * Create a channel for optional input manifest file - */ - if (params.manifest && !single_end) { - tsvFile = file(params.manifest).getName() - // extracts read files from TSV and distribute into channels - Channel - .fromPath(params.manifest) - .ifEmpty {exit 1, log.info "Cannot find path file ${tsvFile}"} - .splitCsv(header:true, sep:'\t') - .map { row -> [ row.sampleID, [ file(row.forwardReads, checkIfExists: true), file(row.reverseReads, checkIfExists: true) ] ] } - .into { ch_read_pairs; ch_read_pairs_fastqc; ch_read_pairs_name_check } - } else if ( single_end ) { - // Manifest file is currently the only available input option for single_end - tsvFile = file(params.manifest).getName() - // extracts read files from TSV and distribute into channels - Channel - .fromPath(params.manifest) - .ifEmpty {exit 1, log.info "Cannot find path file ${tsvFile}"} - .splitCsv(header:true, sep:'\t') - .map { row -> [ row.sampleID, file(row.Reads, checkIfExists: true) ] } - .into { ch_read_pairs; ch_read_pairs_fastqc; ch_read_pairs_name_check } - - /* - * Create a channel for input read files - */ - } else if (params.readPaths && params.input == "data${params.extension}" && !params.multipleSequencingRuns){ - //Test input for single sequencing runs, profile = test - - Channel - .from(params.readPaths) - .map { row -> [ row[0], [file(row[1][0]), file(row[1][1])]] } - .ifEmpty { exit 1, "params.readPaths was empty - no input files supplied" } - .map { name, reads -> [ name.toString().indexOf("_") != -1 ? name.toString().take(name.toString().indexOf("_")) : name, reads ] } - .into { ch_read_pairs; ch_read_pairs_fastqc; ch_read_pairs_name_check } - - } else if ( !params.readPaths && params.multipleSequencingRuns ) { - //Standard input for multiple sequencing runs - - //Get files - Channel - .fromFilePairs( params.input + "/*" + params.extension, size: 2 ) - .ifEmpty { exit 1, "Cannot find any reads matching: ${params.input}/*${params.extension}\nNB: Path needs to be enclosed in quotes!" 
} - .into { ch_extract_folders; ch_rename_key } - - //Get folder information - ch_extract_folders - .flatMap { key, files -> [files[0]] } - .map { it.take(it.findLastIndexOf{"/"})[-1] } - .unique() - .into { ch_count_folders; ch_check_folders; ch_report_folders } - - //Report folders with sequencing files - ch_report_folders - .collect() - .subscribe { - String folders = it.toString().replace("[", "").replace("]","") - log.info "\nFound the folder(s) \"$folders\" containing sequencing read files matching \"${params.extension}\" in \"${params.input}\".\n" } - - //Stop if folder count is 1 - ch_count_folders - .count() - .subscribe { if ( it == 1 ) exit 1, "Found only one folder with read data but \"--multipleSequencingRuns\" was specified. Please review data input." } - - //Stop if folder names contain "_" or "${params.split}" - ch_check_folders - .subscribe { - if ( it.toString().indexOf("${params.split}") > -1 ) exit 1, "Folder name \"$it\" contains \"${params.split}\", but may not. Please review data input or choose another string using \"--split [str]\" (no underscore allowed!)." - if ( it.toString().indexOf("_") > -1 ) exit 1, "Folder name \"$it\" contains \"_\", but may not. Please review data input." - } - - //Add folder information to sequence files - ch_rename_key - .map { key, files -> [ key, files, (files[0].take(files[0].findLastIndexOf{"/"})[-1]) ] } - .into { ch_read_pairs; ch_read_pairs_fastqc } - - } else if ( params.readPaths && params.multipleSequencingRuns ) { - //Test input for multiple sequencing runs, profile = test_multi - - Channel - .from(params.readPaths) - .map { row -> [ row[0], [file(row[1][0]), file(row[1][1])], row[2] ] } - .ifEmpty { exit 1, "params.readPaths was empty - no input files supplied" } - .into { ch_read_pairs; ch_read_pairs_fastqc } - - } else { - //Standard input - - Channel - .fromFilePairs( params.input + params.extension, size: 2 ) - .ifEmpty { exit 1, "Cannot find any reads matching: ${params.input}${params.extension}\nNB: Path needs to be enclosed in quotes!" } - .map { name, reads -> [ name.toString().indexOf("_") != -1 ? name.toString().take(name.toString().indexOf("_")) : name, reads ] } - .into { ch_read_pairs; ch_read_pairs_fastqc } - } - - /* - * fastQC - */ - if (!params.multipleSequencingRuns){ - process fastqc { - tag "${pair_id}" - publishDir "${params.outdir}/fastQC", mode: params.publish_dir_mode, - saveAs: {filename -> filename.indexOf(".zip") > 0 ? "zips/$filename" : "$filename"} - - input: - set val(pair_id), file(reads) from ch_read_pairs_fastqc - - output: - file "*_fastqc.{zip,html}" into ch_fastqc_results - - when: - !params.skip_fastqc - - script: - """ - fastqc -q ${reads} - """ - } - } else { - process fastqc_multi { - tag "${folder}${params.split}${pair_id}" - publishDir "${params.outdir}/fastQC", mode: params.publish_dir_mode, - saveAs: {filename -> filename.indexOf(".zip") > 0 ? 
"zips/$filename" : "$filename"} - - input: - set val(pair_id), file(reads), val(folder) from ch_read_pairs_fastqc - - output: - file "*_fastqc.{zip,html}" into ch_fastqc_results - - when: - !params.skip_fastqc - - script: - """ - #Rename files so that there is no possible overlap - ln -s "${reads[0]}" "$folder${params.split}${reads[0]}" - ln -s "${reads[1]}" "$folder${params.split}${reads[1]}" - fastqc -q "$folder${params.split}${reads[0]}" "$folder${params.split}${reads[1]}" - """ - } - } - - /* - * Trim each read or read-pair with cutadapt - */ - if (!params.multipleSequencingRuns){ - process trimming { - tag "${pair_id}" - publishDir "${params.outdir}/trimmed", mode: params.publish_dir_mode, - saveAs: {filename -> - if (filename.indexOf(".gz") == -1) "logs/$filename" - else if(params.keepIntermediates) filename - else null} - - input: - set val(pair_id), file(reads) from ch_read_pairs - - output: - set val(pair_id), file ("trimmed/*.*") into ch_fastq_trimmed_manifest - file "trimmed/*.*" into (ch_fastq_trimmed, ch_fastq_trimmed_qiime) - file "cutadapt_log_*.txt" into ch_fastq_cutadapt_log - - script: - discard_untrimmed = params.retain_untrimmed ? '' : '--discard-untrimmed' - primers = single_end ? "--rc -g ${params.FW_primer}...${params.RV_primer}" : "-g ${params.FW_primer} -G ${params.RV_primer}" - in_2_files = single_end ? "second-trimming_${reads}" : "second-trimming_${reads[0]} second-trimming_${reads[1]}" - out_1_files = single_end ? "-o second-trimming_${reads}" : "-o second-trimming_${reads[0]} -p second-trimming_${reads[1]}" - in_1_files = single_end ? "first-trimming_${reads}" : "first-trimming_${reads[0]} first-trimming_${reads[1]}" //these have to be symlinked below - out_files = single_end ? "-o trimmed/${reads}" : "-o trimmed/${reads[0]} -p trimmed/${reads[1]}" - in_files = single_end ? "${reads}" : "${reads[0]} ${reads[1]}" - """ - mkdir -p trimmed - if [[ \"${params.double_primer}\" = \"true\" && \"${params.retain_untrimmed}\" = \"false\" ]]; then - #rename files to list results correctly in MultiQC - if [ \"${single_end}\" = \"true\" ]; then - ln -s "${reads}" "first-trimming_${reads}" - else - ln -s "${reads[0]}" "first-trimming_${reads[0]}" - ln -s "${reads[1]}" "first-trimming_${reads[1]}" - fi - - cutadapt ${primers} ${discard_untrimmed} \ - ${out_1_files} \ - ${in_1_files} >> cutadapt_log_${pair_id}.txt - cutadapt ${primers} --discard-trimmed \ - ${out_files} \ - ${in_2_files} >> cutadapt_log_${pair_id}.txt - else - cutadapt ${primers} ${discard_untrimmed} \ - ${out_files} ${in_files} \ - >> cutadapt_log_${pair_id}.txt - fi - """ - } - - } else { - process trimming_multi { - tag "$folder${params.split}$pair_id" - publishDir "${params.outdir}/trimmed", mode: params.publish_dir_mode, - saveAs: {filename -> - if (filename.indexOf(".gz") == -1) "logs/$filename" - else if(params.keepIntermediates) filename - else null} - - input: - set val(pair_id), file(reads), val(folder) from ch_read_pairs - - output: - file "trimmed/*.*" into (ch_fastq_trimmed, ch_fastq_trimmed_manifest, ch_fastq_trimmed_qiime) - file "cutadapt_log_$folder${params.split}${pair_id}.txt" into ch_fastq_cutadapt_log - - script: - discard_untrimmed = params.retain_untrimmed ? 
'' : '--discard-untrimmed' - """ - mkdir -p trimmed - if [[ \"${params.double_primer}\" = \"true\" && \"${params.retain_untrimmed}\" = \"false\" ]]; then - mkdir -p firstcutadapt - #Rename files so that MultiQC will pick them up correctely as first trimming - ln -s "${reads[0]}" "first-trimming_$folder${params.split}${reads[0]}" - ln -s "${reads[1]}" "first-trimming_$folder${params.split}${reads[1]}" - cutadapt -g ${params.FW_primer} -G ${params.RV_primer} ${discard_untrimmed} \ - -o firstcutadapt/$folder${params.split}${reads[0]} -p firstcutadapt/$folder${params.split}${reads[1]} \ - "first-trimming_$folder${params.split}${reads[0]}" "first-trimming_$folder${params.split}${reads[1]}" >> cutadapt_log_$folder${params.split}${pair_id}.txt - #Rename files so that MultiQC will pick them up correctely as second trimming - ln -s "firstcutadapt/$folder${params.split}${reads[0]}" "second-trimming_$folder${params.split}${reads[0]}" - ln -s "firstcutadapt/$folder${params.split}${reads[1]}" "second-trimming_$folder${params.split}${reads[1]}" - cutadapt -g ${params.FW_primer} -G ${params.RV_primer} --discard-trimmed \ - -o trimmed/$folder${params.split}${reads[0]} -p trimmed/$folder${params.split}${reads[1]} \ - second-trimming_$folder${params.split}${reads[0]} second-trimming_$folder${params.split}${reads[1]} >> cutadapt_log_$folder${params.split}${pair_id}.txt - else - #first, rename files so that MultiQC will pick them up correctely - ln -s "${reads[0]}" "$folder${params.split}${reads[0]}" - ln -s "${reads[1]}" "$folder${params.split}${reads[1]}" - cutadapt -g ${params.FW_primer} -G ${params.RV_primer} ${discard_untrimmed} \ - -o trimmed/$folder${params.split}${reads[0]} -p trimmed/$folder${params.split}${reads[1]} \ - $folder${params.split}${reads[0]} $folder${params.split}${reads[1]} > cutadapt_log_$folder${params.split}${pair_id}.txt - fi - """ - } - } - - - /* - * multiQC - */ - process multiqc { - publishDir "${params.outdir}/MultiQC", mode: params.publish_dir_mode - - input: - file (multiqc_config) from ch_multiqc_config - file (mqc_custom_config) from ch_multiqc_custom_config.collect().ifEmpty([]) - file ('cutadapt/logs/*') from ch_fastq_cutadapt_log.collect().ifEmpty([]) - file ('fastqc/*') from ch_fastqc_results.collect().ifEmpty([]) - file ('software_versions/*') from ch_software_versions_yaml.collect().ifEmpty([]) - file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml") - - output: - file "*multiqc_report.html" into ch_multiqc_report - file "*_data" - - when: - !params.skip_multiqc - - script: - rtitle = custom_runName ? "--title \"$custom_runName\"" : '' - rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' - custom_config_file = params.multiqc_config ? "--config $mqc_custom_config" : '' - """ - multiqc --interactive -f $rtitle $rfilename $custom_config_file . 
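    # Illustrative comment only (hypothetical values, not part of the original script): the
    # command above is assembled from the optional $rtitle, $rfilename and $custom_config_file
    # fragments defined in the script block. With a run name of "my_run" and a custom config
    # passed via --multiqc_config, the interpolated call could resolve to something like:
    #   multiqc --interactive -f --title "my_run" --filename my_run_multiqc_report --config my_multiqc_config.yaml .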
- """ - } - - - /* - * Produce manifest file for QIIME2 - */ - if (!params.multipleSequencingRuns && single_end){ - ch_fastq_trimmed_manifest - .map { name, reads -> - def sampleID = name - def Reads = reads - [ "${sampleID}" +","+ "${Reads}" + ",forward" ] - } - .flatten() - .collectFile(name: 'manifest.txt', newLine: true, storeDir: "${params.outdir}/demux", seed: "sample-id,absolute-filepath,direction") - .set { ch_manifest } - - } else if (!params.multipleSequencingRuns){ - ch_fastq_trimmed_manifest - .map { name, reads -> - def sampleID = name - def fwdReads = reads [0] - def revReads = reads [1] - [ "${sampleID}" +","+ "${fwdReads}" + ",forward\n" + "${sampleID}" +","+ "${revReads}" +",reverse" ] - } - .flatten() - .collectFile(name: 'manifest.txt', newLine: true, storeDir: "${params.outdir}/demux", seed: "sample-id,absolute-filepath,direction") - .set { ch_manifest } - - } else { - ch_fastq_trimmed_manifest - .map { forward, reverse -> [ forward.drop(forward.findLastIndexOf{"/"})[0], forward, reverse ] } //extract file name - .map { name, forward, reverse -> [ name.toString().indexOf("_") != -1 ? name.toString().take(name.toString().indexOf("_")) : name, forward, reverse ] } //extract sample name - .map { name, forward, reverse -> [ name +","+ forward + ",forward\n" + name +","+ reverse +",reverse" ] } //prepare basic synthax - .flatten() - .collectFile(storeDir: "${params.outdir}", seed: "sample-id,absolute-filepath,direction\n") { item -> - def folder = item.take(item.indexOf("${params.split}")) //re-extract folder - [ "${folder}${params.split}manifest.txt", item + '\n' ] - } - .set { ch_manifest_file } - - ch_manifest_file - .combine( ch_mpl_for_qiime_import ) - .set { ch_manifest } - } - - /* - * Import trimmed files into QIIME2 artefact - */ - if (!params.multipleSequencingRuns && !params.pacbio) { - process qiime_import { - publishDir "${params.outdir}/demux", mode: params.publish_dir_mode, - saveAs: { filename -> - params.keepIntermediates ? filename : null - params.untilQ2import ? filename : null } - - input: - file(manifest) from ch_manifest - env MATPLOTLIBRC from ch_mpl_for_qiime_import - file('*') from ch_fastq_trimmed_qiime.collect() - - output: - file "demux.qza" into (ch_qiime_demux_import, ch_qiime_demux_vis, ch_qiime_demux_dada) - - when: - !params.Q2imported - - script: - input_format = params.phred64 ? "PairedEndFastqManifestPhred64" : "PairedEndFastqManifestPhred33" - """ - head -n 1 ${manifest} > header.txt - tail -n+2 ${manifest} | cut -d, -f1 > col1.txt - tail -n+2 ${manifest} | cut -d, -f2 | sed 's:.*/::' > col2.txt - while read f; do - realpath \$f >> full_path.txt - done col3.txt - paste -d, col1.txt full_path.txt col3.txt > cols.txt - cat cols.txt >> header.txt && mv header.txt ${manifest} - - qiime tools import \ - --type 'SampleData[PairedEndSequencesWithQuality]' \ - --input-path ${manifest} \ - --output-path demux.qza \ - --input-format $input_format - """ - } - } else if (params.pacbio) { - process qiime_import_pacbio{ - publishDir "${params.outdir}/demux", mode: 'copy', - saveAs: { filename -> - params.keepIntermediates ? filename : null - params.untilQ2import ? filename : null } - - input: - file(manifest) from ch_manifest - env MATPLOTLIBRC from ch_mpl_for_qiime_import - - output: - file manifest into ch_dada_import - file "demux.qza" into (ch_qiime_demux_import, ch_qiime_demux_vis) - - when: - !params.Q2imported - - script: - input_format = params.phred64 ? 
"SingleEndFastqManifestPhred64" : "SingleEndFastqManifestPhred33" - """ - qiime tools import \ - --type 'SampleData[SequencesWithQuality]' \ - --input-path ${manifest} \ - --output-path demux.qza \ - --input-format $input_format - """ - } - } else { - process qiime_import_multi { - tag "${manifest}" - - publishDir "${params.outdir}", mode: params.publish_dir_mode, - saveAs: { filename -> - params.keepIntermediates ? filename : null} - - input: - set file(manifest), env(MATPLOTLIBRC) from ch_manifest - file('*') from ch_fastq_trimmed_qiime.collect() - - output: - file "*demux.qza" into (ch_qiime_demux_import, ch_qiime_demux_vis, ch_qiime_demux_dada) mode flatten - - when: - !params.Q2imported - - script: - input_format = params.phred64 ? "PairedEndFastqManifestPhred64" : "PairedEndFastqManifestPhred33" - def folder = "${manifest}".take("${manifest}".indexOf("${params.split}")) - """ - head -n 1 ${manifest} > header.txt - tail -n+2 ${manifest} | cut -d, -f1 > col1.txt - tail -n+2 ${manifest} | cut -d, -f2 | sed 's:.*/::' > col2.txt - while read f; do - realpath \$f >> full_path.txt - done col3.txt - paste -d, col1.txt full_path.txt col3.txt > cols.txt - cat cols.txt >> header.txt && mv header.txt ${manifest} - - qiime tools import \ - --type 'SampleData[PairedEndSequencesWithQuality]' \ - --input-path ${manifest} \ - --output-path ${folder}-demux.qza \ - --input-format $input_format - """ - } - } - ch_qiime_demux_vis - .combine( ch_mpl_for_demux_visualize ) - .set{ ch_qiime_demux_visualisation } -} - -/* - * Download, unpack, extract and train classifier - * Download, unpack, and extract classifier in one process, train classifier in following process - * Use "--dereplication 90" for testing and "--dereplication 99" for real datasets - * Requirements with "--dereplication 99": 1 core (seems not to scale with more?), ~35 Gb mem, ~2:15:00 walltime - */ - -if( !params.classifier ){ - if( !params.onlyDenoising && !params.skip_taxonomy ){ - Channel.fromPath("${params.reference_database}") - .set { ch_ref_database } - } else { - Channel.empty() - .set { ch_ref_database } - } - - process classifier_extract_seq { - - input: - file database from ch_ref_database - env MATPLOTLIBRC from ch_mpl_for_classifier_extract_seq - - output: - file("*.qza") into ch_qiime_pretrain - stdout ch_message_classifier_removeHash - - when: - !params.onlyDenoising || !params.untilQ2import - - script: - - """ - export HOME="\${PWD}/HOME" - - if [ ${params.taxon_reference} = \"unite\" ]; then - create_unite_taxfile.py $database db.fa db.tax - fasta=\"db.fa\" - taxonomy=\"db.tax\" - else - unzip -qq $database - fasta=\"SILVA_132_QIIME_release/rep_set/rep_set_16S_only/${params.dereplication}/silva_132_${params.dereplication}_16S.fna\" - taxonomy=\"SILVA_132_QIIME_release/taxonomy/16S_only/${params.dereplication}/consensus_taxonomy_7_levels.txt\" - fi - - - if [ \"${params.classifier_removeHash}\" = \"true\" ]; then - sed \'s/#//g\' \$taxonomy >taxonomy-${params.dereplication}_removeHash.txt - taxonomy=\"taxonomy-${params.dereplication}_removeHash.txt\" - echo \"\n######## WARNING! 
The taxonomy file was altered by removing all hash signs!\" - fi - - ### Import - qiime tools import --type \'FeatureData[Sequence]\' \ - --input-path \$fasta \ - --output-path ref-seq-${params.dereplication}.qza - qiime tools import --type \'FeatureData[Taxonomy]\' \ - --input-format HeaderlessTSVTaxonomyFormat \ - --input-path \$taxonomy \ - --output-path ref-taxonomy-${params.dereplication}.qza - - #Extract sequences based on primers - qiime feature-classifier extract-reads \ - --i-sequences ref-seq-${params.dereplication}.qza \ - --p-f-primer ${params.FW_primer} \ - --p-r-primer ${params.RV_primer} \ - --o-reads ${params.FW_primer}-${params.RV_primer}-${params.dereplication}-ref-seq.qza \ - --quiet - """ - } - - ch_message_classifier_removeHash - .subscribe { log.info it } - - process classifier_train { - publishDir "${params.outdir}/DB/", mode: params.publish_dir_mode, - saveAs: {filename -> - if (filename.indexOf("${params.FW_primer}-${params.RV_primer}-${params.dereplication}-classifier.qza") == 0) filename - else if(params.keepIntermediates) filename - else null} - - input: - file '*' from ch_qiime_pretrain - env MATPLOTLIBRC from ch_mpl_for_classifier_train - - output: - file("${params.FW_primer}-${params.RV_primer}-${params.dereplication}-classifier.qza") into ch_qiime_classifier - - when: - !params.onlyDenoising || !params.untilQ2import - - script: - - """ - export HOME="\${PWD}/HOME" - - #Train classifier - qiime feature-classifier fit-classifier-naive-bayes \ - --i-reference-reads ${params.FW_primer}-${params.RV_primer}-${params.dereplication}-ref-seq.qza \ - --i-reference-taxonomy ref-taxonomy-${params.dereplication}.qza \ - --o-classifier ${params.FW_primer}-${params.RV_primer}-${params.dereplication}-classifier.qza \ - --quiet - """ - } - -} - -/* - * Import trimmed files into QIIME2 artefact - */ -if( !params.Q2imported ){ - process qiime_demux_visualize { - tag "${demux.baseName}" - publishDir "${params.outdir}", mode: params.publish_dir_mode - - input: - set file(demux), env(MATPLOTLIBRC) from ch_qiime_demux_visualisation - - output: - file("${demux.baseName}/*-seven-number-summaries.csv") into ch_csv_demux - file("${demux.baseName}/*") - - """ - export HOME="\${PWD}/HOME" - - qiime demux summarize \ - --i-data ${demux} \ - --o-visualization ${demux.baseName}.qzv - - qiime tools export --input-path ${demux.baseName}.qzv --output-path ${demux.baseName} - """ - } -} else { - process qiime_importdemux_visualize { - publishDir "${params.outdir}", mode: params.publish_dir_mode - - input: - env MATPLOTLIBRC from ch_mpl_for_demux_visualize - - output: - file("demux/*-seven-number-summaries.csv") into ch_csv_demux - file("demux/*") - - """ - export HOME="\${PWD}/HOME" - - qiime demux summarize \ - --i-data ${params.Q2imported} \ - --o-visualization demux.qzv - - qiime tools export --input-path demux.qzv --output-path demux - """ - } +workflow.onError { + // Print unexpected parameters - easiest is to just rerun validation + NfcoreSchema.validateParameters(params, json_schema, log) } - -/* - * Determine params.trunclenf and params.trunclenr where the median quality value drops below params.trunc_qmin - * But at least the fraction of params.trunc_rmin reads is retained - * "Warning massage" is printed - */ -if ( ! 
single_end ) { - process dada_trunc_parameter { - - input: - file summary_demux from ch_csv_demux - - output: - stdout ch_dada_trunc - - when: - !params.untilQ2import - - script: - if( params.trunclenf == false || params.trunclenr == false ){ - """ - dada_trunc_parameter.py ${summary_demux[0]} ${summary_demux[1]} ${params.trunc_qmin} ${params.trunc_rmin} - """ - } - else - """ - printf "${params.trunclenf},${params.trunclenr}" - """ - } -} else { - process dada_trunc_se { - - output: - stdout ch_dada_trunc - - when: - !params.untilQ2import - - script: - if ( params.trunclenf == false ) { - """ - printf "0" - """ - } else { - """ - printf "${params.trunclenf}" - """ - } - } -} - -if (params.multipleSequencingRuns){ - //find minimum dada truncation values - ch_dada_trunc - .into { dada_trunc_forward; dada_trunc_reverse } - dada_trunc_forward - .map { trunc -> (trunc.split(',')[0]) } - .min() - .set { dada_forward } - dada_trunc_reverse - .map { trunc -> (trunc.split(',')[1]) } - .min() - .set { dada_reverse } - dada_forward - .combine( dada_reverse ) - .set { dada_trunc_multi } - //combine channels for dada_multi - ch_qiime_demux_dada - .combine( dada_trunc_multi ) - .combine( ch_mpl_dada ) - .set { ch_dada_multi } -} - - -/* - * Find ASVs with DADA2 - * (i) for single sequencing run - * (ii) for PacBio reads - * (iii) for multiple sequencing runs - */ -if (!params.multipleSequencingRuns && !params.pacbio){ - process dada_single { - tag "$trunc" - publishDir "${params.outdir}", mode: params.publish_dir_mode, - saveAs: {filename -> - if (filename.indexOf("dada_stats/stats.tsv") == 0) "abundance_table/unfiltered/dada_stats.tsv" - else if (filename.indexOf("dada_report.txt") == 0) "abundance_table/unfiltered/dada_report.txt" - else if (filename.indexOf("table.qza") == 0) "abundance_table/unfiltered/$filename" - else if (filename.indexOf("rel-table/feature-table.biom") == 0) "abundance_table/unfiltered/rel-feature-table.biom" - else if (filename.indexOf("table/feature-table.biom") == 0) "abundance_table/unfiltered/feature-table.biom" - else if (filename.indexOf("rel-feature-table.tsv") > 0) "abundance_table/unfiltered/rel-feature-table.tsv" - else if (filename.indexOf("feature-table.tsv") > 0) "abundance_table/unfiltered/feature-table.tsv" - else if (filename.indexOf("rep-seqs.qza") == 0) "representative_sequences/unfiltered/rep-seqs.qza" - else if (filename.indexOf("unfiltered/*")) "representative_sequences/$filename" - else null} - - input: - file demux from ch_qiime_demux_dada - val trunc from ch_dada_trunc - env MATPLOTLIBRC from ch_mpl_dada - - output: - file("table.qza") into ch_qiime_table_raw - file("rep-seqs.qza") into (ch_qiime_repseq_raw_for_classifier,ch_qiime_repseq_raw_for_filter) - file("table/feature-table.tsv") into ch_tsv_table_raw - file("dada_stats/stats.tsv") - file("table/feature-table.biom") - file("rel-table/feature-table.biom") - file("table/rel-feature-table.tsv") - file("unfiltered/*") - file("dada_report.txt") - - when: - !params.untilQ2import - - script: - def values = trunc.split(',') - if (values[0].toInteger() + values[1].toInteger() <= 10) { - log.info "\n######## ERROR: Total read pair length is below 10, this is definitely too low.\nForward ${values[0]} and reverse ${values[1]} are chosen.\nPlease provide appropriate values for --trunclenf and --trunclenr or lower --trunc_qmin\n" } - """ - export HOME="\${PWD}/HOME" - IFS=',' read -r -a trunclen <<< \"$trunc\" - - #denoise samples with DADA2 and produce - qiime dada2 denoise-paired \ - --i-demultiplexed-seqs 
${demux} \ - --p-trunc-len-f \${trunclen[0]} \ - --p-trunc-len-r \${trunclen[1]} \ - --p-max-ee-f ${params.maxEE} \ - --p-max-ee-r ${params.maxEE} \ - --p-n-threads 0 \ - --o-table table.qza \ - --o-representative-sequences rep-seqs.qza \ - --o-denoising-stats stats.qza \ - --verbose \ - >dada_report.txt - - #produce dada2 stats "dada_stats/stats.tsv" - qiime tools export --input-path stats.qza \ - --output-path dada_stats - - #produce raw count table in biom format "table/feature-table.biom" - qiime tools export --input-path table.qza \ - --output-path table - - #produce raw count table - biom convert -i table/feature-table.biom \ - -o table/feature-table.tsv \ - --to-tsv - - #produce representative sequence fasta file - qiime feature-table tabulate-seqs \ - --i-data rep-seqs.qza \ - --o-visualization rep-seqs.qzv - qiime tools export --input-path rep-seqs.qzv \ - --output-path unfiltered - - #convert to relative abundances - qiime feature-table relative-frequency \ - --i-table table.qza \ - --o-relative-frequency-table relative-table-ASV.qza - - #export to biom - qiime tools export --input-path relative-table-ASV.qza \ - --output-path rel-table - - #convert to tab separated text file - biom convert \ - -i rel-table/feature-table.biom \ - -o table/rel-feature-table.tsv --to-tsv - """ - } -} else if (params.pacbio){ - process dada_pacBio { - - publishDir "${params.outdir}", mode: 'copy', - saveAs: {filename -> - if (filename.indexOf("dada_stats.tsv") == 0) "abundance_table/unfiltered/dada_stats.tsv" - else if (filename.indexOf("dada_report.txt") == 0) "abundance_table/unfiltered/dada_report.txt" - else if (filename.indexOf("rel-feature-table.tsv") == 0) "abundance_table/unfiltered/rel-feature-table.tsv" - else if (filename.indexOf("feature-table.tsv") == 0) "abundance_table/unfiltered/feature-table.tsv" - else if (filename.indexOf("feature-table.biom") == 0) "abundance_table/unfiltered/feature-table.biom" - else if (filename.indexOf("sequences.fasta") == 0) "representative_sequences/unfiltered/sequences.fasta" - else if (filename.indexOf("rep-seqs.qza") == 0) "representative_sequences/unfiltered/rep-seqs.qza" - else null} - - input: - file demux from ch_dada_import - val trunc from ch_dada_trunc - - output: - file("table.qza") into ch_qiime_table_raw - file("rep-seqs.qza") into (ch_qiime_repseq_raw_for_classifier,ch_qiime_repseq_raw_for_filter) - file("feature-table.tsv") into ch_tsv_table_raw - file("dada_stats.tsv") - file("feature-table.biom") - file("rel-feature-table.tsv") - file("sequences.fasta") - file("dada_report.txt") - - when: - !params.untilQ2import - - script: - """ - # Quality filtering with DADA2 filterAndTrim - dada2_filter_pacbio.r --infile ${demux} --filterDir dada2_filtered --maxEE ${params.maxEE} --truncLen ${trunc} --minLen ${params.minLen} --maxLen ${params.maxLen} --stats filter_stats.tsv --verbose - - # Estimation of error models with DADA2 learnErrors - dada2_errmodels_pacbio.r --filterDir dada2_filtered > err.out - - # Denoise samples with DADA2 - dada2_denoise_pacbio.r --filterDir dada2_filtered --errModel err.rds --pool TRUE --verbose > dd.out - - # Chimera removal with DADA2, and produce - # * raw count table "feature-table.tsv" - # * relative abundancies "rel-feature-table.tsv" - # * DADA2 stats to file "denoise_stats.tsv" - # * representative sequences "sequences.fasta" - dada2_chimrem.r --manifest ${demux} --dadaObj dd.rds --method "pooled" --allowOneOff TRUE --table feature-table.tsv --reltable rel-feature-table.tsv --repseqs sequences.fasta --stats 
denoise_stats.tsv - - # Create qiime2 object from representative sequences - qiime tools import --type \'FeatureData[Sequence]\' \ - --input-path sequences.fasta \ - --output-path rep-seqs.qza - - # Create qiime2 object for feature table - biom convert -i feature-table.tsv -o feature-table.biom --to-hdf5 -# make_biom_from_tsv feature-table.tsv feature-table.biom - qiime tools import \ - --input-path feature-table.biom \ - --type 'FeatureTable[Frequency]' \ - --input-format BIOMV210Format \ - --output-path table.qza - - - # Produce dada2 report "dada_report.txt" from err.out and dd.out - make_dada2_report_pacbio.py err.out dd.out pooled - - # Produce dada2 stats "dada_stats.tsv" from filter_stats.tsv and denoise_stats.tsv - make_dada2_stats_pacbio.py filter_stats.tsv denoise_stats.tsv - - """ - } -} else { - process dada_multi { - tag "${demux.baseName} ${trunclenf} ${trunclenr}" - - input: - set file(demux), val(trunclenf), val(trunclenr), env(MATPLOTLIBRC) from ch_dada_multi - - output: - file("${demux.baseName}-table.qza") into ch_qiime_table - file("${demux.baseName}-rep-seqs.qza") into ch_qiime_repseq - file("${demux.baseName}-stats.tsv") into ch_dada_stats - file("${demux.baseName}-report.txt") into ch_dada_reports - - when: - !params.untilQ2import - - script: - if (trunclenf.toInteger() + trunclenr.toInteger() <= 10) { - log.info "\n######## ERROR: Total read pair length is below 10, this is definitely too low.\nForward ${trunclenf} and reverse ${trunclenr} are chosen.\nPlease provide appropriate values for --trunclenf and --trunclenr or lower --trunc_qmin\n" } - """ - export HOME="\${PWD}/HOME" - - #denoise samples with DADA2 and produce - qiime dada2 denoise-paired \ - --i-demultiplexed-seqs ${demux} \ - --p-trunc-len-f ${trunclenf} \ - --p-trunc-len-r ${trunclenr} \ - --p-max-ee-f ${params.maxEE} \ - --p-max-ee-r ${params.maxEE} \ - --p-n-threads 0 \ - --o-table ${demux.baseName}-table.qza \ - --o-representative-sequences ${demux.baseName}-rep-seqs.qza \ - --o-denoising-stats ${demux.baseName}-stats.qza \ - --verbose \ - >${demux.baseName}-report.txt - - #produce dada2 stats "${demux.baseName}-dada_stats/stats.tsv" - qiime tools export --input-path ${demux.baseName}-stats.qza \ - --output-path ${demux.baseName}-dada_stats - cp ${demux.baseName}-dada_stats/stats.tsv ${demux.baseName}-stats.tsv - """ - } - - process dada_merge { - tag "${tables}" - publishDir "${params.outdir}", mode: params.publish_dir_mode, - saveAs: {filename -> - if (filename.indexOf("stats.tsv") == 0) "abundance_table/unfiltered/dada_stats.tsv" - else if (filename.indexOf("dada_report.txt") == 0) "abundance_table/unfiltered/dada_report.txt" - else if (filename.indexOf("table.qza") == 0) "abundance_table/unfiltered/$filename" - else if (filename.indexOf("rel-table/feature-table.biom") == 0) "abundance_table/unfiltered/rel-feature-table.biom" - else if (filename.indexOf("table/feature-table.biom") == 0) "abundance_table/unfiltered/feature-table.biom" - else if (filename.indexOf("rel-feature-table.tsv") > 0) "abundance_table/unfiltered/rel-feature-table.tsv" - else if (filename.indexOf("feature-table.tsv") > 0) "abundance_table/unfiltered/feature-table.tsv" - else if (filename.indexOf("rep-seqs.qza") == 0) "representative_sequences/unfiltered/rep-seqs.qza" - else if (filename.indexOf("unfiltered/*")) "representative_sequences/$filename" - else null} - - input: - file tables from ch_qiime_table.collect() - file repseqs from ch_qiime_repseq.collect() - file stats from ch_dada_stats.collect() - file reports from 
ch_dada_reports.collect() - env MATPLOTLIBRC from ch_mpl_dada_merge - - output: - file("table.qza") into ch_qiime_table_raw - file("rep-seqs.qza") into (ch_qiime_repseq_raw_for_classifier,ch_qiime_repseq_raw_for_filter) - file("table/feature-table.tsv") into ch_tsv_table_raw - file("stats.tsv") - file("table/feature-table.biom") - file("rel-table/feature-table.biom") - file("table/rel-feature-table.tsv") - file("unfiltered/*") - file("dada_report.txt") - - when: - !params.untilQ2import - - script: - def TABLES = '' - def REPSEQ = '' - def STAT = '' - def REPORT = '' - for (table in tables) { TABLES+= " --i-tables ${table}" } - for (repseq in repseqs) { REPSEQ+= " --i-data ${repseq}" } - for (stat in stats) { STAT+= " $stat" } - for (report in reports) { REPORT+= " $report" } - """ - export HOME="\${PWD}/HOME" - - #concatenate tables - #merge files - qiime feature-table merge \ - ${TABLES} \ - --o-merged-table table.qza \ - --quiet - qiime feature-table merge-seqs \ - ${REPSEQ} \ - --o-merged-data rep-seqs.qza \ - --quiet - cat ${STAT} >stats.tsv - cat ${REPORT} >dada_report.txt - - #produce raw count table in biom format "table/feature-table.biom" - qiime tools export --input-path table.qza \ - --output-path table - - #produce raw count table - biom convert -i table/feature-table.biom \ - -o table/feature-table.tsv \ - --to-tsv - - #produce representative sequence fasta file - qiime feature-table tabulate-seqs \ - --i-data rep-seqs.qza \ - --o-visualization rep-seqs.qzv - qiime tools export --input-path rep-seqs.qzv \ - --output-path unfiltered - - #convert to relative abundances - qiime feature-table relative-frequency \ - --i-table table.qza \ - --o-relative-frequency-table relative-table-ASV.qza - - #export to biom - qiime tools export --input-path relative-table-ASV.qza \ - --output-path rel-table - - #convert to tab separated text file - biom convert \ - -i rel-table/feature-table.biom \ - -o table/rel-feature-table.tsv --to-tsv - """ - } -} - -/* - * Assign taxonomy to ASV sequences - * Requirements: many cores, ~35 Gb mem, walltime scales with no. of ASV and ${params.classifier} = trained_classifier size (~15 min to several hours) - * USE NXF feature of file size introduced in 0.32.0 here!!! 
- */ - -process classifier { - publishDir "${params.outdir}", mode: params.publish_dir_mode, - saveAs: {filename -> - if (filename == "taxonomy/taxonomy.tsv") filename - else if (filename == "taxonomy.qza") "taxonomy/$filename"} - - input: - file repseq from ch_qiime_repseq_raw_for_classifier - file trained_classifier from ch_qiime_classifier - env MATPLOTLIBRC from ch_mpl_classifier - - output: - file("taxonomy.qza") into (ch_qiime_taxonomy_for_filter,ch_qiime_taxonomy_for_relative_abundance_reduced_taxa,ch_qiime_taxonomy_for_barplot,ch_qiime_taxonomy_for_ancom,ch_qiime_taxonomy_for_export_filtered_dada_output) - file("taxonomy/taxonomy.tsv") into ch_tsv_taxonomy - - - """ - export HOME="\${PWD}/HOME" - - qiime feature-classifier classify-sklearn \ - --i-classifier ${trained_classifier} \ - --p-n-jobs ${task.cpus} \ - --i-reads ${repseq} \ - --o-classification taxonomy.qza \ - --verbose - - qiime metadata tabulate \ - --m-input-file taxonomy.qza \ - --o-visualization taxonomy.qzv \ - --verbose - - #produce "taxonomy/taxonomy.tsv" - qiime tools export --input-path taxonomy.qza \ - --output-path taxonomy - - qiime tools export --input-path taxonomy.qzv \ - --output-path taxonomy - """ -} - -/* - * Filter out unwanted/off-target taxa - */ -if (params.exclude_taxa == "none" && !params.min_frequency && !params.min_samples) { - - ch_qiime_repseq_raw_for_filter - .into{ ch_qiime_repseq_for_dada_output; ch_qiime_repseq_for_tree } - - ch_qiime_table_raw - .into{ ch_qiime_table_for_filtered_dada_output; ch_qiime_table_for_relative_abundance_asv; ch_qiime_table_for_relative_abundance_reduced_taxa; ch_qiime_table_for_ancom; ch_qiime_table_for_barplot; ch_qiime_table_for_alpha_rarefaction; ch_qiime_table_for_diversity_core } - -} else { - process filter_taxa { - tag "taxa:${params.exclude_taxa};min-freq:${params.min_frequency};min-samples:${params.min_samples}" - - publishDir "${params.outdir}", mode: params.publish_dir_mode, - saveAs: {filename -> - if (filename.indexOf("filtered-table.qza") == 0) "abundance_table/filtered/table.qza" - else if (filename.indexOf("filtered-sequences.qza") == 0) "representative_sequences/filtered/rep-seqs.qza" - else null} - - input: - file table from ch_qiime_table_raw - file repseq from ch_qiime_repseq_raw_for_filter - file taxonomy from ch_qiime_taxonomy_for_filter - env MATPLOTLIBRC from ch_mpl_filter_taxa - - output: - file("filtered-table.qza") into (ch_qiime_table_for_filtered_dada_output, ch_qiime_table_for_relative_abundance_asv,ch_qiime_table_for_relative_abundance_reduced_taxa,ch_qiime_table_for_ancom,ch_qiime_table_for_barplot,ch_qiime_table_for_alpha_rarefaction, ch_qiime_table_for_diversity_core) - file("filtered-sequences.qza") into (ch_qiime_repseq_for_dada_output,ch_qiime_repseq_for_tree) - - script: - if ( "${params.min_frequency}" == "false" ) { minfrequency = 1 } else { minfrequency = "${params.min_frequency}" } - if ( "${params.min_samples}" == "false" ) { minsamples = 1 } else { minsamples = "${params.min_samples}" } - //if ( "${params.exclude_taxa}" == "none" ) { exclude = "" } else { exclude = "--p-exclude ${params.exclude_taxa} --p-mode contains " } - """ - export HOME="\${PWD}/HOME" - - if ! 
[ \"${params.exclude_taxa}\" = \"none\" ]; then - #filter sequences - qiime taxa filter-seqs \ - --i-sequences ${repseq} \ - --i-taxonomy ${taxonomy} \ - --p-exclude ${params.exclude_taxa} --p-mode contains \ - --o-filtered-sequences tax_filtered-sequences.qza - - #filter abundance table - qiime taxa filter-table \ - --i-table ${table} \ - --i-taxonomy ${taxonomy} \ - --p-exclude ${params.exclude_taxa} --p-mode contains \ - --o-filtered-table tax_filtered-table.qza - - filtered_table="tax_filtered-table.qza" - filtered_sequences="tax_filtered-sequences.qza" - else - filtered_table=${table} - filtered_sequences=${repseq} - fi - - qiime feature-table filter-features \ - --i-table \$filtered_table \ - --p-min-frequency ${minfrequency} \ - --p-min-samples ${minsamples} \ - --o-filtered-table filtered-table.qza - - qiime feature-table filter-seqs \ - --i-data \$filtered_sequences \ - --i-table filtered-table.qza \ - --o-filtered-data filtered-sequences.qza - """ - } -} - -/* - * Export qiime artefacts from filtered dada output - */ -process export_filtered_dada_output { - publishDir "${params.outdir}", mode: params.publish_dir_mode, - saveAs: {filename -> - if (filename.indexOf("table/feature-table.biom") == 0) "abundance_table/filtered/feature-table.biom" - else if (filename.indexOf("table/feature-table.tsv") == 0) "abundance_table/filtered/feature-table.tsv" - else if (filename.indexOf("abs-abund-table-") == 0) "abundance_table/filtered/$filename" - else if (filename.indexOf("filtered/*")) "representative_sequences/$filename" - else null} - - input: - file table from ch_qiime_table_for_filtered_dada_output - file repseq from ch_qiime_repseq_for_dada_output - file taxonomy from ch_qiime_taxonomy_for_export_filtered_dada_output - env MATPLOTLIBRC from ch_mpl_for_export_dada_output - - output: - file("filtered/sequences.fasta") into ch_fasta_repseq - file("table/feature-table.tsv") into (ch_tsv_table_for_alpha_rarefaction,ch_tsv_table_for_report_filter_stats,ch_tsv_table_for_diversity_core) - file("table/feature-table.biom") - file("filtered/*") - file("abs-abund-table-*.tsv") - - """ - export HOME="\${PWD}/HOME" - - #produce raw count table in biom format "table/feature-table.biom" - qiime tools export --input-path ${table} \ - --output-path table - - #produce raw count table "table/feature-table.tsv" - biom convert -i table/feature-table.biom \ - -o table/feature-table.tsv \ - --to-tsv - - #produce representative sequence fasta file "${params.outdir}/representative_sequences/sequences.fasta" - qiime feature-table tabulate-seqs \ - --i-data ${repseq} \ - --o-visualization rep-seqs.qzv - qiime tools export --input-path rep-seqs.qzv \ - --output-path filtered - - ##on several taxa level - array=( 2 3 4 5 6 7 ) - for i in \${array[@]} - do - #collapse taxa - qiime taxa collapse \ - --i-table ${table} \ - --i-taxonomy ${taxonomy} \ - --p-level \$i \ - --o-collapsed-table table-\$i.qza - #export to biom - qiime tools export --input-path table-\$i.qza \ - --output-path table-\$i - #convert to tab separated text file - biom convert \ - -i table-\$i/feature-table.biom \ - -o abs-abund-table-\$i.tsv --to-tsv - done - """ -} - -/* - * Report stats after taxa filtering - */ -process report_filter_stats { - publishDir "${params.outdir}/abundance_table/filtered", mode: params.publish_dir_mode - - input: - file 'unfiltered_table' from ch_tsv_table_raw - file 'filtered_table' from ch_tsv_table_for_report_filter_stats - - output: - file("count_table_filter_stats.tsv") - - """ - count_table_filter_stats.py 
unfiltered_table filtered_table - """ -} - -/* - * Export relative abundance tables on ASV level - */ -process RelativeAbundanceASV { - publishDir "${params.outdir}/rel_abundance_tables", mode: params.publish_dir_mode - - input: - file table from ch_qiime_table_for_relative_abundance_asv - env MATPLOTLIBRC from ch_mpl_for_relasv - - output: - file("rel-table-ASV.tsv") into ch_tsv_relASV_table - - when: - !params.skip_abundance_tables - - """ - export HOME="\${PWD}/HOME" - - #convert to relative abundances - qiime feature-table relative-frequency \ - --i-table ${table} \ - --o-relative-frequency-table relative-table-ASV.qza - - #export to biom - qiime tools export --input-path relative-table-ASV.qza --output-path relative-table-ASV - - #convert to tab separated text file "${params.outdir}/rel-table-ASV.tsv" - biom convert -i relative-table-ASV/feature-table.biom \ - -o rel-table-ASV.tsv --to-tsv - """ -} - - -/* - * Export relative abundance tables based on taxonomic levels - */ -process RelativeAbundanceReducedTaxa { - publishDir "${params.outdir}/rel_abundance_tables", mode: params.publish_dir_mode - - input: - file table from ch_qiime_table_for_relative_abundance_reduced_taxa - file taxonomy from ch_qiime_taxonomy_for_relative_abundance_reduced_taxa - env MATPLOTLIBRC from ch_mpl_for_relreducetaxa - - output: - file("*.tsv") - - when: - !params.skip_abundance_tables && !params.skip_taxonomy - - """ - export HOME="\${PWD}/HOME" - ##on several taxa level - - array=( 2 3 4 5 6 7 ) - for i in \${array[@]} - do - #collapse taxa - qiime taxa collapse \ - --i-table ${table} \ - --i-taxonomy ${taxonomy} \ - --p-level \$i \ - --o-collapsed-table table-\$i.qza - #convert to relative abundances - qiime feature-table relative-frequency \ - --i-table table-\$i.qza \ - --o-relative-frequency-table relative-table-\$i.qza - #export to biom - qiime tools export --input-path relative-table-\$i.qza \ - --output-path relative-table-\$i - #convert to tab separated text file - biom convert \ - -i relative-table-\$i/feature-table.biom \ - -o rel-table-\$i.tsv --to-tsv - done - - """ -} - - -/* - * Produce a bar plot - */ -process barplot { - publishDir "${params.outdir}", mode: params.publish_dir_mode - - input: - file metadata from ch_metadata_for_barplot - file table from ch_qiime_table_for_barplot - file taxonomy from ch_qiime_taxonomy_for_barplot - env MATPLOTLIBRC from ch_mpl_for_barcode - - output: - file("barplot/*") - - when: - !params.skip_barplot && !params.skip_taxonomy - - """ - export HOME="\${PWD}/HOME" - - qiime taxa barplot \ - --i-table ${table} \ - --i-taxonomy ${taxonomy} \ - --m-metadata-file ${metadata} \ - --o-visualization taxa-bar-plots.qzv \ - --verbose - - qiime tools export --input-path taxa-bar-plots.qzv \ - --output-path barplot - """ -} - -/* - * Produce a rooted tree - */ -process tree { - publishDir "${params.outdir}", mode: params.publish_dir_mode, - saveAs: {filename -> - if (filename.indexOf("rooted-tree.qza") == 0) "phylogenetic_tree/$filename" - else filename } - - input: - file repseq from ch_qiime_repseq_for_tree - env MATPLOTLIBRC from ch_mpl_for_tree - - output: - file("rooted-tree.qza") into (ch_qiime_tree_for_diversity_core, ch_qiime_tree_for_alpha_rarefaction) - file("phylogenetic_tree/tree.nwk") - - when: - !params.skip_diversity_indices || !params.skip_alpha_rarefaction - - - """ - export HOME="\${PWD}/HOME" - - qiime alignment mafft \ - --i-sequences ${repseq} \ - --o-alignment aligned-rep-seqs.qza \ - --p-n-threads ${task.cpus} - - qiime alignment mask \ - 
--i-alignment aligned-rep-seqs.qza \ - --o-masked-alignment masked-aligned-rep-seqs.qza - - qiime phylogeny fasttree \ - --i-alignment masked-aligned-rep-seqs.qza \ - --p-n-threads ${task.cpus} \ - --o-tree unrooted-tree.qza - - qiime phylogeny midpoint-root \ - --i-tree unrooted-tree.qza \ - --o-rooted-tree rooted-tree.qza - - qiime tools export --input-path rooted-tree.qza \ - --output-path phylogenetic_tree - """ -} - - -/* - * Alpha-rarefaction - */ -process alpha_rarefaction { - publishDir "${params.outdir}", mode: params.publish_dir_mode - - input: - file metadata from ch_metadata_for_alphararefaction - file table from ch_qiime_table_for_alpha_rarefaction - file tree from ch_qiime_tree_for_alpha_rarefaction - file stats from ch_tsv_table_for_alpha_rarefaction - env MATPLOTLIBRC from ch_mpl_for_alpha_rare - - output: - file("alpha-rarefaction/*") - - when: - !params.skip_alpha_rarefaction - - """ - export HOME="\${PWD}/HOME" - - maxdepth=\$(count_table_minmax_reads.py $stats maximum 2>&1) - - #check values - if [ \"\$maxdepth\" -gt \"75000\" ]; then maxdepth=\"75000\"; fi - if [ \"\$maxdepth\" -gt \"5000\" ]; then maxsteps=\"250\"; else maxsteps=\$((maxdepth/20)); fi - - qiime diversity alpha-rarefaction \ - --i-table ${table} \ - --i-phylogeny ${tree} \ - --p-max-depth \$maxdepth \ - --m-metadata-file ${metadata} \ - --p-steps \$maxsteps \ - --p-iterations 10 \ - --o-visualization alpha-rarefaction.qzv - - qiime tools export --input-path alpha-rarefaction.qzv \ - --output-path alpha-rarefaction - """ -} - -/* - * Combine abundances, sequences and taxonomic classification into one table with R - */ -process combinetable { - publishDir "${params.outdir}/rel_abundance_tables", mode: params.publish_dir_mode - - input: - file TABLE from ch_tsv_relASV_table - file SEQ from ch_fasta_repseq - file TAXONOMY from ch_tsv_taxonomy - - output: - file("qiime2_ASV_table.tsv") - - when: - !params.skip_abundance_tables && !params.skip_taxonomy - - """ - combineTable.r ${TABLE} ${SEQ} ${TAXONOMY} - """ -} - -/* - * Compute diversity matrices - */ -process diversity_core { - publishDir "${params.outdir}", mode: params.publish_dir_mode, - saveAs: {filename -> - params.keepIntermediates ? filename : null} - - input: - file metadata from ch_metadata_for_diversity_core - file table from ch_qiime_table_for_diversity_core - file tree from ch_qiime_tree_for_diversity_core - file stats from ch_tsv_table_for_diversity_core - env MATPLOTLIBRC from ch_mpl_for_diversity_core - - output: - file("diversity_core/*_pcoa_results.qza") into (ch_qiime_diversity_core_for_beta_diversity_ordination) mode flatten - file("diversity_core/*_vector.qza") into ch_qiime_diversity_core_for_alpha_diversity mode flatten - file("diversity_core/*_distance_matrix.qza") into ch_qiime_diversity_core_for_beta_diversity mode flatten - stdout rarefaction_depth - - when: - !params.skip_diversity_indices - - """ - export HOME="\${PWD}/HOME" - mindepth=\$(count_table_minmax_reads.py $stats minimum 2>&1) - - if [ \"\$mindepth\" -gt \"10000\" ]; then echo \"\nUse the sampling depth of \$mindepth for rarefaction\" ; fi - if [ \"\$mindepth\" -lt \"10000\" -a \"\$mindepth\" -gt \"5000\" ]; then echo \"\n######## WARNING! The sampling depth of \$mindepth is quite small for rarefaction!\" ; fi - if [ \"\$mindepth\" -lt \"5000\" -a \"\$mindepth\" -gt \"1000\" ]; then echo \"\n######## WARNING! The sampling depth of \$mindepth is very small for rarefaction!\" ; fi - if [ \"\$mindepth\" -lt \"1000\" ]; then echo \"\n######## ERROR! 
The sampling depth of \$mindepth seems too small for rarefaction!\" ; fi - - qiime diversity core-metrics-phylogenetic \ - --m-metadata-file ${metadata} \ - --i-phylogeny ${tree} \ - --i-table ${table} \ - --p-sampling-depth \$mindepth \ - --output-dir diversity_core \ - --p-n-jobs ${task.cpus} \ - --quiet - """ -} - -rarefaction_depth - .subscribe { log.info it } - -/* - * Capture all possible metadata categories for statistics - */ -process metadata_category_all { - input: - file metadata from ch_metadata_for_metadata_category_all - env MATPLOTLIBRC from ch_mpl_for_metadata_cat - - output: - stdout into (ch_meta_category_all_for_alphadiversity, ch_meta_category_all_for_ancom) - - when: - (!params.skip_ancom || !params.skip_diversity_indices) && - (!params.untilQ2import && !params.onlyDenoising) - - script: - if( !params.metadata_category ) - """ - metadataCategory.r ${metadata} - """ - else - """ - printf ${params.metadata_category} - """ -} - -/* - * Capture all pairwise metadata categories for statistics - */ -process metadata_category_pairwise { - - input: - file metadata from ch_metadata_for_metadata_category_pairwise - env MATPLOTLIBRC from ch_mpl_for_metadata_pair - - output: - stdout ch_meta_category_pairwise - - when: - !params.skip_diversity_indices - - """ - metadataCategoryPairwise.r ${metadata} - """ -} - -/* - * Combine channels for diversity analysis - */ - -ch_metadata_for_alpha_diversity - .combine( ch_qiime_diversity_core_for_alpha_diversity ) - .combine( ch_mpl_for_alpha_diversity ) - .combine( ch_meta_category_all_for_alphadiversity ) - .set{ ch_for_alpha_diversity } -ch_metadata_for_beta_diversity - .combine( ch_qiime_diversity_core_for_beta_diversity ) - .combine( ch_meta_category_pairwise ) - .combine( ch_mpl_for_beta_diversity ) - .set{ ch_for_beta_diversity } -ch_metadata_for_beta_diversity_ordination - .combine( ch_qiime_diversity_core_for_beta_diversity_ordination ) - .combine( ch_mpl_for_beta_diversity_ord ) - .set{ ch_for_beta_diversity_ordination } - - - -/* - * Compute alpha diversity indices - */ -process alpha_diversity { - tag "${core.baseName}" - publishDir "${params.outdir}", mode: params.publish_dir_mode - - input: - set file(metadata), file(core), env(MATPLOTLIBRC), val(meta) from ch_for_alpha_diversity - - output: - file("alpha-diversity/*") - - when: - meta.length() > 0 - - """ - export HOME="\${PWD}/HOME" - - qiime diversity alpha-group-significance \ - --i-alpha-diversity ${core} \ - --m-metadata-file ${metadata} \ - --o-visualization ${core.baseName}-vis.qzv - qiime tools export --input-path ${core.baseName}-vis.qzv \ - --output-path "alpha-diversity/${core.baseName}" - """ -} - - -/* - * Compute beta diversity indices - */ -process beta_diversity { - tag "${core.baseName}" - publishDir "${params.outdir}", mode: params.publish_dir_mode - - input: - set file(meta), file(core), val(category), env(MATPLOTLIBRC) from ch_for_beta_diversity - - output: - file("beta-diversity/*") - - when: - category.length() > 0 - - """ - export HOME="\${PWD}/HOME" - IFS=',' read -r -a metacategory <<< \"$category\" - - for j in \"\${metacategory[@]}\" - do - qiime diversity beta-group-significance \ - --i-distance-matrix $core \ - --m-metadata-file ${meta} \ - --m-metadata-column \"\$j\" \ - --o-visualization ${core.baseName}-\$j.qzv \ - --p-pairwise - qiime tools export --input-path ${core.baseName}-\$j.qzv \ - --output-path beta-diversity/${core.baseName}-\$j - done - """ -} - -/* - * Compute beta diversity ordination - */ -process beta_diversity_ordination { - tag 
"${core.baseName}" - publishDir "${params.outdir}", mode: params.publish_dir_mode - - input: - set file(metadata), file(core), env(MATPLOTLIBRC) from ch_for_beta_diversity_ordination - - output: - file("beta-diversity/*") - - """ - export HOME="\${PWD}/HOME" - - qiime emperor plot \ - --i-pcoa ${core} \ - --m-metadata-file ${metadata} \ - --o-visualization ${core.baseName}-vis.qzv - qiime tools export --input-path ${core.baseName}-vis.qzv \ - --output-path beta-diversity/${core.baseName}-PCoA - """ -} - - -/* - * Differential abundance analysis with ANCOM - */ -process prepare_ancom { - tag "${meta}" - - publishDir "${params.outdir}/ancom", mode: params.publish_dir_mode, - saveAs: {filename -> - params.keepIntermediates ? filename : null} - - input: - file metadata from ch_metadata_for_ancom - file table from ch_qiime_table_for_ancom - val meta from ch_meta_category_all_for_ancom - env MATPLOTLIBRC from ch_mpl_for_ancom - - output: - file("*.qza") into (ch_meta_tables_tax, ch_meta_tables_asv) mode flatten - - when: - !params.skip_ancom && (meta.length() > 0) - - """ - export HOME="\${PWD}/HOME" - IFS=',' read -r -a metacategory <<< \"$meta\" - - #remove samples that do not have any value - for j in \"\${metacategory[@]}\" - do - qiime feature-table filter-samples \ - --i-table ${table} \ - --m-metadata-file ${metadata} \ - --p-where \"\$j<>\'\'\" \ - --o-filtered-table \$j.qza - done - """ -} - -/* - * Combine channels for ancom - */ - -ch_taxlevel_tax = Channel.from( 2, 3, 4, 5, 6 ) - -ch_meta_tables_tax - .combine( ch_taxlevel_tax ) - .combine( ch_qiime_taxonomy_for_ancom ) - .combine( ch_metadata_for_ancom_tax ) - .combine( ch_mpl_for_ancom_tax ) - .set{ ch_for_ancom_tax } - -ch_meta_tables_asv - .combine( ch_metadata_for_ancom_asv ) - .combine ( ch_mpl_for_ancom_asv ) - .set{ ch_for_ancom_asv } - - -/* - * Differential abundance analysis with ANCOM on various taxonomic levels - */ -process ancom_tax { - tag "${table.baseName}-level${taxlevel}" - - publishDir "${params.outdir}", mode: params.publish_dir_mode - - input: - set file(table), val(taxlevel), file(taxonomy), file(metadata), env(MATPLOTLIBRC) from ch_for_ancom_tax - - output: - file("ancom/*") - - when: - !params.skip_ancom - - """ - export HOME="\${PWD}/HOME" - - qiime taxa collapse \ - --i-table ${table} \ - --i-taxonomy ${taxonomy} \ - --p-level ${taxlevel} \ - --o-collapsed-table lvl${taxlevel}-${table} - qiime composition add-pseudocount \ - --i-table lvl${taxlevel}-${table} \ - --o-composition-table comp-lvl${taxlevel}-${table} - qiime composition ancom \ - --i-table comp-lvl${taxlevel}-${table} \ - --m-metadata-file ${metadata} \ - --m-metadata-column ${table.baseName} \ - --o-visualization comp-lvl${taxlevel}-${table.baseName}.qzv - qiime tools export --input-path comp-lvl${taxlevel}-${table.baseName}.qzv \ - --output-path ancom/Category-${table.baseName}-level-${taxlevel} - """ -} - -/* - * Differential abundance analysis with ANCOM on ASV level - */ -process ancom_asv { - tag "${table.baseName}" - - publishDir "${params.outdir}", mode: params.publish_dir_mode - - input: - set file(table), file(metadata), env(MATPLOTLIBRC) from ch_for_ancom_asv - - output: - file("ancom/*") - - """ - export HOME="\${PWD}/HOME" - - qiime composition add-pseudocount \ - --i-table ${table} \ - --o-composition-table comp-${table} - qiime composition ancom \ - --i-table comp-${table} \ - --m-metadata-file ${metadata} \ - --m-metadata-column ${table.baseName} \ - --o-visualization comp-${table.baseName}.qzv - qiime tools export --input-path 
comp-${table.baseName}.qzv \ - --output-path ancom/Category-${table.baseName}-ASV - """ -} - -/* - * STEP 3 - Output Description HTML - */ -process output_documentation { - publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode - - input: - file output_docs from ch_output_docs - file images from ch_output_docs_images - - output: - file "results_description.html" - - script: - """ - markdown_to_html.py $output_docs -o results_description.html - """ -} - -/* - * Completion e-mail notification - */ -workflow.onComplete { - - // Set up the e-mail variables - def subject = "[nf-core/ampliseq] Successful: $workflow.runName" - if (!workflow.success) { - subject = "[nf-core/ampliseq] FAILED: $workflow.runName" - } - def email_fields = [:] - email_fields['version'] = workflow.manifest.version - email_fields['runName'] = custom_runName ?: workflow.runName - email_fields['success'] = workflow.success - email_fields['dateComplete'] = workflow.complete - email_fields['duration'] = workflow.duration - email_fields['exitStatus'] = workflow.exitStatus - email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') - email_fields['errorReport'] = (workflow.errorReport ?: 'None') - email_fields['commandLine'] = workflow.commandLine - email_fields['projectDir'] = workflow.projectDir - email_fields['summary'] = summary - email_fields['summary']['Date Started'] = workflow.start - email_fields['summary']['Date Completed'] = workflow.complete - email_fields['summary']['Pipeline script file path'] = workflow.scriptFile - email_fields['summary']['Pipeline script hash ID'] = workflow.scriptId - if (workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository - if (workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId - if (workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision - email_fields['summary']['Nextflow Version'] = workflow.nextflow.version - email_fields['summary']['Nextflow Build'] = workflow.nextflow.build - email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp - - // On success try attach the multiqc report - def mqc_report = null - try { - if (workflow.success) { - mqc_report = ch_multiqc_report.getVal() - if (mqc_report.getClass() == ArrayList) { - log.warn "[nf-core/ampliseq] Found multiple reports from process 'multiqc', will use only one" - mqc_report = mqc_report[0] - } - } - } catch (all) { - log.warn "[nf-core/ampliseq] Could not attach MultiQC report to summary email" - } - - // Check if we are only sending emails on failure - email_address = params.email - if (!params.email && params.email_on_fail && !workflow.success) { - email_address = params.email_on_fail - } - - // Render the TXT template - def engine = new groovy.text.GStringTemplateEngine() - def tf = new File("$projectDir/assets/email_template.txt") - def txt_template = engine.createTemplate(tf).make(email_fields) - def email_txt = txt_template.toString() - - // Render the HTML template - def hf = new File("$projectDir/assets/email_template.html") - def html_template = engine.createTemplate(hf).make(email_fields) - def email_html = html_template.toString() - - // Render the sendmail template - def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ] - def sf = new File("$projectDir/assets/sendmail_template.txt") - def 
sendmail_template = engine.createTemplate(sf).make(smail_fields) - def sendmail_html = sendmail_template.toString() - - // Send the HTML e-mail - if (email_address) { - try { - if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } - // Try to send HTML e-mail using sendmail - [ 'sendmail', '-t' ].execute() << sendmail_html - log.info "[nf-core/ampliseq] Sent summary e-mail to $email_address (sendmail)" - } catch (all) { - // Catch failures and try with plaintext - def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] - if ( mqc_report.size() <= params.max_multiqc_email_size.toBytes() ) { - mail_cmd += [ '-A', mqc_report ] - } - mail_cmd.execute() << email_html - log.info "[nf-core/ampliseq] Sent summary e-mail to $email_address (mail)" - } - } - - // Write summary e-mail HTML to a file - def output_d = new File("${params.outdir}/pipeline_info/") - if (!output_d.exists()) { - output_d.mkdirs() - } - def output_hf = new File(output_d, "pipeline_report.html") - output_hf.withWriter { w -> w << email_html } - def output_tf = new File(output_d, "pipeline_report.txt") - output_tf.withWriter { w -> w << email_txt } - - c_green = params.monochrome_logs ? '' : "\033[0;32m"; - c_purple = params.monochrome_logs ? '' : "\033[0;35m"; - c_red = params.monochrome_logs ? '' : "\033[0;31m"; - c_reset = params.monochrome_logs ? '' : "\033[0m"; - - if (workflow.stats.ignoredCount > 0 && workflow.success) { - log.info "-${c_purple}Warning, pipeline completed, but with errored process(es) ${c_reset}-" - log.info "-${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${c_reset}-" - log.info "-${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${c_reset}-" - } - - if (workflow.success) { - log.info "-${c_purple}[nf-core/ampliseq]${c_green} Pipeline completed successfully${c_reset}-" - } else { - checkHostname() - log.info "-${c_purple}[nf-core/ampliseq]${c_red} Pipeline completed with errors${c_reset}-" - } - -} - - -def nfcoreHeader() { - // Log colors ANSI codes - c_black = params.monochrome_logs ? '' : "\033[0;30m"; - c_blue = params.monochrome_logs ? '' : "\033[0;34m"; - c_cyan = params.monochrome_logs ? '' : "\033[0;36m"; - c_dim = params.monochrome_logs ? '' : "\033[2m"; - c_green = params.monochrome_logs ? '' : "\033[0;32m"; - c_purple = params.monochrome_logs ? '' : "\033[0;35m"; - c_reset = params.monochrome_logs ? '' : "\033[0m"; - c_white = params.monochrome_logs ? '' : "\033[0;37m"; - c_yellow = params.monochrome_logs ? '' : "\033[0;33m"; - - return """ -${c_dim}--------------------------------------------------${c_reset}- - ${c_green},--.${c_black}/${c_green},-.${c_reset} - ${c_blue} ___ __ __ __ ___ ${c_green}/,-._.--~\'${c_reset} - ${c_blue} |\\ | |__ __ / ` / \\ |__) |__ ${c_yellow}} {${c_reset} - ${c_blue} | \\| | \\__, \\__/ | \\ |___ ${c_green}\\`-._,-`-,${c_reset} - ${c_green}`._,._,\'${c_reset} - ${c_purple} nf-core/ampliseq v${workflow.manifest.version}${c_reset} - -${c_dim}--------------------------------------------------${c_reset}- - """.stripIndent() -} - -def checkHostname() { - def c_reset = params.monochrome_logs ? '' : "\033[0m" - def c_white = params.monochrome_logs ? '' : "\033[0;37m" - def c_red = params.monochrome_logs ? '' : "\033[1;91m" - def c_yellow_bold = params.monochrome_logs ? 
'' : "\033[1;93m" - if (params.hostnames) { - def hostname = "hostname".execute().text.trim() - params.hostnames.each { prof, hnames -> - hnames.each { hname -> - if (hostname.contains(hname) && !workflow.profile.contains(prof)) { - log.error "====================================================\n" + - " ${c_red}WARNING!${c_reset} You are running with `-profile $workflow.profile`\n" + - " but your machine hostname is ${c_white}'$hostname'${c_reset}\n" + - " ${c_yellow_bold}It's highly recommended that you use `-profile $prof${c_reset}`\n" + - "============================================================" - } - } - } - } -} +//////////////////////////////////////////////////// +/* -- THE END -- */ +//////////////////////////////////////////////////// diff --git a/modules/local/combine_table.nf b/modules/local/combine_table.nf new file mode 100644 index 00000000..1b403203 --- /dev/null +++ b/modules/local/combine_table.nf @@ -0,0 +1,32 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process COMBINE_TABLE { + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? "bioconductor::biostrings=2.58.0" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/bioconductor-biostrings:2.58.0--r40h037d062_0" + } else { + container "quay.io/biocontainers/bioconductor-biostrings:2.58.0--r40h037d062_0" + } + + input: + path(table) + path(seq) + path(tax) + + output: + path("qiime2_ASV_table.tsv") + + script: + """ + combine_table.r ${table} ${seq} ${tax} + """ +} \ No newline at end of file diff --git a/modules/local/cutadapt_summary.nf b/modules/local/cutadapt_summary.nf new file mode 100644 index 00000000..58831104 --- /dev/null +++ b/modules/local/cutadapt_summary.nf @@ -0,0 +1,36 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process CUTADAPT_SUMMARY { + tag "${name}" + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? "conda-forge::python=3.8.3" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/python:3.8.3" + } else { + container "quay.io/biocontainers/python:3.8.3" + } + + input: + val(name) + tuple val(meta), path(logs) + + output: + path("*_summary.tsv") , emit: tsv + path "*.version.txt" , emit: version + + script: + def software = "python" + def mode = meta.single_end ? 
"single_end" : "paired_end" + """ + cutadapt_summary.py $mode *.cutadapt.log > ${name}_summary.tsv + echo \$(python --version) > ${software}.version.txt + """ +} \ No newline at end of file diff --git a/modules/local/cutadapt_summary_merge.nf b/modules/local/cutadapt_summary_merge.nf new file mode 100644 index 00000000..885a7813 --- /dev/null +++ b/modules/local/cutadapt_summary_merge.nf @@ -0,0 +1,51 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process CUTADAPT_SUMMARY_MERGE { + tag "${files}" + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? "bioconductor-dada2=1.18.0" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } else { + container "quay.io/biocontainers/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } + + input: + val(action) + path(files) + + output: + path("cutadapt_summary.tsv") , emit: tsv + + script: + if (action == "merge") { + """ + #!/usr/bin/env Rscript + standard <- read.table(\"${files[0]}\", header = TRUE, sep = "\t", stringsAsFactors = FALSE) + doubleprimer <- read.table(\"${files[1]}\", header = TRUE, sep = "\t", stringsAsFactors = FALSE) + colnames(doubleprimer) <- c("sample", "cutadapt_doubleprimer_total_processed", "cutadapt_doubleprimer_reverse_complemented", "cutadapt_doubleprimer_passing_filters", "cutadapt_doubleprimer_passing_filters_percent") + + #merge + df <- merge(standard, doubleprimer, by = "sample") + + #filter columns + remove_columns <- c("cutadapt_doubleprimer_total_processed") + for(column in remove_columns) df[column]<-NULL + + #write + write.table(df, file = \"cutadapt_summary.tsv\", quote=FALSE, col.names=TRUE, row.names=FALSE, sep="\t") + """ + } else { + """ + cp $files cutadapt_summary.tsv + """ + } +} diff --git a/modules/local/dada2_addspecies.nf b/modules/local/dada2_addspecies.nf new file mode 100644 index 00000000..994aa801 --- /dev/null +++ b/modules/local/dada2_addspecies.nf @@ -0,0 +1,65 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process DADA2_ADDSPECIES { + tag "${taxtable},${database}" + label 'process_high' + label 'single_cpu' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? 
"bioconductor-dada2=1.18.0" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } else { + container "quay.io/biocontainers/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } + + input: + path(taxtable) + path(database) + val(outfile) + + output: + path(outfile) , emit: tsv + path "*.version.txt", emit: version + path "*.args.txt" , emit: args + + script: + def software = getSoftwareName(task.process) + """ + #!/usr/bin/env Rscript + suppressPackageStartupMessages(library(dada2)) + set.seed(100) # Initialize random number generator for reproducibility + + taxtable <- readRDS(\"$taxtable\") + + tx <- addSpecies(taxtable, \"$database\", $options.args, verbose=TRUE) + + # Create a table with specified column order + tmp <- data.frame(row.names(tx)) # To separate ASV_ID from sequence + taxa <- data.frame( + ASV_ID = tx[,"ASV_ID"], + Domain = tx[,"Domain"], + Kingdom = tx[,"Kingdom"], + Phylum = tx[,"Phylum"], + Class = tx[,"Class"], + Order = tx[,"Order"], + Family = tx[,"Family"], + Genus = tx[,"Genus"], + Species = tx[,"Species"], + confidence = tx[,"confidence"], + sequence = tmp[,], + row.names=row.names(tmp) + ) + + write.table(taxa, file = \"$outfile\", sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE) + + write.table('addSpecies\t$options.args', file = "addSpecies.args.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) + write.table(packageVersion("dada2"), file = "${software}.version.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) + """ +} diff --git a/modules/local/dada2_denoising.nf b/modules/local/dada2_denoising.nf new file mode 100644 index 00000000..4bb278c8 --- /dev/null +++ b/modules/local/dada2_denoising.nf @@ -0,0 +1,90 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process DADA2_DENOISING { + tag "$meta.run" + label 'process_medium' + label 'process_long' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? 
"bioconductor-dada2=1.18.0" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } else { + container "quay.io/biocontainers/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } + + input: + tuple val(meta), path(dereplicated), path(errormodel) + + output: + tuple val(meta), path("*.dada.rds") , emit: denoised + tuple val(meta), path("*.seqtab.rds") , emit: seqtab + tuple val(meta), path("*.mergers.rds"), emit: mergers + tuple val(meta), path("*.log") , emit: log + path "*.version.txt" , emit: version + path "*.args.txt" , emit: args + + script: + def software = getSoftwareName(task.process) + if (!meta.single_end) { + """ + #!/usr/bin/env Rscript + suppressPackageStartupMessages(library(dada2)) + + errF = readRDS("${errormodel[0]}") + errR = readRDS("${errormodel[1]}") + + derepFs = readRDS("${dereplicated[0]}") + derepRs = readRDS("${dereplicated[1]}") + + #denoising + sink(file = "${meta.run}.dada.log") + dadaFs <- dada(derepFs, err = errF, $options.args, multithread = $task.cpus) + saveRDS(dadaFs, "${meta.run}_1.dada.rds") + dadaRs <- dada(derepRs, err = errR, $options.args, multithread = $task.cpus) + saveRDS(dadaRs, "${meta.run}_2.dada.rds") + sink(file = NULL) + + #make table + mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, $options.args2, verbose=TRUE) + saveRDS(mergers, "${meta.run}.mergers.rds") + seqtab <- makeSequenceTable(mergers) + saveRDS(seqtab, "${meta.run}.seqtab.rds") + + write.table('dada\t$options.args', file = "dada.args.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) + write.table('mergePairs\t$options.args2', file = "mergePairs.args.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) + write.table(packageVersion("dada2"), file = "${software}.version.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) + """ + } else { + """ + #!/usr/bin/env Rscript + suppressPackageStartupMessages(library(dada2)) + + errF = readRDS("${errormodel}") + + derepFs = readRDS("${dereplicated}") + + #denoising + sink(file = "${meta.run}.dada.log") + dadaFs <- dada(derepFs, err = errF, $options.args, multithread = $task.cpus) + saveRDS(dadaFs, "${meta.run}.dada.rds") + sink(file = NULL) + + #make table + seqtab <- makeSequenceTable(dadaFs) + saveRDS(seqtab, "${meta.run}.seqtab.rds") + + #dummy file to fulfill output rules + saveRDS("dummy", "dummy_${meta.run}.mergers.rds") + + write.table('dada\t$options.args', file = "dada.args.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) + write.table(packageVersion("dada2"), file = "${software}.version.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) + """ + } +} \ No newline at end of file diff --git a/modules/local/dada2_dereplicate.nf b/modules/local/dada2_dereplicate.nf new file mode 100644 index 00000000..3c5fe152 --- /dev/null +++ b/modules/local/dada2_dereplicate.nf @@ -0,0 +1,55 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process DADA2_DEREPLICATE { + tag "$meta.run" + label 'process_medium' + + conda (params.enable_conda ? 
"bioconductor-dada2=1.18.0" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } else { + container "quay.io/biocontainers/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.derep.rds"), emit: dereplicated + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + if (!meta.single_end) { + """ + #!/usr/bin/env Rscript + suppressPackageStartupMessages(library(dada2)) + + filtFs <- sort(list.files(".", pattern = "_1.filt.fastq.gz", full.names = TRUE)) + filtRs <- sort(list.files(".", pattern = "_2.filt.fastq.gz", full.names = TRUE)) + + derepFs <- derepFastq(filtFs, verbose = TRUE) + saveRDS(derepFs, "${meta.run}_1.derep.rds") + derepRs <- derepFastq(filtRs, verbose = TRUE) + saveRDS(derepRs, "${meta.run}_2.derep.rds") + + write.table(packageVersion("dada2"), file = "${software}.version.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) + """ + } else { + """ + #!/usr/bin/env Rscript + suppressPackageStartupMessages(library(dada2)) + + filtFs <- sort(list.files(".", pattern = ".filt.fastq.gz", full.names = TRUE)) + + derepFs <- derepFastq(filtFs, verbose = TRUE) + saveRDS(derepFs, "${meta.run}.derep.rds") + + write.table(packageVersion("dada2"), file = "${software}.version.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) + """ + } +} \ No newline at end of file diff --git a/modules/local/dada2_err.nf b/modules/local/dada2_err.nf new file mode 100644 index 00000000..3e7094de --- /dev/null +++ b/modules/local/dada2_err.nf @@ -0,0 +1,92 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process DADA2_ERR { + tag "$meta.run" + label 'process_medium' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? 
"bioconductor-dada2=1.18.0" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } else { + container "quay.io/biocontainers/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.err.rds"), emit: errormodel + tuple val(meta), path("*.err.pdf"), emit: pdf + tuple val(meta), path("*.err.log"), emit: log + tuple val(meta), path("*.err.convergence.txt"), emit: convergence + path "*.version.txt" , emit: version + path "*.args.txt" , emit: args + + script: + def software = getSoftwareName(task.process) + if (!meta.single_end) { + """ + #!/usr/bin/env Rscript + suppressPackageStartupMessages(library(dada2)) + + fnFs <- sort(list.files(".", pattern = "_1.filt.fastq.gz", full.names = TRUE)) + fnRs <- sort(list.files(".", pattern = "_2.filt.fastq.gz", full.names = TRUE)) + + sink(file = "${meta.run}.err.log") + errF <- learnErrors(fnFs, $options.args, multithread = $task.cpus, verbose = TRUE) + saveRDS(errF, "${meta.run}_1.err.rds") + errR <- learnErrors(fnRs, $options.args, multithread = $task.cpus, verbose = TRUE) + saveRDS(errR, "${meta.run}_2.err.rds") + sink(file = NULL) + + pdf("${meta.run}_1.err.pdf") + plotErrors(errF, nominalQ = TRUE) + dev.off() + + pdf("${meta.run}_2.err.pdf") + plotErrors(errR, nominalQ = TRUE) + dev.off() + + sink(file = "${meta.run}_1.err.convergence.txt") + dada2:::checkConvergence(errF) + sink(file = NULL) + + sink(file = "${meta.run}_2.err.convergence.txt") + dada2:::checkConvergence(errR) + sink(file = NULL) + + write.table('learnErrors\t$options.args', file = "learnErrors.args.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) + write.table(packageVersion("dada2"), file = "${software}.version.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) + """ + } else { + """ + #!/usr/bin/env Rscript + suppressPackageStartupMessages(library(dada2)) + + fnFs <- sort(list.files(".", pattern = ".filt.fastq.gz", full.names = TRUE)) + + sink(file = "${meta.run}.err.log") + errF <- learnErrors(fnFs, $options.args, multithread = $task.cpus, verbose = TRUE) + saveRDS(errF, "${meta.run}.err.rds") + sink(file = NULL) + + pdf("${meta.run}.err.pdf") + plotErrors(errF, nominalQ = TRUE) + dev.off() + + sink(file = "${meta.run}.err.convergence.txt") + dada2:::checkConvergence(errF) + sink(file = NULL) + + write.table('learnErrors\t$options.args', file = "learnErrors.args.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) + write.table(packageVersion("dada2"), file = "${software}.version.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) + """ + } +} \ No newline at end of file diff --git a/modules/local/dada2_filtntrim.nf b/modules/local/dada2_filtntrim.nf new file mode 100644 index 00000000..149e41c3 --- /dev/null +++ b/modules/local/dada2_filtntrim.nf @@ -0,0 +1,52 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process DADA2_FILTNTRIM { + tag "$meta.id" + label 'process_medium' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? 
"bioconductor-dada2=1.18.0" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } else { + container "quay.io/biocontainers/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } + + input: + tuple val(meta), path(reads), val(trunclenf), val(trunclenr) + + output: + tuple val(meta), path("*.filter_stats.tsv"), emit: log + tuple val(meta), path("*.filt.fastq.gz") , emit: reads + path "*.version.txt" , emit: version + path "*.args.txt" , emit: args + + script: + def software = getSoftwareName(task.process) + def in_and_out = meta.single_end ? "\"${reads}\", \"${meta.id}.filt.fastq.gz\"" : "\"${reads[0]}\", \"${meta.id}_1.filt.fastq.gz\", \"${reads[1]}\", \"${meta.id}_2.filt.fastq.gz\"" + def trunclenf = trunclenf[1].toInteger() + def trunclenr = trunclenr[1].toInteger() + def trunc_args = meta.single_end ? "truncLen = $trunclenf" : "truncLen = c($trunclenf, $trunclenr)" + """ + #!/usr/bin/env Rscript + suppressPackageStartupMessages(library(dada2)) + + out <- filterAndTrim($in_and_out, + $trunc_args, + $options.args, + compress = TRUE, + multithread = $task.cpus, + verbose = TRUE) + out <- cbind(out, ID = row.names(out)) + + write.table( out, file = "${meta.id}.filter_stats.tsv", sep = "\t", row.names = FALSE, quote = FALSE) + write.table(paste('filterAndTrim\t$trunc_args','$options.args',sep=","), file = "filterAndTrim.args.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) + write.table(packageVersion("dada2"), file = "${software}.version.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) + """ +} \ No newline at end of file diff --git a/modules/local/dada2_merge.nf b/modules/local/dada2_merge.nf new file mode 100644 index 00000000..374827d5 --- /dev/null +++ b/modules/local/dada2_merge.nf @@ -0,0 +1,79 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process DADA2_MERGE { + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? 
"bioconductor-dada2=1.18.0" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } else { + container "quay.io/biocontainers/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } + + input: + path(files) + path(rds) + + output: + path( "DADA2_stats.tsv" ), emit: dada2stats + path( "DADA2_table.tsv" ), emit: dada2asv + path( "ASV_table.tsv" ) , emit: asv + path( "ASV_seqs.fasta" ) , emit: fasta + path( "DADA2_table.rds" ), emit: rds + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + """ + #!/usr/bin/env Rscript + suppressPackageStartupMessages(library(dada2)) + suppressPackageStartupMessages(library(digest)) + + #combine stats files + for (data in sort(list.files(".", pattern = ".stats.tsv", full.names = TRUE))) { + if (!exists("stats")){ stats <- read.csv(data, header=TRUE, sep="\t") } + if (exists("stats")){ + temp <-read.csv(data, header=TRUE, sep="\t") + stats <-unique(rbind(stats, temp)) + rm(temp) + } + } + write.table( stats, file = "DADA2_stats.tsv", sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE) + + #combine dada-class objects + files <- sort(list.files(".", pattern = ".ASVtable.rds", full.names = TRUE)) + if ( length(files) == 1 ) { + ASVtab = readRDS(files[1]) + } else { + ASVtab <- mergeSequenceTables(tables=files, repeats = "error", orderBy = "abundance", tryRC = FALSE) + } + saveRDS(ASVtab, "DADA2_table.rds") + + df <- t(ASVtab) + colnames(df) <- gsub('_1.filt.fastq.gz', '', colnames(df)) + colnames(df) <- gsub('.filt.fastq.gz', '', colnames(df)) + df <- data.frame(sequence = rownames(df), df) + # Create an md5 sum of the sequences as ASV_ID and rearrange columns + df\$ASV_ID <- sapply(df\$sequence, digest, algo='md5', serialize = FALSE) + df <- df[,c(ncol(df),3:ncol(df)-1,1)] + + # file to publish + write.table(df, file = "DADA2_table.tsv", sep = "\t", row.names = FALSE, quote = FALSE) + + # Write fasta file with ASV sequences to file + write.table(data.frame(s = sprintf(">%s\n%s", df\$ASV_ID, df\$sequence)), 'ASV_seqs.fasta', col.names = FALSE, row.names = FALSE, quote = FALSE) + + # Write ASV file with ASV abundances to file + df\$sequence <- NULL + write.table(df, file = "ASV_table.tsv", sep="\t", row.names = FALSE, quote = FALSE) + + write.table(packageVersion("dada2"), file = "${software}.version.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) + """ +} diff --git a/modules/local/dada2_quality.nf b/modules/local/dada2_quality.nf new file mode 100644 index 00000000..a2f37149 --- /dev/null +++ b/modules/local/dada2_quality.nf @@ -0,0 +1,34 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process DADA2_QUALITY { + tag "$meta" + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? 
"bioconductor-dada2=1.18.0" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } else { + container "quay.io/biocontainers/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } + + input: + tuple val(meta), path(reads) + + output: + path "${meta}_qual_stats.pdf" , emit: pdf + tuple val(meta), path("*_qual_stats.tsv"), emit: tsv + path "*.args.txt" , emit: args + + script: + """ + dada_quality.r "${meta}_qual_stats" $options.args + echo 'plotQualityProfile\t$options.args' > "plotQualityProfile.args.txt" + """ +} \ No newline at end of file diff --git a/modules/local/dada2_rmchimera.nf b/modules/local/dada2_rmchimera.nf new file mode 100644 index 00000000..d5725f71 --- /dev/null +++ b/modules/local/dada2_rmchimera.nf @@ -0,0 +1,47 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process DADA2_RMCHIMERA { + tag "$meta.run" + label 'process_medium' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? "bioconductor-dada2=1.18.0" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } else { + container "quay.io/biocontainers/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } + + input: + tuple val(meta), path(seqtab) + + output: + tuple val(meta), path("*.ASVtable.rds"), emit: rds + path "*.version.txt" , emit: version + path "*.args.txt" , emit: args + + script: + def software = getSoftwareName(task.process) + def no_samples = meta.id.size() + def first_sample = meta.id.first() + """ + #!/usr/bin/env Rscript + suppressPackageStartupMessages(library(dada2)) + + seqtab = readRDS("${seqtab}") + + #remove chimera + seqtab.nochim <- removeBimeraDenovo(seqtab, $options.args, multithread=$task.cpus, verbose=TRUE) + if ( ${no_samples} == 1 ) { rownames(seqtab.nochim) <- "${first_sample}" } + saveRDS(seqtab.nochim,"${meta.run}.ASVtable.rds") + + write.table('removeBimeraDenovo\t$options.args', file = "removeBimeraDenovo.args.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) + write.table(packageVersion("dada2"), file = "${software}.version.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) + """ +} \ No newline at end of file diff --git a/modules/local/dada2_stats.nf b/modules/local/dada2_stats.nf new file mode 100644 index 00000000..5ff8c783 --- /dev/null +++ b/modules/local/dada2_stats.nf @@ -0,0 +1,104 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process DADA2_STATS { + tag "$meta.run" + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? 
"bioconductor-dada2=1.18.0" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } else { + container "quay.io/biocontainers/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } + + input: + tuple val(meta), path("filter_and_trim_files/*"), path(denoised), path(mergers), path(seqtab_nochim) + + output: + tuple val(meta), path("*.stats.tsv"), emit: stats + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + if (!meta.single_end) { + """ + #!/usr/bin/env Rscript + suppressPackageStartupMessages(library(dada2)) + + #combine filter_and_trim files + for (data in list.files("./filter_and_trim_files", full.names=TRUE)){ + if (!exists("filter_and_trim")){ filter_and_trim <- read.csv(data, header=TRUE, sep="\t") } + if (exists("filter_and_trim")){ + tempory <-read.csv(data, header=TRUE, sep="\t") + filter_and_trim <-unique(rbind(filter_and_trim, tempory)) + rm(tempory) + } + } + rownames(filter_and_trim) <- filter_and_trim\$ID + filter_and_trim["ID"] <- NULL + #write.table( filter_and_trim, file = "${meta.run}.filter_and_trim.tsv", sep = "\t", row.names = TRUE, quote = FALSE) + + #read data + dadaFs = readRDS("${denoised[0]}") + dadaRs = readRDS("${denoised[1]}") + mergers = readRDS("$mergers") + seqtab.nochim = readRDS("$seqtab_nochim") + + #track reads through pipeline + getN <- function(x) sum(getUniques(x)) + if ( nrow(filter_and_trim) == 1 ) { + track <- cbind(filter_and_trim, getN(dadaFs), getN(dadaRs), getN(mergers), rowSums(seqtab.nochim)) + } else { + track <- cbind(filter_and_trim, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim)) + } + colnames(track) <- c("DADA2_input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim") + track <- cbind(sample = sub(pattern = "(.*?)\\\\..*\$", replacement = "\\\\1", rownames(track)), track) + track\$sample <- sub(pattern = "_1\$", replacement = "", track\$sample) + write.table( track, file = "${meta.run}.stats.tsv", sep = "\t", row.names = FALSE, quote = FALSE) + + write.table(packageVersion("dada2"), file = "${software}.version.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) + """ + } else { + """ + #!/usr/bin/env Rscript + suppressPackageStartupMessages(library(dada2)) + + #combine filter_and_trim files + for (data in list.files("./filter_and_trim_files", full.names=TRUE)){ + if (!exists("filter_and_trim")){ filter_and_trim <- read.csv(data, header=TRUE, sep="\t") } + if (exists("filter_and_trim")){ + tempory <-read.csv(data, header=TRUE, sep="\t") + filter_and_trim <-unique(rbind(filter_and_trim, tempory)) + rm(tempory) + } + } + rownames(filter_and_trim) <- filter_and_trim\$ID + filter_and_trim["ID"] <- NULL + #write.table( filter_and_trim, file = "${meta.run}.filter_and_trim.tsv", sep = "\t", row.names = TRUE, quote = FALSE) + + #read data + dadaFs = readRDS("${denoised[0]}") + seqtab.nochim = readRDS("$seqtab_nochim") + + #track reads through pipeline + getN <- function(x) sum(getUniques(x)) + if ( nrow(filter_and_trim) == 1 ) { + track <- cbind(filter_and_trim, getN(dadaFs), rowSums(seqtab.nochim)) + } else { + track <- cbind(filter_and_trim, sapply(dadaFs, getN), rowSums(seqtab.nochim)) + } + colnames(track) <- c("input", "filtered", "denoised", "nonchim") + track <- cbind(sample = sub(pattern = "(.*?)\\\\..*\$", replacement = "\\\\1", rownames(track)), track) + write.table( track, file = 
"${meta.run}.stats.tsv", sep = "\t", row.names = FALSE, quote = FALSE) + + write.table(packageVersion("dada2"), file = "${software}.version.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) + """ + } +} \ No newline at end of file diff --git a/modules/local/dada2_taxonomy.nf b/modules/local/dada2_taxonomy.nf new file mode 100644 index 00000000..5fff3dcb --- /dev/null +++ b/modules/local/dada2_taxonomy.nf @@ -0,0 +1,82 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process DADA2_TAXONOMY { + tag "${fasta},${database}" + label 'process_high' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? "bioconductor-dada2=1.18.0" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } else { + container "quay.io/biocontainers/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } + + input: + path(fasta) + path(database) + val(outfile) + + output: + path(outfile), emit: tsv + path( "ASV_tax.rds" ), emit: rds + path "*.version.txt" , emit: version + path "*.args.txt" , emit: args + + script: + def software = getSoftwareName(task.process) + """ + #!/usr/bin/env Rscript + suppressPackageStartupMessages(library(dada2)) + set.seed(100) # Initialize random number generator for reproducibility + + seq <- getSequences(\"$fasta\", collapse = TRUE, silence = FALSE) + taxa <- assignTaxonomy(seq, \"$database\", taxLevels = c("Domain", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"), $options.args, multithread = $task.cpus, verbose=TRUE, outputBootstraps = TRUE) + + # Make a data frame, add ASV_ID from seq, set confidence to the bootstrap for the most specific taxon and reorder columns before writing to file + tx <- data.frame(ASV_ID = names(seq), taxa, sequence = row.names(taxa\$tax), row.names = names(seq)) + tx\$confidence <- with(tx, + ifelse(!is.na(tax.Genus), boot.Genus, + ifelse(!is.na(tax.Family), boot.Family, + ifelse(!is.na(tax.Order), boot.Order, + ifelse(!is.na(tax.Class), boot.Class, + ifelse(!is.na(tax.Phylum), boot.Phylum, + ifelse(!is.na(tax.Kingdom), boot.Kingdom, + ifelse(!is.na(tax.Domain), boot.Domain, 0) + ) + ) + ) + ) + ) + ) + )/100 + taxa_export <- data.frame( + ASV_ID = tx\$ASV_ID, + Domain = tx\$tax.Domain, + Kingdom = tx\$tax.Kingdom, + Phylum = tx\$tax.Phylum, + Class = tx\$tax.Class, + Order = tx\$tax.Order, + Family = tx\$tax.Family, + Genus = tx\$tax.Genus, + confidence = tx\$confidence, + sequence = tx\$sequence, + row.names = names(seq) + ) + + write.table(taxa_export, file = \"$outfile\", sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE) + + # Save a version with rownames for addSpecies + taxa_export <- cbind( ASV_ID = tx\$ASV_ID, taxa\$tax, confidence = tx\$confidence ) + saveRDS(taxa_export, "ASV_tax.rds") + + write.table('assignTaxonomy\t$options.args', file = "assignTaxonomy.args.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) + write.table(packageVersion("dada2"), file = "${software}.version.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) + """ +} diff --git a/modules/local/filter_stats.nf b/modules/local/filter_stats.nf new file mode 100644 index 
00000000..c97a4ea1 --- /dev/null +++ b/modules/local/filter_stats.nf @@ -0,0 +1,32 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process FILTER_STATS { + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? "pandas=1.1.5" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/pandas:1.1.5" + } else { + container "quay.io/biocontainers/pandas:1.1.5" + } + + input: + path(unfiltered) + path(filtered) + + output: + path("count_table_filter_stats.tsv"), emit: tsv + + script: + def software = getSoftwareName(task.process) + """ + filter_stats.py $unfiltered $filtered + """ +} \ No newline at end of file diff --git a/modules/local/format_taxonomy.nf b/modules/local/format_taxonomy.nf new file mode 100644 index 00000000..e9e6b36b --- /dev/null +++ b/modules/local/format_taxonomy.nf @@ -0,0 +1,22 @@ +process FORMAT_TAXONOMY { + label 'process_low' + + conda (params.enable_conda ? "conda-forge::sed=4.7" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img" + } else { + container "biocontainers/biocontainers:v1.2.0_cv1" + } + + input: + path(database) + + output: + path( "*assignTaxonomy.fna*" ), emit: assigntax + path( "*addSpecies.fna*"), emit: addspecies + + script: + """ + ${params.dada_ref_databases[params.dada_ref_taxonomy]["fmtscript"]} + """ +} diff --git a/modules/local/format_taxonomy_qiime.nf b/modules/local/format_taxonomy_qiime.nf new file mode 100644 index 00000000..a7e98f7b --- /dev/null +++ b/modules/local/format_taxonomy_qiime.nf @@ -0,0 +1,22 @@ +process FORMAT_TAXONOMY_QIIME { + label 'process_low' + + conda (params.enable_conda ? "conda-forge::sed=4.7" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img" + } else { + container "biocontainers/biocontainers:v1.2.0_cv1" + } + + input: + path(database) + + output: + path( "*.tax" ), emit: tax + path( "*.fna" ), emit: fasta + + script: + """ + ${params.qiime_ref_databases[params.qiime_ref_taxonomy]["fmtscript"]} + """ +} diff --git a/modules/local/format_taxresults.nf b/modules/local/format_taxresults.nf new file mode 100644 index 00000000..80b1f7e5 --- /dev/null +++ b/modules/local/format_taxresults.nf @@ -0,0 +1,34 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process FORMAT_TAXRESULTS { + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'dada2', publish_id:'') } + + conda (params.enable_conda ? 
"pandas=1.1.5" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/pandas:1.1.5" + } else { + container "quay.io/biocontainers/pandas:1.1.5" + } + + input: + path(taxtable) + path(taxtable_species) + path(fastafile) + + output: + path("ASV_tax.tsv") + path("ASV_tax_species.tsv"), emit: tsv + + script: + """ + add_full_sequence_to_taxfile.py $taxtable $fastafile + add_full_sequence_to_taxfile.py $taxtable_species $fastafile + """ +} diff --git a/modules/local/functions.nf b/modules/local/functions.nf new file mode 100644 index 00000000..54dc8fe8 --- /dev/null +++ b/modules/local/functions.nf @@ -0,0 +1,59 @@ +/* + * ----------------------------------------------------- + * Utility functions used in nf-core DSL2 module files + * ----------------------------------------------------- + */ + +/* + * Extract name of software tool from process name using $task.process + */ +def getSoftwareName(task_process) { + return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() +} + +/* + * Function to initialise default values and to generate a Groovy Map of available options for nf-core modules + */ +def initOptions(Map args) { + def Map options = [:] + options.args = args.args ?: '' + options.args2 = args.args2 ?: '' + options.publish_by_id = args.publish_by_id ?: false + options.publish_dir = args.publish_dir ?: '' + options.publish_files = args.publish_files + options.suffix = args.suffix ?: '' + return options +} + +/* + * Tidy up and join elements of a list to return a path string + */ +def getPathFromList(path_list) { + def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries + paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes + return paths.join('/') +} + +/* + * Function to save/publish module results + */ +def saveFiles(Map args) { + if (!args.filename.endsWith('.version.txt')) { + def ioptions = initOptions(args.options) + def path_list = [ ioptions.publish_dir ?: args.publish_dir ] + if (ioptions.publish_by_id) { + path_list.add(args.publish_id) + } + if (ioptions.publish_files instanceof Map) { + for (ext in ioptions.publish_files) { + if (args.filename.endsWith(ext.key)) { + def ext_list = path_list.collect() + ext_list.add(ext.value) + return "${getPathFromList(ext_list)}/$args.filename" + } + } + } else if (ioptions.publish_files == null) { + return "${getPathFromList(path_list)}/$args.filename" + } + } +} \ No newline at end of file diff --git a/modules/local/get_software_versions.nf b/modules/local/get_software_versions.nf new file mode 100644 index 00000000..b1ee01a6 --- /dev/null +++ b/modules/local/get_software_versions.nf @@ -0,0 +1,36 @@ +// Import generic module functions +include { saveFiles } from './functions' + +params.options = [:] + +/* + * Parse software version numbers + */ +process GET_SOFTWARE_VERSIONS { + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'pipeline_info', publish_id:'') } + + conda (params.enable_conda ? 
"conda-forge::python=3.8.3" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/python:3.8.3" + } else { + container "quay.io/biocontainers/python:3.8.3" + } + + cache false + + input: + path versions + + output: + path "software_versions.csv" , emit: csv + path 'software_versions_mqc.yaml', emit: yaml + + script: + """ + echo $workflow.manifest.version > pipeline.version.txt + echo $workflow.nextflow.version > nextflow.version.txt + scrape_software_versions.py &> software_versions_mqc.yaml + """ +} \ No newline at end of file diff --git a/modules/local/itsx_cutasv.nf b/modules/local/itsx_cutasv.nf new file mode 100644 index 00000000..5488918d --- /dev/null +++ b/modules/local/itsx_cutasv.nf @@ -0,0 +1,34 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process ITSX_CUTASV { + label 'process_medium' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? "bioconda::itsx=1.1.3" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/itsx:1.1.3--hdfd78af_1" + } else { + container "quay.io/biocontainers/itsx:1.1.3--hdfd78af_1" + } + + input: + path fasta + + output: + path "ASV_ITS_seqs.full.fasta", emit: fasta + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + """ + ITSx -i $fasta ${options}.args --cpu ${task.cpus} -o ASV_ITS_seqs + + ITSx -h 2>&1 > /dev/null | tail -n 2 | head -n 1 | cut -f 2 -d ' ' > ${software}.version.txt + """ +} diff --git a/modules/local/merge_stats.nf b/modules/local/merge_stats.nf new file mode 100644 index 00000000..a17ac5ff --- /dev/null +++ b/modules/local/merge_stats.nf @@ -0,0 +1,39 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process MERGE_STATS { + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? 
"bioconductor-dada2=1.18.0" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } else { + container "quay.io/biocontainers/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } + + input: + path('file1.tsv') + path('file2.tsv') + + output: + path("overall_summary.tsv") , emit: tsv + + script: + """ + #!/usr/bin/env Rscript + x <- read.table(\"file1.tsv\", header = TRUE, sep = "\t", stringsAsFactors = FALSE) + y <- read.table(\"file2.tsv\", header = TRUE, sep = "\t", stringsAsFactors = FALSE) + + #merge + df <- merge(x, y, by = "sample", all = TRUE) + + #write + write.table(df, file = \"overall_summary.tsv\", quote=FALSE, col.names=TRUE, row.names=FALSE, sep="\t") + """ +} diff --git a/modules/local/metadata_all.nf b/modules/local/metadata_all.nf new file mode 100644 index 00000000..52f5d2c5 --- /dev/null +++ b/modules/local/metadata_all.nf @@ -0,0 +1,34 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] + +process METADATA_ALL { + tag "$metadata" + label 'process_low' + + conda (params.enable_conda ? "bioconductor-dada2=1.18.0" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } else { + container "quay.io/biocontainers/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } + + input: + path(metadata) + val(metadata_category) + + output: + stdout + + script: + if( !metadata_category ) { + """ + metadata_all.r ${metadata} + """ + } else { + """ + printf ${metadata_category} + """ + } +} \ No newline at end of file diff --git a/modules/local/metadata_pairwise.nf b/modules/local/metadata_pairwise.nf new file mode 100644 index 00000000..2263a828 --- /dev/null +++ b/modules/local/metadata_pairwise.nf @@ -0,0 +1,28 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process METADATA_PAIRWISE { + tag "$metadata" + label 'process_low' + + conda (params.enable_conda ? "bioconductor-dada2=1.18.0" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } else { + container "quay.io/biocontainers/bioconductor-dada2:1.18.0--r40h5f743cb_0" + } + + input: + path(metadata) + + output: + stdout + + script: + """ + metadata_pairwise.r ${metadata} + """ +} \ No newline at end of file diff --git a/modules/local/multiqc.nf b/modules/local/multiqc.nf new file mode 100644 index 00000000..5af7a24c --- /dev/null +++ b/modules/local/multiqc.nf @@ -0,0 +1,39 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process MULTIQC { + label 'process_medium' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? 
"bioconda::multiqc=1.9" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/multiqc:1.9--pyh9f0ad1d_0" + } else { + container "quay.io/biocontainers/multiqc:1.9--pyh9f0ad1d_0" + } + + input: + path multiqc_config + path multiqc_custom_config + path software_versions + path workflow_summary + path ('fastqc/*') + path ('cutadapt/*') + + output: + path "*multiqc_report.html", emit: report + path "*_data" , emit: data + path "*_plots" , optional:true, emit: plots + + script: + def software = getSoftwareName(task.process) + def custom_config = params.multiqc_config ? "--config $multiqc_custom_config" : '' + """ + multiqc -f $options.args $custom_config . + """ +} \ No newline at end of file diff --git a/modules/local/parse_samplesheet.nf b/modules/local/parse_samplesheet.nf new file mode 100644 index 00000000..e60c8eb4 --- /dev/null +++ b/modules/local/parse_samplesheet.nf @@ -0,0 +1,30 @@ +// Function to get list of [ meta, [ fastq_1, fastq_2 ] ] +def parse_samplesheet(LinkedHashMap row, single_end) { + //Check if manifest contains column sampleID & forwardReads + if (row.sampleID == null || row.forwardReads == null) { + exit 1, "ERROR: Please check input samplesheet -> Column 'sampleID' and 'forwardReads' are required but not detected." + } + //Check if manifest contains a column for reverse reads + if (row.reverseReads == null && !single_end) { + exit 1, "ERROR: Please check input samplesheet -> Column 'reverseReads' is missing. In case you do have only single ended reads, please specify '--single_end', '--pacbio', or '--iontorrent'." + } + //read meta info + def meta = [:] + meta.id = row.sampleID + meta.single_end = single_end.toBoolean() + meta.run = row.run == null ? "1" : row.run + //read data info + def array = [] + if (!file(row.forwardReads).exists()) { + exit 1, "ERROR: Please check input samplesheet -> Forward read FastQ file does not exist!\n${row.forwardReads}" + } + if (meta.single_end) { + array = [ meta, [ file(row.forwardReads) ] ] + } else { + if (!file(row.reverseReads).exists()) { + exit 1, "ERROR: Please check input samplesheet -> Reverse read FastQ file does not exist!\n${row.reverseReads}" + } + array = [ meta, [ file(row.forwardReads), file(row.reverseReads) ] ] + } + return array +} \ No newline at end of file diff --git a/modules/local/qiime2_alphararefaction.nf b/modules/local/qiime2_alphararefaction.nf new file mode 100644 index 00000000..0f6f0474 --- /dev/null +++ b/modules/local/qiime2_alphararefaction.nf @@ -0,0 +1,49 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process QIIME2_ALPHARAREFACTION { + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? 
{ exit 1 "QIIME2 has no conda package" } : null) + container "quay.io/qiime2/core:2021.2" + + input: + path(metadata) + path(table) + path(tree) + path(stats) + + output: + path("alpha-rarefaction/*"), emit: rarefaction + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + """ + export XDG_CONFIG_HOME="\${PWD}/HOME" + + maxdepth=\$(count_table_minmax_reads.py $stats maximum 2>&1) + + #check values + if [ \"\$maxdepth\" -gt \"75000\" ]; then maxdepth=\"75000\"; fi + if [ \"\$maxdepth\" -gt \"5000\" ]; then maxsteps=\"250\"; else maxsteps=\$((maxdepth/20)); fi + qiime diversity alpha-rarefaction \ + --i-table ${table} \ + --i-phylogeny ${tree} \ + --p-max-depth \$maxdepth \ + --m-metadata-file ${metadata} \ + --p-steps \$maxsteps \ + --p-iterations 10 \ + --o-visualization alpha-rarefaction.qzv + qiime tools export --input-path alpha-rarefaction.qzv \ + --output-path alpha-rarefaction + + echo \$(qiime --version | sed -e "s/q2cli version //g" | tr -d '`' | sed -e "s/Run qiime info for more version details.//g") > ${software}.version.txt + """ +} \ No newline at end of file diff --git a/modules/local/qiime2_ancom_asv.nf b/modules/local/qiime2_ancom_asv.nf new file mode 100644 index 00000000..b581363f --- /dev/null +++ b/modules/local/qiime2_ancom_asv.nf @@ -0,0 +1,45 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process QIIME2_ANCOM_ASV { + tag "${table.baseName}" + label 'process_medium' + label 'single_cpu' + label 'process_long' + label 'error_ignore' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? { exit 1 "QIIME2 has no conda package" } : null) + container "quay.io/qiime2/core:2021.2" + + input: + tuple path(metadata), path(table) + + output: + path("ancom/*") , emit: ancom + path "*.version.txt", emit: version + + script: + def software = getSoftwareName(task.process) + """ + export XDG_CONFIG_HOME="\${PWD}/HOME" + + qiime composition add-pseudocount \ + --i-table ${table} \ + --o-composition-table comp-${table} + qiime composition ancom \ + --i-table comp-${table} \ + --m-metadata-file ${metadata} \ + --m-metadata-column ${table.baseName} \ + --o-visualization comp-${table.baseName}.qzv + qiime tools export --input-path comp-${table.baseName}.qzv \ + --output-path ancom/Category-${table.baseName}-ASV + + echo \$(qiime --version | sed -e "s/q2cli version //g" | tr -d '`' | sed -e "s/Run qiime info for more version details.//g") > ${software}.version.txt + """ +} \ No newline at end of file diff --git a/modules/local/qiime2_ancom_tax.nf b/modules/local/qiime2_ancom_tax.nf new file mode 100644 index 00000000..b35aba62 --- /dev/null +++ b/modules/local/qiime2_ancom_tax.nf @@ -0,0 +1,59 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process QIIME2_ANCOM_TAX { + tag "${table.baseName} - taxonomic level: ${taxlevel}" + label 'process_medium' + label 'single_cpu' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? 
{ exit 1 "QIIME2 has no conda package" } : null) + container "quay.io/qiime2/core:2021.2" + + input: + tuple path(metadata), path(table), path(taxonomy) ,val(taxlevel) + + output: + path "ancom/*" , emit: ancom + path "*.version.txt", emit: version + + script: + def software = getSoftwareName(task.process) + """ + export XDG_CONFIG_HOME="\${PWD}/HOME" + mkdir ancom + + # Sum data at the specified level + qiime taxa collapse \ + --i-table ${table} \ + --i-taxonomy ${taxonomy} \ + --p-level ${taxlevel} \ + --o-collapsed-table lvl${taxlevel}-${table} + + # Extract summarised table and output a file with the number of taxa + qiime tools export --input-path lvl${taxlevel}-${table} --output-path exported/ + biom convert -i exported/feature-table.biom -o ${table.baseName}-level-${taxlevel}.feature-table.tsv --to-tsv + + if [ \$(grep -v '^#' -c ${table.baseName}-level-${taxlevel}.feature-table.tsv) -lt 2 ]; then + echo ${taxlevel} > ancom/\"WARNING Summing your data at taxonomic level ${taxlevel} produced less than two rows (taxa), ANCOM can't proceed -- did you specify a bad reference taxonomy?\".txt + else + qiime composition add-pseudocount \ + --i-table lvl${taxlevel}-${table} \ + --o-composition-table comp-lvl${taxlevel}-${table} + qiime composition ancom \ + --i-table comp-lvl${taxlevel}-${table} \ + --m-metadata-file ${metadata} \ + --m-metadata-column ${table.baseName} \ + --o-visualization comp-lvl${taxlevel}-${table.baseName}.qzv + qiime tools export --input-path comp-lvl${taxlevel}-${table.baseName}.qzv \ + --output-path ancom/Category-${table.baseName}-level-${taxlevel} + fi + + echo \$(qiime --version | sed -e "s/q2cli version //g" | tr -d '`' | sed -e "s/Run qiime info for more version details.//g") > ${software}.version.txt + """ +} diff --git a/modules/local/qiime2_barplot.nf b/modules/local/qiime2_barplot.nf new file mode 100644 index 00000000..c14aa313 --- /dev/null +++ b/modules/local/qiime2_barplot.nf @@ -0,0 +1,41 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process QIIME2_BARPLOT { + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? 
{ exit 1 "QIIME2 has no conda package" } : null) + container "quay.io/qiime2/core:2021.2" + + input: + path(metadata) + path(table) + path(taxonomy) + + output: + path("barplot/*") , emit: folder + path "*.version.txt", emit: version + + script: + def software = getSoftwareName(task.process) + """ + export XDG_CONFIG_HOME="\${PWD}/HOME" + + qiime taxa barplot \ + --i-table ${table} \ + --i-taxonomy ${taxonomy} \ + --m-metadata-file ${metadata} \ + --o-visualization taxa-bar-plots.qzv \ + --verbose + qiime tools export --input-path taxa-bar-plots.qzv \ + --output-path barplot + + echo \$(qiime --version | sed -e "s/q2cli version //g" | tr -d '`' | sed -e "s/Run qiime info for more version details.//g") > ${software}.version.txt + """ +} \ No newline at end of file diff --git a/modules/local/qiime2_classify.nf b/modules/local/qiime2_classify.nf new file mode 100644 index 00000000..e876457e --- /dev/null +++ b/modules/local/qiime2_classify.nf @@ -0,0 +1,50 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process QIIME2_CLASSIFY { + tag "${repseq},${trained_classifier}" + label 'process_high' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? { exit 1 "QIIME2 has no conda package" } : null) + container "quay.io/qiime2/core:2021.2" + + input: + path(trained_classifier) + path(repseq) + + output: + path("taxonomy.qza"), emit: qza + path("taxonomy.tsv"), emit: tsv + path "*.version.txt", emit: version + + script: + def software = getSoftwareName(task.process) + """ + export XDG_CONFIG_HOME="\${PWD}/HOME" + + qiime feature-classifier classify-sklearn \ + --i-classifier ${trained_classifier} \ + --p-n-jobs ${task.cpus} \ + --i-reads ${repseq} \ + --o-classification taxonomy.qza \ + --verbose + qiime metadata tabulate \ + --m-input-file taxonomy.qza \ + --o-visualization taxonomy.qzv \ + --verbose + #produce "taxonomy/taxonomy.tsv" + qiime tools export --input-path taxonomy.qza \ + --output-path taxonomy + qiime tools export --input-path taxonomy.qzv \ + --output-path taxonomy + cp taxonomy/taxonomy.tsv . + + echo \$(qiime --version | sed -e "s/q2cli version //g" | tr -d '`' | sed -e "s/Run qiime info for more version details.//g") > ${software}.version.txt + """ +} \ No newline at end of file diff --git a/modules/local/qiime2_diversity_alpha.nf b/modules/local/qiime2_diversity_alpha.nf new file mode 100644 index 00000000..ce8e2fc0 --- /dev/null +++ b/modules/local/qiime2_diversity_alpha.nf @@ -0,0 +1,46 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process QIIME2_DIVERSITY_ALPHA { + tag "${core.baseName}" + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? 
{ exit 1 "QIIME2 has no conda package" } : null) + container "quay.io/qiime2/core:2021.2" + + input: + tuple path(metadata), path(core), val(category) + + output: + path("alpha_diversity/*"), emit: alpha + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + if ( category.length() > 0 ) { + """ + export XDG_CONFIG_HOME="\${PWD}/HOME" + + qiime diversity alpha-group-significance \ + --i-alpha-diversity ${core} \ + --m-metadata-file ${metadata} \ + --o-visualization ${core.baseName}-vis.qzv + qiime tools export --input-path ${core.baseName}-vis.qzv \ + --output-path "alpha_diversity/${core.baseName}" + + echo \$(qiime --version | sed -e "s/q2cli version //g" | tr -d '`' | sed -e "s/Run qiime info for more version details.//g") > ${software}.version.txt + """ + } else { + """ + mkdir alpha_diversity + echo "" > "alpha_diversity/WARNING No column in ${metadata.baseName} seemed suitable.txt" + echo \$(qiime --version | sed -e "s/q2cli version //g" | tr -d '`' | sed -e "s/Run qiime info for more version details.//g") > ${software}.version.txt + """ + } +} \ No newline at end of file diff --git a/modules/local/qiime2_diversity_beta.nf b/modules/local/qiime2_diversity_beta.nf new file mode 100644 index 00000000..1e3508c2 --- /dev/null +++ b/modules/local/qiime2_diversity_beta.nf @@ -0,0 +1,52 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process QIIME2_DIVERSITY_BETA { + tag "${core.baseName}" + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? 
{ exit 1 "QIIME2 has no conda package" } : null) + container "quay.io/qiime2/core:2021.2" + + input: + tuple path(metadata), path(core), val(category) + + output: + path("beta_diversity/*"), emit: beta + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + if ( category.length() > 0 ) { + """ + export XDG_CONFIG_HOME="\${PWD}/HOME" + + IFS=',' read -r -a metacategory <<< \"$category\" + for j in \"\${metacategory[@]}\" + do + qiime diversity beta-group-significance \ + --i-distance-matrix ${core} \ + --m-metadata-file ${metadata} \ + --m-metadata-column \"\$j\" \ + --o-visualization ${core.baseName}-\$j.qzv \ + --p-pairwise + qiime tools export --input-path ${core.baseName}-\$j.qzv \ + --output-path beta_diversity/${core.baseName}-\$j + done + + echo \$(qiime --version | sed -e "s/q2cli version //g" | tr -d '`' | sed -e "s/Run qiime info for more version details.//g") > ${software}.version.txt + """ + } else { + """ + mkdir beta_diversity + echo "" > "beta_diversity/WARNING No column in ${metadata.baseName} seemed suitable.txt" + echo \$(qiime --version | sed -e "s/q2cli version //g" | tr -d '`' | sed -e "s/Run qiime info for more version details.//g") > ${software}.version.txt + """ + } +} \ No newline at end of file diff --git a/modules/local/qiime2_diversity_betaord.nf b/modules/local/qiime2_diversity_betaord.nf new file mode 100644 index 00000000..76b555db --- /dev/null +++ b/modules/local/qiime2_diversity_betaord.nf @@ -0,0 +1,38 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process QIIME2_DIVERSITY_BETAORD { + tag "${core.baseName}" + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? { exit 1 "QIIME2 has no conda package" } : null) + container "quay.io/qiime2/core:2021.2" + + input: + tuple path(metadata), path(core) + + output: + path("beta_diversity/*"), emit: beta + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + """ + export XDG_CONFIG_HOME="\${PWD}/HOME" + + qiime emperor plot \ + --i-pcoa ${core} \ + --m-metadata-file ${metadata} \ + --o-visualization ${core.baseName}-vis.qzv + qiime tools export --input-path ${core.baseName}-vis.qzv \ + --output-path beta_diversity/${core.baseName}-PCoA + + echo \$(qiime --version | sed -e "s/q2cli version //g" | tr -d '`' | sed -e "s/Run qiime info for more version details.//g") > ${software}.version.txt + """ +} \ No newline at end of file diff --git a/modules/local/qiime2_diversity_core.nf b/modules/local/qiime2_diversity_core.nf new file mode 100644 index 00000000..54c5ae3e --- /dev/null +++ b/modules/local/qiime2_diversity_core.nf @@ -0,0 +1,51 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process QIIME2_DIVERSITY_CORE { + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? 
{ exit 1 "QIIME2 has no conda package" } : null) + container "quay.io/qiime2/core:2021.2" + + input: + path(metadata) + path(table) + path(tree) + path(stats) + + output: + path("diversity_core/*_pcoa_results.qza") , emit: pcoa + path("diversity_core/*_vector.qza") , emit: vector + path("diversity_core/*_distance_matrix.qza"), emit: distance + path "*.version.txt" , emit: version + path("*rarefaction.txt") , emit: depth + + script: + def software = getSoftwareName(task.process) + """ + export XDG_CONFIG_HOME="\${PWD}/HOME" + + mindepth=\$(count_table_minmax_reads.py $stats minimum 2>&1) + if [ \"\$mindepth\" -gt \"10000\" ]; then echo \$mindepth >\"Use the sampling depth of \$mindepth for rarefaction.txt\" ; fi + if [ \"\$mindepth\" -lt \"10000\" -a \"\$mindepth\" -gt \"5000\" ]; then echo \$mindepth >\"WARNING The sampling depth of \$mindepth is quite small for rarefaction.txt\" ; fi + if [ \"\$mindepth\" -lt \"5000\" -a \"\$mindepth\" -gt \"1000\" ]; then echo \$mindepth >\"WARNING The sampling depth of \$mindepth is very small for rarefaction.txt\" ; fi + if [ \"\$mindepth\" -lt \"1000\" ]; then echo \$mindepth >\"WARNING The sampling depth of \$mindepth seems too small for rarefaction.txt\" ; fi + + qiime diversity core-metrics-phylogenetic \ + --m-metadata-file ${metadata} \ + --i-phylogeny ${tree} \ + --i-table ${table} \ + --p-sampling-depth \$mindepth \ + --output-dir diversity_core \ + --p-n-jobs-or-threads ${task.cpus} \ + --verbose + + echo \$(qiime --version | sed -e "s/q2cli version //g" | tr -d '`' | sed -e "s/Run qiime info for more version details.//g") > ${software}.version.txt + """ +} \ No newline at end of file diff --git a/modules/local/qiime2_export_absolute.nf b/modules/local/qiime2_export_absolute.nf new file mode 100644 index 00000000..3ae41476 --- /dev/null +++ b/modules/local/qiime2_export_absolute.nf @@ -0,0 +1,75 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process QIIME2_EXPORT_ABSOLUTE { + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? { exit 1 "QIIME2 has no conda package" } : null) + container "quay.io/qiime2/core:2021.2" + + input: + path(table) + path(repseq) + path(taxonomy) + + output: + path("rep-seq.fasta") , emit: fasta + path("feature-table.tsv") , emit: tsv + path("feature-table.biom") , emit: biom + path("seven_number_summary.tsv") , emit: summary + path("descriptive_stats.tsv") , emit: descr + path("abs-abund-table-*.tsv") , emit: abundtable + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + """ + export XDG_CONFIG_HOME="\${PWD}/HOME" + + #produce raw count table in biom format "table/feature-table.biom" + qiime tools export --input-path ${table} \ + --output-path table + cp table/feature-table.biom . 
+ + #produce raw count table "table/feature-table.tsv" + biom convert -i table/feature-table.biom \ + -o feature-table.tsv \ + --to-tsv + + #produce representative sequence fasta file "sequences.fasta" + qiime feature-table tabulate-seqs \ + --i-data ${repseq} \ + --o-visualization rep-seqs.qzv + qiime tools export --input-path rep-seqs.qzv \ + --output-path representative_sequences + cp representative_sequences/sequences.fasta rep-seq.fasta + cp representative_sequences/*.tsv . + + ##on several taxa level + array=( 2 3 4 5 6 ) + for i in \${array[@]} + do + #collapse taxa + qiime taxa collapse \ + --i-table ${table} \ + --i-taxonomy ${taxonomy} \ + --p-level \$i \ + --o-collapsed-table table-\$i.qza + #export to biom + qiime tools export --input-path table-\$i.qza \ + --output-path table-\$i + #convert to tab separated text file + biom convert \ + -i table-\$i/feature-table.biom \ + -o abs-abund-table-\$i.tsv --to-tsv + done + + echo \$(qiime --version | sed -e "s/q2cli version //g" | tr -d '`' | sed -e "s/Run qiime info for more version details.//g") > ${software}.version.txt + """ +} \ No newline at end of file diff --git a/modules/local/qiime2_export_relasv.nf b/modules/local/qiime2_export_relasv.nf new file mode 100644 index 00000000..24ff7d32 --- /dev/null +++ b/modules/local/qiime2_export_relasv.nf @@ -0,0 +1,42 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process QIIME2_EXPORT_RELASV { + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? { exit 1 "QIIME2 has no conda package" } : null) + container "quay.io/qiime2/core:2021.2" + + input: + path(table) + + output: + path("rel-table-ASV.tsv"), emit: tsv + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + """ + export XDG_CONFIG_HOME="\${PWD}/HOME" + + #convert to relative abundances + qiime feature-table relative-frequency \ + --i-table ${table} \ + --o-relative-frequency-table relative-table-ASV.qza + + #export to biom + qiime tools export --input-path relative-table-ASV.qza --output-path relative-table-ASV + + #convert to tab separated text file "rel-table-ASV.tsv" + biom convert -i relative-table-ASV/feature-table.biom \ + -o rel-table-ASV.tsv --to-tsv + + echo \$(qiime --version | sed -e "s/q2cli version //g" | tr -d '`' | sed -e "s/Run qiime info for more version details.//g") > ${software}.version.txt + """ +} \ No newline at end of file diff --git a/modules/local/qiime2_export_reltax.nf b/modules/local/qiime2_export_reltax.nf new file mode 100644 index 00000000..89fa382b --- /dev/null +++ b/modules/local/qiime2_export_reltax.nf @@ -0,0 +1,55 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process QIIME2_EXPORT_RELTAX { + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? 
{ exit 1 "QIIME2 has no conda package" } : null) + container "quay.io/qiime2/core:2021.2" + + input: + path(table) + path(taxonomy) + + output: + path("*.tsv") , emit: tsv + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + """ + export XDG_CONFIG_HOME="\${PWD}/HOME" + + ##on several taxa level + array=( 2 3 4 5 6 ) + + for i in \${array[@]} + do + #collapse taxa + qiime taxa collapse \ + --i-table ${table} \ + --i-taxonomy ${taxonomy} \ + --p-level \$i \ + --o-collapsed-table table-\$i.qza + #convert to relative abundances + qiime feature-table relative-frequency \ + --i-table table-\$i.qza \ + --o-relative-frequency-table relative-table-\$i.qza + #export to biom + qiime tools export --input-path relative-table-\$i.qza \ + --output-path relative-table-\$i + #convert to tab separated text file + biom convert \ + -i relative-table-\$i/feature-table.biom \ + -o rel-table-\$i.tsv --to-tsv + done + + echo \$(qiime --version | sed -e "s/q2cli version //g" | tr -d '`' | sed -e "s/Run qiime info for more version details.//g") > ${software}.version.txt + """ +} \ No newline at end of file diff --git a/modules/local/qiime2_extract.nf b/modules/local/qiime2_extract.nf new file mode 100644 index 00000000..7a290480 --- /dev/null +++ b/modules/local/qiime2_extract.nf @@ -0,0 +1,48 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process QIIME2_EXTRACT { + tag "${meta.FW_primer}-${meta.RV_primer}" + label 'process_low' + label 'single_cpu' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? 
{ exit 1 "QIIME2 has no conda package" } : null) + container "quay.io/qiime2/core:2021.2" + + input: + tuple val(meta), path(database) + + output: + tuple val(meta), path("*.qza"), emit: qza + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + """ + export XDG_CONFIG_HOME="\${PWD}/HOME" + + ### Import + qiime tools import --type \'FeatureData[Sequence]\' \ + --input-path ${database[0]} \ + --output-path ref-seq.qza + qiime tools import --type \'FeatureData[Taxonomy]\' \ + --input-format HeaderlessTSVTaxonomyFormat \ + --input-path ${database[1]} \ + --output-path ref-taxonomy.qza + #Extract sequences based on primers + qiime feature-classifier extract-reads \ + --i-sequences ref-seq.qza \ + --p-f-primer ${meta.FW_primer} \ + --p-r-primer ${meta.RV_primer} \ + --o-reads ${meta.FW_primer}-${meta.RV_primer}-ref-seq.qza \ + --quiet + + echo \$(qiime --version | sed -e "s/q2cli version //g" | tr -d '`' | sed -e "s/Run qiime info for more version details.//g") > ${software}.version.txt + """ +} \ No newline at end of file diff --git a/modules/local/qiime2_filterasv.nf b/modules/local/qiime2_filterasv.nf new file mode 100644 index 00000000..3552ea82 --- /dev/null +++ b/modules/local/qiime2_filterasv.nf @@ -0,0 +1,53 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process QIIME2_FILTERASV { + tag "${category}" + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? { exit 1 "QIIME2 has no conda package" } : null) + container "quay.io/qiime2/core:2021.2" + + input: + path(metadata) + path(table) + val(category) + + output: + path("*.qza") , emit: qza + path "*.version.txt", emit: version + + script: + def software = getSoftwareName(task.process) + if ( category.length() > 0 ) { + """ + export XDG_CONFIG_HOME="\${PWD}/HOME" + + IFS=',' read -r -a metacategory <<< \"$category\" + + #remove samples that do not have any value + for j in \"\${metacategory[@]}\" + do + qiime feature-table filter-samples \ + --i-table ${table} \ + --m-metadata-file ${metadata} \ + --p-where \"\$j<>\'\'\" \ + --o-filtered-table \$j.qza + done + + echo \$(qiime --version | sed -e "s/q2cli version //g" | tr -d '`' | sed -e "s/Run qiime info for more version details.//g") > ${software}.version.txt + """ + } else { + """ + mkdir beta_diversity + echo "" > "WARNING No column in ${metadata.baseName} seemed suitable.qza" + echo \$(qiime --version | sed -e "s/q2cli version //g" | tr -d '`' | sed -e "s/Run qiime info for more version details.//g") > ${software}.version.txt + """ + } +} \ No newline at end of file diff --git a/modules/local/qiime2_filtertaxa.nf b/modules/local/qiime2_filtertaxa.nf new file mode 100644 index 00000000..6f95bad8 --- /dev/null +++ b/modules/local/qiime2_filtertaxa.nf @@ -0,0 +1,77 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process QIIME2_FILTERTAXA { + tag "taxa:${exclude_taxa};min-freq:${min_frequency};min-samples:${min_samples}" + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, 
publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? { exit 1 "QIIME2 has no conda package" } : null) + container "quay.io/qiime2/core:2021.2" + + input: + path(table) + path(repseq) + path(taxonomy) + val(min_frequency) + val(min_samples) + val(exclude_taxa) + + output: + path("filtered-table.qza"), emit: asv + path("filtered-table.tsv"), emit: tsv + path("filtered-sequences.qza"), emit: seq + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + """ + export XDG_CONFIG_HOME="\${PWD}/HOME" + + if ! [ \"${exclude_taxa}\" = \"none\" ]; then + #filter sequences + qiime taxa filter-seqs \ + --i-sequences ${repseq} \ + --i-taxonomy ${taxonomy} \ + --p-exclude ${exclude_taxa} --p-mode contains \ + --o-filtered-sequences tax_filtered-sequences.qza + #filter abundance table + qiime taxa filter-table \ + --i-table ${table} \ + --i-taxonomy ${taxonomy} \ + --p-exclude ${exclude_taxa} --p-mode contains \ + --o-filtered-table tax_filtered-table.qza + filtered_table="tax_filtered-table.qza" + filtered_sequences="tax_filtered-sequences.qza" + else + filtered_table=${table} + filtered_sequences=${repseq} + fi + qiime feature-table filter-features \ + --i-table \$filtered_table \ + --p-min-frequency ${min_frequency} \ + --p-min-samples ${min_samples} \ + --o-filtered-table filtered-table.qza + + qiime feature-table filter-seqs \ + --i-data \$filtered_sequences \ + --i-table filtered-table.qza \ + --o-filtered-data filtered-sequences.qza + + #produce raw count table in biom format "table/feature-table.biom" + qiime tools export --input-path filtered-table.qza \ + --output-path table + #produce raw count table + biom convert -i table/feature-table.biom \ + -o table/feature-table.tsv \ + --to-tsv + cp table/feature-table.tsv filtered-table.tsv + + echo \$(qiime --version | sed -e "s/q2cli version //g" | tr -d '`' | sed -e "s/Run qiime info for more version details.//g") > ${software}.version.txt + """ +} \ No newline at end of file diff --git a/modules/local/qiime2_inasv.nf b/modules/local/qiime2_inasv.nf new file mode 100644 index 00000000..f68ce577 --- /dev/null +++ b/modules/local/qiime2_inasv.nf @@ -0,0 +1,36 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process QIIME2_INASV { + tag "${asv}" + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? 
{ exit 1 "QIIME2 has no conda package" } : null) + container "quay.io/qiime2/core:2021.2" + + input: + path(asv) + + output: + path("table.qza") , emit: qza + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + """ + echo -n "#OTU Table" | cat - "$asv" > biom-table.txt + biom convert -i biom-table.txt -o table.biom --table-type="OTU table" --to-hdf5 + qiime tools import \ + --input-path table.biom \ + --type 'FeatureTable[Frequency]' \ + --input-format BIOMV210Format \ + --output-path table.qza + echo \$(qiime --version | sed -e "s/q2cli version //g" | tr -d '`' | sed -e "s/Run qiime info for more version details.//g") > ${software}.version.txt + """ +} \ No newline at end of file diff --git a/modules/local/qiime2_inseq.nf b/modules/local/qiime2_inseq.nf new file mode 100644 index 00000000..44ef3da7 --- /dev/null +++ b/modules/local/qiime2_inseq.nf @@ -0,0 +1,33 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process QIIME2_INSEQ { + tag "${seq}" + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? { exit 1 "QIIME2 has no conda package" } : null) + container "quay.io/qiime2/core:2021.2" + + input: + path(seq) + + output: + path("rep-seqs.qza"), emit: qza + path "*.version.txt", emit: version + + script: + def software = getSoftwareName(task.process) + """ + qiime tools import \ + --input-path "$seq" \ + --type 'FeatureData[Sequence]' \ + --output-path rep-seqs.qza + echo \$(qiime --version | sed -e "s/q2cli version //g" | tr -d '`' | sed -e "s/Run qiime info for more version details.//g") > ${software}.version.txt + """ +} \ No newline at end of file diff --git a/modules/local/qiime2_intax.nf b/modules/local/qiime2_intax.nf new file mode 100644 index 00000000..49fd7a0f --- /dev/null +++ b/modules/local/qiime2_intax.nf @@ -0,0 +1,37 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process QIIME2_INTAX { + tag "${tax}" + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? 
{ exit 1 "QIIME2 has no conda package" } : null) + container "quay.io/qiime2/core:2021.2" + + input: + path(tax) //ASV_tax_species.tsv + + output: + path("taxonomy.qza") , emit: qza + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + """ + parse_dada2_taxonomy.r $tax + + qiime tools import \ + --type 'FeatureData[Taxonomy]' \ + --input-format HeaderlessTSVTaxonomyFormat \ + --input-path tax.tsv \ + --output-path taxonomy.qza + + echo \$(qiime --version | sed -e "s/q2cli version //g" | tr -d '`' | sed -e "s/Run qiime info for more version details.//g") > ${software}.version.txt + """ +} diff --git a/modules/local/qiime2_train.nf b/modules/local/qiime2_train.nf new file mode 100644 index 00000000..73d23f8b --- /dev/null +++ b/modules/local/qiime2_train.nf @@ -0,0 +1,39 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process QIIME2_TRAIN { + tag "${meta.FW_primer}-${meta.RV_primer}" + label 'process_high' + label 'single_cpu' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? { exit 1 "QIIME2 has no conda package" } : null) + container "quay.io/qiime2/core:2021.2" + + input: + tuple val(meta), path(qza) + + output: + path("*-classifier.qza"), emit: qza + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + """ + export XDG_CONFIG_HOME="\${PWD}/HOME" + + #Train classifier + qiime feature-classifier fit-classifier-naive-bayes \ + --i-reference-reads ${meta.FW_primer}-${meta.RV_primer}-ref-seq.qza \ + --i-reference-taxonomy ref-taxonomy.qza \ + --o-classifier ${meta.FW_primer}-${meta.RV_primer}-classifier.qza \ + --quiet + + echo \$(qiime --version | sed -e "s/q2cli version //g" | tr -d '`' | sed -e "s/Run qiime info for more version details.//g") > ${software}.version.txt + """ +} \ No newline at end of file diff --git a/modules/local/qiime2_tree.nf b/modules/local/qiime2_tree.nf new file mode 100644 index 00000000..955639da --- /dev/null +++ b/modules/local/qiime2_tree.nf @@ -0,0 +1,49 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process QIIME2_TREE { + label 'process_medium' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + conda (params.enable_conda ? 
{ exit 1 "QIIME2 has no conda package" } : null) + container "quay.io/qiime2/core:2021.2" + + input: + path(repseq) + + output: + path("rooted-tree.qza"), emit: qza + path("tree.nwk") , emit: nwk + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + """ + export XDG_CONFIG_HOME="\${PWD}/HOME" + + qiime alignment mafft \ + --i-sequences ${repseq} \ + --o-alignment aligned-rep-seqs.qza \ + --p-n-threads ${task.cpus} + qiime alignment mask \ + --i-alignment aligned-rep-seqs.qza \ + --o-masked-alignment masked-aligned-rep-seqs.qza + qiime phylogeny fasttree \ + --i-alignment masked-aligned-rep-seqs.qza \ + --p-n-threads ${task.cpus} \ + --o-tree unrooted-tree.qza + qiime phylogeny midpoint-root \ + --i-tree unrooted-tree.qza \ + --o-rooted-tree rooted-tree.qza + qiime tools export --input-path rooted-tree.qza \ + --output-path phylogenetic_tree + cp phylogenetic_tree/tree.nwk . + + echo \$(qiime --version | sed -e "s/q2cli version //g" | tr -d '`' | sed -e "s/Run qiime info for more version details.//g") > ${software}.version.txt + """ +} \ No newline at end of file diff --git a/modules/local/rename_raw_data_files.nf b/modules/local/rename_raw_data_files.nf new file mode 100644 index 00000000..424daffc --- /dev/null +++ b/modules/local/rename_raw_data_files.nf @@ -0,0 +1,36 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process RENAME_RAW_DATA_FILES { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? 'bioconda::cutadapt=3.2' : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container 'https://depot.galaxyproject.org/singularity/cutadapt:3.2--py38h0213d0e_0' + } else { + container 'quay.io/biocontainers/cutadapt:3.2--py38h0213d0e_0' + } + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("${meta.id}{_1,_2,}.fastq.gz", includeInputs: true), emit: fastq + + script: + // Add soft-links to original FastQs for consistent naming in pipeline + if (meta.single_end) { + """ + [ ! -f ${meta.id}.fastq.gz ] && ln -s $reads ${meta.id}.fastq.gz + """ + } else { + """ + [ -f "${meta.id}_1.fastq.gz" ] || ln -s "${reads[0]}" "${meta.id}_1.fastq.gz" + [ -f "${meta.id}_2.fastq.gz" ] || ln -s "${reads[1]}" "${meta.id}_2.fastq.gz" + """ + } +} \ No newline at end of file diff --git a/modules/local/trunclen.nf b/modules/local/trunclen.nf new file mode 100644 index 00000000..afbaebcc --- /dev/null +++ b/modules/local/trunclen.nf @@ -0,0 +1,28 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process TRUNCLEN { + //tag "$meta" + label 'process_low' + + conda (params.enable_conda ? 
"pandas=1.1.5" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/pandas:1.1.5" + } else { + container "quay.io/biocontainers/pandas:1.1.5" + } + + input: + tuple val(meta), path(qual_stats) + + output: + tuple val(meta), stdout + + script: + """ + trunclen.py $qual_stats $options.args + """ +} \ No newline at end of file diff --git a/modules/nf-core/software/cutadapt/functions.nf b/modules/nf-core/software/cutadapt/functions.nf new file mode 100644 index 00000000..d25eea86 --- /dev/null +++ b/modules/nf-core/software/cutadapt/functions.nf @@ -0,0 +1,59 @@ +/* + * ----------------------------------------------------- + * Utility functions used in nf-core DSL2 module files + * ----------------------------------------------------- + */ + +/* + * Extract name of software tool from process name using $task.process + */ +def getSoftwareName(task_process) { + return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() +} + +/* + * Function to initialise default values and to generate a Groovy Map of available options for nf-core modules + */ +def initOptions(Map args) { + def Map options = [:] + options.args = args.args ?: '' + options.args2 = args.args2 ?: '' + options.publish_by_id = args.publish_by_id ?: false + options.publish_dir = args.publish_dir ?: '' + options.publish_files = args.publish_files + options.suffix = args.suffix ?: '' + return options +} + +/* + * Tidy up and join elements of a list to return a path string + */ +def getPathFromList(path_list) { + def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries + paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes + return paths.join('/') +} + +/* + * Function to save/publish module results + */ +def saveFiles(Map args) { + if (!args.filename.endsWith('.version.txt')) { + def ioptions = initOptions(args.options) + def path_list = [ ioptions.publish_dir ?: args.publish_dir ] + if (ioptions.publish_by_id) { + path_list.add(args.publish_id) + } + if (ioptions.publish_files instanceof Map) { + for (ext in ioptions.publish_files) { + if (args.filename.endsWith(ext.key)) { + def ext_list = path_list.collect() + ext_list.add(ext.value) + return "${getPathFromList(ext_list)}/$args.filename" + } + } + } else if (ioptions.publish_files == null) { + return "${getPathFromList(path_list)}/$args.filename" + } + } +} diff --git a/modules/nf-core/software/cutadapt/main.nf b/modules/nf-core/software/cutadapt/main.nf new file mode 100644 index 00000000..05ef8842 --- /dev/null +++ b/modules/nf-core/software/cutadapt/main.nf @@ -0,0 +1,42 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process CUTADAPT { + tag "$meta.id" + label 'process_medium' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + + conda (params.enable_conda ? 
'bioconda::cutadapt=3.2' : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container 'https://depot.galaxyproject.org/singularity/cutadapt:3.2--py38h0213d0e_0' + } else { + container 'quay.io/biocontainers/cutadapt:3.2--py38h0213d0e_0' + } + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path('*.trim.fastq.gz'), emit: reads + tuple val(meta), path('*.log') , emit: log + path '*.version.txt' , emit: version + + script: + def software = getSoftwareName(task.process) + def prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}" + def trimmed = meta.single_end ? "-o ${prefix}.trim.fastq.gz" : "-o ${prefix}_1.trim.fastq.gz -p ${prefix}_2.trim.fastq.gz" + """ + cutadapt \\ + --cores $task.cpus \\ + $options.args \\ + $trimmed \\ + $reads \\ + > ${prefix}.cutadapt.log + echo \$(cutadapt --version) > ${software}.version.txt + """ +} diff --git a/modules/nf-core/software/cutadapt/meta.yml b/modules/nf-core/software/cutadapt/meta.yml new file mode 100644 index 00000000..14652343 --- /dev/null +++ b/modules/nf-core/software/cutadapt/meta.yml @@ -0,0 +1,45 @@ +name: cutadapt +description: Perform adapter/quality trimming on sequencing reads +keywords: + - trimming + - adapter trimming + - adapters + - quality trimming +tools: + - cutadapt: + description: | + Cutadapt finds and removes adapter sequences, primers, poly-A tails and other types of unwanted sequence from your high-throughput sequencing reads. + documentation: https://cutadapt.readthedocs.io/en/stable/index.html + doi: DOI:10.14806/ej.17.1.200 +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - reads: + type: file + description: The trimmed/modified fastq reads + pattern: "*fastq.gz" + - log: + type: file + description: cutadapt log file + pattern: "*cutadapt.log" + - version: + type: file + description: File containing software version + pattern: "*.{version.txt}" +authors: + - "@drpatelh" + - "@kevinmenden" diff --git a/modules/nf-core/software/fastqc/functions.nf b/modules/nf-core/software/fastqc/functions.nf new file mode 100644 index 00000000..d25eea86 --- /dev/null +++ b/modules/nf-core/software/fastqc/functions.nf @@ -0,0 +1,59 @@ +/* + * ----------------------------------------------------- + * Utility functions used in nf-core DSL2 module files + * ----------------------------------------------------- + */ + +/* + * Extract name of software tool from process name using $task.process + */ +def getSoftwareName(task_process) { + return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() +} + +/* + * Function to initialise default values and to generate a Groovy Map of available options for nf-core modules + */ +def initOptions(Map args) { + def Map options = [:] + options.args = args.args ?: '' + options.args2 = args.args2 ?: '' + options.publish_by_id = args.publish_by_id ?: false + options.publish_dir = args.publish_dir ?: '' + options.publish_files = args.publish_files + options.suffix = args.suffix ?: '' + return options +} + +/* + * Tidy up and join elements of a list to return a path string + */ +def getPathFromList(path_list) { + def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries + paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes + return paths.join('/') +} + +/* + * Function to save/publish module results + */ +def saveFiles(Map args) { + if (!args.filename.endsWith('.version.txt')) { + def ioptions = initOptions(args.options) + def path_list = [ ioptions.publish_dir ?: args.publish_dir ] + if (ioptions.publish_by_id) { + path_list.add(args.publish_id) + } + if (ioptions.publish_files instanceof Map) { + for (ext in ioptions.publish_files) { + if (args.filename.endsWith(ext.key)) { + def ext_list = path_list.collect() + ext_list.add(ext.value) + return "${getPathFromList(ext_list)}/$args.filename" + } + } + } else if (ioptions.publish_files == null) { + return "${getPathFromList(path_list)}/$args.filename" + } + } +} diff --git a/modules/nf-core/software/fastqc/main.nf b/modules/nf-core/software/fastqc/main.nf new file mode 100644 index 00000000..4e847876 --- /dev/null +++ b/modules/nf-core/software/fastqc/main.nf @@ -0,0 +1,47 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process FASTQC { + tag "$meta.id" + label 'process_medium' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + + conda (params.enable_conda ? 
"bioconda::fastqc=0.11.9" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0" + } else { + container "quay.io/biocontainers/fastqc:0.11.9--0" + } + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.html"), emit: html + tuple val(meta), path("*.zip") , emit: zip + path "*.version.txt" , emit: version + + script: + // Add soft-links to original FastQs for consistent naming in pipeline + def software = getSoftwareName(task.process) + def prefix = options.suffix ? "${meta.id}.${options.suffix}" : "${meta.id}" + if (meta.single_end) { + """ + [ ! -f ${prefix}.fastq.gz ] && ln -s $reads ${prefix}.fastq.gz + fastqc $options.args --threads $task.cpus ${prefix}.fastq.gz + fastqc --version | sed -e "s/FastQC v//g" > ${software}.version.txt + """ + } else { + """ + [ ! -f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz + [ ! -f ${prefix}_2.fastq.gz ] && ln -s ${reads[1]} ${prefix}_2.fastq.gz + fastqc $options.args --threads $task.cpus ${prefix}_1.fastq.gz ${prefix}_2.fastq.gz + fastqc --version | sed -e "s/FastQC v//g" > ${software}.version.txt + """ + } +} diff --git a/modules/nf-core/software/fastqc/meta.yml b/modules/nf-core/software/fastqc/meta.yml new file mode 100644 index 00000000..8eb9953d --- /dev/null +++ b/modules/nf-core/software/fastqc/meta.yml @@ -0,0 +1,51 @@ +name: fastqc +description: Run FastQC on sequenced reads +keywords: + - quality control + - qc + - adapters + - fastq +tools: + - fastqc: + description: | + FastQC gives general quality metrics about your reads. + It provides information about the quality score distribution + across your reads, the per base sequence content (%A/C/G/T). + You get information about adapter contamination and other + overrepresented sequences. + homepage: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ + documentation: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/ +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - html: + type: file + description: FastQC report + pattern: "*_{fastqc.html}" + - zip: + type: file + description: FastQC report archive + pattern: "*_{fastqc.zip}" + - version: + type: file + description: File containing software version + pattern: "*.{version.txt}" +authors: + - "@drpatelh" + - "@grst" + - "@ewels" + - "@FelixKrueger" diff --git a/nextflow.config b/nextflow.config index 3c6a41a9..ae5a7c98 100644 --- a/nextflow.config +++ b/nextflow.config @@ -7,192 +7,233 @@ // Global default params, used in configs params { - publish_dir_mode = 'copy' - help = false - extension = "/*_R{1,2}_001.fastq.gz" - input = "data" + "${extension}" - readPaths = false - pacbio = false - outdir = './results' - FW_primer = false - RV_primer = false - classifier = false - metadata = false - tracedir = "${params.outdir}/pipeline_info" - clusterOptions = false - qiime_timezone = 'Europe/Berlin' - - // Defines all parameters that are independent of a test run - trunc_qmin = 25 //to calculate params.trunclenf and params.trunclenr automatically - trunc_rmin = 0.75 //to calculate params.trunclenf and params.trunclenr automatically - trunclenf = false - trunclenr = false - maxEE = 2 - maxLen = 2999 //2999 is the maximum allowed read length in dada2 version 1.12 - minLen = 50 - metadata_category = false - double_primer = false - retain_untrimmed = false - exclude_taxa = "mitochondria,chloroplast" - keepIntermediates = false - classifier_removeHash = false - min_frequency = false - min_samples = false - multipleSequencingRuns = false - phred64 = false - split = "-" - skip_fastqc = false - - //Database specific parameters - //currently only this is compatible with process make_SILVA_132_16S_classifier - reference_database = "https://www.arb-silva.de/fileadmin/silva_databases/qiime/Silva_132_release.zip" - dereplication = 99 - taxon_reference = "silva" - - // Boilerplate options - name = false - multiqc_config = false - email = false - email_on_fail = false - max_multiqc_email_size = 25.MB - plaintext_email = false - monochrome_logs = false - help = false - tracedir = "${params.outdir}/pipeline_info" - awsqueue = false - awsregion = 'eu-west-1' - custom_config_version = 'master' - custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" - hostnames = false - config_profile_description = false - config_profile_contact = false - config_profile_url = false - - // Defaults only, expecting to be overwritten - max_memory = 128.GB - max_cpus = 16 - max_time = 240.h - -} - -//export Time Zone required for QIIME2 2019.10 -env { - TZ = params.qiime_timezone + input = false + pacbio = false + iontorrent = false + FW_primer = false + RV_primer = false + classifier = "" + metadata = "" + + // Defines all parameters that are independent of a test run + extension = "/*_R{1,2}_001.fastq.gz" + help = false + publish_dir_mode = 'copy' + trunc_qmin = 25 + trunc_rmin = 0.75 + trunclenf = false + trunclenr = false + max_ee = 2 + max_len = "Inf" + min_len = 50 + metadata_category = "" + double_primer = false + retain_untrimmed = false + exclude_taxa = "mitochondria,chloroplast" + min_frequency = 1 + min_samples = 1 + multiple_sequencing_runs = false + single_end = false + sample_inference = "independent" + illumina_pe_its = false + concatenate_reads = false + cut_its = false + + //skipping steps + skip_qiime = false + skip_fastqc = false + skip_alpha_rarefaction = false + skip_abundance_tables = false + skip_barplot = false + skip_taxonomy = false + 
skip_alpha_rarefaction = false + skip_diversity_indices = false + skip_ancom = false + skip_multiqc = false + + //Database specific parameters + dada_ref_taxonomy = "silva=138" + cut_dada_ref_taxonomy = false + qiime_ref_taxonomy = "" + + // Boilerplate options + outdir = './results' + enable_conda = false + multiqc_config = '' + multiqc_title = '' + email = '' + email_on_fail = '' + max_multiqc_email_size = 25.MB + plaintext_email = false + monochrome_logs = false + help = false + tracedir = "${params.outdir}/pipeline_info" + custom_config_version = 'master' + custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" + hostnames = false + config_profile_name = null + config_profile_description = false + config_profile_contact = false + config_profile_url = false + singularity_pull_docker_container = false + validate_params = true + show_hidden_params = false + schema_ignore_params = 'dada_ref_databases,qiime_ref_databases,modules,igenomes_base' + + // Defaults only, expecting to be overwritten + max_memory = 128.GB + max_cpus = 16 + max_time = 240.h } // Load base.config by default for all pipelines includeConfig 'conf/base.config' -// Container slug. Stable releases should specify release tag! -// Developmental code should specify :dev -process.container = 'nfcore/ampliseq:1.2.0' - +// Load modules.config for DSL2 module specific options +includeConfig 'conf/modules.config' +// Load ref_databases.config for all pipelines +includeConfig 'conf/ref_databases.config' + // Load nf-core custom profiles from different Institutions try { - includeConfig "${params.custom_config_base}/nfcore_custom.config" + includeConfig "${params.custom_config_base}/nfcore_custom.config" } catch (Exception e) { - System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") + System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") } // Load nf-core/ampliseq custom profiles from different Institutions try { - includeConfig "${params.custom_config_base}/pipeline/ampliseq.config" + includeConfig "${params.custom_config_base}/pipeline/ampliseq.config" } catch (Exception e) { - System.err.println("WARNING: Could not load nf-core/config/ampliseq profiles: ${params.custom_config_base}/pipeline/ampliseq.config") + System.err.println("WARNING: Could not load nf-core/config/ampliseq profiles: ${params.custom_config_base}/pipeline/ampliseq.config") } profiles { - conda { process.conda = "$projectDir/environment.yml" } - debug { process.beforeScript = 'echo $HOSTNAME' } - docker { - docker.enabled = true - // Avoid this error: - // WARNING: Your kernel does not support swap limit capabilities or the cgroup is not mounted. Memory limited without swap. - // Testing this in nf-core after discussion here https://github.com/nf-core/tools/pull/351 - // once this is established and works well, nextflow might implement this behavior as new default. 
- docker.runOptions = '-u \$(id -u):\$(id -g)' - } - singularity { - singularity.enabled = true - singularity.autoMounts = true - } - podman { - podman.enabled = true - } - test { includeConfig 'conf/test.config' } - test_multi { includeConfig 'conf/test_multi.config' } - test_manifest { includeConfig 'conf/test_manifest.config' } - test_doubleprimers { includeConfig 'conf/test_doubleprimers.config' } - test_pacbio_its { includeConfig 'conf/test_pacbio_its.config' } - test_full { includeConfig 'conf/test_full.config' } + conda { + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + debug { process.beforeScript = 'echo $HOSTNAME' } + conda { params.enable_conda = true } + docker { + docker.enabled = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + // Avoid this error: + // WARNING: Your kernel does not support swap limit capabilities or the cgroup is not mounted. Memory limited without swap. + // Testing this in nf-core after discussion here https://github.com/nf-core/tools/pull/351 + // once this is established and works well, nextflow might implement this behavior as new default. + docker.runOptions = '-u \$(id -u):\$(id -g)' + } + singularity { + docker.enabled = false + singularity.enabled = true + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + singularity.autoMounts = true + } + podman { + singularity.enabled = false + docker.enabled = false + podman.enabled = true + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + singularity.enabled = false + docker.enabled = false + podman.enabled = false + shifter.enabled = true + charliecloud.enabled = false + } + charliecloud { + singularity.enabled = false + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = true + } + test { includeConfig 'conf/test.config' } + test_multi { includeConfig 'conf/test_multi.config' } + test_doubleprimers { includeConfig 'conf/test_doubleprimers.config' } + test_pacbio_its { includeConfig 'conf/test_pacbio_its.config' } + test_iontorrent { includeConfig 'conf/test_iontorrent.config' } + test_full { includeConfig 'conf/test_full.config' } } // Export these variables to prevent local Python/R libraries from conflicting with those in the container env { - PYTHONNOUSERSITE = 1 - R_PROFILE_USER = "/.Rprofile" - R_ENVIRON_USER = "/.Renviron" + PYTHONNOUSERSITE = 1 + R_PROFILE_USER = "/.Rprofile" + R_ENVIRON_USER = "/.Renviron" } // Capture exit codes from upstream processes when piping process.shell = ['/bin/bash', '-euo', 'pipefail'] +def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { - enabled = true - file = "${params.tracedir}/execution_timeline.html" + enabled = true + file = "${params.tracedir}/execution_timeline_${trace_timestamp}.html" } report { - enabled = true - file = "${params.tracedir}/execution_report.html" + enabled = true + file = "${params.tracedir}/execution_report_${trace_timestamp}.html" } trace { - enabled = true - file = "${params.tracedir}/execution_trace.txt" + enabled = true + file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt" } dag { - enabled = true - file = "${params.tracedir}/pipeline_dag.svg" + enabled = true + file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.svg" } manifest { - name = 'nf-core/ampliseq' - author = 'Daniel Straub, Alexander Peltzer' - homePage = 
'https://github.com/nf-core/ampliseq' - description = '16S rRNA amplicon sequencing analysis workflow using QIIME2' - mainScript = 'main.nf' - nextflowVersion = '>=20.04.0' - version = '1.2.0' + name = 'nf-core/ampliseq' + author = 'Daniel Straub, Alexander Peltzer' + homePage = 'https://github.com/nf-core/ampliseq' + description = 'Amplicon sequencing analysis workflow using DADA2 and QIIME2' + mainScript = 'main.nf' + nextflowVersion = '!>=21.04.0' + version = '2.0.0' } // Function to ensure that resource requirements don't go beyond // a maximum limit def check_max(obj, type) { - if (type == 'memory') { - try { - if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) - return params.max_memory as nextflow.util.MemoryUnit - else - return obj - } catch (all) { - println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj" - return obj - } - } else if (type == 'time') { - try { - if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) - return params.max_time as nextflow.util.Duration - else - return obj - } catch (all) { - println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" - return obj + if (type == 'memory') { + try { + if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) + return params.max_memory as nextflow.util.MemoryUnit + else + return obj + } catch (all) { + println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj" + return obj + } + } else if (type == 'time') { + try { + if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) + return params.max_time as nextflow.util.Duration + else + return obj + } catch (all) { + println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" + return obj + } + } else if (type == 'cpus') { + try { + return Math.min( obj, params.max_cpus as int ) + } catch (all) { + println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! Using default value: $obj" + return obj + } } - } else if (type == 'cpus') { - try { - return Math.min( obj, params.max_cpus as int ) - } catch (all) { - println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! Using default value: $obj" - return obj - } - } -} +} \ No newline at end of file diff --git a/nextflow_schema.json b/nextflow_schema.json index f353f0b8..5211757e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -2,7 +2,7 @@ "$schema": "http://json-schema.org/draft-07/schema", "$id": "https://raw.githubusercontent.com/nf-core/ampliseq/master/nextflow_schema.json", "title": "nf-core/ampliseq pipeline parameters", - "description": "16S rRNA amplicon sequencing analysis workflow using QIIME2", + "description": "Amplicon sequencing analysis workflow using DADA2 and QIIME2", "type": "object", "definitions": { "main_arguments": { @@ -14,109 +14,175 @@ "input": { "type": "string", "fa_icon": "fas fa-dna", - "description": "Folder containing paired-end demultiplexed FastQ files", - "help_text": "Use this to specify the location of your input paired-end FastQ files. \n\nFor example:\n\n```bash\n--input 'path/to/data'\n```\n\nExample for input data organization from one sequencing run with two samples:\n\n```bash\ndata\n |-sample1_1_L001_R1_001.fastq.gz\n |-sample1_1_L001_R2_001.fastq.gz\n |-sample2_1_L001_R1_001.fastq.gz\n |-sample2_1_L001_R2_001.fastq.gz\n```\n\nPlease note the following requirements:\n\n1. The path must be enclosed in quotes\n2. 
The folder must contain gzip compressed paired-end demultiplexed fastq files. If the file names do not follow the default (`\"/*_R{1,2}_001.fastq.gz\"`), please check `--extension`.\n3. If your data is scattered, a directory with symlinks to your actual data might be a solution.\n4. All sequencing data should originate from one sequencing run, because processing relies on run-specific error models that are unreliable when data from several sequencing runs are mixed. Sequencing data originating from multiple sequencing runs requires additionally the parameter `--multipleSequencingRuns` and a specific folder structure." + "description": "Either a tab-separated sample sheet, a fasta file, or a folder containing zipped FastQ files", + "help_text": "Points to the main pipeline input, one of the following:\n- folder containing compressed fastq files\n- sample sheet ending with `.tsv` that points towards compressed fastq files\n- fasta file ending with `.fasta`, `.fna` or `.fa` that will be taxonomically classified\n\nRelated parameters are:\n- `--pacbio` and `--iontorrent` if the sequencing data is PacBio data or IonTorrent data (default expected: paired-end Illumina data)\n- `--single_end` if the sequencing data is single-ended Illumina data (default expected: paired-end Illumina data)\n- `--multiple_sequencing_runs` (folder input only) if the sequencing data originates from multiple sequencing runs\n- `--extension` (folder input only) if the sequencing file names do not follow the default (`\"/*_R{1,2}_001.fastq.gz\"`)\n- `--dada_ref_taxonomy` and `--qiime_ref_taxonomy` to choose an appropriate reference taxonomy for the type of amplicon (16S/18S/ITS) (default: 16S rRNA sequence database)\n\n##### Folder containing zipped FastQ files\n\nFor example:\n\n```bash\n--input 'path/to/data'\n```\n\nExample for input data organization from one sequencing run with two samples, paired-end data:\n\n```bash\ndata\n |-sample1_1_L001_R1_001.fastq.gz\n |-sample1_1_L001_R2_001.fastq.gz\n |-sample2_1_L001_R1_001.fastq.gz\n |-sample2_1_L001_R2_001.fastq.gz\n```\n\nPlease note the following requirements:\n\n1. The path must be enclosed in quotes\n2. The folder must contain gzip compressed demultiplexed fastq files. If the file names do not follow the default (`\"/*_R{1,2}_001.fastq.gz\"`), please check `--extension`.\n3. Sample identifiers are extracted from file names, i.e. the string before the first underscore `_`; these must be unique\n4. If your data is scattered, produce a sample sheet\n5. All sequencing data should originate from one sequencing run, because processing relies on run-specific error models that are unreliable when data from several sequencing runs are mixed. 
Sequencing data originating from multiple sequencing runs requires additionally the parameter `--multiple_sequencing_runs` and a specific folder structure.\n\n##### Sample sheet\n\nThe sample sheet file is an alternative way to provide input reads, it must be a tab-separated file ending with `.tsv` that must have two to four columns with the following headers: \n- `sampleID` (required): Unique sample identifiers, any unique string (may not contain dots `.`)\n- `forwardReads` (required): Paths to (forward) reads zipped FastQ files\n- `reverseReads` (optional): Paths to reverse reads zipped FastQ files, required if the data is paired-end\n- `run` (optional): If the data was produced by multiple sequencing runs, any string\n\nFor example:\n\n```bash\n--input 'path/to/samplesheet.tsv'\n```\n\n##### Fasta file\n\nWhen pointing at a file ending with `.fasta`, `.fna` or `.fa`, the containing sequences will be taxonomically classified. All other pipeline steps will be skipped.\n\nThis can be used to taxonomically classify previously produced ASV/OTU sequences.\n\nFor example:\n\n```bash\n--input 'path/to/amplicon_sequences.fasta'\n```" }, "FW_primer": { "type": "string", "description": "Forward primer sequence", - "help_text": "In amplicon sequencing methods, PCR with specific primers produces the amplicon of interest. These primer sequences need to be trimmed from the reads before further processing and are also required for producing an appropriate classifier. \n\nFor example:\n\n```bash\n--FW_primer GTGYCAGCMGCCGCGGTAA --RV_primer GGACTACNVGGGTWTCTAAT\n```", + "help_text": "In amplicon sequencing methods, PCR with specific primers produces the amplicon of interest. These primer sequences need to be trimmed from the reads before further processing and are also required for producing an appropriate classifier. Do not use here any technical sequence such as adapter sequences but only the primer sequence that matches the biological amplicon.\n\nFor example:\n\n```bash\n--FW_primer \"GTGYCAGCMGCCGCGGTAA\" --RV_primer \"GGACTACNVGGGTWTCTAAT\"\n```", "fa_icon": "fas fa-arrow-circle-right" }, "RV_primer": { "type": "string", "description": "Reverse primer sequence", - "help_text": "In amplicon sequencing methods, PCR with specific primers produces the amplicon of interest. These primer sequences need to be trimmed from the reads before further processing and are also required for producing an appropriate classifier. \n\nFor example:\n\n```bash\n--FW_primer GTGYCAGCMGCCGCGGTAA --RV_primer GGACTACNVGGGTWTCTAAT\n```", + "help_text": "In amplicon sequencing methods, PCR with specific primers produces the amplicon of interest. These primer sequences need to be trimmed from the reads before further processing and are also required for producing an appropriate classifier. Do not use here any technical sequence such as adapter sequences but only the primer sequence that matches the biological amplicon.\n\nFor example:\n\n```bash\n--FW_primer GTGYCAGCMGCCGCGGTAA --RV_primer GGACTACNVGGGTWTCTAAT\n```", "fa_icon": "fas fa-arrow-alt-circle-left" }, "metadata": { "type": "string", "description": "Path to metadata sheet, when missing most downstream analysis are skipped (barplots, PCoA plots, ...).", - "help_text": "This is optional, but for performing downstream analysis such as barplots, diversity indices or differential abundance testing, a metadata file is essential. \n\nFor example:\n\n```bash\n--metadata \"path/to/metadata.tsv\"\n```\n\nPlease note the following requirements:\n\n1. 
The path must be enclosed in quotes\n2. The metadata file has to follow the QIIME2 specifications (https://docs.qiime2.org/2019.10/tutorials/metadata/)\n3. In case of multiple sequencing runs, specific naming of samples are required, see `--multipleSequencingRuns`\n\nThe first column in the metadata file is the identifier (ID) column and defines the sample or feature IDs associated with your study. Metadata files are not required to have additional metadata columns, so a file containing only an ID column is a valid QIIME 2 metadata file. Additional columns defining metadata associated with each sample or feature ID are optional.\n**NB**: without additional columns there might be no groupings for the downstream analyses.\n\nIdentifiers should be 36 characters long or less, and also contain only ASCII alphanumeric characters (i.e. in the range of [a-z], [A-Z], or [0-9]), the period (.) character, or the dash (-) character. By default all numeric columns, blanks or NA are removed, and only columns with multiple different values but not all unique are selected.\n\nThe columns which are to be assessed can be specified by `--metadata_category`. If `--metadata_category` isn't specified than all columns that fit the specification are automatically chosen.", - "fa_icon": "fas fa-file-csv" - }, - "multipleSequencingRuns": { - "type": "boolean", - "description": "If samples were sequenced in multiple sequencing runs", - "help_text": "Expects one subfolder per sequencing run in the folder specified by `--input` containing sequencing data of the specific run.\n\nTo prevent overlapping sample names from multiple sequencing runs, sample names obtained from the sequencing files will be renamed automatically by adding the folder name as prefix separated by a string specified by `--split`. Accordingly, the sample name column in the metadata file `--metadata` require values following `subfolder-samplename`.\n\nExample for input data organization:\n\n```bash\ndata\n |-run1\n | |-sample1_1_L001_R{1,2}_001.fastq.gz\n | |-sample2_1_L001_R{1,2}_001.fastq.gz\n |\n |-run2\n |-sample3_1_L001_R{1,2}_001.fastq.gz\n |-sample4_1_L001_R{1,2}_001.fastq.gz\n```\n\nIn this example the first column in the metadata file requires the values `run1-sample1` ... `run2-sample4` (instead of `sample1`, ..., `sample4`).\n\nExample command to analyze this data in one pipeline run:\n\n```bash\nnextflow run nf-core/ampliseq \\\n -profile singularity \\\n --input \"data\" \\\n --FW_primer GTGYCAGCMGCCGCGGTAA \\\n --RV_primer GGACTACNVGGGTWTCTAAT \\\n --metadata \"data/Metadata.tsv\" \\\n --multipleSequencingRuns\n```\n\n##### Visually choosing sequencing read truncation cutoffs\n\nWhile `--untilQ2import` with `--multipleSequencingRuns` is currently supported, `--Q2imported` is not. 
The pipeline can be first run with `--untilQ2import`, than `--trunclenf` and `--trunclenr` are visually chosen, and then the pipeline can be continued without `--untilQ2import` but with `--trunlenf`, `--trunlenr`, and `-resume`.\n\nFor example:\n\n(1) To produce quality plots and choose truncation values:\n\n```bash\nnextflow run nf-core/ampliseq \\\n -profile singularity \\\n --input \"data\" \\\n --FW_primer GTGYCAGCMGCCGCGGTAA \\\n --RV_primer GGACTACNVGGGTWTCTAAT \\\n --metadata \"data/Metadata.tsv\" \\\n --multipleSequencingRuns \\\n --untilQ2import\n```\n\n(2) To finish analysis:\n\n```bash\nnextflow run nf-core/ampliseq \\\n -profile singularity \\\n --input \"data\" \\\n --FW_primer GTGYCAGCMGCCGCGGTAA \\\n --RV_primer GGACTACNVGGGTWTCTAAT \\\n --metadata \"data/Metadata.tsv\" \\\n --multipleSequencingRuns \\\n --trunclenf 200 \\\n --trunclenr 180 \\\n -resume\n```", - "fa_icon": "fas fa-running" - }, - "manifest": { - "type": "string", - "description": "Path to tab-separated table with sample IDs and paths to sequencing files", - "help_text": "You can submit a manifest file as an alternative way to provide input reads. No submission of read files with `--input` is required this way.\n\nA manifest must be a tab-separated file that must have the following labels in this exact order: `sampleID`, `forwardReads`, `reverseReads`. In case of single-end reads, such as PacBio data, the labels should be: `sampleID`, `Reads`. The sample identifiers must be listed under `sampleID`. Paths to forward and reverse reads must be reported under `forwardReads` and `reverseReads`, respectively. Path to single-end must be reported under `Reads`.\n\nMultiple sequencing runs not supported by `--manifest` at this stage.", + "help_text": "This is optional, but for performing downstream analysis such as barplots, diversity indices or differential abundance testing, a metadata file is essential.\n\nRelated parameter:\n- `--metadata_category` (optional) to choose columns that are used for testing significance\n\nFor example:\n\n```bash\n--metadata \"path/to/metadata.tsv\"\n```\n\nPlease note the following requirements:\n\n1. The path must be enclosed in quotes\n2. The metadata file has to follow the QIIME2 specifications (https://docs.qiime2.org/2021.2/tutorials/metadata/)\n\nThe first column in the tab-separated metadata file is the sample identifier column (required header: `ID`) and defines the sample or feature IDs associated with your study. Metadata files are not required to have additional metadata columns, so a file containing only an ID column is a valid QIIME 2 metadata file. Additional columns defining metadata associated with each sample or feature ID are optional.\n**NB**: without additional columns there might be no groupings for the downstream analyses.\n\nSample identifiers should be 36 characters long or less, and also contain only ASCII alphanumeric characters (i.e. in the range of [a-z], [A-Z], or [0-9]), or the dash (-) character. For downstream analysis, by default all numeric columns, blanks or NA are removed, and only columns with multiple different values but not all unique are selected.\n\nThe columns which are to be assessed can be specified by `--metadata_category`. 
If `--metadata_category` isn't specified, then all columns that fit the specification are automatically chosen.", "fa_icon": "fas fa-file-csv" } }, "required": [ + "input", "FW_primer", "RV_primer" ], "fa_icon": "fas fa-terminal" }, + "other_input_output_options": { + "title": "Other input/output options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Define where the pipeline should find input data and save output data.", + "properties": { + "pacbio": { + "type": "boolean", + "description": "If data is single-ended PacBio reads instead of Illumina" + }, + "iontorrent": { + "type": "boolean", + "description": "If data is single-ended IonTorrent reads instead of Illumina" + }, + "single_end": { + "type": "boolean", + "description": "If data is single-ended Illumina reads instead of paired-end" + }, + "cut_its": { + "type": "boolean", + "description": "If data is long read ITS sequences that need to be cut to the ITS region only for taxonomy assignment" + }, + "multiple_sequencing_runs": { + "type": "boolean", + "description": "If samples were sequenced in multiple sequencing runs", + "help_text": "Expects one sub-folder per sequencing run in the folder specified by `--input` containing sequencing data of the specific run.\nSample identifiers are taken from sequencing files, specifically the string before the first underscore will be the sample ID. Sample IDs across all sequencing runs (all sequencing files) have to be unique. If this is not the case, please use a sample sheet as input instead.\n\nExample for input data organization:\n\n```bash\ndata\n |-run1\n | |-sample1_1_L001_R1_001.fastq.gz\n | |-sample1_1_L001_R2_001.fastq.gz\n | |-sample2_1_L001_R1_001.fastq.gz\n | |-sample2_1_L001_R2_001.fastq.gz\n |\n |-run2\n |-sample3_1_L001_R1_001.fastq.gz\n |-sample3_1_L001_R2_001.fastq.gz\n |-sample4_1_L001_R1_001.fastq.gz\n |-sample4_1_L001_R2_001.fastq.gz\n```\n\nExample command to analyze this data in one pipeline run:\n\n```bash\nnextflow run nf-core/ampliseq \\\n -profile singularity \\\n --input \"data\" \\\n --FW_primer \"GTGYCAGCMGCCGCGGTAA\" \\\n --RV_primer \"GGACTACNVGGGTWTCTAAT\" \\\n --metadata \"data/Metadata.tsv\" \\\n --multiple_sequencing_runs\n```", + "fa_icon": "fas fa-running" + }, + "illumina_pe_its": { + "type": "boolean", + "description": "If analysing ITS amplicons or any other region with large length variability with Illumina paired end reads", + "help_text": "This will cause the pipeline to\n- not truncate input reads unless `--trunclenf` and `--trunclenr` override the defaults\n- remove reverse complement primers from the end of reads in case the read length exceeds the amplicon length" + }, + "concatenate_reads": { + "type": "boolean", + "description": "Not recommended: When paired end reads are not sufficiently overlapping for merging.", + "help_text": "This parameter specifies that paired-end reads are not merged after denoising but concatenated (separated by 10 N's). This is of advantage when an amplicon was sequenced that is too long for merging (i.e. bad experimental design). This is an alternative to only analyzing the forward or reverse read in case of non-overlapping paired-end sequencing data.\n\n**This parameter is not recommended! 
Only if all other options fail.**" + }, + "sample_inference": { + "type": "string", + "default": "independent", + "help_text": "Whether samples are treated independently (lowest sensitivity and lowest resources), pooled (highest sensitivity and resources) or pseudo-pooled (balance between required resources and sensitivity).", + "description": "Mode of sample inference: \"independent\", \"pooled\" or \"pseudo\"", + "enum": [ + "independent", + "pooled", + "pseudo" + ] + }, + "metadata_category": { + "type": "string", + "description": "Comma separated list of metadata column headers for statistics.", + "help_text": "Here columns in the metadata sheet can be chosen with groupings that are used for diversity indices and differential abundance analysis. By default, all suitable columns in the metadata sheet will be used if this option is not specified. Suitable columns are those which are categorical (not numerical) and have multiple different values which are not all unique. For example:\n\n```bash\n--metadata_category \"treatment1,treatment2\"\n```\n\nPlease note the following requirements:\n\n1. Comma separated list enclosed in quotes\n2. May not contain whitespace characters\n3. Each comma separated term has to match exactly one column name in the metadata sheet" + }, + "extension": { + "type": "string", + "default": "/*_R{1,2}_001.fastq.gz", + "description": "Naming of sequencing files", + "help_text": "Indicates the naming of sequencing files (default: `\"/*_R{1,2}_001.fastq.gz\"`).\n\nPlease note:\n\n1. The prepended slash (`/`) is required\n2. The star (`*`) is the required wildcard for sample names\n3. The curly brackets (`{}`) enclose the orientation for paired end reads, separated by a comma (`,`).\n4. The pattern must be enclosed in quotes\n\nFor example for one sample (name: `1`) with forward (file: `1_a.fastq.gz`) and reverse (file: `1_b.fastq.gz`) reads in folder `data`:\n\n```bash\n--input \"data\" --extension \"/*_{a,b}.fastq.gz\"\n```" + }, + "outdir": { + "type": "string", + "description": "The output directory where the results will be saved.", + "default": "./results", + "fa_icon": "fas fa-folder-open" + }, + "email": { + "type": "string", + "description": "Email address for completion summary.", + "fa_icon": "fas fa-envelope", + "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + } + } + }, "cutoffs": { "title": "Cutoffs", "type": "object", "description": "", "default": "", "properties": { - "maxEE": { - "type": "integer", - "default": 2, - "description": "DADA2 read filtering option", - "help_text": "After truncation, reads with higher than \u2018maxEE\u2019 \"expected errors\" will be discarded. In case of very long reads, you might want to increase this value. We recommend (to start with) a value corresponding to approximately 1 expected error per 100-200 bp (default: 2)", - "fa_icon": "fas fa-equals" - }, - "maxLen": { - "type": "integer", - "default": 2999, - "description": "DADA2 read filtering option [PacBio only]", - "fa_icon": "fas fa-less-than-equal", - "help_text": "Remove reads with length greater than maxLen after trimming and truncation."
- }, - "minLen": { - "type": "integer", - "default": 50, - "description": "DADA2 read filtering option [PacBio only]", - "fa_icon": "fas fa-greater-than-equal", - "help_text": "Remove reads with length less than minLen after trimming and truncation." - }, "retain_untrimmed": { "type": "boolean", "description": "Cutadapt will retain untrimmed reads, choose only if input reads are not expected to contain primer sequences.", - "help_text": "When read sequences are trimmed, untrimmed read pairs are discarded routinely. Use this option to retain untrimmed read pairs. This is usually not recommended and is only of advantage for specific protocols that prevent sequencing PCR primers. \n\nFor example:\n\n```bash\n--retain_untrimmed\n```", + "help_text": "When read sequences are trimmed, untrimmed read pairs are discarded routinely. Use this option to retain untrimmed read pairs. This is usually not recommended and is only of advantage for specific protocols that prevent sequencing PCR primers. ", "fa_icon": "far fa-plus-square" }, "double_primer": { "type": "boolean", "description": "Cutadapt will be run twice to ensure removal of potential double primers", - "help_text": "Cutdapt will be run twice, first to remove reads without primers (default), then a second time to remove reads that erroneously contain a second set of primers, not to be used with `--retain_untrimmed`", + "help_text": "Cutadapt will be run twice, first to remove reads without primers (default), then a second time to remove reads that erroneously contain a second set of primers, not to be used with `--retain_untrimmed`.", "fa_icon": "fas fa-project-diagram" }, "trunclenf": { "type": "integer", "description": "DADA2 read truncation value for forward strand, set this to 0 for no truncation", - "help_text": "Read denoising by DADA2 creates an error profile specific to a sequencing run and uses this to correct sequencing errors. This method requires all reads to have the same length and as high quality as possible while maintaining at least 20 bp overlap for merging. One cutoff for the forward read `--trunclenf` and one for the reverse read `--trunclenr` truncate all longer reads at that position and drop all shorter reads.\nThese cutoffs are usually chosen visually using `--untilQ2import`, inspecting the quality plots in \"results/demux\", and resuming analysis with `--Q2imported`. If not set, these cutoffs will be determined automatically for the position before the mean quality score drops below `--trunc_qmin`.\n\nFor example:\n\n```bash\n--trunclenf 180 --trunclenr 120\n```\n\nPlease note:\n\n1. Overly aggressive truncation might lead to insufficient overlap for read merging\n2. Too little truncation might reduce denoised reads\n3. The code choosing these values automatically cannot take the points above into account, therefore setting `--trunclenf` and `--trunclenr` is recommended", + "help_text": "Read denoising by DADA2 creates an error profile specific to a sequencing run and uses this to correct sequencing errors. This method prefers all reads to have the same length and as high quality as possible while maintaining at least 20 bp overlap for merging. 
One cutoff for the forward read `--trunclenf` and one for the reverse read `--trunclenr` truncate all longer reads at that position and drop all shorter reads.\nIf not set, these cutoffs will be determined automatically for the position before the mean quality score drops below `--trunc_qmin`.\n\nFor example:\n\n```bash\n--trunclenf 180 --trunclenr 120\n```\n\nPlease note:\n\n1. Overly aggressive truncation might lead to insufficient overlap for read merging\n2. Too little truncation might reduce denoised reads\n3. The code choosing these values automatically cannot take the points above into account, therefore checking read numbers is essential", "fa_icon": "fas fa-ban" }, "trunclenr": { "type": "integer", "description": "DADA2 read truncation value for reverse strand, set this to 0 for no truncation", - "help_text": "Read denoising by DADA2 creates an error profile specific to a sequencing run and uses this to correct sequencing errors. This method requires all reads to have the same length and as high quality as possible while maintaining at least 20 bp overlap for merging. One cutoff for the forward read `--trunclenf` and one for the reverse read `--trunclenr` truncate all longer reads at that position and drop all shorter reads.\nThese cutoffs are usually chosen visually using `--untilQ2import`, inspecting the quality plots in \"results/demux\", and resuming analysis with `--Q2imported`. If not set, these cutoffs will be determined automatically for the position before the mean quality score drops below `--trunc_qmin`.\n\nFor example:\n\n```bash\n--trunclenf 180 --trunclenr 120\n```\n\nPlease note:\n\n1. Overly aggressive truncation might lead to insufficient overlap for read merging\n2. Too little truncation might reduce denoised reads\n3. The code choosing these values automatically cannot take the points above into account, therefore setting `--trunclenf` and `--trunclenr` is recommended", + "help_text": "Read denoising by DADA2 creates an error profile specific to a sequencing run and uses this to correct sequencing errors. This method prefers when all reads to have the same length and as high quality as possible while maintaining at least 20 bp overlap for merging. One cutoff for the forward read `--trunclenf` and one for the reverse read `--trunclenr` truncate all longer reads at that position and drop all shorter reads.\nIf not set, these cutoffs will be determined automatically for the position before the mean quality score drops below `--trunc_qmin`.\n\nFor example:\n\n```bash\n--trunclenf 180 --trunclenr 120\n```\n\nPlease note:\n\n1. Overly aggressive truncation might lead to insufficient overlap for read merging\n2. Too little truncation might reduce denoised reads\n3. The code choosing these values automatically cannot take the points above into account, therefore checking read numbers is essential", "fa_icon": "fas fa-ban" }, "trunc_qmin": { "type": "integer", "default": 25, "description": "If --trunclenf and --trunclenr are not set, these values will be automatically determined using this median quality score", - "help_text": "Automatically determine `--trunclenf` and `--trunclenr` before the median quality score drops below `--trunc_qmin` (default: 25). The fraction of reads retained is defined by `--trunc_rmin`, which might override the quality cutoff.\n\nFor example:\n\n```bash\n--trunc_qmin 35\n```\n\nPlease note:\n\n1. 
The code choosing `--trunclenf` and `--trunclenr` using `--trunc_qmin` automatically cannot take amplicon length or overlap requirements for merging into account, therefore use with caution.\n2. The default value of 25 is recommended. However, high quality data with a large paired sequence overlap might justify a higher value (e.g. 35). Also, very low quality data might require a lower value.\n3. If the quality cutoff is too low to include a certain fraction of reads that is specified by `--trunc_rmin` (default: 0.75, meaning at least 75% percent of reads are retained), a lower cutoff according to `--trunc_rmin` superseeds the quality cutoff.", + "help_text": "Automatically determine `--trunclenf` and `--trunclenr` before the median quality score drops below `--trunc_qmin`. The fraction of reads retained is defined by `--trunc_rmin`, which might override the quality cutoff.\n\nFor example:\n\n```bash\n--trunc_qmin 35\n```\n\nPlease note:\n\n1. The code choosing `--trunclenf` and `--trunclenr` using `--trunc_qmin` automatically cannot take amplicon length or overlap requirements for merging into account, therefore use with caution.\n2. A minimum value of 25 is recommended. However, high quality data with a large paired sequence overlap might justify a higher value (e.g. 35). Also, very low quality data might require a lower value.\n3. If the quality cutoff is too low to include a certain fraction of reads that is specified by `--trunc_rmin` (e.g. 0.75 means at least 75% percent of reads are retained), a lower cutoff according to `--trunc_rmin` superseeds the quality cutoff.", "fa_icon": "fas fa-greater-than-equal" }, "trunc_rmin": { "type": "number", "default": 0.75, "description": "Assures that values chosen with --trunc_qmin will retain a fraction of reads.", - "help_text": "Value can range from 0 to 1. 0 means no reads need to be retained and 1 means all reads need to be retained. The minimum lengths of --trunc_qmin and --trunc_rmin are chosen as DADA2 cutoffs." + "help_text": "Value can range from 0 to 1. 0 means no reads need to be retained and 1 means all reads need to be retained. The minimum lengths of --trunc_qmin and --trunc_rmin are chosen as DADA2 cutoffs.", + "minimum": 0, + "maximum": 1 + }, + "max_ee": { + "type": "integer", + "default": 2, + "description": "DADA2 read filtering option", + "help_text": "After truncation, reads with higher than `max_ee` \"expected errors\" will be discarded. In case of very long reads, you might want to increase this value. We recommend (to start with) a value corresponding to approximately 1 expected error per 100-200 bp (default: 2)", + "fa_icon": "fas fa-equals" + }, + "max_len": { + "type": "string", + "description": "DADA2 read filtering optio", + "fa_icon": "fas fa-less-than-equal", + "help_text": "Remove reads with length greater than `max_len` after trimming and truncation. Must be a positive integer. Default is \"Inf\", therefore no removal of any reads.", + "default": "Inf" + }, + "min_len": { + "type": "integer", + "default": 50, + "description": "DADA2 read filtering option", + "fa_icon": "fas fa-greater-than-equal", + "help_text": "Remove reads with length less than `min_len` after trimming and truncation." 
} }, "fa_icon": "fas fa-filter" @@ -127,32 +193,50 @@ "description": "", "default": "", "properties": { - "reference_database": { + "dada_ref_taxonomy": { "type": "string", - "default": "https://www.arb-silva.de/fileadmin/silva_databases/qiime/Silva_132_release.zip", - "description": "Path to taxonomic reference database, currently accepts a qiime compatible file Silva_132_release.zip or a UNITE fasta file", - "help_text": "By default, the workflow downloads SILVA (https://www.arb-silva.de/) v132 (https://www.arb-silva.de/documentation/release-132/) and extracts reference sequences and taxonomy clustered at 99% similarity and trains a Naive Bayes classifier to assign taxonomy to features." + "help_text": "Choose any of the supported databases, and optionally also specify the version. Database and version are separated by an equal sign (`=`, e.g. `silva=138`) . This will download the desired database, format it to produce a file that is compatible with DADA2's assignTaxonomy and another file that is compatible with DADA2's addSpecies.\n\nThe following databases are supported:\n- GTDB - Genome Taxonomy Database - 16S rRNA\n- PR2 - Protist Reference Ribosomal Database - 18S rRNA\n- RDP - Ribosomal Database Project - 16S rRNA\n- SILVA ribosomal RNA gene database project - 16S rRNA\n- UNITE - eukaryotic nuclear ribosomal ITS region - ITS\n\nGenerally, using `gtdb`, `pr2`, `rdp`, `silva`, `unite-fungi`, or `unite-alleuk` will select the most recent supported version. For details on what values are valid, please either use an invalid value such as `x` (causing the pipeline to send an error message with a list of all valid values) or see `conf/ref_databases.config`.\n\nPlease note that commercial/non-academic entities [require licensing](https://www.arb-silva.de/silva-license-information) for SILVA v132 database (non-default) but not from v138 on (default).", + "description": "Name of supported database, and optionally also version number", + "default": "silva=138", + "enum": [ + "gtdb=05-RS95", + "gtdb", + "pr2=4.13.0", + "pr2", + "rdp=18", + "rdp", + "silva=132", + "silva=138", + "silva", + "unite-fungi=8.2", + "unite-fungi", + "unite-alleuk=8.2", + "unite-alleuk" + ] }, - "taxon_reference": { + "cut_dada_ref_taxonomy": { + "type": "boolean", + "help_text": "Expected amplified sequences are extracted from the DADA2 reference taxonomy using the primer sequences, that might improve classification. This is not applied to species classification (assignSpecies) but only for lower taxonomic levels (assignTaxonomy).", + "description": "If the expected amplified sequences are extracted from the DADA2 reference taxonomy database" + }, + "qiime_ref_taxonomy": { "type": "string", - "default": "silva", - "description": "Specify which database to use for taxonomic assignment. Either 'silva' or 'unite' (default: 'silva').", - "help_text": "By default, uses SILVA for taxonomic assignment, but can also use UNITE. If so, specify the UNITE fasta file with --reference_database." + "help_text": "Choose any of the supported databases, and optionally also specify the version. Database and version are separated by an equal sign (`=`, e.g. `silva=138`) . 
This will download the desired database and initiate taxonomic classification with QIIME2 and the chosen database.\n\nIf both, `--dada_ref_taxonomy` and `--qiime_ref_taxonomy` are used, DADA2 classification will be used for downstream analysis.\n\nThe following databases are supported:\n- SILVA ribosomal RNA gene database project - 16S rRNA\n- UNITE - eukaryotic nuclear ribosomal ITS region - ITS\n- Greengenes (only testing!)\n\nGenerally, using `silva`, `unite-fungi`, or `unite-alleuk` will select the most recent supported version. For testing purposes, the tiny database `greengenes85` (dereplicated at 85% sequence similarity) is available. For details on what values are valid, please either use an invalid value such as `x` (causing the pipeline to send an error message with all valid values) or see `conf/ref_databases.config`.", + "description": "Name of supported database, and optionally also version number", + "enum": [ + "silva=138", + "silva", + "unite-fungi=8.2", + "unite-fungi", + "unite-alleuk=8.2", + "unite-alleuk", + "greengenes85" + ] }, "classifier": { "type": "string", "description": "Path to QIIME2 trained classifier file (typically *-classifier.qza)", - "help_text": "If you have trained a compatible classifier before, from sources such as SILVA (https://www.arb-silva.de/), Greengenes (http://greengenes.secondgenome.com/downloads) or RDP (https://rdp.cme.msu.edu/). \n\nFor example:\n\n```bash\n--classifier \"FW_primer-RV_primer-classifier.qza\"\n```\n\nPlease note the following requirements:\n\n1. The path must be enclosed in quotes\n2. The classifier is a Naive Bayes classifier produced by \"qiime feature-classifier fit-classifier-naive-bayes\" (e.g. by this pipeline or from (https://docs.qiime2.org/2019.10/data-resources/))\n3. The primer pair for the amplicon PCR and the computing of the classifier are exactly the same (or fulllength, potentially lower performance)\n4. The classifier has to be trained by the same version of scikit-learn as this version of the pipeline uses (0.21.2)" - }, - "classifier_removeHash": { - "type": "boolean", - "description": "Remove all hash signs from taxonomy strings, resolves a rare ValueError during classification (process classifier)" - }, - "dereplication": { - "type": "integer", - "default": 99, - "description": "Dereplication of the database. Must bematching SILVA v132 and its subfolders. Database size is descreasing, but taxonomical assignments as well.", - "hidden": true + "help_text": "If you have trained a compatible classifier before, from sources such as SILVA (https://www.arb-silva.de/), Greengenes (http://greengenes.secondgenome.com/downloads) or RDP (https://rdp.cme.msu.edu/). \n\nFor example:\n\n```bash\n--classifier \"FW_primer-RV_primer-classifier.qza\"\n```\n\nPlease note the following requirements:\n\n1. The path must be enclosed in quotes\n2. The classifier is a Naive Bayes classifier produced by `qiime feature-classifier fit-classifier-naive-bayes` (e.g. by this pipeline)\n3. The primer pair for the amplicon PCR and the computing of the classifier are exactly the same (or full-length, potentially lower performance)\n4. The classifier has to be trained by the same version of scikit-learn as this version of the pipeline uses" } }, "fa_icon": "fas fa-database" @@ -171,101 +255,19 @@ }, "min_frequency": { "type": "integer", - "default": "1", + "default": 1, "description": "Abundance filtering", "help_text": "Remove entries from the feature table below an absolute abundance threshold (default: 1, meaning filter is disabled). 
Singletons are often regarded as artifacts, choosing a value of 2 removes sequences with less than 2 total counts from the feature table.\n\nFor example to remove singletons choose:\n\n```bash\n--min_frequency 2\n```" }, "min_samples": { "type": "integer", - "default": "1", + "default": 1, "description": "Prevalence filtering", "help_text": "Filtering low prevalent features from the feature table, e.g. keeping only features that are present in at least two samples can be achived by choosing a value of 2 (default: 1, meaning filter is disabled). Typically only used when having replicates for all samples.\n\nFor example to retain features that are present in at least two sample:\n\n```bash\n--min_samples 2\n```\n\nPlease note this is independent of abundance." } }, "fa_icon": "fas fa-filter" }, - "other_input_output_options": { - "title": "Other input/output options", - "type": "object", - "fa_icon": "fas fa-terminal", - "description": "Define where the pipeline should find input data and save output data.", - "properties": { - "readPaths": { - "type": "string", - "hidden": true, - "description": "Path to test sequencing read files" - }, - "metadata_category": { - "type": "string", - "description": "Comma separated list of metadata column headers for statistics.", - "help_text": "Here columns in the metadata sheet can be chosen with groupings that are used for diversity indices and differential abundance analysis. By default, all suitable columns in the metadata sheet will be used if this option is not specified. Suitable are columns which are categorical (not numerical) and have multiple different values which are not all unique. For example:\n\n```bash\n--metadata_category \"treatment1,treatment2\"\n```\n\nPlease note the following requirements:\n\n1. Comma separated list enclosed in quotes\n2. May not contain whitespace characters\n3. Each comma separated term has to match exactly one column name in the metadata sheet" - }, - "pacbio": { - "type": "boolean", - "description": "If PacBio data. Use this option together with --manifest" - }, - "phred64": { - "type": "boolean", - "description": "If the sequencing data has PHRED 64 encoded quality scores, otherwise PHRED 33 is assumed" - }, - "split": { - "type": "string", - "default": "-", - "description": "A string that will be used between the prepended run/folder name and the sample name. Only used with \"--multipleSequencingRuns\".", - "help_text": "A string that will be used between the prepended run/folder name and the sample name. Only used with `--multipleSequencingRuns` (default: `\"-\"`).\n\nFor example using the string `link`:\n\n```bash\n--split \"link\"\n```\n\nPlease note:\n\n1. Run/folder names may not contain the string specified by `--split`\n2. No underscore(s) allowed\n3. Must be enclosed in quotes\n4. The metadata sheet has to be adjusted, instead of using `run-sample` in the first column, in this example `runlinksample` is required" - }, - "extension": { - "type": "string", - "default": "/*_R{1,2}_001.fastq.gz", - "description": "Naming of sequencing files", - "help_text": "Indicates the naming of sequencing files (default: `\"/*_R{1,2}_001.fastq.gz\"`).\n\nPlease note:\n\n1. The prepended slash (`/`) is required\n2. The star (`*`) is the required wildcard for sample names\n3. The curly brackets (`{}`) enclose the orientation for paired end reads, seperated by a comma (`,`).\n4. 
The pattern must be enclosed in quotes\n\nFor example for one sample (name: `1`) with forward (file: `1_a.fastq.gz`) and reverse (file: `1_b.fastq.gz`) reads in folder `data`:\n\n```bash\n--input \"data\" --extension \"/*_{a,b}.fastq.gz\"\n```" - }, - "outdir": { - "type": "string", - "description": "The output directory where the results will be saved.", - "default": "./results", - "fa_icon": "fas fa-folder-open" - }, - "email": { - "type": "string", - "description": "Email address for completion summary.", - "fa_icon": "fas fa-envelope", - "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" - }, - "qiime_timezone": { - "type": "string", - "default": "Europe/Berlin", - "description": "Needs to be specified to resolve a timezone error", - "help_text": "If a timezone error occurs, this parameter needs to be specified (default: 'Europe/Berlin'). Find your appropriate timezone with e.g. tzselect.\nNote, this affects the timezone of the entire software environment." - }, - "keepIntermediates": { - "type": "boolean", - "description": "Keep additional intermediate files, such as trimmed reads or various QIIME2 archives" - } - } - }, - "run_only_partial_workflow": { - "title": "Run only partial workflow", - "type": "object", - "description": "", - "default": "", - "properties": { - "untilQ2import": { - "type": "boolean", - "description": "Skip all steps after importing into QIIME2, used for visually choosing DADA2 parameter `--trunclenf` and `--trunclenr`" - }, - "Q2imported": { - "type": "string", - "description": "Path to imported reads (e.g. \"demux.qza\")", - "help_text": "Analysis starting with a QIIME2 artefact with trimmed reads, typically produced before with `--untilQ2import`. This is only supported for data from a single sequencing run.\n\nFor data from multiple sequencing runs with `--multipleSequencingRuns` the pipeline can be first run with `--untilQ2import` and next run without `--untilQ2import` but with `-resume`." - }, - "onlyDenoising": { - "type": "boolean", - "description": "Skip all steps after denoising, produce only sequences and abundance tables on ASV level" - } - } - }, "skipping_specific_steps": { "title": "Skipping specific steps", "type": "object", @@ -276,22 +278,26 @@ "type": "boolean", "description": "Skip FastQC" }, - "skip_alpha_rarefaction": { + "skip_qiime": { "type": "boolean", - "description": "Skip alpha rarefaction" - }, - "skip_barplot": { - "type": "boolean", - "description": "Skip producing barplot" + "description": "Skip all steps that are executed by QIIME2, including QIIME2 software download, taxonomy assignment by QIIME2, barplots, relative abundance tables, diversity analysis, differential abundance testing." 
}, "skip_taxonomy": { "type": "boolean", "description": "Skip taxonomic classification" }, + "skip_barplot": { + "type": "boolean", + "description": "Skip producing barplot" + }, "skip_abundance_tables": { "type": "boolean", "description": "Skip producing any relative abundance tables" }, + "skip_alpha_rarefaction": { + "type": "boolean", + "description": "Skip alpha rarefaction" + }, "skip_diversity_indices": { "type": "boolean", "description": "Skip alpha and beta diversity analysis" @@ -335,12 +341,12 @@ "move" ] }, - "name": { - "type": "string", - "description": "Workflow name.", - "fa_icon": "fas fa-fingerprint", - "hidden": true, - "help_text": "A custom name for the pipeline run. Unlike the core nextflow `-name` option with one hyphen this parameter can be reused multiple times, for example if using `-resume`. Passed through to steps such as MultiQC and used for things like report filenames and titles." + "validate_params": { + "type": "boolean", + "description": "Boolean whether to validate parameters against the schema at runtime", + "default": true, + "fa_icon": "fas fa-check-square", + "hidden": true }, "email_on_fail": { "type": "string", @@ -385,18 +391,31 @@ "fa_icon": "fas fa-cogs", "hidden": true }, - "clusterOptions": { - "type": "string", - "hidden": true + "show_hidden_params": { + "type": "boolean", + "fa_icon": "far fa-eye-slash", + "description": "Show all params when using `--help`", + "hidden": true, + "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." }, - "awsqueue": { - "type": "string", - "hidden": true + "enable_conda": { + "type": "boolean", + "description": "Run this workflow with Conda. You can also use '-profile conda' instead of providing this parameter.", + "hidden": true, + "fa_icon": "fas fa-bacon" }, - "awsregion": { + "multiqc_title": { "type": "string", - "default": "eu-west-1", - "hidden": true + "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", + "hidden": true, + "fa_icon": "fas fa-file-signature" + }, + "singularity_pull_docker_container": { + "type": "boolean", + "description": "Instead of directly downloading Singularity images for use with Singularity, force the workflow to pull and convert Docker containers instead.", + "hidden": true, + "fa_icon": "fas fa-toolbox", + "help_text": "This may be useful for example if you are unable to directly pull Singularity containers to run the pipeline due to http/https proxy issues." } } }, @@ -429,6 +448,12 @@ "hidden": true, "fa_icon": "fas fa-users-cog" }, + "config_profile_name": { + "type": "string", + "description": "Institutional config name.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, "config_profile_description": { "type": "string", "description": "Institutional config description.", @@ -469,6 +494,7 @@ "description": "Maximum amount of memory that can be requested for any single job.", "default": "128.GB", "fa_icon": "fas fa-memory", + "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", "hidden": true, "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. 
`--max_memory '8.GB'`" }, @@ -477,6 +503,7 @@ "description": "Maximum amount of time that can be requested for any single job.", "default": "240.h", "fa_icon": "far fa-clock", + "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", "hidden": true, "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" } @@ -487,6 +514,9 @@ { "$ref": "#/definitions/main_arguments" }, + { + "$ref": "#/definitions/other_input_output_options" + }, { "$ref": "#/definitions/cutoffs" }, @@ -496,12 +526,6 @@ { "$ref": "#/definitions/filtering" }, - { - "$ref": "#/definitions/other_input_output_options" - }, - { - "$ref": "#/definitions/run_only_partial_workflow" - }, { "$ref": "#/definitions/skipping_specific_steps" }, diff --git a/subworkflows/local/cutadapt_workflow.nf b/subworkflows/local/cutadapt_workflow.nf new file mode 100644 index 00000000..d68f2a6b --- /dev/null +++ b/subworkflows/local/cutadapt_workflow.nf @@ -0,0 +1,61 @@ +/* + * Check input samplesheet or folder and get read channels + */ + +params.standard_options = [:] +params.readthrough_options = [:] +params.doubleprimer_options = [:] +params.summary_options = [:] +params.summary_merge_options = [:] + +include { CUTADAPT } from '../../modules/nf-core/software/cutadapt/main' addParams( options: params.standard_options ) +include { CUTADAPT as CUTADAPT_READTHROUGH } from '../../modules/nf-core/software/cutadapt/main' addParams( options: params.readthrough_options ) +include { CUTADAPT as CUTADAPT_DOUBLEPRIMER } from '../../modules/nf-core/software/cutadapt/main' addParams( options: params.doubleprimer_options ) +include { CUTADAPT_SUMMARY } from '../../modules/local/cutadapt_summary' addParams( options: params.summary_options ) +include { CUTADAPT_SUMMARY as CUTADAPT_SUMMARY_DOUBLEPRIMER } from '../../modules/local/cutadapt_summary' addParams( options: params.summary_options ) +include { CUTADAPT_SUMMARY_MERGE } from '../../modules/local/cutadapt_summary_merge' addParams( options: params.summary_merge_options ) + +workflow CUTADAPT_WORKFLOW { + take: + ch_file + illumina_pe_its + double_primer + main: + CUTADAPT ( ch_file ).reads.set { ch_trimmed_reads } + CUTADAPT.out.log + .map { + info, log -> + def meta = [:] + meta.single_end = info.single_end + [ meta, log ] } + .groupTuple(by: 0 ) + .set { ch_cutadapt_logs } + CUTADAPT_SUMMARY ( "cutadapt_standard", ch_cutadapt_logs ) + + if (illumina_pe_its) { + CUTADAPT_READTHROUGH ( ch_trimmed_reads ).reads.set { ch_trimmed_reads } + } + + if (double_primer) { + CUTADAPT_DOUBLEPRIMER ( ch_trimmed_reads ).reads.set { ch_trimmed_reads } + CUTADAPT_DOUBLEPRIMER.out.log + .map { + info, log -> + def meta = [:] + meta.single_end = info.single_end + [ meta, log ] } + .groupTuple(by: 0 ) + .set { ch_cutadapt_doubleprimer_logs } + CUTADAPT_SUMMARY_DOUBLEPRIMER ( "cutadapt_doubleprimer", ch_cutadapt_doubleprimer_logs ) + ch_summaries = CUTADAPT_SUMMARY.out.tsv.combine( CUTADAPT_SUMMARY_DOUBLEPRIMER.out.tsv ) + CUTADAPT_SUMMARY_MERGE ( "merge", ch_summaries ) + } else { + CUTADAPT_SUMMARY_MERGE ( "copy", CUTADAPT_SUMMARY.out.tsv ) + } + + emit: + reads = ch_trimmed_reads + logs = CUTADAPT.out.log + summary = CUTADAPT_SUMMARY_MERGE.out.tsv + version = CUTADAPT.out.version +} diff --git a/subworkflows/local/parse_input.nf b/subworkflows/local/parse_input.nf new file mode 100644 index 00000000..b3e333d8 --- /dev/null +++ b/subworkflows/local/parse_input.nf @@ -0,0 +1,104 @@ +/* + * Check input samplesheet or folder and get read 
channels + */ + +params.options = [:] + +include { parse_samplesheet } from '../../modules/local/parse_samplesheet' addParams( options: params.options ) + +workflow PARSE_INPUT { + take: + input // file.tsv or folder + single_end + multiple_sequencing_runs + extension + + main: + if ( input.toString().toLowerCase().endsWith(".fasta") || input.toString().toLowerCase().endsWith(".fna") || input.toString().toLowerCase().endsWith(".fa") ) { + // Fasta input directely for classification + ch_fasta = Channel.fromPath(input, checkIfExists: true) + ch_reads = Channel.empty() + } else { + ch_fasta = Channel.empty() + + if ( input.toString().toLowerCase().endsWith("tsv") ) { + // Sample sheet input + + tsvFile = file(input).getName() + // extracts read files from TSV and distribute into channels + Channel + .fromPath(input) + .ifEmpty {exit 1, log.info "Cannot find path file ${tsvFile}"} + .splitCsv(header:true, sep:'\t') + .map { parse_samplesheet(it, single_end) } + .set { ch_reads } + } else { + // Folder input + + //Check folders in folder when multiple_sequencing_runs + folders = multiple_sequencing_runs ? "/*" : "" + if ( single_end ) { + //Get files - single end + Channel + .fromPath( input + folders + extension ) + .ifEmpty { exit 1, "Cannot find any reads matching: \"${input}${extension}\"\nPlease revise the input folder (\"--input\"): \"${input}\"\nand the input file pattern (\"--extension\"): \"${extension}\"\nIf you have multiple sequencing runs, please add \"--multiple_sequencing_runs\".\nNB: Path needs to be enclosed in quotes!" } + .map { read -> + def meta = [:] + meta.id = read.baseName.toString().indexOf("_") != -1 ? read.baseName.toString().take(read.baseName.toString().indexOf("_")) : read.baseName + meta.single_end = single_end.toBoolean() + meta.run = multiple_sequencing_runs ? read.take(read.findLastIndexOf{"/"})[-1] : "1" + [ meta, read ] } + .set { ch_reads } + } else { + //Get files - paired end + Channel + .fromFilePairs( input + folders + extension, size: 2 ) + .ifEmpty { exit 1, "Cannot find any reads matching: \"${input}${extension}\"\nPlease revise the input folder (\"--input\"): \"${input}\"\nand the input file pattern (\"--extension\"): \"${extension}\"\nIf you have multiple sequencing runs, please add \"--multiple_sequencing_runs\".\nNB: Path needs to be enclosed in quotes!" } + .map { name, reads -> + def meta = [:] + meta.id = name.toString().indexOf("_") != -1 ? name.toString().take(name.toString().indexOf("_")) : name + meta.single_end = single_end.toBoolean() + meta.run = multiple_sequencing_runs ? reads[0].take(reads[0].findLastIndexOf{"/"})[-1] : "1" + [ meta, reads ] } + .set { ch_reads } + } + if (multiple_sequencing_runs) { + //Get folder information + ch_reads + .flatMap { meta, reads -> [ meta.run ] } + .unique() + .set { ch_folders } + //Report folders with sequencing files + ch_folders + .collect() + .subscribe { + String folders = it.toString().replace("[", "").replace("]","") + log.info "\nFound the folder(s) \"$folders\" containing sequencing read files matching \"${extension}\" in \"${input}\".\n" } + //Stop if folder count is 1 and multiple_sequencing_runs + ch_folders + .count() + .subscribe { if ( it == 1 ) exit 1, "Found only one folder with read data but \"--multiple_sequencing_runs\" was specified. Please review data input." 
} + } + } + + //Check whether all sampleID = meta.id are unique + ch_reads + .map { meta, reads -> [ meta.id ] } + .toList() + .subscribe { + if( it.size() != it.unique().size() ) { + ids = it.take(10); + exit 1, "Please review data input, sample IDs are not unique! First IDs are $ids" + } + } + + //Check that no dots "." are in sampleID + ch_reads + .map { meta, reads -> [ meta.id ] } + .subscribe { if ( "$it".contains(".") ) exit 1, "Please review data input, sampleIDs may not contain dots, but \"$it\" does." } + } + + emit: + reads = ch_reads + fasta = ch_fasta +} \ No newline at end of file diff --git a/subworkflows/local/qiime2_ancom.nf b/subworkflows/local/qiime2_ancom.nf new file mode 100644 index 00000000..13443cf9 --- /dev/null +++ b/subworkflows/local/qiime2_ancom.nf @@ -0,0 +1,35 @@ +/* + * Diversity indices with QIIME2 + */ + +params.filterasv_options = [:] +params.ancom_tax_options = [:] +params.ancom_asv_options = [:] + +include { QIIME2_FILTERASV } from '../../modules/local/qiime2_filterasv' addParams( options: params.filterasv_options ) +include { QIIME2_ANCOM_TAX } from '../../modules/local/qiime2_ancom_tax' addParams( options: params.ancom_tax_options ) +include { QIIME2_ANCOM_ASV } from '../../modules/local/qiime2_ancom_asv' addParams( options: params.ancom_asv_options ) + +workflow QIIME2_ANCOM { + take: + ch_metadata + ch_asv + ch_metacolumn_all + ch_tax + + main: + //Filter ASV table to get rid of samples that have no metadata values + QIIME2_FILTERASV ( ch_metadata, ch_asv, ch_metacolumn_all ) + + //ANCOM on various taxonomic levels + ch_taxlevel = Channel.from( 2, 3, 4, 5, 6 ) + ch_metadata + .combine( QIIME2_FILTERASV.out.qza.flatten() ) + .combine( ch_tax ) + .combine( ch_taxlevel ) + .set{ ch_for_ancom_tax } + QIIME2_ANCOM_TAX ( ch_for_ancom_tax ) + QIIME2_ANCOM_TAX.out.ancom.subscribe { if ( it.baseName[0].toString().startsWith("WARNING") ) log.warn it.baseName[0].toString().replace("WARNING ","QIIME2_ANCOM_TAX: ") } + + QIIME2_ANCOM_ASV ( ch_metadata.combine( QIIME2_FILTERASV.out.qza.flatten() ) ) +} diff --git a/subworkflows/local/qiime2_diversity.nf b/subworkflows/local/qiime2_diversity.nf new file mode 100644 index 00000000..a1a4a419 --- /dev/null +++ b/subworkflows/local/qiime2_diversity.nf @@ -0,0 +1,66 @@ +/* + * Diversity indices with QIIME2 + */ + +params.tree_options = [:] +params.alphararefaction_options = [:] +params.diversity_core_options = [:] +params.diversity_alpha_options = [:] +params.diversity_beta_options = [:] +params.diversity_betaord_options = [:] + +include { QIIME2_TREE } from '../../modules/local/qiime2_tree' addParams( options: params.tree_options ) +include { QIIME2_ALPHARAREFACTION } from '../../modules/local/qiime2_alphararefaction' addParams( options: params.alphararefaction_options ) +include { QIIME2_DIVERSITY_CORE } from '../../modules/local/qiime2_diversity_core' addParams( options: params.diversity_core_options ) +include { QIIME2_DIVERSITY_ALPHA } from '../../modules/local/qiime2_diversity_alpha' addParams( options: params.diversity_alpha_options ) +include { QIIME2_DIVERSITY_BETA } from '../../modules/local/qiime2_diversity_beta' addParams( options: params.diversity_beta_options ) +include { QIIME2_DIVERSITY_BETAORD } from '../../modules/local/qiime2_diversity_betaord' addParams( options: params.diversity_betaord_options ) + +workflow QIIME2_DIVERSITY { + take: + ch_metadata + ch_asv + ch_seq + ch_stats //QIIME2_FILTERTAXA.out.tsv + ch_metacolumn_pairwise //METADATA_PAIRWISE.out + ch_metacolumn_all //METADATA_ALL.out + 
skip_alpha_rarefaction + skip_diversity_indices + + main: + //Phylogenetic tree for beta & alpha diversities + QIIME2_TREE ( ch_seq ) + + //Alpha-rarefaction + if (!skip_alpha_rarefaction) { + QIIME2_ALPHARAREFACTION ( ch_metadata, ch_asv, QIIME2_TREE.out.qza, ch_stats ) + } + + //Calculate diversity indices + if (!skip_diversity_indices) { + + QIIME2_DIVERSITY_CORE ( ch_metadata, ch_asv, QIIME2_TREE.out.qza, ch_stats ) + //Print warning if rarefaction depth is <10000 + QIIME2_DIVERSITY_CORE.out.depth.subscribe { if ( it.baseName.toString().startsWith("WARNING") ) log.warn it.baseName.toString().replace("WARNING ","QIIME2_DIVERSITY_CORE: ") } + + //alpha_diversity ( ch_metadata, DIVERSITY_CORE.out.qza, ch_metacolumn_all ) + ch_metadata + .combine( QIIME2_DIVERSITY_CORE.out.vector.flatten() ) + .combine( ch_metacolumn_all ) + .set{ ch_to_diversity_alpha } + QIIME2_DIVERSITY_ALPHA ( ch_to_diversity_alpha ) + + //beta_diversity ( ch_metadata, DIVERSITY_CORE.out.qza, ch_metacolumn_pairwise ) + ch_metadata + .combine( QIIME2_DIVERSITY_CORE.out.distance.flatten() ) + .combine( ch_metacolumn_pairwise ) + .set{ ch_to_diversity_beta } + QIIME2_DIVERSITY_BETA ( ch_to_diversity_beta ) + + //beta_diversity_ordination ( ch_metadata, DIVERSITY_CORE.out.qza ) + ch_metadata + .combine( QIIME2_DIVERSITY_CORE.out.pcoa.flatten() ) + .set{ ch_to_diversity_betaord } + QIIME2_DIVERSITY_BETAORD ( ch_to_diversity_betaord ) + } +} \ No newline at end of file diff --git a/subworkflows/local/qiime2_export.nf b/subworkflows/local/qiime2_export.nf new file mode 100644 index 00000000..5aaa7cf0 --- /dev/null +++ b/subworkflows/local/qiime2_export.nf @@ -0,0 +1,34 @@ +/* + * Export filtered tables from QIIME2 + */ + +params.absolute_options = [:] +params.relasv_options = [:] +params.reltax_options = [:] +params.combine_table_options = [:] + +include { QIIME2_EXPORT_ABSOLUTE } from '../../modules/local/qiime2_export_absolute' addParams( options: params.absolute_options ) +include { QIIME2_EXPORT_RELASV } from '../../modules/local/qiime2_export_relasv' addParams( options: params.relasv_options ) +include { QIIME2_EXPORT_RELTAX } from '../../modules/local/qiime2_export_reltax' addParams( options: params.reltax_options ) +include { COMBINE_TABLE } from '../../modules/local/combine_table' addParams( options: params.combine_table_options ) + +workflow QIIME2_EXPORT { + take: + ch_asv + ch_seq + ch_tax + ch_tax_tsv + + main: + //export_filtered_dada_output (optional) + QIIME2_EXPORT_ABSOLUTE ( ch_asv, ch_seq, ch_tax ) + + //RelativeAbundanceASV (optional) + QIIME2_EXPORT_RELASV ( ch_asv ) + + //RelativeAbundanceReducedTaxa (optional) + QIIME2_EXPORT_RELTAX ( ch_asv, ch_tax ) + + //combine_table.r (optional), seems similar to DADA2_table.tsv but with additionally taxonomy merged + COMBINE_TABLE ( QIIME2_EXPORT_RELASV.out.tsv, QIIME2_EXPORT_ABSOLUTE.out.fasta, ch_tax_tsv ) +} \ No newline at end of file diff --git a/subworkflows/local/qiime2_preptax.nf b/subworkflows/local/qiime2_preptax.nf new file mode 100644 index 00000000..05bb2f99 --- /dev/null +++ b/subworkflows/local/qiime2_preptax.nf @@ -0,0 +1,35 @@ +/* + * Training of a classifier with QIIME2 + */ + +params.options = [:] + +include { FORMAT_TAXONOMY_QIIME } from '../../modules/local/format_taxonomy_qiime' +include { QIIME2_EXTRACT } from '../../modules/local/qiime2_extract' addParams( options: params.options ) +include { QIIME2_TRAIN } from '../../modules/local/qiime2_train' addParams( options: params.options ) + +workflow QIIME2_PREPTAX { + take: + ch_dada_ref_taxonomy 
//channel, list of files + FW_primer //val + RV_primer //val + + main: + FORMAT_TAXONOMY_QIIME ( ch_dada_ref_taxonomy ) + + ch_ref_database = FORMAT_TAXONOMY_QIIME.out.fasta.combine(FORMAT_TAXONOMY_QIIME.out.tax) + ch_ref_database + .map { + db -> + def meta = [:] + meta.FW_primer = FW_primer + meta.RV_primer = RV_primer + [ meta, db ] } + .set { ch_ref_database } + QIIME2_EXTRACT ( ch_ref_database ) + QIIME2_TRAIN ( QIIME2_EXTRACT.out.qza ) + + emit: + classifier = QIIME2_TRAIN.out.qza + version = QIIME2_TRAIN.out.version +} \ No newline at end of file diff --git a/subworkflows/local/qiime2_taxonomy.nf b/subworkflows/local/qiime2_taxonomy.nf new file mode 100644 index 00000000..886e59b4 --- /dev/null +++ b/subworkflows/local/qiime2_taxonomy.nf @@ -0,0 +1,23 @@ +/* + * Taxonomic classification with QIIME2 + */ + +params.options = [:] + +include { QIIME2_INSEQ } from '../../modules/local/qiime2_inseq' addParams( options: params.options ) +include { QIIME2_CLASSIFY } from '../../modules/local/qiime2_classify' addParams( options: params.options ) + +workflow QIIME2_TAXONOMY { + take: + ch_fasta + ch_classifier + + main: + QIIME2_INSEQ ( ch_fasta ) + QIIME2_CLASSIFY ( ch_classifier, QIIME2_INSEQ.out.qza ) + + emit: + qza = QIIME2_CLASSIFY.out.qza + tsv = QIIME2_CLASSIFY.out.tsv + version = QIIME2_INSEQ.out.version +} \ No newline at end of file diff --git a/workflows/ampliseq.nf b/workflows/ampliseq.nf new file mode 100644 index 00000000..ae0e9330 --- /dev/null +++ b/workflows/ampliseq.nf @@ -0,0 +1,566 @@ +//////////////////////////////////////////////////// +/* -- LOCAL PARAMETER VALUES -- */ +//////////////////////////////////////////////////// + +params.summary_params = [:] + +//////////////////////////////////////////////////// +/* -- VALIDATE INPUTS -- */ +//////////////////////////////////////////////////// + +/* + * Import input files + */ +if (params.metadata) { + ch_metadata = Channel.fromPath("${params.metadata}", checkIfExists: true) +} else { ch_metadata = Channel.empty() } + +if (params.classifier) { + ch_qiime_classifier = Channel.fromPath("${params.classifier}", checkIfExists: true) +} else { ch_qiime_classifier = Channel.empty() } + +if (params.dada_ref_taxonomy && !params.skip_taxonomy) { + // Check if ref_taxonomy exists in the config file + if (params.dada_ref_databases && params.dada_ref_taxonomy && !params.dada_ref_databases.containsKey(params.dada_ref_taxonomy)) { + exit 1, "The provided DADA2 reference taxonomy '${params.dada_ref_taxonomy}' is not available in the 'conf/ref_databases.config' file. Currently the available reference taxonomies are ${params.dada_ref_databases.keySet().join(', ')}" + } + ch_dada_ref_taxonomy = Channel.fromList(params.dada_ref_databases[params.dada_ref_taxonomy]["file"]).map { file(it) } +} else { ch_dada_ref_taxonomy = Channel.empty() } + +if (params.qiime_ref_taxonomy && !params.skip_taxonomy && !params.classifier) { + // Check if ref_taxonomy exists in the config file + if (params.qiime_ref_databases && params.qiime_ref_taxonomy && !params.qiime_ref_databases.containsKey(params.qiime_ref_taxonomy)) { + exit 1, "The provided QIIME2 reference taxonomy '${params.qiime_ref_taxonomy}' is not available in the 'conf/ref_databases.config' file. 
Currently the available reference taxonomies are ${params.qiime_ref_databases.keySet().join(', ')}" + } + ch_qiime_ref_taxonomy = Channel.fromList(params.qiime_ref_databases[params.qiime_ref_taxonomy]["file"]).map { file(it) } +} else { ch_qiime_ref_taxonomy = Channel.empty() } + +/* + * Set variables + */ + +single_end = params.single_end +if ( params.pacbio || params.iontorrent ) { + single_end = true +} + +trunclenf = params.trunclenf ? params.trunclenf : 0 +trunclenr = params.trunclenr ? params.trunclenr : 0 +if ( !single_end && !params.illumina_pe_its && (params.trunclenf == false || params.trunclenr == false) ) { + find_truncation_values = true + log.warn "No DADA2 cutoffs were specified (--trunclenf & --trunclenr), therefore reads will be truncated where median quality drops below ${params.trunc_qmin} (defined by --trunc_qmin) but at least a fraction of ${params.trunc_rmin} (defined by --trunc_rmin) of the reads will be retained.\nThe chosen cutoffs do not account for required overlap for merging, therefore DADA2 might have poor merging efficiency or even fail.\n" +} else { find_truncation_values = false } + +//only run QIIME2 when taxonomy is actually calculated and all required data is available +if ( !params.enable_conda && !params.skip_taxonomy && !params.skip_qiime ) { + run_qiime2 = true +} else { run_qiime2 = false } + +/* + * Sanity check input values + */ + +if (params.enable_conda) { log.warn "Conda is enabled (--enable_conda), any steps involving QIIME2 are not available. Use a container engine instead of conda to enable all software." } + +if (!params.FW_primer) { exit 1, "Option --FW_primer missing" } +if (!params.RV_primer) { exit 1, "Option --RV_primer missing" } +if (!params.input) { exit 1, "Option --input missing" } + +if (!["pooled", "independent", "pseudo"].contains(params.sample_inference)) { + exit 1, "Please set --sample_inference to one of the following:\n\t-\"independent\" (lowest sensitivity and lowest resources),\n\t-\"pseudo\" (balance between required resources and sensitivity),\n\t-\"pooled\" (highest sensitivity and resources)." +} + +if (params.double_primer && params.retain_untrimmed) { + exit 1, "Incompatible parameters --double_primer and --retain_untrimmed cannot be set at the same time." +} + +//////////////////////////////////////////////////// +/* -- CONFIG FILES -- */ +//////////////////////////////////////////////////// + +ch_multiqc_config = file("$projectDir/assets/multiqc_config.yaml", checkIfExists: true) +ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config) : Channel.empty() + +//////////////////////////////////////////////////// +/* -- IMPORT LOCAL MODULES/SUBWORKFLOWS -- */ +//////////////////////////////////////////////////// + +// Don't overwrite global params.modules, create a copy instead and use that within the main script. +def modules = params.modules.clone() + +def dada2_filtntrim_options = modules['dada2_filtntrim'] +dada2_filtntrim_options.args += single_end ? 
", maxEE = $params.max_ee" : ", maxEE = c($params.max_ee, $params.max_ee)" +if (params.pacbio) { + //PacBio data + dada2_filtntrim_options.args +=", trimLeft = 0, minLen = $params.min_len, maxLen = $params.max_len, rm.phix = FALSE" +} else if (params.iontorrent) { + //Ion-torrent data + dada2_filtntrim_options.args += ", trimLeft = 15, minLen = $params.min_len, maxLen = $params.max_len, rm.phix = TRUE" +} else if (params.illumina_pe_its) { + //Illumina ITS data or other sequences with high length variability + dada2_filtntrim_options.args += ", trimLeft = 0, minLen = $params.min_len, maxLen = $params.max_len, rm.phix = TRUE" +} else { + //Illumina 16S data + dada2_filtntrim_options.args += ", trimLeft = 0, minLen = $params.min_len, maxLen = $params.max_len, rm.phix = TRUE" +} + +def dada2_quality_options = modules['dada2_quality'] + +def trunclen_options = [:] +trunclen_options.args ="$params.trunc_qmin $params.trunc_rmin" + +def dada2_err_options = modules['dada2_err'] +dada2_err_options.args += params.pacbio ? ", errorEstimationFunction = PacBioErrfun" : ", errorEstimationFunction = loessErrfun" + +def dada2_denoising_options = modules['dada2_denoising'] +if (params.iontorrent) { + //Ion-torrent data + dada2_denoising_options.args += ", BAND_SIZE = 32, HOMOPOLYMER_GAP_PENALTY = -1" +} else { + dada2_denoising_options.args += ", BAND_SIZE = 16, HOMOPOLYMER_GAP_PENALTY = NULL" +} +dada2_denoising_options.args += params.sample_inference == "pseudo" ? ", pool = \"pseudo\"" : params.sample_inference == "pooled" ? ", pool = TRUE" : ", pool = FALSE" +dada2_denoising_options.args2 += params.concatenate_reads ? ", justConcatenate = TRUE" : ", justConcatenate = FALSE" + +def dada2_rmchimera_options = modules['dada2_rmchimera'] + +def multiqc_options = modules['multiqc'] +multiqc_options.args += params.multiqc_title ? " --title \"$params.multiqc_title\"" : '' + +def dada2_taxonomy_options = modules['dada2_taxonomy'] +dada2_taxonomy_options.args += params.pacbio ? ", tryRC = TRUE" : "" +dada2_taxonomy_options.args += params.iontorrent ? ", tryRC = TRUE" : "" + +def dada2_addspecies_options = modules['dada2_addspecies'] +dada2_addspecies_options.args += params.pacbio ? ", tryRC = TRUE" : "" +dada2_addspecies_options.args += params.iontorrent ? 
", tryRC = TRUE" : "" + +include { RENAME_RAW_DATA_FILES } from '../modules/local/rename_raw_data_files' +include { DADA2_FILTNTRIM } from '../modules/local/dada2_filtntrim' addParams( options: dada2_filtntrim_options ) +include { DADA2_QUALITY } from '../modules/local/dada2_quality' addParams( options: dada2_quality_options ) +include { TRUNCLEN } from '../modules/local/trunclen' addParams( options: trunclen_options ) +include { DADA2_ERR } from '../modules/local/dada2_err' addParams( options: dada2_err_options ) +include { DADA2_DEREPLICATE } from '../modules/local/dada2_dereplicate' addParams( options: modules['dada2_dereplicate'] ) +include { DADA2_DENOISING } from '../modules/local/dada2_denoising' addParams( options: dada2_denoising_options ) +include { DADA2_RMCHIMERA } from '../modules/local/dada2_rmchimera' addParams( options: dada2_rmchimera_options ) +include { DADA2_STATS } from '../modules/local/dada2_stats' addParams( options: modules['dada2_stats'] ) +include { DADA2_MERGE } from '../modules/local/dada2_merge' addParams( options: modules['dada2_merge'] ) +include { FORMAT_TAXONOMY } from '../modules/local/format_taxonomy' +include { ITSX_CUTASV } from '../modules/local/itsx_cutasv' addParams( options: modules['itsx_cutasv'] ) +include { MERGE_STATS } from '../modules/local/merge_stats' addParams( options: modules['merge_stats'] ) +include { DADA2_TAXONOMY } from '../modules/local/dada2_taxonomy' addParams( options: dada2_taxonomy_options ) +include { DADA2_ADDSPECIES } from '../modules/local/dada2_addspecies' addParams( options: dada2_addspecies_options ) +include { FORMAT_TAXRESULTS } from '../modules/local/format_taxresults' +include { QIIME2_INSEQ } from '../modules/local/qiime2_inseq' addParams( options: modules['qiime2_inseq'] ) +include { QIIME2_FILTERTAXA } from '../modules/local/qiime2_filtertaxa' addParams( options: modules['qiime2_filtertaxa'] ) +include { QIIME2_INASV } from '../modules/local/qiime2_inasv' addParams( options: modules['qiime2_inasv'] ) +include { FILTER_STATS } from '../modules/local/filter_stats' addParams( options: modules['filter_stats'] ) +include { MERGE_STATS as MERGE_STATS_FILTERTAXA } from '../modules/local/merge_stats' addParams( options: modules['merge_stats'] ) +include { QIIME2_BARPLOT } from '../modules/local/qiime2_barplot' addParams( options: modules['qiime2_barplot'] ) +include { METADATA_ALL } from '../modules/local/metadata_all' +include { METADATA_PAIRWISE } from '../modules/local/metadata_pairwise' +include { QIIME2_INTAX } from '../modules/local/qiime2_intax' addParams( options: modules['qiime2_intax'] ) +include { MULTIQC } from '../modules/local/multiqc' addParams( options: multiqc_options ) +include { GET_SOFTWARE_VERSIONS } from '../modules/local/get_software_versions' addParams( options: [publish_files : ['csv':'']] ) + +/* + * SUBWORKFLOW: Consisting of a mix of local and nf-core/modules + */ + +if (params.pacbio) { + //PacBio data + cutadapt_options_args = " --rc -g ${params.FW_primer}...${params.RV_primer}" +} else if (params.iontorrent) { + //IonTorrent data + cutadapt_options_args = " --rc -g ${params.FW_primer}...${params.RV_primer}" +} else if (params.single_end) { + //Illumina SE + cutadapt_options_args = " -g ${params.FW_primer}" +} else { + //Illumina PE + cutadapt_options_args = " -g ${params.FW_primer} -G ${params.RV_primer}" +} + +def cutadapt_options = modules['cutadapt'] +cutadapt_options.args += cutadapt_options_args +cutadapt_options.args += params.retain_untrimmed ? 
'' : " --discard-untrimmed" + +//prepare reverse complement primers to remove those in read-throughs +// Get the complement of a DNA sequence +// Complement table taken from http://arep.med.harvard.edu/labgc/adnan/projects/Utilities/revcomp.html +def make_complement(String seq) { + def complements = [ A:'T', T:'A', U:'A', G:'C', C:'G', Y:'R', R:'Y', S:'S', W:'W', K:'M', M:'K', B:'V', D:'H', H:'D', V:'B', N:'N' ] + comp = seq.toUpperCase().collect { base -> complements[ base ] ?: 'X' }.join() + return comp +} +FW_primer_RevComp = make_complement ( "${params.FW_primer}".reverse() ) +RV_primer_RevComp = make_complement ( "${params.RV_primer}".reverse() ) +def cutadapt_readthrough_options = modules['cutadapt_readthrough'] +cutadapt_readthrough_options.args += " -a ${RV_primer_RevComp} -A ${FW_primer_RevComp}" + +def cutadapt_doubleprimer_options = modules['cutadapt_doubleprimer'] +cutadapt_doubleprimer_options.args += cutadapt_options_args + +include { PARSE_INPUT } from '../subworkflows/local/parse_input' +include { QIIME2_PREPTAX } from '../subworkflows/local/qiime2_preptax' addParams( options: modules['qiime2_preptax'] ) +include { QIIME2_TAXONOMY } from '../subworkflows/local/qiime2_taxonomy' addParams( options: modules['qiime2_taxonomy'] ) +include { CUTADAPT_WORKFLOW } from '../subworkflows/local/cutadapt_workflow' addParams( standard_options: cutadapt_options, readthrough_options: cutadapt_readthrough_options,doubleprimer_options: cutadapt_doubleprimer_options,summary_options: modules['cutadapt_summary'],summary_merge_options: modules['cutadapt_summary_merge'] ) +include { QIIME2_EXPORT } from '../subworkflows/local/qiime2_export' addParams( absolute_options: modules['qiime2_export_absolute'], relasv_options: modules['qiime2_export_relasv'],reltax_options: modules['qiime2_export_reltax'],combine_table_options: modules['combine_table'] ) +include { QIIME2_DIVERSITY } from '../subworkflows/local/qiime2_diversity' addParams( tree_options: modules['qiime2_tree'], alphararefaction_options: modules['qiime2_alphararefaction'], diversity_core_options: modules['qiime2_diversity_core'], diversity_alpha_options: modules['qiime2_diversity_alpha'], diversity_beta_options: modules['qiime2_diversity_beta'], diversity_betaord_options: modules['qiime2_diversity_betaord'] ) +include { QIIME2_ANCOM } from '../subworkflows/local/qiime2_ancom' addParams( filterasv_options: modules['qiime2_filterasv'], ancom_tax_options: modules['qiime2_ancom_tax'], ancom_asv_options: modules['qiime2_ancom_asv'] ) + + //////////////////////////////////////////////////// +/* -- IMPORT NF-CORE MODULES/SUBWORKFLOWS -- */ +//////////////////////////////////////////////////// + +/* + * MODULE: Installed directly from nf-core/modules + */ +def fastqc_options = modules['fastqc'] +def cutadapt_taxonomy_options = modules['cutadapt_taxonomy'] +cutadapt_taxonomy_options.args += " -g ${params.FW_primer}...${RV_primer_RevComp}" + +include { CUTADAPT as CUTADAPT_TAXONOMY } from '../modules/nf-core/software/cutadapt/main' addParams( options: cutadapt_taxonomy_options ) +include { FASTQC } from '../modules/nf-core/software/fastqc/main' addParams( options: fastqc_options ) + +/* + * SUBWORKFLOW: Consisting entirely of nf-core/modules + */ + + //////////////////////////////////////////////////// +/* -- RUN MAIN WORKFLOW -- */ +//////////////////////////////////////////////////// + +// Info required for completion email and summary +def multiqc_report = [] + +workflow AMPLISEQ { + ch_software_versions = Channel.empty() + + /* + * Create a 
channel for input read files + */ + PARSE_INPUT ( params.input, single_end, params.multiple_sequencing_runs, params.extension ) + ch_reads = PARSE_INPUT.out.reads + ch_fasta = PARSE_INPUT.out.fasta + + /* + * MODULE: Rename files + */ + RENAME_RAW_DATA_FILES ( ch_reads ) + + /* + * MODULE: FastQC + */ + if (!params.skip_fastqc) { + FASTQC ( RENAME_RAW_DATA_FILES.out ).html.set { fastqc_html } + ch_software_versions = ch_software_versions.mix(FASTQC.out.version.first().ifEmpty(null)) + } + + /* + * MODULE: Cutadapt + */ + CUTADAPT_WORKFLOW ( + RENAME_RAW_DATA_FILES.out, + params.illumina_pe_its, + params.double_primer + ).reads.set { ch_trimmed_reads } + ch_software_versions = ch_software_versions.mix(CUTADAPT_WORKFLOW.out.version.first().ifEmpty(null)) + + /* + * SUBWORKFLOW / MODULES : ASV generation with DADA2 + */ + //plot aggregated quality profile for forward and reverse reads separately + if (single_end) { + ch_trimmed_reads + .map { meta, reads -> [ reads ] } + .collect() + .map { reads -> [ "single_end", reads ] } + .set { ch_all_trimmed_reads } + } else { + ch_trimmed_reads + .map { meta, reads -> [ reads[0] ] } + .collect() + .map { reads -> [ "FW", reads ] } + .set { ch_all_trimmed_fw } + ch_trimmed_reads + .map { meta, reads -> [ reads[1] ] } + .collect() + .map { reads -> [ "RV", reads ] } + .set { ch_all_trimmed_rv } + ch_all_trimmed_fw + .mix ( ch_all_trimmed_rv ) + .set { ch_all_trimmed_reads } + } + DADA2_QUALITY ( ch_all_trimmed_reads ) + + //find truncation values in case they are not supplied + if ( find_truncation_values ) { + TRUNCLEN ( DADA2_QUALITY.out.tsv ) + TRUNCLEN.out + .toSortedList() + .set { ch_trunc } + //add one more warning or reminder that trunclenf and trunclenr were chosen automatically + ch_trunc.subscribe { + if ( "${it[0][1]}".toInteger() + "${it[1][1]}".toInteger() <= 10 ) { log.warn "`--trunclenf` was set to ${it[0][1]} and `--trunclenr` to ${it[1][1]}, this is too low! Please either change `--trunc_qmin` (and `--trunc_rmin`), or set `--trunclenf` and `--trunclenr`." } + else if ( "${it[0][1]}".toInteger() <= 10 ) { log.warn "`--trunclenf` was set to ${it[0][1]}, this is too low! Please either change `--trunc_qmin` (and `--trunc_rmin`), or set `--trunclenf` and `--trunclenr`." } + else if ( "${it[1][1]}".toInteger() <= 10 ) { log.warn "`--trunclenr` was set to ${it[1][1]}, this is too low! Please either change `--trunc_qmin` (and `--trunc_rmin`), or set `--trunclenf` and `--trunclenr`." } + else log.warn "Probably everything is fine, but this is a reminder that `--trunclenf` was set automatically to ${it[0][1]} and `--trunclenr` to ${it[1][1]}. If this doesnt seem reasonable, then please change `--trunc_qmin` (and `--trunc_rmin`), or set `--trunclenf` and `--trunclenr` directly." 
+ } + } else { + Channel.from( [['FW', trunclenf], ['RV', trunclenr]] ) + .toSortedList() + .set { ch_trunc } + } + ch_trimmed_reads.combine(ch_trunc).set { ch_trimmed_reads } + + //filter reads + DADA2_FILTNTRIM ( ch_trimmed_reads ) + ch_software_versions = ch_software_versions.mix(DADA2_FILTNTRIM.out.version.first().ifEmpty(null)) + + //group by sequencing run + DADA2_FILTNTRIM.out.reads + .map { + info, reads -> + def meta = [:] + meta.run = info.run + meta.single_end = info.single_end + [ meta, reads, info.id ] } + .groupTuple(by: 0 ) + .map { + info, reads, ids -> + def meta = [:] + meta.run = info.run + meta.single_end = info.single_end + meta.id = ids.flatten().sort() + [ meta, reads.flatten().sort() ] } + .set { ch_filt_reads } + + DADA2_ERR ( ch_filt_reads ) + + DADA2_DEREPLICATE ( ch_filt_reads ) + + //group by meta + DADA2_DEREPLICATE.out.dereplicated + .join( DADA2_ERR.out.errormodel ) + .set { ch_derep_errormodel } + DADA2_DENOISING ( ch_derep_errormodel ) + + DADA2_RMCHIMERA ( DADA2_DENOISING.out.seqtab ) + + //group by sequencing run & group by meta + DADA2_FILTNTRIM.out.log + .map { + info, reads -> + def meta = [:] + meta.run = info.run + meta.single_end = info.single_end + [ meta, reads, info.id ] } + .groupTuple(by: 0 ) + .map { + info, reads, ids -> + def meta = [:] + meta.run = info.run + meta.single_end = info.single_end + meta.id = ids.flatten().sort() + [ meta, reads.flatten().sort() ] } + .join( DADA2_DENOISING.out.denoised ) + .join( DADA2_DENOISING.out.mergers ) + .join( DADA2_RMCHIMERA.out.rds ) + .set { ch_track_numbers } + DADA2_STATS ( ch_track_numbers ) + + //merge if several runs, otherwise just publish + DADA2_MERGE ( + DADA2_STATS.out.stats.map { meta, stats -> stats }.collect(), + DADA2_RMCHIMERA.out.rds.map { meta, rds -> rds }.collect() ) + + //merge cutadapt_summary and dada_stats files + MERGE_STATS (CUTADAPT_WORKFLOW.out.summary, DADA2_MERGE.out.dada2stats) + + /* + * SUBWORKFLOW / MODULES : Taxonomic classification with DADA2 and/or QIIME2 + */ + //Alternative entry point for fasta that is being classified - the if clause needs to be the opposite (i.e. with !) 
+    if ( !(params.input.toString().toLowerCase().endsWith(".fasta") || params.input.toString().toLowerCase().endsWith(".fna") || params.input.toString().toLowerCase().endsWith(".fa") )) {
+        ch_fasta = DADA2_MERGE.out.fasta
+    }
+
+    //DADA2
+    if (!params.skip_taxonomy) {
+        FORMAT_TAXONOMY ( ch_dada_ref_taxonomy.collect() )
+        ch_assigntax = FORMAT_TAXONOMY.out.assigntax
+        ch_addspecies = FORMAT_TAXONOMY.out.addspecies
+        //Cut taxonomy to expected amplicon
+        if (params.cut_dada_ref_taxonomy) {
+            ch_assigntax
+                .map {
+                    db ->
+                        def meta = [:]
+                        meta.single_end = true
+                        meta.id = "assignTaxonomy"
+                        [ meta, db ] }
+                .set { ch_assigntax }
+            CUTADAPT_TAXONOMY ( ch_assigntax ).reads
+                .map { meta, db -> db }
+                .set { ch_assigntax }
+        }
+        if (!params.cut_its) {
+            DADA2_TAXONOMY ( ch_fasta, ch_assigntax, 'ASV_tax.tsv' )
+            DADA2_ADDSPECIES ( DADA2_TAXONOMY.out.rds, ch_addspecies, 'ASV_tax_species.tsv' )
+            ch_dada2_tax = DADA2_ADDSPECIES.out.tsv
+        //Cut out ITS region if long ITS reads
+        } else {
+            ITSX_CUTASV ( ch_fasta )
+            ch_software_versions = ch_software_versions.mix(ITSX_CUTASV.out.version.ifEmpty(null))
+            ch_cut_fasta = ITSX_CUTASV.out.fasta
+            DADA2_TAXONOMY ( ch_cut_fasta, ch_assigntax, 'ASV_ITS_tax.tsv' )
+            DADA2_ADDSPECIES ( DADA2_TAXONOMY.out.rds, ch_addspecies, 'ASV_ITS_tax_species.tsv' )
+            FORMAT_TAXRESULTS ( DADA2_TAXONOMY.out.tsv, DADA2_ADDSPECIES.out.tsv, ch_fasta )
+            ch_dada2_tax = FORMAT_TAXRESULTS.out.tsv
+        }
+    }
+
+    //QIIME2
+    if ( run_qiime2 ) {
+        if (params.qiime_ref_taxonomy && !params.classifier) {
+            QIIME2_PREPTAX (
+                ch_qiime_ref_taxonomy.collect(),
+                params.FW_primer,
+                params.RV_primer
+            )
+            ch_qiime_classifier = QIIME2_PREPTAX.out.classifier
+        }
+        QIIME2_TAXONOMY (
+            ch_fasta,
+            ch_qiime_classifier
+        )
+        ch_software_versions = ch_software_versions.mix( QIIME2_TAXONOMY.out.version.ifEmpty(null) ) //usually a .first() is used here; unclear why it leads to a warning in this case
+    }
+    /*
+     * SUBWORKFLOW / MODULES : Downstream analysis with QIIME2
+     */
+    if ( run_qiime2 ) {
+        //Import ASV abundance table and sequences into QIIME2
+        QIIME2_INASV ( DADA2_MERGE.out.asv )
+        QIIME2_INSEQ ( ch_fasta )
+
+        //Import taxonomic classification into QIIME2, if available
+        if ( params.skip_taxonomy ) {
+            log.info "Skip taxonomy classification"
+            ch_tax = Channel.empty()
+        } else if ( params.dada_ref_taxonomy ) {
+            log.info "Use DADA2 taxonomy classification"
+            ch_tax = QIIME2_INTAX ( ch_dada2_tax ).qza
+        } else if ( params.qiime_ref_taxonomy || params.classifier ) {
+            log.info "Use QIIME2 taxonomy classification"
+            ch_tax = QIIME2_TAXONOMY.out.qza
+        } else {
+            log.info "Use no taxonomy classification"
+            ch_tax = Channel.empty()
+        }
+
+        //Filtering by taxonomy & prevalence & counts
+        if (params.exclude_taxa != "none" || params.min_frequency != 1 || params.min_samples != 1) {
+            QIIME2_FILTERTAXA (
+                QIIME2_INASV.out.qza,
+                QIIME2_INSEQ.out.qza,
+                ch_tax,
+                params.min_frequency,
+                params.min_samples,
+                params.exclude_taxa
+            )
+            FILTER_STATS ( DADA2_MERGE.out.asv, QIIME2_FILTERTAXA.out.tsv )
+            MERGE_STATS_FILTERTAXA (MERGE_STATS.out.tsv, FILTER_STATS.out.tsv)
+            ch_asv = QIIME2_FILTERTAXA.out.asv
+            ch_seq = QIIME2_FILTERTAXA.out.seq
+        } else {
+            ch_asv = QIIME2_INASV.out.qza
+            ch_seq = QIIME2_INSEQ.out.qza
+        }
+        //Export various ASV tables
+        if (!params.skip_abundance_tables) {
+            QIIME2_EXPORT ( ch_asv, ch_seq, ch_tax, QIIME2_TAXONOMY.out.tsv )
+        }
+
+        if (!params.skip_barplot) {
+            QIIME2_BARPLOT ( ch_metadata, ch_asv, ch_tax )
+        }
+
+        //Select metadata categories for diversity analysis & ancom
+        if (!params.skip_ancom || !params.skip_diversity_indices) {
+            METADATA_ALL ( ch_metadata, params.metadata_category ).set { ch_metacolumn_all }
+            //return empty channel if no appropriate column was found
+            ch_metacolumn_all.branch { passed: it != "" }.set { result }
+            ch_metacolumn_all = result.passed
+
+            METADATA_PAIRWISE ( ch_metadata ).set { ch_metacolumn_pairwise }
+        } else {
+            ch_metacolumn_all = Channel.empty()
+            ch_metacolumn_pairwise = Channel.empty()
+        }
+
+        //Diversity indices
+        if ( params.metadata && (!params.skip_alpha_rarefaction || !params.skip_diversity_indices) ) {
+            QIIME2_DIVERSITY (
+                ch_metadata,
+                ch_asv,
+                ch_seq,
+                QIIME2_FILTERTAXA.out.tsv,
+                ch_metacolumn_pairwise,
+                ch_metacolumn_all,
+                params.skip_alpha_rarefaction,
+                params.skip_diversity_indices
+            )
+        }
+
+        //Perform ANCOM tests
+        if ( !params.skip_ancom && params.metadata ) {
+            QIIME2_ANCOM (
+                ch_metadata,
+                ch_asv,
+                ch_metacolumn_all,
+                ch_tax
+            )
+        }
+    }
+
+    /*
+     * MODULE: Pipeline reporting
+     */
+    GET_SOFTWARE_VERSIONS (
+        ch_software_versions.map { it }.collect()
+    )
+
+    /*
+     * MultiQC
+     */
+    if (!params.skip_multiqc) {
+        workflow_summary = MultiqcSchema.params_summary_multiqc(workflow, params.summary_params)
+        ch_workflow_summary = Channel.value(workflow_summary)
+
+        MULTIQC (
+            ch_multiqc_config,
+            ch_multiqc_custom_config.collect().ifEmpty([]),
+            GET_SOFTWARE_VERSIONS.out.yaml.collect(),
+            ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'),
+            FASTQC.out.zip.collect{it[1]}.ifEmpty([]),
+            CUTADAPT_WORKFLOW.out.logs.collect{it[1]}.ifEmpty([])
+        )
+        multiqc_report = MULTIQC.out.report.toList()
+    }
+
+}
+
+////////////////////////////////////////////////////
+/* --           COMPLETION EMAIL            -- */
+////////////////////////////////////////////////////
+
+workflow.onComplete {
+    Completion.email(workflow, params, params.summary_params, projectDir, log, multiqc_report)
+    Completion.summary(workflow, params, log)
+}
+
+////////////////////////////////////////////////////
+/* --               THE END                 -- */
+////////////////////////////////////////////////////