diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml deleted file mode 100644 index 1718b42dd..000000000 --- a/.github/workflows/awsfulltest.yml +++ /dev/null @@ -1,36 +0,0 @@ -name: nf-core AWS full size tests -# This workflow is triggered on published releases. -# It can be additionally triggered manually with GitHub actions workflow dispatch button. -# It runs the -profile 'test_full' on AWS batch - -on: - release: - types: [published] - workflow_dispatch: -jobs: - run-tower: - name: Run AWS full tests - if: github.repository == 'nf-core/rnaseq' - runs-on: ubuntu-latest - # Do a full-scale run with each of the three aligners - strategy: - matrix: - aligner: ["star_salmon", "star_rsem", "hisat2"] - steps: - - name: Launch workflow via tower - uses: nf-core/tower-action@v3 - with: - workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} - access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} - compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/rnaseq/work-${{ github.sha }} - parameters: | - { - "outdir" : "s3://${{ secrets.AWS_S3_BUCKET }}/rnaseq/results-${{ github.sha }}/aligner_${{ matrix.aligner }}", - "aligner": "${{ matrix.aligner }}" - } - profiles: test_full,aws_tower - - uses: actions/upload-artifact@v3 - with: - name: Tower debug log file - path: tower_action_*.log diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml deleted file mode 100644 index d2b1b704e..000000000 --- a/.github/workflows/awstest.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: nf-core AWS test -# This workflow can be triggered manually with the GitHub actions workflow dispatch button. -# It runs the -profile 'test' on AWS batch - -on: - workflow_dispatch: -jobs: - run-tower: - name: Run AWS tests - if: github.repository == 'nf-core/rnaseq' - runs-on: ubuntu-latest - steps: - # Launch workflow using Tower CLI tool action - - name: Launch workflow via tower - uses: nf-core/tower-action@v3 - with: - workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} - access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} - compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/rnaseq/work-${{ github.sha }} - parameters: | - { - "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/rnaseq/results-test-${{ github.sha }}" - } - profiles: test,aws_tower - - uses: actions/upload-artifact@v3 - with: - name: Tower debug log file - path: tower_action_*.log diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index afb8b2119..fc63a1509 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,9 +35,33 @@ jobs: with: version: "${{ matrix.NXF_VER }}" + - name: Cache test data + id: cache-testdata + uses: actions/cache@v3 + with: + path: test-datasets/ + key: rnaseq3_10-test-data + + - name: Check out test data + if: steps.cache-testdata.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: nf-core/test-datasets + ref: rnaseq3 + path: test-datasets/ + + - name: Replace remote paths in samplesheets + run: | + for f in ${{ github.workspace }}/test-datasets/samplesheet/v3.10/*.csv; do + sed -i "s=https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/=${{ github.workspace }}/test-datasets/=g" $f + echo "========== $f ============" + cat $f + echo "========================================" + done; + - name: Run pipeline with test data run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir 
./results --test_data_base ${{ github.workspace }}/test-datasets/ star_salmon: name: Test STAR Salmon with workflow parameters @@ -58,10 +82,35 @@ jobs: - "--bam_csi_index" - "--save_align_intermeds --save_reference" - "--featurecounts_group_type false" + - "--trimmer fastp" steps: - name: Check out pipeline code uses: actions/checkout@v2 + - name: Cache test data + id: cache-testdata + uses: actions/cache@v3 + with: + path: test-datasets/ + key: rnaseq3_10-test-data + + - name: Check out test data + if: steps.cache-testdata.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: nf-core/test-datasets + ref: rnaseq3 + path: test-datasets/ + + - name: Replace remote paths in samplesheets + run: | + for f in ${{ github.workspace }}/test-datasets/samplesheet/v3.10/*.csv; do + sed -i "s=https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/=${{ github.workspace }}/test-datasets/=g" $f + echo "========== $f ============" + cat $f + echo "========================================" + done; + - name: Install Nextflow run: | wget -qO- get.nextflow.io | bash @@ -69,7 +118,7 @@ jobs: - name: Run pipeline with STAR and various parameters run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --aligner star_salmon ${{ matrix.parameters }} --outdir ./results + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --aligner star_salmon ${{ matrix.parameters }} --outdir ./results --test_data_base ${{ github.workspace }}/test-datasets/ star_rsem: name: Test STAR RSEM with workflow parameters @@ -84,6 +133,30 @@ jobs: - name: Check out pipeline code uses: actions/checkout@v2 + - name: Cache test data + id: cache-testdata + uses: actions/cache@v3 + with: + path: test-datasets/ + key: rnaseq3_10-test-data + + - name: Check out test data + if: steps.cache-testdata.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: nf-core/test-datasets + ref: rnaseq3 + path: test-datasets/ + + - name: Replace remote paths in samplesheets + run: | + for f in ${{ github.workspace }}/test-datasets/samplesheet/v3.10/*.csv; do + sed -i "s=https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/=${{ github.workspace }}/test-datasets/=g" $f + echo "========== $f ============" + cat $f + echo "========================================" + done; + - name: Install Nextflow run: | wget -qO- get.nextflow.io | bash @@ -91,7 +164,7 @@ jobs: - name: Run pipeline with RSEM STAR and various parameters run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --aligner star_rsem ${{ matrix.parameters }} --outdir ./results + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --aligner star_rsem ${{ matrix.parameters }} --outdir ./results --test_data_base ${{ github.workspace }}/test-datasets/ hisat2: name: Test HISAT2 with workflow parameters @@ -106,6 +179,30 @@ jobs: - name: Check out pipeline code uses: actions/checkout@v2 + - name: Cache test data + id: cache-testdata + uses: actions/cache@v3 + with: + path: test-datasets/ + key: rnaseq3_10-test-data + + - name: Check out test data + if: steps.cache-testdata.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: nf-core/test-datasets + ref: rnaseq3 + path: test-datasets/ + + - name: Replace remote paths in samplesheets + run: | + for f in ${{ github.workspace }}/test-datasets/samplesheet/v3.10/*.csv; do + sed -i "s=https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/=${{ github.workspace }}/test-datasets/=g" $f + echo "========== $f ============" + cat $f + echo 
"========================================" + done; + - name: Install Nextflow run: | wget -qO- get.nextflow.io | bash @@ -113,7 +210,7 @@ jobs: - name: Run pipeline with HISAT2 and various parameters run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --aligner hisat2 ${{ matrix.parameters }} --outdir ./results + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --aligner hisat2 ${{ matrix.parameters }} --outdir ./results --test_data_base ${{ github.workspace }}/test-datasets/ salmon: name: Test Salmon with workflow parameters @@ -128,6 +225,30 @@ jobs: - name: Check out pipeline code uses: actions/checkout@v2 + - name: Cache test data + id: cache-testdata + uses: actions/cache@v3 + with: + path: test-datasets/ + key: rnaseq3_10-test-data + + - name: Check out test data + if: steps.cache-testdata.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: nf-core/test-datasets + ref: rnaseq3 + path: test-datasets/ + + - name: Replace remote paths in samplesheets + run: | + for f in ${{ github.workspace }}/test-datasets/samplesheet/v3.10/*.csv; do + sed -i "s=https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/=${{ github.workspace }}/test-datasets/=g" $f + echo "========== $f ============" + cat $f + echo "========================================" + done; + - name: Install Nextflow run: | wget -qO- get.nextflow.io | bash @@ -135,4 +256,4 @@ jobs: - name: Run pipeline with Salmon and various parameters run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --pseudo_aligner salmon ${{ matrix.parameters }} --outdir ./results + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --pseudo_aligner salmon ${{ matrix.parameters }} --outdir ./results --test_data_base ${{ github.workspace }}/test-datasets/ diff --git a/.github/workflows/cloud_tests_full.yml b/.github/workflows/cloud_tests_full.yml new file mode 100644 index 000000000..92d784131 --- /dev/null +++ b/.github/workflows/cloud_tests_full.yml @@ -0,0 +1,94 @@ +name: full-sized tests on cloud providers +run-name: Submitting workflow to all cloud providers using full sized data +on: + release: + types: [published] + workflow_dispatch: + inputs: + platform: + description: "Platform to run test" + required: true + default: "all" + type: choice + options: + - all + - aws + - azure + - gcp +jobs: + run-full-tests-on-aws: + if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'aws' }} + runs-on: ubuntu-latest + strategy: + matrix: + aligner: ["star_salmon", "star_rsem"] + steps: + - uses: seqeralabs/action-tower-launch@v1 + with: + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_CE_AWS_CPU }} + workdir: "${{ secrets.TOWER_BUCKET_AWS }}/work/rnaseq/work-${{ github.sha }}" + run_name: "aws_rnaseq_full_${{ matrix.aligner }}" + profiles: test_full_aws + parameters: | + { + "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", + "outdir": "${{ secrets.TOWER_BUCKET_AWS }}/rnaseq/results-${{ github.sha }}/aligner_${{ matrix.aligner }}" + } + wait: false + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: tower_action_*.log + run-full-tests-on-gcp: + if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'gcp' }} + runs-on: ubuntu-latest + strategy: + matrix: + aligner: ["star_salmon", "star_rsem"] + steps: + - uses: seqeralabs/action-tower-launch@v1 + with: + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: ${{ 
secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_CE_GCP_CPU }} + workdir: "${{ secrets.TOWER_BUCKET_GCP }}/work/rnaseq/work-${{ github.sha }}" + run_name: "gcp_rnaseq_full_${{ matrix.aligner }}" + profiles: test_full_gcp + parameters: | + { + "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", + "outdir": "${{ secrets.TOWER_BUCKET_GCP }}/rnaseq/results-${{ github.sha }}/aligner_${{ matrix.aligner }}" + } + wait: false + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: tower_action_*.log + run-full-tests-on-azure: + if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'azure' }} + runs-on: ubuntu-latest + strategy: + matrix: + aligner: ["star_salmon", "star_rsem"] + steps: + - uses: seqeralabs/action-tower-launch@v1 + with: + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_CE_AZURE_CPU }} + workdir: "${{ secrets.TOWER_BUCKET_AZURE }}/work/rnaseq/work-${{ github.sha }}" + run_name: "azure_rnaseq_full_${{ matrix.aligner }}" + profiles: test_full_azure + parameters: | + { + "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", + "outdir": "${{ secrets.TOWER_BUCKET_AZURE }}/rnaseq/results-${{ github.sha }}/aligner_${{ matrix.aligner }}", + "igenomes_base": "${{ secrets.TOWER_IGENOMES_BASE_AZURE }}" + } + wait: false + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: tower_action_*.log diff --git a/.github/workflows/cloud_tests_small.yml b/.github/workflows/cloud_tests_small.yml new file mode 100644 index 000000000..318601912 --- /dev/null +++ b/.github/workflows/cloud_tests_small.yml @@ -0,0 +1,79 @@ +name: small-sized tests on cloud providers +run-name: Submitting workflow to all cloud providers using small sized data +on: + workflow_dispatch: + inputs: + platform: + description: "Platform to run test" + required: true + default: "all" + type: choice + options: + - all + - aws + - azure + - gcp +jobs: + run-small-tests-on-aws: + if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'aws' }} + runs-on: ubuntu-latest + steps: + - uses: seqeralabs/action-tower-launch@v1 + with: + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_CE_AWS_CPU }} + workdir: "${{ secrets.TOWER_BUCKET_AWS }}/work/rnaseq/work-${{ github.sha }}" + run_name: "aws_rnaseq_small" + profiles: test + parameters: | + { + "outdir": "${{ secrets.TOWER_BUCKET_AWS }}/rnaseq/results-test-${{ github.sha }}" + } + wait: false + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: tower_action_*.log + run-small-tests-on-gcp: + if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'gcp' }} + runs-on: ubuntu-latest + steps: + - uses: seqeralabs/action-tower-launch@v1 + with: + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_CE_GCP_CPU }} + workdir: "${{ secrets.TOWER_BUCKET_GCP }}/work/rnaseq/work-${{ github.sha }}" + run_name: "gcp_rnaseq_small" + profiles: test + parameters: | + { + "outdir": "${{ secrets.TOWER_BUCKET_GCP }}/rnaseq/results-test-${{ github.sha }}" + } + wait: false + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: tower_action_*.log + run-small-tests-on-azure: + if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 
'azure' }} + runs-on: ubuntu-latest + steps: + - uses: seqeralabs/action-tower-launch@v1 + with: + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_CE_AZURE_CPU }} + workdir: "${{ secrets.TOWER_BUCKET_AZURE }}/work/rnaseq/work-${{ github.sha }}" + run_name: "azure_rnaseq_small" + profiles: test + parameters: | + { + "outdir": "${{ secrets.TOWER_BUCKET_AZURE }}/rnaseq/results-test-${{ github.sha }}" + } + wait: false + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: tower_action_*.log diff --git a/.nf-core.yml b/.nf-core.yml index 40bcac74d..e6c9d79f0 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -3,4 +3,5 @@ lint: files_unchanged: - assets/email_template.html - assets/email_template.txt + - lib/NfcoreSchema.groovy - lib/NfcoreTemplate.groovy diff --git a/CHANGELOG.md b/CHANGELOG.md index 282a5e715..ecda50c86 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,76 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[3.11.0](https://github.com/nf-core/rnaseq/releases/tag/3.11.0)] - 2023-03-30 + +### Credits + +Special thanks to the following for their code contributions to the release: + +- [J Lorent](https://github.com/jlorent) +- [Luca Beltrame](https://github.com/lbeltrame) +- [Matthias Zepper](https://github.com/MatthiasZepper) +- [Maxime Garcia](https://github.com/maxulysse) +- [Ryan Yordanoff](https://github.com/ryanyord) +- [Thomas Sandmann](https://github.com/tomsing1) + +Thank you to everyone else that has contributed by reporting bugs, enhancements or in any other way, shape or form. + +### Enhancements & fixes + +- Add infrastructure and CI for multi-cloud full-sized tests run via Nextflow Tower (see [#981](https://github.com/nf-core/rnaseq/pull/981)) +- Added fastp support. + - Users can now select between `--trimmer trimgalore` (default) and `--trimmer fastp`. + - Trim Galore! specific pipeline parameters have been deprecated: `--clip_r1`, `--clip_r2`, `--three_prime_clip_r1`, `--three_prime_clip_r2` and `--trim_nextseq` + - Any additional options can now be specified via the `--extra_trimgalore_args` and `--extra_fastp_args` parameters, respectively. +- [[#663](https://github.com/nf-core/rnaseq/issues/663)] - Alternative trimming step for polyA/T removal +- [[#781](https://github.com/nf-core/rnaseq/issues/781)] - Add Warning for poly(A) libraries +- [[#878](https://github.com/nf-core/rnaseq/issues/878)] - Allow tabs in fasta header when creating decoys for salmon index +- [[#931](https://github.com/nf-core/rnaseq/issues/931)] - Save transcriptome BAM files when using `--save_umi_intermeds` / `--save_align_intermeds` +- [[#934](https://github.com/nf-core/rnaseq/pull/934)] - Union of `ext.args` and `params.extra_star_align_args` prevents parameter clashes in the STAR module +- [[#940](https://github.com/nf-core/rnaseq/issues/940)] - Bugfix in `salmon_summarizedexperiment.r` to ensure `rbind` doesn't fail when `rowdata` has no `tx` column. 
+- [[#944](https://github.com/nf-core/rnaseq/issues/944)] - Read clipping using clip_r1, clip_r2, three_prime_clip_r1, three_prime_clip_r2 disabled in 3.10 +- [[#956](https://github.com/nf-core/rnaseq/pull/956)] - Implement 'auto' as default strandedness argument in `fastq_dir_to_samplesheet.py` script +- [[#960](https://github.com/nf-core/rnaseq/issues/960)] - Failure with awsbatch when running processes that are using `executor: local` +- [[#961](https://github.com/nf-core/rnaseq/issues/961)] - Add warnings to STDOUT for all skipped and failed strandedness check samples +- [[#975](https://github.com/nf-core/rnaseq/issues/975)] - `SALMON_INDEX` runs when using `--aligner star_rsem` even if samples have explicit strandedness +- Remove HISAT2 from automated AWS full-sized tests + +### Parameters + +| Old parameter | New parameter | +| ----------------------- | ------------------------- | +| | `--trimmer` | +| | `--extra_trimgalore_args` | +| `--clip_r1` | | +| `--clip_r2` | | +| `--three_prime_clip_r1` | | +| `--three_prime_clip_r2` | | +| `--tracedir` | | +| `--trim_nextseq` | | + +> **NB:** Parameter has been **updated** if both old and new parameter information is present. +> **NB:** Parameter has been **added** if just the new parameter information is present. +> **NB:** Parameter has been **removed** if new parameter information isn't present. + +### Software dependencies + +Note, since the pipeline is now using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference. + +| Dependency | Old version | New version | +| ----------- | ----------- | ----------- | +| `fastp` | | 0.23.2 | +| `multiqc` | 1.13 | 1.14 | +| `picard` | 2.27.4 | 3.0.0 | +| `salmon` | 1.9.0 | 1.10.1 | +| `umi_tools` | 1.1.2 | 1.1.4 | + +> **NB:** Dependency has been **updated** if both old and new version information is present. +> +> **NB:** Dependency has been **added** if just the new version information is present. +> +> **NB:** Dependency has been **removed** if new version information isn't present. + ## [[3.10.1](https://github.com/nf-core/rnaseq/releases/tag/3.10.1)] - 2023-01-05 ### Enhancements & fixes diff --git a/CITATIONS.md b/CITATIONS.md index 52f50cf87..5e9d106fd 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -16,6 +16,10 @@ > Quinlan AR, Hall IM. BEDTools: a flexible suite of utilities for comparing genomic features. Bioinformatics. 2010 Mar 15;26(6):841-2. doi: 10.1093/bioinformatics/btq033. Epub 2010 Jan 28. PubMed PMID: 20110278; PubMed Central PMCID: PMC2832824. +- [fastp](https://www.ncbi.nlm.nih.gov/pubmed/30423086/) + + > Chen S, Zhou Y, Chen Y, Gu J. fastp: an ultra-fast all-in-one FASTQ preprocessor. Bioinformatics. 2018 Sep 1;34(17):i884-i890. doi: 10.1093/bioinformatics/bty560. PubMed PMID: 30423086; PubMed Central PMCID: PMC6129281. + - [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) - [featureCounts](https://pubmed.ncbi.nlm.nih.gov/24227677/) diff --git a/README.md b/README.md index 8b42c6fec..7d64b451b 100644 --- a/README.md +++ b/README.md @@ -12,25 +12,10 @@ ## Introduction -**nf-core/rnaseq** is a bioinformatics pipeline that can be used to analyse RNA sequencing data obtained from organisms with a reference genome and annotation. 
- -On release, automated continuous integration tests run the pipeline on a [full-sized dataset](https://github.com/nf-core/test-datasets/tree/rnaseq#full-test-dataset-origin) obtained from the ENCODE Project Consortium on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. The results obtained from running the full-sized tests individually for each `--aligner` option can be viewed on the [nf-core website](https://nf-co.re/rnaseq/results) e.g. the results for running the pipeline with `--aligner star_salmon` will be in a folder called `aligner_star_salmon` and so on. - -The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! - -## Online videos - -A short talk about the history, current status and functionality on offer in this pipeline was given by Harshil Patel ([@drpatelh](https://github.com/drpatelh)) on [8th February 2022](https://nf-co.re/events/2022/bytesize-32-nf-core-rnaseq) as part of the nf-core/bytesize series. - -You can find numerous talks on the [nf-core events page](https://nf-co.re/events) from various topics including writing pipelines/modules in Nextflow DSL2, using nf-core tooling, running nf-core pipelines as well as more generic content like contributing to Github. Please check them out! - -## Pipeline summary +**nf-core/rnaseq** is a bioinformatics pipeline that can be used to analyse RNA sequencing data obtained from organisms with a reference genome and annotation. It takes a samplesheet and FASTQ files as input, performs quality control (QC), trimming and (pseudo-)alignment, and produces a gene expression matrix and extensive QC report. ![nf-core/rnaseq metro map](docs/images/nf-core-rnaseq_metro_map_grey.png) -> **Note** -> The SRA download functionality has been removed from the pipeline (`>=3.2`) and ported to an independent workflow called [nf-core/fetchngs](https://nf-co.re/fetchngs). You can provide `--nf_core_pipeline rnaseq` when running nf-core/fetchngs to download and auto-create a samplesheet containing publicly available samples that can be accepted directly as input by this pipeline. - 1. Merge re-sequenced FastQ files ([`cat`](http://www.linfo.org/cat.html)) 2. Sub-sample FastQ files and auto-infer strandedness ([`fq`](https://github.com/stjude-rust-labs/fq), [`Salmon`](https://combine-lab.github.io/salmon/)) 3. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) @@ -56,44 +41,55 @@ You can find numerous talks on the [nf-core events page](https://nf-co.re/events 15. Pseudo-alignment and quantification ([`Salmon`](https://combine-lab.github.io/salmon/); _optional_) 16. 
Present QC for raw read, alignment, gene biotype, sample similarity, and strand-specificity checks ([`MultiQC`](http://multiqc.info/), [`R`](https://www.r-project.org/))

+> **Note**
+> The SRA download functionality has been removed from the pipeline (`>=3.2`) and ported to an independent workflow called [nf-core/fetchngs](https://nf-co.re/fetchngs). You can provide `--nf_core_pipeline rnaseq` when running nf-core/fetchngs to download and auto-create a samplesheet containing publicly available samples that can be accepted directly as input by this pipeline.
+
 > **Warning**
 > Quantification isn't performed if using `--aligner hisat2` due to the lack of an appropriate option to calculate accurate expression estimates from HISAT2 derived genomic alignments. However, you can use this route if you have a preference for the alignment, QC and other types of downstream analysis compatible with the output of HISAT2.

-## Quick Start
+## Usage

-1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=22.10.1`)
+> **Note**
+> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.

-2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) (you can follow [this tutorial](https://singularity-tutorial.github.io/01-installation/)), [`Podman`](https://podman.io/), [`Shifter`](https://nersc.gitlab.io/development/shifter/how-to-use/) or [`Charliecloud`](https://hpc.github.io/charliecloud/) for full pipeline reproducibility _(you can use [`Conda`](https://conda.io/miniconda.html) both to install Nextflow itself and also to manage software within pipelines. Please only use it within pipelines as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_. Note: This pipeline does not currently support running with Conda on macOS if the `--remove_ribo_rna` parameter is used because the latest version of the SortMeRNA package is not available for this platform.
+First, you need to prepare a samplesheet with your input data that looks as follows:

-3. Download the pipeline and test it on a minimal dataset with a single command:
+**samplesheet.csv**:

-   ```bash
-   nextflow run nf-core/rnaseq -profile test,YOURPROFILE --outdir <OUTDIR>
-   ```
+```csv
+sample,fastq_1,fastq_2,strandedness
+CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz,auto
+CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz,auto
+CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz,auto
+```

-   Note that some form of configuration will be needed so that Nextflow knows how to fetch the required software. This is usually done in the form of a config profile (`YOURPROFILE` in the example command above). You can chain multiple config profiles in a comma-separated string.
+Each row represents a FastQ file (single-end) or a pair of FastQ files (paired-end). Rows with the same sample identifier are considered technical replicates and merged automatically. The strandedness refers to the library preparation and will be automatically inferred if set to `auto`.
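If your FastQ files follow a consistent naming scheme, a samplesheet like the one above can be auto-created with the `fastq_dir_to_samplesheet.py` script shipped in the pipeline's `bin/` directory (updated below in this diff to accept `auto`, now the default strandedness). A minimal sketch, assuming paired-end files that match the script's default `_R1_001.fastq.gz`/`_R2_001.fastq.gz` extensions; `./fastq_dir` is a hypothetical input directory:

```bash
# Fetch the helper script from the pipeline repository (requires Python 3 locally).
wget -L https://raw.githubusercontent.com/nf-core/rnaseq/master/bin/fastq_dir_to_samplesheet.py
chmod +x fastq_dir_to_samplesheet.py

# Scan ./fastq_dir and write samplesheet.csv; 'auto' is now the default
# strandedness, so the flag below is shown only for clarity.
./fastq_dir_to_samplesheet.py ./fastq_dir samplesheet.csv --strandedness auto
```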
- > - The pipeline comes with config profiles called `docker`, `singularity`, `podman`, `shifter`, `charliecloud` and `conda` which instruct the pipeline to use the named tool for software management. For example, `-profile test,docker`.
- > - Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile <institute>` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment.
- > - If you are using `singularity`, please use the [`nf-core download`](https://nf-co.re/tools/#downloading-pipelines-for-offline-use) command to download images first, before running the pipeline. Setting the [`NXF_SINGULARITY_CACHEDIR` or `singularity.cacheDir`](https://www.nextflow.io/docs/latest/singularity.html?#singularity-docker-hub) Nextflow options enables you to store and re-use the images from a central location for future pipeline runs.
- > - If you are using `conda`, it is highly recommended to use the [`NXF_CONDA_CACHEDIR` or `conda.cacheDir`](https://www.nextflow.io/docs/latest/conda.html) settings to store the environments in a central location for future pipeline runs.

+> **Warning**
+> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration **except for parameters**; see [docs](https://nf-co.re/usage/configuration#custom-configuration-files).

-4. Start running your own analysis!
+Now, you can run the pipeline using:

-   ```bash
-   nextflow run nf-core/rnaseq --input samplesheet.csv --outdir <OUTDIR> --genome GRCh37 -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
-   ```
+```bash
+nextflow run nf-core/rnaseq \
+    --input samplesheet.csv \
+    --outdir <OUTDIR> \
+    --genome GRCh37 \
+    -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
+```

+For more details, please refer to the [usage documentation](https://nf-co.re/rnaseq/usage) and the [parameter documentation](https://nf-co.re/rnaseq/parameters).

-   - An executable Python script called [`fastq_dir_to_samplesheet.py`](https://github.com/nf-core/rnaseq/blob/master/bin/fastq_dir_to_samplesheet.py) has been provided if you would like to auto-create an input samplesheet based on a directory containing FastQ files **before** you run the pipeline (requires Python 3 installed locally) e.g.

-     ```bash
-     wget -L https://raw.githubusercontent.com/nf-core/rnaseq/master/bin/fastq_dir_to_samplesheet.py
-     ./fastq_dir_to_samplesheet.py <FASTQ_DIR> samplesheet.csv --strandedness reverse
-     ```
+## Pipeline output

-## Documentation
+The output of the pipeline applied to a [full-sized example dataset](https://github.com/nf-core/test-datasets/tree/rnaseq#full-test-dataset-origin) can be found [here](https://nf-co.re/rnaseq/results).
+For more details, please refer to the [output documentation](https://nf-co.re/rnaseq/output).

-The nf-core/rnaseq pipeline comes with documentation about the pipeline [usage](https://nf-co.re/rnaseq/usage), [parameters](https://nf-co.re/rnaseq/parameters) and [output](https://nf-co.re/rnaseq/output).

+## Online videos
+
+A short talk about the history, current status and functionality on offer in this pipeline was given by Harshil Patel ([@drpatelh](https://github.com/drpatelh)) on [8th February 2022](https://nf-co.re/events/2022/bytesize-32-nf-core-rnaseq) as part of the nf-core/bytesize series.
+
+You can find numerous talks on the [nf-core events page](https://nf-co.re/events) covering various topics including writing pipelines/modules in Nextflow DSL2, using nf-core tooling, running nf-core pipelines as well as more generic content like contributing to GitHub. Please check them out!

 ## Credits

@@ -104,18 +100,20 @@ The pipeline was re-written in Nextflow DSL2 and is primarily maintained by Hars

 The pipeline workflow diagram was designed by Sarah Guinchard ([@G-Sarah](https://github.com/G-Sarah)) and James Fellows Yates ([@jfy133](https://github.com/jfy133)).

 Many thanks to others who have helped out along the way too, including (but not limited to):
-[@MatthiasZepper](https://github.com/MatthiasZepper),
-[@Emiller88](https://github.com/Emiller88),
-[@maxulysse](https://github.com/maxulysse),
-[@robsyme](https://github.com/robsyme),
-[@Galithil](https://github.com/Galithil),
-[@pditommaso](https://github.com/pditommaso),
-[@orzechoj](https://github.com/orzechoj),
-[@apeltzer](https://github.com/apeltzer),
-[@colindaven](https://github.com/colindaven),
-[@lpantano](https://github.com/lpantano),
-[@olgabot](https://github.com/olgabot),
-[@jburos](https://github.com/jburos).
+
+- [Alex Peltzer](https://github.com/apeltzer)
+- [Colin Davenport](https://github.com/colindaven)
+- [Denis Moreno](https://github.com/Galithil)
+- [Edmund Miller](https://github.com/Emiller88)
+- [Gregor Sturm](https://github.com/grst)
+- [Jacki Buros Novik](https://github.com/jburos)
+- [Lorena Pantano](https://github.com/lpantano)
+- [Matthias Zepper](https://github.com/MatthiasZepper)
+- [Maxime Garcia](https://github.com/maxulysse)
+- [Olga Botvinnik](https://github.com/olgabot)
+- [@orzechoj](https://github.com/orzechoj)
+- [Paolo Di Tommaso](https://github.com/pditommaso)
+- [Rob Syme](https://github.com/robsyme)

 ## Contributions and Support

diff --git a/assets/email_template.html b/assets/email_template.html
index c73cfb931..0ef39b222 100644
--- a/assets/email_template.html
+++ b/assets/email_template.html
@@ -34,7 +34,7 @@

[HTML markup omitted; text content of the changed hunks:]
     nf-core/rnaseq execution completed unsuccessfully!
     The full error message was:
     ${errorReport}
-    """ } else if(fail_percent_mapped.size() > 0) { out << """
+    """ } else if(skip_sample_count > 0) { out << """
     nf-core/rnaseq execution completed with warnings!
-    The pipeline finished successfully, but the following samples were skipped due to very low alignment (< ${min_mapped_reads}%):
-      • ${fail_percent_mapped.join('  • ')}
+    The pipeline finished successfully, but samples were skipped. Please check warnings at the top of the MultiQC report.

diff --git a/assets/email_template.txt b/assets/email_template.txt index fd1ec9894..5440f887a 100644 --- a/assets/email_template.txt +++ b/assets/email_template.txt @@ -17,14 +17,12 @@ The full error message was: ${errorReport} """ -} else if (fail_percent_mapped.size() > 0) { +} else if (skip_sample_count > 0) { out << """################################################## ## nf-core/rnaseq execution completed with warnings ## ################################################## -The pipeline finished successfully, but the following samples were skipped, -due to very low alignment (less than ${min_mapped_reads}%): - - - ${fail_percent_mapped.join("\n - ")} +The pipeline finished successfully, but samples were skipped. +Please check warnings at the top of the MultiQC report. """ } else { out << "## nf-core/rnaseq execution completed successfully! ##" diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 09b6b262b..736156783 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -17,6 +17,7 @@ run_modules: - custom_content - fastqc - cutadapt + - fastp - sortmerna - star - hisat2 @@ -47,13 +48,14 @@ module_order: name: "FastQC (raw)" info: "This section of the report shows FastQC results before adapter trimming." path_filters: - - "./fastqc/*.zip" + - "./fastqc/raw/*.zip" - cutadapt + - fastp - fastqc: name: "FastQC (trimmed)" info: "This section of the report shows FastQC results after adapter trimming." path_filters: - - "./trimgalore/fastqc/*.zip" + - "./fastqc/trim/*.zip" # Don't show % Dups in the General Stats table (we have this from Picard) table_columns_visible: @@ -74,6 +76,9 @@ sp: cutadapt: fn: "*trimming_report.txt" + fastp: + fn: "*.fastp.json" + sortmerna: fn: "*.sortmerna.log" diff --git a/assets/slackreport.json b/assets/slackreport.json index 043d02f27..fc84fa5e5 100644 --- a/assets/slackreport.json +++ b/assets/slackreport.json @@ -3,7 +3,7 @@ { "fallback": "Plain-text summary of the attachment.", "color": "<% if (success) { %>good<% } else { %>danger<%} %>", - "author_name": "sanger-tol/readmapping v${version} - ${runName}", + "author_name": "nf-core/rnaseq ${version} - ${runName}", "author_icon": "https://www.nextflow.io/docs/latest/_static/favicon.ico", "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors<% } %>", "fields": [ diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 69d67446f..05971a5d9 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -48,7 +48,6 @@ def check_samplesheet(file_in, file_out): sample_mapping_dict = {} with open(file_in, "r", encoding="utf-8-sig") as fin: - ## Check header MIN_COLS = 3 HEADER = ["sample", "fastq_1", "fastq_2", "strandedness"] @@ -142,7 +141,6 @@ def check_samplesheet(file_in, file_out): ",".join(["sample", "single_end", "fastq_1", "fastq_2", "strandedness"] + header[len(HEADER) :]) + "\n" ) for sample in sorted(sample_mapping_dict.keys()): - ## Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]): print_error( diff --git a/bin/fastq_dir_to_samplesheet.py b/bin/fastq_dir_to_samplesheet.py index 6c7931f4f..1eb3657c6 100755 --- a/bin/fastq_dir_to_samplesheet.py +++ b/bin/fastq_dir_to_samplesheet.py @@ -18,8 +18,8 @@ def parse_args(args=None): "--strandedness", type=str, dest="STRANDEDNESS", - default="unstranded", - help="Value for 'strandedness' in samplesheet. 
Must be one of 'unstranded', 'forward', 'reverse'.", + default="auto", + help="Value for 'strandedness' in samplesheet. Must be one of 'unstranded', 'forward', 'reverse', 'auto'.", ) parser.add_argument( "-r1", @@ -80,7 +80,7 @@ def parse_args(args=None): def fastq_dir_to_samplesheet( fastq_dir, samplesheet_file, - strandedness="unstranded", + strandedness="auto", read1_extension="_R1_001.fastq.gz", read2_extension="_R2_001.fastq.gz", single_end=False, @@ -154,8 +154,8 @@ def get_fastqs(extension, recursive=False): def main(args=None): args = parse_args(args) - strandedness = "unstranded" - if args.STRANDEDNESS in ["unstranded", "forward", "reverse"]: + strandedness = "auto" + if args.STRANDEDNESS in ["unstranded", "forward", "reverse", "auto"]: strandedness = args.STRANDEDNESS fastq_dir_to_samplesheet( diff --git a/bin/filter_gtf_for_genes_in_genome.py b/bin/filter_gtf_for_genes_in_genome.py index ef4c87cd4..9f876eaa0 100755 --- a/bin/filter_gtf_for_genes_in_genome.py +++ b/bin/filter_gtf_for_genes_in_genome.py @@ -46,7 +46,6 @@ def extract_genes_in_genome(fasta, gtf_in, gtf_out): n_lines_in_genome = 0 with open(gtf_out, "w") as f: with open(gtf_in) as g: - for line in g.readlines(): n_total_lines += 1 seq_name_gtf = line.split("\t")[0] diff --git a/bin/mqc_features_stat.py b/bin/mqc_features_stat.py index c0cb59dc9..689a3f215 100755 --- a/bin/mqc_features_stat.py +++ b/bin/mqc_features_stat.py @@ -24,7 +24,6 @@ def mqc_feature_stat(bfile, features, outfile, sname=None): - # If sample name not given use file name if not sname: sname = os.path.splitext(os.path.basename(bfile))[0] diff --git a/bin/prepare-for-rsem.py b/bin/prepare-for-rsem.py index 95ef24686..f874af792 100755 --- a/bin/prepare-for-rsem.py +++ b/bin/prepare-for-rsem.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 """ -============================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Credits -============================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This script is a clone of the "prepare-for-rsem.py" script written by Ian Sudbury, Tom Smith and other contributors to the UMI-tools package: @@ -19,9 +19,9 @@ Commit: https://github.com/CGATOxford/UMI-tools/blob/bf8608d6a172c5ca0dcf33c126b4e23429177a72/umi_tools/prepare-for-rsem.py -============================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ prepare_for_rsem - make the output from dedup or group compatible with RSEM -=============================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The SAM format specification states that the mnext and mpos fields should point to the primary alignment of a read's mate. However, not all aligners adhere to this standard. 
In addition, the RSEM software requires that the mate of a read1 @@ -68,7 +68,6 @@ def chunk_bam(bamfile): output_buffer = list() for read in bamfile: - if last_query_name is not None and last_query_name != read.query_name: yield (output_buffer) output_buffer = list() @@ -84,7 +83,6 @@ def copy_tags(tags, read1, read2): to read2, if the tag is set""" for tag in tags: - try: read1_tag = read1.get_tag(tag, with_value_type=True) read2.set_tag(tag, value=read1_tag[0], value_type=read1_tag[1]) @@ -122,7 +120,6 @@ def pick_mate(read, template_dict, mate_key): def main(argv=None): - if argv is None: argv = sys.argv @@ -174,7 +171,6 @@ def main(argv=None): options.tags = options.tags.split(",") for template in chunk_bam(inbam): - assert len(set(r.query_name for r in template)) == 1 current_template = {True: defaultdict(list), False: defaultdict(list)} @@ -185,7 +181,6 @@ def main(argv=None): output = set() for read in template: - mate = None # if this read is a non_primary alignment, we first want to check if it has a mate @@ -231,7 +226,6 @@ def main(argv=None): # each pair twice - once when we scan read1 and once when we scan read2. Thus we need # to make sure we don't output something already output. if read.is_read1: - mate = copy_tags(options.tags, read, mate) output_key = str(read) + str(mate) @@ -242,7 +236,6 @@ def main(argv=None): skipped_stats["pairs_output"] += 1 elif read.is_read2: - read = copy_tags(options.tags, mate, read) output_key = str(mate) + str(read) diff --git a/bin/salmon_summarizedexperiment.r b/bin/salmon_summarizedexperiment.r index 32243ed7f..5ebdd317b 100755 --- a/bin/salmon_summarizedexperiment.r +++ b/bin/salmon_summarizedexperiment.r @@ -4,56 +4,65 @@ library(SummarizedExperiment) ## Create SummarizedExperiment (se) object from Salmon counts -args = commandArgs(trailingOnly=TRUE) +args <- commandArgs(trailingOnly = TRUE) if (length(args) < 2) { - stop("Usage: salmon_se.r ", call.=FALSE) + stop("Usage: salmon_se.r ", call. 
= FALSE) } -coldata = args[1] -counts_fn = args[2] -tpm_fn = args[3] +coldata <- args[1] +counts_fn <- args[2] +tpm_fn <- args[3] -tx2gene = "salmon_tx2gene.tsv" -info = file.info(tx2gene) +tx2gene <- "salmon_tx2gene.tsv" +info <- file.info(tx2gene) if (info$size == 0) { - tx2gene = NULL + tx2gene <- NULL } else { - rowdata = read.csv(tx2gene, sep="\t", header = FALSE) - colnames(rowdata) = c("tx", "gene_id", "gene_name") - tx2gene = rowdata[,1:2] + rowdata <- read.csv(tx2gene, sep = "\t", header = FALSE) + colnames(rowdata) <- c("tx", "gene_id", "gene_name") + tx2gene <- rowdata[, 1:2] } -counts = read.csv(counts_fn, row.names=1, sep="\t") -counts = counts[,2:ncol(counts),drop=FALSE] # remove gene_name column -tpm = read.csv(tpm_fn, row.names=1, sep="\t") -tpm = tpm[,2:ncol(tpm),drop=FALSE] # remove gene_name column +counts <- read.csv(counts_fn, row.names = 1, sep = "\t") +counts <- counts[, 2:ncol(counts), drop = FALSE] # remove gene_name column +tpm <- read.csv(tpm_fn, row.names = 1, sep = "\t") +tpm <- tpm[, 2:ncol(tpm), drop = FALSE] # remove gene_name column if (length(intersect(rownames(counts), rowdata[["tx"]])) > length(intersect(rownames(counts), rowdata[["gene_id"]]))) { - by_what = "tx" + by_what <- "tx" } else { - by_what = "gene_id" - rowdata = unique(rowdata[,2:3]) + by_what <- "gene_id" + rowdata <- unique(rowdata[, 2:3]) } if (file.exists(coldata)) { - coldata = read.csv(coldata, sep="\t") - coldata = coldata[match(colnames(counts), coldata[,1]),] - coldata = cbind(files = fns, coldata) + coldata <- read.csv(coldata, sep = "\t") + coldata <- coldata[match(colnames(counts), coldata[, 1]), ] + coldata <- cbind(files = fns, coldata) } else { message("ColData not avaliable ", coldata) - coldata = data.frame(files = colnames(counts), names = colnames(counts)) + coldata <- data.frame(files = colnames(counts), names = colnames(counts)) } -rownames(coldata) = coldata[["names"]] -extra = setdiff(rownames(counts), as.character(rowdata[[by_what]])) +rownames(coldata) <- coldata[["names"]] +extra <- setdiff(rownames(counts), as.character(rowdata[[by_what]])) if (length(extra) > 0) { - rowdata = rbind(rowdata, data.frame(tx=extra, gene_id=extra, gene_name=extra)) + rowdata <- rbind( + rowdata, + data.frame( + tx = extra, + gene_id = extra, + gene_name = extra + )[, colnames(rowdata)] + ) } -rowdata = rowdata[match(rownames(counts), as.character(rowdata[[by_what]])),] -rownames(rowdata) = rowdata[[by_what]] -se = SummarizedExperiment(assays = list(counts = counts, abundance = tpm), - colData = DataFrame(coldata), - rowData = rowdata) +rowdata <- rowdata[match(rownames(counts), as.character(rowdata[[by_what]])), ] +rownames(rowdata) <- rowdata[[by_what]] +se <- SummarizedExperiment( + assays = list(counts = counts, abundance = tpm), + colData = DataFrame(coldata), + rowData = rowdata +) saveRDS(se, file = paste0(tools::file_path_sans_ext(counts_fn), ".rds")) diff --git a/conf/modules.config b/conf/modules.config index 3d1a2c916..0513ef0c2 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -198,59 +198,100 @@ process { // if (!(params.skip_fastqc || params.skip_qc)) { - process { - withName: '.*:FASTQ_FASTQC_UMITOOLS_TRIMGALORE:FASTQC' { - ext.args = '--quiet' + if (params.trimmer == 'trimgalore') { + process { + withName: '.*:FASTQ_FASTQC_UMITOOLS_TRIMGALORE:FASTQC' { + ext.args = '--quiet' + } } } -} -if (!params.skip_trimming) { - process { - withName: '.*:FASTQ_FASTQC_UMITOOLS_TRIMGALORE:TRIMGALORE' { - ext.args = { - [ - "--fastqc_args '-t ${task.cpus}' ", - 
params.trim_nextseq > 0 ? "--nextseq ${params.trim_nextseq}" : '' - ].join(' ').trim() + if (params.trimmer == 'fastp') { + process { + withName: '.*:FASTQ_FASTQC_UMITOOLS_FASTP:FASTQC_RAW' { + ext.args = '--quiet' } - publishDir = [ - [ - path: { "${params.outdir}/trimgalore/fastqc" }, - mode: params.publish_dir_mode, - pattern: "*.{html,zip}" - ], - [ - path: { "${params.outdir}/trimgalore" }, - mode: params.publish_dir_mode, - pattern: "*.fq.gz", - enabled: params.save_trimmed - ], - [ - path: { "${params.outdir}/trimgalore" }, + + withName: '.*:FASTQ_FASTQC_UMITOOLS_FASTP:FASTQC_TRIM' { + ext.args = '--quiet' + publishDir = [ + path: { "${params.outdir}/${params.trimmer}/fastqc" }, mode: params.publish_dir_mode, - pattern: "*.txt" + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - ] + } } + } +} - withName: 'MULTIQC_TSV_FAIL_TRIMMED' { - publishDir = [ - path: { "${params.outdir}/multiqc" }, - enabled: false - ] +if (!params.skip_trimming) { + if (params.trimmer == 'trimgalore') { + process { + withName: '.*:FASTQ_FASTQC_UMITOOLS_TRIMGALORE:TRIMGALORE' { + ext.args = { + [ + "--fastqc_args '-t ${task.cpus}'", + params.extra_trimgalore_args ? params.extra_trimgalore_args.split("\\s(?=--)") : '' + ].flatten().unique(false).join(' ').trim() + } + publishDir = [ + [ + path: { "${params.outdir}/${params.trimmer}/fastqc" }, + mode: params.publish_dir_mode, + pattern: "*.{html,zip}" + ], + [ + path: { "${params.outdir}/${params.trimmer}" }, + mode: params.publish_dir_mode, + pattern: "*.fq.gz", + enabled: params.save_trimmed + ], + [ + path: { "${params.outdir}/${params.trimmer}" }, + mode: params.publish_dir_mode, + pattern: "*.txt" + ] + ] + } + } + } + + if (params.trimmer == 'fastp') { + process { + withName: '.*:FASTQ_FASTQC_UMITOOLS_FASTP:FASTP' { + ext.args = params.extra_fastp_args ?: '' + publishDir = [ + [ + path: { "${params.outdir}/${params.trimmer}" }, + mode: params.publish_dir_mode, + pattern: "*.{json,html}" + ], + [ + path: { "${params.outdir}/${params.trimmer}/log" }, + mode: params.publish_dir_mode, + pattern: "*.log" + ], + [ + path: { "${params.outdir}/${params.trimmer}" }, + mode: params.publish_dir_mode, + pattern: "*.fastq.gz", + enabled: params.save_trimmed + ] + ] + } } } } if (params.with_umi && !params.skip_umi_extract) { process { - withName: '.*:FASTQ_FASTQC_UMITOOLS_TRIMGALORE:UMITOOLS_EXTRACT' { + withName: 'UMITOOLS_EXTRACT' { ext.args = [ params.umitools_extract_method ? "--extract-method=${params.umitools_extract_method}" : '', params.umitools_bc_pattern ? "--bc-pattern='${params.umitools_bc_pattern}'" : '', - params.umitools_bc_pattern2 ? "--bc-pattern2='${params.umitools_bc_pattern2}'" : '' - ].join(' ').trim() + params.umitools_bc_pattern2 ? "--bc-pattern2='${params.umitools_bc_pattern2}'" : '', + params.umitools_umi_separator ? "--umi-separator='${params.umitools_umi_separator}'" : '' + ].join(' ').trim() publishDir = [ [ path: { "${params.outdir}/umitools" }, @@ -513,17 +554,6 @@ if (!params.skip_alignment) { } } } - - if (params.aligner.contains('star')) { - process { - withName: 'MULTIQC_TSV_FAIL_MAPPED' { - publishDir = [ - path: { "${params.outdir}/multiqc" }, - enabled: false - ] - } - } - } } // @@ -545,8 +575,8 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') { '--quantTranscriptomeBan Singleend', '--outSAMstrandField intronMotif', params.save_unaligned ? '--outReadsUnmapped Fastx' : '', - params.extra_star_align_args ?: '' - ].join(' ').trim() + params.extra_star_align_args ? 
params.extra_star_align_args.split("\\s(?=--)") : '' + ].flatten().unique(false).join(' ').trim() publishDir = [ [ path: { "${params.outdir}/${params.aligner}/log" }, @@ -609,16 +639,32 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') { ext.prefix = { "${meta.id}.umi_dedup.transcriptome" } publishDir = [ path: { "${params.outdir}/${params.aligner}" }, - enabled: false + mode: params.publish_dir_mode, + pattern: '*.bam', + enabled: ( + params.save_align_intermeds || + params.save_umi_intermeds + ) ] } withName: 'NFCORE_RNASEQ:RNASEQ:UMITOOLS_PREPAREFORSALMON' { ext.prefix = { "${meta.id}.umi_dedup.transcriptome.filtered" } publishDir = [ - path: { "${params.outdir}/${params.aligner}/umitools/log" }, - mode: params.publish_dir_mode, - pattern: '*.log' + [ + path: { "${params.outdir}/${params.aligner}/umitools/log" }, + mode: params.publish_dir_mode, + pattern: '*.log' + ], + [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + pattern: '*.bam', + enabled: ( + params.save_align_intermeds || + params.save_umi_intermeds + ) + ] ] } @@ -626,22 +672,37 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') { ext.prefix = { "${meta.id}.transcriptome.sorted" } publishDir = [ path: { "${params.outdir}/${params.aligner}" }, - enabled: false + mode: params.publish_dir_mode, + pattern: '*.bam', + enabled: ( + params.save_align_intermeds || + params.save_umi_intermeds + ) ] } withName: 'NFCORE_RNASEQ:RNASEQ:BAM_SORT_STATS_SAMTOOLS:SAMTOOLS_INDEX' { publishDir = [ path: { "${params.outdir}/${params.aligner}" }, - enabled: false + mode: params.publish_dir_mode, + pattern: '*.bai', + enabled: ( + params.save_align_intermeds || + params.save_umi_intermeds + ) ] } withName: 'NFCORE_RNASEQ:RNASEQ:BAM_SORT_STATS_SAMTOOLS:BAM_STATS_SAMTOOLS:.*' { ext.prefix = { "${meta.id}.transcriptome.sorted.bam" } publishDir = [ - path: { "${params.outdir}/${params.aligner}" }, - enabled: false + path: { "${params.outdir}/${params.aligner}/samtools_stats" }, + mode: params.publish_dir_mode, + pattern: '*.{stats,flagstat,idxstats}', + enabled: ( + params.save_align_intermeds || + params.save_umi_intermeds + ) ] } @@ -649,20 +710,36 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') { ext.args = { [ meta.single_end ? '' : '--unpaired-reads=discard --chimeric-pairs=discard', params.umitools_grouping_method ? "--method='${params.umitools_grouping_method}'" : '', - params.umitools_umi_separator ? "--umi-separator='${params.umitools_umi_separator}'" : '', + params.umitools_umi_separator ? 
"--umi-separator='${params.umitools_umi_separator}'" : '' ].join(' ').trim() } ext.prefix = { "${meta.id}.umi_dedup.transcriptome.sorted" } publishDir = [ - path: { "${params.outdir}/${params.aligner}/umitools" }, - mode: params.publish_dir_mode, - pattern: '*.tsv' + [ + path: { "${params.outdir}/${params.aligner}/umitools" }, + mode: params.publish_dir_mode, + pattern: '*.tsv' + ], + [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + pattern: '*.bam', + enabled: ( + params.save_align_intermeds || + params.save_umi_intermeds + ) + ] ] } withName: '.*:BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_TRANSCRIPTOME:SAMTOOLS_INDEX' { publishDir = [ path: { "${params.outdir}/${params.aligner}" }, - enabled: false + mode: params.publish_dir_mode, + pattern: '*.bai', + enabled: ( + params.save_align_intermeds || + params.save_umi_intermeds + ) ] } @@ -1038,17 +1115,6 @@ if (!params.skip_alignment && !params.skip_qc) { } } } - - if (!params.skip_rseqc && rseqc_modules) { - process { - withName: 'MULTIQC_TSV_STRAND_CHECK' { - publishDir = [ - path: { "${params.outdir}/multiqc" }, - enabled: false - ] - } - } - } } if (!params.skip_multiqc) { diff --git a/conf/test.config b/conf/test.config index cb7fc562d..57811683c 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,19 +20,19 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/v3.10/samplesheet_test.csv' + input = "${params.test_data_base}/samplesheet/v3.10/samplesheet_test.csv" // Genome references - fasta = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/genome.fasta' - gtf = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/genes.gtf.gz' - gff = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/genes.gff.gz' - transcript_fasta = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/transcriptome.fasta' - additional_fasta = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/gfp.fa.gz' - - bbsplit_fasta_list = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/bbsplit_fasta_list.txt' - hisat2_index = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/hisat2.tar.gz' - salmon_index = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/salmon.tar.gz' - rsem_index = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/rsem.tar.gz' + fasta = "${params.test_data_base}/reference/genome.fasta" + gtf = "${params.test_data_base}/reference/genes.gtf.gz" + gff = "${params.test_data_base}/reference/genes.gff.gz" + transcript_fasta = "${params.test_data_base}/reference/transcriptome.fasta" + additional_fasta = "${params.test_data_base}/reference/gfp.fa.gz" + + bbsplit_fasta_list = "${params.test_data_base}/reference/bbsplit_fasta_list.txt" + hisat2_index = "${params.test_data_base}/reference/hisat2.tar.gz" + salmon_index = "${params.test_data_base}/reference/salmon.tar.gz" + rsem_index = "${params.test_data_base}/reference/rsem.tar.gz" // Other parameters skip_bbsplit = false diff --git a/docs/images/mqc_fastp_plot.png b/docs/images/mqc_fastp_plot.png new file mode 100755 index 000000000..798539ca3 Binary files /dev/null and b/docs/images/mqc_fastp_plot.png differ diff --git a/docs/images/nf-core-rnaseq_metro_map_grey.png b/docs/images/nf-core-rnaseq_metro_map_grey.png index 4df8c4822..0a3645f81 100644 Binary files a/docs/images/nf-core-rnaseq_metro_map_grey.png and b/docs/images/nf-core-rnaseq_metro_map_grey.png differ diff --git 
a/docs/images/nf-core-rnaseq_metro_map_grey.svg b/docs/images/nf-core-rnaseq_metro_map_grey.svg
index 073b02098..2e2a07374 100644
--- a/docs/images/nf-core-rnaseq_metro_map_grey.svg
+++ b/docs/images/nf-core-rnaseq_metro_map_grey.svg
@@ -2,9 +2,9 @@
[SVG text diff omitted: in the metro map's pre-processing stage, the "UMI-tools extract / TrimGalore! / FastQC" stations are updated to "UMI-tools extract / FastQC / FastP".]
diff --git a/docs/output.md b/docs/output.md
index 80b41edf0..aee68700f 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -19,6 +19,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 - [FastQC](#fastqc) - Raw read QC
 - [UMI-tools extract](#umi-tools-extract) - UMI barcode extraction
 - [TrimGalore](#trimgalore) - Adapter and quality trimming
+ - [fastp](#fastp) - Adapter and quality trimming
 - [BBSplit](#bbsplit) - Removal of genome contaminants
 - [SortMeRNA](#sortmerna) - Removal of ribosomal RNA
 - [Alignment and quantification](#alignment-and-quantification)
@@ -110,12 +111,33 @@ To facilitate processing of input data which has the UMI barcode already embedde

-[Trim Galore!](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/) is a wrapper tool around Cutadapt and FastQC to peform quality and adapter trimming on FastQ files. By default, Trim Galore! will automatically detect and trim the appropriate adapter sequence.
+[Trim Galore!](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/) is a wrapper tool around Cutadapt and FastQC to perform quality and adapter trimming on FastQ files. Trim Galore! will automatically detect and trim the appropriate adapter sequence. It is the default trimming tool used by this pipeline; however, you can use fastp instead by specifying the `--trimmer fastp` parameter. You can specify additional options for Trim Galore! via the `--extra_trimgalore_args` parameter.

 > **NB:** TrimGalore! will only run using multiple cores if you are able to use more than 5 and 6 CPUs for single- and paired-end data, respectively. The total cores available to TrimGalore! will also be capped at 4 (7 and 8 CPUs in total for single- and paired-end data, respectively) because there is no longer a run-time benefit. See [release notes](https://github.com/FelixKrueger/TrimGalore/blob/master/Changelog.md#version-060-release-on-1-mar-2019) and [discussion whilst adding this logic to the nf-core/atacseq pipeline](https://github.com/nf-core/atacseq/pull/65).

 ![MultiQC - cutadapt trimmed sequence length plot](images/mqc_cutadapt_trimmed.png)

+### fastp
+
+<details markdown="1">
+<details markdown="1"> +<summary>Output files</summary> + +- `fastp/` + - `*.fastq.gz`: If `--save_trimmed` is specified, FastQ files **after** adapter trimming will be placed in this directory. + - `*.fastp.html`: Trimming report in HTML format. + - `*.fastp.json`: Trimming report in JSON format. +- `fastp/log/` + - `*.fastp.log`: Trimming log file. +- `fastp/fastqc/` + - `*_fastqc.html`: FastQC report containing quality metrics for read 1 (_and read 2 if paired-end_) **after** adapter trimming. + - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. +</details>
+ +[fastp](https://github.com/OpenGene/fastp) is a tool designed to provide fast, all-in-one preprocessing for FastQ files. It has been developed in C++ with multithreading support to achieve higher performance. fastp can be used in this pipeline for standard adapter trimming and quality filtering by setting the `--trimmer fastp` parameter. You can specify additional options for fastp via the `--extra_fastp_args` parameter. + +![MultiQC - fastp filtered reads plot](images/mqc_fastp_plot.png) + ### BBSplit
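The fastp documentation above amounts to a single extra flag at run time. As a minimal sketch of the corresponding command line (the samplesheet path and the `--trim_poly_g` option are illustrative assumptions, not taken from this diff; `--trimmer`, `--extra_fastp_args` and `--save_trimmed` come from the docs being added):

```bash
# Run the pipeline with fastp instead of the default Trim Galore! trimmer.
# --extra_fastp_args forwards raw options to fastp, and --save_trimmed keeps
# the trimmed FastQ files under the fastp/ results directory.
nextflow run nf-core/rnaseq \
    -profile docker \
    --input samplesheet.csv \
    --outdir ./results \
    --trimmer fastp \
    --extra_fastp_args '--trim_poly_g' \
    --save_trimmed
```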
diff --git a/docs/usage.md b/docs/usage.md index 49d26a8e4..a008251dd 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -4,6 +4,10 @@ > _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._ +## Pipeline parameters + +Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration except for parameters; see [docs](https://nf-co.re/usage/configuration#custom-configuration-files). + ## Samplesheet input You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 4 columns, and a header row as shown in the examples below. @@ -51,6 +55,12 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p > **NB:** The `group` and `replicate` columns were replaced with a single `sample` column as of v3.1 of the pipeline. The `sample` column is essentially a concatenation of the `group` and `replicate` columns, however it now also offers more flexibility in instances where replicate information is not required e.g. when sequencing clinical samples. If all values of `sample` have the same number of underscores, fields defined by these underscore-separated names may be used in the PCA plots produced by the pipeline, to regain the ability to represent different groupings. +## Adapter trimming options + +[Trim Galore!](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/) is a wrapper tool around Cutadapt and FastQC to perform quality and adapter trimming on FastQ files. Trim Galore! will automatically detect and trim the appropriate adapter sequence. It is the default trimming tool used by this pipeline; however, you can use fastp instead by specifying the `--trimmer fastp` parameter. [fastp](https://github.com/OpenGene/fastp) is a tool designed to provide fast, all-in-one preprocessing for FastQ files. It has been developed in C++ with multithreading support to achieve higher performance. You can specify additional options for Trim Galore! and fastp via the `--extra_trimgalore_args` and `--extra_fastp_args` parameters, respectively. + +> **NB:** TrimGalore! will only run using multiple cores if you are able to use more than 5 and 6 CPUs for single- and paired-end data, respectively. The total cores available to TrimGalore! will also be capped at 4 (7 and 8 CPUs in total for single- and paired-end data, respectively) because there is no longer a run-time benefit. See [release notes](https://github.com/FelixKrueger/TrimGalore/blob/master/Changelog.md#version-060-release-on-1-mar-2019) and [discussion whilst adding this logic to the nf-core/atacseq pipeline](https://github.com/nf-core/atacseq/pull/65). + ## Alignment options By default, the pipeline uses [STAR](https://github.com/alexdobin/STAR) (i.e. `--aligner star_salmon`) to map the raw FastQ reads to the reference genome, project the alignments onto the transcriptome and to perform the downstream BAM-level quantification with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html). STAR is fast but requires a lot of memory to run, typically around 38GB for the Human GRCh37 reference genome. Since the [RSEM](https://github.com/deweylab/RSEM) (i.e.
`--aligner star_rsem`) workflow in the pipeline also uses STAR you should use the [HISAT2](https://ccb.jhu.edu/software/hisat2/index.shtml) aligner (i.e. `--aligner hisat2`) if you have memory limitations. @@ -79,10 +89,45 @@ The `--umitools_grouping_method` parameter affects [how similar, but non-identic #### Examples: -| UMI type | Source | Pipeline parameters | -| ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------- | -| In read name | [Illumina BCL convert >3.7.5](https://emea.support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl_convert/bcl-convert-v3-7-5-software-guide-1000000163594-00.pdf) | `--with_umi --skip_umi_extract --umitools_umi_separator ":"` | -| In sequence | [Takara Bio SMARTer® Stranded Total RNA-Seq Kit v3](https://www.takarabio.com/documents/User%20Manual/SMARTer%20Stranded%20Total%20RNA/SMARTer%20Stranded%20Total%20RNA-Seq%20Kit%20v3%20-%20Pico%20Input%20Mammalian%20User%20Manual-a_114949.pdf) | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern2 "^(?P<umi_1>.{8})(?P<discard_1>.{6}).*"` | +| UMI type | Source | Pipeline parameters | +| ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| In read name | [Illumina BCL convert >3.7.5](https://emea.support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl_convert/bcl-convert-v3-7-5-software-guide-1000000163594-00.pdf) | `--with_umi --skip_umi_extract --umitools_umi_separator ":"` | +| In sequence | [Lexogen QuantSeq® 3’ mRNA-Seq V2 FWD](https://www.lexogen.com/quantseq-3mrna-sequencing) + [UMI Second Strand Synthesis Module](https://faqs.lexogen.com/faq/how-can-i-add-umis-to-my-quantseq-libraries) | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{6})(?P<discard_1>.{4}).*"` | +| In sequence | [Lexogen CORALL® Total RNA-Seq V1](https://www.lexogen.com/corall-total-rna-seq/)
> _mind [Appendix H](https://www.lexogen.com/wp-content/uploads/2020/04/095UG190V0130_CORALL-Total-RNA-Seq_2020-03-31.pdf) regarding optional trimming_ | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{12}).*"`
Optional: `--clip_r2 9 --three_prime_clip_r2 12` | +| In sequence | [Takara Bio SMARTer® Stranded Total RNA-Seq Kit v3](https://www.takarabio.com/documents/User%20Manual/SMARTer%20Stranded%20Total%20RNA/SMARTer%20Stranded%20Total%20RNA-Seq%20Kit%20v3%20-%20Pico%20Input%20Mammalian%20User%20Manual-a_114949.pdf) | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern2 "^(?P<umi_1>.{8})(?P<discard_1>.{6}).*"` | + +> _No warranty for the accuracy or completeness of the parameters is implied_ + +### 3′ digital gene expression assays + +Some bulk RNA-seq library preparation protocols capture only a 3' tag from each transcript, e.g. [3'Pool-seq](https://pubmed.ncbi.nlm.nih.gov/31959126/), [DRUG-seq](https://pubs.acs.org/doi/10.1021/acschembio.1c00920), [BRB-seq](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1671-x) or Lexogen's commercial [QuantSeq 3' mRNA-seq FWD](https://www.lexogen.com/quantseq-3mrna-sequencing/) protocol. The following parameters have been validated for `QuantSeq 3' mRNA-seq FWD` data, and provide useful starting points for other 3' RNA-seq protocols: + +#### Custom STAR parameters + +Lexogen provides an example analysis workflow [on their website](https://www.lexogen.com/quantseq-data-analysis/), which includes the _ENCODE standard options_ for the [STAR aligner](https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf). In addition, Lexogen also decreases the tolerance for mismatches and clips poly(A) tails. To apply these settings, add the following parameters when running the pipeline: + +``` +--extra_star_align_args "--alignIntronMax 1000000 --alignIntronMin 20 --alignMatesGapMax 1000000 --alignSJoverhangMin 8 --outFilterMismatchNmax 999 --outFilterMultimapNmax 20 --outFilterType BySJout --outFilterMismatchNoverLmax 0.1 --clip3pAdapterSeq AAAAAAAA" +``` + +#### Custom Salmon arguments + +[Salmon's default quantitation algorithm](https://www.nature.com/articles/nmeth.4197) takes into account transcript length. +Because 3' tag protocols do not capture full transcripts, this feature needs to be deactivated by specifying: + +``` +--extra_salmon_quant_args "--noLengthCorrection" +``` + +#### QuantSeq analysis with UMIs + +If unique molecular identifiers were used to prepare the library, add the following arguments as well, to extract the UMIs and deduplicate alignments: + +``` +--with_umi +--umitools_extract_method regex +--umitools_bc_pattern "^(?P<umi_1>.{6})(?P<discard_1>.{4}).*" +``` ## Reference genome files diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy index 33cd4f6e8..4d2968143 100755 --- a/lib/NfcoreSchema.groovy +++ b/lib/NfcoreSchema.groovy @@ -2,6 +2,7 @@ // This file holds several functions used to perform JSON parameter validation, help and summary rendering for the nf-core pipeline template.
// +import nextflow.Nextflow import org.everit.json.schema.Schema import org.everit.json.schema.loader.SchemaLoader import org.everit.json.schema.ValidationException @@ -177,7 +178,7 @@ class NfcoreSchema { } if (has_error) { - System.exit(1) + Nextflow.error('Exiting!') } } diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy index bec83dfd2..bce1492b9 100755 --- a/lib/NfcoreTemplate.groovy +++ b/lib/NfcoreTemplate.groovy @@ -32,6 +32,21 @@ class NfcoreTemplate { } } + // + // Warn if using custom configs to provide pipeline parameters + // + public static void warnParamsProvidedInConfig(workflow, log) { + if (workflow.configFiles.size() > 1) { + log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + " Multiple config files detected!\n" + + " Please provide pipeline parameters via the CLI or Nextflow '-params-file' option.\n" + + " Custom config files including those provided by the '-c' Nextflow option can be\n" + + " used to provide any configuration except for parameters.\n\n" + + " Docs: https://nf-co.re/usage/configuration#custom-configuration-files\n" + + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + } + } + // // Generate version string // @@ -54,12 +69,16 @@ class NfcoreTemplate { // // Construct and send completion email // - public static void email(workflow, params, summary_params, projectDir, log, multiqc_report=[], fail_percent_mapped=[:]) { + public static void email(workflow, params, summary_params, projectDir, log, multiqc_report=[], pass_mapped_reads=[:], pass_trimmed_reads=[:], pass_strand_check=[:]) { // Set up the e-mail variables + def fail_mapped_count = pass_mapped_reads.count { key, value -> value == false } + def fail_trimmed_count = pass_trimmed_reads.count { key, value -> value == false } + def fail_strand_count = pass_strand_check.count { key, value -> value == false } + def subject = "[$workflow.manifest.name] Successful: $workflow.runName" - if (fail_percent_mapped.size() > 0) { - subject = "[$workflow.manifest.name] Partially successful (${fail_percent_mapped.size()} skipped): $workflow.runName" + if (fail_mapped_count + fail_trimmed_count + fail_strand_count > 0) { + subject = "[$workflow.manifest.name] Partially successful - samples skipped: $workflow.runName" } if (!workflow.success) { subject = "[$workflow.manifest.name] FAILED: $workflow.runName" @@ -94,8 +113,7 @@ class NfcoreTemplate { email_fields['commandLine'] = workflow.commandLine email_fields['projectDir'] = workflow.projectDir email_fields['summary'] = summary << misc_fields - email_fields['fail_percent_mapped'] = fail_percent_mapped.keySet() - email_fields['min_mapped_reads'] = params.min_mapped_reads + email_fields['skip_sample_count'] = fail_mapped_count + fail_trimmed_count + fail_strand_count // On success try attach the multiqc report def mqc_report = null @@ -230,36 +248,32 @@ class NfcoreTemplate { // // Print pipeline summary on completion // - public static void summary(workflow, params, log, fail_percent_mapped=[:], pass_percent_mapped=[:]) { + public static void summary(workflow, params, log, pass_mapped_reads=[:], pass_trimmed_reads=[:], pass_strand_check=[:]) { Map colors = logColours(params.monochrome_logs) - def total_aln_count = pass_percent_mapped.size() + fail_percent_mapped.size() - if (pass_percent_mapped.size() > 0) { - def idx = 0 - def samp_aln = '' - for (samp in pass_percent_mapped) { - samp_aln += " ${samp.value}%: ${samp.key}\n" - idx += 1 - if (idx > 5) { - samp_aln += " ..see pipeline 
reports for full list\n" - break; - } + def fail_mapped_count = pass_mapped_reads.count { key, value -> value == false } + def fail_trimmed_count = pass_trimmed_reads.count { key, value -> value == false } + def fail_strand_count = pass_strand_check.count { key, value -> value == false } + if (workflow.success) { + def color = colors.green + def status = [] + if (workflow.stats.ignoredCount != 0) { + color = colors.yellow + status += ['with errored process(es)'] } - log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} ${pass_percent_mapped.size()}/$total_aln_count samples passed STAR ${params.min_mapped_reads}% mapped threshold:\n${samp_aln}${colors.reset}-" - } - if (fail_percent_mapped.size() > 0) { - def samp_aln = '' - for (samp in fail_percent_mapped) { - samp_aln += " ${samp.value}%: ${samp.key}\n" + if (fail_mapped_count > 0 || fail_trimmed_count > 0 || fail_strand_count > 0) { + color = colors.yellow + status += ['with skipped sample(s)'] } - log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} ${fail_percent_mapped.size()}/$total_aln_count samples skipped since they failed STAR ${params.min_mapped_reads}% mapped threshold:\n${samp_aln}${colors.reset}-" - } - - if (workflow.success) { - if (workflow.stats.ignoredCount == 0) { - log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Pipeline completed successfully${colors.reset}-" - } else { - log.info "-${colors.purple}[$workflow.manifest.name]${colors.yellow} Pipeline completed successfully, but with errored process(es) ${colors.reset}-" + log.info "-${colors.purple}[$workflow.manifest.name]${color} Pipeline completed successfully ${status.join(', ')}${colors.reset}-" + if (fail_trimmed_count > 0) { + log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Please check MultiQC report: ${fail_trimmed_count}/${pass_trimmed_reads.size()} samples skipped since they failed ${params.min_trimmed_reads} trimmed read threshold.${colors.reset}-" + } + if (fail_mapped_count > 0) { + log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Please check MultiQC report: ${fail_mapped_count}/${pass_mapped_reads.size()} samples skipped since they failed STAR ${params.min_mapped_reads}% mapped threshold.${colors.reset}-" + } + if (fail_strand_count > 0) { + log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Please check MultiQC report: ${fail_strand_count}/${pass_strand_check.size()} samples failed strandedness check.${colors.reset}-" } } else { log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed with errors${colors.reset}-" diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index d665b356c..eaf9bce23 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -2,6 +2,8 @@ // This file holds several functions specific to the main.nf workflow in the nf-core/rnaseq pipeline // +import nextflow.Nextflow + class WorkflowMain { // @@ -62,6 +64,9 @@ class WorkflowMain { // Print parameter summary log to screen log.info paramsSummaryLog(workflow, params, log) + // Warn about using custom configs to provide pipeline parameters + NfcoreTemplate.warnParamsProvidedInConfig(workflow, log) + // Validate workflow parameters via the JSON schema if (params.validate_params) { NfcoreSchema.validateParameters(workflow, params, log) @@ -80,8 +85,7 @@ class WorkflowMain { // Check input has been provided if (!params.input) { - log.error "Please provide an input samplesheet to the pipeline e.g.
'--input samplesheet.csv'" - System.exit(1) + Nextflow.error("Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'") } } // diff --git a/lib/WorkflowRnaseq.groovy b/lib/WorkflowRnaseq.groovy index 09e324a2a..ff577917b 100755 --- a/lib/WorkflowRnaseq.groovy +++ b/lib/WorkflowRnaseq.groovy @@ -2,6 +2,7 @@ // This file holds several functions specific to the workflow/rnaseq.nf in the nf-core/rnaseq pipeline // +import nextflow.Nextflow import groovy.json.JsonSlurper import groovy.text.SimpleTemplateEngine @@ -15,13 +16,11 @@ class WorkflowRnaseq { if (!params.fasta) { - log.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." - System.exit(1) + Nextflow.error("Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file.") } if (!params.gtf && !params.gff) { - log.error "No GTF or GFF3 annotation specified! The pipeline requires at least one of these files." - System.exit(1) + Nextflow.error("No GTF or GFF3 annotation specified! The pipeline requires at least one of these files.") } if (params.gtf) { @@ -41,45 +40,43 @@ class WorkflowRnaseq { } if (!params.skip_bbsplit && !params.bbsplit_index && !params.bbsplit_fasta_list) { - log.error "Please provide either --bbsplit_fasta_list / --bbsplit_index to run BBSplit." - System.exit(1) + Nextflow.error("Please provide either --bbsplit_fasta_list / --bbsplit_index to run BBSplit.") } if (params.remove_ribo_rna && !params.ribo_database_manifest) { - log.error "Please provide --ribo_database_manifest to remove ribosomal RNA with SortMeRNA." - System.exit(1) + Nextflow.error("Please provide --ribo_database_manifest to remove ribosomal RNA with SortMeRNA.") } if (params.with_umi && !params.skip_umi_extract) { if (!params.umitools_bc_pattern && !params.umitools_bc_pattern2) { - log.error "UMI-tools requires a barcode pattern to extract barcodes from the reads." - System.exit(1) + Nextflow.error("UMI-tools requires a barcode pattern to extract barcodes from the reads.") } } + if (!params.skip_trimming) { + if (!valid_params['trimmers'].contains(params.trimmer)) { + Nextflow.error("Invalid option: '${params.trimmer}'. Valid options for '--trimmer': ${valid_params['trimmers'].join(', ')}.") + } + } if (!params.skip_alignment) { if (!valid_params['aligners'].contains(params.aligner)) { - log.error "Invalid option: '${params.aligner}'. Valid options for '--aligner': ${valid_params['aligners'].join(', ')}." - System.exit(1) + Nextflow.error("Invalid option: '${params.aligner}'. Valid options for '--aligner': ${valid_params['aligners'].join(', ')}.") } } else { if (!params.pseudo_aligner) { - log.error "--skip_alignment specified without --pseudo_aligner...please specify e.g. --pseudo_aligner ${valid_params['pseudoaligners'][0]}." - System.exit(1) + Nextflow.error("--skip_alignment specified without --pseudo_aligner...please specify e.g. --pseudo_aligner ${valid_params['pseudoaligners'][0]}.") } skipAlignmentWarn(log) } if (params.pseudo_aligner) { if (!valid_params['pseudoaligners'].contains(params.pseudo_aligner)) { - log.error "Invalid option: '${params.pseudo_aligner}'. Valid options for '--pseudo_aligner': ${valid_params['pseudoaligners'].join(', ')}." - System.exit(1) + Nextflow.error("Invalid option: '${params.pseudo_aligner}'. 
Valid options for '--pseudo_aligner': ${valid_params['pseudoaligners'].join(', ')}.") } else { if (!(params.salmon_index || params.transcript_fasta || (params.fasta && (params.gtf || params.gff)))) { - log.error "To use `--pseudo_aligner 'salmon'`, you must provide either --salmon_index or --transcript_fasta or both --fasta and --gtf / --gff." - System.exit(1) + Nextflow.error("To use `--pseudo_aligner 'salmon'`, you must provide either --salmon_index or --transcript_fasta or both --fasta and --gtf / --gff.") } } } @@ -114,8 +111,7 @@ class WorkflowRnaseq { // Check which RSeQC modules we are running def rseqc_modules = params.rseqc_modules ? params.rseqc_modules.split(',').collect{ it.trim().toLowerCase() } : [] if ((valid_params['rseqc_modules'] + rseqc_modules).unique().size() != valid_params['rseqc_modules'].size()) { - log.error "Invalid option: ${params.rseqc_modules}. Valid options for '--rseqc_modules': ${valid_params['rseqc_modules'].join(', ')}" - System.exit(1) + Nextflow.error("Invalid option: ${params.rseqc_modules}. Valid options for '--rseqc_modules': ${valid_params['rseqc_modules'].join(', ')}") } } @@ -133,12 +129,12 @@ class WorkflowRnaseq { if (hits) { return true } else { - log.warn "=============================================================================\n" + + log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " Biotype attribute '${biotype}' not found in the last column of the GTF file!\n\n" + " Biotype QC will be skipped to circumvent the issue below:\n" + " https://github.com/nf-core/rnaseq/issues/460\n\n" + " Amend '--featurecounts_group_type' to change this behaviour.\n" + - "===================================================================================" + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" return false } } @@ -153,14 +149,14 @@ class WorkflowRnaseq { def chrom = lspl[0] def size = lspl[1] if (size.toInteger() > max_size) { - log.error "=============================================================================\n" + + def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " Contig longer than ${max_size}bp found in reference genome!\n\n" + " ${chrom}: ${size}\n\n" + " Provide the '--bam_csi_index' parameter to use a CSI instead of BAI index.\n\n" + " Please see:\n" + " https://github.com/nf-core/rnaseq/issues/744\n" + - "=============================================================================" - System.exit(1) + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + Nextflow.error(error_string) } } } @@ -273,6 +269,18 @@ class WorkflowRnaseq { return yaml_file_text } + // + // Create MultiQC tsv custom content from a list of values + // + public static String multiqcTsvFromList(tsv_data, header) { + def tsv_string = "" + if (tsv_data.size() > 0) { + tsv_string += "${header.join('\t')}\n" + tsv_string += tsv_data.join('\n') + } + return tsv_string + } + public static String methodsDescriptionText(run_workflow, mqc_methods_yaml) { // Convert to a named map so can be used as with familar NXF ${workflow} variable syntax in the MultiQC YML file def meta = [:] @@ -288,17 +296,19 @@ class WorkflowRnaseq { def description_html = engine.createTemplate(methods_text).make(meta) return description_html - }// + } + + // // Exit pipeline if incorrect --genome key provided // private static void genomeExistsError(params, log) { if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { - 
log.error "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " Genome '${params.genome}' not found in any config files provided to the pipeline.\n" + " Currently, the available genome keys are:\n" + " ${params.genomes.keySet().join(", ")}\n" + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - System.exit(1) + Nextflow.error(error_string) } } @@ -306,92 +316,92 @@ class WorkflowRnaseq { // Print a warning if using GRCh38 assembly from igenomes.config // private static void ncbiGenomeWarn(log) { - log.warn "=============================================================================\n" + + log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " When using '--genome GRCh38' the assembly is from the NCBI and NOT Ensembl.\n" + " Biotype QC will be skipped to circumvent the issue below:\n" + " https://github.com/nf-core/rnaseq/issues/460\n\n" + " If you would like to use the soft-masked Ensembl assembly instead please see:\n" + " https://github.com/nf-core/rnaseq/issues/159#issuecomment-501184312\n" + - "===================================================================================" + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" } // // Print a warning if using a UCSC assembly from igenomes.config // private static void ucscGenomeWarn(log) { - log.warn "=============================================================================\n" + + log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " When using UCSC assemblies the 'gene_biotype' field is absent from the GTF file.\n" + " Biotype QC will be skipped to circumvent the issue below:\n" + " https://github.com/nf-core/rnaseq/issues/460\n\n" + " If you would like to use the soft-masked Ensembl assembly instead please see:\n" + " https://github.com/nf-core/rnaseq/issues/159#issuecomment-501184312\n" + - "===================================================================================" + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" } // // Print a warning if both GTF and GFF have been provided // private static void gtfGffWarn(log) { - log.warn "=============================================================================\n" + + log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " Both '--gtf' and '--gff' parameters have been provided.\n" + " Using GTF file as priority.\n" + - "===================================================================================" + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" } // // Print a warning if using '--transcript_fasta' // private static void transcriptsFastaWarn(log) { - log.warn "=============================================================================\n" + + log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " '--transcript_fasta' parameter has been provided.\n" + " Make sure transcript names in this file match those in the GFF/GTF file.\n\n" + " Please see:\n" + " https://github.com/nf-core/rnaseq/issues/753\n" + - "===================================================================================" + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" } // // Print a warning if --skip_alignment has been provided // private static void skipAlignmentWarn(log) { 
- log.warn "=============================================================================\n" + + log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " '--skip_alignment' parameter has been provided.\n" + " Skipping alignment, genome-based quantification and all downstream QC processes.\n" + - "===================================================================================" + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" } // // Print a warning if using '--aligner star_rsem' and '--with_umi' // private static void rsemUmiError(log) { - log.error "=============================================================================\n" + + def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " When using '--aligner star_rsem', STAR is run by RSEM itself and so it is\n" + " not possible to remove UMIs before the quantification.\n\n" + " If you would like to remove UMI barcodes using the '--with_umi' option\n" + " please use either '--aligner star_salmon' or '--aligner hisat2'.\n" + - "=============================================================================" - System.exit(1) + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + Nextflow.error(error_string) } // // Print a warning if using '--aligner star_rsem' and providing both '--rsem_index' and '--star_index' // private static void rsemStarIndexWarn(log) { - log.warn "=============================================================================\n" + + log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " When using '--aligner star_rsem', both the STAR and RSEM indices should\n" + " be present in the path specified by '--rsem_index'.\n\n" + " This warning has been generated because you have provided both\n" + " '--rsem_index' and '--star_index'. 
The pipeline will ignore the latter.\n\n" + " Please see:\n" + " https://github.com/nf-core/rnaseq/issues/568\n" + - "===================================================================================" + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" } // // Print a warning if using '--additional_fasta <FASTA_FILE>' and '--<ALIGNER>_index <INDEX>' // private static void additionaFastaIndexWarn(index, log) { - log.warn "=============================================================================\n" + + log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " When using '--additional_fasta <FASTA_FILE>' the aligner index will not\n" + " be re-built with the transgenes incorporated by default since you have \n" + " already provided an index via '--${index}_index <INDEX>'.\n\n" + @@ -401,6 +411,6 @@ class WorkflowRnaseq { " Ignore this warning if you know that the index already contains transgenes.\n\n" + " Please see:\n" + " https://github.com/nf-core/rnaseq/issues/556\n" + - "===================================================================================" + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" } } diff --git a/modules.json b/modules.json index 0403e1f6a..018c1ecae 100644 --- a/modules.json +++ b/modules.json @@ -7,7 +7,7 @@ "nf-core": { "bbmap/bbsplit": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "75027bf77472b1f4fd2cdd7e46f83119dfb0f2c6", "installed_by": ["modules"] }, "cat/fastq": { @@ -17,24 +17,28 @@ }, "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "b6d4d476aee074311c89d82a69c1921bd70c8180", "installed_by": ["modules"] }, "custom/getchromsizes": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"], - "patch": "modules/nf-core/custom/getchromsizes/custom-getchromsizes.diff" + "installed_by": ["modules"] + }, + "fastp": { + "branch": "master", + "git_sha": "20a508676f40d0fd3f911ac595af91ec845704c4", + "installed_by": ["modules", "fastq_fastqc_umitools_fastp"] }, "fastqc": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["fastq_fastqc_umitools_trimgalore"] + "installed_by": ["fastq_fastqc_umitools_trimgalore", "fastq_fastqc_umitools_fastp"] }, "fq/subsample": { "branch": "master", - "git_sha": "ad462aa294faf9a8c42688a08daf81a580594f70", - "installed_by": ["modules", "fastq_subsample_fq_salmon"] + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", + "installed_by": ["fastq_subsample_fq_salmon", "modules"] }, "gffread": { "branch": "master", @@ -43,7 +47,7 @@ }, "gunzip": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["modules"] }, "hisat2/align": { @@ -63,27 +67,27 @@ }, "picard/markduplicates": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "75027bf77472b1f4fd2cdd7e46f83119dfb0f2c6", "installed_by": ["bam_markduplicates_picard"] }, "preseq/lcextrap": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["modules"] }, "qualimap/rnaseq": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "75027bf77472b1f4fd2cdd7e46f83119dfb0f2c6", "installed_by": ["modules"] }, "rsem/calculateexpression": { "branch": "master", -
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["modules"] }, "rsem/preparereference": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["modules"] }, "rseqc/bamstat": { @@ -103,7 +107,7 @@ }, "rseqc/junctionannotation": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "2f398a6342db61e7ab0f4cb4d395eed74b65db7c", "installed_by": ["bam_rseqc"] }, "rseqc/junctionsaturation": { @@ -123,46 +127,46 @@ }, "rseqc/tin": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["bam_rseqc"] }, "salmon/index": { "branch": "master", - "git_sha": "94b06f1683ddf893cf06525f6e7f0573ad8fbf83", + "git_sha": "5d2c0dd6a8e2790e7ff511f7f4d761f4ed627a91", "installed_by": ["fastq_subsample_fq_salmon"] }, "salmon/quant": { "branch": "master", - "git_sha": "94b06f1683ddf893cf06525f6e7f0573ad8fbf83", + "git_sha": "5d2c0dd6a8e2790e7ff511f7f4d761f4ed627a91", "installed_by": ["modules", "fastq_subsample_fq_salmon"] }, "samtools/flagstat": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["bam_stats_samtools"] }, "samtools/idxstats": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["bam_stats_samtools"] }, "samtools/index": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": [ + "bam_dedup_stats_samtools_umitools", "bam_markduplicates_picard", - "bam_sort_stats_samtools", - "bam_dedup_stats_samtools_umitools" + "bam_sort_stats_samtools" ] }, "samtools/sort": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["bam_sort_stats_samtools"] }, "samtools/stats": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["bam_stats_samtools"] }, "sortmerna": { @@ -192,32 +196,32 @@ }, "trimgalore": { "branch": "master", - "git_sha": "72ffbd7128015a1d4b65b95ff8d37be8fee2f981", + "git_sha": "64a3dbc0a30a94cdaed7869d8e34fbb85e886614", "installed_by": ["fastq_fastqc_umitools_trimgalore"] }, "ucsc/bedclip": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["bedgraph_bedclip_bedgraphtobigwig"] }, "ucsc/bedgraphtobigwig": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["bedgraph_bedclip_bedgraphtobigwig"] }, "umitools/dedup": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "6d9c7e43404e20a97d2f6f88548456afe78282e6", "installed_by": ["bam_dedup_stats_samtools_umitools"] }, "umitools/extract": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["fastq_fastqc_umitools_trimgalore"] + "git_sha": "6d9c7e43404e20a97d2f6f88548456afe78282e6", + "installed_by": ["fastq_fastqc_umitools_trimgalore", "fastq_fastqc_umitools_fastp"] }, "untar": { 
"branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "cc1f997fab6d8fde5dc0e6e2a310814df5b53ce7", "installed_by": ["modules"] } } @@ -226,17 +230,17 @@ "nf-core": { "bam_dedup_stats_samtools_umitools": { "branch": "master", - "git_sha": "41891ec2c3704911cd68b9317f26545b95a1c48d", + "git_sha": "901fab507683647b43e7032f3ae9e4c234eb68eb", "installed_by": ["subworkflows"] }, "bam_markduplicates_picard": { "branch": "master", - "git_sha": "6daac2bc63f4847e0c7cc661f4f5b043ac13faaf", + "git_sha": "6f1697c121719dedde9e0537b6ed6a9cb8c13583", "installed_by": ["subworkflows"] }, "bam_rseqc": { "branch": "master", - "git_sha": "36a77f7c6decf2d1fb9f639ae982bc148d6828aa", + "git_sha": "b8f7bdc6f0e37f2946c84c5ac676c6030ebbf8c4", "installed_by": ["subworkflows"] }, "bam_sort_stats_samtools": { @@ -246,11 +250,11 @@ }, "bam_stats_samtools": { "branch": "master", - "git_sha": "92eb5091ae5368a60cda58b3a0ced8b36d715b0f", + "git_sha": "b4b7f89e7fd6d2293f0c176213f710e0bcdaf19e", "installed_by": [ - "bam_sort_stats_samtools", "bam_dedup_stats_samtools_umitools", - "bam_markduplicates_picard" + "bam_markduplicates_picard", + "bam_sort_stats_samtools" ] }, "bedgraph_bedclip_bedgraphtobigwig": { @@ -263,9 +267,14 @@ "git_sha": "9057e75e8ac959373a72a9402130fdea2e2d1398", "installed_by": ["subworkflows"] }, + "fastq_fastqc_umitools_fastp": { + "branch": "master", + "git_sha": "4c8c5b5c084a99d9bf4f987040ebe88ec12b705d", + "installed_by": ["subworkflows"] + }, "fastq_fastqc_umitools_trimgalore": { "branch": "master", - "git_sha": "72ffbd7128015a1d4b65b95ff8d37be8fee2f981", + "git_sha": "100caf3506850ffcc04953f724a97f422940c377", "installed_by": ["subworkflows"] }, "fastq_subsample_fq_salmon": { diff --git a/modules/local/multiqc.nf b/modules/local/multiqc.nf index 4da19f3fc..802cebea9 100644 --- a/modules/local/multiqc.nf +++ b/modules/local/multiqc.nf @@ -1,10 +1,10 @@ process MULTIQC { label 'process_medium' - conda "bioconda::multiqc=1.13" + conda "bioconda::multiqc=1.14" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : + 'quay.io/biocontainers/multiqc:1.14--pyhdfd78af_0' }" input: path multiqc_config @@ -16,9 +16,9 @@ process MULTIQC { path fail_trimming_summary path fail_mapping_summary path fail_strand_check - path ('fastqc/*') - path ('trimgalore/fastqc/*') - path ('trimgalore/*') + path ('fastqc/raw/*') + path ('fastqc/trim/*') + path ('trim_log/*') path ('sortmerna/*') path ('star/*') path ('hisat2/*') diff --git a/modules/local/multiqc_tsv_from_list.nf b/modules/local/multiqc_tsv_from_list.nf deleted file mode 100644 index a53c8f73e..000000000 --- a/modules/local/multiqc_tsv_from_list.nf +++ /dev/null @@ -1,28 +0,0 @@ -process MULTIQC_TSV_FROM_LIST { - - executor 'local' - memory 100.MB - - input: - val tsv_data // [ ['foo', 1], ['bar', 1] ] - val header // [ 'name', 'number' ] - val out_prefix - - output: - path "*.tsv" - - when: - task.ext.when == null || task.ext.when - - exec: - // Generate file contents - def contents = "" - if (tsv_data.size() > 0) { - contents += "${header.join('\t')}\n" - contents += tsv_data.join('\n') - } - - // Write to file - def mqc_file = task.workDir.resolve("${out_prefix}_mqc.tsv") - mqc_file.text = contents -} diff --git a/modules/local/umitools_prepareforrsem.nf b/modules/local/umitools_prepareforrsem.nf index 516e33266..416df567e 100644 --- a/modules/local/umitools_prepareforrsem.nf +++ b/modules/local/umitools_prepareforrsem.nf @@ -2,10 +2,10 @@ process UMITOOLS_PREPAREFORRSEM { tag "$meta.id" label 'process_medium' - conda "bioconda::umi_tools=1.1.2" + conda "bioconda::umi_tools=1.1.4" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/umi_tools:1.1.2--py38h4a8c8d9_0' : - 'quay.io/biocontainers/umi_tools:1.1.2--py38h4a8c8d9_0' }" + 'https://depot.galaxyproject.org/singularity/umi_tools:1.1.4--py38hbff2b2d_1' : + 'quay.io/biocontainers/umi_tools:1.1.4--py38hbff2b2d_1' }" input: tuple val(meta), path(bam) diff --git a/modules/nf-core/bbmap/bbsplit/main.nf b/modules/nf-core/bbmap/bbsplit/main.nf index 90f5104a1..2d9196871 100644 --- a/modules/nf-core/bbmap/bbsplit/main.nf +++ b/modules/nf-core/bbmap/bbsplit/main.nf @@ -27,11 +27,11 @@ process BBMAP_BBSPLIT { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def avail_mem = 3 + def avail_mem = 3072 if (!task.memory) { log.info '[BBSplit] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' } else { - avail_mem = task.memory.giga + avail_mem = (task.memory.mega*0.8).intValue() } def other_refs = [] @@ -42,7 +42,7 @@ process BBMAP_BBSPLIT { if (primary_ref && other_ref_names && other_ref_paths) { """ bbsplit.sh \\ - -Xmx${avail_mem}g \\ + -Xmx${avail_mem}M \\ ref_primary=$primary_ref \\ ${other_refs.join(' ')} \\ path=bbsplit \\ @@ -70,7 +70,7 @@ process BBMAP_BBSPLIT { def fastq_out = meta.single_end ? 
"basename=${prefix}_%.fastq.gz" : "basename=${prefix}_%_#.fastq.gz" """ bbsplit.sh \\ - -Xmx${avail_mem}g \\ + -Xmx${avail_mem}M \\ $index_files \\ threads=$task.cpus \\ $fastq_in \\ diff --git a/modules/nf-core/bbmap/bbsplit/meta.yml b/modules/nf-core/bbmap/bbsplit/meta.yml index 2b9ab8cbb..4cdc31a87 100644 --- a/modules/nf-core/bbmap/bbsplit/meta.yml +++ b/modules/nf-core/bbmap/bbsplit/meta.yml @@ -11,8 +11,7 @@ tools: description: BBMap is a short read aligner, as well as various other bioinformatic tools. homepage: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/ documentation: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/ - tool_dev_url: None - doi: "" + licence: ["UC-LBL license (see package)"] input: diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index 3df21765b..800a60991 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -2,10 +2,10 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda "bioconda::multiqc=1.13" + conda "bioconda::multiqc=1.14" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : + 'quay.io/biocontainers/multiqc:1.14--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/custom/getchromsizes/custom-getchromsizes.diff b/modules/nf-core/custom/getchromsizes/custom-getchromsizes.diff deleted file mode 100644 index cd8e585c8..000000000 --- a/modules/nf-core/custom/getchromsizes/custom-getchromsizes.diff +++ /dev/null @@ -1,24 +0,0 @@ -Changes in module 'nf-core/custom/getchromsizes' ---- modules/nf-core/custom/getchromsizes/main.nf -+++ modules/nf-core/custom/getchromsizes/main.nf -@@ -8,13 +8,13 @@ - 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" - - input: -- tuple val(meta), path(fasta) -+ path fasta - - output: -- tuple val(meta), path ("*.sizes"), emit: sizes -- tuple val(meta), path ("*.fai") , emit: fai -- tuple val(meta), path ("*.gzi") , emit: gzi, optional: true -- path "versions.yml" , emit: versions -+ path "*.sizes" , emit: sizes -+ path "*.fai" , emit: fai -+ path "*.gzi" , emit: gzi, optional: true -+ path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - -************************************************************ diff --git a/modules/nf-core/custom/getchromsizes/main.nf b/modules/nf-core/custom/getchromsizes/main.nf index 486bfbd78..580f87fea 100644 --- a/modules/nf-core/custom/getchromsizes/main.nf +++ b/modules/nf-core/custom/getchromsizes/main.nf @@ -8,13 +8,13 @@ process CUSTOM_GETCHROMSIZES { 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" input: - path fasta + tuple val(meta), path(fasta) output: - path "*.sizes" , emit: sizes - path "*.fai" , emit: fai - path "*.gzi" , emit: gzi, optional: true - path "versions.yml", emit: versions + tuple val(meta), path ("*.sizes"), emit: sizes + tuple val(meta), path ("*.fai") , emit: fai + tuple val(meta), path ("*.gzi") , emit: gzi, optional: true + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when diff --git a/modules/nf-core/fastp/main.nf 
b/modules/nf-core/fastp/main.nf new file mode 100644 index 000000000..5eeb9b09e --- /dev/null +++ b/modules/nf-core/fastp/main.nf @@ -0,0 +1,102 @@ +process FASTP { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::fastp=0.23.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/fastp:0.23.2--h79da9fb_0' : + 'quay.io/biocontainers/fastp:0.23.2--h79da9fb_0' }" + + input: + tuple val(meta), path(reads) + path adapter_fasta + val save_trimmed_fail + val save_merged + + output: + tuple val(meta), path('*.fastp.fastq.gz') , optional:true, emit: reads + tuple val(meta), path('*.json') , emit: json + tuple val(meta), path('*.html') , emit: html + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + tuple val(meta), path('*.fail.fastq.gz') , optional:true, emit: reads_fail + tuple val(meta), path('*.merged.fastq.gz'), optional:true, emit: reads_merged + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def adapter_list = adapter_fasta ? "--adapter_fasta ${adapter_fasta}" : "" + def fail_fastq = save_trimmed_fail && meta.single_end ? "--failed_out ${prefix}.fail.fastq.gz" : save_trimmed_fail && !meta.single_end ? "--unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz" : '' + // Added soft-links to original fastqs for consistent naming in MultiQC + // Use single ended for interleaved. Add --interleaved_in in config. + if ( task.ext.args?.contains('--interleaved_in') ) { + """ + [ ! -f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz + + fastp \\ + --stdout \\ + --in1 ${prefix}.fastq.gz \\ + --thread $task.cpus \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $adapter_list \\ + $fail_fastq \\ + $args \\ + 2> ${prefix}.fastp.log \\ + | gzip -c > ${prefix}.fastp.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } else if (meta.single_end) { + """ + [ ! -f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz + + fastp \\ + --in1 ${prefix}.fastq.gz \\ + --out1 ${prefix}.fastp.fastq.gz \\ + --thread $task.cpus \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $adapter_list \\ + $fail_fastq \\ + $args \\ + 2> ${prefix}.fastp.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } else { + def merge_fastq = save_merged ? "-m --merged_out ${prefix}.merged.fastq.gz" : '' + """ + [ ! -f ${prefix}_1.fastq.gz ] && ln -sf ${reads[0]} ${prefix}_1.fastq.gz + [ ! 
-f ${prefix}_2.fastq.gz ] && ln -sf ${reads[1]} ${prefix}_2.fastq.gz + fastp \\ + --in1 ${prefix}_1.fastq.gz \\ + --in2 ${prefix}_2.fastq.gz \\ + --out1 ${prefix}_1.fastp.fastq.gz \\ + --out2 ${prefix}_2.fastp.fastq.gz \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $adapter_list \\ + $fail_fastq \\ + $merge_fastq \\ + --thread $task.cpus \\ + --detect_adapter_for_pe \\ + $args \\ + 2> ${prefix}.fastp.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/fastp/meta.yml b/modules/nf-core/fastp/meta.yml new file mode 100644 index 000000000..197ea7ca6 --- /dev/null +++ b/modules/nf-core/fastp/meta.yml @@ -0,0 +1,73 @@ +name: fastp +description: Perform adapter/quality trimming on sequencing reads +keywords: + - trimming + - quality control + - fastq +tools: + - fastp: + description: | + A tool designed to provide fast all-in-one preprocessing for FastQ files. This tool is developed in C++ with multithreading support to afford high performance. + documentation: https://github.com/OpenGene/fastp + doi: 10.1093/bioinformatics/bty560 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information. Use 'single_end: true' to specify single-end or interleaved FASTQs. Use 'single_end: false' for paired-end reads. + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. If you wish to run interleaved paired-end data, supply as single-end data + but with `--interleaved_in` in your `modules.conf`'s `ext.args` for the module. + - adapter_fasta: + type: file + description: File in FASTA format containing possible adapters to remove. + pattern: "*.{fasta,fna,fas,fa}" + - save_trimmed_fail: + type: boolean + description: Specify true to save files that failed to pass trimming thresholds ending in `*.fail.fastq.gz` + - save_merged: + type: boolean + description: Specify true to save all merged reads to a file ending in `*.merged.fastq.gz` + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g.
[ id:'test', single_end:false ] + - reads: + type: file + description: The trimmed/modified/unmerged fastq reads + pattern: "*fastp.fastq.gz" + - json: + type: file + description: Results in JSON format + pattern: "*.json" + - html: + type: file + description: Results in HTML format + pattern: "*.html" + - log: + type: file + description: fastp log file + pattern: "*.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads_fail: + type: file + description: Reads that failed the preprocessing + pattern: "*fail.fastq.gz" + - reads_merged: + type: file + description: Reads that were successfully merged + pattern: "*.{merged.fastq.gz}" +authors: + - "@drpatelh" + - "@kevinmenden" diff --git a/modules/nf-core/fq/subsample/meta.yml b/modules/nf-core/fq/subsample/meta.yml index 0134c8a73..9d93f8df0 100644 --- a/modules/nf-core/fq/subsample/meta.yml +++ b/modules/nf-core/fq/subsample/meta.yml @@ -9,7 +9,7 @@ tools: homepage: "https://github.com/stjude-rust-labs/fq" documentation: "https://github.com/stjude-rust-labs/fq" tool_dev_url: "https://github.com/stjude-rust-labs/fq" - doi: "" + licence: "['MIT']" input: diff --git a/modules/nf-core/gunzip/meta.yml b/modules/nf-core/gunzip/meta.yml index 4d2ebc84e..2e0e4054d 100644 --- a/modules/nf-core/gunzip/meta.yml +++ b/modules/nf-core/gunzip/meta.yml @@ -5,29 +5,29 @@ keywords: - compression tools: - gunzip: - description: | - gzip is a file format and a software application used for file compression and decompression. - documentation: https://www.gnu.org/software/gzip/manual/gzip.html - licence: ["GPL-3.0-or-later"] + description: | + gzip is a file format and a software application used for file compression and decompression. + documentation: https://www.gnu.org/software/gzip/manual/gzip.html + licence: ["GPL-3.0-or-later"] input: - meta: - type: map - description: | - Optional groovy Map containing meta information - e.g. [ id:'test', single_end:false ] + type: map + description: | + Optional groovy Map containing meta information + e.g. [ id:'test', single_end:false ] - archive: - type: file - description: File to be compressed/uncompressed - pattern: "*.*" + type: file + description: File to be compressed/uncompressed + pattern: "*.*" output: - gunzip: - type: file - description: Compressed/uncompressed file - pattern: "*.*" + type: file + description: Compressed/uncompressed file + pattern: "*.*" - versions: - type: file - description: File containing software versions - pattern: "versions.yml" + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@joseespinosa" - "@drpatelh" diff --git a/modules/nf-core/picard/markduplicates/main.nf b/modules/nf-core/picard/markduplicates/main.nf index 148003928..1fe6ee2d2 100644 --- a/modules/nf-core/picard/markduplicates/main.nf +++ b/modules/nf-core/picard/markduplicates/main.nf @@ -2,10 +2,10 @@ process PICARD_MARKDUPLICATES { tag "$meta.id" label 'process_medium' - conda "bioconda::picard=2.27.4" + conda "bioconda::picard=3.0.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
- 'https://depot.galaxyproject.org/singularity/picard:2.27.4--hdfd78af_0' : - 'quay.io/biocontainers/picard:2.27.4--hdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/picard:3.0.0--hdfd78af_1' : + 'quay.io/biocontainers/picard:3.0.0--hdfd78af_1' }" input: tuple val(meta), path(bam) @@ -24,15 +24,15 @@ process PICARD_MARKDUPLICATES { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def avail_mem = 3 + def avail_mem = 3072 if (!task.memory) { log.info '[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' } else { - avail_mem = task.memory.giga + avail_mem = (task.memory.mega*0.8).intValue() } """ picard \\ - -Xmx${avail_mem}g \\ + -Xmx${avail_mem}M \\ MarkDuplicates \\ $args \\ --INPUT $bam \\ diff --git a/modules/nf-core/preseq/lcextrap/meta.yml b/modules/nf-core/preseq/lcextrap/meta.yml index f1be05a2f..1391961c8 100755 --- a/modules/nf-core/preseq/lcextrap/meta.yml +++ b/modules/nf-core/preseq/lcextrap/meta.yml @@ -10,7 +10,7 @@ tools: homepage: http://smithlabresearch.org/software/preseq/ documentation: http://smithlabresearch.org/wp-content/uploads/manual.pdf tool_dev_url: https://github.com/smithlabcode/preseq - doi: "" + licence: ["GPL"] input: diff --git a/modules/nf-core/qualimap/rnaseq/main.nf b/modules/nf-core/qualimap/rnaseq/main.nf index ad15ebc2a..c3cd5c06e 100644 --- a/modules/nf-core/qualimap/rnaseq/main.nf +++ b/modules/nf-core/qualimap/rnaseq/main.nf @@ -22,7 +22,7 @@ process QUALIMAP_RNASEQ { def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" def paired_end = meta.single_end ? '' : '-pe' - def memory = task.memory.toGiga() + "G" + def memory = (task.memory.mega*0.8).intValue() + 'M' def strandedness = 'non-strand-specific' if (meta.strandedness == 'forward') { diff --git a/modules/nf-core/rsem/calculateexpression/meta.yml b/modules/nf-core/rsem/calculateexpression/meta.yml index 8b89c7d11..680ede2e2 100644 --- a/modules/nf-core/rsem/calculateexpression/meta.yml +++ b/modules/nf-core/rsem/calculateexpression/meta.yml @@ -10,7 +10,7 @@ tools: RSEM: accurate transcript quantification from RNA-Seq data with or without a reference genome homepage: https://github.com/deweylab/RSEM documentation: https://github.com/deweylab/RSEM - doi: https://doi.org/10.1186/1471-2105-12-323 + doi: 10.1186/1471-2105-12-323 licence: ["GPL-3.0-or-later"] input: - meta: diff --git a/modules/nf-core/rsem/preparereference/meta.yml b/modules/nf-core/rsem/preparereference/meta.yml index fbe57b203..b1d013b9c 100644 --- a/modules/nf-core/rsem/preparereference/meta.yml +++ b/modules/nf-core/rsem/preparereference/meta.yml @@ -9,7 +9,7 @@ tools: RSEM: accurate transcript quantification from RNA-Seq data with or without a reference genome homepage: https://github.com/deweylab/RSEM documentation: https://github.com/deweylab/RSEM - doi: https://doi.org/10.1186/1471-2105-12-323 + doi: 10.1186/1471-2105-12-323 licence: ["GPL-3.0-or-later"] input: - fasta: diff --git a/modules/nf-core/rseqc/junctionannotation/meta.yml b/modules/nf-core/rseqc/junctionannotation/meta.yml index a17b84e91..48c43260e 100644 --- a/modules/nf-core/rseqc/junctionannotation/meta.yml +++ b/modules/nf-core/rseqc/junctionannotation/meta.yml @@ -53,6 +53,9 @@ output: description: Rscript to reproduce the plots pattern: "*.r" - log: + type: file + description: Log file of execution + pattern: "*.junction_annotation.log" - versions: type: file description: File containing software versions diff --git 
a/modules/nf-core/rseqc/tin/meta.yml b/modules/nf-core/rseqc/tin/meta.yml index 6333ae14c..381edfde0 100644 --- a/modules/nf-core/rseqc/tin/meta.yml +++ b/modules/nf-core/rseqc/tin/meta.yml @@ -24,9 +24,9 @@ input: description: Input BAM file pattern: "*.{bam}" - bai: - type: file - description: Index for input BAM file - pattern: "*.{bai}" + type: file + description: Index for input BAM file + pattern: "*.{bai}" - bed: type: file description: BED file containing the reference gene model diff --git a/modules/nf-core/salmon/index/main.nf b/modules/nf-core/salmon/index/main.nf index 1a67bddc5..668b62287 100644 --- a/modules/nf-core/salmon/index/main.nf +++ b/modules/nf-core/salmon/index/main.nf @@ -2,28 +2,28 @@ process SALMON_INDEX { tag "$transcript_fasta" label "process_medium" - conda "bioconda::salmon=1.9.0" + conda "bioconda::salmon=1.10.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/salmon:1.9.0--h7e5ed60_1' : - 'quay.io/biocontainers/salmon:1.9.0--h7e5ed60_1' }" + 'https://depot.galaxyproject.org/singularity/salmon:1.10.1--h7e5ed60_0' : + 'quay.io/biocontainers/salmon:1.10.1--h7e5ed60_0' }" input: path genome_fasta path transcript_fasta output: - path "salmon" , emit: index - path "versions.yml" , emit: versions + path "salmon" , emit: index + path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when script: def args = task.ext.args ?: '' - def get_decoy_ids = "grep '^>' $genome_fasta | cut -d ' ' -f 1 > decoys.txt" + def get_decoy_ids = "grep '^>' $genome_fasta | cut -d ' ' -f 1 | cut -d \$'\\t' -f 1 > decoys.txt" def gentrome = "gentrome.fa" if (genome_fasta.endsWith('.gz')) { - get_decoy_ids = "grep '^>' <(gunzip -c $genome_fasta) | cut -d ' ' -f 1 > decoys.txt" + get_decoy_ids = "grep '^>' <(gunzip -c $genome_fasta) | cut -d ' ' -f 1 | cut -d \$'\\t' -f 1 > decoys.txt" gentrome = "gentrome.fa.gz" } """ @@ -38,6 +38,7 @@ process SALMON_INDEX { -d decoys.txt \\ $args \\ -i salmon + cat <<-END_VERSIONS > versions.yml "${task.process}": salmon: \$(echo \$(salmon --version) | sed -e "s/salmon //g") diff --git a/modules/nf-core/salmon/quant/main.nf b/modules/nf-core/salmon/quant/main.nf index d70fbf584..978faf6c5 100644 --- a/modules/nf-core/salmon/quant/main.nf +++ b/modules/nf-core/salmon/quant/main.nf @@ -2,10 +2,10 @@ process SALMON_QUANT { tag "$meta.id" label "process_medium" - conda "bioconda::salmon=1.9.0" + conda "bioconda::salmon=1.10.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/salmon:1.9.0--h7e5ed60_1' : - 'quay.io/biocontainers/salmon:1.9.0--h7e5ed60_1' }" + 'https://depot.galaxyproject.org/singularity/salmon:1.10.1--h7e5ed60_0' : + 'quay.io/biocontainers/salmon:1.10.1--h7e5ed60_0' }" input: tuple val(meta), path(reads) diff --git a/modules/nf-core/salmon/quant/meta.yml b/modules/nf-core/salmon/quant/meta.yml index 877c719df..ea01e0df8 100644 --- a/modules/nf-core/salmon/quant/meta.yml +++ b/modules/nf-core/salmon/quant/meta.yml @@ -34,12 +34,12 @@ input: type: file description: Fasta file of the reference transcriptome - alignment_mode: - type: boolean - description: whether to run salmon in alignment mode + type: boolean + description: whether to run salmon in alignment mode - lib_type: - type: string - description: | - Override library type inferred based on strandedness defined in meta object + type: string + description: | + Override library type inferred based on strandedness defined in meta object output: - results: diff --git a/modules/nf-core/samtools/flagstat/meta.yml b/modules/nf-core/samtools/flagstat/meta.yml index 952690639..954225dfc 100644 --- a/modules/nf-core/samtools/flagstat/meta.yml +++ b/modules/nf-core/samtools/flagstat/meta.yml @@ -14,7 +14,7 @@ tools: short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. These files are generated as output by short read aligners like BWA. homepage: http://www.htslib.org/ - documentation: hhttp://www.htslib.org/doc/samtools.html + documentation: http://www.htslib.org/doc/samtools.html doi: 10.1093/bioinformatics/btp352 licence: ["MIT"] input: diff --git a/modules/nf-core/samtools/idxstats/meta.yml b/modules/nf-core/samtools/idxstats/meta.yml index 3710ab882..dda87e1ee 100644 --- a/modules/nf-core/samtools/idxstats/meta.yml +++ b/modules/nf-core/samtools/idxstats/meta.yml @@ -15,7 +15,7 @@ tools: short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. These files are generated as output by short read aligners like BWA. homepage: http://www.htslib.org/ - documentation: hhttp://www.htslib.org/doc/samtools.html + documentation: http://www.htslib.org/doc/samtools.html doi: 10.1093/bioinformatics/btp352 licence: ["MIT"] input: diff --git a/modules/nf-core/samtools/index/meta.yml b/modules/nf-core/samtools/index/meta.yml index e5cadbc24..8bd2fa6fb 100644 --- a/modules/nf-core/samtools/index/meta.yml +++ b/modules/nf-core/samtools/index/meta.yml @@ -12,7 +12,7 @@ tools: short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. These files are generated as output by short read aligners like BWA. homepage: http://www.htslib.org/ - documentation: hhttp://www.htslib.org/doc/samtools.html + documentation: http://www.htslib.org/doc/samtools.html doi: 10.1093/bioinformatics/btp352 licence: ["MIT"] input: diff --git a/modules/nf-core/samtools/sort/meta.yml b/modules/nf-core/samtools/sort/meta.yml index 092897512..073284316 100644 --- a/modules/nf-core/samtools/sort/meta.yml +++ b/modules/nf-core/samtools/sort/meta.yml @@ -12,7 +12,7 @@ tools: short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. These files are generated as output by short read aligners like BWA. 
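The Picard MarkDuplicates and Qualimap hunks above share one pattern: instead of handing the JVM the full task allocation in whole gigabytes, they size the heap at 80% of the allocation in megabytes, leaving headroom for off-heap JVM overhead so the scheduler does not kill the task. A minimal Groovy sketch of the pattern as it would sit inside a process script block (values illustrative):

    // Fallback of 3072 MB when no process memory requirement is declared
    def avail_mem = 3072
    if (task.memory) {
        avail_mem = (task.memory.mega * 0.8).intValue()   // e.g. 6.GB -> 4915 MB
    }
    // picard -Xmx${avail_mem}M MarkDuplicates ...        // was: -Xmx6g with no headroom
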
homepage: http://www.htslib.org/ - documentation: hhttp://www.htslib.org/doc/samtools.html + documentation: http://www.htslib.org/doc/samtools.html doi: 10.1093/bioinformatics/btp352 licence: ["MIT"] input: diff --git a/modules/nf-core/samtools/stats/meta.yml b/modules/nf-core/samtools/stats/meta.yml index cac50b1c0..1d68a5d83 100644 --- a/modules/nf-core/samtools/stats/meta.yml +++ b/modules/nf-core/samtools/stats/meta.yml @@ -13,7 +13,7 @@ tools: short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. These files are generated as output by short read aligners like BWA. homepage: http://www.htslib.org/ - documentation: hhttp://www.htslib.org/doc/samtools.html + documentation: http://www.htslib.org/doc/samtools.html doi: 10.1093/bioinformatics/btp352 licence: ["MIT"] input: @@ -23,13 +23,13 @@ input: Groovy Map containing sample information e.g. [ id:'test', single_end:false ] - input: - type: file - description: BAM/CRAM file from alignment - pattern: "*.{bam,cram}" + type: file + description: BAM/CRAM file from alignment + pattern: "*.{bam,cram}" - input_index: - type: file - description: BAI/CRAI file from alignment - pattern: "*.{bai,crai}" + type: file + description: BAI/CRAI file from alignment + pattern: "*.{bai,crai}" - fasta: type: optional file description: Reference file the CRAM was created with diff --git a/modules/nf-core/trimgalore/main.nf b/modules/nf-core/trimgalore/main.nf index 37e88f586..dc2957846 100644 --- a/modules/nf-core/trimgalore/main.nf +++ b/modules/nf-core/trimgalore/main.nf @@ -37,10 +37,12 @@ process TRIMGALORE { // Added soft-links to original fastqs for consistent naming in MultiQC def prefix = task.ext.prefix ?: "${meta.id}" if (meta.single_end) { + def args_list = args.split("\\s(?=--)").toList() + args_list.removeAll { it.toLowerCase().contains('_r2 ') } """ [ ! -f ${prefix}.fastq.gz ] && ln -s $reads ${prefix}.fastq.gz trim_galore \\ - $args \\ + ${args_list.join(' ')} \\ --cores $cores \\ --gzip \\ ${prefix}.fastq.gz diff --git a/modules/nf-core/ucsc/bedclip/meta.yml b/modules/nf-core/ucsc/bedclip/meta.yml index e6dd8cebc..ca278552a 100755 --- a/modules/nf-core/ucsc/bedclip/meta.yml +++ b/modules/nf-core/ucsc/bedclip/meta.yml @@ -5,10 +5,7 @@ keywords: tools: - ucsc: description: Remove lines from bed file that refer to off-chromosome locations. - homepage: None - documentation: None - tool_dev_url: None - doi: "" + homepage: http://hgdownload.cse.ucsc.edu/admin/exe/ licence: ["varies; see http://genome.ucsc.edu/license"] input: diff --git a/modules/nf-core/ucsc/bedgraphtobigwig/meta.yml b/modules/nf-core/ucsc/bedgraphtobigwig/meta.yml index 1be1a3b7f..ba8915bed 100755 --- a/modules/nf-core/ucsc/bedgraphtobigwig/meta.yml +++ b/modules/nf-core/ucsc/bedgraphtobigwig/meta.yml @@ -6,10 +6,8 @@ keywords: tools: - ucsc: description: Convert a bedGraph file to bigWig format. 
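The trimgalore/main.nf hunk above makes a single ext.args string safe for single-end samples by splitting it into individual options and dropping anything read-2-specific. A small Groovy sketch of the same filtering, with an assumed example argument string:

    def args = '--clip_r1 2 --clip_r2 2 --three_prime_clip_r2 4 --nextseq 20'
    // Split on whitespace that immediately precedes '--', keeping option values attached
    def args_list = args.split("\\s(?=--)").toList()
    args_list.removeAll { it.toLowerCase().contains('_r2 ') }   // drop R2-only clipping options
    assert args_list.join(' ') == '--clip_r1 2 --nextseq 20'
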
- homepage: None - documentation: None - tool_dev_url: None - doi: "" + homepage: http://hgdownload.cse.ucsc.edu/admin/exe/ + documentation: https://genome.ucsc.edu/goldenPath/help/bigWig.html licence: ["varies; see http://genome.ucsc.edu/license"] input: diff --git a/modules/nf-core/umitools/dedup/main.nf b/modules/nf-core/umitools/dedup/main.nf index 642001389..68fc9b9e7 100644 --- a/modules/nf-core/umitools/dedup/main.nf +++ b/modules/nf-core/umitools/dedup/main.nf @@ -1,11 +1,11 @@ process UMITOOLS_DEDUP { tag "$meta.id" - label "process_medium" + label "process_single" - conda "bioconda::umi_tools=1.1.2" + conda "bioconda::umi_tools=1.1.4" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/umi_tools:1.1.2--py38h4a8c8d9_0' : - 'quay.io/biocontainers/umi_tools:1.1.2--py38h4a8c8d9_0' }" + 'https://depot.galaxyproject.org/singularity/umi_tools:1.1.4--py38hbff2b2d_1' : + 'quay.io/biocontainers/umi_tools:1.1.4--py38hbff2b2d_1' }" input: tuple val(meta), path(bam), path(bai) diff --git a/modules/nf-core/umitools/dedup/meta.yml b/modules/nf-core/umitools/dedup/meta.yml index 56888e5a6..0719a9552 100644 --- a/modules/nf-core/umitools/dedup/meta.yml +++ b/modules/nf-core/umitools/dedup/meta.yml @@ -5,57 +5,57 @@ keywords: - deduplication tools: - umi_tools: - description: > - UMI-tools contains tools for dealing with Unique Molecular Identifiers (UMIs)/Random Molecular Tags (RMTs) - and single cell RNA-Seq cell barcodes - documentation: https://umi-tools.readthedocs.io/en/latest/ - license: ["MIT"] + description: > + UMI-tools contains tools for dealing with Unique Molecular Identifiers (UMIs)/Random Molecular Tags (RMTs) + and single cell RNA-Seq cell barcodes + documentation: https://umi-tools.readthedocs.io/en/latest/ + license: ["MIT"] input: - meta: - type: map - description: | - Groovy Map containing sample information + type: map + description: | + Groovy Map containing sample information e.g. [ id:'test', single_end:false ] - bam: - type: file - description: | - BAM file containing reads to be deduplicated via UMIs. - pattern: "*.{bam}" + type: file + description: | + BAM file containing reads to be deduplicated via UMIs. + pattern: "*.{bam}" - bai: - type: file - description: | - BAM index files corresponding to the input BAM file. - pattern: "*.{bai}" + type: file + description: | + BAM index files corresponding to the input BAM file. + pattern: "*.{bai}" - get_output_stats: - type: boolean - description: | - Whether or not to generate output stats. + type: boolean + description: | + Whether or not to generate output stats. output: - meta: - type: map - description: | - Groovy Map containing sample information + type: map + description: | + Groovy Map containing sample information e.g. [ id:'test', single_end:false ] - bam: - type: file - description: BAM file with deduplicated UMIs. - pattern: "*.{bam}" + type: file + description: BAM file with deduplicated UMIs. + pattern: "*.{bam}" - tsv_edit_distance: - type: file - description: Reports the (binned) average edit distance between the UMIs at each position. - pattern: "*edit_distance.tsv" + type: file + description: Reports the (binned) average edit distance between the UMIs at each position. + pattern: "*edit_distance.tsv" - tsv_per_umi: - type: file - description: UMI-level summary statistics. - pattern: "*per_umi.tsv" + type: file + description: UMI-level summary statistics. 
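The umitools/dedup change above (and the matching umitools/extract change below) swaps a multi-CPU label for 'process_single', reflecting that umi_tools runs single-threaded. Pipelines inheriting the label's defaults can still raise per-process resources from a custom config; a sketch assuming the standard nf-core label/config conventions, with illustrative values:

    process {
        withName: '.*:UMITOOLS_DEDUP' {
            memory = 32.GB   // deduplicating a large BAM can need far more than the label default
            time   = 12.h
        }
    }
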
+ pattern: "*per_umi.tsv" - tsv_umi_per_position: - type: file - description: Tabulates the counts for unique combinations of UMI and position. - pattern: "*per_position.tsv" + type: file + description: Tabulates the counts for unique combinations of UMI and position. + pattern: "*per_position.tsv" - versions: - type: file - description: File containing software versions - pattern: "versions.yml" + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@drpatelh" diff --git a/modules/nf-core/umitools/extract/main.nf b/modules/nf-core/umitools/extract/main.nf index 731f164c2..ba2826e1f 100644 --- a/modules/nf-core/umitools/extract/main.nf +++ b/modules/nf-core/umitools/extract/main.nf @@ -1,11 +1,11 @@ process UMITOOLS_EXTRACT { tag "$meta.id" - label "process_low" + label "process_single" - conda "bioconda::umi_tools=1.1.2" + conda "bioconda::umi_tools=1.1.4" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/umi_tools:1.1.2--py38h4a8c8d9_0' : - 'quay.io/biocontainers/umi_tools:1.1.2--py38h4a8c8d9_0' }" + 'https://depot.galaxyproject.org/singularity/umi_tools:1.1.4--py38hbff2b2d_1' : + 'quay.io/biocontainers/umi_tools:1.1.4--py38hbff2b2d_1' }" input: tuple val(meta), path(reads) diff --git a/modules/nf-core/umitools/extract/meta.yml b/modules/nf-core/umitools/extract/meta.yml index 7fc23f722..db64a0f88 100644 --- a/modules/nf-core/umitools/extract/meta.yml +++ b/modules/nf-core/umitools/extract/meta.yml @@ -5,42 +5,42 @@ keywords: - extract tools: - umi_tools: - description: > - UMI-tools contains tools for dealing with Unique Molecular Identifiers (UMIs)/Random Molecular Tags (RMTs) - and single cell RNA-Seq cell barcodes - documentation: https://umi-tools.readthedocs.io/en/latest/ - license: ["MIT"] + description: > + UMI-tools contains tools for dealing with Unique Molecular Identifiers (UMIs)/Random Molecular Tags (RMTs) + and single cell RNA-Seq cell barcodes + documentation: https://umi-tools.readthedocs.io/en/latest/ + license: ["MIT"] input: - meta: - type: map - description: | - Groovy Map containing sample information + type: map + description: | + Groovy Map containing sample information e.g. [ id:'test', single_end:false ] - reads: - type: list - description: | - List of input FASTQ files whose UMIs will be extracted. + type: list + description: | + List of input FASTQ files whose UMIs will be extracted. output: - meta: - type: map - description: | - Groovy Map containing sample information + type: map + description: | + Groovy Map containing sample information e.g. [ id:'test', single_end:false ] - reads: - type: file - description: > - Extracted FASTQ files. | - For single-end reads, pattern is \${prefix}.umi_extract.fastq.gz. | + type: file + description: > + Extracted FASTQ files. | + For single-end reads, pattern is \${prefix}.umi_extract.fastq.gz. | For paired-end reads, pattern is \${prefix}.umi_extract_{1,2}.fastq.gz. 
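When umi_discard_read is 1 or 2, the refactored subworkflows later in this diff keep only the surviving mate and re-flag the sample as single-end, via reads[umi_discard_read % 2]. A Groovy sketch of that mapping with hypothetical file names:

    def umi_discard_read = 2                 // discard R2, keep R1 (a value of 1 keeps R2)
    def meta  = [ id:'sample1', single_end:false ]
    def reads = [ 'sample1_R1.umi_extract.fastq.gz', 'sample1_R2.umi_extract.fastq.gz' ]
    def (new_meta, kept) = meta.single_end
        ? [ meta, reads ]
        : [ meta + [single_end: true], reads[umi_discard_read % 2] ]
    assert kept == 'sample1_R1.umi_extract.fastq.gz' && new_meta.single_end
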
- pattern: "*.{fastq.gz}" + pattern: "*.{fastq.gz}" - log: - type: file - description: Logfile for umi_tools - pattern: "*.{log}" + type: file + description: Logfile for umi_tools + pattern: "*.{log}" - versions: - type: file - description: File containing software versions - pattern: "versions.yml" + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@drpatelh" diff --git a/modules/nf-core/untar/main.nf b/modules/nf-core/untar/main.nf index 16bccc905..3384847aa 100644 --- a/modules/nf-core/untar/main.nf +++ b/modules/nf-core/untar/main.nf @@ -2,7 +2,7 @@ process UNTAR { tag "$archive" label 'process_single' - conda "conda-forge::sed=4.7" + conda "conda-forge::sed=4.7 bioconda::grep=3.4 conda-forge::tar=1.34" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : 'ubuntu:20.04' }" @@ -11,8 +11,8 @@ process UNTAR { tuple val(meta), path(archive) output: - tuple val(meta), path("$untar"), emit: untar - path "versions.yml" , emit: versions + tuple val(meta), path("$prefix"), emit: untar + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -20,31 +20,29 @@ process UNTAR { script: def args = task.ext.args ?: '' def args2 = task.ext.args2 ?: '' - untar = archive.toString() - '.tar.gz' + prefix = task.ext.prefix ?: ( meta.id ? "${meta.id}" : archive.baseName.toString().replaceFirst(/\.tar$/, "")) """ - mkdir output + mkdir $prefix ## Ensures --strip-components only applied when top level of tar contents is a directory - ## If just files or multiple directories, place all in output - if [[ \$(tar -tzf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then + ## If just files or multiple directories, place all in prefix + if [[ \$(tar -taf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then tar \\ - -C output --strip-components 1 \\ - -xzvf \\ + -C $prefix --strip-components 1 \\ + -xavf \\ $args \\ $archive \\ $args2 else tar \\ - -C output \\ - -xzvf \\ + -C $prefix \\ + -xavf \\ $args \\ $archive \\ $args2 fi - mv output ${untar} - cat <<-END_VERSIONS > versions.yml "${task.process}": untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') @@ -52,9 +50,10 @@ process UNTAR { """ stub: - untar = archive.toString() - '.tar.gz' + prefix = task.ext.prefix ?: ( meta.id ? 
"${meta.id}" : archive.toString().replaceFirst(/\.[^\.]+(.gz)?$/, "")) """ - touch $untar + mkdir $prefix + touch ${prefix}/file.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index f94f87baf..c5ca21943 100644 --- a/nextflow.config +++ b/nextflow.config @@ -37,12 +37,10 @@ params { save_umi_intermeds = false // Trimming + trimmer = 'trimgalore' min_trimmed_reads = 10000 - clip_r1 = null - clip_r2 = null - three_prime_clip_r1 = null - three_prime_clip_r2 = null - trim_nextseq = null + extra_trimgalore_args = null + extra_fastp_args = null save_trimmed = false skip_trimming = false @@ -98,7 +96,6 @@ params { // Boilerplate options outdir = null - tracedir = "${params.outdir}/pipeline_info" publish_dir_mode = 'copy' email = null email_on_fail = null @@ -118,6 +115,7 @@ params { config_profile_contact = null config_profile_url = null config_profile_name = null + test_data_base = 'https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq3' // Max resource options // Defaults only, expecting to be overwritten @@ -207,8 +205,17 @@ profiles { executor.cpus = 16 executor.memory = 60.GB } - test { includeConfig 'conf/test.config' } - test_full { includeConfig 'conf/test_full.config' } + test { includeConfig 'conf/test.config' } + test_full { includeConfig 'conf/test_full.config' } + test_full_aws { includeConfig 'conf/test_full.config' } + test_full_gcp { + includeConfig 'conf/test_full.config' + params.input = 'https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/v3.10/samplesheet_full_gcp.csv' + } + test_full_azure { + includeConfig 'conf/test_full.config' + params.input = 'https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/v3.10/samplesheet_full_azure.csv' + } } // Load igenomes.config if required @@ -232,19 +239,19 @@ env { def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { enabled = true - file = "${params.tracedir}/execution_timeline_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/execution_timeline_${trace_timestamp}.html" } report { enabled = true - file = "${params.tracedir}/execution_report_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/execution_report_${trace_timestamp}.html" } trace { enabled = true - file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt" + file = "${params.outdir}/pipeline_info/execution_trace_${trace_timestamp}.txt" } dag { enabled = true - file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/pipeline_dag_${trace_timestamp}.html" } manifest { @@ -254,7 +261,7 @@ manifest { description = """RNA sequencing analysis pipeline for gene/isoform quantification and extensive quality control.""" mainScript = 'main.nf' nextflowVersion = '!>=22.10.1' - version = '3.10.1' + version = '3.11.0' doi = 'https://doi.org/10.5281/zenodo.1400710' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 90a223cef..9f6d7529d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -326,31 +326,22 @@ "fa_icon": "fas fa-cut", "description": "Options to adjust read trimming criteria.", "properties": { - "clip_r1": { - "type": "integer", - "description": "Instructs Trim Galore to remove bp from the 5' end of read 1 (or single-end reads).", - "fa_icon": "fas fa-cut" - }, - "clip_r2": { - "type": "integer", - "description": "Instructs Trim Galore to remove bp from the 5' end of read 2 (paired-end reads only).", - "fa_icon": "fas fa-cut" - 
}, - "three_prime_clip_r1": { - "type": "integer", - "description": "Instructs Trim Galore to remove bp from the 3' end of read 1 AFTER adapter/quality trimming has been performed.", - "fa_icon": "fas fa-cut" + "trimmer": { + "type": "string", + "default": "trimgalore", + "description": "Specifies the trimming tool to use - available options are 'trimgalore' and 'fastp'.", + "fa_icon": "fas fa-cut", + "enum": ["trimgalore", "fastp"] }, - "three_prime_clip_r2": { - "type": "integer", - "description": "Instructs Trim Galore to remove bp from the 3' end of read 2 AFTER adapter/quality trimming has been performed.", - "fa_icon": "fas fa-cut" + "extra_trimgalore_args": { + "type": "string", + "description": "Extra arguments to pass to Trim Galore! command in addition to defaults defined by the pipeline.", + "fa_icon": "fas fa-plus" }, - "trim_nextseq": { - "type": "integer", - "description": "Instructs Trim Galore to apply the --nextseq=X option, to trim based on quality after removing poly-G tails.", - "help_text": "This enables the option Cutadapt `--nextseq-trim=3'CUTOFF` option via Trim Galore, which will set a quality cutoff (that is normally given with -q instead), but qualities of G bases are ignored. This trimming is in common for the NextSeq- and NovaSeq-platforms, where basecalls without any signal are called as high-quality G bases.", - "fa_icon": "fas fa-cut" + "extra_fastp_args": { + "type": "string", + "description": "Extra arguments to pass to fastp command in addition to defaults defined by the pipeline.", + "fa_icon": "fas fa-plus" }, "min_trimmed_reads": { "type": "integer", @@ -580,6 +571,13 @@ "description": "Institutional config URL link.", "hidden": true, "fa_icon": "fas fa-users-cog" + }, + "test_data_base": { + "type": "string", + "default": "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq3", + "description": "Base path / URL for data used in the test profiles", + "help_text": "Warning: The `-profile test` samplesheet file itself contains remote paths. 
Setting this parameter does not alter the contents of that file.", + "hidden": true + } } }, @@ -697,13 +695,6 @@ "description": "Custom MultiQC yaml file containing HTML including a methods description.", "fa_icon": "fas fa-cog" }, - "tracedir": { - "type": "string", - "description": "Directory to keep pipeline Nextflow logs and reports.", - "default": "${params.outdir}/pipeline_info", - "fa_icon": "fas fa-cogs", - "hidden": true - }, "validate_params": { "type": "boolean", "description": "Boolean whether to validate parameters against the schema at runtime", diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf index 1b99df670..52e09c774 100644 --- a/subworkflows/local/prepare_genome.nf +++ b/subworkflows/local/prepare_genome.nf @@ -33,9 +33,23 @@ include { STAR_GENOMEGENERATE_IGENOMES } from '../../modules/local/star_ workflow PREPARE_GENOME { take: - prepare_tool_indices // list : tools to prepare indices for - biotype // string : if additional fasta file is provided biotype value to use when appending entries to GTF file - is_aws_igenome // boolean: whether the genome files are from AWS iGenomes + fasta // file: /path/to/genome.fasta + gtf // file: /path/to/genome.gtf + gff // file: /path/to/genome.gff + additional_fasta // file: /path/to/additional.fasta + transcript_fasta // file: /path/to/transcript.fasta + gene_bed // file: /path/to/gene.bed + splicesites // file: /path/to/splicesites.txt + bbsplit_fasta_list // file: /path/to/bbsplit_fasta_list.txt + star_index // directory: /path/to/star/index/ + rsem_index // directory: /path/to/rsem/index/ + salmon_index // directory: /path/to/salmon/index/ + hisat2_index // directory: /path/to/hisat2/index/ + bbsplit_index // directory: /path/to/bbsplit/index/ + gencode // boolean: whether the genome is from GENCODE + is_aws_igenome // boolean: whether the genome files are from AWS iGenomes + biotype // string: if additional fasta file is provided biotype value to use when appending entries to GTF file + prepare_tool_indices // list: tools to prepare indices for main: @@ -44,29 +58,29 @@ workflow PREPARE_GENOME { // // Uncompress genome fasta file if required // - if (params.fasta.endsWith('.gz')) { - ch_fasta = GUNZIP_FASTA ( [ [:], params.fasta ] ).gunzip.map { it[1] } + if (fasta.endsWith('.gz')) { + ch_fasta = GUNZIP_FASTA ( [ [:], fasta ] ).gunzip.map { it[1] } ch_versions = ch_versions.mix(GUNZIP_FASTA.out.versions) } else { - ch_fasta = file(params.fasta) + ch_fasta = Channel.value(file(fasta)) } // // Uncompress GTF annotation file or create from GFF3 if required // - if (params.gtf) { - if (params.gtf.endsWith('.gz')) { - ch_gtf = GUNZIP_GTF ( [ [:], params.gtf ] ).gunzip.map { it[1] } + if (gtf) { + if (gtf.endsWith('.gz')) { + ch_gtf = GUNZIP_GTF ( [ [:], gtf ] ).gunzip.map { it[1] } ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions) } else { - ch_gtf = file(params.gtf) + ch_gtf = Channel.value(file(gtf)) } - } else if (params.gff) { - if (params.gff.endsWith('.gz')) { - ch_gff = GUNZIP_GFF ( [ [:], params.gff ] ).gunzip.map { it[1] } + } else if (gff) { + if (gff.endsWith('.gz')) { + ch_gff = GUNZIP_GFF ( [ [:], gff ] ).gunzip.map { it[1] } ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions) } else { - ch_gff = file(params.gff) + ch_gff = Channel.value(file(gff)) } ch_gtf = GFFREAD ( ch_gff ).gtf ch_versions = ch_versions.mix(GFFREAD.out.versions) @@ -75,12 +89,12 @@ workflow PREPARE_GENOME { // // Uncompress additional fasta file and concatenate with reference fasta and gtf files // - if
(params.additional_fasta) { - if (params.additional_fasta.endsWith('.gz')) { - ch_add_fasta = GUNZIP_ADDITIONAL_FASTA ( [ [:], params.additional_fasta ] ).gunzip.map { it[1] } + if (additional_fasta) { + if (additional_fasta.endsWith('.gz')) { + ch_add_fasta = GUNZIP_ADDITIONAL_FASTA ( [ [:], additional_fasta ] ).gunzip.map { it[1] } ch_versions = ch_versions.mix(GUNZIP_ADDITIONAL_FASTA.out.versions) } else { - ch_add_fasta = file(params.additional_fasta) + ch_add_fasta = Channel.value(file(additional_fasta)) } CAT_ADDITIONAL_FASTA ( ch_fasta, ch_gtf, ch_add_fasta, biotype ) ch_fasta = CAT_ADDITIONAL_FASTA.out.fasta @@ -91,12 +105,12 @@ workflow PREPARE_GENOME { // // Uncompress gene BED annotation file or create from GTF if required // - if (params.gene_bed) { - if (params.gene_bed.endsWith('.gz')) { - ch_gene_bed = GUNZIP_GENE_BED ( [ [:], params.gene_bed ] ).gunzip.map { it[1] } + if (gene_bed) { + if (gene_bed.endsWith('.gz')) { + ch_gene_bed = GUNZIP_GENE_BED ( [ [:], gene_bed ] ).gunzip.map { it[1] } ch_versions = ch_versions.mix(GUNZIP_GENE_BED.out.versions) } else { - ch_gene_bed = file(params.gene_bed) + ch_gene_bed = Channel.value(file(gene_bed)) } } else { ch_gene_bed = GTF2BED ( ch_gtf ).bed @@ -106,14 +120,14 @@ workflow PREPARE_GENOME { // // Uncompress transcript fasta file / create if required // - if (params.transcript_fasta) { - if (params.transcript_fasta.endsWith('.gz')) { - ch_transcript_fasta = GUNZIP_TRANSCRIPT_FASTA ( [ [:], params.transcript_fasta ] ).gunzip.map { it[1] } + if (transcript_fasta) { + if (transcript_fasta.endsWith('.gz')) { + ch_transcript_fasta = GUNZIP_TRANSCRIPT_FASTA ( [ [:], transcript_fasta ] ).gunzip.map { it[1] } ch_versions = ch_versions.mix(GUNZIP_TRANSCRIPT_FASTA.out.versions) } else { - ch_transcript_fasta = file(params.transcript_fasta) + ch_transcript_fasta = Channel.value(file(transcript_fasta)) } - if (params.gencode) { + if (gencode) { PREPROCESS_TRANSCRIPTS_FASTA_GENCODE ( ch_transcript_fasta ) ch_transcript_fasta = PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.fasta ch_versions = ch_versions.mix(PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.versions) @@ -128,9 +142,9 @@ workflow PREPARE_GENOME { // // Create chromosome sizes file // - CUSTOM_GETCHROMSIZES ( ch_fasta ) - ch_chrom_sizes = CUSTOM_GETCHROMSIZES.out.sizes - ch_fai = CUSTOM_GETCHROMSIZES.out.fai + CUSTOM_GETCHROMSIZES ( ch_fasta.map { [ [:], it ] } ) + ch_fai = CUSTOM_GETCHROMSIZES.out.fai.map { it[1] } + ch_chrom_sizes = CUSTOM_GETCHROMSIZES.out.sizes.map { it[1] } ch_versions = ch_versions.mix(CUSTOM_GETCHROMSIZES.out.versions) // @@ -138,16 +152,16 @@ workflow PREPARE_GENOME { // ch_bbsplit_index = Channel.empty() if ('bbsplit' in prepare_tool_indices) { - if (params.bbsplit_index) { - if (params.bbsplit_index.endsWith('.tar.gz')) { - ch_bbsplit_index = UNTAR_BBSPLIT_INDEX ( [ [:], params.bbsplit_index ] ).untar.map { it[1] } + if (bbsplit_index) { + if (bbsplit_index.endsWith('.tar.gz')) { + ch_bbsplit_index = UNTAR_BBSPLIT_INDEX ( [ [:], bbsplit_index ] ).untar.map { it[1] } ch_versions = ch_versions.mix(UNTAR_BBSPLIT_INDEX.out.versions) } else { - ch_bbsplit_index = file(params.bbsplit_index) + ch_bbsplit_index = Channel.value(file(bbsplit_index)) } } else { Channel - .from(file(params.bbsplit_fasta_list)) + .from(file(bbsplit_fasta_list)) .splitCsv() // Read in 2 column csv file: short_name,path_to_fasta .flatMap { id, fasta -> [ [ 'id', id ], [ 'fasta', file(fasta, checkIfExists: true) ] ] } // Flatten entries to be able to groupTuple by a common key .groupTuple() @@ 
-165,12 +179,12 @@ workflow PREPARE_GENOME { // ch_star_index = Channel.empty() if ('star_salmon' in prepare_tool_indices) { - if (params.star_index) { - if (params.star_index.endsWith('.tar.gz')) { - ch_star_index = UNTAR_STAR_INDEX ( [ [:], params.star_index ] ).untar.map { it[1] } + if (star_index) { + if (star_index.endsWith('.tar.gz')) { + ch_star_index = UNTAR_STAR_INDEX ( [ [:], star_index ] ).untar.map { it[1] } ch_versions = ch_versions.mix(UNTAR_STAR_INDEX.out.versions) } else { - ch_star_index = file(params.star_index) + ch_star_index = Channel.value(file(star_index)) } } else { if (is_aws_igenome) { @@ -188,12 +202,12 @@ workflow PREPARE_GENOME { // ch_rsem_index = Channel.empty() if ('star_rsem' in prepare_tool_indices) { - if (params.rsem_index) { - if (params.rsem_index.endsWith('.tar.gz')) { - ch_rsem_index = UNTAR_RSEM_INDEX ( [ [:], params.rsem_index ] ).untar.map { it[1] } + if (rsem_index) { + if (rsem_index.endsWith('.tar.gz')) { + ch_rsem_index = UNTAR_RSEM_INDEX ( [ [:], rsem_index ] ).untar.map { it[1] } ch_versions = ch_versions.mix(UNTAR_RSEM_INDEX.out.versions) } else { - ch_rsem_index = file(params.rsem_index) + ch_rsem_index = Channel.value(file(rsem_index)) } } else { ch_rsem_index = RSEM_PREPAREREFERENCE_GENOME ( ch_fasta, ch_gtf ).index @@ -207,18 +221,18 @@ workflow PREPARE_GENOME { ch_splicesites = Channel.empty() ch_hisat2_index = Channel.empty() if ('hisat2' in prepare_tool_indices) { - if (!params.splicesites) { + if (!splicesites) { ch_splicesites = HISAT2_EXTRACTSPLICESITES ( ch_gtf ).txt ch_versions = ch_versions.mix(HISAT2_EXTRACTSPLICESITES.out.versions) } else { - ch_splicesites = file(params.splicesites) + ch_splicesites = Channel.value(file(splicesites)) } - if (params.hisat2_index) { - if (params.hisat2_index.endsWith('.tar.gz')) { - ch_hisat2_index = UNTAR_HISAT2_INDEX ( [ [:], params.hisat2_index ] ).untar.map { it[1] } + if (hisat2_index) { + if (hisat2_index.endsWith('.tar.gz')) { + ch_hisat2_index = UNTAR_HISAT2_INDEX ( [ [:], hisat2_index ] ).untar.map { it[1] } ch_versions = ch_versions.mix(UNTAR_HISAT2_INDEX.out.versions) } else { - ch_hisat2_index = file(params.hisat2_index) + ch_hisat2_index = Channel.value(file(hisat2_index)) } } else { ch_hisat2_index = HISAT2_BUILD ( ch_fasta, ch_gtf, ch_splicesites ).index @@ -230,12 +244,12 @@ workflow PREPARE_GENOME { // Uncompress Salmon index or generate from scratch if required // ch_salmon_index = Channel.empty() - if (params.salmon_index) { - if (params.salmon_index.endsWith('.tar.gz')) { - ch_salmon_index = UNTAR_SALMON_INDEX ( [ [:], params.salmon_index ] ).untar.map { it[1] } + if (salmon_index) { + if (salmon_index.endsWith('.tar.gz')) { + ch_salmon_index = UNTAR_SALMON_INDEX ( [ [:], salmon_index ] ).untar.map { it[1] } ch_versions = ch_versions.mix(UNTAR_SALMON_INDEX.out.versions) } else { - ch_salmon_index = file(params.salmon_index) + ch_salmon_index = Channel.value(file(salmon_index)) } } else { if ('salmon' in prepare_tool_indices) { @@ -245,18 +259,18 @@ workflow PREPARE_GENOME { } emit: - fasta = ch_fasta // path: genome.fasta - gtf = ch_gtf // path: genome.gtf - fai = ch_fai // path: genome.fai - gene_bed = ch_gene_bed // path: gene.bed - transcript_fasta = ch_transcript_fasta // path: transcript.fasta - chrom_sizes = ch_chrom_sizes // path: genome.sizes - splicesites = ch_splicesites // path: genome.splicesites.txt - bbsplit_index = ch_bbsplit_index // path: bbsplit/index/ - star_index = ch_star_index // path: star/index/ - rsem_index = ch_rsem_index // path: rsem/index/ - 
hisat2_index = ch_hisat2_index // path: hisat2/index/ - salmon_index = ch_salmon_index // path: salmon/index/ + fasta = ch_fasta // channel: path(genome.fasta) + gtf = ch_gtf // channel: path(genome.gtf) + fai = ch_fai // channel: path(genome.fai) + gene_bed = ch_gene_bed // channel: path(gene.bed) + transcript_fasta = ch_transcript_fasta // channel: path(transcript.fasta) + chrom_sizes = ch_chrom_sizes // channel: path(genome.sizes) + splicesites = ch_splicesites // channel: path(genome.splicesites.txt) + bbsplit_index = ch_bbsplit_index // channel: path(bbsplit/index/) + star_index = ch_star_index // channel: path(star/index/) + rsem_index = ch_rsem_index // channel: path(rsem/index/) + hisat2_index = ch_hisat2_index // channel: path(hisat2/index/) + salmon_index = ch_salmon_index // channel: path(salmon/index/) versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] } diff --git a/subworkflows/nf-core/bam_dedup_stats_samtools_umitools/main.nf b/subworkflows/nf-core/bam_dedup_stats_samtools_umitools/main.nf index 9d4294f1c..6b9e2e7a0 100644 --- a/subworkflows/nf-core/bam_dedup_stats_samtools_umitools/main.nf +++ b/subworkflows/nf-core/bam_dedup_stats_samtools_umitools/main.nf @@ -8,8 +8,8 @@ include { BAM_STATS_SAMTOOLS } from '../bam_stats_samtools/main' workflow BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS { take: - bam_bai // channel: [ val(meta), [ bam ], [ bai/csi ] ] - get_dedup_stats // boolean: true/false + ch_bam_bai // channel: [ val(meta), path(bam), path(bai/csi) ] + val_get_dedup_stats // boolean: true/false main: @@ -18,7 +18,7 @@ workflow BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS { // // UMI-tools dedup // - UMITOOLS_DEDUP ( bam_bai, get_dedup_stats ) + UMITOOLS_DEDUP ( ch_bam_bai, val_get_dedup_stats ) ch_versions = ch_versions.mix(UMITOOLS_DEDUP.out.versions.first()) // @@ -27,7 +27,7 @@ workflow BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS { SAMTOOLS_INDEX ( UMITOOLS_DEDUP.out.bam ) ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.first()) - UMITOOLS_DEDUP.out.bam + ch_bam_bai_dedup = UMITOOLS_DEDUP.out.bam .join(SAMTOOLS_INDEX.out.bai, by: [0], remainder: true) .join(SAMTOOLS_INDEX.out.csi, by: [0], remainder: true) .map { @@ -38,19 +38,18 @@ workflow BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS { [ meta, bam, csi ] } } - .set { ch_bam_bai } - BAM_STATS_SAMTOOLS ( ch_bam_bai, [] ) + BAM_STATS_SAMTOOLS ( ch_bam_bai_dedup, [] ) ch_versions = ch_versions.mix(BAM_STATS_SAMTOOLS.out.versions) emit: - bam = UMITOOLS_DEDUP.out.bam // channel: [ val(meta), [ bam ] ] + bam = UMITOOLS_DEDUP.out.bam // channel: [ val(meta), path(bam) ] - bai = SAMTOOLS_INDEX.out.bai // channel: [ val(meta), [ bai ] ] - csi = SAMTOOLS_INDEX.out.csi // channel: [ val(meta), [ csi ] ] - stats = BAM_STATS_SAMTOOLS.out.stats // channel: [ val(meta), [ stats ] ] - flagstat = BAM_STATS_SAMTOOLS.out.flagstat // channel: [ val(meta), [ flagstat ] ] - idxstats = BAM_STATS_SAMTOOLS.out.idxstats // channel: [ val(meta), [ idxstats ] ] + bai = SAMTOOLS_INDEX.out.bai // channel: [ val(meta), path(bai) ] + csi = SAMTOOLS_INDEX.out.csi // channel: [ val(meta), path(csi) ] + stats = BAM_STATS_SAMTOOLS.out.stats // channel: [ val(meta), path(stats) ] + flagstat = BAM_STATS_SAMTOOLS.out.flagstat // channel: [ val(meta), path(flagstat) ] + idxstats = BAM_STATS_SAMTOOLS.out.idxstats // channel: [ val(meta), path(idxstats) ] - versions = ch_versions // channel: [ versions.yml ] + versions = ch_versions // channel: [ path(versions.yml) ] } diff --git a/subworkflows/nf-core/bam_dedup_stats_samtools_umitools/meta.yml 
b/subworkflows/nf-core/bam_dedup_stats_samtools_umitools/meta.yml index a3b29479d..c5ecdb01a 100644 --- a/subworkflows/nf-core/bam_dedup_stats_samtools_umitools/meta.yml +++ b/subworkflows/nf-core/bam_dedup_stats_samtools_umitools/meta.yml @@ -14,43 +14,44 @@ modules: - samtools/idxstats - samtools/flagstat input: - - bam_bai: - type: file - description: BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - - get_dedup_stats: + - ch_bam_bai: + description: | + input BAM file + Structure: [ val(meta), path(bam), path(bai) ] + - val_get_dedup_stats: type: boolean description: | Generate output stats when running "umi_tools dedup" output: - bam: - type: file - description: Umi deduplicated BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" + description: | + Umi deduplicated BAM/SAM file + Structure: [ val(meta), path(bam) ] - bai: - type: file - description: Umi deduplicated BAM/CRAM/SAM samtools index - pattern: "*.{bai,crai,sai}" + description: | + Umi deduplicated BAM/SAM samtools index + Structure: [ val(meta), path(bai) ] - csi: - type: file - description: CSI samtools index - pattern: "*.csi" + description: | + CSI samtools index + Structure: [ val(meta), path(csi) ] - stats: - type: file - description: File containing samtools stats output - pattern: "*.{stats}" + description: | + File containing samtools stats output + Structure: [ val(meta), path(stats) ] - flagstat: - type: file - description: File containing samtools flagstat output - pattern: "*.{flagstat}" + description: | + File containing samtools flagstat output + Structure: [ val(meta), path(flagstat) ] - idxstats: - type: file - description: File containing samtools idxstats output - pattern: "*.{idxstats}" + description: | + File containing samtools idxstats output + Structure: [ val(meta), path(idxstats) ] - versions: - type: file - description: File containing software versions - pattern: "versions.yml" + description: | + Files containing software versions + Structure: [ path(versions.yml) ] + authors: - "@drpatelh" - "@KamilMaliszArdigen" diff --git a/subworkflows/nf-core/bam_markduplicates_picard/main.nf b/subworkflows/nf-core/bam_markduplicates_picard/main.nf index 9cb24cdc3..6e3df3320 100644 --- a/subworkflows/nf-core/bam_markduplicates_picard/main.nf +++ b/subworkflows/nf-core/bam_markduplicates_picard/main.nf @@ -9,9 +9,9 @@ include { BAM_STATS_SAMTOOLS } from '../bam_stats_samtools/main' workflow BAM_MARKDUPLICATES_PICARD { take: - ch_bam // channel: [ val(meta), [ bam ] ] - ch_fasta // channel: [ fasta ] - ch_fai // channel: [ fai ] + ch_bam // channel: [ val(meta), path(bam) ] + ch_fasta // channel: [ path(fasta) ] + ch_fai // channel: [ path(fai) ] main: @@ -23,7 +23,7 @@ workflow BAM_MARKDUPLICATES_PICARD { SAMTOOLS_INDEX ( PICARD_MARKDUPLICATES.out.bam ) ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.first()) - PICARD_MARKDUPLICATES.out.bam + ch_bam_bai = PICARD_MARKDUPLICATES.out.bam .join(SAMTOOLS_INDEX.out.bai, by: [0], remainder: true) .join(SAMTOOLS_INDEX.out.csi, by: [0], remainder: true) .map { @@ -34,20 +34,19 @@ workflow BAM_MARKDUPLICATES_PICARD { [ meta, bam, csi ] } } - .set { ch_bam_bai } BAM_STATS_SAMTOOLS ( ch_bam_bai, ch_fasta ) ch_versions = ch_versions.mix(BAM_STATS_SAMTOOLS.out.versions) emit: - bam = PICARD_MARKDUPLICATES.out.bam // channel: [ val(meta), [ bam ] ] - metrics = PICARD_MARKDUPLICATES.out.metrics // channel: [ val(meta), [ bam ] ] - bai = SAMTOOLS_INDEX.out.bai // channel: [ val(meta), [ bai ] ] - csi = SAMTOOLS_INDEX.out.csi // channel: [ val(meta), [ csi ] ] - - stats = 
BAM_STATS_SAMTOOLS.out.stats // channel: [ val(meta), [ stats ] ] - flagstat = BAM_STATS_SAMTOOLS.out.flagstat // channel: [ val(meta), [ flagstat ] ] - idxstats = BAM_STATS_SAMTOOLS.out.idxstats // channel: [ val(meta), [ idxstats ] ] + bam = PICARD_MARKDUPLICATES.out.bam // channel: [ val(meta), path(bam) ] + metrics = PICARD_MARKDUPLICATES.out.metrics // channel: [ val(meta), path(metrics) ] + bai = SAMTOOLS_INDEX.out.bai // channel: [ val(meta), path(bai) ] + csi = SAMTOOLS_INDEX.out.csi // channel: [ val(meta), path(csi) ] + + stats = BAM_STATS_SAMTOOLS.out.stats // channel: [ val(meta), path(stats) ] + flagstat = BAM_STATS_SAMTOOLS.out.flagstat // channel: [ val(meta), path(flagstat) ] + idxstats = BAM_STATS_SAMTOOLS.out.idxstats // channel: [ val(meta), path(idxstats) ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/nf-core/bam_markduplicates_picard/meta.yml b/subworkflows/nf-core/bam_markduplicates_picard/meta.yml index fdd9f8d1c..822c61328 100644 --- a/subworkflows/nf-core/bam_markduplicates_picard/meta.yml +++ b/subworkflows/nf-core/bam_markduplicates_picard/meta.yml @@ -14,48 +14,47 @@ modules: - samtools/flagstat input: - - meta: - type: map + - ch_bam: description: | - Groovy Map containing sample information - e.g. [ id:'test' ] - - bam: - type: file - description: BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - - fasta: - type: file - description: Reference genome fasta file - pattern: "*.{fasta,fa}" - + BAM/CRAM/SAM file + Structure: [ val(meta), path(bam) ] + - ch_fasta: + description: | + Reference genome fasta file + Structure: [ path(fasta) ] + - ch_fai: + description: | + Index of the reference genome fasta file + Structure: [ path(fai) ] output: - - meta: - type: map + - bam: description: | - Groovy Map containing sample information - e.g.
[ id:'test' ] + processed BAM/CRAM/SAM file + Structure: [ val(meta), path(bam) ] - bai: - type: file - description: BAM/CRAM/SAM samtools index - pattern: "*.{bai,crai,sai}" + description: | + BAM/CRAM/SAM samtools index + Structure: [ val(meta), path(bai) ] - csi: - type: file - description: CSI samtools index - pattern: "*.csi" + description: | + CSI samtools index + Structure: [ val(meta), path(csi) ] - stats: - type: file - description: File containing samtools stats output + description: | + File containing samtools stats output + Structure: [ val(meta), path(stats) ] - flagstat: - type: file - description: File containing samtools flagstat output + description: | + File containing samtools flagstat output + Structure: [ val(meta), path(flagstat) ] - idxstats: - type: file - description: File containing samtools idxstats output - pattern: "*.{idxstats}" + description: | + File containing samtools idxstats output + Structure: [ val(meta), path(idxstats) ] - versions: - type: file - description: File containing software versions - pattern: "versions.yml" + description: | + Files containing software versions + Structure: [ path(versions.yml) ] authors: - "@dmarron" - "@drpatelh" diff --git a/subworkflows/nf-core/bam_rseqc/meta.yml b/subworkflows/nf-core/bam_rseqc/meta.yml index 1e1fa18dc..cc074c21f 100644 --- a/subworkflows/nf-core/bam_rseqc/meta.yml +++ b/subworkflows/nf-core/bam_rseqc/meta.yml @@ -31,9 +31,9 @@ input: description: BAM file to calculate statistics pattern: "*.{bam}" - bai: - type: file - description: Index for input BAM file - pattern: "*.{bai}" + type: file + description: Index for input BAM file + pattern: "*.{bai}" - bed: type: file description: BED file for the reference gene model diff --git a/subworkflows/nf-core/bam_stats_samtools/main.nf b/subworkflows/nf-core/bam_stats_samtools/main.nf index cfcc48dd6..c9d7c8b75 100644 --- a/subworkflows/nf-core/bam_stats_samtools/main.nf +++ b/subworkflows/nf-core/bam_stats_samtools/main.nf @@ -8,25 +8,25 @@ include { SAMTOOLS_FLAGSTAT } from '../../../modules/nf-core/samtools/flagstat/m workflow BAM_STATS_SAMTOOLS { take: - bam_bai // channel: [ val(meta), [ bam/cram ], [bai/csi] ] - fasta // channel: [ fasta ] + ch_bam_bai // channel: [ val(meta), path(bam), path(bai) ] + ch_fasta // channel: [ path(fasta) ] main: ch_versions = Channel.empty() - SAMTOOLS_STATS ( bam_bai, fasta ) + SAMTOOLS_STATS ( ch_bam_bai, ch_fasta ) ch_versions = ch_versions.mix(SAMTOOLS_STATS.out.versions) - SAMTOOLS_FLAGSTAT ( bam_bai ) + SAMTOOLS_FLAGSTAT ( ch_bam_bai ) ch_versions = ch_versions.mix(SAMTOOLS_FLAGSTAT.out.versions) - SAMTOOLS_IDXSTATS ( bam_bai ) + SAMTOOLS_IDXSTATS ( ch_bam_bai ) ch_versions = ch_versions.mix(SAMTOOLS_IDXSTATS.out.versions) emit: - stats = SAMTOOLS_STATS.out.stats // channel: [ val(meta), [ stats ] ] - flagstat = SAMTOOLS_FLAGSTAT.out.flagstat // channel: [ val(meta), [ flagstat ] ] - idxstats = SAMTOOLS_IDXSTATS.out.idxstats // channel: [ val(meta), [ idxstats ] ] + stats = SAMTOOLS_STATS.out.stats // channel: [ val(meta), path(stats) ] + flagstat = SAMTOOLS_FLAGSTAT.out.flagstat // channel: [ val(meta), path(flagstat) ] + idxstats = SAMTOOLS_IDXSTATS.out.idxstats // channel: [ val(meta), path(idxstats) ] - versions = ch_versions // channel: [ versions.yml ] + versions = ch_versions // channel: [ path(versions.yml) ] } diff --git a/subworkflows/nf-core/bam_stats_samtools/meta.yml b/subworkflows/nf-core/bam_stats_samtools/meta.yml index 5252b0e42..b6072686e 100644 --- 
a/subworkflows/nf-core/bam_stats_samtools/meta.yml +++ b/subworkflows/nf-core/bam_stats_samtools/meta.yml @@ -11,44 +11,30 @@ modules: - samtools/idxstats - samtools/flagstat input: - - meta: - type: map + - ch_bam_bai: description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - bam: - type: file - description: BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - - bai: - type: file - description: Index for BAM/CRAM/SAM file - pattern: "*.{bai,crai,sai}" - - fasta: - type: file - description: Reference genome fasta file - pattern: "*.{fasta,fa}" -output: - - meta: - type: map + The input channel containing the BAM/CRAM and its index + Structure: [ val(meta), path(bam), path(bai) ] + - ch_fasta: description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] + Reference genome fasta file + Structure: [ path(fasta) ] +output: - stats: - type: file - description: File containing samtools stats output - pattern: "*.{stats}" + description: | + File containing samtools stats output + Structure: [ val(meta), path(stats) ] - flagstat: - type: file - description: File containing samtools flagstat output - pattern: "*.{flagstat}" + description: | + File containing samtools flagstat output + Structure: [ val(meta), path(flagstat) ] - idxstats: - type: file - description: File containing samtools idxstats output - pattern: "*.{idxstats}" + description: | + File containing samtools idxstats output + Structure: [ val(meta), path(idxstats) ] - versions: - type: file - description: File containing software versions - pattern: "versions.yml" + description: | + Files containing software versions + Structure: [ path(versions.yml) ] authors: - "@drpatelh" diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf new file mode 100644 index 000000000..42638943d --- /dev/null +++ b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf @@ -0,0 +1,140 @@ +// +// Read QC, UMI extraction and trimming +// + +include { FASTQC as FASTQC_RAW } from '../../../modules/nf-core/fastqc/main' +include { FASTQC as FASTQC_TRIM } from '../../../modules/nf-core/fastqc/main' +include { UMITOOLS_EXTRACT } from '../../../modules/nf-core/umitools/extract/main' +include { FASTP } from '../../../modules/nf-core/fastp/main' + +// +// Function that parses fastp json output file to get total number of reads after trimming +// +import groovy.json.JsonSlurper + +def getFastpReadsAfterFiltering(json_file) { + def Map json = (Map) new JsonSlurper().parseText(json_file.text).get('summary') + return json['after_filtering']['total_reads'].toInteger() +} + +workflow FASTQ_FASTQC_UMITOOLS_FASTP { + take: + reads // channel: [ val(meta), [ reads ] ] + skip_fastqc // boolean: true/false + with_umi // boolean: true/false + skip_umi_extract // boolean: true/false + umi_discard_read // integer: 0, 1 or 2 + skip_trimming // boolean: true/false + adapter_fasta // file: adapter.fasta + save_trimmed_fail // boolean: true/false + save_merged // boolean: true/false + min_trimmed_reads // integer: > 0 + + main: + ch_versions = Channel.empty() + fastqc_raw_html = Channel.empty() + fastqc_raw_zip = Channel.empty() + if (!skip_fastqc) { + FASTQC_RAW ( + reads + ) + fastqc_raw_html = FASTQC_RAW.out.html + fastqc_raw_zip = FASTQC_RAW.out.zip + ch_versions = ch_versions.mix(FASTQC_RAW.out.versions.first()) + } + + umi_reads = reads + umi_log = Channel.empty() + if (with_umi && !skip_umi_extract) { + UMITOOLS_EXTRACT (
reads + ) + umi_reads = UMITOOLS_EXTRACT.out.reads + umi_log = UMITOOLS_EXTRACT.out.log + ch_versions = ch_versions.mix(UMITOOLS_EXTRACT.out.versions.first()) + + // Discard R1 / R2 if required + if (umi_discard_read in [1,2]) { + UMITOOLS_EXTRACT + .out + .reads + .map { + meta, reads -> + meta.single_end ? [ meta, reads ] : [ meta + [single_end: true], reads[umi_discard_read % 2] ] + } + .set { umi_reads } + } + } + + trim_reads = umi_reads + trim_json = Channel.empty() + trim_html = Channel.empty() + trim_log = Channel.empty() + trim_reads_fail = Channel.empty() + trim_reads_merged = Channel.empty() + fastqc_trim_html = Channel.empty() + fastqc_trim_zip = Channel.empty() + trim_read_count = Channel.empty() + if (!skip_trimming) { + FASTP ( + umi_reads, + adapter_fasta, + save_trimmed_fail, + save_merged + ) + trim_json = FASTP.out.json + trim_html = FASTP.out.html + trim_log = FASTP.out.log + trim_reads_fail = FASTP.out.reads_fail + trim_reads_merged = FASTP.out.reads_merged + ch_versions = ch_versions.mix(FASTP.out.versions.first()) + + // + // Filter FastQ files based on minimum trimmed read count after adapter trimming + // + FASTP + .out + .reads + .join(trim_json) + .map { meta, reads, json -> [ meta, reads, getFastpReadsAfterFiltering(json) ] } + .set { ch_num_trimmed_reads } + + ch_num_trimmed_reads + .filter { meta, reads, num_reads -> num_reads >= min_trimmed_reads.toInteger() } + .map { meta, reads, num_reads -> [ meta, reads ] } + .set { trim_reads } + + ch_num_trimmed_reads + .map { meta, reads, num_reads -> [ meta, num_reads ] } + .set { trim_read_count } + + if (!skip_fastqc) { + FASTQC_TRIM ( + trim_reads + ) + fastqc_trim_html = FASTQC_TRIM.out.html + fastqc_trim_zip = FASTQC_TRIM.out.zip + ch_versions = ch_versions.mix(FASTQC_TRIM.out.versions.first()) + } + } + + emit: + reads = trim_reads // channel: [ val(meta), [ reads ] ] + + fastqc_raw_html // channel: [ val(meta), [ html ] ] + fastqc_raw_zip // channel: [ val(meta), [ zip ] ] + + umi_log // channel: [ val(meta), [ log ] ] + + trim_json // channel: [ val(meta), [ json ] ] + trim_html // channel: [ val(meta), [ html ] ] + trim_log // channel: [ val(meta), [ log ] ] + trim_reads_fail // channel: [ val(meta), [ fastq.gz ] ] + trim_reads_merged // channel: [ val(meta), [ fastq.gz ] ] + trim_read_count // channel: [ val(meta), val(count) ] + + fastqc_trim_html // channel: [ val(meta), [ html ] ] + fastqc_trim_zip // channel: [ val(meta), [ zip ] ] + + versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] +} diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/meta.yml b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/meta.yml new file mode 100644 index 000000000..d80ebc0a1 --- /dev/null +++ b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/meta.yml @@ -0,0 +1,122 @@ +name: "fastq_fastqc_umitools_fastp" +description: Read QC, UMI extraction and trimming +keywords: + - fastq + - fastqc + - qc + - UMI + - trimming + - fastp +modules: + - fastqc + - umitools/extract + - fastp +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively.
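The read-count gate in the new fastp subworkflow above needs nothing from the JSON report beyond summary.after_filtering.total_reads; samples whose count falls below min_trimmed_reads never reach the "reads" emission. A self-contained Groovy sketch of the parse, using a stand-in report with a hypothetical count:

    import groovy.json.JsonSlurper

    // Stand-in for a fastp report, reduced to the only fields the gate reads
    def json_text = '{"summary": {"after_filtering": {"total_reads": 9482}}}'
    def num_reads = new JsonSlurper().parseText(json_text)['summary']['after_filtering']['total_reads'].toInteger()
    assert num_reads == 9482   // the sample is kept only if num_reads >= min_trimmed_reads
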
+ - skip_fastqc: + type: boolean + description: | + Skip fastqc process + - with_umi: + type: boolean + description: | + Whether the input reads contain UMIs + - skip_umi_extract: + type: boolean + description: | + Whether to skip UMI extraction + - umi_discard_read: + type: integer + description: | + Discard R1 / R2 if required + - skip_trimming: + type: boolean + description: | + Allows skipping the fastp trimming step + - adapter_fasta: + type: file + description: | + Fasta file of adapter sequences + - save_trimmed_fail: + type: boolean + description: | + Save trimmed fastqs of failed samples + - save_merged: + type: boolean + description: | + Save merged fastqs + - min_trimmed_reads: + type: integer + description: | + Inputs with fewer than this many reads will be filtered out of the "reads" output channel +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - reads: + type: file + description: > + Extracted FASTQ files. | + For single-end reads, pattern is \${prefix}.umi_extract.fastq.gz. | + For paired-end reads, pattern is \${prefix}.umi_extract_{1,2}.fastq.gz. + pattern: "*.{fastq.gz}" + - fastqc_html: + type: file + description: FastQC report + pattern: "*_{fastqc.html}" + - fastqc_zip: + type: file + description: FastQC report archive + pattern: "*_{fastqc.zip}" + - log: + type: file + description: Logfile for umi_tools + pattern: "*.{log}" + - trim_json: + type: file + description: FastP Trimming report + pattern: "*.{fastp.json}" + - trim_html: + type: file + description: FastP Trimming report + pattern: "*.{fastp.html}" + - log: + type: file + description: Logfile FastP + pattern: "*.{fastp.log}" + - trim_reads_fail: + type: file + description: Trimmed fastq files failing QC + pattern: "*.{fastq.gz}" + - trim_reads_merged: + type: file + description: Trimmed and merged fastq files + pattern: "*.{fastq.gz}" + - trim_read_count: + type: integer + description: Number of reads after trimming + - fastqc_trim_html: + type: file + description: FastQC report + pattern: "*_{fastqc.html}" + - fastqc_trim_zip: + type: file + description: FastQC report archive + pattern: "*_{fastqc.zip}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@robsyme" diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_trimgalore/main.nf b/subworkflows/nf-core/fastq_fastqc_umitools_trimgalore/main.nf index 36295a167..f7f48d15b 100644 --- a/subworkflows/nf-core/fastq_fastqc_umitools_trimgalore/main.nf +++ b/subworkflows/nf-core/fastq_fastqc_umitools_trimgalore/main.nf @@ -23,20 +23,22 @@ def getTrimGaloreReadsAfterFiltering(log_file) { workflow FASTQ_FASTQC_UMITOOLS_TRIMGALORE { take: - reads // channel: [ val(meta), [ reads ] ] - skip_fastqc // boolean: true/false - with_umi // boolean: true/false - skip_umi_extract // boolean: true/false - skip_trimming // boolean: true/false - umi_discard_read // integer: 0, 1 or 2 + reads // channel: [ val(meta), [ reads ] ] + skip_fastqc // boolean: true/false + with_umi // boolean: true/false + skip_umi_extract // boolean: true/false + skip_trimming // boolean: true/false + umi_discard_read // integer: 0, 1 or 2 + min_trimmed_reads // integer: > 0 main: - ch_versions = Channel.empty() + fastqc_html = Channel.empty() fastqc_zip = Channel.empty() if (!skip_fastqc) { - FASTQC ( reads ).html.set { fastqc_html } + FASTQC (reads) + fastqc_html = FASTQC.out.html fastqc_zip = FASTQC.out.zip ch_versions = ch_versions.mix(FASTQC.out.versions.first()) } @@ -44,8 +46,8
@@ workflow FASTQ_FASTQC_UMITOOLS_TRIMGALORE { umi_reads = reads umi_log = Channel.empty() if (with_umi && !skip_umi_extract) { - - UMITOOLS_EXTRACT ( reads ).reads.set { umi_reads } + UMITOOLS_EXTRACT (reads) + umi_reads = UMITOOLS_EXTRACT.out.reads umi_log = UMITOOLS_EXTRACT.out.log ch_versions = ch_versions.mix(UMITOOLS_EXTRACT.out.versions.first()) @@ -54,24 +56,22 @@ workflow FASTQ_FASTQC_UMITOOLS_TRIMGALORE { UMITOOLS_EXTRACT .out .reads - .map { meta, reads -> - if (!meta.single_end) { - meta['single_end'] = true - reads = reads[umi_discard_read % 2] - } - return [ meta, reads ] + .map { + meta, reads -> + meta.single_end ? [ meta, reads ] : [ meta + ['single_end': true], reads[umi_discard_read % 2] ] } .set { umi_reads } } } - trim_reads = umi_reads - trim_unpaired = Channel.empty() - trim_html = Channel.empty() - trim_zip = Channel.empty() - trim_log = Channel.empty() + trim_reads = umi_reads + trim_unpaired = Channel.empty() + trim_html = Channel.empty() + trim_zip = Channel.empty() + trim_log = Channel.empty() + trim_read_count = Channel.empty() if (!skip_trimming) { - TRIMGALORE ( umi_reads ).reads.set { trim_reads } + TRIMGALORE (umi_reads) trim_unpaired = TRIMGALORE.out.unpaired trim_html = TRIMGALORE.out.html trim_zip = TRIMGALORE.out.zip @@ -79,24 +79,31 @@ workflow FASTQ_FASTQC_UMITOOLS_TRIMGALORE { ch_versions = ch_versions.mix(TRIMGALORE.out.versions.first()) // - // Filter empty FastQ files after adapter trimming + // Filter FastQ files based on minimum trimmed read count after adapter trimming // - trim_reads + TRIMGALORE + .out + .reads .join(trim_log, remainder: true) .map { meta, reads, trim_log -> if (trim_log) { - if (!meta.single_end) { - trim_log = trim_log[-1] - } - if (getTrimGaloreReadsAfterFiltering(trim_log) > 0) { - [ meta, reads ] - } + num_reads = getTrimGaloreReadsAfterFiltering(meta.single_end ? trim_log : trim_log[-1]) + [ meta, reads, num_reads ] } else { - [ meta, reads ] + [ meta, reads, min_trimmed_reads.toFloat() + 1 ] } } + .set { ch_num_trimmed_reads } + + ch_num_trimmed_reads + .filter { meta, reads, num_reads -> num_reads >= min_trimmed_reads.toFloat() } + .map { meta, reads, num_reads -> [ meta, reads ] } .set { trim_reads } + + ch_num_trimmed_reads + .map { meta, reads, num_reads -> [ meta, num_reads ] } + .set { trim_read_count } } emit: @@ -111,6 +118,7 @@ workflow FASTQ_FASTQC_UMITOOLS_TRIMGALORE { trim_html // channel: [ val(meta), [ html ] ] trim_zip // channel: [ val(meta), [ zip ] ] trim_log // channel: [ val(meta), [ txt ] ] + trim_read_count // channel: [ val(meta), val(count) ] versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] } diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_trimgalore/meta.yml b/subworkflows/nf-core/fastq_fastqc_umitools_trimgalore/meta.yml index 02a02a6a1..b05004d45 100644 --- a/subworkflows/nf-core/fastq_fastqc_umitools_trimgalore/meta.yml +++ b/subworkflows/nf-core/fastq_fastqc_umitools_trimgalore/meta.yml @@ -42,6 +42,10 @@ input: type: integer description: | Discard R1 / R2 if required + - min_trimmed_reads: + type: integer + description: | + Inputs with fewer than this many reads will be filtered out of the "reads" output channel output: - reads: @@ -80,6 +84,9 @@ output: type: file description: Trim Galore!
diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_trimgalore/meta.yml b/subworkflows/nf-core/fastq_fastqc_umitools_trimgalore/meta.yml
index 02a02a6a1..b05004d45 100644
--- a/subworkflows/nf-core/fastq_fastqc_umitools_trimgalore/meta.yml
+++ b/subworkflows/nf-core/fastq_fastqc_umitools_trimgalore/meta.yml
@@ -42,6 +42,10 @@ input:
       type: integer
       description: |
         Discard R1 / R2 if required
+  - min_trimmed_reads:
+      type: integer
+      description: |
+        Inputs with fewer reads than this threshold will be filtered out of the "reads" output channel
 
 output:
   - reads:
@@ -80,6 +84,9 @@ output:
       type: file
       description: Trim Galore! trimming report
       pattern: "*_{report.txt}"
+  - trim_read_count:
+      type: integer
+      description: Number of reads remaining after trimming, reported for each input sample
   - versions:
       type: file
       description: File containing software versions
"gene_type" : params.featurecounts_group_type PREPARE_GENOME ( - prepareToolIndices, + params.fasta, + params.gtf, + params.gff, + params.additional_fasta, + params.transcript_fasta, + params.gene_bed, + params.splicesites, + params.bbsplit_fasta_list, + params.star_index, + params.rsem_index, + params.salmon_index, + params.hisat2_index, + params.bbsplit_index, + params.gencode, + is_aws_igenome, biotype, - is_aws_igenome - + prepareToolIndices ) ch_versions = ch_versions.mix(PREPARE_GENOME.out.versions) @@ -230,9 +243,15 @@ workflow RNASEQ { // // SUBWORKFLOW: Sub-sample FastQ files and pseudo-align with Salmon to auto-infer strandedness // + // Return empty channel if ch_strand_fastq.auto_strand is empty so salmon index isn't created + PREPARE_GENOME.out.fasta + .combine(ch_strand_fastq.auto_strand) + .map { it.first() } + .set { ch_genome_fasta } + FASTQ_SUBSAMPLE_FQ_SALMON ( ch_strand_fastq.auto_strand, - PREPARE_GENOME.out.fasta, + ch_genome_fasta, PREPARE_GENOME.out.transcript_fasta, PREPARE_GENOME.out.gtf, PREPARE_GENOME.out.salmon_index, @@ -251,60 +270,74 @@ workflow RNASEQ { .set { ch_strand_inferred_fastq } // - // SUBWORKFLOW: Read QC, extract UMI and trim adapters - // - FASTQ_FASTQC_UMITOOLS_TRIMGALORE ( - ch_strand_inferred_fastq, - params.skip_fastqc || params.skip_qc, - params.with_umi, - params.skip_umi_extract, - params.skip_trimming, - params.umi_discard_read - ) - ch_versions = ch_versions.mix(FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.versions) + // SUBWORKFLOW: Read QC, extract UMI and trim adapters with TrimGalore! + // + ch_filtered_reads = Channel.empty() + ch_fastqc_raw_multiqc = Channel.empty() + ch_fastqc_trim_multiqc = Channel.empty() + ch_trim_log_multiqc = Channel.empty() + ch_trim_read_count = Channel.empty() + if (params.trimmer == 'trimgalore') { + FASTQ_FASTQC_UMITOOLS_TRIMGALORE ( + ch_strand_inferred_fastq, + params.skip_fastqc || params.skip_qc, + params.with_umi, + params.skip_umi_extract, + params.skip_trimming, + params.umi_discard_read, + params.min_trimmed_reads + ) + ch_filtered_reads = FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.reads + ch_fastqc_raw_multiqc = FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.fastqc_zip + ch_fastqc_trim_multiqc = FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.trim_zip + ch_trim_log_multiqc = FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.trim_log + ch_trim_read_count = FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.trim_read_count + ch_versions = ch_versions.mix(FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.versions) + } // - // Filter channels to get samples that passed minimum trimmed read count + // SUBWORKFLOW: Read QC, extract UMI and trim adapters with fastp // - ch_fail_trimming_multiqc = Channel.empty() - ch_filtered_reads = FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.reads - if (!params.skip_trimming) { - ch_filtered_reads - .join(FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.trim_log, remainder: true) - .map { - meta, reads, trim_log -> - if (trim_log) { - if (!meta.single_end) { - trim_log = trim_log[-1] - } - num_reads = WorkflowRnaseq.getTrimGaloreReadsAfterFiltering(trim_log) - [ meta, reads, num_reads ] - } else { - [ meta, reads, params.min_trimmed_reads + 1 ] - } - } - .set { ch_num_trimmed_reads } - - ch_num_trimmed_reads - .map { meta, reads, num_reads -> if (num_reads > params.min_trimmed_reads) [ meta, reads ] } - .set { ch_filtered_reads } + if (params.trimmer == 'fastp') { + FASTQ_FASTQC_UMITOOLS_FASTP ( + ch_strand_inferred_fastq, + params.skip_fastqc || params.skip_qc, + params.with_umi, + params.skip_umi_extract, + params.umi_discard_read, + params.skip_trimming, 
+            [],
+            params.save_trimmed,
+            params.save_trimmed,
+            params.min_trimmed_reads
+        )
+        ch_filtered_reads      = FASTQ_FASTQC_UMITOOLS_FASTP.out.reads
+        ch_fastqc_raw_multiqc  = FASTQ_FASTQC_UMITOOLS_FASTP.out.fastqc_raw_zip
+        ch_fastqc_trim_multiqc = FASTQ_FASTQC_UMITOOLS_FASTP.out.fastqc_trim_zip
+        ch_trim_log_multiqc    = FASTQ_FASTQC_UMITOOLS_FASTP.out.trim_json
+        ch_trim_read_count     = FASTQ_FASTQC_UMITOOLS_FASTP.out.trim_read_count
+        ch_versions = ch_versions.mix(FASTQ_FASTQC_UMITOOLS_FASTP.out.versions)
+    }
 
-        ch_num_trimmed_reads
-            .map {
-                meta, reads, num_reads ->
-                    if (num_reads <= params.min_trimmed_reads) {
+    //
+    // Get list of samples that failed trimming threshold for MultiQC report
+    //
+    ch_trim_read_count
+        .map {
+            meta, num_reads ->
+                pass_trimmed_reads[meta.id] = true
+                if (num_reads <= params.min_trimmed_reads.toFloat()) {
+                    pass_trimmed_reads[meta.id] = false
                     return [ "$meta.id\t$num_reads" ]
                 }
-            }
-            .set { ch_num_trimmed_reads }
-
-        MULTIQC_TSV_FAIL_TRIMMED (
-            ch_num_trimmed_reads.collect(),
-            ["Sample", "Reads after trimming"],
-            'fail_trimmed_samples'
-        )
+        }
+        .collect()
+        .map {
+            tsv_data ->
+                def header = ["Sample", "Reads after trimming"]
+                WorkflowRnaseq.multiqcTsvFromList(tsv_data, header)
+        }
         .set { ch_fail_trimming_multiqc }
-    }
 
     //
     // MODULE: Remove genome contaminant reads
@@ -522,19 +555,19 @@ workflow RNASEQ {
        //
        // SUBWORKFLOW: Remove duplicate reads from BAM file based on UMIs
        //
        if (params.with_umi) {
-            BAM_DEDUP_STATS_SAMTOOLS_UMI_UMITOOLS_GENOME (
+            BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_GENOME (
                ch_genome_bam.join(ch_genome_bam_index, by: [0]),
                params.umitools_dedup_stats
            )
-            ch_genome_bam        = BAM_DEDUP_STATS_SAMTOOLS_UMI_UMITOOLS_GENOME.out.bam
-            ch_genome_bam_index  = BAM_DEDUP_STATS_SAMTOOLS_UMI_UMITOOLS_GENOME.out.bai
-            ch_samtools_stats    = BAM_DEDUP_STATS_SAMTOOLS_UMI_UMITOOLS_GENOME.out.stats
-            ch_samtools_flagstat = BAM_DEDUP_STATS_SAMTOOLS_UMI_UMITOOLS_GENOME.out.flagstat
-            ch_samtools_idxstats = BAM_DEDUP_STATS_SAMTOOLS_UMI_UMITOOLS_GENOME.out.idxstats
+            ch_genome_bam        = BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_GENOME.out.bam
+            ch_genome_bam_index  = BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_GENOME.out.bai
+            ch_samtools_stats    = BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_GENOME.out.stats
+            ch_samtools_flagstat = BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_GENOME.out.flagstat
+            ch_samtools_idxstats = BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_GENOME.out.idxstats
            if (params.bam_csi_index) {
-                ch_genome_bam_index = BAM_DEDUP_STATS_SAMTOOLS_UMI_UMITOOLS_GENOME.out.csi
+                ch_genome_bam_index = BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_GENOME.out.csi
            }
-            ch_versions = ch_versions.mix(BAM_DEDUP_STATS_SAMTOOLS_UMI_UMITOOLS_GENOME.out.versions)
+            ch_versions = ch_versions.mix(BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_GENOME.out.versions)
        }
    }
 
@@ -560,24 +593,23 @@ workflow RNASEQ {
            ch_percent_mapped
                .branch { meta, mapped, pass ->
                    pass: pass
-                        pass_percent_mapped[meta.id] = mapped
+                        pass_mapped_reads[meta.id] = true
                        return [ "$meta.id\t$mapped" ]
                    fail: !pass
-                        fail_percent_mapped[meta.id] = mapped
+                        pass_mapped_reads[meta.id] = false
                        return [ "$meta.id\t$mapped" ]
                }
                .set { ch_pass_fail_mapped }
 
-            def header = [
-                "Sample",
-                "STAR uniquely mapped reads (%)"
-            ]
-            MULTIQC_TSV_FAIL_MAPPED (
-                ch_pass_fail_mapped.fail.collect(),
-                header,
-                'fail_mapped_samples'
-            )
-            .set { ch_fail_mapping_multiqc }
+            ch_pass_fail_mapped
+                .fail
+                .collect()
+                .map {
+                    tsv_data ->
+                        def header = ["Sample", "STAR uniquely mapped reads (%)"]
+                        WorkflowRnaseq.multiqcTsvFromList(tsv_data, header)
+                }
+                .set { ch_fail_mapping_multiqc }
        }
 
    //
@@ -733,27 +765,29 @@ workflow RNASEQ {
            ch_versions = ch_versions.mix(BAM_RSEQC.out.versions)
 
            ch_inferexperiment_multiqc
-                .map { meta, strand_log -> [ meta ] + WorkflowRnaseq.getInferexperimentStrandedness(strand_log, 30) }
-                .filter { it[0].strandedness != it[1] }
-                .map { meta, strandedness, sense, antisense, undetermined ->
-                    [ "$meta.id\t$meta.strandedness\t$strandedness\t$sense\t$antisense\t$undetermined" ]
+                .map {
+                    meta, strand_log ->
+                        def inferred_strand = WorkflowRnaseq.getInferexperimentStrandedness(strand_log, 30)
+                        pass_strand_check[meta.id] = true
+                        if (meta.strandedness != inferred_strand[0]) {
+                            pass_strand_check[meta.id] = false
+                            return [ "$meta.id\t$meta.strandedness\t${inferred_strand.join('\t')}" ]
+                        }
                }
-                .set { ch_fail_strand }
-
-            def header = [
-                "Sample",
-                "Provided strandedness",
-                "Inferred strandedness",
-                "Sense (%)",
-                "Antisense (%)",
-                "Undetermined (%)"
-            ]
-            MULTIQC_TSV_STRAND_CHECK (
-                ch_fail_strand.collect(),
-                header,
-                'fail_strand_check'
-            )
-            .set { ch_fail_strand_multiqc }
+                .collect()
+                .map {
+                    tsv_data ->
+                        def header = [
+                            "Sample",
+                            "Provided strandedness",
+                            "Inferred strandedness",
+                            "Sense (%)",
+                            "Antisense (%)",
+                            "Undetermined (%)"
+                        ]
+                        WorkflowRnaseq.multiqcTsvFromList(tsv_data, header)
+                }
+                .set { ch_fail_strand_multiqc }
        }
    }
 
@@ -811,12 +845,12 @@ workflow RNASEQ {
            ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'),
            ch_methods_description.collectFile(name: 'methods_description_mqc.yaml'),
            ch_multiqc_logo.collect().ifEmpty([]),
-            ch_fail_trimming_multiqc.ifEmpty([]),
-            ch_fail_mapping_multiqc.ifEmpty([]),
-            ch_fail_strand_multiqc.ifEmpty([]),
-            FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.fastqc_zip.collect{it[1]}.ifEmpty([]),
-            FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.trim_zip.collect{it[1]}.ifEmpty([]),
-            FASTQ_FASTQC_UMITOOLS_TRIMGALORE.out.trim_log.collect{it[1]}.ifEmpty([]),
+            ch_fail_trimming_multiqc.collectFile(name: 'fail_trimmed_samples_mqc.tsv').ifEmpty([]),
+            ch_fail_mapping_multiqc.collectFile(name: 'fail_mapped_samples_mqc.tsv').ifEmpty([]),
+            ch_fail_strand_multiqc.collectFile(name: 'fail_strand_check_mqc.tsv').ifEmpty([]),
+            ch_fastqc_raw_multiqc.collect{it[1]}.ifEmpty([]),
+            ch_fastqc_trim_multiqc.collect{it[1]}.ifEmpty([]),
+            ch_trim_log_multiqc.collect{it[1]}.ifEmpty([]),
            ch_sortmerna_multiqc.collect{it[1]}.ifEmpty([]),
            ch_star_multiqc.collect{it[1]}.ifEmpty([]),
            ch_hisat2_multiqc.collect{it[1]}.ifEmpty([]),
@@ -855,14 +889,14 @@ workflow RNASEQ {
 
 workflow.onComplete {
    if (params.email || params.email_on_fail) {
-        NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report, fail_percent_mapped)
+        NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report, pass_mapped_reads, pass_trimmed_reads, pass_strand_check)
    }
    if (params.hook_url) {
        NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log)
    }
-    NfcoreTemplate.summary(workflow, params, log, fail_percent_mapped, pass_percent_mapped)
+    NfcoreTemplate.summary(workflow, params, log, pass_mapped_reads, pass_trimmed_reads, pass_strand_check)
 }
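Where the old code emitted the failure tables through dedicated MULTIQC_TSV_FROM_LIST module invocations, the rewritten blocks above build each TSV in Groovy via WorkflowRnaseq.multiqcTsvFromList and persist it with collectFile. The helper itself is not part of this excerpt; a minimal implementation consistent with its call sites (a list of tab-joined row strings plus a header list, returning the whole TSV as one string) might look like the sketch below, which should not be taken as the pipeline's actual method:

    // Hypothetical minimal WorkflowRnaseq.multiqcTsvFromList, inferred from
    // the call sites above; the real implementation may differ.
    public static String multiqcTsvFromList(tsv_data, header) {
        def tsv_string = ""
        if (tsv_data.size() > 0) {
            tsv_string += "${header.join('\t')}\n" // tab-separated header row
            tsv_string += tsv_data.join('\n')      // rows are already tab-joined strings
        }
        return tsv_string
    }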
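Also worth unpacking is the channel-gating idiom introduced ahead of FASTQ_SUBSAMPLE_FQ_SALMON: combining the genome FASTA with ch_strand_fastq.auto_strand yields an empty channel whenever no sample needs strandedness auto-detection, so the Salmon index is never built in that case. A toy reduction of the pattern, with illustrative names:

    // Toy demonstration: combine() with an empty channel emits nothing,
    // so downstream processes never start while the gate is empty.
    workflow {
        ch_fasta = Channel.of('genome.fasta')
        ch_gate  = Channel.empty() // e.g. samples needing auto strand inference

        ch_fasta
            .combine(ch_gate)      // cartesian product; empty gate => empty channel
            .map { it.first() }    // keep the FASTA element, one copy per gate item
            .view()                // prints nothing here; swap ch_gate for
                                   // Channel.of('sample1') and it prints once
    }

/*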