Strip original contig splitting approach and preanno orf attempt3 #381

Merged · 8 commits · Jun 3, 2024
Changes from 3 commits
32 changes: 16 additions & 16 deletions .github/workflows/ci.yml
@@ -17,7 +17,7 @@ concurrency:

jobs:
test:
- name: Run pipeline with test data (AMP and ARG workflows)
+ name: Run pipeline with test data (AMP and ARG)
# Only run on push if this is the nf-core dev branch (merged PRs)
if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/funcscan') }}"
runs-on: ubuntu-latest
@@ -27,9 +27,9 @@ jobs:
- "23.04.0"
- "latest-everything"
parameters:
- "--annotation_tool prodigal"
- "--annotation_tool prokka"
- "--annotation_tool bakta --annotation_bakta_db_downloadtype light --arg_skip_deeparg --arg_skip_amrfinderplus" # Skip deeparg and amrfinderplus due to otherwise running out of space on GitHub Actions
- "-profile docker,test_preannotated --annotation_tool prodigal"
- "-profile docker,test --annotation_tool prokka"
- "-profile docker,test --annotation_tool bakta --annotation_bakta_db_downloadtype light --arg_skip_deeparg --arg_skip_amrfinderplus" # Skip deeparg and amrfinderplus due to otherwise running out of space on GitHub Actions

steps:
- name: Check out pipeline code
@@ -43,12 +43,12 @@ jobs:
- name: Disk space cleanup
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1

- - name: Run pipeline with test data (AMP and ARG workflows)
+ - name: Run pipeline with test data (AMP/ARG workflows)
run: |
- nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results ${{ matrix.parameters }}
+ nextflow run ${GITHUB_WORKSPACE} ${{ matrix.parameters }} --outdir ./results

test_bgc:
- name: Run pipeline with test data (BGC workflow)
+ name: Run pipeline with test data (BGC)
# Only run on push if this is the nf-core dev branch (merged PRs)
if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/funcscan') }}"
runs-on: ubuntu-latest
@@ -58,9 +58,9 @@ jobs:
- "23.04.0"
- "latest-everything"
parameters:
- "--annotation_tool prodigal"
- "--annotation_tool prokka"
- "--annotation_tool bakta --annotation_bakta_db_downloadtype light"
- "-profile docker,test_preannotated_bgc --annotation_tool prodigal"
- "-profile docker,test_bgc --annotation_tool prokka"
- "-profile docker,test_bgc --annotation_tool bakta --annotation_bakta_db_downloadtype light"

steps:
- name: Check out pipeline code
@@ -76,10 +76,10 @@ jobs:

- name: Run pipeline with test data (BGC workflow)
run: |
- nextflow run ${GITHUB_WORKSPACE} -profile test_bgc,docker --outdir ./results ${{ matrix.parameters }} --bgc_skip_deepbgc
+ nextflow run ${GITHUB_WORKSPACE} ${{ matrix.parameters }} --outdir ./results --bgc_skip_deepbgc

test_taxonomy:
- name: Run pipeline with test data (AMP, ARG and BGC taxonomy workflows)
+ name: Run pipeline with test data (AMP, ARG and BGC with taxonomy)
# Only run on push if this is the nf-core dev branch (merged PRs)
if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/funcscan') }}"
runs-on: ubuntu-latest
@@ -89,9 +89,9 @@ jobs:
- "23.04.0"
- "latest-everything"
parameters:
- "--annotation_tool prodigal"
- "--annotation_tool prokka"
- "--annotation_tool bakta --annotation_bakta_db_downloadtype light"
- "-profile docker,test_taxonomy --annotation_tool prodigal"
- "-profile docker,test_taxonomy --annotation_tool prokka"
- "-profile docker,test_taxonomy --annotation_tool bakta --annotation_bakta_db_downloadtype light"

steps:
- name: Check out pipeline code
@@ -107,4 +107,4 @@ jobs:

- name: Run pipeline with test data (AMP, ARG and BGC taxonomy workflows)
run: |
- nextflow run ${GITHUB_WORKSPACE} -profile test_taxonomy,docker --outdir ./results ${{ matrix.parameters }}
+ nextflow run ${GITHUB_WORKSPACE} ${{ matrix.parameters }} --outdir ./results
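The common thread in all three jobs: the test profile now travels with each matrix entry instead of being hard-coded into the run command, which is what lets the new pre-annotated profiles slot into the same matrix. As a sketch, the first AMP/ARG matrix entry now expands to roughly:

    nextflow run ${GITHUB_WORKSPACE} -profile docker,test_preannotated --annotation_tool prodigal --outdir ./results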
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#343](https://github.com/nf-core/funcscan/pull/343) Added contig taxonomic classification using [MMseqs2](https://github.com/soedinglab/MMseqs2/). (by @darcy220606)
- [#358](https://github.com/nf-core/funcscan/pull/358) Improved RGI databases handling, users can supply their own CARD now. (by @jasmezz)
- [#375](https://github.com/nf-core/funcscan/pull/375) Merged pipeline template of nf-core/tools version 2.14.1 (by @jfy133)
+ - [#340](https://github.com/nf-core/funcscan/pull/340) Added support for supplying pre-annotated sequences to the pipeline. (by @jfy133, @jasmezz)

### `Fixed`

8 changes: 8 additions & 0 deletions assets/multiqc_config.yml
@@ -10,6 +10,14 @@ report_section_order:
"nf-core-funcscan-summary":
order: -1002

+ run_modules:
+ - prokka
+ - custom_content
+
+ table_columns_visible:
+ Prokka:
+ organism: False
+
export_plots: true

disable_version_detection: true
7 changes: 4 additions & 3 deletions assets/samplesheet.csv
@@ -1,3 +1,4 @@
- sample,fasta
- sample_1,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_1.fasta.gz
- sample_2,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_2.fasta.gz
+ sample,fasta,protein,gbk
+ sample_1,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_1.fasta.gz,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_1.faa,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_1.gbk
+ sample_2,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_2.fasta.gz,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_2.faa.gz,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_2.gbk.gz
+ sample_3,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs.fasta
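The extended samplesheet adds optional protein and gbk columns alongside fasta; sample_3 shows that fasta-only rows still work. A hedged sketch of invoking the pipeline with such a pre-annotated sheet (flags assumed, not prescribed; annotation is presumably skipped for rows that already carry protein and gbk files):

    nextflow run nf-core/funcscan \
        -profile docker \
        --input samplesheet_preannotated.csv \
        --run_amp_screening \
        --outdir ./results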
22 changes: 20 additions & 2 deletions assets/schema_input.json
@@ -18,9 +18,27 @@
"type": "string",
"format": "file-path",
"exists": true,
"pattern": "^\\S+\\.(fasta|fas|fa|fna)(\\.gz)?$",
"errorMessage": "Fasta file for reads must be provided, cannot contain spaces and must have extension '.fasta', '.fas', '.fa' or '.fna' (any of these can be optionally compressed as '.gz')",
"pattern": "^\\S+\\.(fasta|fas|fna|fa)(\\.gz)?$",
"errorMessage": "Fasta file for reads must be provided, cannot contain spaces and must have extension '.fa.gz', '.fna.gz' or '.fasta.gz'",
"unique": true
},
"protein": {
"type": "string",
"format": "file-path",
"exists": true,
"pattern": "^\\S+\\.(faa)(\\.gz)?$",
"errorMessage": "Input file for peptide annotations has incorrect file format. File must end in .fasta, .faa",
"unique": true,
"dependentRequired": ["gbk"]
},
"gbk": {
"type": "string",
"format": "file-path",
"exists": true,
"pattern": "^\\S+\\.(gbk|gbff)(\\.gz)?$",
"errorMessage": "Input file for feature annotations has incorrect file format. File must end in .gbk or .gbff",
"unique": true,
"dependentRequired": ["protein"]
}
},
"required": ["sample", "fasta"]
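The two dependentRequired constraints make protein and gbk mutually dependent: a row supplying one without the other should fail samplesheet validation. As an illustration (hypothetical file names):

    sample,fasta,protein,gbk
    ok_1,contigs_1.fasta.gz,,
    ok_2,contigs_2.fasta.gz,orfs_2.faa,orfs_2.gbk
    bad_1,contigs_3.fasta.gz,orfs_3.faa,

The first two rows pass (fasta-only, or fully pre-annotated); the last is rejected because protein is present without gbk.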
5 changes: 0 additions & 5 deletions conf/base.config
@@ -79,11 +79,6 @@ process {
time = { check_max( 8.h * task.attempt, 'time' ) }
}

- withName: PRODIGAL_GFF {
- memory = { check_max( 2.GB * task.attempt, 'memory' ) }
- cpus = 1
- }
-
withName: PRODIGAL_GBK {
memory = { check_max( 2.GB * task.attempt, 'memory' ) }
cpus = 1
54 changes: 8 additions & 46 deletions conf/modules.config
@@ -33,32 +33,6 @@ process {
]
}

- withName: SEQKIT_SEQ_LONG {
- ext.prefix = { "${meta.id}_long" }
- publishDir = [
- path: { "${params.outdir}/qc/seqkit/" },
- mode: params.publish_dir_mode,
- enabled: params.contig_qc_savesplitfastas,
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
- ]
- ext.args = [
- "--min-len ${params.contig_qc_lengththreshold}"
- ].join(' ').trim()
- }
-
- withName: SEQKIT_SEQ_SHORT {
- ext.prefix = { "${meta.id}_short" }
- publishDir = [
- path: { "${params.outdir}/qc/seqkit/" },
- mode: params.publish_dir_mode,
- enabled: params.contig_qc_savesplitfastas,
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
- ]
- ext.args = [
- "--max-len ${params.contig_qc_lengththreshold - 1}"
- ].join(' ').trim()
- }

withName: MMSEQS_DATABASES {
publishDir = [
path: { "${params.outdir}/databases/mmseqs/" },
@@ -110,6 +84,7 @@
}

withName: PROKKA {
ext.prefix = { "${meta.id}_prokka" } // to prevent pigz symlink problems of input files if already uncompressed during post-annotation gzipping
publishDir = [
path: { "${params.outdir}/annotation/prokka/" },
mode: params.publish_dir_mode,
@@ -128,7 +103,7 @@
params.annotation_prokka_rawproduct ? '--rawproduct' : '',
params.annotation_prokka_rnammer ? '--rnammer' : '',
params.annotation_prokka_compliant ? '--compliant' : '',
- params.annotation_prokka_addgenes ? '--addgenes' : ''
+ params.annotation_prokka_addgenes ? '--addgenes' : '',
].join(' ').trim()
}

@@ -145,6 +120,7 @@
}

withName: BAKTA_BAKTA {
ext.prefix = { "${meta.id}_bakta" } // to prevent pigz symlink problems of input files if already uncompressed during post-annotation gzipping
publishDir = [
path: { "${params.outdir}/annotation/bakta/${meta.id}" },
mode: params.publish_dir_mode,
@@ -174,28 +150,12 @@
].join(' ').trim()
}

- withName: PRODIGAL_GFF {
- publishDir = [
- path: { "${params.outdir}/annotation/prodigal/${meta.id}" },
- mode: params.publish_dir_mode,
- enabled: params.save_annotations,
- pattern: "*.{faa,fna,gff}.gz",
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
- ]
- ext.args = [
- params.annotation_prodigal_singlemode ? "-p single" : "-p meta",
- params.annotation_prodigal_closed ? "-c" : "",
- params.annotation_prodigal_forcenonsd ? "-n" : "",
- "-g ${params.annotation_prodigal_transtable}"
- ].join(' ').trim()
- }
-
- withName: PRODIGAL_GBK {
+ withName: PRODIGAL {
publishDir = [
path: { "${params.outdir}/annotation/prodigal/${meta.id}" },
mode: params.publish_dir_mode,
enabled: params.save_annotations,
pattern: "*.gbk.gz",
pattern: "*.{faa,fna,gbk,faa.gz,faa.gz,fna.gz,gbk.gz}",
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
ext.args = [
@@ -207,11 +167,12 @@ process {
}

withName: PYRODIGAL {
ext.prefix = { "${meta.id}_pyrodigal" } // to prevent pigz symlink problems of input files if already uncompressed during post-annotation gzipping
publishDir = [
path: { "${params.outdir}/annotation/pyrodigal/${meta.id}" },
mode: params.publish_dir_mode,
enabled: params.save_annotations,
pattern: "*.{faa,fna,gff,score}.gz",
pattern: "*.{faa,fna,gbk,score}.gz",
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
ext.args = [
@@ -287,6 +248,7 @@
}

withName: FARGENE {
tag = {"${meta.id}|${hmm_model}"}
publishDir = [
[
path: { "${params.outdir}/arg/fargene/${meta.id}" },
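The three ext.prefix additions above all guard against the same failure mode: an uncompressed input file is staged into the process work directory as a symlink, and if an annotation output ends up with the same basename, the post-annotation pigz step collides with that symlink. A minimal sketch of the problem and the fix, with hypothetical file names:

    # input staged uncompressed into the work dir as a symlink:
    ln -s /inputs/sample1.fna .
    # without a prefix, an annotation output could also be named sample1.fna,
    # so the post-annotation gzip hits the input symlink and fails:
    pigz sample1.fna
    # with ext.prefix = "${meta.id}_prokka", outputs are sample1_prokka.* and never shadow staged inputs:
    pigz sample1_prokka.fna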
2 changes: 1 addition & 1 deletion conf/test.config
@@ -23,7 +23,7 @@ params {
input = params.pipelines_testdata_base_path + 'funcscan/samplesheet_reduced.csv'
amp_hmmsearch_models = params.pipelines_testdata_base_path + 'funcscan/hmms/mybacteriocin.hmm'

- annotation_tool = 'prodigal'
+ annotation_tool = 'pyrodigal'

run_arg_screening = true
arg_fargene_hmmmodel = 'class_a,class_b_1_2'
2 changes: 1 addition & 1 deletion conf/test_bgc.config
@@ -23,7 +23,7 @@ params {
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_reduced.csv'
bgc_hmmsearch_models = 'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm'

- annotation_tool = 'prodigal'
+ annotation_tool = 'pyrodigal'

run_arg_screening = false
run_amp_screening = false
24 changes: 22 additions & 2 deletions conf/test_nothing.config
@@ -7,7 +7,7 @@
Although in this case we turn everything off

Use as follows:
- nextflow run nf-core/funcscan -profile test,<docker/singularity> --outdir <OUTDIR>
+ nextflow run nf-core/funcscan -profile test_nothing,<docker/singularity> --outdir <OUTDIR>

----------------------------------------------------------------------------------------
*/
@@ -24,10 +24,30 @@ params {
// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_reduced.csv'
amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm'
+ bgc_hmmsearch_models = 'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm'

- annotation_tool = 'prodigal'
+ annotation_tool = 'pyrodigal'

run_arg_screening = false
run_amp_screening = false
run_bgc_screening = false

+ arg_fargene_hmmmodel = 'class_a,class_b_1_2'
+
+ amp_skip_amplify = true
+ amp_skip_macrel = true
+ amp_skip_ampir = true
+ amp_skip_hmmsearch = true
+
+ arg_skip_deeparg = true
+ arg_skip_fargene = true
+ arg_skip_rgi = true
+ arg_skip_amrfinderplus = true
+ arg_skip_abricate = true
+
+ bgc_skip_antismash = true
+ bgc_skip_deepbgc = true
+ bgc_skip_gecco = true
+ bgc_skip_hmmsearch = true
}
32 changes: 32 additions & 0 deletions conf/test_preannotated.config
@@ -0,0 +1,32 @@
+ /*
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Nextflow config file for running minimal tests
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Defines input files and everything required to run a fast and simple pipeline test.
+
+ Use as follows:
+ nextflow run nf-core/funcscan -profile test_preannotated,<docker/singularity> --outdir <OUTDIR>
+
+ ----------------------------------------------------------------------------------------
+ */
+
+ params {
+ config_profile_name = 'Test profile - preannotated input'
+ config_profile_description = 'Minimal test dataset to check pipeline function'
+
+ // Limit resources so that this can run on GitHub Actions
+ max_cpus = 2
+ max_memory = '6.GB'
+ max_time = '6.h'
+
+ // Input data
+ input = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_preannotated.csv'
+ amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm'
+
+ annotation_tool = 'pyrodigal'
+
+ run_arg_screening = true
+ arg_fargene_hmmmodel = 'class_a,class_b_1_2'
+
+ run_amp_screening = true
+ }
31 changes: 31 additions & 0 deletions conf/test_preannotated_bgc.config
@@ -0,0 +1,31 @@
+ /*
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Nextflow config file for running minimal tests
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Defines input files and everything required to run a fast and simple pipeline test.
+
+ Use as follows:
+ nextflow run nf-core/funcscan -profile test_preannotated_bgc,<docker/singularity> --outdir <OUTDIR>
+
+ ----------------------------------------------------------------------------------------
+ */
+
+ params {
+ config_profile_name = 'BGC test profile - preannotated input BGC'
+ config_profile_description = 'Minimal test dataset to check BGC workflow function'
+
+ // Limit resources so that this can run on GitHub Actions
+ max_cpus = 2
+ max_memory = '6.GB'
+ max_time = '6.h'
+
+ // Input data
+ input = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_preannotated.csv'
+ bgc_hmmsearch_models = 'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm'
+
+ annotation_tool = 'pyrodigal'
+
+ run_arg_screening = false
+ run_amp_screening = false
+ run_bgc_screening = true
+ }
2 changes: 1 addition & 1 deletion conf/test_taxonomy.config
@@ -25,7 +25,7 @@ params {
amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm'

run_taxa_classification = true
- annotation_tool = 'prodigal'
+ annotation_tool = 'pyrodigal'

run_arg_screening = true
arg_skip_deeparg = true