From dfa5c49318755f0cb36a4c12f293b1e0e5f36e91 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates" <jfy133@gmail.com>
Date: Mon, 16 Dec 2024 10:31:11 +0100
Subject: [PATCH] Add support for prokka compliance mode

---
 CHANGELOG.md                                  |  1 +
 conf/modules.config                           | 23 +++++++++++--------
 conf/test_full.config                         |  4 ++++
 nextflow.config                               |  2 ++
 nextflow_schema.json                          | 19 +++++++++++----
 .../local/utils_nfcore_mag_pipeline/main.nf   |  7 +++++-
 6 files changed, 40 insertions(+), 16 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6949773e..30c9ddf1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#707](https://github.com/nf-core/mag/pull/707) - Make Bin QC a subworkflow (added by @dialvarezs)
 - [#707](https://github.com/nf-core/mag/pull/707) - Added CheckM2 as an alternative bin completeness and QC tool (added by @dialvarezs)
 - [#708](https://github.com/nf-core/mag/pull/708) - Added `--exclude_unbins_from_postbinning` parameter to exclude unbinned contigs from post-binning processes, speeding up Prokka in some cases (added by @dialvarezs)
+- [#732](https://github.com/nf-core/mag/pull/732) - Added support for Prokka's compliance mode with `--prokka_with_compliance --prokka_compliance_centre <xyz>` (reported by @audy and @Thomieh73, added by @jfy133)
 
 ### `Changed`
 
diff --git a/conf/modules.config b/conf/modules.config
index bcad2756..25a2b725 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -184,7 +184,7 @@ process {
             "--keep_percent ${params.longreads_keep_percent}",
             "--trim",
             "--length_weight ${params.longreads_length_weight}",
-            params.longreads_min_quality ? "--min_mean_q ${params.longreads_min_quality}" : '',
+            params.longreads_min_quality ? "--min_mean_q ${params.longreads_min_quality}" : ''
         ].join(' ').trim()
         publishDir = [
             path: { "${params.outdir}/QC_longreads/Filtlong" },
@@ -196,9 +196,9 @@ process {
     }
 
     withName: NANOQ {
-        ext.args = [
+        ext.args   = [
             "--min-len ${params.longreads_min_length}",
-            params.longreads_min_quality ? "--min-qual ${params.longreads_min_quality}": '',
+            params.longreads_min_quality ? "--min-qual ${params.longreads_min_quality}" : '',
             "-vv"
         ].join(' ').trim()
         publishDir = [
@@ -221,11 +221,13 @@ process {
         publishDir = [
             [
                 path: { "${params.outdir}/QC_longreads/NanoLyse" },
-                mode: params.publish_dir_mode, pattern: "*.log"
+                mode: params.publish_dir_mode,
+                pattern: "*.log"
             ],
             [
                 path: { "${params.outdir}/QC_longreads/NanoLyse" },
-                mode: params.publish_dir_mode, pattern: "*_nanolyse.fastq.gz",
+                mode: params.publish_dir_mode,
+                pattern: "*_nanolyse.fastq.gz",
                 enabled: params.save_lambdaremoved_reads
             ]
         ]
@@ -234,8 +236,8 @@ process {
 
     withName: CHOPPER {
         ext.args2  = [
-            params.longreads_min_quality ? "--quality ${params.longreads_min_quality}": '',
-            params.longreads_min_length ? "--minlength ${params.longreads_min_length}": ''
+            params.longreads_min_quality ? "--quality ${params.longreads_min_quality}" : '',
+            params.longreads_min_length ? "--minlength ${params.longreads_min_length}" : ''
         ].join(' ').trim()
         publishDir = [
             [
@@ -250,7 +252,7 @@ process {
                 enabled: params.save_lambdaremoved_reads || params.save_filtered_longreads
             ]
         ]
-        ext.prefix =  { "${meta.id}_run${meta.run}_chopper" }
+        ext.prefix = { "${meta.id}_run${meta.run}_chopper" }
     }
 
     withName: NANOPLOT_RAW {
@@ -434,7 +436,8 @@ process {
     withName: CHECKM2_DATABASEDOWNLOAD {
         publishDir = [
             path: { "${params.outdir}/GenomeBinning/QC/CheckM2/checkm2_downloads" },
-            mode: params.publish_dir_mode, overwrite: false,
+            mode: params.publish_dir_mode,
+            overwrite: false,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
             enabled: params.save_checkm2_data
         ]
@@ -509,7 +512,7 @@ process {
     }
 
     withName: PROKKA {
-        ext.args   = "--metagenome"
+        ext.args   = { params.prokka_with_compliance ? "--metagenome --compliant --centre ${params.prokka_compliance_centre}" : "--metagenome" }
         publishDir = [path: { "${params.outdir}/Annotation/Prokka/${meta.assembler}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
     }
 
diff --git a/conf/test_full.config b/conf/test_full.config
index ed5923d0..f1b92f48 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -42,4 +42,8 @@ params {
 
     // Skip CONCOCT due to timeout issues
     skip_concoct               = true
+
+    // Set Prokka compliance mode to allow metaSPAdes bins to be annotated
+    prokka_with_compliance     = true
+    prokka_compliance_centre   = "nfcore"
 }
diff --git a/nextflow.config b/nextflow.config
index cdb2d9f0..7215cd7a 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -53,6 +53,8 @@ params {
     min_length_unbinned_contigs          = 1000000
     max_unbinned_contigs                 = 100
     skip_prokka                          = false
+    prokka_with_compliance               = false
+    prokka_compliance_centre             = null
 
     // assembly options
     coassemble_group                     = false
diff --git a/nextflow_schema.json b/nextflow_schema.json
index dedb286b..200d0c46 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -503,7 +503,7 @@
                 },
                 "gtdbtk_min_completeness": {
                     "type": "number",
-                    "default": 50.0,
+                    "default": 50,
                     "description": "Min. bin completeness (in %) required to apply GTDB-tk classification.",
                     "help_text": "Completeness assessed with BUSCO analysis (100% - %Missing). Must be greater than 0 (min. 0.01) to avoid GTDB-tk errors. If too low, GTDB-tk classification results can be impaired due to not enough marker genes!",
                     "minimum": 0.01,
@@ -511,7 +511,7 @@
                 },
                 "gtdbtk_max_contamination": {
                     "type": "number",
-                    "default": 10.0,
+                    "default": 10,
                     "description": "Max. bin contamination (in %) allowed to apply GTDB-tk classification.",
                     "help_text": "Contamination approximated based on BUSCO analysis (%Complete and duplicated). If too high, GTDB-tk classification results can be impaired due to contamination!",
                     "minimum": 0,
@@ -519,7 +519,7 @@
                 },
                 "gtdbtk_min_perc_aa": {
                     "type": "number",
-                    "default": 10.0,
+                    "default": 10,
                     "description": "Min. fraction of AA (in %) in the MSA for bins to be kept.",
                     "minimum": 0,
                     "maximum": 100
@@ -597,6 +597,16 @@
                     "type": "boolean",
                     "description": "Skip Prodigal gene prediction"
                 },
+                "prokka_with_compliance": {
+                    "type": "boolean",
+                    "help_text": "Sometimes Prokka will complain that your contig names are too long and fail.\n\nThis particularly happens with metaSPAdes assemblies.\n\nYou can turn on this flag which will tell Prokka to truncate the contig names for you.\nHowever this also requires you to specify a sequencing centre name (specified with `--prokka_compliance_centre`).\n\n:::warning\nTruncating contig names may make it harder to associate contig annotations with their original contigs!\n:::\n",
+                    "description": "Turn on Prokka compliance mode for truncating contig names for NCBI/ENA compatibility."
+                },
+                "prokka_compliance_centre": {
+                    "type": "string",
+                    "help_text": "Specify the sequencing centre name for making NCBI Genbank/ENA compatible annotation files (required when specifying `--prokka_with_compliance`).",
+                    "description": "Specify sequencing centre name required for Prokka's compliance mode."
+                },
                 "skip_prokka": {
                     "type": "boolean",
                     "description": "Skip Prokka genome annotation."
@@ -718,8 +728,7 @@
                 "exclude_unbins_from_postbinning": {
                     "type": "boolean",
                     "description": "Exclude unbinned contigs in the post-binning steps (bin QC, taxonomic classification, and annotation steps).",
-                    "help": "If you're not interested in assemby results that are not considered 'genome level', excluding unbinned contigs can greatly speed up downstream steps such as Prokka, that can be quite slow and spin up many tasks.",
-                    "default": false
+                    "help": "If you're not interested in assembly results that are not considered 'genome level', excluding unbinned contigs can greatly speed up downstream steps such as Prokka, that can be quite slow and spin up many tasks."
                 }
             }
         },
diff --git a/subworkflows/local/utils_nfcore_mag_pipeline/main.nf b/subworkflows/local/utils_nfcore_mag_pipeline/main.nf
index 3e3aa9e9..a3de55e9 100644
--- a/subworkflows/local/utils_nfcore_mag_pipeline/main.nf
+++ b/subworkflows/local/utils_nfcore_mag_pipeline/main.nf
@@ -309,13 +309,18 @@ def validateInputParameters(hybrid) {
         error('[nf-core/mag] ERROR: Invalid parameter combination: parameter --save_cat_db specified, but not --cat_db_generate! Note also that the parameter --save_cat_db does not work in combination with --cat_db.')
     }
 
-    // Chech MetaEuk db paramaters
+    // Check MetaEuk db parameters
     if (params.metaeuk_mmseqs_db && params.metaeuk_db) {
         error('[nf-core/mag] ERROR: Invalid parameter combination: both --metaeuk_mmseqs_db and --metaeuk_db are specified! Please specify either --metaeuk_mmseqs_db or --metaeuk_db.')
     }
     if (params.save_mmseqs_db && !params.metaeuk_mmseqs_db) {
         error('[nf-core/mag] ERROR: Invalid parameter combination: --save_mmseqs_db supplied but no database has been requested for download with --metaeuk_mmseqs_db!')
     }
+
+    // Check Prokka parameters
+    if (params.prokka_with_compliance && !params.prokka_compliance_centre) {
+        error('[nf-core/mag] ERROR: Invalid parameter combination: running PROKKA with compliance mode requires a centre name specified with `--prokka_compliance_centre <XYZ>`!')
+    }
 }
 
 //