From 004f56db2819441c05e52d8bcbc0275862b2e6ed Mon Sep 17 00:00:00 2001
From: maxulysse <max.u.garcia@gmail.com>
Date: Mon, 4 Mar 2024 18:55:31 +0100
Subject: [PATCH 1/5] update sortmerna functionalities

---
 main.nf                                       |  8 ++++-
 modules/nf-core/sortmerna/nextflow.config     |  2 +-
 nextflow_schema.json                          |  8 +++++
 subworkflows/local/prepare_genome/main.nf     | 35 +++++++++++++++++++
 .../local/prepare_genome/nextflow.config      | 13 +++++++
 workflows/rnaseq/main.nf                      | 23 ++++++++++--
 6 files changed, 85 insertions(+), 4 deletions(-)

diff --git a/main.nf b/main.nf
index 59001a4f1..1b98f2341 100755
--- a/main.nf
+++ b/main.nf
@@ -37,6 +37,7 @@ params.gtf              = getGenomeAttribute('gtf')
 params.gff              = getGenomeAttribute('gff')
 params.gene_bed         = getGenomeAttribute('bed12')
 params.bbsplit_index    = getGenomeAttribute('bbsplit')
+params.sortmerna_index  = getGenomeAttribute('sortmerna')
 params.star_index       = getGenomeAttribute('star')
 params.hisat2_index     = getGenomeAttribute('hisat2')
 params.rsem_index       = getGenomeAttribute('rsem')
@@ -70,18 +71,21 @@ workflow NFCORE_RNASEQ {
         params.gene_bed,
         params.splicesites,
         params.bbsplit_fasta_list,
+        params.ribo_database_manifest,
         params.star_index,
         params.rsem_index,
         params.salmon_index,
         params.kallisto_index,
         params.hisat2_index,
         params.bbsplit_index,
+        params.sortmerna_index,
         params.gencode,
         params.featurecounts_group_type,
         params.aligner,
         params.pseudo_aligner,
         params.skip_gtf_filter,
         params.skip_bbsplit,
+        !params.remove_ribo_rna,
         params.skip_alignment,
         params.skip_pseudo_alignment
     )
@@ -114,7 +118,9 @@ workflow NFCORE_RNASEQ {
         PREPARE_GENOME.out.salmon_index,
         PREPARE_GENOME.out.kallisto_index,
         PREPARE_GENOME.out.bbsplit_index,
-        PREPARE_GENOME.out.splicesites
+        PREPARE_GENOME.out.sortmerna_index,
+        PREPARE_GENOME.out.splicesites,
+        !params.sortmerna_index && params.remove_ribo_rna // make_sortmerna_index: build an index only when rRNA removal is on and no pre-built index was supplied
     )
     ch_versions = ch_versions.mix(RNASEQ.out.versions)
 
diff --git a/modules/nf-core/sortmerna/nextflow.config b/modules/nf-core/sortmerna/nextflow.config
index 8771660ce..8322435dc 100644
--- a/modules/nf-core/sortmerna/nextflow.config
+++ b/modules/nf-core/sortmerna/nextflow.config
@@ -1,7 +1,7 @@
 if (params.remove_ribo_rna) {
     process {
         withName: 'SORTMERNA' {
-            ext.args   = '--num_alignments 1 -v'
+            ext.args = '--num_alignments 1 -v --index 0'
             publishDir = [
                 [
                     path: { "${params.outdir}/sortmerna" },
diff --git a/nextflow_schema.json b/nextflow_schema.json
index e5195cade..5a93d7060 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -267,6 +267,14 @@
                     "description": "Path to directory or tar.gz archive for pre-built BBSplit index.",
                     "help_text": "The BBSplit index will have to be built at least once with this pipeline (see `--save_reference` to save index). It can then be provided via `--bbsplit_index` for future runs."
                 },
+                "sortmerna_index": {
+                    "type": "string",
+                    "format": "path",
+                    "exists": true,
+                    "fa_icon": "fas fa-bezier-curve",
+                    "description": "Path to directory or tar.gz archive for pre-built SortMeRNA index.",
+                    "help_text": "The SortMeRNA index will have to be built at least once with this pipeline (see `--save_reference` to save index). It can then be provided via `--sortmerna_index` for future runs."
+                },
                 "remove_ribo_rna": {
                     "type": "boolean",
                     "fa_icon": "fas fa-trash-alt",
diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf
index 58a9b293c..f8fb8f2d1 100644
--- a/subworkflows/local/prepare_genome/main.nf
+++ b/subworkflows/local/prepare_genome/main.nf
@@ -10,6 +10,7 @@ include { GUNZIP as GUNZIP_TRANSCRIPT_FASTA } from '../../../modules/nf-core/gun
 include { GUNZIP as GUNZIP_ADDITIONAL_FASTA } from '../../../modules/nf-core/gunzip'
 
 include { UNTAR as UNTAR_BBSPLIT_INDEX      } from '../../../modules/nf-core/untar'
+include { UNTAR as UNTAR_SORTMERNA_INDEX    } from '../../../modules/nf-core/untar'
 include { UNTAR as UNTAR_STAR_INDEX         } from '../../../modules/nf-core/untar'
 include { UNTAR as UNTAR_RSEM_INDEX         } from '../../../modules/nf-core/untar'
 include { UNTAR as UNTAR_HISAT2_INDEX       } from '../../../modules/nf-core/untar'
@@ -20,6 +21,7 @@ include { CUSTOM_CATADDITIONALFASTA         } from '../../../modules/nf-core/cus
 include { CUSTOM_GETCHROMSIZES              } from '../../../modules/nf-core/custom/getchromsizes'
 include { GFFREAD                           } from '../../../modules/nf-core/gffread'
 include { BBMAP_BBSPLIT                     } from '../../../modules/nf-core/bbmap/bbsplit'
+include { SORTMERNA as SORTMERNA_INDEX      } from '../../../modules/nf-core/sortmerna'
 include { STAR_GENOMEGENERATE               } from '../../../modules/nf-core/star/genomegenerate'
 include { HISAT2_EXTRACTSPLICESITES         } from '../../../modules/nf-core/hisat2/extractsplicesites'
 include { HISAT2_BUILD                      } from '../../../modules/nf-core/hisat2/build'
@@ -43,18 +45,21 @@ workflow PREPARE_GENOME {
     gene_bed                 //      file: /path/to/gene.bed
     splicesites              //      file: /path/to/splicesites.txt
     bbsplit_fasta_list       //      file: /path/to/bbsplit_fasta_list.txt
+    sortmerna_fasta_list     //      file: /path/to/sortmerna_fasta_list.txt
     star_index               // directory: /path/to/star/index/
     rsem_index               // directory: /path/to/rsem/index/
     salmon_index             // directory: /path/to/salmon/index/
     kallisto_index           // directory: /path/to/kallisto/index/
     hisat2_index             // directory: /path/to/hisat2/index/
     bbsplit_index            // directory: /path/to/rsem/index/
+    sortmerna_index          // directory: /path/to/sortmerna/index/
     gencode                  //   boolean: whether the genome is from GENCODE
     featurecounts_group_type //    string: The attribute type used to group feature types in the GTF file when generating the biotype plot with featureCounts
     aligner                  //    string: Specifies the alignment algorithm to use - available options are 'star_salmon', 'star_rsem' and 'hisat2'
     pseudo_aligner           //    string: Specifies the pseudo aligner to use - available options are 'salmon'. Runs in addition to '--aligner'
     skip_gtf_filter          //   boolean: Skip filtering of GTF for valid scaffolds and/ or transcript IDs
     skip_bbsplit             //   boolean: Skip BBSplit for removal of non-reference genome reads
+    skip_sortmerna           //   boolean: Skip sortmerna for removal of non-reference genome reads
     skip_alignment           //   boolean: Skip all of the alignment-based processes within the pipeline
     skip_pseudo_alignment    //   boolean: Skip all of the pseudoalignment-based processes within the pipeline
 
@@ -188,6 +193,7 @@ workflow PREPARE_GENOME {
     //
     def prepare_tool_indices = []
     if (!skip_bbsplit) { prepare_tool_indices << 'bbsplit' }
+    if (!skip_sortmerna) { prepare_tool_indices << 'sortmerna' }
     if (!skip_alignment) { prepare_tool_indices << aligner }
     if (!skip_pseudo_alignment && pseudo_aligner) { prepare_tool_indices << pseudo_aligner }
 
@@ -218,6 +224,34 @@ workflow PREPARE_GENOME {
         }
     }
 
+    //
+    // Uncompress sortmerna index or generate from scratch if required
+    //
+    ch_sortmerna_index = Channel.empty()
+    if ('sortmerna' in prepare_tool_indices) {
+        if (sortmerna_index) {
+            if (sortmerna_index.endsWith('.tar.gz')) {
+                ch_sortmerna_index = UNTAR_SORTMERNA_INDEX ( [ [:], sortmerna_index ] ).untar.map { it[1] }
+                ch_versions      = ch_versions.mix(UNTAR_SORTMERNA_INDEX.out.versions)
+            } else {
+                ch_sortmerna_index = Channel.value(file(sortmerna_index))
+            }
+        } else {
+            ch_sortmerna_fastas = Channel.from(file(sortmerna_fasta_list).readLines())
+                .map { row -> file(row, checkIfExists: true) }
+                .collect()
+                .map{ ['rrna_refs', it] }
+
+            SORTMERNA_INDEX (
+                Channel.of([[],[]]),
+                ch_sortmerna_fastas,
+                Channel.of([[],[]])
+            )
+            ch_sortmerna_index = SORTMERNA_INDEX.out.index.first()
+            ch_versions = ch_versions.mix(SORTMERNA_INDEX.out.versions)
+        }
+    }
+
     //
     // Uncompress STAR index or generate from scratch if required
     //
@@ -336,6 +370,7 @@ workflow PREPARE_GENOME {
     chrom_sizes      = ch_chrom_sizes            // channel: path(genome.sizes)
     splicesites      = ch_splicesites            // channel: path(genome.splicesites.txt)
     bbsplit_index    = ch_bbsplit_index          // channel: path(bbsplit/index/)
+    sortmerna_index  = ch_sortmerna_index        // channel: path(sortmerna/index/)
     star_index       = ch_star_index             // channel: path(star/index/)
     rsem_index       = ch_rsem_index             // channel: path(rsem/index/)
     hisat2_index     = ch_hisat2_index           // channel: path(hisat2/index/)
diff --git a/subworkflows/local/prepare_genome/nextflow.config b/subworkflows/local/prepare_genome/nextflow.config
index e02648197..cb78cb9e3 100644
--- a/subworkflows/local/prepare_genome/nextflow.config
+++ b/subworkflows/local/prepare_genome/nextflow.config
@@ -112,3 +112,16 @@ if (!params.skip_bbsplit && params.bbsplit_fasta_list) {
         }
     }
 }
+
+if (params.remove_ribo_rna && params.ribo_database_manifest) {
+    process {
+        withName: 'SORTMERNA_INDEX' {
+            ext.args   = '--index 1'
+            publishDir = [
+                path: { params.save_reference ? "${params.outdir}/genome/sortmerna" : params.outdir },
+                mode: params.publish_dir_mode,
+                saveAs: { filename -> filename.equals('versions.yml') ? null : params.save_reference ? filename : null }
+            ]
+        }
+    }
+}
diff --git a/workflows/rnaseq/main.nf b/workflows/rnaseq/main.nf
index 5c7c2b964..0a8f2f9e9 100755
--- a/workflows/rnaseq/main.nf
+++ b/workflows/rnaseq/main.nf
@@ -44,6 +44,7 @@ include { SAMTOOLS_SORT                                        } from '../../mod
 include { PRESEQ_LCEXTRAP                                      } from '../../modules/nf-core/preseq/lcextrap'
 include { QUALIMAP_RNASEQ                                      } from '../../modules/nf-core/qualimap/rnaseq'
 include { SORTMERNA                                            } from '../../modules/nf-core/sortmerna'
+include { SORTMERNA as SORTMERNA_INDEX                         } from '../../../modules/nf-core/sortmerna/main'
 include { STRINGTIE_STRINGTIE                                  } from '../../modules/nf-core/stringtie/stringtie'
 include { SUBREAD_FEATURECOUNTS                                } from '../../modules/nf-core/subread/featurecounts'
 include { MULTIQC                                              } from '../../modules/nf-core/multiqc'
@@ -97,7 +98,9 @@ workflow RNASEQ {
     ch_salmon_index     // channel: path(salmon/index/)
     ch_kallisto_index   // channel: [ meta, path(kallisto/index/) ]
     ch_bbsplit_index    // channel: path(bbsplit/index/)
+    ch_sortmerna_index  // channel: path(sortmerna/index/)
     ch_splicesites      // channel: path(genome.splicesites.txt)
+    make_sortmerna_index // boolean: Whether to create a sortmerna index before running sortmerna
 
     main:
 
@@ -225,13 +228,29 @@ workflow RNASEQ {
     //
     // MODULE: Remove ribosomal RNA reads
     //
+    // Check rRNA databases for sortmerna
     if (params.remove_ribo_rna) {
         ch_ribo_db = file(params.ribo_database_manifest)
-        ch_sortmerna_fastas = Channel.from(ch_ribo_db.readLines()).map { row -> file(row, checkIfExists: true) }.collect()
+        if (ch_ribo_db.isEmpty()) {exit 1, "File provided with --ribo_database_manifest is empty: ${ch_ribo_db.getName()}!"}
+
+        ch_sortmerna_fastas = Channel.from(ch_ribo_db.readLines())
+            .map { row -> file(row, checkIfExists: true) }
+            .collect()
+            .map{ ['rrna_refs', it] }
+
+        if (make_sortmerna_index) {
+            SORTMERNA_INDEX (
+                [[],[]],
+                ch_sortmerna_fastas,
+                [[],[]]
+            )
+            ch_sortmerna_index = SORTMERNA_INDEX.out.index.first()
+        }
 
         SORTMERNA (
             ch_filtered_reads,
-            ch_sortmerna_fastas
+            ch_sortmerna_fastas,
+            ch_sortmerna_index
         )
         .reads
         .set { ch_filtered_reads }

From 1e043da1144b90952c94ef4eda7af35e016fe11f Mon Sep 17 00:00:00 2001
From: Maxime U Garcia <maxime.garcia@seqera.io>
Date: Mon, 4 Mar 2024 19:06:37 +0100
Subject: [PATCH 2/5] Update modules/nf-core/sortmerna/nextflow.config

---
 modules/nf-core/sortmerna/nextflow.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/nf-core/sortmerna/nextflow.config b/modules/nf-core/sortmerna/nextflow.config
index 8322435dc..953da5a39 100644
--- a/modules/nf-core/sortmerna/nextflow.config
+++ b/modules/nf-core/sortmerna/nextflow.config
@@ -1,7 +1,7 @@
 if (params.remove_ribo_rna) {
     process {
         withName: 'SORTMERNA' {
-            ext.args = '--num_alignments 1 -v --index 0'
+            ext.args   = '--num_alignments 1 -v --index 0'
             publishDir = [
                 [
                     path: { "${params.outdir}/sortmerna" },

From d02c6f662ff12dc59228ae10d83c9a54c5a16ab2 Mon Sep 17 00:00:00 2001
From: maxulysse <max.u.garcia@gmail.com>
Date: Mon, 4 Mar 2024 20:11:32 +0100
Subject: [PATCH 3/5] update CHANGELOG

---
 CHANGELOG.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 581304379..c24a063e6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,9 +21,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [PR #1220](https://github.com/nf-core/rnaseq/pull/1220) - Initialise nf-test and add pipeline level test
 - [PR #1226](https://github.com/nf-core/rnaseq/pull/1226) - Reuse bbsplit index and don't keep overwriting ([#1225](https://github.com/nf-core/rnaseq/issues/1225))
 - [PR #1229](https://github.com/nf-core/rnaseq/pull/1229) - Template update for nf-core/tools v2.13.1
+- [PR #1231](https://github.com/nf-core/rnaseq/pull/1231) - Add option to build, save and reuse a SortMeRNA index via `--sortmerna_index`
 
 ### Parameters
 
+| Old parameter | New parameter       |
+| ------------- | ------------------- |
+|               | `--sortmerna_index` |
+
 ### Software dependencies
 
 | Dependency  | Old version | New version |

From d0941f139eec26e355fff13ceb18503b0ded2dde Mon Sep 17 00:00:00 2001
From: maxulysse <max.u.garcia@gmail.com>
Date: Tue, 5 Mar 2024 10:12:49 +0100
Subject: [PATCH 4/5] fix path

---
 workflows/rnaseq/main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/rnaseq/main.nf b/workflows/rnaseq/main.nf
index 0a8f2f9e9..8549f802e 100755
--- a/workflows/rnaseq/main.nf
+++ b/workflows/rnaseq/main.nf
@@ -44,7 +44,7 @@ include { SAMTOOLS_SORT                                        } from '../../mod
 include { PRESEQ_LCEXTRAP                                      } from '../../modules/nf-core/preseq/lcextrap'
 include { QUALIMAP_RNASEQ                                      } from '../../modules/nf-core/qualimap/rnaseq'
 include { SORTMERNA                                            } from '../../modules/nf-core/sortmerna'
-include { SORTMERNA as SORTMERNA_INDEX                         } from '../../../modules/nf-core/sortmerna/main'
+include { SORTMERNA as SORTMERNA_INDEX                         } from '../../modules/nf-core/sortmerna'
 include { STRINGTIE_STRINGTIE                                  } from '../../modules/nf-core/stringtie/stringtie'
 include { SUBREAD_FEATURECOUNTS                                } from '../../modules/nf-core/subread/featurecounts'
 include { MULTIQC                                              } from '../../modules/nf-core/multiqc'

From 04b3581dcdc50326833fc65ae8539783bf820180 Mon Sep 17 00:00:00 2001
From: Maxime U Garcia <maxime.garcia@seqera.io>
Date: Tue, 5 Mar 2024 10:17:51 +0100
Subject: [PATCH 5/5] Update subworkflows/local/prepare_genome/main.nf

Co-authored-by: Jonathan Manning <pininforthefjords@gmail.com>
---
 subworkflows/local/prepare_genome/main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf
index f8fb8f2d1..ba2ee14dc 100644
--- a/subworkflows/local/prepare_genome/main.nf
+++ b/subworkflows/local/prepare_genome/main.nf
@@ -59,7 +59,7 @@ workflow PREPARE_GENOME {
     pseudo_aligner           //    string: Specifies the pseudo aligner to use - available options are 'salmon'. Runs in addition to '--aligner'
     skip_gtf_filter          //   boolean: Skip filtering of GTF for valid scaffolds and/ or transcript IDs
     skip_bbsplit             //   boolean: Skip BBSplit for removal of non-reference genome reads
-    skip_sortmerna           //   boolean: Skip sortmerna for removal of non-reference genome reads
+    skip_sortmerna           //   boolean: Skip sortmerna for removal of reads mapping to sequences in sortmerna_fasta_list
     skip_alignment           //   boolean: Skip all of the alignment-based processes within the pipeline
     skip_pseudo_alignment    //   boolean: Skip all of the pseudoalignment-based processes within the pipeline