Update interproscan (#5688)

* Add input path for interproscan database * Remove --formats because it's optional * Update version string * Remove disable precalc as this is optional * Remove applications as this is option * Swap output flag for file-base * Copy interproscan properties so data dir from work directory is used * simplify bash test * Only use interproscan db when not testing * Update test * Escape backslashes * Use custom delimiter in sed * Update test snapshot * update meta yml * Update meta.yml * Add default CPU and memory constraints * Try reducing for test * Protect space * update test * Update snapshot * Remove env. Just causing errors * Try set max heap size * Try again to reduce memory of interproscan * Try reducing in interproscan.sh * Revert changes to attempts at setting memory * Add myself as author and maintainer as it's a big change * Add interproscan to docker_self_hosted exclude list * Revert test exclusion * Reduce to stub test only * Add note to why tests are commented out --------- Co-authored-by: Simon Pearce <24893913+SPPearce@users.noreply.github.com>
nf-core · Jun 18, 2024 · f82c181 · f82c181
1 parent 5f65b53
commit f82c181
Show file tree

Hide file tree

Showing 4 changed files with 277 additions and 82 deletions.
diff --git a/modules/nf-core/interproscan/main.nf b/modules/nf-core/interproscan/main.nf
@@ -1,5 +1,6 @@
 process INTERPROSCAN {
     tag "$meta.id"
+    label 'process_medium'
     label 'process_long'
 
     conda "${moduleDir}/environment.yml"
@@ -9,7 +10,7 @@ process INTERPROSCAN {
 
     input:
     tuple val(meta), path(fasta)
-    val(out_ext)
+    path(interproscan_database, stageAs: 'data')
 
     output:
     tuple val(meta), path('*.tsv') , optional: true, emit: tsv
@@ -26,63 +27,40 @@ process INTERPROSCAN {
     def prefix = task.ext.prefix ?: "${meta.id}"
     def is_compressed = fasta.name.endsWith(".gz")
     def fasta_name = fasta.name.replace(".gz", "")
-
-    def appl = "-appl TIGRFAM,FunFam,SFLD,PANTHER,Gene3D,Hamap,ProSiteProfiles,Coils,SMART,CDD,PRINTS,PIRSR,ProSitePatterns,AntiFam,Pfam,MobiDBLite"
-    if ( args.contains("-appl") ) {
-        appl = ""
-    }
-    switch ( out_ext ) {
-        case "tsv": break
-        case "xml": break
-        case "gff3": break
-        case "json": break
-        default:
-            out_ext = 'tsv';
-            log.warn("Unknown output file format provided (${out_ext}): selecting tsv as fallback");
-            break
-    }
-
-    //  -dp (disable precalculation) is on so no online dependency
     """
-    if [ "${is_compressed}" == "true" ]; then
+    if [ -d 'data' ]; then
+        # Find interproscan.properties to link data/ from work directory
+        INTERPROSCAN_DIR="\$( dirname "\$( dirname "\$( which interproscan.sh )" )" )"
+        INTERPROSCAN_PROPERTIES="\$( find "\$INTERPROSCAN_DIR" -name "interproscan.properties" )"
+        cp "\$INTERPROSCAN_PROPERTIES" .
+        sed -i "/^bin\\.directory=/ s|.*|bin.directory=\$INTERPROSCAN_DIR/bin|" interproscan.properties
+        export INTERPROSCAN_CONF=interproscan.properties
+    fi # else use sample DB included with conda ( testing only! )
+
+    if ${is_compressed} ; then
         gzip -c -d ${fasta} > ${fasta_name}
     fi
 
     interproscan.sh \\
-        -cpu ${task.cpus} \\
-        -i ${fasta_name} \\
-        -f ${out_ext} \\
-        -dp \\
-        ${appl} \\
+        --cpu ${task.cpus} \\
+        --input ${fasta_name} \\
         ${args} \\
-        -o ${prefix}.${out_ext}
+        --output-file-base ${prefix}
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        interproscan: \$(echo \$(interproscan.sh --version 2>&1) | head -n 1 | sed 's/^.*InterProScan version//' | sed 's/\\s*InterProScan.*//')
+        interproscan: \$( interproscan.sh --version | sed '1!d; s/.*version //' )
     END_VERSIONS
     """
 
     stub:
     def prefix = task.ext.prefix ?: "${meta.id}"
-
-    switch ( out_ext ) {
-        case "tsv": break
-        case "xml": break
-        case "gff3": break
-        case "json": break
-        default:
-            out_ext = 'tsv';
-            log.warn("Unknown output file format provided (${out_ext}): selecting tsv as fallback");
-            break
-    }
-
     """
-    touch ${prefix}.${out_ext}
+    touch ${prefix}.{tsv,xml,json,gff3}
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        interproscan: \$(echo \$(interproscan.sh --version 2>&1) | head -n 1 | sed 's/^.*InterProScan version//' | sed 's/\\s*InterProScan.*//')
+        interproscan: \$( interproscan.sh --version | sed '1!d; s/.*version //' )
     END_VERSIONS
     """
 }
diff --git a/modules/nf-core/interproscan/meta.yml b/modules/nf-core/interproscan/meta.yml
@@ -3,6 +3,8 @@ description: Produces protein annotations and predictions from an amino acids FA
 keywords:
   - annotation
   - fasta
+  - protein
+  - dna
   - interproscan
 tools:
   - "interproscan":
@@ -11,7 +13,7 @@ tools:
       documentation: "https://interproscan-docs.readthedocs.io"
       tool_dev_url: "https://github.com/ebi-pf-team/interproscan"
       doi: "10.1093/bioinformatics/btu031"
-      licence: "['GPL v3']"
+      licence: ["GPL v3"]
 input:
   - meta:
       type: map
@@ -20,12 +22,11 @@ input:
         e.g. [ id:'test', single_end:false ]
   - fasta:
       type: file
-      description: Input fasta file containing the amino acid query sequences
+      description: Input fasta file containing the amino acid or dna query sequences
       pattern: "*.{fa,fasta,fa.gz,fasta.gz}"
-  - out_ext:
-      type: string
-      description: Specify the type of output file to be generated
-      pattern: "tsv|xml|gff3|json"
+  - interproscan_database:
+      type: directory
+      description: Path to the interproscan database (untarred http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/${version_major}-${version_minor}/interproscan-${version_major}-${version_minor}-64-bit.tar.gz)
 output:
   - tsv:
       type: file
@@ -49,6 +50,8 @@ output:
       pattern: "versions.yml"
 authors:
   - "@toniher"
+  - "@mahesh-panchal"
 maintainers:
   - "@toniher"
   - "@vagkaratzas"
+  - "@mahesh-panchal"
diff --git a/modules/nf-core/interproscan/tests/main.nf.test b/modules/nf-core/interproscan/tests/main.nf.test
@@ -8,58 +8,93 @@ nextflow_process {
     tag "modules_nfcore"
     tag "interproscan"
 
-    test("Annotates set of input proteins in an output tsv file") {
+    // Note: Regular tests have been commented out because Interproscan has a harded coded a requirement of 10G memory,
+    // and so will therefore not run on the nf-core test runners without being killed.
 
-        when {
-            params {
-                outdir = "$outputDir"
-            }
-            process {
-                """
-                input[0] = [
-                    [ id:'test' ],
-                    file(params.test_data['sarscov2']['genome']['proteome_fasta'], checkIfExists: true)
-                ]
-                input[1] = 'tsv'
-                """
-            }
-        }
+    // test("sarscov2 - proteome_fasta") {
 
-        then {
-            assertAll(
-                { assert process.success },
-                { assert snapshot(process.out.tsv).match("tsv") },
-                { assert process.out.versions }
-            )
-        }
+    //     when {
+    //         process {
+    //             """
+    //             input[0] = [
+    //                 [ id:'test' ],
+    //                 file(params.test_data['sarscov2']['genome']['proteome_fasta'], checkIfExists: true)
+    //             ]
+    //             input[1] = []
+    //             """
+    //         }
+    //     }
 
-    }
+    //     then {
+    //         assertAll(
+    //             { assert process.success },
+    //             { assert snapshot(
+    //                 path(process.out.tsv[0][1]).readLines()[0]
+    //                     .contains("ENSSASP00005000004.1	4c35f09aac2f7be4f3cffd30c6aecac8	1273	Coils	Coil	Coil	1176	1203	-	T"),
+    //                 process.out.xml,
+    //                 process.out.json,
+    //                 path(process.out.gff3[0][1]).readLines()[0..4,6..-1],
+    //                 process.out.versions,
+    //                 ).match()
+    //             }
+    //         )
+    //     }
+
+    // }
+
+    // test("sarscov2 - proteome_fasta_gz") {
 
-    test("Annotates set of zipped input proteins in an output xml file") {
+    //     when {
+    //         process {
+    //             """
+    //             input[0] = [
+    //                 [ id:'test' ],
+    //                 file(params.test_data['sarscov2']['genome']['proteome_fasta_gz'], checkIfExists: true)
+    //             ]
+    //             input[1] = []
+    //             """
+    //         }
+    //     }
+
+    //     then {
+    //         assertAll(
+    //             { assert process.success },
+    //             { assert snapshot(
+    //                 path(process.out.tsv[0][1]).readLines()[0]
+    //                     .contains("ENSSASP00005000004.1	4c35f09aac2f7be4f3cffd30c6aecac8	1273	Coils	Coil	Coil	1176	1203	-	T"),
+    //                 process.out.xml,
+    //                 process.out.json,
+    //                 path(process.out.gff3[0][1]).readLines()[0..4,6..-1],
+    //                 process.out.versions,
+    //                 ).match()
+    //             }
+    //         )
+    //     }
+
+    // }
+
+    test("sarscov2 - proteome_fasta_gz - stub") {
+
+        options '-stub'
 
         when {
-            params {
-                outdir = "$outputDir"
-            }
             process {
                 """
                 input[0] = [
                     [ id:'test' ],
                     file(params.test_data['sarscov2']['genome']['proteome_fasta_gz'], checkIfExists: true)
                 ]
-                input[1] = 'xml'
+                input[1] = []
                 """
             }
         }
 
         then {
             assertAll(
                 { assert process.success },
-                { assert snapshot(process.out.xml).match("xml") },
-                { assert process.out.versions }
+                { assert snapshot(process.out).match() }
             )
         }
 
     }
-
 }