Skip to content

Commit

Permalink
Update interproscan (#5688)
Browse files Browse the repository at this point in the history
* Add input path for interproscan database

* Remove --formats because it's optional

* Update version string

* Remove disable precalc as this is optional

* Remove applications as this is optional

* Swap output flag for file-base

* Copy interproscan properties so data dir from work directory is used

* simplify bash test

* Only use interproscan db when not testing

* Update test

* Escape backslashes

* Use custom delimiter in sed

* Update test snapshot

* update meta yml

* Update meta.yml

* Add default CPU and memory constraints

* Try reducing for test

* Protect space

* update test

* Update snapshot

* Remove env. Just causing errors

* Try set max heap size

* Try again to reduce memory of interproscan

* Try reducing in interproscan.sh

* Revert changes to attempts at setting memory

* Add myself as author and maintainer as it's a big change

* Add interproscan to docker_self_hosted exclude list

* Revert test exclusion

* Reduce to stub test only

* Add note on why tests are commented out

---------

Co-authored-by: Simon Pearce <24893913+SPPearce@users.noreply.github.com>
  • Loading branch information
mahesh-panchal and SPPearce authored Jun 18, 2024
1 parent 5f65b53 commit f82c181
Show file tree
Hide file tree
Showing 4 changed files with 277 additions and 82 deletions.
58 changes: 18 additions & 40 deletions modules/nf-core/interproscan/main.nf
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
process INTERPROSCAN {
tag "$meta.id"
label 'process_medium'
label 'process_long'

conda "${moduleDir}/environment.yml"
Expand All @@ -9,7 +10,7 @@ process INTERPROSCAN {

input:
tuple val(meta), path(fasta)
val(out_ext)
path(interproscan_database, stageAs: 'data')

output:
tuple val(meta), path('*.tsv') , optional: true, emit: tsv
Expand All @@ -26,63 +27,40 @@ process INTERPROSCAN {
def prefix = task.ext.prefix ?: "${meta.id}"
def is_compressed = fasta.name.endsWith(".gz")
def fasta_name = fasta.name.replace(".gz", "")

def appl = "-appl TIGRFAM,FunFam,SFLD,PANTHER,Gene3D,Hamap,ProSiteProfiles,Coils,SMART,CDD,PRINTS,PIRSR,ProSitePatterns,AntiFam,Pfam,MobiDBLite"
if ( args.contains("-appl") ) {
appl = ""
}
switch ( out_ext ) {
case "tsv": break
case "xml": break
case "gff3": break
case "json": break
default:
out_ext = 'tsv';
log.warn("Unknown output file format provided (${out_ext}): selecting tsv as fallback");
break
}

// -dp (disable precalculation) is on so no online dependency
"""
if [ "${is_compressed}" == "true" ]; then
if [ -d 'data' ]; then
# Find interproscan.properties to link data/ from work directory
INTERPROSCAN_DIR="\$( dirname "\$( dirname "\$( which interproscan.sh )" )" )"
INTERPROSCAN_PROPERTIES="\$( find "\$INTERPROSCAN_DIR" -name "interproscan.properties" )"
cp "\$INTERPROSCAN_PROPERTIES" .
sed -i "/^bin\\.directory=/ s|.*|bin.directory=\$INTERPROSCAN_DIR/bin|" interproscan.properties
export INTERPROSCAN_CONF=interproscan.properties
fi # else use sample DB included with conda ( testing only! )
if ${is_compressed} ; then
gzip -c -d ${fasta} > ${fasta_name}
fi
interproscan.sh \\
-cpu ${task.cpus} \\
-i ${fasta_name} \\
-f ${out_ext} \\
-dp \\
${appl} \\
--cpu ${task.cpus} \\
--input ${fasta_name} \\
${args} \\
-o ${prefix}.${out_ext}
--output-file-base ${prefix}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
interproscan: \$(echo \$(interproscan.sh --version 2>&1) | head -n 1 | sed 's/^.*InterProScan version//' | sed 's/\\s*InterProScan.*//')
interproscan: \$( interproscan.sh --version | sed '1!d; s/.*version //' )
END_VERSIONS
"""

stub:
def prefix = task.ext.prefix ?: "${meta.id}"

switch ( out_ext ) {
case "tsv": break
case "xml": break
case "gff3": break
case "json": break
default:
out_ext = 'tsv';
log.warn("Unknown output file format provided (${out_ext}): selecting tsv as fallback");
break
}

"""
touch ${prefix}.${out_ext}
touch ${prefix}.{tsv,xml,json,gff3}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
interproscan: \$(echo \$(interproscan.sh --version 2>&1) | head -n 1 | sed 's/^.*InterProScan version//' | sed 's/\\s*InterProScan.*//')
interproscan: \$( interproscan.sh --version | sed '1!d; s/.*version //' )
END_VERSIONS
"""
}
15 changes: 9 additions & 6 deletions modules/nf-core/interproscan/meta.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ description: Produces protein annotations and predictions from an amino acids FA
keywords:
- annotation
- fasta
- protein
- dna
- interproscan
tools:
- "interproscan":
Expand All @@ -11,7 +13,7 @@ tools:
documentation: "https://interproscan-docs.readthedocs.io"
tool_dev_url: "https://github.com/ebi-pf-team/interproscan"
doi: "10.1093/bioinformatics/btu031"
licence: "['GPL v3']"
licence: ["GPL v3"]
input:
- meta:
type: map
Expand All @@ -20,12 +22,11 @@ input:
e.g. [ id:'test', single_end:false ]
- fasta:
type: file
description: Input fasta file containing the amino acid query sequences
description: Input fasta file containing the amino acid or DNA query sequences
pattern: "*.{fa,fasta,fa.gz,fasta.gz}"
- out_ext:
type: string
description: Specify the type of output file to be generated
pattern: "tsv|xml|gff3|json"
- interproscan_database:
type: directory
description: Path to the interproscan database (untarred http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/${version_major}-${version_minor}/interproscan-${version_major}-${version_minor}-64-bit.tar.gz)
output:
- tsv:
type: file
Expand All @@ -49,6 +50,8 @@ output:
pattern: "versions.yml"
authors:
- "@toniher"
- "@mahesh-panchal"
maintainers:
- "@toniher"
- "@vagkaratzas"
- "@mahesh-panchal"
97 changes: 66 additions & 31 deletions modules/nf-core/interproscan/tests/main.nf.test
Original file line number Diff line number Diff line change
Expand Up @@ -8,58 +8,93 @@ nextflow_process {
tag "modules_nfcore"
tag "interproscan"

test("Annotates set of input proteins in an output tsv file") {
// Note: Regular tests have been commented out because InterProScan has a hard-coded requirement of 10G memory,
// and will therefore be killed when run on the nf-core test runners.

when {
params {
outdir = "$outputDir"
}
process {
"""
input[0] = [
[ id:'test' ],
file(params.test_data['sarscov2']['genome']['proteome_fasta'], checkIfExists: true)
]
input[1] = 'tsv'
"""
}
}
// test("sarscov2 - proteome_fasta") {

then {
assertAll(
{ assert process.success },
{ assert snapshot(process.out.tsv).match("tsv") },
{ assert process.out.versions }
)
}
// when {
// process {
// """
// input[0] = [
// [ id:'test' ],
// file(params.test_data['sarscov2']['genome']['proteome_fasta'], checkIfExists: true)
// ]
// input[1] = []
// """
// }
// }

}
// then {
// assertAll(
// { assert process.success },
// { assert snapshot(
// path(process.out.tsv[0][1]).readLines()[0]
// .contains("ENSSASP00005000004.1 4c35f09aac2f7be4f3cffd30c6aecac8 1273 Coils Coil Coil 1176 1203 - T"),
// process.out.xml,
// process.out.json,
// path(process.out.gff3[0][1]).readLines()[0..4,6..-1],
// process.out.versions,
// ).match()
// }
// )
// }

// }

// test("sarscov2 - proteome_fasta_gz") {

test("Annotates set of zipped input proteins in an output xml file") {
// when {
// process {
// """
// input[0] = [
// [ id:'test' ],
// file(params.test_data['sarscov2']['genome']['proteome_fasta_gz'], checkIfExists: true)
// ]
// input[1] = []
// """
// }
// }

// then {
// assertAll(
// { assert process.success },
// { assert snapshot(
// path(process.out.tsv[0][1]).readLines()[0]
// .contains("ENSSASP00005000004.1 4c35f09aac2f7be4f3cffd30c6aecac8 1273 Coils Coil Coil 1176 1203 - T"),
// process.out.xml,
// process.out.json,
// path(process.out.gff3[0][1]).readLines()[0..4,6..-1],
// process.out.versions,
// ).match()
// }
// )
// }

// }

test("sarscov2 - proteome_fasta_gz - stub") {

options '-stub'

when {
params {
outdir = "$outputDir"
}
process {
"""
input[0] = [
[ id:'test' ],
file(params.test_data['sarscov2']['genome']['proteome_fasta_gz'], checkIfExists: true)
]
input[1] = 'xml'
input[1] = []
"""
}
}

then {
assertAll(
{ assert process.success },
{ assert snapshot(process.out.xml).match("xml") },
{ assert process.out.versions }
{ assert snapshot(process.out).match() }
)
}

}

}
Loading

0 comments on commit f82c181

Please sign in to comment.