Skip to content

Commit 9246335

Browse files
authored
Merge pull request #84 from EBI-Metagenomics/assembly_decontamination
Assembly decontamination subworkflow
2 parents ffa7607 + 511668f commit 9246335

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+1024
-176
lines changed

.prettierignore

+1
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,4 @@ __pycache__
1616
*.pyc
1717
.github/renovate.json5
1818
*.pac
19+
*.njs

README.md

+4
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@ Then import the desired module in your pipeline script:
4646
include { <subworkflow_name> } from '../subworkflows/ebi-metagenomics/<subworkflow_name>.nf'
4747
```
4848

49+
## nf-core modules
50+
51+
The [nf-core](https://nf-co.re/) team supports a large number of high-quality modules, and our team contributes whenever we can. At the moment, the [nf-core tools](https://github.com/nf-core/tools/) don't support subworkflows that install modules from different repos ([#3083](https://github.com/nf-core/tools/pull/3083)). That is why we decided to copy some modules from nf-core into this repo (a nasty hack, but it works). The nf-core team has been making impressive progress on supporting this use case (subworkflows with modules from different repos), and we will remove the duplicated modules once they reach that point. In the meantime, you will find duplicated modules from nf-core here.
52+
4953
## References
5054

5155
This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/master/LICENSE).
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
3+
channels:
4+
- conda-forge
5+
- bioconda
6+
dependencies:
7+
- bioconda::blast=2.15.0
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
process BLAST_BLASTN {
2+
tag "$meta.id"
3+
label 'process_medium'
4+
5+
conda "${moduleDir}/environment.yml"
6+
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
7+
'https://depot.galaxyproject.org/singularity/blast:2.15.0--pl5321h6f7f691_1':
8+
'biocontainers/blast:2.15.0--pl5321h6f7f691_1' }"
9+
10+
input:
11+
tuple val(meta) , path(fasta)
12+
tuple val(meta2), path(db)
13+
14+
output:
15+
tuple val(meta), path('*.txt'), emit: txt
16+
path "versions.yml" , emit: versions
17+
18+
when:
19+
task.ext.when == null || task.ext.when
20+
21+
script:
22+
def args = task.ext.args ?: ''
23+
def prefix = task.ext.prefix ?: "${meta.id}"
24+
def is_compressed = fasta.getExtension() == "gz" ? true : false
25+
def fasta_name = is_compressed ? fasta.getBaseName() : fasta
26+
27+
"""
28+
if [ "${is_compressed}" == "true" ]; then
29+
gzip -c -d ${fasta} > ${fasta_name}
30+
fi
31+
32+
DB=`find -L ./ -name "*.nal" | sed 's/\\.nal\$//'`
33+
if [ -z "\$DB" ]; then
34+
DB=`find -L ./ -name "*.nin" | sed 's/\\.nin\$//'`
35+
fi
36+
echo Using \$DB
37+
38+
blastn \\
39+
-num_threads ${task.cpus} \\
40+
-db \$DB \\
41+
-query ${fasta_name} \\
42+
${args} \\
43+
-out ${prefix}.txt
44+
45+
cat <<-END_VERSIONS > versions.yml
46+
"${task.process}":
47+
blast: \$(blastn -version 2>&1 | sed 's/^.*blastn: //; s/ .*\$//')
48+
END_VERSIONS
49+
"""
50+
51+
stub:
52+
def args = task.ext.args ?: ''
53+
def prefix = task.ext.prefix ?: "${meta.id}"
54+
"""
55+
touch ${prefix}.txt
56+
57+
cat <<-END_VERSIONS > versions.yml
58+
"${task.process}":
59+
blast: \$(blastn -version 2>&1 | sed 's/^.*blastn: //; s/ .*\$//')
60+
END_VERSIONS
61+
"""
62+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
name: blast_blastn
2+
description: Queries a BLAST DNA database
3+
keywords:
4+
- fasta
5+
- blast
6+
- blastn
7+
- DNA sequence
8+
tools:
9+
- blast:
10+
description: |
11+
BLAST finds regions of similarity between biological sequences.
12+
homepage: https://blast.ncbi.nlm.nih.gov/Blast.cgi
13+
documentation: https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=Blastdocs
14+
doi: 10.1016/S0022-2836(05)80360-2
15+
licence: ["US-Government-Work"]
16+
identifier: ""
17+
input:
18+
- - meta:
19+
type: map
20+
description: |
21+
Groovy Map containing sample information
22+
e.g. [ id:'test', single_end:false ]
23+
- fasta:
24+
type: file
25+
description: Input fasta file containing query sequences
26+
pattern: "*.{fa,fasta,fa.gz,fasta.gz}"
27+
- - meta2:
28+
type: map
29+
description: |
30+
Groovy Map containing db information
31+
e.g. [ id:'test2', single_end:false ]
32+
- db:
33+
type: directory
34+
description: Directory containing the blast database
35+
pattern: "*"
36+
output:
37+
- txt:
38+
- meta:
39+
type: map
40+
description: |
41+
Groovy Map containing sample information
42+
e.g. [ id:'test', single_end:false ]
43+
- "*.txt":
44+
type: file
45+
description: File containing blastn hits
46+
pattern: "*.txt"
47+
- versions:
48+
- versions.yml:
49+
type: file
50+
description: File containing software versions
51+
pattern: "versions.yml"
52+
authors:
53+
- "@joseespinosa"
54+
- "@drpatelh"
55+
maintainers:
56+
- "@joseespinosa"
57+
- "@drpatelh"
58+
- "@vagkaratzas"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
nextflow_process {
2+
3+
name "Test Process BLAST_BLASTN"
4+
script "../main.nf"
5+
process "BLAST_BLASTN"
6+
config "./nextflow.config"
7+
tag "modules"
8+
tag "modules_nfcore"
9+
tag "modules_ebimetagenomics"
10+
tag "blast"
11+
tag "blast/blastn"
12+
tag "blast/makeblastdb"
13+
14+
setup {
15+
run("BLAST_MAKEBLASTDB") {
16+
script "../../makeblastdb/main.nf"
17+
process {
18+
"""
19+
input[0] = [ [id:'test2'], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ]
20+
"""
21+
}
22+
}
23+
}
24+
25+
test("Should search for nucleotide hits against a blast db") {
26+
27+
when {
28+
params {
29+
outdir = "$outputDir"
30+
}
31+
process {
32+
"""
33+
input[0] = [ [id:'test'], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ]
34+
input[1] = BLAST_MAKEBLASTDB.out.db
35+
"""
36+
}
37+
}
38+
39+
then {
40+
assertAll(
41+
{ assert process.success },
42+
{ assert path(process.out.txt[0][1]).getText().contains("Query= MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate") },
43+
{ assert snapshot(process.out.versions).match("versions") }
44+
)
45+
}
46+
47+
}
48+
49+
test("Should search for zipped nucleotide hits against a blast db") {
50+
51+
when {
52+
params {
53+
outdir = "$outputDir"
54+
}
55+
process {
56+
"""
57+
input[0] = [ [id:'test'], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) ]
58+
input[1] = BLAST_MAKEBLASTDB.out.db
59+
"""
60+
}
61+
}
62+
63+
then {
64+
assertAll(
65+
{ assert process.success },
66+
{ assert path(process.out.txt[0][1]).getText().contains("Query= MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate") },
67+
{ assert snapshot(process.out.versions).match("versions_zipped") }
68+
)
69+
}
70+
71+
}
72+
73+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
{
2+
"versions": {
3+
"content": [
4+
[
5+
"versions.yml:md5,faf2471d836ebbf24d96d3e1f8720b17"
6+
]
7+
],
8+
"timestamp": "2023-12-11T07:20:03.54997013"
9+
},
10+
"versions_zipped": {
11+
"content": [
12+
[
13+
"versions.yml:md5,faf2471d836ebbf24d96d3e1f8720b17"
14+
]
15+
],
16+
"timestamp": "2023-12-11T07:20:12.925782708"
17+
}
18+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
process {
2+
withName: BLAST_MAKEBLASTDB {
3+
ext.args = '-dbtype nucl'
4+
}
5+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
blast/blastn:
2+
- modules/nf-core/blast/blastn/**
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
3+
channels:
4+
- conda-forge
5+
- bioconda
6+
dependencies:
7+
- bioconda::blast=2.15.0

modules/ebi-metagenomics/blast/makeblastdb/main.nf

+43-11
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,62 @@
11
process BLAST_MAKEBLASTDB {
2-
tag "$fasta"
2+
tag "$meta.id"
33
label 'process_medium'
44

5-
conda "bioconda::blast=2.13.0"
5+
conda "${moduleDir}/environment.yml"
66
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
7-
'https://depot.galaxyproject.org/singularity/blast:2.13.0--hf3cf87c_0' :
8-
'biocontainers/blast:2.13.0--hf3cf87c_0' }"
7+
'https://depot.galaxyproject.org/singularity/blast:2.15.0--pl5321h6f7f691_1':
8+
'biocontainers/blast:2.15.0--pl5321h6f7f691_1' }"
99

1010
input:
11-
path fasta
11+
tuple val(meta), path(fasta)
1212

1313
output:
14-
path 'blast_db' , emit: db
15-
path "versions.yml" , emit: versions
14+
tuple val(meta), path("${meta.id}"), emit: db
15+
path "versions.yml" , emit: versions
1616

1717
when:
1818
task.ext.when == null || task.ext.when
1919

2020
script:
2121
def args = task.ext.args ?: ''
22+
def prefix = task.ext.prefix ?: "${meta.id}"
23+
def is_compressed = fasta.getExtension() == "gz" ? true : false
24+
def fasta_name = is_compressed ? fasta.getBaseName() : fasta
2225
"""
26+
if [ "${is_compressed}" == "true" ]; then
27+
gzip -c -d ${fasta} > ${fasta_name}
28+
fi
29+
2330
makeblastdb \\
24-
-in $fasta \\
25-
$args
26-
mkdir blast_db
27-
mv ${fasta}* blast_db
31+
-in ${fasta_name} \\
32+
${args}
33+
mkdir ${prefix}
34+
mv ${fasta_name}* ${prefix}
35+
36+
cat <<-END_VERSIONS > versions.yml
37+
"${task.process}":
38+
blast: \$(blastn -version 2>&1 | sed 's/^.*blastn: //; s/ .*\$//')
39+
END_VERSIONS
40+
"""
41+
42+
stub:
43+
def args = task.ext.args ?: ''
44+
def prefix = task.ext.prefix ?: "${meta.id}"
45+
def is_compressed = fasta.getExtension() == "gz" ? true : false
46+
def fasta_name = is_compressed ? fasta.getBaseName() : fasta
47+
"""
48+
touch ${fasta_name}.fasta
49+
touch ${fasta_name}.fasta.ndb
50+
touch ${fasta_name}.fasta.nhr
51+
touch ${fasta_name}.fasta.nin
52+
touch ${fasta_name}.fasta.njs
53+
touch ${fasta_name}.fasta.not
54+
touch ${fasta_name}.fasta.nsq
55+
touch ${fasta_name}.fasta.ntf
56+
touch ${fasta_name}.fasta.nto
57+
mkdir ${prefix}
58+
mv ${fasta_name}* ${prefix}
59+
2860
cat <<-END_VERSIONS > versions.yml
2961
"${task.process}":
3062
blast: \$(blastn -version 2>&1 | sed 's/^.*blastn: //; s/ .*\$//')

modules/ebi-metagenomics/blast/makeblastdb/meta.yml

+27-10
Original file line numberDiff line numberDiff line change
@@ -12,20 +12,37 @@ tools:
1212
documentation: https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=Blastdocs
1313
doi: 10.1016/S0022-2836(05)80360-2
1414
licence: ["US-Government-Work"]
15+
identifier: ""
1516
input:
16-
- fasta:
17-
type: file
18-
description: Input fasta file
19-
pattern: "*.{fa,fasta}"
17+
- - meta:
18+
type: map
19+
description: |
20+
Groovy Map containing sample information
21+
e.g. [ id:'test', single_end:false ]
22+
- fasta:
23+
type: file
24+
description: Input fasta file
25+
pattern: "*.{fa,fasta,fa.gz,fasta.gz}"
2026
output:
2127
- db:
22-
type: directory
23-
description: Output directory containing blast database files
24-
pattern: "*"
28+
- meta:
29+
type: map
30+
description: |
31+
Groovy Map containing sample information
32+
e.g. [ id:'test', single_end:false ]
33+
- ${meta.id}:
34+
type: directory
35+
description: Output directory containing blast database files
36+
pattern: "*"
2537
- versions:
26-
type: file
27-
description: File containing software versions
28-
pattern: "versions.yml"
38+
- versions.yml:
39+
type: file
40+
description: File containing software versions
41+
pattern: "versions.yml"
2942
authors:
3043
- "@joseespinosa"
3144
- "@drpatelh"
45+
maintainers:
46+
- "@joseespinosa"
47+
- "@drpatelh"
48+
- "@vagkaratzas"

0 commit comments

Comments
 (0)