Skip to content

Commit

Permalink
Generate index out of all the generated references (#66)
Browse files Browse the repository at this point in the history
* add index.json

* fix path

* add run_tabix

* file()

* mix versions

* update CHANGELOG

* fix publish

* cursor :shake_fist:

* Apply suggestions from code review

* update snapshot because input was changed

* use branch

* file()

* no file()
  • Loading branch information
maxulysse authored Dec 22, 2024
1 parent 51f270d commit 1c775ae
Show file tree
Hide file tree
Showing 16 changed files with 261 additions and 214 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ Initial release of nf-core/references, created with the [nf-core](https://nf-co.
- [56](https://github.com/nf-core/references/pull/56) - Add new params: kallisto_make_unique to use the --make-unique option for kallisto
- [56](https://github.com/nf-core/references/pull/56) - New file assets/genomes/Caenorhabditis_elegans/NCBI/WBcel235_updated.yml, built from assets/genomes/Caenorhabditis_elegans/NCBI/WBcel235.yml
- [62](https://github.com/nf-core/references/pull/62) - Added comments to the code
- [68](https://github.com/nf-core/references/pull/68) - Output vcf asset
- [66](https://github.com/nf-core/references/pull/66) - Output index

### Changed

Expand Down
202 changes: 88 additions & 114 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -100,105 +100,99 @@ workflow {
}

output {
'bowtie1_index' {
path { meta, _bowtie1_index -> { _file -> "${meta.species}/${meta.source}/${meta.genome}/Sequence/BowtieIndex/version1.3.1" } }
}
'bowtie2_index' {
path { meta, _bowtie2_index -> { _file -> "${meta.species}/${meta.source}/${meta.genome}/Sequence/Bowtie2Index/version2.5.2" } }
}
'bwamem1_index' {
path { meta, _bwamem1_index -> { _file -> "${meta.species}/${meta.source}/${meta.genome}/Sequence/BWAIndex/version0.7.18" } }
}
'bwamem2_index' {
path { meta, _bwamem2_index -> { _file -> "${meta.species}/${meta.source}/${meta.genome}/Sequence/BWAmem2Index/version2.2.1" } }
}
'dragmap_hashmap' {
path { meta, _index -> { _file -> "${meta.species}/${meta.source}/${meta.genome}/Sequence/dragmap/version1.2.1" } }
}
'fasta' {
path { meta, _fasta -> { file -> "${meta.species}/${meta.source}/${meta.genome}/Sequence/WholeGenomeFasta/${file}" } }
}
'fasta_dict' {
path { meta, _fasta_dict -> { file -> "${meta.species}/${meta.source}/${meta.genome}/Sequence/WholeGenomeFasta/${file}" } }
}
'fasta_fai' {
path { meta, _fasta_fai -> { file -> "${meta.species}/${meta.source}/${meta.genome}/Sequence/WholeGenomeFasta/${file}" } }
}
'fasta_sizes' {
path { meta, _fasta_sizes -> { file -> "${meta.species}/${meta.source}/${meta.genome}/Sequence/WholeGenomeFasta/${file}" } }
}
'gff' {
path { meta, _gff -> { file -> "${meta.species}/${meta.source}/${meta.genome}/Annotation/Genes/${file}" } }
}
'gtf' {
path { meta, _gtf -> { file -> "${meta.species}/${meta.source}/${meta.genome}/Annotation/Genes/${file}" } }
}
'hisat2_index' {
path { meta, _hisat2_index ->
{ _file ->
meta.source_version == "unknown"
? "${meta.species}/${meta.source}/${meta.genome}/Sequence/Hisat2Index/version2.2.1"
: "${meta.species}/${meta.source}/${meta.genome}/Sequence/Hisat2Index/${meta.source_version}/version2.2.1"
}
}
}
'intervals_bed' {
path { meta, _intervals_bed -> { file -> "${meta.species}/${meta.source}/${meta.genome}/Annotation/intervals/${file}" } }
}
'kallisto_index' {
path { meta, _kallisto_index ->
{ file ->
meta.source_version == "unknown"
? "${meta.species}/${meta.source}/${meta.genome}/Sequence/KallistoIndex/version0.51.1/${file}"
: "${meta.species}/${meta.source}/${meta.genome}/Sequence/KallistoIndex/${meta.source_version}/version0.51.1/${file}"
}
}
}
'msisensorpro_list' {
path { meta, _msisensorpro_list -> { file -> "${meta.species}/${meta.source}/${meta.genome}/Annotation/msisensorpro/${file}" } }
}
'multiqc' {
path "multiqc"
}
'rsem_index' {
path { meta, _rsem_index ->
{ _file ->
meta.source_version == "unknown"
? "${meta.species}/${meta.source}/${meta.genome}/Sequence/RSEMIndex/version1.3.1"
: "${meta.species}/${meta.source}/${meta.genome}/Sequence/RSEMIndex/${meta.source_version}/version1.3.1"
}
}
}
'salmon_index' {
path { meta, _salmon_index ->
{ _file ->
meta.source_version == "unknown"
? "${meta.species}/${meta.source}/${meta.genome}/Sequence/SalmonIndex/version1.10.3"
: "${meta.species}/${meta.source}/${meta.genome}/Sequence/SalmonIndex/${meta.source_version}/version1.10.3"
'reference' {
path { meta, _file ->
{ file ->
if (meta.file == "bowtie1_index") {
"${meta.species}/${meta.source}/${meta.genome}/Sequence/BowtieIndex/version1.3.1"
}
else if (meta.file == "bowtie2_index") {
"${meta.species}/${meta.source}/${meta.genome}/Sequence/Bowtie2Index/version2.5.2"
}
else if (meta.file == "bwamem1_index") {
"${meta.species}/${meta.source}/${meta.genome}/Sequence/BWAIndex/version0.7.18"
}
else if (meta.file == "bwamem2_index") {
"${meta.species}/${meta.source}/${meta.genome}/Sequence/BWAmem2Index/version2.2.1"
}
else if (meta.file == "dragmap_hashmap") {
"${meta.species}/${meta.source}/${meta.genome}/Sequence/dragmap/version1.2.1"
}
else if (meta.file == "fasta") {
"${meta.species}/${meta.source}/${meta.genome}/Sequence/WholeGenomeFasta/${file}"
}
else if (meta.file == "fasta_dict") {
"${meta.species}/${meta.source}/${meta.genome}/Sequence/WholeGenomeFasta/${file}"
}
else if (meta.file == "fasta_fai") {
"${meta.species}/${meta.source}/${meta.genome}/Sequence/WholeGenomeFasta/${file}"
}
else if (meta.file == "fasta_sizes") {
"${meta.species}/${meta.source}/${meta.genome}/Sequence/WholeGenomeFasta/${file}"
}
else if (meta.file == "gff") {
"${meta.species}/${meta.source}/${meta.genome}/Annotation/Genes/${file}"
}
else if (meta.file == "gtf") {
"${meta.species}/${meta.source}/${meta.genome}/Annotation/Genes/${file}"
}
else if (meta.file == "hisat2_index") {
meta.source_version == "unknown"
? "${meta.species}/${meta.source}/${meta.genome}/Sequence/Hisat2Index/version2.2.1"
: "${meta.species}/${meta.source}/${meta.genome}/Sequence/Hisat2Index/${meta.source_version}/version2.2.1"
}
else if (meta.file == "intervals_bed") {
"${meta.species}/${meta.source}/${meta.genome}/Annotation/intervals/${file}"
}
else if (meta.file == "kallisto_index") {
meta.source_version == "unknown"
? "${meta.species}/${meta.source}/${meta.genome}/Sequence/KallistoIndex/version0.51.1/${file}"
: "${meta.species}/${meta.source}/${meta.genome}/Sequence/KallistoIndex/${meta.source_version}/version0.51.1/${file}"
}
else if (meta.file == "msisensorpro_list") {
"${meta.species}/${meta.source}/${meta.genome}/Annotation/msisensorpro/${file}"
}
else if (meta.file == "rsem_index") {
meta.source_version == "unknown"
? "${meta.species}/${meta.source}/${meta.genome}/Sequence/RSEMIndex/version1.3.1/"
: "${meta.species}/${meta.source}/${meta.genome}/Sequence/RSEMIndex/${meta.source_version}/version1.3.1/"
}
else if (meta.file == "salmon_index") {
meta.source_version == "unknown"
? "${meta.species}/${meta.source}/${meta.genome}/Sequence/SalmonIndex/version1.10.3/"
: "${meta.species}/${meta.source}/${meta.genome}/Sequence/SalmonIndex/${meta.source_version}/version1.10.3/"
}
else if (meta.file == "splice_sites") {
"${meta.species}/${meta.source}/${meta.genome}/Sequence/SpliceSites/${file}"
}
else if (meta.file == "star_index") {
meta.source_version == "unknown"
? "${meta.species}/${meta.source}/${meta.genome}/Sequence/STARIndex/version2.7.11b/"
: "${meta.species}/${meta.source}/${meta.genome}/Sequence/STARIndex/${meta.source_version}/version2.7.11b/"
}
else if (meta.file == "transcript_fasta") {
"${meta.species}/${meta.source}/${meta.genome}/Sequence/TranscriptFasta/${file}"
}
else if (meta.file == "vcf") {
"${meta.species}/${meta.source}/${meta.genome}/Annotation/${meta.source_vcf}/${file}"
}
else if (meta.file == "vcf_tbi") {
"${meta.species}/${meta.source}/${meta.genome}/Annotation/${meta.source_vcf}/${file}"
}
else {
null
}
}
}
}
'splice_sites' {
path { meta, _splice_sites -> { file -> "${meta.species}/${meta.source}/${meta.genome}/Sequence/SpliceSites/${file}" } }
}
'star_index' {
path { meta, _star_index ->
{ _file ->
meta.source_version == "unknown"
? "${meta.species}/${meta.source}/${meta.genome}/Sequence/STARIndex/version2.7.11b"
: "${meta.species}/${meta.source}/${meta.genome}/Sequence/STARIndex/${meta.source_version}/version2.7.11b"
}

index {
path "index.json"
mapper { meta, reference -> ["${meta.file}:${reference}"] }
}
}
'transcript_fasta' {
path { meta, _transcript_fasta -> { file -> "${meta.species}/${meta.source}/${meta.genome}/Sequence/TranscriptFasta/${file}" } }
}
// 'vcf' {
// path { meta, _vcf -> { file -> "${meta.species}/${meta.source}/${meta.genome}/Annotation/${meta.source_vcf}/${file}" } }
// }
'vcf_tbi' {
path { meta, _vcf_tbi -> { file -> "${meta.species}/${meta.source}/${meta.genome}/Annotation/${meta.source_vcf}/${file}" } }
}
}
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand All @@ -219,26 +213,6 @@ workflow NFCORE_REFERENCES {
REFERENCES(input, tools)

emit:
bowtie1_index = REFERENCES.out.bowtie1_index
bowtie2_index = REFERENCES.out.bowtie2_index
bwamem1_index = REFERENCES.out.bwamem1_index
bwamem2_index = REFERENCES.out.bwamem2_index
dragmap_hashmap = REFERENCES.out.dragmap_hashmap
fasta = REFERENCES.out.fasta
fasta_dict = REFERENCES.out.fasta_dict
fasta_fai = REFERENCES.out.fasta_fai
fasta_sizes = REFERENCES.out.fasta_sizes
gtf = REFERENCES.out.gtf
hisat2_index = REFERENCES.out.hisat2_index
intervals_bed = REFERENCES.out.intervals_bed
kallisto_index = REFERENCES.out.kallisto_index
msisensorpro_list = REFERENCES.out.msisensorpro_list
rsem_index = REFERENCES.out.rsem_index
salmon_index = REFERENCES.out.salmon_index
splice_sites = REFERENCES.out.splice_sites
star_index = REFERENCES.out.star_index
transcript_fasta = REFERENCES.out.transcript_fasta
// vcf = REFERENCES.out.vcf
vcf_tbi = REFERENCES.out.vcf_tbi
versions = REFERENCES.out.versions
reference = REFERENCES.out.reference
versions = REFERENCES.out.versions
}
89 changes: 79 additions & 10 deletions subworkflows/local/asset_to_channel/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -14,36 +14,105 @@ workflow ASSET_TO_CHANNEL {

def reduce = { meta -> meta.subMap(['genome', 'id', 'source', 'source_vcf', 'source_version', 'species']) }

intervals_bed = asset.map { meta, _fasta -> meta.intervals_bed ? [reduce(meta), meta.intervals_bed] : null }
intervals_bed_branch = asset.branch { meta, _fasta ->
file: meta.intervals_bed
return [reduce(meta), meta.intervals_bed]
other: true
return null
}
intervals_bed = intervals_bed_branch.file


// If the fasta ends with .gz, flag it for decompression (decompress_fasta)
// If an asset already exists in the meta, set the matching run_<tool> flag to false so it is not re-created from the fasta file
fasta = asset.map { meta, fasta_ -> fasta_ ? [reduce(meta) + [decompress_fasta: fasta_.endsWith('.gz') ?: false] + [run_bowtie1: meta.bowtie1_index ? false : true] + [run_bowtie2: meta.bowtie2_index ? false : true] + [run_bwamem1: meta.bwamem1_index ? false : true] + [run_bwamem2: meta.bwamem2_index ? false : true] + [run_dragmap: meta.dragmap_hashtable ? false : true] + [run_faidx: meta.fasta_fai && meta.fasta_sizes ? false : true] + [run_gatkdict: meta.fasta_dict ? false : true] + [run_hisat2: meta.hisat2_index ? false : true] + [run_intervals: meta.intervals_bed ? false : true] + [run_kallisto: meta.kallisto_index ? false : true] + [run_msisenpro: meta.msisensorpro_list ? false : true] + [run_rsem: meta.rsem_index ? false : true] + [run_rsem_make_transcript_fasta: meta.transcript_fasta ? false : true] + [run_salmon: meta.salmon_index ? false : true] + [run_star: meta.star_index ? false : true], fasta_] : null }
fasta_branch = asset.branch { meta, fasta_ ->
file: fasta_
return [reduce(meta) + [decompress_fasta: fasta_.endsWith('.gz') ?: false] + [run_bowtie1: meta.bowtie1_index ? false : true] + [run_bowtie2: meta.bowtie2_index ? false : true] + [run_bwamem1: meta.bwamem1_index ? false : true] + [run_bwamem2: meta.bwamem2_index ? false : true] + [run_dragmap: meta.dragmap_hashtable ? false : true] + [run_faidx: meta.fasta_fai && meta.fasta_sizes ? false : true] + [run_gatkdict: meta.fasta_dict ? false : true] + [run_hisat2: meta.hisat2_index ? false : true] + [run_intervals: meta.intervals_bed ? false : true] + [run_kallisto: meta.kallisto_index ? false : true] + [run_msisenpro: meta.msisensorpro_list ? false : true] + [run_rsem: meta.rsem_index ? false : true] + [run_rsem_make_transcript_fasta: meta.transcript_fasta ? false : true] + [run_salmon: meta.salmon_index ? false : true] + [run_star: meta.star_index ? false : true], fasta_]
other: true
return null
}
fasta = fasta_branch.file


fasta_dict_branch = asset.branch { meta, _fasta ->
file: meta.fasta_dict
return [reduce(meta), meta.fasta_dict]
other: true
return null
}
fasta_dict = fasta_dict_branch.file

fasta_dict = asset.map { meta, _fasta -> meta.fasta_dict ? [reduce(meta), meta.fasta_dict] : null }

// If we already have intervals_bed, run_intervals is set to false (no need to generate intervals)
fasta_fai = asset.map { meta, _fasta -> meta.fasta_fai ? [reduce(meta) + [run_intervals: meta.intervals_bed ? false : true], meta.fasta_fai] : null }
fasta_fai_branch = asset.branch { meta, _fasta ->
file: meta.fasta_fai
return [reduce(meta) + [run_intervals: meta.intervals_bed ? false : true], meta.fasta_fai]
other: true
return null
}
fasta_fai = fasta_fai_branch.file


fasta_sizes_branch = asset.branch { meta, _fasta ->
file: meta.fasta_sizes
return [reduce(meta), meta.fasta_sizes]
other: true
return null
}
fasta_sizes = fasta_sizes_branch.file

fasta_sizes = asset.map { meta, _fasta -> meta.fasta_sizes ? [reduce(meta), meta.fasta_sizes] : null }

// If the gff ends with .gz, flag it for decompression (decompress_gff)
// If an asset already exists in the meta, set the matching run_<tool> flag to false so it is not re-created from the annotation-derived file (gff, gtf or transcript_fasta)
gff = asset.map { meta, fasta_ -> meta.gff ? [reduce(meta) + [decompress_gff: meta.gff.endsWith('.gz') ?: false] + [run_gffread: fasta_ && !meta.gtf ?: false] + [run_hisat2: meta.splice_sites ? false : true], meta.gff] : null }
gff_branch = asset.branch { meta, fasta_ ->
file: meta.gff
return [reduce(meta) + [decompress_gff: meta.gff.endsWith('.gz') ?: false] + [run_gffread: fasta_ && !meta.gtf ?: false] + [run_hisat2: meta.splice_sites ? false : true], meta.gff]
other: true
return null
}
gff = gff_branch.file


// If the gtf ends with .gz, flag it for decompression (decompress_gtf)
// If an asset already exists in the meta, set the matching run_<tool> flag to false so it is not re-created from the annotation-derived file (gff, gtf or transcript_fasta)
gtf = asset.map { meta, _fasta -> meta.gtf ? [reduce(meta) + [decompress_gtf: meta.gtf.endsWith('.gz') ?: false] + [run_hisat2: meta.splice_sites ? false : true], meta.gtf] : null }
gtf_branch = asset.branch { meta, _fasta ->
file: meta.gtf
return [reduce(meta) + [decompress_gtf: meta.gtf.endsWith('.gz') ?: false] + [run_hisat2: meta.splice_sites ? false : true], meta.gtf]
other: true
return null
}
gtf = gtf_branch.file


splice_sites_branch = asset.branch { meta, _fasta ->
file: meta.splice_sites
return [reduce(meta), meta.splice_sites]
other: true
return null
}
splice_sites = splice_sites_branch.file

splice_sites = asset.map { meta, _fasta -> meta.splice_sites ? [reduce(meta), meta.splice_sites] : null }

// If an asset already exists in the meta, set the matching run_<tool> flag to false so it is not re-created from the annotation-derived file (gff, gtf or transcript_fasta)
transcript_fasta = asset.map { meta, _fasta -> meta.transcript_fasta ? [reduce(meta) + [run_hisat2: meta.hisat2_index ? false : true] + [run_kallisto: meta.kallisto_index ? false : true] + [run_rsem: meta.rsem_index ? false : true] + [run_salmon: meta.salmon_index ? false : true] + [run_star: meta.star_index ? false : true], meta.transcript_fasta] : null }
transcript_fasta_branch = asset.branch { meta, _fasta ->
file: meta.transcript_fasta
return [reduce(meta) + [run_hisat2: meta.hisat2_index ? false : true] + [run_kallisto: meta.kallisto_index ? false : true] + [run_rsem: meta.rsem_index ? false : true] + [run_salmon: meta.salmon_index ? false : true] + [run_star: meta.star_index ? false : true], meta.transcript_fasta]
other: true
return null
}
transcript_fasta = transcript_fasta_branch.file


// Using transpose here because we want to catch vcf paths containing globs (needed for nf-core/Sarek)
// file() is used because it resolves globs, but it creates issues with publishing
// If we already have the vcf_tbi, then we don't need to index the vcf (run_tabix is set to false)
vcf = asset.map { meta, _fasta -> meta.vcf ? [reduce(meta) + [run_tabix: meta.vcf_tbi ? false : true], file(meta.vcf)] : null }.transpose()
vcf_branch = asset.branch { meta, _fasta ->
file: meta.vcf
return [reduce(meta) + [run_tabix: meta.vcf_tbi ? false : true], file(meta.vcf)]
other: true
return null
}
vcf = vcf_branch.file.transpose()

emit:
intervals_bed // channel: [meta, *.bed]
Expand Down
2 changes: 1 addition & 1 deletion subworkflows/local/index_vcf/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ workflow INDEX_VCF {
TABIX_TABIX(vcf_tabix)

vcf_tbi = TABIX_TABIX.out.tbi
versions = TABIX_TABIX.out.versions
versions = versions.mix(TABIX_TABIX.out.versions)
}

emit:
Expand Down
3 changes: 2 additions & 1 deletion tests/.nftignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
**/kallisto
**/RSEMIndex/**/Log.out
**/STARIndex/**/Log.out
**/SalmonIndex/**/ctable.bin
Expand All @@ -9,7 +8,9 @@
**/dragmap/*/hash_table.cfg
**/dragmap/*/hash_table.cfg.bin
**/dragmap/*/hash_table_stats.txt
**/kallisto
.DS_Store
index.json
multiqc/multiqc_data/multiqc.log
multiqc/multiqc_data/multiqc_data.json
multiqc/multiqc_data/multiqc_general_stats.txt
Expand Down
Loading

0 comments on commit 1c775ae

Please sign in to comment.