diff --git a/CHANGELOG.md b/CHANGELOG.md index d90cb637..5446abeb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,8 @@ Initial release of nf-core/references, created with the [nf-core](https://nf-co. - [41](https://github.com/nf-core/references/pull/41) - Better sarek tests - [41](https://github.com/nf-core/references/pull/41) - Better publishing for sarek related files - [43](https://github.com/nf-core/references/pull/43) - Fasta is no longer a required asset +- [48](https://github.com/nf-core/references/pull/48) - Simplify VCF tabix index generation and related assets +- [48](https://github.com/nf-core/references/pull/48) - Code refactoring (new subworkflows for each type of operation) - [49](https://github.com/nf-core/references/pull/49) - Better publishing for all files ### Fixed diff --git a/assets/genomes/test/default_extended.yml b/assets/genomes/test/default_extended.yml index fc69d058..a51628b3 100644 --- a/assets/genomes/test/default_extended.yml +++ b/assets/genomes/test/default_extended.yml @@ -1,22 +1,28 @@ - genome: "GRCh38_chr21" - dbsnp_vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz" fasta: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/GRCh38_chr21.fa" fasta_dict: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/GRCh38_chr21.dict" fasta_fai: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/GRCh38_chr21.fa.fai" fasta_sizes: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/GRCh38_chr21.fa.sizes" - germline_resource_vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz" gff: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/genes_chr21.gff" gtf: 
"https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/GRCh38_chr21.gtf" - known_indels_vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz" - known_snps_vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz" mito_name: "MT" readme: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/README.md" source: "nf-core/references" - source_dbsnp: "GATK_BUNDLE" - source_germline_resource: "GATK_BUNDLE" - source_known_indels: "GATK_BUNDLE" - source_known_snps: "GATK_BUNDLE" + source_vcf: "GATK_BUNDLE" species: "Homo_sapiens" splice_sites: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/genes_chr21.splice_sites.txt" transcript_fasta: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/genome.transcripts.fa" + vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz" # macs_gsize: "1.2e7" +- genome: "GRCh38_chr21" + readme: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/README.md" + source: "nf-core/references" + source_vcf: "GATK_BUNDLE" + species: "Homo_sapiens" + vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz" +- genome: "GRCh38_chr21" + readme: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/README.md" + source: "nf-core/references" + source_vcf: "GATK_BUNDLE" + species: "Homo_sapiens" + vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz" diff --git a/assets/genomes/test/default_full.yml b/assets/genomes/test/default_full.yml index 
27accf41..cc79561f 100644 --- a/assets/genomes/test/default_full.yml +++ b/assets/genomes/test/default_full.yml @@ -1,27 +1,32 @@ - genome: "GRCh38_chr21" - dbsnp_vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz" - dbsnp_vcf_tbi: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz.tbi" fasta: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/GRCh38_chr21.fa" fasta_dict: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/GRCh38_chr21.dict" fasta_fai: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/GRCh38_chr21.fa.fai" fasta_sizes: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/GRCh38_chr21.fa.sizes" - germline_resource_vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz" - germline_resource_vcf_tbi: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz.tbi" gff: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/genes_chr21.gff" gtf: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/GRCh38_chr21.gtf" intervals_bed: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/GRCh38_chr21.bed" - known_indels_vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz" - known_indels_vcf_tbi: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz.tbi" - known_snps_vcf: 
"https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz" - known_snps_vcf_tbi: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz.tbi" mito_name: "MT" readme: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/README.md" source: "nf-core/references" - source_dbsnp: "GATK_BUNDLE" - source_germline_resource: "GATK_BUNDLE" - source_known_indels: "GATK_BUNDLE" - source_known_snps: "GATK_BUNDLE" + source_vcf: "GATK_BUNDLE" species: "Homo_sapiens" splice_sites: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/genes_chr21.splice_sites.txt" transcript_fasta: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/genome.transcripts.fa" + vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz" + vcf_tbi: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz.tbi" # macs_gsize: "1.2e7" +- genome: "GRCh38_chr21" + readme: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/README.md" + source: "nf-core/references" + source_vcf: "GATK_BUNDLE" + species: "Homo_sapiens" + vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz" + vcf_tbi: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz.tbi" +- genome: "GRCh38_chr21" + readme: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/README.md" + source: "nf-core/references" + source_vcf: "GATK_BUNDLE" + species: "Homo_sapiens" + vcf: 
"https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz" + vcf_tbi: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz.tbi" diff --git a/assets/genomes/test/pipelines/sarek.yml b/assets/genomes/test/pipelines/sarek.yml index f039b0fe..c813acf8 100644 --- a/assets/genomes/test/pipelines/sarek.yml +++ b/assets/genomes/test/pipelines/sarek.yml @@ -1,12 +1,16 @@ - genome: "testdata.GRCh38_chr22" - dbsnp_vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz" fasta: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta" - germline_resource_vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz" - known_indels_vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz" - known_snps_vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz" + vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz" source: "nf-core/references" - source_dbsnp: "GATK_BUNDLE" - source_germline_resource: "GATK_BUNDLE" - source_known_indels: "GATK_BUNDLE" - source_known_snps: "GATK_BUNDLE" + source_vcf: "GATK_BUNDLE" species: "Homo_sapiens" +- genome: "testdata.GRCh38_chr22" + source_vcf: "GATK_BUNDLE" + species: "Homo_sapiens" + source: "nf-core/references" + vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz" +- genome: "testdata.GRCh38_chr22" + source_vcf: "GATK_BUNDLE" + species: "Homo_sapiens" + source: "nf-core/references" + vcf: 
"https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz" diff --git a/assets/genomes/test/pipelines/sarek_s3_muliple_glob.yml b/assets/genomes/test/pipelines/sarek_s3_muliple_glob.yml index 5eb4d04e..23b1bd90 100644 --- a/assets/genomes/test/pipelines/sarek_s3_muliple_glob.yml +++ b/assets/genomes/test/pipelines/sarek_s3_muliple_glob.yml @@ -1,11 +1,11 @@ # from sarek igenomes.config - genome: GRCh37 - known_indels_vcf: "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf.gz" - source_known_indels: "GATK_BUNDLE" + vcf: "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf.gz" + source_vcf: "GATK_BUNDLE" species: "Homo_sapiens" source: "GATK" - genome: GRCh38 - known_indels_vcf: "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz" - source_known_indels: "GATK_BUNDLE" + vcf: "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz" + source_vcf: "GATK_BUNDLE" species: "Homo_sapiens" source: "GATK" diff --git a/assets/schema_input.json b/assets/schema_input.json index 36293cc4..784a8827 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -18,25 +18,10 @@ "errorMessage": "Where the references came from", "meta": ["source"] }, - "source_dbsnp": { + "source_vcf": { "type": "string", "errorMessage": "Where the references came from", - "meta": ["source_dbsnp"] - }, - "source_germline_resource": { - "type": "string", - "errorMessage": "Where the references came from", - "meta": ["source_germline_resource"] - }, - "source_known_indels": { - "type": "string", - "errorMessage": "Where the references 
came from", - "meta": ["source_known_indels"] - }, - "source_known_snps": { - "type": "string", - "errorMessage": "Where the references came from", - "meta": ["source_known_snps"] + "meta": ["source_vcf"] }, "species": { "type": "string", @@ -88,22 +73,7 @@ "pattern": "^\\S+\\.f(ast|n)?a(\\.gz)?$", "errorMessage": "TODO" }, - "dbsnp_vcf": { - "type": "string", - "pattern": "^\\S+\\.vcf\\.gz$", - "errorMessage": "TODO" - }, - "known_snps_vcf": { - "type": "string", - "pattern": "^\\S+\\.vcf\\.gz$", - "errorMessage": "TODO" - }, - "known_indels_vcf": { - "type": "string", - "pattern": "^\\S+\\.vcf\\.gz$", - "errorMessage": "TODO" - }, - "germline_resource_vcf": { + "vcf": { "type": "string", "pattern": "^\\S+\\.vcf\\.gz$", "errorMessage": "TODO" diff --git a/main.nf b/main.nf index cad54fed..fd71eb93 100644 --- a/main.nf +++ b/main.nf @@ -102,19 +102,19 @@ workflow { } output { - 'bowtie1' { + 'bowtie1_index' { path { meta, index -> { file -> "${meta.species}/${meta.source}/${meta.id}/Sequence/BowtieIndex/" } } } - 'bowtie2' { + 'bowtie2_index' { path { meta, index -> { file -> "${meta.species}/${meta.source}/${meta.id}/Sequence/Bowtie2Index/" } } } - 'bwamem1' { + 'bwamem1_index' { path { meta, index -> { file -> "${meta.species}/${meta.source}/${meta.id}/Sequence/BWAIndex/" } } } - 'bwamem2' { + 'bwamem2_index' { path { meta, index -> { file -> "${meta.species}/${meta.source}/${meta.id}/Sequence/BWAmem2Index/" } } } - 'dragmap' { + 'dragmap_hashmap' { path { meta, index -> { file -> "${meta.species}/${meta.source}/${meta.id}/Sequence/dragmap/" } } } 'fasta' { @@ -129,56 +129,47 @@ output { 'fasta_sizes' { path { meta, sizes -> { file -> "${meta.species}/${meta.source}/${meta.id}/Sequence/WholeGenomeFasta/${file}" } } } - 'gffread' { + 'gtf' { path { meta, intervals -> { file -> "${meta.species}/${meta.source}/${meta.id}/Annotation/Genes/${file}" } } } - 'hisat2' { + 'hisat2_index' { path { meta, index -> { file -> 
"${meta.species}/${meta.source}/${meta.id}/Sequence/Hisat2Index/" } } } - 'intervals' { + 'intervals_bed' { path { meta, intervals -> { file -> "${meta.species}/${meta.source}/${meta.id}/Annotation/intervals/${file}" } } } - 'kallisto' { + 'kallisto_index' { path { meta, index -> { file -> "${meta.species}/${meta.source}/${meta.id}/Sequence/KallistoIndex/" } } } - 'msisensorpro' { + 'msisensorpro_list' { path { meta, index -> { file -> "${meta.species}/${meta.source}/${meta.id}/Annotation/msisensorpro/${file}" } } } 'multiqc_data' { - path 'multiqc' + path { folder -> { file -> "multiqc/multiqc_data" } } } 'multiqc_plots' { - path 'multiqc' + path { folder -> { file -> "multiqc/multiqc_plots" } } } 'multiqc_report' { - path 'multiqc' + path { folder -> { file -> "multiqc/multiqc_report" } } } - 'rsem' { + 'rsem_index' { path { meta, index -> { file -> "${meta.species}/${meta.source}/${meta.id}/Sequence/RSEMIndex/" } } } - 'salmon' { + 'salmon_index' { path { meta, index -> { file -> "${meta.species}/${meta.source}/${meta.id}/Sequence/SalmonIndex/" } } } 'splice_sites' { path { meta, txt -> { file -> "${meta.species}/${meta.source}/${meta.id}/Sequence/SpliceSites/${file}" } } } - 'star' { + 'star_index' { path { meta, index -> { file -> "${meta.species}/${meta.source}/${meta.id}/Sequence/STARIndex/" } } } 'transcript_fasta' { path { meta, fasta -> { file -> "${meta.species}/${meta.source}/${meta.id}/Sequence/TranscriptFasta/${file}" } } } - 'tabix_dbsnp' { - path { meta, vcf -> { file -> "${meta.species}/${meta.source}/${meta.id}/Annotation/${meta.source_dbsnp}/${file}" } } - } - 'tabix_germline_resource' { - path { meta, vcf -> { file -> "${meta.species}/${meta.source}/${meta.id}/Annotation/${meta.source_germline_resource}/${file}" } } - } - 'tabix_known_indels' { - path { meta, vcf -> { file -> "${meta.species}/${meta.source}/${meta.id}/Annotation/${meta.source_known_indels}/${file}" } } - } - 'tabix_known_snps' { - path { meta, vcf -> { file -> 
"${meta.species}/${meta.source}/${meta.id}/Annotation/${meta.source_known_snps}/${file}" } } + 'vcf_tbi' { + path { meta, tbi -> { file -> "${meta.species}/${meta.source}/${meta.id}/Annotation/${meta.source_vcf}/${file}" } } } } /* @@ -200,28 +191,25 @@ workflow NFCORE_REFERENCES { REFERENCES(input, tools) emit: - bowtie1 = REFERENCES.out.bowtie1 - bowtie2 = REFERENCES.out.bowtie2 - bwamem1 = REFERENCES.out.bwamem1 - bwamem2 = REFERENCES.out.bwamem2 - dbsnp_vcf_tbi = REFERENCES.out.dbsnp_vcf_tbi - dragmap = REFERENCES.out.dragmap - fasta = REFERENCES.out.fasta - fasta_dict = REFERENCES.out.fasta_dict - fasta_fai = REFERENCES.out.fasta_fai - germline_resource_vcf_tbi = REFERENCES.out.germline_resource_vcf_tbi - gffread = REFERENCES.out.gff_gtf - hisat2 = REFERENCES.out.hisat2 - hisat2_splice_sites = REFERENCES.out.hisat2_splice_sites - intervals = REFERENCES.out.intervals_bed - kallisto = REFERENCES.out.kallisto - known_indels_vcf_tbi = REFERENCES.out.known_indels_vcf_tbi - known_snps_vcf_tbi = REFERENCES.out.known_snps_vcf_tbi - msisensorpro = REFERENCES.out.msisensorpro - rsem = REFERENCES.out.rsem - rsem_transcript_fasta = REFERENCES.out.rsem_transcript_fasta - salmon = REFERENCES.out.salmon - sizes = REFERENCES.out.sizes - star = REFERENCES.out.star - versions = REFERENCES.out.versions + bowtie1_index = REFERENCES.out.bowtie1_index + bowtie2_index = REFERENCES.out.bowtie2_index + bwamem1_index = REFERENCES.out.bwamem1_index + bwamem2_index = REFERENCES.out.bwamem2_index + dragmap_hashmap = REFERENCES.out.dragmap_hashmap + fasta = REFERENCES.out.fasta + fasta_dict = REFERENCES.out.fasta_dict + fasta_fai = REFERENCES.out.fasta_fai + fasta_sizes = REFERENCES.out.fasta_sizes + gtf = REFERENCES.out.gtf + hisat2_index = REFERENCES.out.hisat2_index + splice_sites = REFERENCES.out.splice_sites + intervals_bed = REFERENCES.out.intervals_bed + kallisto_index = REFERENCES.out.kallisto_index + msisensorpro_list = REFERENCES.out.msisensorpro_list + rsem_index = 
REFERENCES.out.rsem_index + transcript_fasta = REFERENCES.out.transcript_fasta + salmon_index = REFERENCES.out.salmon_index + star_index = REFERENCES.out.star_index + vcf_tbi = REFERENCES.out.vcf_tbi + versions = REFERENCES.out.versions } diff --git a/subworkflows/local/create_align_index/main.nf b/subworkflows/local/create_align_index/main.nf new file mode 100644 index 00000000..d087b9f5 --- /dev/null +++ b/subworkflows/local/create_align_index/main.nf @@ -0,0 +1,67 @@ +include { BOWTIE_BUILD as BOWTIE1_BUILD } from '../../../modules/nf-core/bowtie/build' +include { BOWTIE2_BUILD } from '../../../modules/nf-core/bowtie2/build' +include { BWAMEM2_INDEX } from '../../../modules/nf-core/bwamem2/index' +include { BWA_INDEX as BWAMEM1_INDEX } from '../../../modules/nf-core/bwa/index' +include { DRAGMAP_HASHTABLE } from '../../../modules/nf-core/dragmap/hashtable' + +workflow CREATE_ALIGN_INDEX { + take: + fasta // channel: [meta, fasta] + run_bowtie1 // boolean: true/false + run_bowtie2 // boolean: true/false + run_bwamem1 // boolean: true/false + run_bwamem2 // boolean: true/false + run_dragmap // boolean: true/false + + main: + bowtie1_index = Channel.empty() + bowtie2_index = Channel.empty() + bwamem1_index = Channel.empty() + bwamem2_index = Channel.empty() + dragmap_hashmap = Channel.empty() + + versions = Channel.empty() + + if (run_bowtie1) { + BOWTIE1_BUILD(fasta) + + bowtie1_index = BOWTIE1_BUILD.out.index + versions = versions.mix(BOWTIE1_BUILD.out.versions) + } + + if (run_bowtie2) { + BOWTIE2_BUILD(fasta) + + bowtie2_index = BOWTIE2_BUILD.out.index + versions = versions.mix(BOWTIE2_BUILD.out.versions) + } + + if (run_bwamem1) { + BWAMEM1_INDEX(fasta) + + bwamem1_index = BWAMEM1_INDEX.out.index + versions = versions.mix(BWAMEM1_INDEX.out.versions) + } + + if (run_bwamem2) { + BWAMEM2_INDEX(fasta) + + bwamem2_index = BWAMEM2_INDEX.out.index + versions = versions.mix(BWAMEM2_INDEX.out.versions) + } + + if (run_dragmap) { + DRAGMAP_HASHTABLE(fasta) + + 
dragmap_hashmap = DRAGMAP_HASHTABLE.out.hashmap + versions = versions.mix(DRAGMAP_HASHTABLE.out.versions) + } + + emit: + bowtie1_index // channel: [meta, BowtieIndex/] + bowtie2_index // channel: [meta, Bowtie2Index/] + bwamem1_index // channel: [meta, BWAmemIndex/] + bwamem2_index // channel: [meta, BWAmem2memIndex/] + dragmap_hashmap // channel: [meta, DragmapHashtable/] + versions // channel: [versions.yml] +} diff --git a/subworkflows/local/create_align_index_with_gff/main.nf b/subworkflows/local/create_align_index_with_gff/main.nf new file mode 100644 index 00000000..0ca28be4 --- /dev/null +++ b/subworkflows/local/create_align_index_with_gff/main.nf @@ -0,0 +1,132 @@ +include { GFFREAD } from '../../../modules/nf-core/gffread' +include { HISAT2_BUILD } from '../../../modules/nf-core/hisat2/build' +include { HISAT2_EXTRACTSPLICESITES } from '../../../modules/nf-core/hisat2/extractsplicesites' +include { KALLISTO_INDEX } from '../../../modules/nf-core/kallisto/index' +include { RSEM_PREPAREREFERENCE as MAKE_TRANSCRIPTS_FASTA } from '../../../modules/nf-core/rsem/preparereference' +include { RSEM_PREPAREREFERENCE as RSEM_PREPAREREFERENCE_GENOME } from '../../../modules/nf-core/rsem/preparereference' +include { SALMON_INDEX } from '../../../modules/nf-core/salmon/index' +include { STAR_GENOMEGENERATE } from '../../../modules/nf-core/star/genomegenerate' + +workflow CREATE_ALIGN_INDEX_WITH_GFF { + take: + fasta // channel: [meta, fasta] + input_gff // channel: [meta, gff] + input_gtf // channel: [meta, gtf] + input_splice_sites // channel: [meta, splice_sites] + input_transcript_fasta // channel: [meta, transcript_fasta] + run_hisat2 // boolean: true/false + run_hisat2_extractsplicesites // boolean: true/false + run_kallisto // boolean: true/false + run_rsem // boolean: true/false + run_rsem_make_transcript_fasta // boolean: true/false + run_salmon // boolean: true/false + run_star // boolean: true/false + + main: + gtf = Channel.empty() + hisat2_index = 
Channel.empty() + kallisto_index = Channel.empty() + rsem_index = Channel.empty() + salmon_index = Channel.empty() + splice_sites = Channel.empty() + star_index = Channel.empty() + transcript_fasta = Channel.empty() + + versions = Channel.empty() + + if (run_hisat2 || run_kallisto || run_rsem || run_rsem_make_transcript_fasta || run_salmon || run_star) { + + GFFREAD( + input_gff, + [] + ) + + versions = versions.mix(GFFREAD.out.versions) + + gtf = input_gtf + .mix(GFFREAD.out.gtf) + .groupTuple() + .map { meta, file -> + return file[1] ? [meta, file[1]] : [meta, file] + } + + if (run_hisat2 || run_hisat2_extractsplicesites) { + gtf_hisat2 = gtf.map { meta, map_gtf -> + return meta.run_hisat2 ? [meta, map_gtf] : null + } + + HISAT2_EXTRACTSPLICESITES(gtf_hisat2) + + splice_sites = input_splice_sites.mix(HISAT2_EXTRACTSPLICESITES.out.txt) + + if (run_hisat2) { + HISAT2_BUILD( + fasta, + gtf, + splice_sites + ) + + hisat2_index = HISAT2_BUILD.out.index + + versions = versions.mix(HISAT2_EXTRACTSPLICESITES.out.versions) + versions = versions.mix(HISAT2_BUILD.out.versions) + } + } + + if (run_kallisto || run_rsem_make_transcript_fasta || run_salmon) { + fasta_make_transcripts_fasta = fasta.map { meta, map_fasta -> + return meta.run_rsem_make_transcript_fasta ? 
[meta, map_fasta] : null + } + + MAKE_TRANSCRIPTS_FASTA( + fasta_make_transcripts_fasta, + gtf + ) + versions = versions.mix(MAKE_TRANSCRIPTS_FASTA.out.versions) + + transcript_fasta = input_transcript_fasta.mix(MAKE_TRANSCRIPTS_FASTA.out.transcript_fasta) + + if (run_kallisto) { + KALLISTO_INDEX(transcript_fasta) + + kallisto_index = KALLISTO_INDEX.out.index + versions = versions.mix(KALLISTO_INDEX.out.versions) + } + + if (run_salmon) { + SALMON_INDEX( + fasta, + transcript_fasta + ) + + salmon_index = SALMON_INDEX.out.index + versions = versions.mix(SALMON_INDEX.out.versions) + } + } + + if (run_rsem) { + RSEM_PREPAREREFERENCE_GENOME(fasta, gtf) + + rsem_index = RSEM_PREPAREREFERENCE_GENOME.out.index + versions = versions.mix(RSEM_PREPAREREFERENCE_GENOME.out.versions) + } + + if (run_star) { + STAR_GENOMEGENERATE(fasta, gtf) + + star_index = STAR_GENOMEGENERATE.out.index + versions = versions.mix(STAR_GENOMEGENERATE.out.versions) + } + } + + emit: + gtf // channel: [meta, gtf] + hisat2_index // channel: [meta, Hisat2Index/] + kallisto_index // channel: [meta, KallistoIndex] + rsem_index // channel: [meta, RSEMIndex/] + salmon_index // channel: [meta, SalmonIndex/] + splice_sites // channel: [meta, *.splice_sites.txt] + star_index // channel: [meta, STARIndex/] + transcript_fasta // channel: [meta, *.transcripts.fasta] + versions // channel: [versions.yml] +} diff --git a/subworkflows/local/index_fasta/main.nf b/subworkflows/local/index_fasta/main.nf new file mode 100644 index 00000000..ab8e9b5a --- /dev/null +++ b/subworkflows/local/index_fasta/main.nf @@ -0,0 +1,72 @@ +include { GATK4_CREATESEQUENCEDICTIONARY } from '../../../modules/nf-core/gatk4/createsequencedictionary' +include { GAWK as BUILD_INTERVALS } from '../../../modules/nf-core/gawk' +include { MSISENSORPRO_SCAN } from '../../../modules/nf-core/msisensorpro/scan' +include { SAMTOOLS_FAIDX } from '../../../modules/nf-core/samtools/faidx' + +workflow INDEX_FASTA { + take: + fasta // channel: [meta, 
fasta] + input_fasta_fai // channel: [meta, fasta_fai] + run_createsequencedictionary // boolean: true/false + run_faidx // boolean: true/false + run_intervals // boolean: true/false + run_msisensorpro // boolean: true/false + run_sizes // boolean: true/false + + main: + intervals_bed = Channel.empty() + fasta_fai = Channel.empty() + fasta_dict = Channel.empty() + fasta_sizes = Channel.empty() + msisensorpro_list = Channel.empty() + + versions = Channel.empty() + + if (run_createsequencedictionary) { + GATK4_CREATESEQUENCEDICTIONARY(fasta) + + fasta_dict = GATK4_CREATESEQUENCEDICTIONARY.out.dict + versions = versions.mix(GATK4_CREATESEQUENCEDICTIONARY.out.versions) + } + + if (run_faidx || run_intervals || run_sizes) { + fasta_samtools = fasta.map { meta, map_fasta -> + return meta.run_faidx ? [meta, map_fasta] : null + } + + SAMTOOLS_FAIDX( + fasta_samtools, + [[id: 'no_fai'], []], + run_sizes + ) + + fasta_fai = input_fasta_fai.mix(SAMTOOLS_FAIDX.out.fai) + fasta_sizes = SAMTOOLS_FAIDX.out.sizes + versions = versions.mix(SAMTOOLS_FAIDX.out.versions) + + if (run_intervals) { + fasta_fai_intervals = fasta_fai.map { meta, map_fasta_fai -> + return meta.run_intervals ? 
[meta, map_fasta_fai] : null + } + + BUILD_INTERVALS(fasta_fai_intervals, []) + intervals_bed = BUILD_INTERVALS.out.output + versions = versions.mix(BUILD_INTERVALS.out.versions) + } + } + + if (run_msisensorpro) { + MSISENSORPRO_SCAN(fasta) + + msisensorpro_list = MSISENSORPRO_SCAN.out.list + versions = versions.mix(MSISENSORPRO_SCAN.out.versions) + } + + emit: + fasta_dict // channel: [meta, *.fa(sta).dict] + fasta_fai // channel: [meta, *.fa(sta).fai] + fasta_sizes // channel: [meta, *.fa(sta).sizes] + intervals_bed // channel: [meta, *.bed] + msisensorpro_list // channel: [meta, *.list] + versions // channel: [versions.yml] +} diff --git a/subworkflows/local/index_vcf/main.nf b/subworkflows/local/index_vcf/main.nf new file mode 100644 index 00000000..48bab948 --- /dev/null +++ b/subworkflows/local/index_vcf/main.nf @@ -0,0 +1,23 @@ +include { TABIX_TABIX } from '../../../modules/nf-core/tabix/tabix' + +workflow INDEX_VCF { + take: + vcf // channel: [meta, vcf] + run_tabix // boolean: true/false + + main: + vcf_tbi = Channel.empty() + versions = Channel.empty() + + + if (run_tabix) { + TABIX_TABIX(vcf) + + vcf_tbi = TABIX_TABIX.out.tbi + versions = TABIX_TABIX.out.versions + } + + emit: + vcf_tbi // channel: [meta, *.vcf.tbi] + versions // channel: [versions.yml] +} diff --git a/subworkflows/local/samplesheet_to_channel/main.nf b/subworkflows/local/samplesheet_to_channel/main.nf new file mode 100644 index 00000000..611771cc --- /dev/null +++ b/subworkflows/local/samplesheet_to_channel/main.nf @@ -0,0 +1,60 @@ +workflow SAMPLESHEET_TO_CHANNEL { + take: + reference // channel: [meta, intervals_bed, fasta, fasta_dict, fasta_fai, fasta_sizes, gff, gtf, splice_sites, transcript_fasta, vcf, readme, bed12, mito_name, macs_gsize] + + main: + + intervals_bed = reference.map { meta, input_intervals_bed, input_fasta, input_fasta_dict, input_fasta_fai, input_fasta_sizes, input_gff, input_gtf, input_splice_sites, input_transcript_fasta, input_vcf, input_readme, input_bed12, 
input_mito_name, input_macs_gsize -> + return input_intervals_bed ? [meta, input_intervals_bed] : null + } + + fasta = reference.map { meta, input_intervals_bed, input_fasta, input_fasta_dict, input_fasta_fai, input_fasta_sizes, input_gff, input_gtf, input_splice_sites, input_transcript_fasta, input_vcf, input_readme, input_bed12, input_mito_name, input_macs_gsize -> + return input_fasta ? [meta + [run_faidx: input_fasta_fai && input_fasta_sizes ? false : true] + [run_intervals: input_intervals_bed ? false : true] + [run_rsem_make_transcript_fasta: input_transcript_fasta ? false : true], input_fasta] : null + } + + fasta_dict = reference.map { meta, input_intervals_bed, input_fasta, input_fasta_dict, input_fasta_fai, input_fasta_sizes, input_gff, input_gtf, input_splice_sites, input_transcript_fasta, input_vcf, input_readme, input_bed12, input_mito_name, input_macs_gsize -> + return input_fasta_dict ? [meta, input_fasta_dict] : null + } + + fasta_fai = reference.map { meta, input_intervals_bed, input_fasta, input_fasta_dict, input_fasta_fai, input_fasta_sizes, input_gff, input_gtf, input_splice_sites, input_transcript_fasta, input_vcf, input_readme, input_bed12, input_mito_name, input_macs_gsize -> + return input_fasta_fai ? [meta + [run_intervals: input_intervals_bed ? false : true], input_fasta_fai] : null + } + + fasta_sizes = reference.map { meta, input_intervals_bed, input_fasta, input_fasta_dict, input_fasta_fai, input_fasta_sizes, input_gff, input_gtf, input_splice_sites, input_transcript_fasta, input_vcf, input_readme, input_bed12, input_mito_name, input_macs_gsize -> + return input_fasta_sizes ? [meta, input_fasta_sizes] : null + } + + gff = reference.map { meta, input_intervals_bed, input_fasta, input_fasta_dict, input_fasta_fai, input_fasta_sizes, input_gff, input_gtf, input_splice_sites, input_transcript_fasta, input_vcf, input_readme, input_bed12, input_mito_name, input_macs_gsize -> + return input_gff && !input_gtf ? 
[meta + [run_hisat2: input_splice_sites ? false : true], input_gff] : null + } + + gtf = reference.map { meta, input_intervals_bed, input_fasta, input_fasta_dict, input_fasta_fai, input_fasta_sizes, input_gff, input_gtf, input_splice_sites, input_transcript_fasta, input_vcf, input_readme, input_bed12, input_mito_name, input_macs_gsize -> + return input_gtf ? [meta + [run_hisat2: input_splice_sites ? false : true], input_gtf] : null + } + + splice_sites = reference.map { meta, input_intervals_bed, input_fasta, input_fasta_dict, input_fasta_fai, input_fasta_sizes, input_gff, input_gtf, input_splice_sites, input_transcript_fasta, input_vcf, input_readme, input_bed12, input_mito_name, input_macs_gsize -> + return input_splice_sites ? [meta, input_splice_sites] : null + } + + transcript_fasta = reference.map { meta, input_intervals_bed, input_fasta, input_fasta_dict, input_fasta_fai, input_fasta_sizes, input_gff, input_gtf, input_splice_sites, input_transcript_fasta, input_vcf, input_readme, input_bed12, input_mito_name, input_macs_gsize -> + return input_transcript_fasta ? [meta, input_transcript_fasta] : null + } + + vcf = reference + .map { meta, input_intervals_bed, input_fasta, input_fasta_dict, input_fasta_fai, input_fasta_sizes, input_gff, input_gtf, input_splice_sites, input_transcript_fasta, input_vcf, input_readme, input_bed12, input_mito_name, input_macs_gsize -> + return input_vcf ? 
[meta, file(input_vcf)] : null + } + .transpose() + + emit: + intervals_bed + fasta + fasta_dict + fasta_fai + fasta_sizes + gff + gtf + splice_sites + transcript_fasta + vcf +} diff --git a/subworkflows/local/utils_nfcore_references_pipeline/main.nf b/subworkflows/local/utils_nfcore_references_pipeline/main.nf index c6b006c9..7885e2d9 100644 --- a/subworkflows/local/utils_nfcore_references_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_references_pipeline/main.nf @@ -8,14 +8,14 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { UTILS_NFSCHEMA_PLUGIN } from '../../nf-core/utils_nfschema_plugin' -include { paramsSummaryMap } from 'plugin/nf-schema' -include { samplesheetToList } from 'plugin/nf-schema' -include { completionEmail } from '../../nf-core/utils_nfcore_pipeline' -include { completionSummary } from '../../nf-core/utils_nfcore_pipeline' -include { imNotification } from '../../nf-core/utils_nfcore_pipeline' -include { UTILS_NFCORE_PIPELINE } from '../../nf-core/utils_nfcore_pipeline' -include { UTILS_NEXTFLOW_PIPELINE } from '../../nf-core/utils_nextflow_pipeline' +include { UTILS_NFSCHEMA_PLUGIN } from '../../nf-core/utils_nfschema_plugin' +include { paramsSummaryMap } from 'plugin/nf-schema' +include { samplesheetToList } from 'plugin/nf-schema' +include { completionEmail } from '../../nf-core/utils_nfcore_pipeline' +include { completionSummary } from '../../nf-core/utils_nfcore_pipeline' +include { imNotification } from '../../nf-core/utils_nfcore_pipeline' +include { UTILS_NFCORE_PIPELINE } from '../../nf-core/utils_nfcore_pipeline' +include { UTILS_NEXTFLOW_PIPELINE } from '../../nf-core/utils_nextflow_pipeline' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -24,7 +24,6 @@ include { UTILS_NEXTFLOW_PIPELINE } from '../../nf-core/utils_nextflow_pipelin */ workflow PIPELINE_INITIALISATION { - take: version // boolean: Display version and exit 
validate_params // boolean: Boolean whether to validate parameters against the schema at runtime @@ -40,7 +39,7 @@ workflow PIPELINE_INITIALISATION { // // Print version and exit if required and dump pipeline parameters to JSON file // - UTILS_NEXTFLOW_PIPELINE ( + UTILS_NEXTFLOW_PIPELINE( version, true, outdir, @@ -50,7 +49,7 @@ workflow PIPELINE_INITIALISATION { // // Validate parameters and generate parameter summary to stdout // - UTILS_NFSCHEMA_PLUGIN ( + UTILS_NFSCHEMA_PLUGIN( workflow, validate_params, null @@ -59,14 +58,14 @@ workflow PIPELINE_INITIALISATION { // // Check config provided to the pipeline // - UTILS_NFCORE_PIPELINE ( + UTILS_NFCORE_PIPELINE( nextflow_cli_args ) // // Create channel from input file provided through params.input // - ch_samplesheet = Channel.fromList(samplesheetToList(params.input, "${projectDir}/assets/schema_input.json")) + ch_samplesheet = Channel.fromList(samplesheetToList(input, "${projectDir}/assets/schema_input.json")) emit: samplesheet = ch_samplesheet @@ -80,7 +79,6 @@ workflow PIPELINE_INITIALISATION { */ workflow PIPELINE_COMPLETION { - take: email // string: email address email_on_fail // string: email address sent on pipeline failure @@ -116,7 +114,7 @@ workflow PIPELINE_COMPLETION { } workflow.onError { - log.error "Pipeline failed. Please refer to troubleshooting docs: https://nf-co.re/docs/usage/troubleshooting" + log.error("Pipeline failed. Please refer to troubleshooting docs: https://nf-co.re/docs/usage/troubleshooting") } } @@ -133,11 +131,11 @@ def toolCitationText() { // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "", // Uncomment function in methodsDescriptionText to render in MultiQC report def citation_text = [ - "Tools used in the workflow included:", - "FastQC (Andrews 2010),", - "MultiQC (Ewels et al. 2016)", - "." 
- ].join(' ').trim() + "Tools used in the workflow included:", + "FastQC (Andrews 2010),", + "MultiQC (Ewels et al. 2016)", + "." + ].join(' ').trim() return citation_text } @@ -147,9 +145,9 @@ def toolBibliographyText() { // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "<li>Author (2023) Pub name, Journal, DOI</li>" : "", // Uncomment function in methodsDescriptionText to render in MultiQC report def reference_text = [ - "<li>Andrews S, (2010) FastQC, URL: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).</li>", - "<li>Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. doi: /10.1093/bioinformatics/btw354</li>" - ].join(' ').trim() + "<li>Andrews S, (2010) FastQC, URL: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).</li>", + "<li>Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. doi: /10.1093/bioinformatics/btw354</li>" + ].join(' ').trim() return reference_text } @@ -171,7 +169,10 @@ def methodsDescriptionText(mqc_methods_yaml) { temp_doi_ref += "(doi: ${doi_ref.replace("https://doi.org/", "").replace(" ", "")}), " } meta["doi_text"] = temp_doi_ref.substring(0, temp_doi_ref.length() - 2) - } else meta["doi_text"] = "" + } + else { + meta["doi_text"] = "" + } meta["nodoi_text"] = meta.manifest_map.doi ? "" : "
<li>If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used.</li>" // Tool references @@ -185,9 +186,8 @@ def methodsDescriptionText(mqc_methods_yaml) { def methods_text = mqc_methods_yaml.text - def engine = new groovy.text.SimpleTemplateEngine() + def engine = new groovy.text.SimpleTemplateEngine() def description_html = engine.createTemplate(methods_text).make(meta) return description_html.toString() } - diff --git a/tests/multiple.nf.test.snap b/tests/multiple.nf.test.snap index 126b0cd8..72f8eaa6 100644 --- a/tests/multiple.nf.test.snap +++ b/tests/multiple.nf.test.snap @@ -3,7 +3,7 @@ "content": [ 5, { - "TABIX_KNOWN_INDELS": { + "TABIX_TABIX": { "tabix": 1.2 }, "Workflow": { @@ -46,7 +46,7 @@ "nf-test": "0.9.2", "nextflow": "24.10.2" }, - "timestamp": "2024-11-28T18:25:56.49079277" + "timestamp": "2024-12-02T12:09:53.42046409" }, "Run with profile test for multiple | --input assets/genomes/test/default_multiple.yml": { "content": [ @@ -135,6 +135,6 @@ "nf-test": "0.9.2", "nextflow": "24.10.2" }, - "timestamp": "2024-11-28T16:56:49.442608122" + "timestamp": "2024-12-02T12:27:31.034085088" } -} +} \ No newline at end of file diff --git a/tests/sarek.nf.test.snap b/tests/sarek.nf.test.snap index 3dcda1c5..b51a2a0a 100644 --- a/tests/sarek.nf.test.snap +++ b/tests/sarek.nf.test.snap @@ -1,7 +1,7 @@ { "Run with profile test for sarek | --input assets/genomes/test/pipelines/sarek.yml": { "content": [ - 12, + 11, { "BUILD_INTERVALS": { "gawk": "5.3.0" @@ -24,16 +24,7 @@ "SAMTOOLS_FAIDX": { "samtools": 1.21 }, - "TABIX_DBSNP": { - "tabix": 1.2 - }, - "TABIX_GERMLINE_RESOURCE": { - "tabix": 1.2 - }, - "TABIX_KNOWN_INDELS": { - "tabix": 1.2 - }, - "TABIX_KNOWN_SNPS": { + "TABIX_TABIX": { "tabix": 1.2 }, "Workflow": { @@ -120,6 +111,6 @@ "nf-test": "0.9.2", "nextflow": "24.10.2" }, - "timestamp": "2024-11-28T15:48:25.039272086" + "timestamp": "2024-12-02T11:21:36.124482055" } } \ No newline at end of file diff --git a/tests/tabix.nf.test.snap b/tests/tabix.nf.test.snap index 59c076f4..1cb24172 100644 --- 
a/tests/tabix.nf.test.snap +++ b/tests/tabix.nf.test.snap @@ -1,18 +1,9 @@ { "Run with profile test with tabix | --input assets/genomes/test/default_extended.yml": { "content": [ - 5, + 4, { - "TABIX_DBSNP": { - "tabix": 1.2 - }, - "TABIX_GERMLINE_RESOURCE": { - "tabix": 1.2 - }, - "TABIX_KNOWN_INDELS": { - "tabix": 1.2 - }, - "TABIX_KNOWN_SNPS": { + "TABIX_TABIX": { "tabix": 1.2 }, "Workflow": { @@ -51,22 +42,13 @@ "nf-test": "0.9.2", "nextflow": "24.10.2" }, - "timestamp": "2024-11-28T14:57:48.388939938" + "timestamp": "2024-12-02T11:21:55.801035083" }, "Run with profile test with tabix | --input assets/genomes/test/default_full.yml": { "content": [ - 5, + 4, { - "TABIX_DBSNP": { - "tabix": 1.2 - }, - "TABIX_GERMLINE_RESOURCE": { - "tabix": 1.2 - }, - "TABIX_KNOWN_INDELS": { - "tabix": 1.2 - }, - "TABIX_KNOWN_SNPS": { + "TABIX_TABIX": { "tabix": 1.2 }, "Workflow": { @@ -105,6 +87,6 @@ "nf-test": "0.9.2", "nextflow": "24.10.2" }, - "timestamp": "2024-11-28T14:58:13.905623169" + "timestamp": "2024-12-02T11:22:15.160055968" } } diff --git a/workflows/references/main.nf b/workflows/references/main.nf index 53fe1c57..298d44f3 100644 --- a/workflows/references/main.nf +++ b/workflows/references/main.nf @@ -1,362 +1,139 @@ -include { BOWTIE2_BUILD } from '../../modules/nf-core/bowtie2/build' -include { BOWTIE_BUILD as BOWTIE1_BUILD } from '../../modules/nf-core/bowtie/build' -include { BWAMEM2_INDEX } from '../../modules/nf-core/bwamem2/index' -include { BWA_INDEX as BWAMEM1_INDEX } from '../../modules/nf-core/bwa/index' -include { DRAGMAP_HASHTABLE } from '../../modules/nf-core/dragmap/hashtable' -include { GATK4_CREATESEQUENCEDICTIONARY } from '../../modules/nf-core/gatk4/createsequencedictionary' -include { GAWK as BUILD_INTERVALS } from '../../modules/nf-core/gawk' -include { GFFREAD } from '../../modules/nf-core/gffread' -include { HISAT2_BUILD } from '../../modules/nf-core/hisat2/build' -include { HISAT2_EXTRACTSPLICESITES } from 
'../../modules/nf-core/hisat2/extractsplicesites' -include { KALLISTO_INDEX } from '../../modules/nf-core/kallisto/index' -include { MSISENSORPRO_SCAN } from '../../modules/nf-core/msisensorpro/scan' -include { RSEM_PREPAREREFERENCE as MAKE_TRANSCRIPTS_FASTA } from '../../modules/nf-core/rsem/preparereference' -include { RSEM_PREPAREREFERENCE as RSEM_PREPAREREFERENCE_GENOME } from '../../modules/nf-core/rsem/preparereference' -include { SALMON_INDEX } from '../../modules/nf-core/salmon/index' -include { SAMTOOLS_FAIDX } from '../../modules/nf-core/samtools/faidx' -include { STAR_GENOMEGENERATE } from '../../modules/nf-core/star/genomegenerate' -include { TABIX_TABIX as TABIX_DBSNP } from '../../modules/nf-core/tabix/tabix' -include { TABIX_TABIX as TABIX_GERMLINE_RESOURCE } from '../../modules/nf-core/tabix/tabix' -include { TABIX_TABIX as TABIX_KNOWN_INDELS } from '../../modules/nf-core/tabix/tabix' -include { TABIX_TABIX as TABIX_KNOWN_SNPS } from '../../modules/nf-core/tabix/tabix' - -// include { BBMAP_BBSPLIT } from '../../modules/nf-core/bbmap/bbsplit' -// include { CUSTOM_CATADDITIONALFASTA } from '../../modules/nf-core/custom/catadditionalfasta' -// include { SORTMERNA as SORTMERNA_INDEX } from '../../modules/nf-core/sortmerna' +include { CREATE_ALIGN_INDEX } from '../../subworkflows/local/create_align_index' +include { CREATE_ALIGN_INDEX_WITH_GFF } from '../../subworkflows/local/create_align_index_with_gff' +include { INDEX_FASTA } from '../../subworkflows/local/index_fasta' +include { INDEX_VCF } from '../../subworkflows/local/index_vcf' +include { SAMPLESHEET_TO_CHANNEL } from '../../subworkflows/local/samplesheet_to_channel' workflow REFERENCES { take: - reference // fasta, gff, gtf, splice_sites, transcript_fasta + reference // fasta, gff, gtf, splice_sites, transcript_fasta, vcf tools // 
bowtie|bowtie2|bwamem1|bwamem2|createsequencedictionary|dragmap|faidx|gffread|intervals|hisat2|hisat2_extractsplicesites|kallisto|msisensorpro|rsem|rsem_make_transcripts_fasta|salmon|star|tabix main: - ch_bowtie1 = Channel.empty() - ch_bowtie2 = Channel.empty() - ch_dbsnp_vcf_tbi = Channel.empty() - ch_fasta_fai = Channel.empty() - ch_germline_resource_vcf_tbi = Channel.empty() - ch_gff_gtf = Channel.empty() - ch_hisat2 = Channel.empty() - ch_hisat2_splice_sites = Channel.empty() - ch_intervals_bed = Channel.empty() - ch_kallisto = Channel.empty() - ch_known_indels_vcf_tbi = Channel.empty() - ch_known_snps_vcf_tbi = Channel.empty() - ch_msisensorpro = Channel.empty() - ch_rsem = Channel.empty() - ch_rsem_transcript_fasta = Channel.empty() - ch_salmon = Channel.empty() - ch_sizes = Channel.empty() - ch_star = Channel.empty() versions = Channel.empty() - input = reference.multiMap { meta, intervals_bed, fasta, fasta_dict, fasta_fai, fasta_sizes, gff, gtf, splice_sites, transcript_fasta, dbsnp_vcf, known_snps_vcf, known_indels_vcf, germline_resource_vcf, readme, bed12, mito_name, macs_gsize -> - fasta: [meta, fasta] - fasta_dict: [meta, fasta_dict] - fasta_fai: [meta, fasta_fai] - fasta_sizes: [meta, fasta_sizes] - gff: [meta, gff] - gtf: [meta, gtf] - splice_sites: [meta, splice_sites] - transcript_fasta: [meta, transcript_fasta] - readme: [meta, readme] - bed12: [meta, bed12] - mito_name: [meta, mito_name] - macs_gsize: [meta, macs_gsize] - intervals_bed: [meta, intervals_bed] - bwamem1_fasta: tools.contains('bwamem1') && fasta ? [meta, file(fasta)] : [[:], []] - bwamem2_fasta: tools.contains('bwamem2') && fasta ? [meta, file(fasta)] : [[:], []] - createsequencedictionary_fasta: tools.contains('createsequencedictionary') && fasta ? [meta, file(fasta)] : [[:], []] - dragmap_fasta: tools.contains('dragmap') && fasta ? 
[meta, file(fasta)] : [[:], []] - fasta_samtools: ((tools.contains('faidx') || tools.contains('sizes')) && !(fasta_fai || fasta_sizes) && fasta) || (tools.contains('intervals') && !(fasta_fai || intervals_bed)) ? [meta, file(fasta)] : [[:], []] - dbsnp_vcf: tools.contains('tabix') && dbsnp_vcf ? [meta, file(dbsnp_vcf)] : [[:], []] - known_snps_vcf: tools.contains('tabix') && known_snps_vcf ? [meta, file(known_snps_vcf)] : [[:], []] - known_indels_vcf: tools.contains('tabix') && known_indels_vcf ? [meta, file(known_indels_vcf)] : [[:], []] - germline_resource_vcf: tools.contains('tabix') && germline_resource_vcf ? [meta, file(germline_resource_vcf)] : [[:], []] - gff_gffread: !gtf && gff && (tools.contains('gffread') || tools.contains('hisat2') || tools.contains('kallisto') || tools.contains('rsem') || tools.contains('salmon') || tools.contains('star')) ? [meta, file(gff)] : [[:], []] - } - // I should be able to output null instead of `[[:], []] and have that registered as an empty channel and not trigger downstream processes - // but not working currently - - if (tools && tools.split(',').contains('bowtie1')) { - BOWTIE1_BUILD(input.fasta) - - ch_bowtie1 = BOWTIE1_BUILD.out.index - versions = versions.mix(BOWTIE1_BUILD.out.versions) - } - - if (tools && tools.split(',').contains('bowtie2')) { - BOWTIE2_BUILD(input.fasta) - - ch_bowtie2 = BOWTIE2_BUILD.out.index - versions = versions.mix(BOWTIE2_BUILD.out.versions) - } - - // the whole map -> null should be removed once I managed to make it work properly - BWAMEM1_INDEX( - input.bwamem1_fasta.map { meta, file -> - return file ? [meta, file] : null - } - ) - ch_bwamem1 = BWAMEM1_INDEX.out.index - - BWAMEM2_INDEX( - input.bwamem2_fasta.map { meta, file -> - return file ? [meta, file] : null - } - ) - ch_bwamem2 = BWAMEM2_INDEX.out.index - - DRAGMAP_HASHTABLE( - input.dragmap_fasta.map { meta, file -> - return file ? 
[meta, file] : null - } - ) - ch_dragmap = DRAGMAP_HASHTABLE.out.hashmap - - GATK4_CREATESEQUENCEDICTIONARY( - input.createsequencedictionary_fasta.map { meta, file -> - return file ? [meta, file] : null - } - ) - ch_fasta_dict = GATK4_CREATESEQUENCEDICTIONARY.out.dict - - SAMTOOLS_FAIDX( - input.fasta_samtools.map { meta, file -> - return file ? [meta, file] : null - }, - [[id: 'no_fai'], []], - tools.contains('sizes') - ) - - // TODO: be smarter about input assets - // Here we either mix+GT an empty channel (either no output or no input faidx) with the faidx return faidx - // And we filter out the empty value - ch_fasta_fai = input.fasta_fai - .mix(SAMTOOLS_FAIDX.out.fai) - .groupTuple() - .map { meta, file -> - return file[1] ? [meta, file[1]] : [meta, file] - } - - // TODO: be smarter about input assets - // Here we either mix+GT an empty channel (either no output or no input sizes) with the sizes return sizes - // And we filter out the empty value - ch_sizes = input.fasta_sizes - .mix(SAMTOOLS_FAIDX.out.sizes) - .groupTuple() - .map { meta, file -> - return file[1] ? [meta, file[1]] : [meta, file] - } - - ch_fasta_fai_intervals_bed = input.intervals_bed - .mix(ch_fasta_fai) - .groupTuple() - .map { meta, file -> - return file[0] || !tools.contains('intervals') ? null : file[1] ? [meta, file[1]] : [meta, file] - } - - BUILD_INTERVALS(ch_fasta_fai_intervals_bed, []) - ch_intervals_bed = BUILD_INTERVALS.out.output - - TABIX_DBSNP( - input.dbsnp_vcf.map { meta, file -> - return file ? [meta, file] : null - } - ) - ch_dbsnp_vcf_tbi = TABIX_DBSNP.out.tbi - - TABIX_KNOWN_SNPS( - input.known_snps_vcf.map { meta, file -> - return file ? 
[meta, file] : null - } + SAMPLESHEET_TO_CHANNEL(reference) + + intervals_bed = SAMPLESHEET_TO_CHANNEL.out.intervals_bed + fasta = SAMPLESHEET_TO_CHANNEL.out.fasta + fasta_dict = SAMPLESHEET_TO_CHANNEL.out.fasta_dict + fasta_fai = SAMPLESHEET_TO_CHANNEL.out.fasta_fai + fasta_sizes = SAMPLESHEET_TO_CHANNEL.out.fasta_sizes + gff = SAMPLESHEET_TO_CHANNEL.out.gff + gtf = SAMPLESHEET_TO_CHANNEL.out.gtf + splice_sites = SAMPLESHEET_TO_CHANNEL.out.splice_sites + transcript_fasta = SAMPLESHEET_TO_CHANNEL.out.transcript_fasta + vcf = SAMPLESHEET_TO_CHANNEL.out.vcf + + CREATE_ALIGN_INDEX( + fasta, + tools.split(',').contains('bowtie1'), + tools.split(',').contains('bowtie2'), + tools.split(',').contains('bwamem1'), + tools.split(',').contains('bwamem2'), + tools.split(',').contains('dragmap') ) - ch_known_snps_vcf_tbi = TABIX_KNOWN_SNPS.out.tbi - TABIX_KNOWN_INDELS( - input.known_indels_vcf.map { meta, file -> - return file ? [meta, file] : null - }.transpose() + CREATE_ALIGN_INDEX_WITH_GFF( + fasta, + gff, + gtf, + splice_sites, + transcript_fasta, + tools.split(',').contains('hisat2'), + tools.split(',').contains('hisat2_extractsplicesites'), + tools.split(',').contains('kallisto'), + tools.split(',').contains('rsem'), + tools.split(',').contains('rsem_make_transcript_fasta'), + tools.split(',').contains('salmon'), + tools.split(',').contains('star') ) - ch_known_indels_vcf_tbi = TABIX_KNOWN_INDELS.out.tbi - TABIX_GERMLINE_RESOURCE( - input.germline_resource_vcf.map { meta, file -> - return file ? [meta, file] : null - } + INDEX_FASTA( + fasta, + fasta_fai, + tools.split(',').contains('createsequencedictionary'), + tools.split(',').contains('faidx'), + tools.split(',').contains('intervals'), + tools.split(',').contains('msisensorpro'), + tools.split(',').contains('sizes') ) - ch_germline_resource_vcf_tbi = TABIX_GERMLINE_RESOURCE.out.tbi - GFFREAD( - input.gff_gffread.map { meta, file -> - return file ? 
[meta, file] : null - }, - [] + INDEX_VCF( + vcf, + tools.split(',').contains('tabix') ) - ch_gff_gtf = input.gtf - .mix(GFFREAD.out.gtf) - .groupTuple() - .map { meta, file -> - return file[1] ? [meta, file[1]] : [meta, file] - } - - if (tools.contains('hisat2')) { - // TODO: be smarter about input assets - // Here we either return an empty channel if we have a splice_sites so that HISAT2_EXTRACTSPLICESITES is not triggered - // Or we return the provided gtf so that HISAT2_EXTRACTSPLICESITES is run - ch_gtf_hisat2 = ch_gff_gtf - .join(input.splice_sites) - .groupTuple() - .map { meta, gtf, splice_sites -> - return splice_sites[0][0] ? null : [meta, gtf] - } - - HISAT2_EXTRACTSPLICESITES(ch_gtf_hisat2) - versions = versions.mix(HISAT2_EXTRACTSPLICESITES.out.versions) - - // TODO: be smarter about input assets - // Here we either mix+GT an empty channel (either no output or no input splice_sites) with the splice_sites return splice_sites - // And we filter out the empty value - ch_hisat2_splice_sites = input.splice_sites - .mix(HISAT2_EXTRACTSPLICESITES.out.txt) - .groupTuple() - .map { meta, txt -> - return txt[1] ? [meta, txt[1]] : [meta, txt] - } - - if (tools && tools.split(',').contains('hisat2')) { - HISAT2_BUILD(input.fasta, ch_gff_gtf, ch_hisat2_splice_sites) - - ch_hisat2 = HISAT2_BUILD.out.index - versions = versions.mix(HISAT2_BUILD.out.versions) - } - } - - if (tools.contains('kallisto') || tools.contains('rsem_make_transcript_fasta') || tools.contains('salmon')) { - // TODO: be smarter about input assets - // Here we either return an empty channel if we have a transcript_fasta so that MAKE_TRANSCRIPTS_FASTA is not triggered - // Or we return the provided gtf so that MAKE_TRANSCRIPTS_FASTA is run - ch_gtf_rsem = ch_gff_gtf - .join(input.transcript_fasta) - .groupTuple() - .map { meta, gtf, transcript_fasta -> - return transcript_fasta[0][0] ? 
null : [meta, gtf] - } - - MAKE_TRANSCRIPTS_FASTA(input.fasta, ch_gtf_rsem) - versions = versions.mix(MAKE_TRANSCRIPTS_FASTA.out.versions) - - // TODO: be smarter about input assets - // Here we either mix+GT an empty channel (either no output or no input transcript_fasta) with the transcript_fasta return transcript_fasta - // And we filter out the empty value - ch_rsem_transcript_fasta = input.transcript_fasta - .mix(MAKE_TRANSCRIPTS_FASTA.out.transcript_fasta) - .groupTuple() - .map { meta, txt -> - return txt[1] ? [meta, txt[1]] : [meta, txt] - } - - if (tools.contains('kallisto')) { - KALLISTO_INDEX(ch_rsem_transcript_fasta) - - ch_kallisto = KALLISTO_INDEX.out.index - versions = versions.mix(KALLISTO_INDEX.out.versions) - } - - if (tools.contains('salmon')) { - SALMON_INDEX(input.fasta, ch_rsem_transcript_fasta) - - ch_salmon = SALMON_INDEX.out.index - versions = versions.mix(SALMON_INDEX.out.versions) - } - } - - if (tools.contains('msisensorpro')) { - MSISENSORPRO_SCAN(input.fasta) - - ch_msisensorpro = MSISENSORPRO_SCAN.out.list - versions = versions.mix(MSISENSORPRO_SCAN.out.versions) - } - - if (tools && tools.split(',').contains('rsem')) { - RSEM_PREPAREREFERENCE_GENOME(input.fasta, ch_gff_gtf) - - ch_rsem = RSEM_PREPAREREFERENCE_GENOME.out.index - versions = versions.mix(RSEM_PREPAREREFERENCE_GENOME.out.versions) - } - - if (tools.contains('star')) { - STAR_GENOMEGENERATE(input.fasta, ch_gff_gtf) - - ch_star = STAR_GENOMEGENERATE.out.index - versions = versions.mix(STAR_GENOMEGENERATE.out.versions) - } - - // versions - versions = versions.mix(BUILD_INTERVALS.out.versions) - versions = versions.mix(BWAMEM1_INDEX.out.versions) - versions = versions.mix(BWAMEM2_INDEX.out.versions) - versions = versions.mix(DRAGMAP_HASHTABLE.out.versions) - versions = versions.mix(GATK4_CREATESEQUENCEDICTIONARY.out.versions) - versions = versions.mix(GFFREAD.out.versions) - versions = versions.mix(SAMTOOLS_FAIDX.out.versions) - versions = 
versions.mix(TABIX_DBSNP.out.versions) - versions = versions.mix(TABIX_GERMLINE_RESOURCE.out.versions) - versions = versions.mix(TABIX_KNOWN_INDELS.out.versions) - versions = versions.mix(TABIX_KNOWN_SNPS.out.versions) - - // input fasta - ch_fasta = input.fasta + bowtie1_index = CREATE_ALIGN_INDEX.out.bowtie1_index + bowtie2_index = CREATE_ALIGN_INDEX.out.bowtie2_index + bwamem1_index = CREATE_ALIGN_INDEX.out.bwamem1_index + bwamem2_index = CREATE_ALIGN_INDEX.out.bwamem2_index + dragmap_hashmap = CREATE_ALIGN_INDEX.out.dragmap_hashmap + + gtf = gtf.mix(CREATE_ALIGN_INDEX_WITH_GFF.out.gtf) + hisat2_index = CREATE_ALIGN_INDEX_WITH_GFF.out.hisat2_index + splice_sites = splice_sites.mix(CREATE_ALIGN_INDEX_WITH_GFF.out.splice_sites) + kallisto_index = CREATE_ALIGN_INDEX_WITH_GFF.out.kallisto_index + rsem_index = CREATE_ALIGN_INDEX_WITH_GFF.out.rsem_index + transcript_fasta = transcript_fasta.mix(CREATE_ALIGN_INDEX_WITH_GFF.out.transcript_fasta) + salmon_index = CREATE_ALIGN_INDEX_WITH_GFF.out.salmon_index + star_index = CREATE_ALIGN_INDEX_WITH_GFF.out.star_index + + fasta_dict = fasta_dict.mix(INDEX_FASTA.out.fasta_dict) + fasta_fai = fasta_fai.mix(INDEX_FASTA.out.fasta_fai) + intervals_bed = intervals_bed.mix(INDEX_FASTA.out.intervals_bed) + fasta_sizes = fasta_sizes.mix(INDEX_FASTA.out.fasta_sizes) + msisensorpro_list = INDEX_FASTA.out.msisensorpro_list + + vcf_tbi = INDEX_VCF.out.vcf_tbi + + versions = versions.mix(CREATE_ALIGN_INDEX.out.versions) + versions = versions.mix(CREATE_ALIGN_INDEX_WITH_GFF.out.versions) + versions = versions.mix(INDEX_FASTA.out.versions) + versions = versions.mix(INDEX_VCF.out.versions) emit: - bowtie1 = ch_bowtie1 - bowtie2 = ch_bowtie2 - bwamem1 = ch_bwamem1 - bwamem2 = ch_bwamem2 - dbsnp_vcf_tbi = ch_dbsnp_vcf_tbi - dragmap = ch_dragmap - fasta = ch_fasta - fasta_dict = ch_fasta_dict - fasta_fai = ch_fasta_fai - germline_resource_vcf_tbi = ch_germline_resource_vcf_tbi - gff_gtf = ch_gff_gtf - hisat2 = ch_hisat2 - hisat2_splice_sites = 
ch_hisat2_splice_sites - intervals_bed = ch_intervals_bed - kallisto = ch_kallisto - known_indels_vcf_tbi = ch_known_indels_vcf_tbi - known_snps_vcf_tbi = ch_known_snps_vcf_tbi - msisensorpro = ch_msisensorpro - rsem = ch_rsem - rsem_transcript_fasta = ch_rsem_transcript_fasta - salmon = ch_salmon - sizes = ch_sizes - star = ch_star - versions = versions + bowtie1_index // channel: [meta, BowtieIndex/] + bowtie2_index // channel: [meta, Bowtie2Index/] + bwamem1_index // channel: [meta, BWAmemIndex/] + bwamem2_index // channel: [meta, BWAmem2memIndex/] + dragmap_hashmap // channel: [meta, DragmapHashtable/] + fasta // channel: [meta, *.fa(sta)] + fasta_dict // channel: [meta, *.fa(sta).dict] + fasta_fai // channel: [meta, *.fa(sta).fai] + fasta_sizes // channel: [meta, *.fa(sta).sizes] + gtf // channel: [meta, gtf] + hisat2_index // channel: [meta, Hisat2Index/] + intervals_bed // channel: [meta, *.bed] + kallisto_index // channel: [meta, KallistoIndex] + msisensorpro_list // channel: [meta, *.list] + rsem_index // channel: [meta, RSEMIndex/] + salmon_index // channel: [meta, SalmonIndex/] + splice_sites // channel: [meta, *.splice_sites.txt] + star_index // channel: [meta, STARIndex/] + transcript_fasta // channel: [meta, *.transcripts.fasta] + vcf_tbi // channel: [meta, *.vcf.tbi] + versions // channel: [versions.yml] publish: - ch_bowtie1 >> 'bowtie1' - ch_bowtie2 >> 'bowtie2' - ch_bwamem1 >> 'bwamem1' - ch_bwamem2 >> 'bwamem2' - ch_dbsnp_vcf_tbi >> 'tabix_dbsnp' - ch_dragmap >> 'dragmap' - ch_fasta >> 'fasta' - ch_fasta_dict >> 'fasta_dict' - ch_fasta_fai >> 'fasta_fai' - ch_germline_resource_vcf_tbi >> 'tabix_germline_resource' - ch_gff_gtf >> 'gffread' - ch_hisat2 >> 'hisat2' - ch_hisat2_splice_sites >> 'splice_sites' - ch_intervals_bed >> 'intervals' - ch_kallisto >> 'kallisto' - ch_known_indels_vcf_tbi >> 'tabix_known_indels' - ch_known_snps_vcf_tbi >> 'tabix_known_snps' - ch_msisensorpro >> 'msisensorpro' - ch_rsem >> 'rsem' - ch_rsem_transcript_fasta >> 
'transcript_fasta' - ch_salmon >> 'salmon' - ch_sizes >> 'fasta_sizes' - ch_star >> 'star' + bowtie1_index >> 'bowtie1_index' + bowtie2_index >> 'bowtie2_index' + bwamem1_index >> 'bwamem1_index' + bwamem2_index >> 'bwamem2_index' + dragmap_hashmap >> 'dragmap_hashmap' + fasta >> 'fasta' + fasta_dict >> 'fasta_dict' + fasta_fai >> 'fasta_fai' + fasta_sizes >> 'fasta_sizes' + gtf >> 'gtf' + hisat2_index >> 'hisat2_index' + intervals_bed >> 'intervals_bed' + kallisto_index >> 'kallisto_index' + msisensorpro_list >> 'msisensorpro_list' + rsem_index >> 'rsem_index' + salmon_index >> 'salmon_index' + splice_sites >> 'splice_sites' + star_index >> 'star_index' + transcript_fasta >> 'transcript_fasta' + vcf_tbi >> 'vcf_tbi' }