From cfd0f835171922f4a7527ae976ba1c74e5487d71 Mon Sep 17 00:00:00 2001 From: Chris Fields Date: Mon, 13 Jan 2025 17:21:11 -0600 Subject: [PATCH] last subworkflow for now; will likely move chimera removal into another one at a later point --- conf/modules.config | 6 +- conf/test.config | 1 + nextflow.config | 20 +++-- subworkflows/local/dada2_denoise.nf | 6 ++ subworkflows/local/generate_output.nf | 72 +++++++++++----- subworkflows/local/phylogeny.nf | 61 ++++++++------ subworkflows/local/pre_qc.nf | 11 +++ subworkflows/local/qualitycontrol.nf | 38 ++++----- subworkflows/local/taxonomy.nf | 3 +- workflows/tada.nf | 114 +++++++------------------- 10 files changed, 167 insertions(+), 165 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 958fba7..6c5d9a0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -242,11 +242,11 @@ process { ] } - withName: SEQTABLE2TEXT { + withName: DADA2_SEQTABLE2TEXT { publishDir = [ path: { "${params.outdir}/TSV" }, mode: params.publish_dir_mode, - pattern: '*.txt' + pattern: 'seqtab_final.txt' ] } @@ -254,7 +254,7 @@ process { publishDir = [ path: { "${params.outdir}/TSV" }, mode: params.publish_dir_mode, - pattern: '*.txt' + pattern: 'tax_final*.txt' ] } diff --git a/conf/test.config b/conf/test.config index 11e1deb..e6d079d 100644 --- a/conf/test.config +++ b/conf/test.config @@ -25,5 +25,6 @@ params { trim_rev = 25 reference = 'https://file-server.igb.illinois.edu/~cjfields/TADA/silva_nr99_v138.1_train_set.fa.gz' phylo_tool = 'fasttree' + to_QIIME2 = true // species = 'https://file-server.igb.illinois.edu/~cjfields/TADA/silva_species_assignment_v138.1.fa.gz' } diff --git a/nextflow.config b/nextflow.config index a92b56e..01f2ca8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -16,7 +16,7 @@ params { genome = null igenomes_base = 's3://ngi-igenomes/igenomes/' igenomes_ignore = false - fasta = null// MultiQC options + fasta = null // MultiQC options multiqc_config = null multiqc_title = null multiqc_logo = null @@ -24,11 +24,18 @@ params { multiqc_methods_description = null // TODO: this needs to be removed or made more specific - amplicon = "16S" - quality_binning = false // set to true if using binned qualities (NovaSeq, PacBio Revio) - quality_bins = "" + amplicon = "16S" + + // loessErrfun, PacBioErrfun, makeBinnedQualErrfun, loessErrfun_mod1, loessErrfun_mod2, loessErrfun_mod3, loessErrfun_mod4 + error_function = "loessErrfun" + + quality_binning = false // set to true if using binned qualities (NovaSeq, PacBio Revio) + + // if quality_binning is true and error_function is set to 'makeBinnedQualErrfun', this is required to be set + + quality_bins = "" amplicon_type = "overlapping" - platform = "illumina" + platform = "illumina" // illumina, pacbio; 454 and others could be added // QC skip_FASTQC = false // set to run this step by default, this can fail with large sample #'s @@ -63,9 +70,6 @@ params { // I think we can make these bool 'false' as above with R coersion (either through as.logical or using optparse in a Rscript) rmPhiX = false - // Error model - // custom_error_model = 'illumina' // NYI, thinking about best way to implement this - // ASV inference pooling pool = "pseudo" diff --git a/subworkflows/local/dada2_denoise.nf b/subworkflows/local/dada2_denoise.nf index 23f5475..4957ed1 100644 --- a/subworkflows/local/dada2_denoise.nf +++ b/subworkflows/local/dada2_denoise.nf @@ -5,6 +5,7 @@ include { DADA_INFER } from '../../modules/local/dada include { POOLED_SEQTABLE } from '../../modules/local/pooledseqtable' include { DADA2_REMOVE_CHIMERAS } from '../../modules/local/removechimeras' include { RENAME_ASVS } from '../../modules/local/renameasvs' +include { DADA2_SEQTABLE2TEXT } from '../../modules/local/seqtable2txt' workflow DADA2_DENOISE { @@ -55,7 +56,12 @@ workflow DADA2_DENOISE { POOLED_SEQTABLE.out.filtered_seqtable ) + DADA2_SEQTABLE2TEXT( + RENAME_ASVS.out.seqtable_renamed + ) + emit: + seqtab2qiime = DADA2_SEQTABLE2TEXT.out.seqtab2qiime nonchimeric_asvs = RENAME_ASVS.out.nonchimeric_asvs seqtable_renamed = RENAME_ASVS.out.seqtable_renamed readmap = RENAME_ASVS.out.readmap diff --git a/subworkflows/local/generate_output.nf b/subworkflows/local/generate_output.nf index 393e8a8..1df4b8d 100644 --- a/subworkflows/local/generate_output.nf +++ b/subworkflows/local/generate_output.nf @@ -1,36 +1,66 @@ -// TODO nf-core: If in doubt look at other nf-core/subworkflows to see how we are doing things! :) -// https://github.com/nf-core/modules/tree/master/subworkflows -// You can also ask for help via your pull request or on the #subworkflows channel on the nf-core Slack workspace: -// https://nf-co.re/join -// TODO nf-core: A subworkflow SHOULD import at least two modules - -include { SAMTOOLS_SORT } from '../../../modules/nf-core/samtools/sort/main' -include { SAMTOOLS_INDEX } from '../../../modules/nf-core/samtools/index/main' +include { BIOM } from '../../modules/local/biom' +include { QIIME2_FEATURETABLE } from '../../modules/local/qiime2featuretable' +include { QIIME2_TAXTABLE } from '../../modules/local/qiime2taxtable' +include { QIIME2_SEQUENCE } from '../../modules/local/qiime2seqs' +include { QIIME2_ALIGNMENT } from '../../modules/local/qiime2aln' +include { QIIME2_TREE } from '../../modules/local/qiime2tree' +include { SESSION_INFO } from '../../modules/local/rsessioninfo' workflow GENERATE_OUTPUT { + // TODO: I'd like to have this simply be TSV files (no RDS) + // so we can generate from other subworkflows if needed take: - // TODO nf-core: edit input (take) channels - ch_bam // channel: [ val(meta), [ bam ] ] + seq_table_rds + seq_table_qiime + tax_table_rds + tax_table_tsv + asvs + alignment + unrooted_tree + rooted_tree main: - ch_versions = Channel.empty() - // TODO nf-core: substitute modules here for the modules of your subworkflow + if (params.to_BIOM) { + BIOM( + seq_table_rds, + tax_table_rds + ) + } - SAMTOOLS_SORT ( ch_bam ) - ch_versions = ch_versions.mix(SAMTOOLS_SORT.out.versions.first()) + if (params.to_QIIME2) { + QIIME2_FEATURETABLE( + seq_table_qiime + ) - SAMTOOLS_INDEX ( SAMTOOLS_SORT.out.bam ) - ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.first()) + QIIME2_TAXTABLE( + tax_table_tsv + ) - emit: - // TODO nf-core: edit emitted channels - bam = SAMTOOLS_SORT.out.bam // channel: [ val(meta), [ bam ] ] - bai = SAMTOOLS_INDEX.out.bai // channel: [ val(meta), [ bai ] ] - csi = SAMTOOLS_INDEX.out.csi // channel: [ val(meta), [ csi ] ] + QIIME2_SEQUENCE( + asvs + ) + + if (!params.skip_alignment) { + QIIME2_ALIGNMENT( + alignment + ) + } + if (!params.skip_tree) { + QIIME2_TREE( + unrooted_tree, + rooted_tree + ) + } + } + + // TODO: May become redundant with versions + SESSION_INFO() + + emit: versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/phylogeny.nf b/subworkflows/local/phylogeny.nf index 70d3095..e44f5da 100644 --- a/subworkflows/local/phylogeny.nf +++ b/subworkflows/local/phylogeny.nf @@ -1,36 +1,49 @@ -// TODO nf-core: If in doubt look at other nf-core/subworkflows to see how we are doing things! :) -// https://github.com/nf-core/modules/tree/master/subworkflows -// You can also ask for help via your pull request or on the #subworkflows channel on the nf-core Slack workspace: -// https://nf-co.re/join -// TODO nf-core: A subworkflow SHOULD import at least two modules - -include { SAMTOOLS_SORT } from '../../../modules/nf-core/samtools/sort/main' -include { SAMTOOLS_INDEX } from '../../../modules/nf-core/samtools/index/main' +include { DECIPHER } from '../../modules/local/decipher' +include { PHANGORN } from '../../modules/local/phangorn' +include { FASTTREE } from '../../modules/local/fasttree' +include { ROOT_TREE } from '../../modules/local/roottree' workflow PHYLOGENY { - take: - // TODO nf-core: edit input (take) channels - ch_bam // channel: [ val(meta), [ bam ] ] + asvs main: + ch_alignment = Channel.empty() + ch_unrooted_tree = Channel.empty() + ch_rooted_tree = Channel.empty() ch_versions = Channel.empty() - // TODO nf-core: substitute modules here for the modules of your subworkflow - - SAMTOOLS_SORT ( ch_bam ) - ch_versions = ch_versions.mix(SAMTOOLS_SORT.out.versions.first()) - - SAMTOOLS_INDEX ( SAMTOOLS_SORT.out.bam ) - ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.first()) + if (!params.skip_alignment) { + DECIPHER( + asvs + ) + ch_alignment = DECIPHER.out.alignment + if (!params.skip_tree) { + if (params.phylo_tool == 'phangorn') { + PHANGORN( + ch_alignment + ) + ch_unrooted_tree = PHANGORN.out.treeGTR + } else if (params.phylo_tool == 'fasttree') { + FASTTREE( + ch_alignment + ) + ch_unrooted_tree = FASTTREE.out.treeGTR + } + + ROOT_TREE( + ch_unrooted_tree, + params.phylo_tool + ) + ch_rooted_tree = ROOT_TREE.out.rooted_tree + } + } emit: - // TODO nf-core: edit emitted channels - bam = SAMTOOLS_SORT.out.bam // channel: [ val(meta), [ bam ] ] - bai = SAMTOOLS_INDEX.out.bai // channel: [ val(meta), [ bai ] ] - csi = SAMTOOLS_INDEX.out.csi // channel: [ val(meta), [ csi ] ] - - versions = ch_versions // channel: [ versions.yml ] + ch_alignment + ch_unrooted_tree + ch_rooted_tree + versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/pre_qc.nf b/subworkflows/local/pre_qc.nf index e4aa683..4df8cc8 100644 --- a/subworkflows/local/pre_qc.nf +++ b/subworkflows/local/pre_qc.nf @@ -4,6 +4,7 @@ // https://nf-co.re/join // TODO nf-core: A subworkflow SHOULD import at least two modules +include { FASTQC } from '../../modules/nf-core/fastqc/main' include { PLOT_QUALITY_PROFILE } from '../../modules/local/plotqualityprofile' include { VSEARCH_EESTATS } from '../../modules/local/vsearch_eestats' include { VSEARCH_OVERLAP } from '../../modules/local/vsearchoverlap' @@ -20,6 +21,14 @@ workflow PRE_QC { main: ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + FASTQC ( + ch_samplesheet + ) + + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}) + ch_versions = ch_versions.mix(FASTQC.out.versions.first()) if (!skip_merging) { VSEARCH_OVERLAP( @@ -27,6 +36,7 @@ workflow PRE_QC { ) ch_versions = ch_versions.mix(VSEARCH_OVERLAP.out.versions.first()) + MERGE_OVERLAP_CHECK( VSEARCH_OVERLAP.out.merged_log.collect() ) @@ -54,4 +64,5 @@ workflow PRE_QC { emit: versions = ch_versions // channel: [ versions.yml ] + zip = FASTQC.out.zip } diff --git a/subworkflows/local/qualitycontrol.nf b/subworkflows/local/qualitycontrol.nf index 530cfa5..e18ab86 100644 --- a/subworkflows/local/qualitycontrol.nf +++ b/subworkflows/local/qualitycontrol.nf @@ -1,36 +1,30 @@ -// TODO nf-core: If in doubt look at other nf-core/subworkflows to see how we are doing things! :) -// https://github.com/nf-core/modules/tree/master/subworkflows -// You can also ask for help via your pull request or on the #subworkflows channel on the nf-core Slack workspace: -// https://nf-co.re/join -// TODO nf-core: A subworkflow SHOULD import at least two modules +include { READ_TRACKING } from '../../modules/local/readtracking' +include { PLOT_MERGED_HEATMAP } from '../../modules/local/plotmerged' +include { PLOT_ASV_DIST } from '../../modules/local/plotasvlen' -include { SAMTOOLS_SORT } from '../../../modules/nf-core/samtools/sort/main' -include { SAMTOOLS_INDEX } from '../../../modules/nf-core/samtools/index/main' - -workflow QUALITYCONTROL { +workflow QUALITY_CONTROL { take: - // TODO nf-core: edit input (take) channels - ch_bam // channel: [ val(meta), [ bam ] ] + ch_readtracking + merged_seqs + filtered_seqtable main: - ch_versions = Channel.empty() - // TODO nf-core: substitute modules here for the modules of your subworkflow + READ_TRACKING( + ch_readtracking.collect() + ) - SAMTOOLS_SORT ( ch_bam ) - ch_versions = ch_versions.mix(SAMTOOLS_SORT.out.versions.first()) + PLOT_MERGED_HEATMAP( + merged_seqs + ) - SAMTOOLS_INDEX ( SAMTOOLS_SORT.out.bam ) - ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.first()) + PLOT_ASV_DIST( + filtered_seqtable + ) emit: - // TODO nf-core: edit emitted channels - bam = SAMTOOLS_SORT.out.bam // channel: [ val(meta), [ bam ] ] - bai = SAMTOOLS_INDEX.out.bai // channel: [ val(meta), [ bai ] ] - csi = SAMTOOLS_INDEX.out.csi // channel: [ val(meta), [ csi ] ] - versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/taxonomy.nf b/subworkflows/local/taxonomy.nf index 50d9b8a..c1d4e8c 100644 --- a/subworkflows/local/taxonomy.nf +++ b/subworkflows/local/taxonomy.nf @@ -12,7 +12,8 @@ workflow TAXONOMY { ch_versions = Channel.empty() ch_taxtab = Channel.empty() ch_metrics = Channel.empty() - + // TODO: eventually this will have multiple options for + // taxonomic assignment DADA2_ASSIGN_TAXA_SPECIES( readmap, ref_file, diff --git a/workflows/tada.nf b/workflows/tada.nf index e51d546..9edabb9 100644 --- a/workflows/tada.nf +++ b/workflows/tada.nf @@ -4,37 +4,15 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { FASTQC } from '../modules/nf-core/fastqc/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' include { PRE_QC } from '../subworkflows/local/pre_qc' include { FILTER_AND_TRIM } from '../subworkflows/local/filter_and_trim' include { DADA2_DENOISE } from '../subworkflows/local/dada2_denoise' include { TAXONOMY } from '../subworkflows/local/taxonomy' - -// TODO: Move into phylogenetic subworkflow -// include { PHYLOGENETICS } from '../subworkflows/local/taxonomic_assignment' -include { DECIPHER } from '../modules/local/decipher' -include { PHANGORN } from '../modules/local/phangorn' -include { FASTTREE } from '../modules/local/fasttree' -include { ROOT_TREE } from '../modules/local/roottree' - -// TODO: Move into subworkflow(s) -// include { QUALITY_CONTROL } from '../subworkflows/local/taxonomic_assignment' -include { READ_TRACKING } from '../modules/local/readtracking' -include { PLOT_MERGED_HEATMAP } from '../modules/local/plotmerged' -include { PLOT_ASV_DIST } from '../modules/local/plotasvlen' -include { SESSION_INFO } from '../modules/local/rsessioninfo' - -// TODO: Move into subworkflow(s) -// include { OUTPUT } from '../subworkflows/local/taxonomic_assignment' -include { DADA2_SEQTABLE2TEXT } from '../modules/local/seqtable2txt' -include { BIOM } from '../modules/local/biom' -include { QIIME2_FEATURETABLE } from '../modules/local/qiime2featuretable' -include { QIIME2_TAXTABLE } from '../modules/local/qiime2taxtable' -include { QIIME2_SEQUENCE } from '../modules/local/qiime2seqs' -include { QIIME2_ALIGNMENT } from '../modules/local/qiime2aln' -include { QIIME2_TREE } from '../modules/local/qiime2tree' +include { PHYLOGENY } from '../subworkflows/local/phylogeny' +include { QUALITY_CONTROL } from '../subworkflows/local/qualitycontrol' +include { GENERATE_OUTPUT } from '../subworkflows/local/generate_output' include { paramsSummaryMap } from 'plugin/nf-validation' include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' @@ -92,13 +70,10 @@ workflow TADA { exit 1, "--id_type can currently only be set to 'simple' or 'md5', got ${params.id_type}" } - FASTQC ( - ch_samplesheet - ) + // FASTQC ( + // ch_samplesheet + // ) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}) - ch_versions = ch_versions.mix(FASTQC.out.versions.first()) - PRE_QC( ch_samplesheet, params.skip_ee, @@ -106,6 +81,7 @@ workflow TADA { params.skip_dadaQC ) + ch_multiqc_files = ch_multiqc_files.mix(PRE_QC.out.zip.collect{it[1]}) ch_versions = ch_versions.mix(PRE_QC.out.versions) // ch_multiqc_files = ch_multiqc_files.mix(PLOTQUALITYPROFILE.out.zip.collect{it[1]}) @@ -162,71 +138,37 @@ workflow TADA { ch_metrics = TAXONOMY.out.ch_metrics } - // Subworkflows-Alignment + Phylogenetic Tree (optional) - DECIPHER( - DADA2_DENOISE.out.nonchimeric_asvs - ) - - ch_tree = Channel.empty() - - if (params.phylo_tool == 'phangorn') { - PHANGORN( - DECIPHER.out.alignment - ) - ch_tree = PHANGORN.out.treeGTR - } else if (params.phylo_tool == 'fasttree') { - FASTTREE( - DECIPHER.out.alignment - ) - ch_tree = FASTTREE.out.treeGTR - } - - ROOT_TREE( - ch_tree, - params.phylo_tool - ) + PHYLOGENY(DADA2_DENOISE.out.nonchimeric_asvs) // Post-QC - READ_TRACKING( - ch_readtracking.collect() - ) - - PLOT_MERGED_HEATMAP( - DADA2_DENOISE.out.merged_seqs - ) - - PLOT_ASV_DIST( + QUALITY_CONTROL( + ch_readtracking, + DADA2_DENOISE.out.merged_seqs, DADA2_DENOISE.out.filtered_seqtable ) + // READ_TRACKING( + // ch_readtracking.collect() + // ) - // Subworkflow - Outputs + // PLOT_MERGED_HEATMAP( + // DADA2_DENOISE.out.merged_seqs + // ) - // TODO: these could be moved into the - DADA2_SEQTABLE2TEXT( - DADA2_DENOISE.out.seqtable_renamed - ) + // PLOT_ASV_DIST( + // DADA2_DENOISE.out.filtered_seqtable + // ) - // TODO: BIOM should read in TSV files, not RDS - BIOM( + GENERATE_OUTPUT( DADA2_DENOISE.out.seqtable_renamed, - TAXONOMY.out.ch_taxtab_rds + DADA2_DENOISE.out.seqtab2qiime, + TAXONOMY.out.ch_taxtab_rds, + TAXONOMY.out.ch_taxtab, + DADA2_DENOISE.out.nonchimeric_asvs, + PHYLOGENY.out.ch_alignment, + PHYLOGENY.out.ch_unrooted_tree, + PHYLOGENY.out.ch_rooted_tree ) - QIIME2_FEATURETABLE(DADA2_SEQTABLE2TEXT.out.seqtab2qiime) - - QIIME2_TAXTABLE(ch_taxtab) - - QIIME2_SEQUENCE(DADA2_DENOISE.out.nonchimeric_asvs) - - QIIME2_ALIGNMENT(DECIPHER.out.alignment) - - QIIME2_TREE( - ch_tree, - ROOT_TREE.out.rooted_tree - ) - - SESSION_INFO() - // // Collate and save software versions //