Review notes for this revision of the patch (text before the first
"diff --git" header is ignored by git-apply/patch):

  * PACBIO_CUTADAPT: the script interpolated ${max_N} although the variable is
    named maxN; the --max-ee guard was inverted (it only emitted the flag when
    the threshold was 0); outputs now use path() like ILLUMINA_CUTADAPT.
  * FILTER_AND_TRIM (PacBio branch): arguments to PACBIO_CUTADAPT were missing
    commas, the branch read outputs that the process does not emit
    (.out.trimmed, .out.cutadapt_report) and mixed in the JSON channel of
    ILLUMINA_CUTADAPT, which is never invoked in that branch.
  * MERGE_TRIM_TABLES: glob strings are converted with glob2rx() and file
    suffixes are stripped with fixed = TRUE instead of being used as regexes.
  * Indentation and blank context lines were reconstructed from the hunk
    counts; apply with --ignore-whitespace if the context does not match.

diff --git a/conf/test.config b/conf/test.config
index e6d079d..cb8ee43 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -23,8 +23,11 @@ params {
     input = 'https://file-server.igb.illinois.edu/~cjfields/TADA/samplesheet.miseq.16s.csv'
     trim_for = 24
     trim_rev = 25
+    for_primer = "GTGYCAGCMGCCGCGGTAA"
+    rev_primer = "GGACTACNVGGGTWTCTAAT"
     reference = 'https://file-server.igb.illinois.edu/~cjfields/TADA/silva_nr99_v138.1_train_set.fa.gz'
     phylo_tool = 'fasttree'
     to_QIIME2 = true
+    trimmer = "cutadapt"
     // species = 'https://file-server.igb.illinois.edu/~cjfields/TADA/silva_species_assignment_v138.1.fa.gz'
 }
diff --git a/modules/local/illumina_cutadapt.nf b/modules/local/illumina_cutadapt.nf
new file mode 100644
index 0000000..0afd5d1
--- /dev/null
+++ b/modules/local/illumina_cutadapt.nf
@@ -0,0 +1,60 @@
+// Trim primers from Illumina reads (single- or paired-end) with cutadapt and
+// apply the max-N, max-EE and read-length filters in the same pass.
+// The filtered FASTQ outputs are removed (and therefore not emitted) when R1 is empty.
+process ILLUMINA_CUTADAPT {
+    tag "$meta.id"
+    label 'process_medium'
+
+    container 'quay.io/biocontainers/cutadapt:5.0--py39hbcbf7aa_0'
+
+    input:
+    tuple val(meta), path(reads)
+    val(for_primer)
+    val(rev_primer)
+    val(for_primer_rc)
+    val(rev_primer_rc)
+
+    output:
+    tuple val(meta), path("${meta.id}.R1.filtered.fastq.gz"), optional: true, emit: trimmed_R1
+    tuple val(meta), path("${meta.id}.R2.filtered.fastq.gz"), optional: true, emit: trimmed_R2
+    tuple val(meta), path("${meta.id}.R[12].filtered.fastq.gz"), optional: true, emit: trimmed
+    path("${meta.id}.cutadapt.out"), emit: trimmed_report // to merging data
+    path("${meta.id}.cutadapt.json"), emit: cutadapt_json // to MultiQC
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // def args = task.ext.args ?: ''
+    // def prefix = task.ext.prefix ?: "${meta.id}"
+    maxN = params.maxN >= 0 ? "--max-n ${params.maxN}" : ""
+    maxEE = "--max-ee ${[params.maxEE_for,params.maxEE_rev].max()}"
+    min_len = params.min_read_len ? "-m ${params.min_read_len}" : "-m 50"
+    max_len = params.max_read_len != "Inf" ? "-M ${params.max_read_len}" : ""
+    outr2 = meta.single_end ? '' : "-p ${meta.id}.R2.filtered.fastq.gz"
+    p2 = meta.single_end ? '' : "-G ${rev_primer} -A ${for_primer_rc}"
+    polyG = params.illumina_twocolor ? "--nextseq-trim=2" : ""
+
+    """
+    cutadapt --report=minimal \\
+        --json=${meta.id}.cutadapt.json \\
+        -g ${for_primer} -a ${rev_primer_rc} ${p2} \\
+        --cores ${task.cpus} \\
+        -n 2 ${maxEE} ${min_len} ${max_len} ${maxN} ${polyG} \\
+        -o ${meta.id}.R1.filtered.fastq.gz ${outr2} \\
+        ${reads} > ${meta.id}.cutadapt.out
+
+    # is the FASTQ file empty?
+    if [ -n "\$(gunzip <${meta.id}.R1.filtered.fastq.gz | head -c 1 | tr '\\0\\n' __)" ]; then
+        echo "Sequences present"
+    else
+        rm ${meta.id}.R[12].filtered.fastq.gz
+    fi
+    """
+
+    // stub:
+    // def args = task.ext.args ?: ''
+    // def prefix = task.ext.prefix ?: "${meta.id}"
+    // """
+    // """
+}
diff --git a/modules/local/mergetrimtables.nf b/modules/local/mergetrimtables.nf
index 525dd09..b6957c4 100644
--- a/modules/local/mergetrimtables.nf
+++ b/modules/local/mergetrimtables.nf
@@ -15,18 +15,49 @@ process MERGE_TRIM_TABLES {
     script:
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix
-    """
-    #!/usr/bin/env Rscript
+    if (params.trimmer == "dada2") {
+        """
+        #!/usr/bin/env Rscript
 
-    # Main purpose of this script is to merge all trimming data into one table
+        # Main purpose of this script is to merge all trimming data into one table
+        trimmedFiles <- list.files(path = '.', pattern = glob2rx('*.trimmed.txt'))
+        sample.names <- sub('.trimmed.txt', '', trimmedFiles, fixed = TRUE)
+        # TODO: switch to dplyr bind_rows
+        trimmed <- do.call("rbind", lapply(trimmedFiles, function (x) as.data.frame(read.csv(x))))
+        colnames(trimmed) <- c("dada2.ft.sequence", "dada2.ft.input", "dada2.ft.filtered")
+        trimmed\$SampleID <- sample.names
+        write.csv(trimmed, "all.trimmed.csv", row.names = FALSE)
+        """
+    } else { // has to be cutadapt
+        """
+        #!/usr/bin/env Rscript
 
-    trimmedFiles <- list.files(path = '.', pattern = '*.trimmed.txt')
-    sample.names <- sub('.trimmed.txt', '', trimmedFiles)
-    trimmed <- do.call("rbind", lapply(trimmedFiles, function (x) as.data.frame(read.csv(x))))
-    colnames(trimmed)[1] <- "Sequence"
-    trimmed\$SampleID <- sample.names
-    write.csv(trimmed, "all.trimmed.csv", row.names = FALSE)
-    """
+        # Main purpose of this script is to merge all cutadapt trimming data into one table
+        suppressPackageStartupMessages(library(tidyverse))
+
+        read_cutadapt <- function(x) {
+            counts <- read_tsv(x, col_names = TRUE, show_col_types = FALSE)
+            counts
+        }
+
+        # gather files and load (glob2rx turns the glob into an anchored regex)
+        cutadapt_files <- list.files(path = ".", pattern = glob2rx("*.cutadapt.out"))
+        cutadapt_sample_data <- lapply(cutadapt_files, read_cutadapt)
+
+        # fix sample names (literal suffix match, not a regex)
+        nms <- sub(".cutadapt.out", "", cutadapt_files, fixed = TRUE)
+        names(cutadapt_sample_data) <- nms
+
+        # only keep some data
+        to_keep <- c("SampleID", "status", "in_reads", "too_short",
+            "too_long", "too_many_n", "out_reads", "w/adapters", "w/adapters2")
+        final_cutadapt <- bind_rows(cutadapt_sample_data, .id = "SampleID") %>%
+            select(all_of(to_keep))
+        # keep sampleID intact, but prepend 'cutadapt' to other columns
+        colnames(final_cutadapt)[-1] <- paste0("cutadapt.", colnames(final_cutadapt)[-1])
+        write.csv(final_cutadapt, "all.trimmed.csv", row.names = FALSE)
+        """
+    }
 
     stub:
     def args = task.ext.args ?: ''
diff --git a/modules/local/pacbio_cutadapt.nf b/modules/local/pacbio_cutadapt.nf
index a3e0e28..db285b8 100644
--- a/modules/local/pacbio_cutadapt.nf
+++ b/modules/local/pacbio_cutadapt.nf
@@ -1,36 +1,47 @@
-// TODO: this is currently a local module; we should try to set this up
-// to use the standard nf-core module
+// Trim the linked primer pair from PacBio reads with cutadapt (checking both
+// orientations via --rc) and apply the max-N, max-EE and length filters.
 process PACBIO_CUTADAPT {
     tag "${meta.id}"
 
-    container 'quay.io/biocontainers/cutadapt:4.1--py310h1425a21_1'
+    container 'quay.io/biocontainers/cutadapt:5.0--py39hbcbf7aa_0'
 
     input:
-    // TODO: Note the channel name here should probably be changed
     tuple val(meta), path(reads)
+    val(for_primer)
+    val(rev_primer_rc)
 
     output:
-    tuple val(meta), file("${meta.id}.noprimer.fastq.gz"), optional: true, emit: cutadapt_trimmed
-    // file("${meta.id}.cutadapt.out"), emit: cutadapt_report
-    // file("${meta.id}.untrimmed.fastq.gz"), emit: cutadapt_untrimmed
+    tuple val(meta), path("${meta.id}.filtered.fastq.gz"), optional: true, emit: cutadapt_trimmed
+    path("${meta.id}.cutadapt.out"), emit: trimmed_report // to merging data
+    path("${meta.id}.untrimmed.fastq.gz"), emit: cutadapt_untrimmed
+    path("${meta.id}.cutadapt.json"), emit: cutadapt_json // to MultiQC
 
-    // when:
-    // !(params.precheck)
+    when:
+    task.ext.when == null || task.ext.when
 
     script:
     strictness = params.pacbio_strict_match ? '-g' : '-a'
+    maxN = params.maxN >= 0 ? "--max-n ${params.maxN} " : ""
+    // only pass --max-ee when a positive threshold is configured
+    // NOTE(review): uses maxEE_for/maxEE_rev to match ILLUMINA_CUTADAPT -- confirm against nextflow.config
+    maxEE = [params.maxEE_for,params.maxEE_rev].max() > 0 ? "--max-ee ${[params.maxEE_for,params.maxEE_rev].max()}" : ""
+    min_len = params.min_read_len ? "-m ${params.min_read_len}" : "-m 50"
+    max_len = params.max_read_len != "Inf" ? "-M ${params.max_read_len}" : ""
     """
-    # Logic: we should trim out the HiFi reads and require *both* primers be present (-g).
-    # This should also reorient the sequence to match the primers (--rc).
-    # Keep anything longer than 50bp, and allow users to filter their data by length later
-    revprimer_rc=\$( echo -n ${params.rev_primer} | tr "[ATGCUNYRSWKMBDHV]" "[TACGANRYSWMKVHDB]" | rev )
-
     cutadapt --rc \\
-        ${strictness} "${params.for_primer}...\${revprimer_rc}" \\
-        -m 50 \\
-        -j ${task.cpus} \\
+        --report=minimal \\
+        ${strictness} "${for_primer}...${rev_primer_rc}" \\
+        -j ${task.cpus} ${min_len} ${max_len} ${maxEE} ${maxN} \\
         --untrimmed-output "${meta.id}.untrimmed.fastq.gz" \\
-        -o "${meta.id}.noprimer.fastq.gz" \\
+        --json=${meta.id}.cutadapt.json \\
+        -o "${meta.id}.filtered.fastq.gz" \\
         ${reads} > "${meta.id}.cutadapt.out"
+
+    # is the FASTQ file empty?
+    if [ -n "\$(gunzip <${meta.id}.filtered.fastq.gz | head -c 1 | tr '\\0\\n' __)" ]; then
+        echo "Sequences present"
+    else
+        rm ${meta.id}.filtered.fastq.gz
+    fi
     """
 }
diff --git a/nextflow.config b/nextflow.config
index ac185bd..23f7e32 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -42,15 +42,13 @@ params {
     skip_FASTQC = false
     skip_dadaQC = false
     skip_MultiQC = false
-    // TODO: change to skip_merge_check
     skip_merging_check = true
     skip_ee_check = true
 
     // Trimming
     skip_trimming = false
     // TODO: implement trimming/filtering subworkflow
-    // trim_mode = "dada2"
-    // filter_mode = "dada2"
+    trimmer = "dada2"
 
     // when true (default), this sets cutadapt's trimming
     // (which uses linked adapters) to require *both* primers be present.
@@ -75,6 +73,8 @@ params {
     maxN = 0
     min_read_len = 50
     max_read_len = 5000
+    // a general flag for trimming with Illumina modern two-color sequencing
+    illumina_twocolor = false
 
     // I think we can make these bool 'false' as above with R coersion (either through as.logical or using optparse in a Rscript)
     rmPhiX = false
diff --git a/nextflow_schema.json b/nextflow_schema.json
index e2bd32c..9838e4e 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -526,6 +526,19 @@
             "type": "boolean",
             "default": true,
-            "description": "Skip expected error checking in pre-QC step, hich runs per sample and can take time for larger paired-read data sets and long reads."
+            "description": "Skip expected error checking in pre-QC step, which runs per sample and can take time for larger paired-read data sets and long reads."
+        },
+        "trimmer": {
+            "type": "string",
+            "default": "dada2",
+            "description": "Trimming tool to use.",
+            "enum": [
+                "dada2",
+                "cutadapt"
+            ]
+        },
+        "illumina_twocolor": {
+            "type": "boolean",
+            "description": "Set for Illumina two-color chemistry data; adds cutadapt's --nextseq-trim=2 when trimming with cutadapt."
         }
     }
 }
\ No newline at end of file
diff --git a/subworkflows/local/filter_and_trim.nf b/subworkflows/local/filter_and_trim.nf
index 1cd8b7f..db480b0 100644
--- a/subworkflows/local/filter_and_trim.nf
+++ b/subworkflows/local/filter_and_trim.nf
@@ -1,5 +1,6 @@
 include { ILLUMINA_DADA2_FILTER_AND_TRIM } from '../../modules/local/illumina_filterandtrim'
 include { PACBIO_DADA2_FILTER_AND_TRIM } from '../../modules/local/pacbio_filterandtrim'
+include { ILLUMINA_CUTADAPT } from '../../modules/local/illumina_cutadapt'
 include { PACBIO_CUTADAPT } from '../../modules/local/pacbio_cutadapt'
 include { MERGE_TRIM_TABLES } from '../../modules/local/mergetrimtables'
 
@@ -7,67 +8,67 @@ workflow FILTER_AND_TRIM {
 
     take:
     input //channel: [val(meta), path(reads)]
-    skip_filtering 
+    skip_filtering
 
     main:
-    // Three options for Illumina data:
-    //   DADA2 trimming and filtering (PE and SE currently) - implemented
-    //   cutadapt-based (primers + Ns) + vsearch (EE) - NYI
-    //   Hybrid (variable length) - NYI
-    // Two options for PacBio:
-    //   DADA2 filtering (filter) - NYI
-    //   cutadapt (trim) + vsearch - NYI
-
     ch_reports = Channel.empty()
     ch_trimmed = Channel.empty()
     ch_trimmed_R1 = Channel.empty()
     ch_trimmed_R2 = Channel.empty()
+    ch_multiqc_files = Channel.empty()
 
-    // TODO: we're probably going to move to requiring the primer sequences to
-    // make the workflow more flexible re: trimming options, esp. since
-    // the current version assumes the presence of primer sequences and
-    // does a hard trim. This also allows for passing in cutadapt anchors
-    // and primer options (would need to parse these out)
-    for_primer = params.for_primer
-    for_primer_rc = ""
-    rev_primer = params.rev_primer
-    rev_primer_rc = ""
-
-    if (for_primer && rev_primer) {
-        for_primer_rc = reverse_complement(for_primer)
-        rev_primer_rc = reverse_complement(rev_primer)
-    }
+    for_primer_rc = params.for_primer ? reverse_complement(params.for_primer) : ""
+    rev_primer_rc = params.rev_primer ? reverse_complement(params.rev_primer) : ""
+
+    // Two modules/subworkflows for Illumina data:
+    //   DADA2 trimming and filtering (PE and SE currently) - implemented
+    //   cutadapt-based (primers, Ns, max EE, read length) - implemented
+    // Two modules/subworkflows for PacBio:
+    //   DADA2 filtering (filter) - NYI
+    //   cutadapt (trim) - implemented
 
     if (params.platform == "pacbio") {
 
-        // TODO: this could be modified/split into a `cutadapt`-only step; there
-        // are additional filters for max EE and max N in cutadapt
         PACBIO_CUTADAPT(
-            input
+            input,
+            params.for_primer,
+            rev_primer_rc
         )
 
-        // TODO: should be summarized as well, go to MultiQC
-        // ch_reports = PACBIO_CUTADAPT_FILTER_AND_TRIM.out.cutadapt_report.collect()
 
-        // TODO: this could be modified/split into a `DADA2`-only step
-        PACBIO_DADA2_FILTER_AND_TRIM(
-            PACBIO_CUTADAPT.out.cutadapt_trimmed
-        )
-        ch_trimmed = PACBIO_DADA2_FILTER_AND_TRIM.out.trimmed
-        ch_trimmed_R1 = PACBIO_DADA2_FILTER_AND_TRIM.out.trimmed
-        ch_reports = PACBIO_DADA2_FILTER_AND_TRIM.out.trimmed_report.collect()
+        // PACBIO_DADA2_FILTER_AND_TRIM(
+        //     PACBIO_CUTADAPT.out.cutadapt_trimmed
+        // )
+        ch_trimmed = PACBIO_CUTADAPT.out.cutadapt_trimmed
+        ch_trimmed_R1 = PACBIO_CUTADAPT.out.cutadapt_trimmed
+        ch_reports = PACBIO_CUTADAPT.out.trimmed_report.collect()
+        ch_multiqc_files = ch_multiqc_files.mix(PACBIO_CUTADAPT.out.cutadapt_json)
     } else {
         // this handles both paired and single-end data
-        ILLUMINA_DADA2_FILTER_AND_TRIM(
-            input
-        )
-        ch_trimmed = ILLUMINA_DADA2_FILTER_AND_TRIM.out.trimmed
-        ch_reports = ILLUMINA_DADA2_FILTER_AND_TRIM.out.trimmed_report.collect()
-        ch_trimmed_R1 = ILLUMINA_DADA2_FILTER_AND_TRIM.out.trimmed_R1
-        ch_trimmed_R2 = ILLUMINA_DADA2_FILTER_AND_TRIM.out.trimmed_R2
+        if (params.trimmer == "dada2") {
+            ILLUMINA_DADA2_FILTER_AND_TRIM(
+                input
+            )
+            ch_trimmed = ILLUMINA_DADA2_FILTER_AND_TRIM.out.trimmed
+            ch_reports = ILLUMINA_DADA2_FILTER_AND_TRIM.out.trimmed_report.collect()
+            ch_trimmed_R1 = ILLUMINA_DADA2_FILTER_AND_TRIM.out.trimmed_R1
+            ch_trimmed_R2 = ILLUMINA_DADA2_FILTER_AND_TRIM.out.trimmed_R2
+        } else if (params.trimmer == "cutadapt") {
+            ILLUMINA_CUTADAPT(
+                input,
+                params.for_primer,
+                params.rev_primer,
+                for_primer_rc,
+                rev_primer_rc
+            )
+            ch_trimmed = ILLUMINA_CUTADAPT.out.trimmed
+            ch_reports = ILLUMINA_CUTADAPT.out.trimmed_report.collect()
+            ch_trimmed_R1 = ILLUMINA_CUTADAPT.out.trimmed_R1
+            ch_trimmed_R2 = ILLUMINA_CUTADAPT.out.trimmed_R2
+            ch_multiqc_files = ch_multiqc_files.mix(ILLUMINA_CUTADAPT.out.cutadapt_json)
+        }
     }
 
-    // TODO: add variable-length and PacBio
     MERGE_TRIM_TABLES(
         ch_reports
     )
@@ -93,14 +94,9 @@ workflow FILTER_AND_TRIM {
     trimmed = ch_trimmed
     trimmed_report = MERGE_TRIM_TABLES.out.trimmed_report // channel: [ RDS ]
     trimmed_infer = ch_trimmed_infer
+    ch_multiqc_files
 }
 
-// def clean_primers(primer) {
-//     // returns a clean primer string, IUPAC codes
-//     // w/o any metadata or anchors. Assumes cutadapt
-//     // filtering
-// }
-
 def reverse_complement(primer) {
     // returns the revcomp, handles IUPAC ambig codes
     // tr "[ATGCUNYRSWKMBDHV]" "[TACGANRYSWMKVHDB]"
@@ -126,3 +122,10 @@ def reverse_complement(primer) {
         }
     }.join('')
 }
+
+// def clean_primers(primer) {
+//     // returns a clean primer string, IUPAC codes
+//     // w/o any metadata or anchors. Assumes cutadapt
+//     // filtering
+// }
+
diff --git a/workflows/tada.nf b/workflows/tada.nf
index 2dfecdb..bb302f3 100644
--- a/workflows/tada.nf
+++ b/workflows/tada.nf
@@ -70,10 +70,6 @@ workflow TADA {
         exit 1, "--id_type can currently only be set to 'simple' or 'md5', got ${params.id_type}"
     }
 
-    // FASTQC (
-    //     ch_samplesheet
-    // )
-
     PRE_QC(
         ch_samplesheet,
         params.skip_FASTQC,
@@ -87,18 +83,11 @@ workflow TADA {
 
     // ch_multiqc_files = ch_multiqc_files.mix(PLOTQUALITYPROFILE.out.zip.collect{it[1]})
 
-    // Subworkflows-Trimming and Filtering:
-    //     cutadapt (overlapping paired: V4, COI)
-    //     cutadapt (variable paired: ITS)
-    //     cutadapt (full-length reads: PacBio 16S)
-    //     DADA2 filterAndTrim
-    //     Alternative est error filtering
-
     FILTER_AND_TRIM (
         ch_samplesheet,
         params.skip_trimming
     )
-
+    ch_multiqc_files = ch_multiqc_files.mix(FILTER_AND_TRIM.out.ch_multiqc_files)
     ch_readtracking = ch_readtracking.mix(FILTER_AND_TRIM.out.trimmed_report)
 
     // TODO: Input for these should be the trimmed reads from above, but