
Commit

implement cutadapt for Illumina, add read tracking, and tidy pacbio cutadapt
cjfields committed Jan 29, 2025
1 parent f6f7dd4 commit c30a875
Showing 8 changed files with 194 additions and 91 deletions.
3 changes: 3 additions & 0 deletions conf/test.config
@@ -23,8 +23,11 @@ params {
input = 'https://file-server.igb.illinois.edu/~cjfields/TADA/samplesheet.miseq.16s.csv'
trim_for = 24
trim_rev = 25
for_primer = "GTGYCAGCMGCCGCGGTAA"
rev_primer = "GGACTACNVGGGTWTCTAAT"
reference = 'https://file-server.igb.illinois.edu/~cjfields/TADA/silva_nr99_v138.1_train_set.fa.gz'
phylo_tool = 'fasttree'
to_QIIME2 = true
trimmer = "cutadapt"
// species = 'https://file-server.igb.illinois.edu/~cjfields/TADA/silva_species_assignment_v138.1.fa.gz'
}
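As a usage sketch (not part of this commit), the same switch works from any user config; the param names below mirror this test profile, and the primers are the 515F/806R pair shown above:

params {
    trimmer           = "cutadapt"
    for_primer        = "GTGYCAGCMGCCGCGGTAA"   // 515F
    rev_primer        = "GGACTACNVGGGTWTCTAAT"  // 806R
    illumina_twocolor = true  // assumption: set only for two-color instruments (NextSeq/NovaSeq)
}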
57 changes: 57 additions & 0 deletions modules/local/illumina_cutadapt.nf
@@ -0,0 +1,57 @@
process ILLUMINA_CUTADAPT {
tag "$meta.id"
label 'process_medium'

container 'quay.io/biocontainers/cutadapt:5.0--py39hbcbf7aa_0'

input:
tuple val(meta), path(reads)
val(for_primer)
val(rev_primer)
val(for_primer_rc)
val(rev_primer_rc)

output:
tuple val(meta), path("${meta.id}.R1.filtered.fastq.gz"), optional: true, emit: trimmed_R1
tuple val(meta), path("${meta.id}.R2.filtered.fastq.gz"), optional: true, emit: trimmed_R2
tuple val(meta), path("${meta.id}.R[12].filtered.fastq.gz"), optional: true, emit: trimmed
path("${meta.id}.cutadapt.out"), emit: trimmed_report // to merging data
path("${meta.id}.cutadapt.json"), emit: cutadapt_json // to MultiQC

when:
task.ext.when == null || task.ext.when

script:
// def args = task.ext.args ?: ''
// def prefix = task.ext.prefix ?: "${meta.id}"
maxN = params.maxN >=0 ? "--max-n ${params.maxN}" : ""
maxEE = "--max-ee ${[params.maxEE_for,params.maxEE_rev].max()}"
min_len = params.min_read_len ? "-m ${params.min_read_len}" : "-m 50"
max_len = params.max_read_len != "Inf" ? "-M ${params.max_read_len}" : ""
outr2 = meta.single_end ? '' : "-p ${meta.id}.R2.filtered.fastq.gz"
p2 = meta.single_end ? '' : "-G ${rev_primer} -A ${for_primer_rc}"
polyG = params.illumina_twocolor ? "--nextseq-trim=2" : ""

"""
cutadapt --report=minimal \\
--json=${meta.id}.cutadapt.json \\
-g ${for_primer} -a ${rev_primer_rc} ${p2} \\
--cores ${task.cpus} \\
-n 2 ${maxEE} ${min_len} ${max_len} ${maxN} ${polyG} \\
-o ${meta.id}.R1.filtered.fastq.gz ${outr2} \\
${reads} > ${meta.id}.cutadapt.out
# is the FASTQ file empty?
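# (head -c 1 grabs the first decompressed byte; tr rewrites NUL/newline to '_'
# so command substitution cannot strip it -- the test string is non-empty
# exactly when the decompressed FASTQ has any content)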
if [ -n "\$(gunzip <${meta.id}.R1.filtered.fastq.gz | head -c 1 | tr '\\0\\n' __)" ]; then
echo "Sequences present"
else
rm ${meta.id}.R[12].filtered.fastq.gz
fi
"""

// stub:
// def args = task.ext.args ?: ''
// def prefix = task.ext.prefix ?: "${meta.id}"
// """
// """
}
51 changes: 41 additions & 10 deletions modules/local/mergetrimtables.nf
@@ -15,18 +15,49 @@ process MERGE_TRIM_TABLES {
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix
"""
#!/usr/bin/env Rscript
if (params.trimmer == "dada2") {
"""
#!/usr/bin/env Rscript
# Main purpose of this script is to merge all trimming data into one table
trimmedFiles <- list.files(path = '.', pattern = '*.trimmed.txt')
sample.names <- sub('.trimmed.txt', '', trimmedFiles)
# TODO: switch to dplyr bind_rows
trimmed <- do.call("rbind", lapply(trimmedFiles, function (x) as.data.frame(read.csv(x))))
colnames(trimmed) <- c("dada2.ft.sequence", "dada2.ft.input", "dada2.ft.filtered")
trimmed\$SampleID <- sample.names
write.csv(trimmed, "all.trimmed.csv", row.names = FALSE)
"""
} else { // has to be cutadapt
"""
#!/usr/bin/env Rscript
trimmedFiles <- list.files(path = '.', pattern = '*.trimmed.txt')
sample.names <- sub('.trimmed.txt', '', trimmedFiles)
trimmed <- do.call("rbind", lapply(trimmedFiles, function (x) as.data.frame(read.csv(x))))
colnames(trimmed)[1] <- "Sequence"
trimmed\$SampleID <- sample.names
write.csv(trimmed, "all.trimmed.csv", row.names = FALSE)
"""
# Main purpose of this script is to merge all cutadapt trimming data into one table
suppressPackageStartupMessages(library(tidyverse))
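# cutadapt's --report=minimal output is a one-row TSV (header plus one data
# row), so each file parses to a single-row tibble that bind_rows() stacks below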
read_cutadapt <- function(x) {
counts <- read_tsv(x, col_names = TRUE, show_col_types = FALSE)
counts
}
# gather files and load
cutadapt_files <- list.files(path = ".", pattern = "*.cutadapt.out")
cutadapt_sample_data <- lapply(cutadapt_files, read_cutadapt)
# fix sample names
nms <- gsub(".cutadapt.out", "", cutadapt_files)
names(cutadapt_sample_data) <- nms
# only keep some data
to_keep <- c("SampleID", "status", "in_reads", "too_short",
"too_long", "too_many_n", "out_reads", "w/adapters", "w/adapters2")
final_cutadapt <- bind_rows(cutadapt_sample_data, .id="SampleID") %>%
select(all_of(to_keep))
# keep sampleID intact, but prepend 'cutadapt' to other columns
colnames(final_cutadapt)[2:length(to_keep)] <- paste0("cutadapt.", colnames(final_cutadapt)[2:length(to_keep)])
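# illustrative header of the resulting CSV (column names only, no real values):
# SampleID,cutadapt.status,cutadapt.in_reads,cutadapt.too_short,cutadapt.too_long,
# cutadapt.too_many_n,cutadapt.out_reads,cutadapt.w/adapters,cutadapt.w/adapters2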
write.csv(final_cutadapt, "all.trimmed.csv", row.names = FALSE)
"""
}

stub:
def args = task.ext.args ?: ''
43 changes: 25 additions & 18 deletions modules/local/pacbio_cutadapt.nf
@@ -1,36 +1,43 @@
// TODO: this is currently a local module; we should try to set this up
// to use the standard nf-core module
process PACBIO_CUTADAPT {
tag "${meta.id}"

container 'quay.io/biocontainers/cutadapt:5.0--py39hbcbf7aa_0'

input:
// TODO: Note the channel name here should probably be changed
tuple val(meta), path(reads)
val(for_primer)
val(rev_primer_rc)

output:
tuple val(meta), file("${meta.id}.noprimer.fastq.gz"), optional: true, emit: cutadapt_trimmed
// file("${meta.id}.cutadapt.out"), emit: cutadapt_report
// file("${meta.id}.untrimmed.fastq.gz"), emit: cutadapt_untrimmed
tuple val(meta), file("${meta.id}.filtered.fastq.gz"), optional: true, emit: cutadapt_trimmed
file("${meta.id}.cutadapt.out"), emit: trimmed_report // to merging data
file("${meta.id}.untrimmed.fastq.gz"), emit: cutadapt_untrimmed
file("${meta.id}.cutadapt.json"), emit: cutadapt_json // to MultiQC

when:
task.ext.when == null || task.ext.when

script:
strictness = params.pacbio_strict_match ? '-g' : '-a'
maxN = params.maxN >=0 ? "--max-n ${params.maxN} " : ""
maxEE = [params.max_ee_for,params.max_ee_rev].max() > 0 ? "--max-ee ${[params.max_ee_for,params.max_ee_rev].max()}" : ""
min_len = params.min_read_len ? "-m ${params.min_read_len}" : "-m 50"
max_len = params.max_read_len != "Inf" ? "-M ${params.max_read_len}" : ""
"""
# Logic: we should trim out the HiFi reads and require *both* primers be present (-g).
# This should also reorient the sequence to match the primers (--rc).
# Keep anything longer than 50bp, and allow users to filter their data by length later
cutadapt --rc \\
--report=minimal \\
${strictness} "${for_primer}...${rev_primer_rc}" \\
-j ${task.cpus} ${min_len} ${max_len} ${maxEE} ${maxN} \\
--untrimmed-output "${meta.id}.untrimmed.fastq.gz" \\
--json=${meta.id}.cutadapt.json \\
-o "${meta.id}.filtered.fastq.gz" \\
${reads} > "${meta.id}.cutadapt.out"
# is the FASTQ file empty?
if [ -n "\$(gunzip <${meta.id}.filtered.fastq.gz | head -c 1 | tr '\\0\\n' __)" ]; then
echo "Sequences present"
else
rm ${meta.id}.filtered.fastq.gz
fi
"""
}
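A quick Groovy sketch (hypothetical primer values, not from this commit) of the linked-adapter argument this module assembles; with pacbio_strict_match the pair goes to -g (both primers required), otherwise to -a:

def for_primer    = "AGRGTTYGATYMTGGCTCAG"  // e.g. 27F, full-length 16S
def rev_primer_rc = "AAGTCGTAACAAGGTARCY"   // reverse complement of 1492R
assert "${for_primer}...${rev_primer_rc}" == "AGRGTTYGATYMTGGCTCAG...AAGTCGTAACAAGGTARCY"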
6 changes: 3 additions & 3 deletions nextflow.config
@@ -42,15 +42,13 @@ params {
skip_FASTQC = false
skip_dadaQC = false
skip_MultiQC = false
// TODO: change to skip_merge_check
skip_merging_check = true
skip_ee_check = true

// Trimming
skip_trimming = false
// TODO: implement trimming/filtering subworkflow
// trim_mode = "dada2"
// filter_mode = "dada2"
trimmer = "dada2"

// when true (default), this sets cutadapt's trimming
// (which uses linked adapters) to require *both* primers be present.
@@ -75,6 +73,8 @@
maxN = 0
min_read_len = 50
max_read_len = 5000
// a general flag for trimming with Illumina modern two-color sequencing
illumina_twocolor = false
// I think we can make these bool 'false' as above with R coercion (either through as.logical or using optparse in an Rscript)
rmPhiX = false

13 changes: 13 additions & 0 deletions nextflow_schema.json
@@ -526,6 +526,19 @@
"type": "boolean",
"default": true,
"description": "Skip expected error checking in pre-QC step, hich runs per sample and can take time for larger paired-read data sets and long reads."
},
"trimmer": {
"type": "string",
"default": "dada2",
"description": "Trimming tool to use.",
"enum": [
"dada2",
"cutadapt"
]
},
"illumina_twocolor": {
"type": "boolean",
"description": "Set for Illumina two-color trimming."
}
}
}
99 changes: 51 additions & 48 deletions subworkflows/local/filter_and_trim.nf
@@ -1,73 +1,74 @@
include { ILLUMINA_DADA2_FILTER_AND_TRIM } from '../../modules/local/illumina_filterandtrim'
include { PACBIO_DADA2_FILTER_AND_TRIM } from '../../modules/local/pacbio_filterandtrim'
include { ILLUMINA_CUTADAPT } from '../../modules/local/illumina_cutadapt'
include { PACBIO_CUTADAPT } from '../../modules/local/pacbio_cutadapt'
include { MERGE_TRIM_TABLES } from '../../modules/local/mergetrimtables'

workflow FILTER_AND_TRIM {

take:
input //channel: [val(meta), path(reads)]
skip_filtering

main:

ch_reports = Channel.empty()
ch_trimmed = Channel.empty()
ch_trimmed_R1 = Channel.empty()
ch_trimmed_R2 = Channel.empty()
ch_multiqc_files = Channel.empty()

// TODO: we're probably going to move to requiring the primer sequences to
// make the workflow more flexible re: trimming options, esp. since
// the current version assumes the presence of primer sequences and
// does a hard trim. This also allows for passing in cutadapt anchors
// and primer options (would need to parse these out)
for_primer_rc = params.for_primer ? reverse_complement(params.for_primer) : ""
rev_primer_rc = params.rev_primer ? reverse_complement(params.rev_primer) : ""

// Two modules/subworkflows for Illumina data:
// DADA2 trimming and filtering (PE and SE currently) - implemented
// cutadapt-based (primers + Ns) + vsearch (EE) - implemented
// Two modules/subworkflows for PacBio:
// DADA2 filtering (filter) - NYI
// cutadapt (trim) - implemented

if (params.platform == "pacbio") {

// TODO: this could be modified/split into a `cutadapt`-only step; there
// are additional filters for max EE and max N in cutadapt
PACBIO_CUTADAPT(
input,
params.for_primer,
rev_primer_rc
)


// TODO: this could be modified/split into a `DADA2`-only step
PACBIO_DADA2_FILTER_AND_TRIM(
PACBIO_CUTADAPT.out.cutadapt_trimmed
)
ch_trimmed = PACBIO_DADA2_FILTER_AND_TRIM.out.trimmed
ch_trimmed_R1 = PACBIO_DADA2_FILTER_AND_TRIM.out.trimmed
ch_reports = PACBIO_DADA2_FILTER_AND_TRIM.out.trimmed_report.collect()
// PACBIO_DADA2_FILTER_AND_TRIM(
// PACBIO_CUTADAPT.out.cutadapt_trimmed
// )
ch_trimmed = PACBIO_CUTADAPT.out.cutadapt_trimmed
ch_trimmed_R1 = PACBIO_CUTADAPT.out.cutadapt_trimmed
ch_reports = PACBIO_CUTADAPT.out.trimmed_report.collect()
ch_multiqc_files = ch_multiqc_files.mix(PACBIO_CUTADAPT.out.cutadapt_json)
} else {
// this handles both paired and single-end data
if (params.trimmer == "dada2") {
ILLUMINA_DADA2_FILTER_AND_TRIM(
input
)
ch_trimmed = ILLUMINA_DADA2_FILTER_AND_TRIM.out.trimmed
ch_reports = ILLUMINA_DADA2_FILTER_AND_TRIM.out.trimmed_report.collect()
ch_trimmed_R1 = ILLUMINA_DADA2_FILTER_AND_TRIM.out.trimmed_R1
ch_trimmed_R2 = ILLUMINA_DADA2_FILTER_AND_TRIM.out.trimmed_R2
} else if (params.trimmer == "cutadapt") {
ILLUMINA_CUTADAPT(
input,
params.for_primer,
params.rev_primer,
for_primer_rc,
rev_primer_rc
)
ch_trimmed = ILLUMINA_CUTADAPT.out.trimmed
ch_reports = ILLUMINA_CUTADAPT.out.trimmed_report.collect()
ch_trimmed_R1 = ILLUMINA_CUTADAPT.out.trimmed_R1
ch_trimmed_R2 = ILLUMINA_CUTADAPT.out.trimmed_R2
ch_multiqc_files = ch_multiqc_files.mix(ILLUMINA_CUTADAPT.out.cutadapt_json)
}
}

// TODO: add variable-length and PacBio
MERGE_TRIM_TABLES(
ch_reports
)
@@ -93,14 +94,9 @@ workflow FILTER_AND_TRIM {
trimmed = ch_trimmed
trimmed_report = MERGE_TRIM_TABLES.out.trimmed_report // channel: [ RDS ]
trimmed_infer = ch_trimmed_infer
ch_multiqc_files
}

def reverse_complement(primer) {
// returns the revcomp, handles IUPAC ambig codes
// tr "[ATGCUNYRSWKMBDHV]" "[TACGANRYSWMKVHDB]"
@@ -126,3 +122,10 @@
}
}.join('')
}
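The helper body is collapsed in this view; as a minimal standalone sketch, the mapping from the comment above would look like this (an assumption mirroring the tr table, not the committed code):

def reverse_complement(primer) {
    def comp = ['A':'T','T':'A','G':'C','C':'G','U':'A','N':'N',
                'Y':'R','R':'Y','S':'S','W':'W','K':'M','M':'K',
                'B':'V','D':'H','H':'D','V':'B']
    // reverse the primer, then complement each base, leaving unknown chars as-is
    primer.toUpperCase().reverse().collect { comp[it] ?: it }.join('')
}

// 515F from conf/test.config:
assert reverse_complement("GTGYCAGCMGCCGCGGTAA") == "TTACCGCGGCKGCTGRCAC"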

// def clean_primers(primer) {
// // returns a clean primer string, IUPAC codes
// // w/o any metadata or anchors. Assumes cutadapt
// // filtering
// }
