
Commit

implement cutadapt for Illumina, add read tracking, and tidy pacbio cutadapt
cjfields committed Jan 29, 2025
1 parent f6f7dd4 commit c30a875
Showing 8 changed files with 194 additions and 91 deletions.
3 changes: 3 additions & 0 deletions conf/test.config
@@ -23,8 +23,11 @@ params {
input = 'https://file-server.igb.illinois.edu/~cjfields/TADA/samplesheet.miseq.16s.csv'
trim_for = 24
trim_rev = 25
for_primer = "GTGYCAGCMGCCGCGGTAA"
rev_primer = "GGACTACNVGGGTWTCTAAT"
reference = 'https://file-server.igb.illinois.edu/~cjfields/TADA/silva_nr99_v138.1_train_set.fa.gz'
phylo_tool = 'fasttree'
to_QIIME2 = true
trimmer = "cutadapt"
// species = 'https://file-server.igb.illinois.edu/~cjfields/TADA/silva_species_assignment_v138.1.fa.gz'
}
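As a usage sketch (not part of this commit), the same switch works from any user config; the param names below mirror this test profile, and the primers are the 515F/806R pair shown above:

params {
    trimmer           = "cutadapt"
    for_primer        = "GTGYCAGCMGCCGCGGTAA"   // 515F
    rev_primer        = "GGACTACNVGGGTWTCTAAT"  // 806R
    illumina_twocolor = true  // assumption: set only for two-color instruments (NextSeq/NovaSeq)
}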
57 changes: 57 additions & 0 deletions modules/local/illumina_cutadapt.nf
@@ -0,0 +1,57 @@
process ILLUMINA_CUTADAPT {
tag "$meta.id"
label 'process_medium'

container 'quay.io/biocontainers/cutadapt:5.0--py39hbcbf7aa_0'

input:
tuple val(meta), path(reads)
val(for_primer)
val(rev_primer)
val(for_primer_rc)
val(rev_primer_rc)

output:
tuple val(meta), path("${meta.id}.R1.filtered.fastq.gz"), optional: true, emit: trimmed_R1
tuple val(meta), path("${meta.id}.R2.filtered.fastq.gz"), optional: true, emit: trimmed_R2
tuple val(meta), path("${meta.id}.R[12].filtered.fastq.gz"), optional: true, emit: trimmed
path("${meta.id}.cutadapt.out"), emit: trimmed_report // to merging data
path("${meta.id}.cutadapt.json"), emit: cutadapt_json // to MultiQC

when:
task.ext.when == null || task.ext.when

script:
// def args = task.ext.args ?: ''
// def prefix = task.ext.prefix ?: "${meta.id}"
maxN = params.maxN >=0 ? "--max-n ${params.maxN}" : ""
maxEE = "--max-ee ${[params.maxEE_for,params.maxEE_rev].max()}"
min_len = params.min_read_len ? "-m ${params.min_read_len}" : "-m 50"
max_len = params.max_read_len != "Inf" ? "-M ${params.max_read_len}" : ""
outr2 = meta.single_end ? '' : "-p ${meta.id}.R2.filtered.fastq.gz"
p2 = meta.single_end ? '' : "-G ${rev_primer} -A ${for_primer_rc}"
polyG = params.illumina_twocolor ? "--nextseq-trim=2" : ""

"""
cutadapt --report=minimal \\
--json=${meta.id}.cutadapt.json \\
-g ${for_primer} -a ${rev_primer_rc} ${p2} \\
--cores ${task.cpus} \\
-n 2 ${maxEE} ${min_len} ${max_len} ${maxN} ${polyG} \\
-o ${meta.id}.R1.filtered.fastq.gz ${outr2} \\
${reads} > ${meta.id}.cutadapt.out
# is the FASTQ file empty?
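# (head -c 1 grabs the first decompressed byte; tr rewrites NUL/newline to '_'
# so command substitution cannot strip it -- the test string is non-empty
# exactly when the decompressed FASTQ has any content)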
if [ -n "\$(gunzip <${meta.id}.R1.filtered.fastq.gz | head -c 1 | tr '\\0\\n' __)" ]; then
echo "Sequences present"
else
rm ${meta.id}.R[12].filtered.fastq.gz
fi
"""

// stub:
// def args = task.ext.args ?: ''
// def prefix = task.ext.prefix ?: "${meta.id}"
// """
// """
}
51 changes: 41 additions & 10 deletions modules/local/mergetrimtables.nf
@@ -15,18 +15,49 @@ process MERGE_TRIM_TABLES {
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix
"""
#!/usr/bin/env Rscript
if (params.trimmer == "dada2") {
"""
#!/usr/bin/env Rscript
# Main purpose of this script is to merge all trimming data into one table
trimmedFiles <- list.files(path = '.', pattern = '*.trimmed.txt')
sample.names <- sub('.trimmed.txt', '', trimmedFiles)
# TODO: switch to dplyr bind_rows
trimmed <- do.call("rbind", lapply(trimmedFiles, function (x) as.data.frame(read.csv(x))))
colnames(trimmed) <- c("dada2.ft.sequence", "dada2.ft.input", "dada2.ft.filtered")
trimmed\$SampleID <- sample.names
write.csv(trimmed, "all.trimmed.csv", row.names = FALSE)
"""
} else { // has to be cutadapt
"""
#!/usr/bin/env Rscript
trimmedFiles <- list.files(path = '.', pattern = '*.trimmed.txt')
sample.names <- sub('.trimmed.txt', '', trimmedFiles)
trimmed <- do.call("rbind", lapply(trimmedFiles, function (x) as.data.frame(read.csv(x))))
colnames(trimmed)[1] <- "Sequence"
trimmed\$SampleID <- sample.names
write.csv(trimmed, "all.trimmed.csv", row.names = FALSE)
"""
# Main purpose of this script is to merge all cutadapt trimming data into one table
suppressPackageStartupMessages(library(tidyverse))
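# cutadapt's --report=minimal output is a one-row TSV (header plus one data
# row), so each file parses to a single-row tibble that bind_rows() stacks below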
read_cutadapt <- function(x) {
counts <- read_tsv(x, col_names = TRUE, show_col_types = FALSE)
counts
}
# gather files and load
cutadapt_files <- list.files(path = ".", pattern = "*.cutadapt.out")
cutadapt_sample_data <- lapply(cutadapt_files, read_cutadapt)
# fix sample names
nms <- gsub(".cutadapt.out", "", cutadapt_files)
names(cutadapt_sample_data) <- nms
# only keep some data
to_keep <- c("SampleID", "status", "in_reads", "too_short",
"too_long", "too_many_n", "out_reads", "w/adapters", "w/adapters2")
final_cutadapt <- bind_rows(cutadapt_sample_data, .id="SampleID") %>%
select(all_of(to_keep))
# keep sampleID intact, but prepend 'cutadapt' to other columns
colnames(final_cutadapt)[2:length(to_keep)] <- paste0("cutadapt.", colnames(final_cutadapt)[2:length(to_keep)])
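# illustrative header of the resulting CSV (column names only, no real values):
# SampleID,cutadapt.status,cutadapt.in_reads,cutadapt.too_short,cutadapt.too_long,
# cutadapt.too_many_n,cutadapt.out_reads,cutadapt.w/adapters,cutadapt.w/adapters2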
write.csv(final_cutadapt, "all.trimmed.csv", row.names = FALSE)
"""
}

stub:
def args = task.ext.args ?: ''
43 changes: 25 additions & 18 deletions modules/local/pacbio_cutadapt.nf
@@ -1,36 +1,43 @@
// TODO: this is currently a local module; we should try to set this up
// to use the standard nf-core module
process PACBIO_CUTADAPT {
tag "${meta.id}"

container 'quay.io/biocontainers/cutadapt:5.0--py39hbcbf7aa_0'

input:
// TODO: Note the channel name here should probably be changed
tuple val(meta), path(reads)
val(for_primer)
val(rev_primer_rc)

output:
tuple val(meta), file("${meta.id}.noprimer.fastq.gz"), optional: true, emit: cutadapt_trimmed
// file("${meta.id}.cutadapt.out"), emit: cutadapt_report
// file("${meta.id}.untrimmed.fastq.gz"), emit: cutadapt_untrimmed
tuple val(meta), file("${meta.id}.filtered.fastq.gz"), optional: true, emit: cutadapt_trimmed
file("${meta.id}.cutadapt.out"), emit: trimmed_report // to merging data
file("${meta.id}.untrimmed.fastq.gz"), emit: cutadapt_untrimmed
file("${meta.id}.cutadapt.json"), emit: cutadapt_json // to MultiQC

when:
task.ext.when == null || task.ext.when

script:
strictness = params.pacbio_strict_match ? '-g' : '-a'
maxN = params.maxN >=0 ? "--max-n ${params.maxN} " : ""
maxEE = [params.max_ee_for,params.max_ee_rev].max() > 0 ? "--max-ee ${[params.max_ee_for,params.max_ee_rev].max()}" : ""
min_len = params.min_read_len ? "-m ${params.min_read_len}" : "-m 50"
max_len = params.max_read_len != "Inf" ? "-M ${params.max_read_len}" : ""
"""
# Logic: we should trim out the HiFi reads and require *both* primers be present (-g).
# This should also reorient the sequence to match the primers (--rc).
# Keep anything longer than 50bp, and allow users to filter their data by length later
cutadapt --rc \\
--report=minimal \\
${strictness} "${for_primer}...${rev_primer_rc}" \\
-j ${task.cpus} ${min_len} ${max_len} ${maxEE} ${maxN} \\
--untrimmed-output "${meta.id}.untrimmed.fastq.gz" \\
--json=${meta.id}.cutadapt.json \\
-o "${meta.id}.filtered.fastq.gz" \\
${reads} > "${meta.id}.cutadapt.out"
# is the FASTQ file empty?
if [ -n "\$(gunzip <${meta.id}.filtered.fastq.gz | head -c 1 | tr '\\0\\n' __)" ]; then
echo "Sequences present"
else
rm ${meta.id}.filtered.fastq.gz
fi
"""
}
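A quick Groovy sketch (hypothetical primer values, not from this commit) of the linked-adapter argument this module assembles; with pacbio_strict_match the pair goes to -g (both primers required), otherwise to -a:

def for_primer    = "AGRGTTYGATYMTGGCTCAG"  // e.g. 27F, full-length 16S
def rev_primer_rc = "AAGTCGTAACAAGGTARCY"   // reverse complement of 1492R
assert "${for_primer}...${rev_primer_rc}" == "AGRGTTYGATYMTGGCTCAG...AAGTCGTAACAAGGTARCY"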
6 changes: 3 additions & 3 deletions nextflow.config
@@ -42,15 +42,13 @@ params {
skip_FASTQC = false
skip_dadaQC = false
skip_MultiQC = false
// TODO: change to skip_merge_check
skip_merging_check = true
skip_ee_check = true

// Trimming
skip_trimming = false
// TODO: implement trimming/filtering subworkflow
// trim_mode = "dada2"
// filter_mode = "dada2"
trimmer = "dada2"

// when true (default), this sets cutadapt's trimming
// (which uses linked adapters) to require *both* primers be present.
@@ -75,6 +73,8 @@
maxN = 0
min_read_len = 50
max_read_len = 5000
// a general flag for trimming with Illumina modern two-color sequencing
illumina_twocolor = false
// I think we can make these bool 'false' as above with R coercion (either through as.logical or using optparse in an Rscript)
rmPhiX = false

13 changes: 13 additions & 0 deletions nextflow_schema.json
@@ -526,6 +526,19 @@
"type": "boolean",
"default": true,
"description": "Skip expected error checking in pre-QC step, hich runs per sample and can take time for larger paired-read data sets and long reads."
},
"trimmer": {
"type": "string",
"default": "dada2",
"description": "Trimming tool to use.",
"enum": [
"dada2",
"cutadapt"
]
},
"illumina_twocolor": {
"type": "boolean",
"description": "Set for Illumina two-color trimming."
}
}
}
99 changes: 51 additions & 48 deletions subworkflows/local/filter_and_trim.nf
@@ -1,73 +1,74 @@
include { ILLUMINA_DADA2_FILTER_AND_TRIM } from '../../modules/local/illumina_filterandtrim'
include { PACBIO_DADA2_FILTER_AND_TRIM } from '../../modules/local/pacbio_filterandtrim'
include { ILLUMINA_CUTADAPT } from '../../modules/local/illumina_cutadapt'
include { PACBIO_CUTADAPT } from '../../modules/local/pacbio_cutadapt'
include { MERGE_TRIM_TABLES } from '../../modules/local/mergetrimtables'

workflow FILTER_AND_TRIM {

take:
input //channel: [val(meta), path(reads)]
skip_filtering

main:

ch_reports = Channel.empty()
ch_trimmed = Channel.empty()
ch_trimmed_R1 = Channel.empty()
ch_trimmed_R2 = Channel.empty()
ch_multiqc_files = Channel.empty()

// TODO: we're probably going to move to requiring the primer sequences to
// make the workflow more flexible re: trimming options, esp. since
// the current version assumes the presence of primer sequences and
// does a hard trim. This also allows for passing in cutadapt anchors
// and primer options (would need to parse these out)
for_primer_rc = params.for_primer ? reverse_complement(params.for_primer) : ""
rev_primer_rc = params.rev_primer ? reverse_complement(params.rev_primer) : ""

// Two modules/subworkflows for Illumina data:
// DADA2 trimming and filtering (PE and SE currently) - implemented
// cutadapt-based (primers + Ns) + vsearch (EE) - implemented
// Two modules/subworkflows for PacBio:
// DADA2 filtering (filter) - NYI
// cutadapt (trim) - implemented

if (params.platform == "pacbio") {

// TODO: this could be modified/split into a `cutadapt`-only step; there
// are additional filters for max EE and max N in cutadapt
PACBIO_CUTADAPT(
input,
params.for_primer,
rev_primer_rc
)


// TODO: this could be modified/split into a `DADA2`-only step
PACBIO_DADA2_FILTER_AND_TRIM(
PACBIO_CUTADAPT.out.cutadapt_trimmed
)
ch_trimmed = PACBIO_DADA2_FILTER_AND_TRIM.out.trimmed
ch_trimmed_R1 = PACBIO_DADA2_FILTER_AND_TRIM.out.trimmed
ch_reports = PACBIO_DADA2_FILTER_AND_TRIM.out.trimmed_report.collect()
// PACBIO_DADA2_FILTER_AND_TRIM(
// PACBIO_CUTADAPT.out.cutadapt_trimmed
// )
ch_trimmed = PACBIO_CUTADAPT.out.cutadapt_trimmed
ch_trimmed_R1 = PACBIO_CUTADAPT.out.cutadapt_trimmed
ch_reports = PACBIO_CUTADAPT.out.trimmed_report.collect()
ch_multiqc_files = ch_multiqc_files.mix(PACBIO_CUTADAPT.out.cutadapt_json)
} else {
// this handles both paired and single-end data
if (params.trimmer == "dada2") {
ILLUMINA_DADA2_FILTER_AND_TRIM(
input
)
ch_trimmed = ILLUMINA_DADA2_FILTER_AND_TRIM.out.trimmed
ch_reports = ILLUMINA_DADA2_FILTER_AND_TRIM.out.trimmed_report.collect()
ch_trimmed_R1 = ILLUMINA_DADA2_FILTER_AND_TRIM.out.trimmed_R1
ch_trimmed_R2 = ILLUMINA_DADA2_FILTER_AND_TRIM.out.trimmed_R2
} else if (params.trimmer == "cutadapt") {
ILLUMINA_CUTADAPT(
input,
params.for_primer,
params.rev_primer,
for_primer_rc,
rev_primer_rc
)
ch_trimmed = ILLUMINA_CUTADAPT.out.trimmed
ch_reports = ILLUMINA_CUTADAPT.out.trimmed_report.collect()
ch_trimmed_R1 = ILLUMINA_CUTADAPT.out.trimmed_R1
ch_trimmed_R2 = ILLUMINA_CUTADAPT.out.trimmed_R2
ch_multiqc_files = ch_multiqc_files.mix(ILLUMINA_CUTADAPT.out.cutadapt_json)
}
}

// TODO: add variable-length and PacBio
MERGE_TRIM_TABLES(
ch_reports
)
@@ -93,14 +94,9 @@ workflow FILTER_AND_TRIM {
trimmed = ch_trimmed
trimmed_report = MERGE_TRIM_TABLES.out.trimmed_report // channel: [ RDS ]
trimmed_infer = ch_trimmed_infer
ch_multiqc_files
}

def reverse_complement(primer) {
// returns the revcomp, handles IUPAC ambig codes
// tr "[ATGCUNYRSWKMBDHV]" "[TACGANRYSWMKVHDB]"
@@ -126,3 +122,10 @@
}
}.join('')
}
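The helper body is collapsed in this view; as a minimal standalone sketch, the mapping from the comment above would look like this (an assumption mirroring the tr table, not the committed code):

def reverse_complement(primer) {
    def comp = ['A':'T','T':'A','G':'C','C':'G','U':'A','N':'N',
                'Y':'R','R':'Y','S':'S','W':'W','K':'M','M':'K',
                'B':'V','D':'H','H':'D','V':'B']
    // reverse the primer, then complement each base, leaving unknown chars as-is
    primer.toUpperCase().reverse().collect { comp[it] ?: it }.join('')
}

// 515F from conf/test.config:
assert reverse_complement("GTGYCAGCMGCCGCGGTAA") == "TTACCGCGGCKGCTGRCAC"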

// def clean_primers(primer) {
// // returns a clean primer string, IUPAC codes
// // w/o any metadata or anchors. Assumes cutadapt
// // filtering
// }
