diff --git a/.dockstore.yml b/.dockstore.yml
index 7c238594c..76e333a4e 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -349,3 +349,13 @@ workflows:
     primaryDescriptorPath: /pipes/WDL/workflows/trimal.wdl
     testParameterFiles:
       - empty.json
+  - name: amplicon16S_analysis
+    subclass: WDL
+    primaryDescriptorPath: /pipes/WDL/workflows/amplicon16S_analysis.wdl
+    testParameterFiles:
+      - empty.json
+  - name: qiime_import_bam
+    subclass: WDL
+    primaryDescriptorPath: /pipes/WDL/workflows/qiime_import_bam.wdl
+    testParameterFiles:
+      - empty.json
\ No newline at end of file
diff --git a/pipes/WDL/tasks/tasks_16S_amplicon.wdl b/pipes/WDL/tasks/tasks_16S_amplicon.wdl
new file mode 100755
index 000000000..07c8d59cd
--- /dev/null
+++ b/pipes/WDL/tasks/tasks_16S_amplicon.wdl
@@ -0,0 +1,295 @@
+version 1.0
+
+task qiime_import_from_bam {
+    meta {
+        description: "Parsing demultiplexed fastq BAM files into qiime readable files."
+    }
+    input {
+        File reads_bam
+        String sample_name
+        Int memory_mb = 2000
+        Int cpu = 1
+        Int disk_size_gb = ceil(2*size(reads_bam, "GiB")) + 5
+        String docker = "quay.io/broadinstitute/qiime2:conda"
+    }
+    parameter_meta {
+        reads_bam: {description: "Input BAM file"}
+    }
+
+    command <<<
+        set -ex -o pipefail
+        #Part 1A | BAM -> FASTQ [Simple samtools command]
+        samtools fastq -1 $(pwd)/R1.fastq.gz -2 $(pwd)/R2.fastq.gz -0 /dev/null ~{reads_bam}
+        #making new bash variable | regex: (_) -> (-)
+        NEWSAMPLENAME=$(echo "~{sample_name}" | perl -lape 's/[_]/-/g')
+        #All names added to one giant file
+        echo ${NEWSAMPLENAME} > NEWSAMPLENAME.txt
+        #Make a manifest.txt that contains [1.sampleid 2.R1_fastq 3.R2.fastq]
+        #> =overwrite or writes new file
+        echo -e "sample-id\tforward-absolute-filepath\treverse-absolute-filepath" > manifest.tsv
+        #>>= appends
+        #\t= tabs next value
+        echo -e "$NEWSAMPLENAME\t$(pwd)/R1.fastq.gz\t$(pwd)/R2.fastq.gz" >> manifest.tsv
+
+        #fastq -> bam (provided by qiime tools import fxn)
+        qiime tools import \
+            --type 'SampleData[PairedEndSequencesWithQuality]' \
+            --input-path manifest.tsv \
+            --input-format PairedEndFastqManifestPhred33V2 \
+            --output-path "~{sample_name}.qza"
+    >>>
+
+    output {
+        File reads_qza = "~{sample_name}.qza"
+        String samplename_master_sheet = read_string("NEWSAMPLENAME.txt")
+    }
+    runtime {
+        docker: docker
+        memory: "${memory_mb} MiB"
+        cpu: cpu
+        disk: disk_size_gb + " GB"
+        disks: "local-disk " + disk_size_gb + " HDD"
+    }
+}
+
+#Part 1 | Step 2:cutadapt: Trim sequences
+#trimreads_trim
+#trim = default
+task trim_reads {
+
+    meta {
+        description:"Removing adapter sequences, primers, and other unwanted sequence from sequence data."
+    }
+
+    input {
+        File reads_qza
+
+        String qza_basename = basename(reads_qza, '.qza')
+        #Boolean not_default = false
+        String forward_adapter = "CTGCTGCCTCCCGTAGGAGT"
+        String reverse_adapter = "AGAGTTTGATCCTGGCTCAG"
+        Int min_length = 1
+        Boolean keep_untrimmed_reads = false
+        Int memory_mb = 2000
+        Int cpu = 1
+        Int disk_size_gb = ceil(2*size(reads_qza, "GiB")) + 5
+        String docker = "quay.io/broadinstitute/qiime2:conda"
+    }
+
+    command <<<
+        set -ex -o pipefail
+        qiime cutadapt trim-paired \
+            --i-demultiplexed-sequences "~{reads_qza}" \
+            --p-front-f "~{forward_adapter}" \
+            --p-front-r "~{reverse_adapter}" \
+            ~{"--p-minimum-length " + min_length} \
+            ~{true='--p-no-discard-untrimmed' false='--p-discard-untrimmed' keep_untrimmed_reads} \
+            --o-trimmed-sequences "~{qza_basename}_trimmed.qza"
+
+        #trim_visual
+        qiime demux summarize \
+            --i-data "~{qza_basename}_trimmed.qza" \
+            --o-visualization "~{qza_basename}_trim_summary.qzv"
+    >>>
+
+    output {
+        #trimmed_sequences = paired ends for vsearch
+        File trimmed_reads_qza = "~{qza_basename}_trimmed.qza"
+        File trimmed_visualization = "~{qza_basename}_trim_summary.qzv"
+    }
+
+    runtime {
+        docker: docker
+        memory: "${memory_mb} MiB"
+        cpu: cpu
+        disk: disk_size_gb + " GB"
+        disks: "local-disk " + disk_size_gb + " HDD"
+    }
+}
+
+#Part 1 | Step 3:VSEARCH: Merge sequences
+task join_paired_ends {
+    meta {
+        description: "Join paired-end sequence reads using vsearch's merge_pairs function."
+    }
+    input {
+        #Input File: Merge paired reads
+        File trimmed_reads_qza
+        String reads_basename = basename(trimmed_reads_qza, '.qza')
+        Int memory_mb = 2000
+        Int cpu = 1
+        Int disk_size_gb = ceil(2*size(trimmed_reads_qza, "GiB")) + 5
+        String docker = "quay.io/broadinstitute/qiime2:conda"
+    }
+
+    command <<<
+        set -ex -o pipefail
+        qiime vsearch join-pairs \
+            --i-demultiplexed-seqs ~{trimmed_reads_qza} \
+            --o-joined-sequences "~{reads_basename}_joined.qza"
+
+        qiime demux summarize \
+            --i-data "~{reads_basename}_joined.qza" \
+            --o-visualization "~{reads_basename}_visualization.qzv"
+    >>>
+    output {
+        File joined_end_reads_qza = "~{reads_basename}_joined.qza"
+        File joined_end_visualization = "~{reads_basename}_visualization.qzv"
+    }
+    runtime {
+        docker: docker
+        memory: "${memory_mb} MiB"
+        cpu: cpu
+        disk: disk_size_gb + " GB"
+        disks: "local-disk " + disk_size_gb + " HDD"
+    }
+}
+
+task deblur {
+
+    meta {
+        description: "Perform sequence quality control for Illumina data using the Deblur workflow with a 16S reference as a positive filter."
+    }
+    input {
+        File joined_end_reads_qza
+        String joined_end_basename = basename(joined_end_reads_qza, '.qza')
+        Int trim_length_var = 300
+        Int memory_mb = 2000
+        Int cpu = 1
+        Int disk_size_gb = ceil(2*size(joined_end_reads_qza, "GiB")) + 5
+        String docker = "quay.io/broadinstitute/qiime2:conda"
+    }
+    command <<<
+        set -ex -o pipefail
+
+        qiime deblur denoise-16S \
+            --i-demultiplexed-seqs ~{joined_end_reads_qza} \
+            ~{"--p-trim-length " + trim_length_var} \
+            --p-sample-stats \
+            --o-representative-sequences "~{joined_end_basename}_rep_seqs.qza" \
+            --o-table "~{joined_end_basename}_table.qza" \
+            --o-stats "~{joined_end_basename}_stats.qza"
+
+        #Generate feature table- give you the number of features per sample
+        qiime feature-table summarize \
+            --i-table "~{joined_end_basename}_table.qza" \
+            --o-visualization "~{joined_end_basename}_table.qzv"
+        #Generate visualization of deblur stats
+        qiime deblur visualize-stats \
+            --i-deblur-stats "~{joined_end_basename}_stats.qza" \
+            --o-visualization "~{joined_end_basename}_stats.qzv"
+    >>>
+    output {
+        File representative_seqs_qza = "~{joined_end_basename}_rep_seqs.qza"
+        File representative_table_qza = "~{joined_end_basename}_table.qza"
+        File feature_table = "~{joined_end_basename}_table.qzv"
+        File visualize_stats = "~{joined_end_basename}_stats.qzv"
+
+    }
+    runtime {
+        docker: docker
+        memory: "${memory_mb} MiB"
+        cpu: cpu
+        disk: disk_size_gb + " GB"
+        disks: "local-disk " + disk_size_gb + " HDD"
+    }
+}
+task train_classifier {
+    meta {
+        description: "Upload a classifier trained to classify v1-2 amplicon sequences"
+    }
+    input {
+        File otu_ref
+        File taxanomy_ref
+        String forward_adapter
+        String reverse_adapter
+        Int min_length = 100
+        Int max_length = 500
+        String otu_basename = basename(otu_ref, '.qza')
+        Int memory_mb = 2000
+        Int cpu = 1
+        Int disk_size_gb = ceil(2*size(otu_ref, "GiB")) + 5
+        String docker = "quay.io/broadinstitute/qiime2:conda"
+    }
+    command <<<
+        set -ex -o pipefail
+        CONDA_ENV_NAME=$(conda info --envs -q | awk -F" " '/qiime.*/{ print $1 }')
+        conda activate ${CONDA_ENV_NAME}
+
+        qiime tools import \
+            --type 'FeatureData[Sequence]' \
+            --input-path ~{otu_ref} \
+            --output-path "~{otu_basename}_seqs.qza"
+
+        qiime tools import \
+            --type 'FeatureData[Taxonomy]' \
+            --input-format HeaderlessTSVTaxonomyFormat \
+            --input-path ~{taxanomy_ref} \
+            --output-path "~{otu_basename}_tax.qza"
+
+        qiime feature-classifier extract-reads \
+            --i-sequences "~{otu_basename}_seqs.qza" \
+            --p-f-primer "~{forward_adapter}" \
+            --p-r-primer "~{reverse_adapter}" \
+            ~{"--p-min-length " + min_length} \
+            ~{"--p-max-length " + max_length} \
+            --o-reads "~{otu_basename}_v1-2-ref-seqs.qza"
+
+        qiime feature-classifier fit-classifier-naive-bayes \
+            --i-reference-reads "~{otu_basename}_v1-2-ref-seqs.qza" \
+            --i-reference-taxonomy "~{otu_basename}_tax.qza" \
+            --o-classifier "~{otu_basename}_v1-2-classifier.qza"
+    >>>
+    output {
+        File trained_classifier = "~{otu_basename}_v1-2-classifier.qza"
+    }
+    runtime {
+        docker: docker
+        memory: "${memory_mb} MiB"
+        cpu: cpu
+        disk: disk_size_gb + " GB"
+        disks: "local-disk " + disk_size_gb + " HDD"
+    }
+}
+task tax_analysis {
+    meta {
+        description: "Protocol describes performing a taxonomic classification with a naive bayes classifier that has been trained on the V1-2 regions amplified by our primers."
+    }
+    input {
+        File trained_classifier
+        File representative_seqs_qza
+        File representative_table_qza
+        String basename = basename(trained_classifier, '.qza')
+        Int memory_mb = 7000
+        Int cpu = 1
+        Int disk_size_gb = 375
+        String docker = "quay.io/broadinstitute/qiime2:conda"
+    }
+    command <<<
+        set -ex -o pipefail
+        qiime feature-classifier classify-sklearn \
+            --i-classifier ~{trained_classifier} \
+            --i-reads ~{representative_seqs_qza} \
+            --o-classification "~{basename}_tax.qza"
+
+        qiime feature-table tabulate-seqs \
+            --i-data ~{representative_seqs_qza} \
+            --o-visualization "~{basename}_rep_seqs.qzv"
+
+        qiime taxa barplot \
+            --i-table ~{representative_table_qza} \
+            --i-taxonomy "~{basename}_tax.qza" \
+            --o-visualization "~{basename}_bar_plots.qzv"
+    >>>
+    output {
+        File rep_seq_list = "~{basename}_rep_seqs.qzv"
+        File tax_classification_graph = "~{basename}_bar_plots.qzv"
+    }
+    runtime {
+        docker: docker
+        memory: "${memory_mb} MiB"
+        cpu: cpu
+        disk: disk_size_gb + " GB"
+        disks: "local-disk " + disk_size_gb + " HDD"
+    }
+}
\ No newline at end of file
diff --git a/pipes/WDL/workflows/amplicon16S_analysis.wdl b/pipes/WDL/workflows/amplicon16S_analysis.wdl
new file mode 100755
index 000000000..b0635eafb
--- /dev/null
+++ b/pipes/WDL/workflows/amplicon16S_analysis.wdl
@@ -0,0 +1,48 @@
+version 1.0
+
+import "../tasks/tasks_16S_amplicon.wdl" as qiime
+
+workflow amplicon16S_analysis {
+
+    meta {
+        description: "Running 16S amplicon (from BAM format) sequencing analysis with qiime."
+        author: "fnegrete"
+        email: "viral_ngs@broadinstitute.org"
+        allowNestedInputs: true
+    }
+    input {
+        File reads_bam
+        File trained_classifier
+        String sample_name
+        Boolean keep_untrimmed_reads
+    }
+
+    call qiime.qiime_import_from_bam {
+        input:
+            reads_bam = reads_bam,
+            sample_name = sample_name
+    }
+    #__________________________________________
+    call qiime.trim_reads {
+        input:
+            reads_qza = qiime_import_from_bam.reads_qza,
+            keep_untrimmed_reads = keep_untrimmed_reads
+    }
+    #__________________________________________
+    call qiime.join_paired_ends {
+        input:
+            trimmed_reads_qza = trim_reads.trimmed_reads_qza
+    }
+    #_________________________________________
+    call qiime.deblur {
+        input:
+            joined_end_reads_qza = join_paired_ends.joined_end_reads_qza
+    }
+    #_________________________________________
+    call qiime.tax_analysis {
+        input:
+            trained_classifier = trained_classifier,
+            representative_seqs_qza = deblur.representative_seqs_qza,
+            representative_table_qza = deblur.representative_table_qza
+    }
+}
\ No newline at end of file
diff --git a/pipes/WDL/workflows/qiime_import_bam.wdl b/pipes/WDL/workflows/qiime_import_bam.wdl
new file mode 100644
index 000000000..caaee2cdb
--- /dev/null
+++ b/pipes/WDL/workflows/qiime_import_bam.wdl
@@ -0,0 +1,23 @@
+version 1.0
+
+import "../tasks/tasks_16S_amplicon.wdl" as infile
+
+workflow qiime_import_bam {
+
+meta{
+    description: "Importing BAM files into QIIME"
+    author: "fnegrete"
+    email: "viral_ngs@broadinstitute.org"
+    allowNestedInputs: true
+}
+input {
+    File reads_bam
+    String sample_name
+}
+
+call infile.qiime_import_from_bam {
+    input:
+        reads_bam = reads_bam,
+        sample_name = sample_name
+    }
+}
\ No newline at end of file
diff --git a/test/input/WDL/cromwell-local/test_inputs-qiime_import_bam-local.json b/test/input/WDL/cromwell-local/test_inputs-qiime_import_bam-local.json
new file mode 120000
index 000000000..15bb6b643
--- /dev/null
+++ b/test/input/WDL/cromwell-local/test_inputs-qiime_import_bam-local.json
@@ -0,0 +1 @@
+../test_inputs-qiime_import_bam-local.json
\ No newline at end of file
diff --git a/test/input/WDL/miniwdl-local/test_inputs-qiime_import_bam-local.json b/test/input/WDL/miniwdl-local/test_inputs-qiime_import_bam-local.json
new file mode 120000
index 000000000..15bb6b643
--- /dev/null
+++ b/test/input/WDL/miniwdl-local/test_inputs-qiime_import_bam-local.json
@@ -0,0 +1 @@
+../test_inputs-qiime_import_bam-local.json
\ No newline at end of file
diff --git a/test/input/WDL/test_inputs-qiime_import_bam-local.json b/test/input/WDL/test_inputs-qiime_import_bam-local.json
new file mode 100644
index 000000000..000c9f95a
--- /dev/null
+++ b/test/input/WDL/test_inputs-qiime_import_bam-local.json
@@ -0,0 +1,4 @@
+{
+  "qiime_import_bam.reads_bam": "test/input/G5012.3.subset.bam",
+  "qiime_import_bam.sample_name": "G5012.3.subset.bam"
+}