Skip to content

Commit

Permalink
Merge pull request #123 from broadinstitute/dp-assembly
Browse files Browse the repository at this point in the history
default to minimap2 for assemble_refbased
  • Loading branch information
dpark01 committed Jun 19, 2020
2 parents 33d2a3b + d4b65c0 commit fd7bc67
Show file tree
Hide file tree
Showing 9 changed files with 187 additions and 26 deletions.
15 changes: 11 additions & 4 deletions pipes/WDL/tasks/tasks_assembly.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,6 @@ task ivar_trim {
}

command {
set -ex -o pipefail
ivar version | head -1 | tee VERSION
if [ -f "${trim_coords_bed}" ]; then
ivar trim -e \
Expand Down Expand Up @@ -256,7 +255,7 @@ task align_reads {

File? novocraft_license

String? aligner="novoalign"
String aligner="minimap2"
String? aligner_options
Boolean? skip_mark_dupes=false

Expand Down Expand Up @@ -311,6 +310,8 @@ task align_reads {
samtools index "${sample_name}.mapped.bam" "${sample_name}.mapped.bai"
fi

cat /proc/loadavg > CPU_LOAD

# collect figures of merit
grep -v '^>' assembly.fasta | tr -d '\nNn' | wc -c | tee assembly_length_unambiguous
samtools view -c ${reads_unmapped_bam} | tee reads_provided
Expand All @@ -323,6 +324,9 @@ task align_reads {

# fastqc mapped bam
reports.py fastqc ${sample_name}.mapped.bam ${sample_name}.mapped_fastqc.html --out_zip ${sample_name}.mapped_fastqc.zip

cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC
cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES
}

output {
Expand All @@ -336,8 +340,11 @@ task align_reads {
Int reads_provided = read_int("reads_provided")
Int reads_aligned = read_int("reads_aligned")
Int read_pairs_aligned = read_int("read_pairs_aligned")
Int bases_aligned = read_int("bases_aligned")
Float bases_aligned = read_float("bases_aligned")
Float mean_coverage = read_float("mean_coverage")
Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000)
Int runtime_sec = ceil(read_float("UPTIME_SEC"))
String cpu_load = read_string("CPU_LOAD")
String viralngs_version = read_string("VERSION")
}
Expand Down Expand Up @@ -628,7 +635,7 @@ task refine_2x_and_plot {
Int assembly_length_unambiguous = read_int("assembly_length_unambiguous")
Int reads_aligned = read_int("reads_aligned")
Int read_pairs_aligned = read_int("read_pairs_aligned")
Int bases_aligned = read_int("bases_aligned")
Float bases_aligned = read_float("bases_aligned")
Float mean_coverage = read_float("mean_coverage")
String viralngs_version = read_string("VERSION")
}
Expand Down
112 changes: 111 additions & 1 deletion pipes/WDL/tasks/tasks_reports.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ task plot_coverage {
Int assembly_length = read_int("assembly_length")
Int reads_aligned = read_int("reads_aligned")
Int read_pairs_aligned = read_int("read_pairs_aligned")
Int bases_aligned = read_int("bases_aligned")
Float bases_aligned = read_float("bases_aligned")
Float mean_coverage = read_float("mean_coverage")
String viralngs_version = read_string("VERSION")
}
Expand Down Expand Up @@ -359,3 +359,113 @@ task MultiQC {
dx_instance_type: "mem1_ssd1_v2_x2"
}
}

# Join two or more tab-delimited tables on one or more shared id columns
# using csvkit's `csvjoin`, and write the result as a tab-delimited file.
#
# Inputs:
#   input_tsvs    tab-delimited files to join (at least one required)
#   id_columns    column name(s) to join on; passed to `csvjoin -c` comma-joined
#   join_type     one of "inner" (default), "outer", "left", "right"
#   out_basename  output file name without extension; ".txt" is appended
#   docker        container image providing csvkit
# Outputs:
#   out_tsv       the joined table, "<out_basename>.txt"
task tsv_join {
    input {
        Array[File]+   input_tsvs
        Array[String]+ id_columns
        String         join_type="inner"
        String         out_basename

        String         docker="stratdat/csvkit"
    }

    command {
        # Fail the task if any stage of the pipeline below fails, instead of
        # silently emitting an empty or truncated table.
        set -e -o pipefail

        # Translate join_type into the matching csvjoin flag. An inner join is
        # csvjoin's behavior when no flag is given.
        case "${join_type}" in
            inner)
                JOIN_TYPE=""
                ;;
            outer|left|right)
                JOIN_TYPE="--${join_type}"
                ;;
            *)
                echo "unrecognized join_type ${join_type}" >&2
                exit 1
                ;;
        esac

        # -t: inputs are tab-delimited; -y 0: no dialect sniffing;
        # -I: no type inference (values pass through as text).
        # csvjoin always writes comma-delimited CSV, so convert with
        # `csvformat -T`, which re-serializes properly. A blind `tr , '\t'`
        # would also rewrite commas that appear inside field values and would
        # leave CSV quoting in the output.
        csvjoin -t -y 0 -I \
            -c "${sep=',' id_columns}" \
            $JOIN_TYPE \
            ${sep=' ' input_tsvs} \
            | csvformat -T \
            > "${out_basename}.txt"
    }

    output {
        File out_tsv = "${out_basename}.txt"
    }

    runtime {
        memory: "1 GB"
        cpu: 1
        docker: "${docker}"
        disks: "local-disk 50 HDD"
        dx_instance_type: "mem1_ssd1_v2_x2"
    }
}

# Vertically concatenate tab-delimited tables using csvkit's `csvstack`,
# and write the result as a tab-delimited file.
#
# Inputs:
#   input_tsvs    tab-delimited files to stack (at least one required)
#   out_basename  output file name without extension; ".txt" is appended
#   docker        container image providing csvkit
# Outputs:
#   out_tsv       the stacked table, "<out_basename>.txt"
task tsv_stack {
    input {
        Array[File]+ input_tsvs
        String       out_basename
        String       docker="stratdat/csvkit"
    }

    command {
        # Fail the task if any stage of the pipeline below fails, instead of
        # silently emitting an empty or truncated table.
        set -e -o pipefail

        # -t: inputs are tab-delimited; --filenames: csvstack adds a grouping
        # column identifying the source file of each row.
        # csvstack writes comma-delimited CSV, so convert with `csvformat -T`
        # rather than `tr , '\t'`, which would also rewrite commas that occur
        # inside field values (including file names) and keep CSV quoting.
        csvstack -t --filenames \
            ${sep=' ' input_tsvs} \
            | csvformat -T \
            > "${out_basename}.txt"
    }

    output {
        File out_tsv = "${out_basename}.txt"
    }

    runtime {
        memory: "1 GB"
        cpu: 1
        docker: "${docker}"
        disks: "local-disk 50 HDD"
        dx_instance_type: "mem1_ssd1_v2_x2"
    }
}

# Run `assembly.py alignment_summary` on a pair of genome files and record
# basic resource-usage figures for the container.
#
# Inputs:
#   genome_one, genome_two  the two genome files to compare
#   out_basename            output file name without extension; ".txt" is appended
#   docker                  container image providing assembly.py
# Outputs:
#   comparison_table  "<out_basename>.txt" as written by alignment_summary
#   max_ram_gb        MEM_BYTES / 1e9, rounded up (0 if it could not be read)
#   runtime_sec       first field of /proc/uptime, rounded up
#   cpu_load          contents of /proc/loadavg
#   viralngs_version  output of `assembly.py --version`
task compare_two_genomes {
    input {
        File   genome_one
        File   genome_two
        String out_basename

        String docker="quay.io/broadinstitute/viral-assemble"
    }

    command {
        set -ex -o pipefail
        assembly.py --version | tee VERSION
        assembly.py alignment_summary "${genome_one}" "${genome_two}" --outfileName "${out_basename}.txt" --printCounts --loglevel=DEBUG

        # Resource metrics are best-effort bookkeeping and must not fail a
        # comparison that already succeeded.
        cut -f 1 -d ' ' /proc/uptime > UPTIME_SEC
        cat /proc/loadavg > CPU_LOAD

        # memory.max_usage_in_bytes is the cgroup v1 location. Because of
        # `set -e`, reading it unconditionally aborts the task on hosts where
        # it is absent. Fall back to the cgroup v2 peak counter when present,
        # and to 0 otherwise, so that MEM_BYTES always exists for read_float().
        if [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then
            cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES
        elif [ -f /sys/fs/cgroup/memory.peak ]; then
            cat /sys/fs/cgroup/memory.peak > MEM_BYTES
        else
            echo 0 > MEM_BYTES
        fi
    }

    output {
        File   comparison_table = "${out_basename}.txt"
        Int    max_ram_gb       = ceil(read_float("MEM_BYTES")/1000000000)
        Int    runtime_sec      = ceil(read_float("UPTIME_SEC"))
        String cpu_load         = read_string("CPU_LOAD")
        String viralngs_version = read_string("VERSION")
    }

    runtime {
        memory: "3 GB"
        cpu: 2
        docker: "${docker}"
        disks: "local-disk 50 HDD"
        dx_instance_type: "mem1_ssd1_v2_x2"
        preemptible: 1
    }
}


2 changes: 1 addition & 1 deletion pipes/WDL/workflows/align_and_plot.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ workflow align_and_plot {
Int reads_provided = align.reads_provided
Int reads_aligned = align.reads_aligned
Int read_pairs_aligned = align.read_pairs_aligned
Int bases_aligned = align.bases_aligned
Float bases_aligned = align.bases_aligned
Float mean_coverage = align.mean_coverage
String align_viral_core_version = align.viralngs_version
File coverage_plot = plot_coverage.coverage_plot
Expand Down
2 changes: 1 addition & 1 deletion pipes/WDL/workflows/assemble_denovo.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ workflow assemble_denovo {
File aligned_only_reads_fastqc = refine_2x_and_plot.aligned_only_reads_fastqc
File coverage_tsv = refine_2x_and_plot.coverage_tsv
Int read_pairs_aligned = refine_2x_and_plot.read_pairs_aligned
Int bases_aligned = refine_2x_and_plot.bases_aligned
Float bases_aligned = refine_2x_and_plot.bases_aligned

String? deplete_viral_classify_version = deplete_taxa.viralngs_version
String? taxfilt_viral_classify_version = filter_to_taxon.viralngs_version
Expand Down
31 changes: 22 additions & 9 deletions pipes/WDL/workflows/assemble_refbased.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import "../tasks/tasks_read_utils.wdl" as read_utils
workflow assemble_refbased {

meta {
description: "Reference-based microbial consensus calling. Aligns short reads to a singular reference genome, calls a new consensus sequence, and emits: new assembly, reads aligned to provided reference, reads aligned to new assembly, various figures of merit, plots, and QC metrics. The user may provide unaligned reads spread across multiple input files and this workflow will parallelize alignment per input file before merging results prior to consensus calling."
description: "Reference-based microbial consensus calling. Aligns NGS reads to a singular reference genome, calls a new consensus sequence, and emits: new assembly, reads aligned to provided reference, reads aligned to new assembly, various figures of merit, plots, and QC metrics. The user may provide unaligned reads spread across multiple input files and this workflow will parallelize alignment per input file before merging results prior to consensus calling."
author: "Broad Viral Genomics"
email: "viral-ngs@broadinstitute.org"
}
Expand All @@ -25,6 +25,9 @@ workflow assemble_refbased {
description: "Reference genome to align reads to.",
patterns: ["*.fasta"]
}
aligner: {
description: "Read aligner software to use. Options: novoalign, bwa, minimap2. Minimap2 can automatically handle Illumina, PacBio, or Oxford Nanopore reads as long as the 'PL' field in the BAM read group header is set properly (novoalign and bwa are Illumina-only)."
}
novocraft_license: {
description: "The default Novoalign short read aligner is a commercially licensed software that is available in a much slower, single-threaded version for free. If you have a paid license file, provide it here to run in multi-threaded mode. If this is omitted, it will run in single-threaded mode.",
patterns: ["*.lic"]
Expand All @@ -50,21 +53,32 @@ workflow assemble_refbased {
Array[File]+ reads_unmapped_bams
File reference_fasta

String aligner="minimap2"
File? novocraft_license
Boolean? skip_mark_dupes=false
File? trim_coords_bed
}

Map[String,String] align_to_ref_options = {
"novoalign": "-r Random -l 40 -g 40 -x 20 -t 501 -k",
"bwa": "-k 12 -B 1",
"minimap2": ""
}
Map[String,String] align_to_self_options = {
"novoalign": "-r Random -l 40 -g 40 -x 20 -t 100",
"bwa": "",
"minimap2": ""
}

scatter(reads_unmapped_bam in reads_unmapped_bams) {
call assembly.align_reads as align_to_ref {
input:
reference_fasta = reference_fasta,
reads_unmapped_bam = reads_unmapped_bam,
novocraft_license = novocraft_license,
skip_mark_dupes = skip_mark_dupes,
aligner_options = "-r Random -l 40 -g 40 -x 20 -t 501 -k"
## (for bwa) -- aligner_options = "-k 12 -B 1"
## (for novoalign) -- aligner_options = "-r Random -l 40 -g 40 -x 20 -t 501 -k"
aligner = aligner,
aligner_options = align_to_ref_options[aligner]
}
call assembly.ivar_trim {
input:
Expand Down Expand Up @@ -100,9 +114,8 @@ workflow assemble_refbased {
reads_unmapped_bam = reads_unmapped_bam,
novocraft_license = novocraft_license,
skip_mark_dupes = skip_mark_dupes,
aligner_options = "-r Random -l 40 -g 40 -x 20 -t 100"
## (for bwa) -- aligner_options = "-k 12 -B 1"
## (for novoalign) -- aligner_options = "-r Random -l 40 -g 40 -x 20 -t 501 -k"
aligner = aligner,
aligner_options = align_to_self_options[aligner]
}
}

Expand Down Expand Up @@ -137,14 +150,14 @@ workflow assemble_refbased {
File align_to_ref_merged_coverage_tsv = plot_ref_coverage.coverage_tsv
Int align_to_ref_merged_reads_aligned = plot_ref_coverage.reads_aligned
Int align_to_ref_merged_read_pairs_aligned = plot_ref_coverage.read_pairs_aligned
Int align_to_ref_merged_bases_aligned = plot_ref_coverage.bases_aligned
Float align_to_ref_merged_bases_aligned = plot_ref_coverage.bases_aligned

File align_to_self_merged_aligned_only_bam = merge_align_to_self.out_bam
File align_to_self_merged_coverage_plot = plot_self_coverage.coverage_plot
File align_to_self_merged_coverage_tsv = plot_self_coverage.coverage_tsv
Int align_to_self_merged_reads_aligned = plot_self_coverage.reads_aligned
Int align_to_self_merged_read_pairs_aligned = plot_self_coverage.read_pairs_aligned
Int align_to_self_merged_bases_aligned = plot_self_coverage.bases_aligned
Float align_to_self_merged_bases_aligned = plot_self_coverage.bases_aligned
Float align_to_self_merged_mean_coverage = plot_self_coverage.mean_coverage

String align_to_ref_viral_core_version = align_to_ref.viralngs_version[0]
Expand Down
31 changes: 31 additions & 0 deletions pipes/WDL/workflows/diff_genome_sets.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
version 1.0

import "../tasks/tasks_reports.wdl" as reports

# Compare two parallel lists of genomes pairwise and stack the per-pair
# comparison tables into a single table.
#
# Inputs:
#   genome_set_one, genome_set_two  paired element-by-element via zip(), so the
#                                   i-th file of one set is compared with the
#                                   i-th file of the other; zip() requires the
#                                   two arrays to have the same length
# Outputs:
#   diff  the stacked comparison table produced by tsv_stack
workflow diff_genome_sets {

    input {
        Array[File] genome_set_one
        Array[File] genome_set_two
    }

    scatter(sample in zip(genome_set_one, genome_set_two)) {
        call reports.compare_two_genomes {
            input:
                genome_one   = sample.left,
                genome_two   = sample.right,
                # name each per-pair table after the first genome's file name,
                # minus a trailing ".fasta"
                out_basename = basename(sample.left, '.fasta')
        }
    }

    call reports.tsv_stack {
        input:
            input_tsvs   = compare_two_genomes.comparison_table,
            # tsv_stack appends ".txt" itself; passing "diff_genome_sets.txt"
            # here produced a file named "diff_genome_sets.txt.txt"
            out_basename = "diff_genome_sets"
    }

    output {
        File diff = tsv_stack.out_tsv
    }

}
2 changes: 1 addition & 1 deletion pipes/WDL/workflows/scaffold_and_refine.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ workflow scaffold_and_refine {
File aligned_only_reads_fastqc = refine_2x_and_plot.aligned_only_reads_fastqc
File coverage_tsv = refine_2x_and_plot.coverage_tsv
Int read_pairs_aligned = refine_2x_and_plot.read_pairs_aligned
Int bases_aligned = refine_2x_and_plot.bases_aligned
Float bases_aligned = refine_2x_and_plot.bases_aligned

String scaffold_viral_assemble_version = scaffold.viralngs_version
String refine_viral_assemble_version = refine_2x_and_plot.viralngs_version
Expand Down
2 changes: 1 addition & 1 deletion requirements-modules.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
broadinstitute/viral-core=2.1.3
broadinstitute/viral-assemble=2.1.3.0
broadinstitute/viral-assemble=2.1.3.1
broadinstitute/viral-classify=2.1.3.1
broadinstitute/viral-phylo=2.1.3.1
broadinstitute/beast-beagle-cuda=1.10.5pre
Expand Down
16 changes: 8 additions & 8 deletions test/input/WDL/test_outputs-assemble_refbased-local.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
{
"assemble_refbased.align_to_self_merged_bases_aligned": 1765480,
"assemble_refbased.align_to_self_merged_read_pairs_aligned": 16798,
"assemble_refbased.align_to_self_merged_reads_aligned": 17480,
"assemble_refbased.align_to_ref_merged_bases_aligned": 1841937,
"assemble_refbased.align_to_ref_merged_read_pairs_aligned": 17644,
"assemble_refbased.align_to_ref_merged_reads_aligned": 18237,
"assemble_refbased.align_to_ref_merged_bases_aligned": 1851882,
"assemble_refbased.align_to_ref_merged_read_pairs_aligned": 17312,
"assemble_refbased.align_to_ref_merged_reads_aligned": 18409,
"assemble_refbased.align_to_self_merged_bases_aligned": 1851898,
"assemble_refbased.align_to_self_merged_read_pairs_aligned": 17314,
"assemble_refbased.align_to_self_merged_reads_aligned": 18409,
"assemble_refbased.reference_genome_length": 18959,
"assemble_refbased.assembly_length_unambiguous": 18865,
"assemble_refbased.assembly_length": 18865
"assemble_refbased.assembly_length_unambiguous": 18889,
"assemble_refbased.assembly_length": 18889
}

0 comments on commit fd7bc67

Please sign in to comment.