Merge pull request #123 from broadinstitute/dp-assembly

default to minimap2 for assemble_refbased
broadinstitute · Jun 19, 2020 · fd7bc67 · fd7bc67
2 parents 33d2a3b + d4b65c0
commit fd7bc67
Show file tree

Hide file tree

Showing 9 changed files with 187 additions and 26 deletions.
diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl
@@ -215,7 +215,6 @@ task ivar_trim {
     }
 
     command {
-        set -ex -o pipefail
         ivar version | head -1 | tee VERSION
         if [ -f "${trim_coords_bed}" ]; then
           ivar trim -e \
@@ -256,7 +255,7 @@ task align_reads {
 
     File?    novocraft_license
 
-    String?  aligner="novoalign"
+    String   aligner="minimap2"
     String?  aligner_options
     Boolean? skip_mark_dupes=false
 
@@ -311,6 +310,8 @@ task align_reads {
       samtools index "${sample_name}.mapped.bam" "${sample_name}.mapped.bai"
     fi
 
+    cat /proc/loadavg > CPU_LOAD
+
     # collect figures of merit
     grep -v '^>' assembly.fasta | tr -d '\nNn' | wc -c | tee assembly_length_unambiguous
     samtools view -c ${reads_unmapped_bam} | tee reads_provided
@@ -323,6 +324,9 @@ task align_reads {
 
     # fastqc mapped bam
     reports.py fastqc ${sample_name}.mapped.bam ${sample_name}.mapped_fastqc.html --out_zip ${sample_name}.mapped_fastqc.zip
+
+    cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC
+    cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES
   }
 
   output {
@@ -336,8 +340,11 @@ task align_reads {
     Int    reads_provided                = read_int("reads_provided")
     Int    reads_aligned                 = read_int("reads_aligned")
     Int    read_pairs_aligned            = read_int("read_pairs_aligned")
-    Int    bases_aligned                 = read_int("bases_aligned")
+    Float  bases_aligned                 = read_float("bases_aligned")
     Float  mean_coverage                 = read_float("mean_coverage")
+    Int    max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000)
+    Int    runtime_sec = ceil(read_float("UPTIME_SEC"))
+    String cpu_load = read_string("CPU_LOAD")
     String viralngs_version              = read_string("VERSION")
   }
 
@@ -628,7 +635,7 @@ task refine_2x_and_plot {
         Int  assembly_length_unambiguous   = read_int("assembly_length_unambiguous")
         Int  reads_aligned                 = read_int("reads_aligned")
         Int  read_pairs_aligned            = read_int("read_pairs_aligned")
-        Int  bases_aligned                 = read_int("bases_aligned")
+        Float bases_aligned                 = read_float("bases_aligned")
         Float mean_coverage                = read_float("mean_coverage")
         String viralngs_version            = read_string("VERSION")
     }

diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl
@@ -64,7 +64,7 @@ task plot_coverage {
     Int    assembly_length               = read_int("assembly_length")
     Int    reads_aligned                 = read_int("reads_aligned")
     Int    read_pairs_aligned            = read_int("read_pairs_aligned")
-    Int    bases_aligned                 = read_int("bases_aligned")
+    Float  bases_aligned                 = read_float("bases_aligned")
     Float  mean_coverage                 = read_float("mean_coverage")
     String viralngs_version              = read_string("VERSION")
   }
@@ -359,3 +359,113 @@ task MultiQC {
     dx_instance_type: "mem1_ssd1_v2_x2"
   }
 }
+
+# Join tab-separated tables on shared key column(s) with csvkit's csvjoin and
+# write the result as a tab-separated file.
+#
+# Inputs:
+#   input_tsvs    non-empty list of tab-delimited files, passed to csvjoin in order.
+#   id_columns    non-empty list of key column names, passed comma-joined to `csvjoin -c`.
+#   join_type     "inner" (default), "outer", "left" or "right"; any other value
+#                 makes the task exit with status 1.
+#   out_basename  output file stem; the result is written to <out_basename>.txt.
+#   docker        container image that provides csvkit.
+# Outputs:
+#   out_tsv       the joined table (<out_basename>.txt).
+task tsv_join {
+  input {
+    Array[File]+   input_tsvs
+    Array[String]+ id_columns
+    String         join_type="inner"
+    String         out_basename
+
+    String         docker="stratdat/csvkit"
+  }
+
+  command {
+    # Abort on the first error. pipefail makes a csvjoin failure fail the task;
+    # without it only the exit status of the last pipeline stage is checked.
+    set -e -o pipefail
+
+    # Map join_type onto the csvjoin flag of the same name; "inner" needs no flag.
+    case "${join_type}" in
+      inner)
+        JOIN_TYPE=""
+        ;;
+      outer|left|right)
+        JOIN_TYPE="--${join_type}"
+        ;;
+      *)
+        echo "unrecognized join_type ${join_type}" >&2
+        exit 1
+        ;;
+    esac
+
+    # csvjoin flags: -t tab-delimited input, -y 0 no dialect sniffing,
+    # -I no type inference (values pass through as text).
+    # csvjoin writes CSV, so csvformat -T re-serializes it with tab delimiters.
+    # A plain comma-to-tab character substitution would also rewrite commas
+    # that are part of a field value and leave CSV quoting in the output.
+    csvjoin -t -y 0 -I \
+      -c ${sep=',' id_columns} \
+      $JOIN_TYPE \
+      ${sep=' ' input_tsvs} \
+      | csvformat -T \
+      > "${out_basename}.txt"
+  }
+
+  output {
+    File   out_tsv = "${out_basename}.txt"
+  }
+
+  runtime {
+    memory: "1 GB"
+    cpu: 1
+    docker: "${docker}"
+    disks: "local-disk 50 HDD"
+    dx_instance_type: "mem1_ssd1_v2_x2"
+  }
+}
+
+# Stack the rows of several tab-separated tables into one tab-separated file
+# using csvkit's csvstack.
+#
+# Inputs:
+#   input_tsvs    non-empty list of tab-delimited files, passed to csvstack in order.
+#   out_basename  output file stem; the result is written to <out_basename>.txt.
+#   docker        container image that provides csvkit.
+# Outputs:
+#   out_tsv       the stacked table (<out_basename>.txt).
+task tsv_stack {
+  input {
+    Array[File]+   input_tsvs
+    String         out_basename
+    String         docker="stratdat/csvkit"
+  }
+
+  command {
+    # Abort on the first error. pipefail makes a csvstack failure fail the task;
+    # without it only the exit status of the last pipeline stage is checked.
+    set -e -o pipefail
+
+    # csvstack flags: -t tab-delimited input; --filenames uses each input's file
+    # name as the grouping value for its rows (see csvkit csvstack docs).
+    # csvstack writes CSV, so csvformat -T re-serializes it with tab delimiters.
+    # A plain comma-to-tab character substitution would also rewrite commas
+    # that are part of a field value and leave CSV quoting in the output.
+    csvstack -t --filenames \
+      ${sep=' ' input_tsvs} \
+      | csvformat -T \
+      > "${out_basename}.txt"
+  }
+
+  output {
+    File   out_tsv = "${out_basename}.txt"
+  }
+
+  runtime {
+    memory: "1 GB"
+    cpu: 1
+    docker: "${docker}"
+    disks: "local-disk 50 HDD"
+    dx_instance_type: "mem1_ssd1_v2_x2"
+  }
+
+}
+
+# Compare two genome files with `assembly.py alignment_summary` and record
+# basic resource-usage figures for the task.
+#
+# Inputs:
+#   genome_one, genome_two  the two files handed to alignment_summary.
+#   out_basename            output file stem; the table is written to <out_basename>.txt.
+#   docker                  container image that provides assembly.py.
+# Outputs:
+#   comparison_table   the file written via --outfileName.
+#   max_ram_gb         peak memory reported by the cgroup, in units of 10^9 bytes,
+#                      rounded up; 0 when no cgroup peak-memory file is readable.
+#   runtime_sec        first field of /proc/uptime, rounded up.
+#                      NOTE(review): that is system uptime, so it only approximates
+#                      task runtime if the machine was booted for this task.
+#   cpu_load           raw contents of /proc/loadavg at the end of the task.
+#   viralngs_version   output of `assembly.py --version`.
+task compare_two_genomes {
+  input {
+    File          genome_one
+    File          genome_two
+    String        out_basename
+
+    String        docker="quay.io/broadinstitute/viral-assemble"
+  }
+
+  command {
+    set -ex -o pipefail
+    assembly.py --version | tee VERSION
+    assembly.py alignment_summary "${genome_one}" "${genome_two}" --outfileName "${out_basename}.txt" --printCounts --loglevel=DEBUG
+    cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC
+    cat /proc/loadavg > CPU_LOAD
+
+    # Peak memory lives in different files under cgroup v2 and v1, and may be
+    # absent altogether. Because of `set -e`, an unconditional cat of a missing
+    # file would fail the task after the comparison itself had succeeded, so
+    # probe for the file and fall back to 0.
+    if [ -f /sys/fs/cgroup/memory.peak ]; then
+      cat /sys/fs/cgroup/memory.peak
+    elif [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then
+      cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes
+    else
+      echo 0
+    fi > MEM_BYTES
+  }
+
+  output {
+    File   comparison_table = "${out_basename}.txt"
+    Int    max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000)
+    Int    runtime_sec = ceil(read_float("UPTIME_SEC"))
+    String cpu_load = read_string("CPU_LOAD")
+    String viralngs_version = read_string("VERSION")
+  }
+
+  runtime {
+    memory: "3 GB"
+    cpu: 2
+    docker: "${docker}"
+    disks: "local-disk 50 HDD"
+    dx_instance_type: "mem1_ssd1_v2_x2"
+    preemptible: 1
+  }
+}
+
+
diff --git a/pipes/WDL/workflows/align_and_plot.wdl b/pipes/WDL/workflows/align_and_plot.wdl
@@ -29,7 +29,7 @@ workflow align_and_plot {
         Int    reads_provided                = align.reads_provided
         Int    reads_aligned                 = align.reads_aligned
         Int    read_pairs_aligned            = align.read_pairs_aligned
-        Int    bases_aligned                 = align.bases_aligned
+        Float  bases_aligned                 = align.bases_aligned
         Float  mean_coverage                 = align.mean_coverage
         String align_viral_core_version      = align.viralngs_version
         File   coverage_plot                 = plot_coverage.coverage_plot

diff --git a/pipes/WDL/workflows/assemble_denovo.wdl b/pipes/WDL/workflows/assemble_denovo.wdl
@@ -163,7 +163,7 @@ workflow assemble_denovo {
     File aligned_only_reads_fastqc     = refine_2x_and_plot.aligned_only_reads_fastqc
     File coverage_tsv                  = refine_2x_and_plot.coverage_tsv
     Int  read_pairs_aligned            = refine_2x_and_plot.read_pairs_aligned
-    Int  bases_aligned                 = refine_2x_and_plot.bases_aligned
+    Float bases_aligned                 = refine_2x_and_plot.bases_aligned
 
     String? deplete_viral_classify_version  = deplete_taxa.viralngs_version
     String? taxfilt_viral_classify_version  = filter_to_taxon.viralngs_version

diff --git a/pipes/WDL/workflows/assemble_refbased.wdl b/pipes/WDL/workflows/assemble_refbased.wdl
@@ -7,7 +7,7 @@ import "../tasks/tasks_read_utils.wdl" as read_utils
 workflow assemble_refbased {
 
     meta {
-        description: "Reference-based microbial consensus calling. Aligns short reads to a singular reference genome, calls a new consensus sequence, and emits: new assembly, reads aligned to provided reference, reads aligned to new assembly, various figures of merit, plots, and QC metrics. The user may provide unaligned reads spread across multiple input files and this workflow will parallelize alignment per input file before merging results prior to consensus calling."
+        description: "Reference-based microbial consensus calling. Aligns NGS reads to a singular reference genome, calls a new consensus sequence, and emits: new assembly, reads aligned to provided reference, reads aligned to new assembly, various figures of merit, plots, and QC metrics. The user may provide unaligned reads spread across multiple input files and this workflow will parallelize alignment per input file before merging results prior to consensus calling."
         author: "Broad Viral Genomics"
         email:  "viral-ngs@broadinstitute.org"
     }
@@ -25,6 +25,9 @@ workflow assemble_refbased {
             description: "Reference genome to align reads to.",
             patterns: ["*.fasta"]
         }
+        aligner: {
+            description: "Read aligner software to use. Options: novoalign, bwa, minimap2. Minimap2 can automatically handle Illumina, PacBio, or Oxford Nanopore reads as long as the 'PL' field in the BAM read group header is set properly (novoalign and bwa are Illumina-only)."
+        }
         novocraft_license: {
             description: "The default Novoalign short read aligner is a commercially licensed software that is available in a much slower, single-threaded version for free. If you have a paid license file, provide it here to run in multi-threaded mode. If this is omitted, it will run in single-threaded mode.",
             patterns: ["*.lic"]
@@ -50,21 +53,32 @@ workflow assemble_refbased {
         Array[File]+    reads_unmapped_bams
         File            reference_fasta
 
+        String          aligner="minimap2"
         File?           novocraft_license
         Boolean?        skip_mark_dupes=false
         File?           trim_coords_bed
     }
 
+    Map[String,String] align_to_ref_options = {
+                            "novoalign": "-r Random -l 40 -g 40 -x 20 -t 501 -k",
+                            "bwa": "-k 12 -B 1",
+                            "minimap2": ""
+                            }
+    Map[String,String] align_to_self_options = {
+                            "novoalign": "-r Random -l 40 -g 40 -x 20 -t 100",
+                            "bwa": "",
+                            "minimap2": ""
+                            }
+
     scatter(reads_unmapped_bam in reads_unmapped_bams) {
         call assembly.align_reads as align_to_ref {
             input:
                 reference_fasta    = reference_fasta,
                 reads_unmapped_bam = reads_unmapped_bam,
                 novocraft_license  = novocraft_license,
                 skip_mark_dupes    = skip_mark_dupes,
-                aligner_options    = "-r Random -l 40 -g 40 -x 20 -t 501 -k"
-                ## (for bwa) -- aligner_options = "-k 12 -B 1"
-                ## (for novoalign) -- aligner_options = "-r Random -l 40 -g 40 -x 20 -t 501 -k"
+                aligner            = aligner,
+                aligner_options    = align_to_ref_options[aligner]
         }
         call assembly.ivar_trim {
             input:
@@ -100,9 +114,8 @@ workflow assemble_refbased {
                 reads_unmapped_bam = reads_unmapped_bam,
                 novocraft_license  = novocraft_license,
                 skip_mark_dupes    = skip_mark_dupes,
-                aligner_options    = "-r Random -l 40 -g 40 -x 20 -t 100"
-                ## (for bwa) -- aligner_options = "-k 12 -B 1"
-                ## (for novoalign) -- aligner_options = "-r Random -l 40 -g 40 -x 20 -t 501 -k"
+                aligner            = aligner,
+                aligner_options    = align_to_self_options[aligner]
         }
     }
 
@@ -137,14 +150,14 @@ workflow assemble_refbased {
         File   align_to_ref_merged_coverage_tsv             = plot_ref_coverage.coverage_tsv
         Int    align_to_ref_merged_reads_aligned            = plot_ref_coverage.reads_aligned
         Int    align_to_ref_merged_read_pairs_aligned       = plot_ref_coverage.read_pairs_aligned
-        Int    align_to_ref_merged_bases_aligned            = plot_ref_coverage.bases_aligned
+        Float  align_to_ref_merged_bases_aligned            = plot_ref_coverage.bases_aligned
 
         File   align_to_self_merged_aligned_only_bam   = merge_align_to_self.out_bam
         File   align_to_self_merged_coverage_plot      = plot_self_coverage.coverage_plot
         File   align_to_self_merged_coverage_tsv       = plot_self_coverage.coverage_tsv
         Int    align_to_self_merged_reads_aligned      = plot_self_coverage.reads_aligned
         Int    align_to_self_merged_read_pairs_aligned = plot_self_coverage.read_pairs_aligned
-        Int    align_to_self_merged_bases_aligned      = plot_self_coverage.bases_aligned
+        Float  align_to_self_merged_bases_aligned      = plot_self_coverage.bases_aligned
         Float  align_to_self_merged_mean_coverage            = plot_self_coverage.mean_coverage
 
         String align_to_ref_viral_core_version = align_to_ref.viralngs_version[0]

diff --git a/pipes/WDL/workflows/diff_genome_sets.wdl b/pipes/WDL/workflows/diff_genome_sets.wdl
@@ -0,0 +1,31 @@
+version 1.0
+
+import "../tasks/tasks_reports.wdl" as reports
+
+# Pairwise comparison of two parallel sets of genomes: element i of
+# genome_set_one is compared against element i of genome_set_two, and the
+# per-pair tables are stacked into one file.
+workflow diff_genome_sets {
+
+    input {
+        Array[File]   genome_set_one
+        Array[File]   genome_set_two
+    }
+
+    # zip() pairs the two arrays positionally; per the WDL spec both arrays
+    # must have the same length.
+    scatter(sample in zip(genome_set_one, genome_set_two)) {
+        call reports.compare_two_genomes {
+            input:
+                genome_one = sample.left,
+                genome_two = sample.right,
+                # name each table after the first genome, minus a trailing ".fasta"
+                out_basename = basename(sample.left, '.fasta')
+        }
+    }
+
+    call reports.tsv_stack {
+        input:
+            input_tsvs = compare_two_genomes.comparison_table,
+            # tsv_stack appends ".txt" itself, so pass the bare stem to avoid
+            # producing "diff_genome_sets.txt.txt"
+            out_basename = "diff_genome_sets"
+    }
+
+    output {
+        File diff = tsv_stack.out_tsv
+    }
+
+}
diff --git a/pipes/WDL/workflows/scaffold_and_refine.wdl b/pipes/WDL/workflows/scaffold_and_refine.wdl
@@ -41,7 +41,7 @@ workflow scaffold_and_refine {
     File aligned_only_reads_fastqc     = refine_2x_and_plot.aligned_only_reads_fastqc
     File coverage_tsv                  = refine_2x_and_plot.coverage_tsv
     Int  read_pairs_aligned            = refine_2x_and_plot.read_pairs_aligned
-    Int  bases_aligned                 = refine_2x_and_plot.bases_aligned
+    Float bases_aligned                 = refine_2x_and_plot.bases_aligned
 
     String scaffold_viral_assemble_version = scaffold.viralngs_version
     String refine_viral_assemble_version   = refine_2x_and_plot.viralngs_version

diff --git a/requirements-modules.txt b/requirements-modules.txt
@@ -1,5 +1,5 @@
 broadinstitute/viral-core=2.1.3
-broadinstitute/viral-assemble=2.1.3.0
+broadinstitute/viral-assemble=2.1.3.1
 broadinstitute/viral-classify=2.1.3.1
 broadinstitute/viral-phylo=2.1.3.1
 broadinstitute/beast-beagle-cuda=1.10.5pre

diff --git a/test/input/WDL/test_outputs-assemble_refbased-local.json b/test/input/WDL/test_outputs-assemble_refbased-local.json
@@ -1,11 +1,11 @@
 {
-  "assemble_refbased.align_to_self_merged_bases_aligned": 1765480,
-  "assemble_refbased.align_to_self_merged_read_pairs_aligned": 16798,
-  "assemble_refbased.align_to_self_merged_reads_aligned": 17480,
-  "assemble_refbased.align_to_ref_merged_bases_aligned": 1841937,
-  "assemble_refbased.align_to_ref_merged_read_pairs_aligned": 17644,
-  "assemble_refbased.align_to_ref_merged_reads_aligned": 18237,
+  "assemble_refbased.align_to_ref_merged_bases_aligned": 1851882,
+  "assemble_refbased.align_to_ref_merged_read_pairs_aligned": 17312,
+  "assemble_refbased.align_to_ref_merged_reads_aligned": 18409,
+  "assemble_refbased.align_to_self_merged_bases_aligned": 1851898,
+  "assemble_refbased.align_to_self_merged_read_pairs_aligned": 17314,
+  "assemble_refbased.align_to_self_merged_reads_aligned": 18409,
   "assemble_refbased.reference_genome_length": 18959,
-  "assemble_refbased.assembly_length_unambiguous": 18865,
-  "assemble_refbased.assembly_length": 18865
+  "assemble_refbased.assembly_length_unambiguous": 18889,
+  "assemble_refbased.assembly_length": 18889
 }