From 87d8bc0149f8ccebbad744fc12292aaddd1056fe Mon Sep 17 00:00:00 2001 From: Danny Park Date: Mon, 8 Jun 2020 16:27:31 -0400 Subject: [PATCH 01/17] add augur mask step explicitly --- pipes/WDL/tasks/tasks_nextstrain.wdl | 31 ++++++++++++++++++++++++ pipes/WDL/workflows/build_augur_tree.wdl | 11 ++++++--- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/pipes/WDL/tasks/tasks_nextstrain.wdl b/pipes/WDL/tasks/tasks_nextstrain.wdl index e38348387..e16bc5381 100644 --- a/pipes/WDL/tasks/tasks_nextstrain.wdl +++ b/pipes/WDL/tasks/tasks_nextstrain.wdl @@ -186,6 +186,37 @@ task augur_mafft_align { } } +task augur_mask_sites { + meta { + description: "Mask unwanted positions from alignment. See https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/mask.html" + } + input { + File sequences + File? mask_bed + + String docker = "nextstrain/base" + } + String basename = basename(sequences, '.fasta') + command { + augur version > VERSION + augur mask --sequences ~{sequences} \ + --mask ~{select_first([mask_bed, "/dev/null"])} \ + --output ~{basename}_masked.fasta + } + runtime { + docker: docker + memory: "3 GB" + cpu : 2 + disks: "local-disk 100 HDD" + preemptible: 2 + dx_instance_type: "mem1_ssd1_v2_x2" + } + output { + File masked_sequences = "~{basename}_masked.fasta" + String augur_version = read_string("VERSION") + } +} + task draft_augur_tree { meta { description: "Build a tree using a variety of methods. 
See https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/tree.html" diff --git a/pipes/WDL/workflows/build_augur_tree.wdl b/pipes/WDL/workflows/build_augur_tree.wdl index 2d486b436..5cb4d69fd 100644 --- a/pipes/WDL/workflows/build_augur_tree.wdl +++ b/pipes/WDL/workflows/build_augur_tree.wdl @@ -64,15 +64,19 @@ workflow build_augur_tree { ref_fasta = ref_fasta, basename = virus } + call nextstrain.augur_mask_sites { + input: + sequences = augur_mafft_align.aligned_sequences + } call nextstrain.draft_augur_tree { input: - aligned_fasta = augur_mafft_align.aligned_sequences, + aligned_fasta = augur_mask_sites.masked_sequences, basename = virus } call nextstrain.refine_augur_tree { input: raw_tree = draft_augur_tree.aligned_tree, - aligned_fasta = augur_mafft_align.aligned_sequences, + aligned_fasta = augur_mask_sites.masked_sequences, metadata = sample_metadata, basename = virus } @@ -88,7 +92,7 @@ workflow build_augur_tree { call nextstrain.ancestral_tree { input: refined_tree = refine_augur_tree.tree_refined, - aligned_fasta = augur_mafft_align.aligned_sequences, + aligned_fasta = augur_mask_sites.masked_sequences, basename = virus } call nextstrain.translate_augur_tree { @@ -123,6 +127,7 @@ workflow build_augur_tree { output { File combined_assembly_fasta = concatenate.combined File augur_aligned_fasta = augur_mafft_align.aligned_sequences + File masked_fasta = augur_mask_sites.masked_sequences File raw_tree = draft_augur_tree.aligned_tree File refined_tree = refine_augur_tree.tree_refined File branch_lengths = refine_augur_tree.branch_lengths From 4a1b1afc16b27746a97826fb8ea667344e6bc688 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Mon, 8 Jun 2020 16:46:53 -0400 Subject: [PATCH 02/17] try out --append_run_id option in viral-core --- pipes/WDL/tasks/tasks_demux.wdl | 1 + requirements-modules.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pipes/WDL/tasks/tasks_demux.wdl b/pipes/WDL/tasks/tasks_demux.wdl index 12f0c425b..36a5b9b04 
100644 --- a/pipes/WDL/tasks/tasks_demux.wdl +++ b/pipes/WDL/tasks/tasks_demux.wdl @@ -194,6 +194,7 @@ task illumina_demux { --JVMmemory="$mem_in_mb"m \ $demux_threads \ ${true='--force_gc=true' false="--force_gc=false" forceGC} \ + --append_run_id \ --compression_level=5 \ --loglevel=DEBUG diff --git a/requirements-modules.txt b/requirements-modules.txt index 1bd4ad746..71b30af88 100644 --- a/requirements-modules.txt +++ b/requirements-modules.txt @@ -1,4 +1,4 @@ -broadinstitute/viral-core=2.1.0 +broadinstitute/viral-core=dp-demux broadinstitute/viral-assemble=2.1.0.0 broadinstitute/viral-classify=2.1.0.0 broadinstitute/viral-phylo=2.1.0.0 From a4a8afc7936e05bcfed56992d782f85304382880 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Mon, 8 Jun 2020 17:13:59 -0400 Subject: [PATCH 03/17] augur mask doesnt like empty bed input, so fake it --- pipes/WDL/tasks/tasks_nextstrain.wdl | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pipes/WDL/tasks/tasks_nextstrain.wdl b/pipes/WDL/tasks/tasks_nextstrain.wdl index e16bc5381..5c56c934b 100644 --- a/pipes/WDL/tasks/tasks_nextstrain.wdl +++ b/pipes/WDL/tasks/tasks_nextstrain.wdl @@ -199,9 +199,14 @@ task augur_mask_sites { String basename = basename(sequences, '.fasta') command { augur version > VERSION - augur mask --sequences ~{sequences} \ - --mask ~{select_first([mask_bed, "/dev/null"])} \ - --output ~{basename}_masked.fasta + BEDFILE=~{select_first([mask_bed, "/dev/null"])} + if [ -s "$BEDFILE" ]; then + augur mask --sequences ~{sequences} \ + --mask "$BEDFILE" \ + --output "~{basename}_masked.fasta" + else + cp "~{sequences}" "~{basename}_masked.fasta" + fi } runtime { docker: docker From 89383b4b59018a0c7146a08344d619813f6f28d5 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Tue, 9 Jun 2020 09:47:23 -0400 Subject: [PATCH 04/17] update dxWDL 1.47 to 1.47.2 --- travis/install-wdl.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/travis/install-wdl.sh b/travis/install-wdl.sh 
index e959db508..69742b49a 100755 --- a/travis/install-wdl.sh +++ b/travis/install-wdl.sh @@ -19,7 +19,7 @@ cached_fetch_jar_from_github () { cached_fetch_jar_from_github broadinstitute cromwell womtool 49 cached_fetch_jar_from_github broadinstitute cromwell cromwell 49 -cached_fetch_jar_from_github dnanexus dxWDL dxWDL v1.47 +cached_fetch_jar_from_github dnanexus dxWDL dxWDL v1.47.2 TGZ=dx-toolkit-v0.293.0-ubuntu-16.04-amd64.tar.gz if [ ! -f $CACHE_DIR/$TGZ ]; then From bc66cf837d4789f61309c06fd23f1149da0afc6f Mon Sep 17 00:00:00 2001 From: Danny Park Date: Tue, 9 Jun 2020 12:16:09 -0400 Subject: [PATCH 05/17] upstream merged --- requirements-modules.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-modules.txt b/requirements-modules.txt index 71b30af88..a7e47f6c7 100644 --- a/requirements-modules.txt +++ b/requirements-modules.txt @@ -1,4 +1,4 @@ -broadinstitute/viral-core=dp-demux +broadinstitute/viral-core=2.1.0-rc3 broadinstitute/viral-assemble=2.1.0.0 broadinstitute/viral-classify=2.1.0.0 broadinstitute/viral-phylo=2.1.0.0 From abd41f3e90548c1edc80b122aa8aacc7e9eac911 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Tue, 9 Jun 2020 12:47:01 -0400 Subject: [PATCH 06/17] change input variable names, parameter_meta docs, and basename handling to ensure we can tolerate VCFs as well as FASTAs at all augur steps that can do that. --- pipes/WDL/tasks/tasks_nextstrain.wdl | 79 ++++++++++++++++-------- pipes/WDL/workflows/build_augur_tree.wdl | 18 +++--- 2 files changed, 61 insertions(+), 36 deletions(-) diff --git a/pipes/WDL/tasks/tasks_nextstrain.wdl b/pipes/WDL/tasks/tasks_nextstrain.wdl index 5c56c934b..7df55cf2f 100644 --- a/pipes/WDL/tasks/tasks_nextstrain.wdl +++ b/pipes/WDL/tasks/tasks_nextstrain.wdl @@ -96,15 +96,15 @@ task filter_subsample_sequences { } parameter_meta { sequences_fasta: { - description: "Set of sequences in fasta format to subsample using augur filter. 
These must represent a single chromosome/segment of a genome only.", - patterns: ["*.fasta", "*.fa"] + description: "Set of sequences (unaligned fasta or aligned fasta -- one sequence per genome) or variants (vcf format) to subsample using augur filter.", + patterns: ["*.fasta", "*.fa", "*.vcf", "*.vcf.gz"] } sample_metadata_tsv: { description: "Metadata in tab-separated text format. See https://nextstrain-augur.readthedocs.io/en/stable/faq/metadata.html for details.", patterns: ["*.txt", "*.tsv"] } } - String in_basename = basename(sequences_fasta, ".fasta") + String out_fname = sub(sub(sequences_fasta, ".vcf", ".filtered.vcf"), ".fasta$", ".filtered.fasta") command { augur version > VERSION augur filter \ @@ -122,23 +122,24 @@ task filter_subsample_sequences { ~{"--subsample-seed " + subsample_seed} \ ~{"--exclude-where " + exclude_where} \ ~{"--include-where " + include_where} \ - --output "~{in_basename}.filtered.fasta" - cat ~{sequences_fasta} | grep \> | wc -l > IN_COUNT - cat ~{in_basename}.filtered.fasta | grep \> | wc -l > OUT_COUNT + --output "~{out_fname}" | tee STDOUT + #cat ~{sequences_fasta} | grep \> | wc -l > IN_COUNT + grep "sequences were dropped during filtering" STDOUT | cut -f 1 -d ' ' > DROP_COUNT + grep "sequences have been written out to" STDOUT | cut -f 1 -d ' ' > OUT_COUNT } runtime { docker: docker - memory: "4 GB" - cpu : 2 - disks: "local-disk 375 LOCAL" + memory: "3 GB" + cpu : 1 + disks: "local-disk 100 HDD" dx_instance_type: "mem1_ssd1_v2_x2" preemptible: 1 } output { - File filtered_fasta = "~{in_basename}.filtered.fasta" - String augur_version = read_string("VERSION") - Int sequences_in = read_int("IN_COUNT") - Int sequences_out = read_int("OUT_COUNT") + File filtered_fasta = out_fname + String augur_version = read_string("VERSION") + Int sequences_dropped = read_int("DROP_COUNT") + Int sequences_out = read_int("OUT_COUNT") } } @@ -188,7 +189,7 @@ task augur_mafft_align { task augur_mask_sites { meta { - description: "Mask unwanted 
positions from alignment. See https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/mask.html" + description: "Mask unwanted positions from alignment or SNP table. See https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/mask.html" } input { File sequences @@ -196,16 +197,22 @@ task augur_mask_sites { String docker = "nextstrain/base" } - String basename = basename(sequences, '.fasta') + parameter_meta { + sequences: { + description: "Set of alignments (fasta format) or variants (vcf format) to mask.", + patterns: ["*.fasta", "*.fa", "*.vcf", "*.vcf.gz"] + } + } + String out_fname = sub(sub(sequences, ".vcf", ".masked.vcf"), ".fasta$", ".masked.fasta") command { augur version > VERSION BEDFILE=~{select_first([mask_bed, "/dev/null"])} if [ -s "$BEDFILE" ]; then augur mask --sequences ~{sequences} \ --mask "$BEDFILE" \ - --output "~{basename}_masked.fasta" + --output "~{out_fname}" else - cp "~{sequences}" "~{basename}_masked.fasta" + cp "~{sequences}" "~{out_fname}" fi } runtime { @@ -217,17 +224,17 @@ task augur_mask_sites { dx_instance_type: "mem1_ssd1_v2_x2" } output { - File masked_sequences = "~{basename}_masked.fasta" - String augur_version = read_string("VERSION") + File masked_sequences = out_fname + String augur_version = read_string("VERSION") } } task draft_augur_tree { meta { - description: "Build a tree using a variety of methods. See https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/tree.html" + description: "Build a tree using iqTree. See https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/tree.html" } input { - File aligned_fasta + File msa_or_vcf String basename String method = "iqtree" @@ -240,9 +247,15 @@ task draft_augur_tree { Int? 
disk_space_gb = 750 String docker = "nextstrain/base" } + parameter_meta { + msa_or_vcf: { + description: "Set of alignments (fasta format) or variants (vcf format) to construct a tree from using augur tree (iqTree).", + patterns: ["*.fasta", "*.fa", "*.vcf", "*.vcf.gz"] + } + } command { augur version > VERSION - AUGUR_RECURSION_LIMIT=10000 augur tree --alignment ~{aligned_fasta} \ + AUGUR_RECURSION_LIMIT=10000 augur tree --alignment ~{msa_or_vcf} \ --output ~{basename}_raw_tree.nwk \ --method ~{default="iqtree" method} \ --substitution-model ~{default="GTR" substitution_model} \ @@ -267,11 +280,11 @@ task draft_augur_tree { task refine_augur_tree { meta { - description: "Refine an initial tree using sequence metadata. See https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/refine.html" + description: "Refine an initial tree using sequence metadata and Treetime. See https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/refine.html" } input { File raw_tree - File aligned_fasta + File msa_or_vcf File metadata String basename @@ -295,11 +308,17 @@ task refine_augur_tree { Int? disk_space_gb = 750 String docker = "nextstrain/base" } + parameter_meta { + msa_or_vcf: { + description: "Set of alignments (fasta format) or variants (vcf format) to use to guide Treetime.", + patterns: ["*.fasta", "*.fa", "*.vcf", "*.vcf.gz"] + } + } command { augur version > VERSION AUGUR_RECURSION_LIMIT=10000 augur refine \ --tree ~{raw_tree} \ - --alignment ~{aligned_fasta} \ + --alignment ~{msa_or_vcf} \ --metadata ~{metadata} \ --output-tree ~{basename}_refined_tree.nwk \ --output-node-data ~{basename}_branch_lengths.json \ @@ -382,7 +401,7 @@ task ancestral_tree { } input { File refined_tree - File aligned_fasta + File msa_or_vcf String basename String inference = "joint" @@ -395,11 +414,17 @@ task ancestral_tree { Int? 
machine_mem_gb String docker = "nextstrain/base" } + parameter_meta { + msa_or_vcf: { + description: "Set of alignments (fasta format) or variants (vcf format) to use to guide Treetime.", + patterns: ["*.fasta", "*.fa", "*.vcf", "*.vcf.gz"] + } + } command { augur version > VERSION AUGUR_RECURSION_LIMIT=10000 augur ancestral \ --tree ~{refined_tree} \ - --alignment ~{aligned_fasta} \ + --alignment ~{msa_or_vcf} \ --output-node-data ~{basename}_nt_muts.json \ ~{"--vcf-reference " + vcf_reference} \ ~{"--output-vcf " + output_vcf} \ diff --git a/pipes/WDL/workflows/build_augur_tree.wdl b/pipes/WDL/workflows/build_augur_tree.wdl index 5cb4d69fd..d26385429 100644 --- a/pipes/WDL/workflows/build_augur_tree.wdl +++ b/pipes/WDL/workflows/build_augur_tree.wdl @@ -70,15 +70,15 @@ workflow build_augur_tree { } call nextstrain.draft_augur_tree { input: - aligned_fasta = augur_mask_sites.masked_sequences, - basename = virus + msa_or_vcf = augur_mask_sites.masked_sequences, + basename = virus } call nextstrain.refine_augur_tree { input: - raw_tree = draft_augur_tree.aligned_tree, - aligned_fasta = augur_mask_sites.masked_sequences, - metadata = sample_metadata, - basename = virus + raw_tree = draft_augur_tree.aligned_tree, + msa_or_vcf = augur_mask_sites.masked_sequences, + metadata = sample_metadata, + basename = virus } if(defined(ancestral_traits_to_infer) && length(select_first([ancestral_traits_to_infer,[]]))>0) { call nextstrain.ancestral_traits { @@ -91,9 +91,9 @@ workflow build_augur_tree { } call nextstrain.ancestral_tree { input: - refined_tree = refine_augur_tree.tree_refined, - aligned_fasta = augur_mask_sites.masked_sequences, - basename = virus + refined_tree = refine_augur_tree.tree_refined, + msa_or_vcf = augur_mask_sites.masked_sequences, + basename = virus } call nextstrain.translate_augur_tree { input: From 46d6a2b04d625d78017b685bccffedd9cb9fa9b6 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Tue, 9 Jun 2020 15:16:24 -0400 Subject: [PATCH 07/17] oops, 
remove the full path preceding the filename --- pipes/WDL/tasks/tasks_nextstrain.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipes/WDL/tasks/tasks_nextstrain.wdl b/pipes/WDL/tasks/tasks_nextstrain.wdl index 7df55cf2f..d67def77d 100644 --- a/pipes/WDL/tasks/tasks_nextstrain.wdl +++ b/pipes/WDL/tasks/tasks_nextstrain.wdl @@ -104,7 +104,7 @@ task filter_subsample_sequences { patterns: ["*.txt", "*.tsv"] } } - String out_fname = sub(sub(sequences_fasta, ".vcf", ".filtered.vcf"), ".fasta$", ".filtered.fasta") + String out_fname = sub(sub(basename(sequences_fasta), ".vcf", ".filtered.vcf"), ".fasta$", ".filtered.fasta") command { augur version > VERSION augur filter \ @@ -203,7 +203,7 @@ task augur_mask_sites { patterns: ["*.fasta", "*.fa", "*.vcf", "*.vcf.gz"] } } - String out_fname = sub(sub(sequences, ".vcf", ".masked.vcf"), ".fasta$", ".masked.fasta") + String out_fname = sub(sub(basename(sequences), ".vcf", ".masked.vcf"), ".fasta$", ".masked.fasta") command { augur version > VERSION BEDFILE=~{select_first([mask_bed, "/dev/null"])} From 9a380387e2914e952f4afe1fd847daf0b5d72314 Mon Sep 17 00:00:00 2001 From: Chris Tomkins-Tinch Date: Tue, 9 Jun 2020 15:35:49 -0400 Subject: [PATCH 08/17] viral-core 2.1.0 -> 2.1.1 --- requirements-modules.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-modules.txt b/requirements-modules.txt index 1bd4ad746..0ad5a25ad 100644 --- a/requirements-modules.txt +++ b/requirements-modules.txt @@ -1,4 +1,4 @@ -broadinstitute/viral-core=2.1.0 +broadinstitute/viral-core=2.1.1 broadinstitute/viral-assemble=2.1.0.0 broadinstitute/viral-classify=2.1.0.0 broadinstitute/viral-phylo=2.1.0.0 From 1b95f753d7da4677a1d9503d9e31df789bc0dbd1 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Tue, 9 Jun 2020 16:47:10 -0400 Subject: [PATCH 09/17] strip out blastx from classify_multi workflow for now --- pipes/WDL/workflows/classify_multi.wdl | 32 +------------------------- 1 file changed, 1 
insertion(+), 31 deletions(-) diff --git a/pipes/WDL/workflows/classify_multi.wdl b/pipes/WDL/workflows/classify_multi.wdl index e132c9a58..a8634cb5f 100644 --- a/pipes/WDL/workflows/classify_multi.wdl +++ b/pipes/WDL/workflows/classify_multi.wdl @@ -8,7 +8,7 @@ import "../tasks/tasks_reports.wdl" as reports workflow classify_multi { meta { - description: "Runs raw reads through taxonomic classification (Kraken2), human read depletion (based on Kraken2), de novo assembly (SPAdes), taxonomic classification of contigs (BLASTx), and FASTQC/multiQC of reads." + description: "Runs raw reads through taxonomic classification (Kraken2), human read depletion (based on Kraken2), de novo assembly (SPAdes), and FASTQC/multiQC of reads." author: "Broad Viral Genomics" email: "viral-ngs@broadinstitute.org" } @@ -23,8 +23,6 @@ workflow classify_multi { File kraken2_db_tgz File krona_taxonomy_db_kraken2_tgz - File? blast_db_tgz - File? krona_taxonomy_db_blast_tgz } parameter_meta { @@ -48,14 +46,6 @@ workflow classify_multi { description: "Krona taxonomy database containing a single file: taxonomy.tab, or possibly just a compressed taxonomy.tab", patterns: ["*.tab.zst", "*.tab.gz", "*.tab", "*.tar.gz", "*.tar.lz4", "*.tar.bz2", "*.tar.zst"] } - blast_db_tgz: { - description: "Pre-built BLAST database tarball containing an indexed blast database named 'nr'", - patterns: ["*.tar.gz", "*.tar.lz4", "*.tar.bz2", "*.tar.zst"] - } - krona_taxonomy_db_blast_tgz: { - description: "Krona taxonomy database: a tarball containing a taxonomy.tab file as well as accession to taxid mapping (a kraken-based taxonomy database will not suffice).", - patterns: ["*.tar.gz", "*.tar.lz4", "*.tar.bz2", "*.tar.zst"] - } ncbi_taxdump_tgz: { description: "An NCBI taxdump.tar.gz file that contains, at the minimum, a nodes.dmp and names.dmp file.", patterns: ["*.tar.gz", "*.tar.lz4", "*.tar.bz2", "*.tar.zst"] @@ -114,17 +104,8 @@ workflow classify_multi { assembler = "spades", reads_unmapped_bam = 
rmdup_ubam.dedup_bam, trim_clip_db = trim_clip_db, - spades_min_contig_len = 800, always_succeed = true } - if(defined(blast_db_tgz) && defined(krona_taxonomy_db_blast_tgz)) { - call metagenomics.blastx as blastx { - input: - contigs_fasta = spades.contigs_fasta, - blast_db_tgz = select_first([blast_db_tgz]), - krona_taxonomy_db_tgz = select_first([krona_taxonomy_db_blast_tgz]) - } - } } call reports.MultiQC as multiqc_raw { @@ -163,14 +144,6 @@ workflow classify_multi { out_basename = "merged-kraken2.krona" } - if(defined(blast_db_tgz) && defined(krona_taxonomy_db_blast_tgz)) { - call metagenomics.krona_merge as krona_merge_blastx { - input: - krona_reports = select_all(blastx.krona_report_html), - out_basename = "merged-spades-blastx.krona" - } - } - output { Array[File] cleaned_reads_unaligned_bams = deplete.bam_filtered_to_taxa Array[File] deduplicated_reads_unaligned = rmdup_ubam.dedup_bam @@ -187,12 +160,9 @@ workflow classify_multi { File spikein_counts = spike_summary.count_summary File kraken2_merged_krona = krona_merge_kraken2.krona_report_html File kraken2_summary = metag_summary_report.krakenuniq_aggregate_taxlevel_summary - File? 
blastx_merged_krona = krona_merge_blastx.krona_report_html Array[File] kraken2_summary_reports = kraken2.kraken2_summary_report Array[File] kraken2_krona_by_sample = kraken2.krona_report_html - Array[File] blastx_report_by_sample = select_all(blastx.blast_report) - Array[File] blastx_krona_by_sample = select_all(blastx.krona_report_html) String kraken2_viral_classify_version = kraken2.viralngs_version[0] String deplete_viral_classify_version = deplete.viralngs_version[0] From 7a8a3643b9c1ca9214700ccc2b21c6e149d21606 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Tue, 9 Jun 2020 17:25:17 -0400 Subject: [PATCH 10/17] create a new cromwell config file that forces travis to run only one job at a time --- pipes/cromwell/cromwell.local-travis.conf | 110 ++++++++++++++++++++++ travis/tests-cromwell.sh | 4 +- 2 files changed, 112 insertions(+), 2 deletions(-) create mode 100644 pipes/cromwell/cromwell.local-travis.conf diff --git a/pipes/cromwell/cromwell.local-travis.conf b/pipes/cromwell/cromwell.local-travis.conf new file mode 100644 index 000000000..06edf3faa --- /dev/null +++ b/pipes/cromwell/cromwell.local-travis.conf @@ -0,0 +1,110 @@ +# Documentation +# https://cromwell.readthedocs.io/en/stable/backends/Local/ + + # Define a new backend provider. + + LocalExample { + + # The actor that runs the backend. In this case, it's the Shared File System (SFS) ConfigBackend. + actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" + + # The backend custom configuration. + config { + + # Optional limits on the number of concurrent jobs + concurrent-job-limit = 1 + + # If true submits scripts to the bash background using "&". Only usefull for dispatchers that do NOT submit + # the job and then immediately return a scheduled job id. + run-in-background = true + + # `temporary-directory` creates the temporary directory for commands. 
+ # + # If this value is not set explicitly, the default value creates a unique temporary directory, equivalent to: + # temporary-directory = "$(mktemp -d \"$PWD\"/tmp.XXXXXX)" + # + # The expression is run from the execution directory for the script. The expression must create the directory + # if it does not exist, and then return the full path to the directory. + # + # To create and return a non-random temporary directory, use something like: + # temporary-directory = "$(mkdir -p /tmp/mydir && echo /tmp/mydir)" + + # `script-epilogue` configures a shell command to run after the execution of every command block. + # + # If this value is not set explicitly, the default value is `sync`, equivalent to: + # script-epilogue = "sync" + # + # To turn off the default `sync` behavior set this value to an empty string: + # script-epilogue = "" + + # `glob-link-command` specifies command used to link glob outputs, by default using hard-links. + # If filesystem doesn't allow hard-links (e.g., beeGFS), change to soft-links as follows: + # glob-link-command = "ln -sL GLOB_PATTERN GLOB_DIRECTORY" + + # The list of possible runtime custom attributes. + runtime-attributes = """ + String? docker + String? docker_user + """ + + # Submit string when there is no "docker" runtime attribute. + submit = "/usr/bin/env bash ${script}" + + # Submit string when there is a "docker" runtime attribute. + submit-docker = """ + docker run \ + --rm -i \ + ${"--user " + docker_user} \ + --entrypoint ${job_shell} \ + -v ${cwd}:${docker_cwd} \ + ${docker} ${script} + """ + + # Root directory where Cromwell writes job results. This directory must be + # visible and writeable by the Cromwell process as well as the jobs that Cromwell + # launches. + root = "cromwell-executions" + + # Root directory where Cromwell writes job results in the container. This value + # can be used to specify where the execution folder is mounted in the container. 
+ # it is used for the construction of the docker_cwd string in the submit-docker + # value above. + dockerRoot = "/cromwell-executions" + + # File system configuration. + filesystems { + + # For SFS backends, the "local" configuration specifies how files are handled. + local { + + # Try to hard link (ln), then soft-link (ln -s), and if both fail, then copy the files. + localization: [ + "hard-link", "soft-link", "copy" + ] + + # Call caching strategies + caching { + # When copying a cached result, what type of file duplication should occur. + # For more information check: https://cromwell.readthedocs.io/en/stable/backends/HPC/#shared-filesystem + duplication-strategy: [ + "hard-link", "soft-link", "copy" + ] + + # Strategy to determine if a file has been used before. + # For extended explanation and alternative strategies check: https://cromwell.readthedocs.io/en/stable/Configuring/#call-caching + hashing-strategy: "md5" + + # When true, will check if a sibling file with the same name and the .md5 extension exists, and if it does, use the content of this file as a hash. + # If false or the md5 does not exist, will proceed with the above-defined hashing strategy. + check-sibling-md5: false + } + } + } + + # The defaults for runtime attributes if not provided. 
+ default-runtime-attributes { + failOnStderr: false + continueOnReturnCode: 0 + } + } + } diff --git a/travis/tests-cromwell.sh b/travis/tests-cromwell.sh index 928e00aa3..8fc9a11ab 100755 --- a/travis/tests-cromwell.sh +++ b/travis/tests-cromwell.sh @@ -2,7 +2,6 @@ set -e # intentionally allow for pipe failures below mkdir -p workflows -cp *.jar pipes/WDL/workflows/*.wdl pipes/WDL/tasks/*.wdl workflows cp -r test workflows/ cd workflows @@ -13,7 +12,8 @@ for workflow in ../pipes/WDL/workflows/*.wdl; do date echo "Executing $workflow_name using Cromwell on local instance" # the "cat" is to allow a pipe failure (otherwise it halts because of set -e) - java -jar cromwell.jar run \ + java -Dconfig.file=../pipes/cromwell/cromwell.local-travis.conf \ + -jar ../cromwell.jar run \ $workflow_name.wdl \ -i $input_json | tee cromwell.out if [ ${PIPESTATUS[0]} -gt 0 ]; then From 995256da9f4fda6b84e1d48c28f0af7c4c5d383a Mon Sep 17 00:00:00 2001 From: Danny Park Date: Tue, 9 Jun 2020 17:41:13 -0400 Subject: [PATCH 11/17] begin work on augur_from_msa workflow, rename other nextstrain wdls to similar augur_from_x naming scheme --- ...o_auspice.wdl => augur_from_beast_mcc.wdl} | 2 +- pipes/WDL/workflows/augur_from_msa.wdl | 123 ++++++++++++++++++ ...k_to_auspice.wdl => augur_from_newick.wdl} | 2 +- pipes/WDL/workflows/build_augur_tree.wdl | 2 +- 4 files changed, 126 insertions(+), 3 deletions(-) rename pipes/WDL/workflows/{beast_to_auspice.wdl => augur_from_beast_mcc.wdl} (97%) create mode 100644 pipes/WDL/workflows/augur_from_msa.wdl rename pipes/WDL/workflows/{newick_to_auspice.wdl => augur_from_newick.wdl} (94%) diff --git a/pipes/WDL/workflows/beast_to_auspice.wdl b/pipes/WDL/workflows/augur_from_beast_mcc.wdl similarity index 97% rename from pipes/WDL/workflows/beast_to_auspice.wdl rename to pipes/WDL/workflows/augur_from_beast_mcc.wdl index 67cd2baac..70c95df50 100644 --- a/pipes/WDL/workflows/beast_to_auspice.wdl +++ b/pipes/WDL/workflows/augur_from_beast_mcc.wdl @@ -2,7 
+2,7 @@ version 1.0 import "../tasks/tasks_nextstrain.wdl" as nextstrain -workflow beast_to_auspice { +workflow augur_from_beast_mcc { meta { description: "Visualize BEAST output with Nextstrain. This workflow converts a BEAST MCC tree (.tree file) into an Auspice v2 json file. See https://nextstrain-augur.readthedocs.io/en/stable/faq/import-beast.html for details." author: "Broad Viral Genomics" diff --git a/pipes/WDL/workflows/augur_from_msa.wdl b/pipes/WDL/workflows/augur_from_msa.wdl new file mode 100644 index 000000000..cd1f73ff5 --- /dev/null +++ b/pipes/WDL/workflows/augur_from_msa.wdl @@ -0,0 +1,123 @@ +version 1.0 + +import "../tasks/tasks_nextstrain.wdl" as nextstrain + +workflow augur_from_msa { + meta { + description: "Build trees, and convert to json representation suitable for Nextstrain visualization. See https://nextstrain.org/docs/getting-started/ and https://nextstrain-augur.readthedocs.io/en/stable/" + author: "Broad Viral Genomics" + email: "viral-ngs@broadinstitute.org" + } + + input { + File msa_or_vcf + File sample_metadata + String virus + File ref_fasta + File genbank_gb + File? clades_tsv + Array[String]? ancestral_traits_to_infer + } + + parameter_meta { + msa_or_vcf: { + description: "Multiple sequence alignment (aligned fasta) or variants (vcf format).", + patterns: ["*.fasta", "*.fa", "*.vcf", "*.vcf.gz"] + } + sample_metadata: { + description: "Metadata in tab-separated text format. See https://nextstrain-augur.readthedocs.io/en/stable/faq/metadata.html for details.", + patterns: ["*.txt", "*.tsv"] + } + virus: { + description: "A filename-friendly string that is used as a base for output file names." + } + ref_fasta: { + description: "A reference assembly (not included in assembly_fastas) to align assembly_fastas against. 
Typically from NCBI RefSeq or similar.", + patterns: ["*.fasta", "*.fa"] + } + genbank_gb: { + description: "A 'genbank' formatted gene annotation file that is used to calculate coding consequences of observed mutations. Must correspond to the same coordinate space as ref_fasta. Typically downloaded from the same NCBI accession number as ref_fasta.", + patterns: ["*.gb", "*.gbf"] + } + ancestral_traits_to_infer: { + description: "A list of metadata traits to use for ancestral node inference (see https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/traits.html). Multiple traits may be specified; must correspond exactly to column headers in metadata file. Omitting these values will skip ancestral trait inference, and ancestral nodes will not have estimated values for metadata." + } + clades_tsv: { + description: "A TSV file containing clade mutation positions in four columns: [clade gene site alt]; see: https://nextstrain.org/docs/tutorials/defining-clades", + patterns: ["*.tsv", "*.txt"] + } + } + + call nextstrain.augur_mask_sites { + input: + sequences = msa_or_vcf + } + call nextstrain.draft_augur_tree { + input: + msa_or_vcf = augur_mask_sites.masked_sequences, + basename = virus + } + call nextstrain.refine_augur_tree { + input: + raw_tree = draft_augur_tree.aligned_tree, + msa_or_vcf = augur_mask_sites.masked_sequences, + metadata = sample_metadata, + basename = virus + } + if(defined(ancestral_traits_to_infer) && length(select_first([ancestral_traits_to_infer,[]]))>0) { + call nextstrain.ancestral_traits { + input: + tree = refine_augur_tree.tree_refined, + metadata = sample_metadata, + columns = select_first([ancestral_traits_to_infer,[]]), + basename = virus + } + } + call nextstrain.ancestral_tree { + input: + refined_tree = refine_augur_tree.tree_refined, + msa_or_vcf = augur_mask_sites.masked_sequences, + basename = virus + } + call nextstrain.translate_augur_tree { + input: + basename = virus, + refined_tree = refine_augur_tree.tree_refined, + 
nt_muts = ancestral_tree.nt_muts_json, + genbank_gb = genbank_gb + } + if(defined(clades_tsv)) { + call nextstrain.assign_clades_to_nodes { + input: + tree_nwk = refine_augur_tree.tree_refined, + nt_muts_json = ancestral_tree.nt_muts_json, + aa_muts_json = translate_augur_tree.aa_muts_json, + ref_fasta = ref_fasta, + clades_tsv = select_first([clades_tsv]) + } + } + call nextstrain.export_auspice_json { + input: + tree = refine_augur_tree.tree_refined, + sample_metadata = sample_metadata, + node_data_jsons = select_all([ + refine_augur_tree.branch_lengths, + ancestral_traits.node_data_json, + ancestral_tree.nt_muts_json, + translate_augur_tree.aa_muts_json, + assign_clades_to_nodes.node_clade_data_json]) + } + + output { + File masked_fasta = augur_mask_sites.masked_sequences + File raw_tree = draft_augur_tree.aligned_tree + File refined_tree = refine_augur_tree.tree_refined + File branch_lengths = refine_augur_tree.branch_lengths + File json_nt_muts = ancestral_tree.nt_muts_json + File ancestral_sequences_fasta = ancestral_tree.sequences + File json_aa_muts = translate_augur_tree.aa_muts_json + File? node_clade_data_json = assign_clades_to_nodes.node_clade_data_json + File? json_ancestral_traits = ancestral_traits.node_data_json + File auspice_input_json = export_auspice_json.virus_json + } +} diff --git a/pipes/WDL/workflows/newick_to_auspice.wdl b/pipes/WDL/workflows/augur_from_newick.wdl similarity index 94% rename from pipes/WDL/workflows/newick_to_auspice.wdl rename to pipes/WDL/workflows/augur_from_newick.wdl index 093a2b713..58d3c3ea0 100644 --- a/pipes/WDL/workflows/newick_to_auspice.wdl +++ b/pipes/WDL/workflows/augur_from_newick.wdl @@ -2,7 +2,7 @@ version 1.0 import "../tasks/tasks_nextstrain.wdl" as nextstrain -workflow newick_to_auspice { +workflow augur_from_newick { meta { description: "Convert a newick formatted phylogenetic tree into a json suitable for auspice visualization. 
See https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/export.html" author: "Broad Viral Genomics" diff --git a/pipes/WDL/workflows/build_augur_tree.wdl b/pipes/WDL/workflows/build_augur_tree.wdl index d26385429..74bb68c11 100644 --- a/pipes/WDL/workflows/build_augur_tree.wdl +++ b/pipes/WDL/workflows/build_augur_tree.wdl @@ -127,7 +127,7 @@ workflow build_augur_tree { output { File combined_assembly_fasta = concatenate.combined File augur_aligned_fasta = augur_mafft_align.aligned_sequences - File masked_fasta = augur_mask_sites.masked_sequences + File masked_fasta = augur_mask_sites.masked_sequences File raw_tree = draft_augur_tree.aligned_tree File refined_tree = refine_augur_tree.tree_refined File branch_lengths = refine_augur_tree.branch_lengths From 09305b24f1198fa1899d7a40bb5273fa8c6a9f6e Mon Sep 17 00:00:00 2001 From: Danny Park Date: Tue, 9 Jun 2020 17:43:38 -0400 Subject: [PATCH 12/17] prepend path to wdl --- travis/tests-cromwell.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/travis/tests-cromwell.sh b/travis/tests-cromwell.sh index 8fc9a11ab..6d1bc4552 100755 --- a/travis/tests-cromwell.sh +++ b/travis/tests-cromwell.sh @@ -14,7 +14,7 @@ for workflow in ../pipes/WDL/workflows/*.wdl; do # the "cat" is to allow a pipe failure (otherwise it halts because of set -e) java -Dconfig.file=../pipes/cromwell/cromwell.local-travis.conf \ -jar ../cromwell.jar run \ - $workflow_name.wdl \ + ../pipes/WDL/workflows/$workflow_name.wdl \ -i $input_json | tee cromwell.out if [ ${PIPESTATUS[0]} -gt 0 ]; then echo "error running $workflow_name" From 4c77cf8b5105f5e872964a57086c80b9c3fadd15 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Tue, 9 Jun 2020 17:56:25 -0400 Subject: [PATCH 13/17] bump viral-assemble, viral-classify, viral-phylo 2.1.0.0 -> 2.1.1.0 --- requirements-modules.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements-modules.txt b/requirements-modules.txt index 
0ad5a25ad..23e7fde5e 100644 --- a/requirements-modules.txt +++ b/requirements-modules.txt @@ -1,7 +1,7 @@ broadinstitute/viral-core=2.1.1 -broadinstitute/viral-assemble=2.1.0.0 -broadinstitute/viral-classify=2.1.0.0 -broadinstitute/viral-phylo=2.1.0.0 +broadinstitute/viral-assemble=2.1.1.0 +broadinstitute/viral-classify=2.1.1.0 +broadinstitute/viral-phylo=2.1.1.0 broadinstitute/beast-beagle-cuda=1.10.5 broadinstitute/ncbi-tools=2.10.7.0 nextstrain/base=build-20200529T044753Z From 33188e8ddc01da2c0b862f134f09866c8d609c08 Mon Sep 17 00:00:00 2001 From: Chris Tomkins-Tinch Date: Tue, 9 Jun 2020 17:56:58 -0400 Subject: [PATCH 14/17] bugfix dxid reference for CONSOLIDATE_RUN_TARBALLS_APPLET (#104) --- travis/build-dx.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/travis/build-dx.sh b/travis/build-dx.sh index 2a9788559..cd7ea823d 100755 --- a/travis/build-dx.sh +++ b/travis/build-dx.sh @@ -53,7 +53,7 @@ done # build consolidate_run_tarballs (native DNAnexus applet) applet pushd pipes/dnax/dx-launcher cp consolidate_run_tarballs.yml consolidate_run_tarballs_dxapp.yml -dx_id=$(./dx-yml-build consolidate_run_tarballs_dxapp.yml -a --destination /build/$VERSION/ | jq -r ".id") +consolidate_tarballs_dx_id=$(./dx-yml-build consolidate_run_tarballs_dxapp.yml -a --destination /build/$VERSION/ | jq -r ".id") popd -echo -e "consolidate_run_tarballs\t$dx_id" >> $COMPILE_SUCCESS +echo -e "consolidate_run_tarballs\t$consolidate_tarballs_dx_id" >> $COMPILE_SUCCESS @@ -66,7 +66,7 @@ for wf_name in $(echo "${demux_workflows_to_build}"); do pushd pipes/dnax/dx-launcher sed "s/DEFAULT_DEMUX_WORKFLOW_ID/$demux_workflow_id/" demux_launcher.yml \ | sed "s/DEFAULT_DEMUX_WORKFLOW_NAME/${wf_name}_launcher/" \ - | sed "s/DEFAULT_CONSOLIDATE_RUN_TARBALLS_APPLET_ID/$dx_id/" > "${wf_name}_dxapp.yml" + | sed "s/DEFAULT_CONSOLIDATE_RUN_TARBALLS_APPLET_ID/$consolidate_tarballs_dx_id/" > "${wf_name}_dxapp.yml" dx_id=$(./dx-yml-build ${wf_name}_dxapp.yml -a --destination /build/$VERSION/ | jq -r ".id") popd echo -e "${wf_name}_launcher\t$dx_id" >> 
$COMPILE_SUCCESS From 323f8e9a84ebf467759bf0401ac16e1f52424cb8 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Tue, 9 Jun 2020 18:26:02 -0400 Subject: [PATCH 15/17] revert bits of path handling for simplicity --- travis/tests-cromwell.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/travis/tests-cromwell.sh b/travis/tests-cromwell.sh index 6d1bc4552..d9daf5fac 100755 --- a/travis/tests-cromwell.sh +++ b/travis/tests-cromwell.sh @@ -2,6 +2,7 @@ set -e # intentionally allow for pipe failures below mkdir -p workflows +cp *.jar pipes/WDL/workflows/*.wdl pipes/WDL/tasks/*.wdl workflows cp -r test workflows/ cd workflows @@ -13,8 +14,8 @@ for workflow in ../pipes/WDL/workflows/*.wdl; do echo "Executing $workflow_name using Cromwell on local instance" # the "cat" is to allow a pipe failure (otherwise it halts because of set -e) java -Dconfig.file=../pipes/cromwell/cromwell.local-travis.conf \ - -jar ../cromwell.jar run \ - ../pipes/WDL/workflows/$workflow_name.wdl \ + -jar cromwell.jar run \ + $workflow_name.wdl \ -i $input_json | tee cromwell.out if [ ${PIPESTATUS[0]} -gt 0 ]; then echo "error running $workflow_name" From f59311259ce194251676cf0f227b04a61cf3b5b8 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Wed, 10 Jun 2020 07:08:46 -0400 Subject: [PATCH 16/17] add perf monitoring stats to task outputs --- pipes/WDL/tasks/tasks_nextstrain.wdl | 51 ++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/pipes/WDL/tasks/tasks_nextstrain.wdl b/pipes/WDL/tasks/tasks_nextstrain.wdl index d67def77d..7e47f3fd7 100644 --- a/pipes/WDL/tasks/tasks_nextstrain.wdl +++ b/pipes/WDL/tasks/tasks_nextstrain.wdl @@ -170,7 +170,6 @@ task augur_mafft_align { ~{true="--remove-reference" false="" remove_reference} \ --debug \ --nthreads auto - cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes } runtime { docker: docker @@ -181,8 +180,9 @@ task augur_mafft_align { dx_instance_type: "mem3_ssd2_v2_x16" } output { - File 
aligned_sequences = "~{basename}_aligned.fasta" - File align_troubleshoot = stdout() + File aligned_sequences = "~{basename}_aligned.fasta" + File align_troubleshoot = stdout() + Int max_ram_gb = ceil(read_float("/sys/fs/cgroup/memory/memory.max_usage_in_bytes")/1000000000) String augur_version = read_string("VERSION") } } @@ -224,7 +224,8 @@ task augur_mask_sites { dx_instance_type: "mem1_ssd1_v2_x2" } output { - File masked_sequences = out_fname + File masked_sequences = out_fname + Int max_ram_gb = ceil(read_float("/sys/fs/cgroup/memory/memory.max_usage_in_bytes")/1000000000) String augur_version = read_string("VERSION") } } @@ -263,6 +264,8 @@ task draft_augur_tree { ~{"--vcf-reference " + vcf_reference} \ ~{"--tree-builder-args " + tree_builder_args} \ --nthreads auto + cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC + cat /proc/loadavg | cut -f 3 -d ' ' > LOAD_15M } runtime { docker: docker @@ -273,7 +276,10 @@ task draft_augur_tree { preemptible: 0 } output { - File aligned_tree = "~{basename}_raw_tree.nwk" + File aligned_tree = "~{basename}_raw_tree.nwk" + Int max_ram_gb = ceil(read_float("/sys/fs/cgroup/memory/memory.max_usage_in_bytes")/1000000000) + Int runtime_sec = ceil(read_float("UPTIME_SEC")) + Int cpu_load_15min = ceil(read_float("LOAD_15M")) String augur_version = read_string("VERSION") } } @@ -338,6 +344,8 @@ task refine_augur_tree { ~{true="--keep-polytomies" false="" keep_polytomies} \ ~{true="--date-confidence" false="" date_confidence} \ ~{"--vcf-reference " + vcf_reference} + cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC + cat /proc/loadavg | cut -f 3 -d ' ' > LOAD_15M } runtime { docker: docker @@ -348,8 +356,11 @@ task refine_augur_tree { preemptible: 0 } output { - File tree_refined = "~{basename}_refined_tree.nwk" - File branch_lengths = "~{basename}_branch_lengths.json" + File tree_refined = "~{basename}_refined_tree.nwk" + File branch_lengths = "~{basename}_branch_lengths.json" + Int max_ram_gb = 
ceil(read_float("/sys/fs/cgroup/memory/memory.max_usage_in_bytes")/1000000000) + Int runtime_sec = ceil(read_float("UPTIME_SEC")) + Int cpu_load_15min = ceil(read_float("LOAD_15M")) String augur_version = read_string("VERSION") } } @@ -390,7 +401,8 @@ task ancestral_traits { preemptible: 2 } output { - File node_data_json = "~{basename}_nodes.json" + File node_data_json = "~{basename}_nodes.json" + Int max_ram_gb = ceil(read_float("/sys/fs/cgroup/memory/memory.max_usage_in_bytes")/1000000000) String augur_version = read_string("VERSION") } } @@ -433,6 +445,8 @@ task ancestral_tree { --inference ~{default="joint" inference} \ ~{true="--keep-ambiguous" false="" keep_ambiguous} \ ~{true="--infer-ambiguous" false="" infer_ambiguous} + cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC + cat /proc/loadavg | cut -f 3 -d ' ' > LOAD_15M } runtime { docker: docker @@ -443,8 +457,11 @@ task ancestral_tree { preemptible: 2 } output { - File nt_muts_json = "~{basename}_nt_muts.json" - File sequences = "~{basename}_ancestral_sequences.fasta" + File nt_muts_json = "~{basename}_nt_muts.json" + File sequences = "~{basename}_ancestral_sequences.fasta" + Int max_ram_gb = ceil(read_float("/sys/fs/cgroup/memory/memory.max_usage_in_bytes")/1000000000) + Int runtime_sec = ceil(read_float("UPTIME_SEC")) + Int cpu_load_15min = ceil(read_float("LOAD_15M")) String augur_version = read_string("VERSION") } } @@ -485,7 +502,8 @@ task translate_augur_tree { preemptible: 2 } output { - File aa_muts_json = "~{basename}_aa_muts.json" + File aa_muts_json = "~{basename}_aa_muts.json" + Int max_ram_gb = ceil(read_float("/sys/fs/cgroup/memory/memory.max_usage_in_bytes")/1000000000) String augur_version = read_string("VERSION") } } @@ -522,7 +540,8 @@ task assign_clades_to_nodes { preemptible: 2 } output { - File node_clade_data_json = "~{out_basename}_node-clade-assignments.json" + File node_clade_data_json = "~{out_basename}_node-clade-assignments.json" + Int max_ram_gb = 
ceil(read_float("/sys/fs/cgroup/memory/memory.max_usage_in_bytes")/1000000000) String augur_version = read_string("VERSION") } } @@ -563,8 +582,9 @@ task augur_import_beast { preemptible: 2 } output { - File tree_newick = "~{tree_basename}.nwk" - File node_data_json = "~{tree_basename}.json" + File tree_newick = "~{tree_basename}.nwk" + File node_data_json = "~{tree_basename}.json" + Int max_ram_gb = ceil(read_float("/sys/fs/cgroup/memory/memory.max_usage_in_bytes")/1000000000) String augur_version = read_string("VERSION") } } @@ -646,7 +666,8 @@ task export_auspice_json { preemptible: 2 } output { - File virus_json = "~{out_basename}_auspice.json" + File virus_json = "~{out_basename}_auspice.json" + Int max_ram_gb = ceil(read_float("/sys/fs/cgroup/memory/memory.max_usage_in_bytes")/1000000000) String augur_version = read_string("VERSION") } } From 118336ed56165ec38e0e1d546aee40f206b2b147 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Wed, 10 Jun 2020 07:55:47 -0400 Subject: [PATCH 17/17] copy data from inside container to host --- pipes/WDL/tasks/tasks_nextstrain.wdl | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/pipes/WDL/tasks/tasks_nextstrain.wdl b/pipes/WDL/tasks/tasks_nextstrain.wdl index 7e47f3fd7..ecbbb19c6 100644 --- a/pipes/WDL/tasks/tasks_nextstrain.wdl +++ b/pipes/WDL/tasks/tasks_nextstrain.wdl @@ -170,6 +170,7 @@ task augur_mafft_align { ~{true="--remove-reference" false="" remove_reference} \ --debug \ --nthreads auto + cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES } runtime { docker: docker @@ -182,7 +183,7 @@ task augur_mafft_align { output { File aligned_sequences = "~{basename}_aligned.fasta" File align_troubleshoot = stdout() - Int max_ram_gb = ceil(read_float("/sys/fs/cgroup/memory/memory.max_usage_in_bytes")/1000000000) + Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000) String augur_version = read_string("VERSION") } } @@ -214,6 +215,7 @@ task augur_mask_sites { else 
cp "~{sequences}" "~{out_fname}" fi + cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES } runtime { docker: docker @@ -225,7 +227,7 @@ task augur_mask_sites { } output { File masked_sequences = out_fname - Int max_ram_gb = ceil(read_float("/sys/fs/cgroup/memory/memory.max_usage_in_bytes")/1000000000) + Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000) String augur_version = read_string("VERSION") } } @@ -266,6 +268,7 @@ task draft_augur_tree { --nthreads auto cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC cat /proc/loadavg | cut -f 3 -d ' ' > LOAD_15M + cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES } runtime { docker: docker @@ -277,7 +280,7 @@ task draft_augur_tree { } output { File aligned_tree = "~{basename}_raw_tree.nwk" - Int max_ram_gb = ceil(read_float("/sys/fs/cgroup/memory/memory.max_usage_in_bytes")/1000000000) + Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000) Int runtime_sec = ceil(read_float("UPTIME_SEC")) Int cpu_load_15min = ceil(read_float("LOAD_15M")) String augur_version = read_string("VERSION") @@ -346,6 +349,7 @@ task refine_augur_tree { ~{"--vcf-reference " + vcf_reference} cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC cat /proc/loadavg | cut -f 3 -d ' ' > LOAD_15M + cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES } runtime { docker: docker @@ -358,7 +362,7 @@ task refine_augur_tree { output { File tree_refined = "~{basename}_refined_tree.nwk" File branch_lengths = "~{basename}_branch_lengths.json" - Int max_ram_gb = ceil(read_float("/sys/fs/cgroup/memory/memory.max_usage_in_bytes")/1000000000) + Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000) Int runtime_sec = ceil(read_float("UPTIME_SEC")) Int cpu_load_15min = ceil(read_float("LOAD_15M")) String augur_version = read_string("VERSION") @@ -391,6 +395,7 @@ task ancestral_traits { --output-node-data "~{basename}_nodes.json" \ ~{"--weights " + weights} \ ~{true="--confidence" false="" confidence} + cat 
/sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES } runtime { docker: docker @@ -402,7 +407,7 @@ task ancestral_traits { } output { File node_data_json = "~{basename}_nodes.json" - Int max_ram_gb = ceil(read_float("/sys/fs/cgroup/memory/memory.max_usage_in_bytes")/1000000000) + Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000) String augur_version = read_string("VERSION") } } @@ -447,6 +452,7 @@ task ancestral_tree { ~{true="--infer-ambiguous" false="" infer_ambiguous} cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC cat /proc/loadavg | cut -f 3 -d ' ' > LOAD_15M + cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES } runtime { docker: docker @@ -459,7 +465,7 @@ task ancestral_tree { output { File nt_muts_json = "~{basename}_nt_muts.json" File sequences = "~{basename}_ancestral_sequences.fasta" - Int max_ram_gb = ceil(read_float("/sys/fs/cgroup/memory/memory.max_usage_in_bytes")/1000000000) + Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000) Int runtime_sec = ceil(read_float("UPTIME_SEC")) Int cpu_load_15min = ceil(read_float("LOAD_15M")) String augur_version = read_string("VERSION") @@ -492,6 +498,7 @@ task translate_augur_tree { ~{"--vcf-reference " + vcf_reference} \ ~{"--genes " + genes} \ --output-node-data ~{basename}_aa_muts.json + cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES } runtime { docker: docker @@ -503,7 +510,7 @@ task translate_augur_tree { } output { File aa_muts_json = "~{basename}_aa_muts.json" - Int max_ram_gb = ceil(read_float("/sys/fs/cgroup/memory/memory.max_usage_in_bytes")/1000000000) + Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000) String augur_version = read_string("VERSION") } } @@ -530,6 +537,7 @@ task assign_clades_to_nodes { --reference ~{ref_fasta} \ --clades ~{clades_tsv} \ --output-node-data ~{out_basename}_node-clade-assignments.json + cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES } runtime { docker: docker @@ -541,7 +549,7 @@ task 
assign_clades_to_nodes { } output { File node_clade_data_json = "~{out_basename}_node-clade-assignments.json" - Int max_ram_gb = ceil(read_float("/sys/fs/cgroup/memory/memory.max_usage_in_bytes")/1000000000) + Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000) String augur_version = read_string("VERSION") } } @@ -572,6 +580,7 @@ task augur_import_beast { ~{"--tip-date-regex " + tip_date_regex} \ ~{"--tip-date-format " + tip_date_format} \ ~{"--tip-date-delimeter " + tip_date_delimiter} + cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES } runtime { docker: docker @@ -656,6 +665,7 @@ task export_auspice_json { ~{"--colors " + colors_tsv} \ ~{"--description " + description_md} \ --output ~{out_basename}_auspice.json) + cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES } runtime { docker: docker @@ -667,7 +677,7 @@ } output { File virus_json = "~{out_basename}_auspice.json" - Int max_ram_gb = ceil(read_float("/sys/fs/cgroup/memory/memory.max_usage_in_bytes")/1000000000) + Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000) String augur_version = read_string("VERSION") } }