Merge pull request #132 from broadinstitute/dp-nextstrain

nextstrain wdl updates
broadinstitute · Jun 26, 2020 · e09a206 · e09a206
2 parents 803aaad + 9af37a1
commit e09a206
Show file tree

Hide file tree

Showing 5 changed files with 216 additions and 8 deletions.
diff --git a/pipes/WDL/tasks/tasks_nextstrain.wdl b/pipes/WDL/tasks/tasks_nextstrain.wdl
@@ -83,8 +83,8 @@ task filter_subsample_sequences {
 
         Boolean  non_nucleotide=true
 
-        String?  min_date
-        String?  max_date
+        Float?   min_date
+        Float?   max_date
         Int?     min_length
         File?    priority
         Int?     subsample_seed
@@ -149,6 +149,134 @@ task filter_subsample_sequences {
     }
 }
 
+task filter_sequences_to_list {
+    meta {
+        description: "Filter and subsample a sequence set to a specific list of ids in a text file (one id per line)."
+    }
+    input {
+        File     sequences
+        File?    keep_list
+
+        String   docker = "nextstrain/base"
+    }
+    parameter_meta {
+        sequences: {
+          description: "Set of sequences (unaligned fasta or aligned fasta -- one sequence per genome) or variants (vcf format) to subsample using augur filter.",
+          patterns: ["*.fasta", "*.fa", "*.vcf", "*.vcf.gz"]
+        }
+        keep_list: {
+          description: "List of strain ids.",
+          patterns: ["*.txt", "*.tsv"]
+        }
+    }
+    # Output file name: ".filtered" is inserted in front of a ".vcf" or a trailing ".fasta"
+    # extension. sub() takes a regular expression, so the dots are escaped to match a
+    # literal "." only. Names with neither extension (e.g. "*.fa") are used unchanged.
+    String out_fname = sub(sub(basename(sequences), "\\.vcf", ".filtered.vcf"), "\\.fasta$", ".filtered.fasta")
+    command {
+        # pipefail: otherwise a failing augur filter is masked by the exit status of tee
+        set -e -o pipefail
+        augur version > VERSION
+        if [ -f "~{keep_list}" ]; then
+            # Turn the id list into a one-column table with a "strain" header and pass it
+            # to augur filter as --metadata.
+            # NOTE(review): presumably this relies on augur filter dropping sequences that
+            # have no metadata row -- confirm against the augur version in the image.
+            echo "strain" > keep_list.txt
+            cat "~{keep_list}" >> keep_list.txt
+            augur filter \
+                --sequences "~{sequences}" \
+                --metadata keep_list.txt \
+                --output "~{out_fname}" | tee STDOUT
+            # the counts are the first space-delimited field of the matching summary lines
+            grep "sequences were dropped during filtering" STDOUT | cut -f 1 -d ' ' > DROP_COUNT
+            grep "sequences have been written out to" STDOUT | cut -f 1 -d ' ' > OUT_COUNT
+        else
+            # no keep_list: copy the input through untouched; nothing is dropped and the
+            # output count is reported as -1 because it is not computed on this path
+            cp "~{sequences}" "~{out_fname}"
+            echo "0" > DROP_COUNT
+            echo "-1" > OUT_COUNT
+        fi
+
+        # profiling and stats
+        cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC
+        cat /proc/loadavg > CPU_LOAD
+        # Peak memory is best-effort: try the cgroup v1 file, then the cgroup v2 file, and
+        # fall back to 0 so that a missing file cannot fail the task under set -e.
+        if [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then
+            cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES
+        elif [ -f /sys/fs/cgroup/memory.peak ]; then
+            cat /sys/fs/cgroup/memory.peak > MEM_BYTES
+        else
+            echo "0" > MEM_BYTES
+        fi
+    }
+    runtime {
+        docker: docker
+        memory: "3 GB"
+        cpu :   2
+        disks:  "local-disk 100 HDD"
+        dx_instance_type: "mem1_ssd1_v2_x2"
+        preemptible: 1
+    }
+    output {
+        # filtered_fasta holds whatever out_fname names, which may be a VCF when a VCF was given
+        File   filtered_fasta    = out_fname
+        String augur_version     = read_string("VERSION")
+        Int    sequences_dropped = read_int("DROP_COUNT")
+        # -1 when no keep_list was supplied (see the else branch of the command)
+        Int    sequences_out     = read_int("OUT_COUNT")
+        # bytes -> decimal gigabytes, rounded up; 0 when no cgroup memory file was found
+        Int    max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000)
+        Int    runtime_sec = ceil(read_float("UPTIME_SEC"))
+        String cpu_load = read_string("CPU_LOAD")
+    }
+}
+
+task mafft_one_chr {
+    meta {
+        description: "Align multiple sequences from FASTA. Only appropriate for closely related (within 99% nucleotide conservation) genomes. See https://mafft.cbrc.jp/alignment/software/closelyrelatedviralgenomes.html"
+    }
+    input {
+        File     sequences
+        File?    ref_fasta
+        # prefix of the output file, which is written as <basename>_aligned.fasta
+        String   basename
+        # only honoured when ref_fasta is also supplied (see the command below)
+        Boolean  remove_reference = false
+        Boolean  keep_length = true
+
+        String   docker = "quay.io/broadinstitute/viral-phylo"
+    }
+    command {
+        set -e
+        # mafft arguments are collected one per line and passed on through xargs
+        touch args.txt
+
+        # if ref_fasta is specified, use "closely related" mode
+        # see https://mafft.cbrc.jp/alignment/software/closelyrelatedviralgenomes.html
+        if [ -f "~{ref_fasta}" ]; then
+            echo --addfragments >> args.txt
+            echo "~{sequences}" >> args.txt
+            echo "~{ref_fasta}" >> args.txt
+        else
+            echo "~{sequences}" >> args.txt
+        fi
+
+        # mafft align to reference in "closely related" mode
+        # (newline-delimited xargs input keeps file paths containing spaces intact)
+        cat args.txt | xargs -d '\n' mafft --auto --thread -1 \
+            ~{true='--keeplength --mapout' false='' keep_length} \
+            > msa.fasta
+
+        # remove reference sequence
+        # A copy of the alignment without its first record is always written; it is only
+        # used below when removal was requested and a reference was supplied.
+        # NOTE(review): this assumes mafft emits the reference as the first record -- confirm.
+        python3 <<CODE
+        import Bio.SeqIO
+        seq_it = Bio.SeqIO.parse('msa.fasta', 'fasta')
+        print("dumping " + str(next(seq_it).id))
+        Bio.SeqIO.write(seq_it, 'msa_drop_one.fasta', 'fasta')
+        CODE
+        REMOVE_REF="~{true='--remove-reference' false='' remove_reference}"
+        # two separate tests joined by && replace the obsolescent -a operator
+        if [ -n "$REMOVE_REF" ] && [ -f "~{ref_fasta}" ]; then
+            mv msa_drop_one.fasta "~{basename}_aligned.fasta"
+        else
+            mv msa.fasta "~{basename}_aligned.fasta"
+        fi
+
+        # profiling and stats
+        cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC
+        cat /proc/loadavg > CPU_LOAD
+        # Peak memory is best-effort: try the cgroup v1 file, then the cgroup v2 file, and
+        # fall back to 0 so that a missing file cannot fail a finished alignment under set -e.
+        if [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then
+            cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES
+        elif [ -f /sys/fs/cgroup/memory.peak ]; then
+            cat /sys/fs/cgroup/memory.peak > MEM_BYTES
+        else
+            echo "0" > MEM_BYTES
+        fi
+    }
+    runtime {
+        docker: docker
+        memory: "60 GB"
+        cpu :   32
+        disks:  "local-disk 100 HDD"
+        preemptible: 0
+        dx_instance_type: "mem1_ssd1_v2_x36"
+    }
+    output {
+        File   aligned_sequences = "~{basename}_aligned.fasta"
+        # bytes -> decimal gigabytes, rounded up; 0 when no cgroup memory file was found
+        Int    max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000)
+        Int    runtime_sec = ceil(read_float("UPTIME_SEC"))
+        String cpu_load = read_string("CPU_LOAD")
+    }
+}
+
 task augur_mafft_align {
     meta {
         description: "Align multiple sequences from FASTA. See https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/align.html"
@@ -589,7 +717,7 @@ task assign_clades_to_nodes {
         preemptible: 1
     }
     output {
-        File   node_clade_data_json = "~{out_basename}_node-clade-assignments.json"
+        File   node_clade_data_json = "~{out_basename}_clades.json"
         Int    max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000)
         String augur_version      = read_string("VERSION")
     }

diff --git a/pipes/WDL/workflows/augur_from_assemblies.wdl b/pipes/WDL/workflows/augur_from_assemblies.wdl
@@ -58,19 +58,19 @@ workflow augur_from_assemblies {
                 sequences_fasta     = concatenate.combined,
                 sample_metadata_tsv = sample_metadata
     }
-    call nextstrain.augur_mafft_align {
+    call nextstrain.mafft_one_chr as mafft {
         input:
             sequences = filter_subsample_sequences.filtered_fasta,
             ref_fasta = ref_fasta,
             basename  = virus
     }
     call nextstrain.snp_sites {
         input:
-            msa_fasta = augur_mafft_align.aligned_sequences
+            msa_fasta = mafft.aligned_sequences
     }
     call nextstrain.augur_mask_sites {
         input:
-            sequences = augur_mafft_align.aligned_sequences
+            sequences = mafft.aligned_sequences
     }
     call nextstrain.draft_augur_tree {
         input:
@@ -125,7 +125,7 @@ workflow augur_from_assemblies {
 
     output {
         File  combined_assemblies = concatenate.combined
-        File  multiple_alignment  = augur_mafft_align.aligned_sequences
+        File  multiple_alignment  = mafft.aligned_sequences
         File  unmasked_snps       = snp_sites.snps_vcf
         File  masked_alignment    = augur_mask_sites.masked_sequences
         File  ml_tree             = draft_augur_tree.aligned_tree

diff --git a/pipes/WDL/workflows/augur_from_msa.wdl b/pipes/WDL/workflows/augur_from_msa.wdl
@@ -11,6 +11,7 @@ workflow augur_from_msa {
 
     input {
         File            msa_or_vcf
+        File?           sequence_ids_to_keep
         File            sample_metadata
         File            ref_fasta
         File            genbank_gb
@@ -23,6 +24,10 @@ workflow augur_from_msa {
           description: "Multiple sequence alignment (aligned fasta) or variants (vcf format).",
           patterns: ["*.fasta", "*.fa", "*.vcf", "*.vcf.gz"]
         }
+        sequence_ids_to_keep: {
+          description: "Optional list of sequence IDs (one per line) to filter the msa_or_vcf to at the beginning (otherwise we compute on all sequences in msa_or_vcf).",
+          patterns: ["*.txt", "*.tsv"]
+        }
         sample_metadata: {
           description: "Metadata in tab-separated text format. See https://nextstrain-augur.readthedocs.io/en/stable/faq/metadata.html for details.",
           patterns: ["*.txt", "*.tsv"]
@@ -44,9 +49,14 @@ workflow augur_from_msa {
         }
     }
 
+    call nextstrain.filter_sequences_to_list {
+        input:
+            sequences = msa_or_vcf,
+            keep_list = sequence_ids_to_keep
+    }
     call nextstrain.augur_mask_sites {
         input:
-            sequences = msa_or_vcf
+            sequences = filter_sequences_to_list.filtered_fasta
     }
     call nextstrain.draft_augur_tree {
         input:

diff --git a/pipes/WDL/workflows/filter_sequences.wdl b/pipes/WDL/workflows/filter_sequences.wdl
@@ -0,0 +1,11 @@
+version 1.0
+
+import "../tasks/tasks_nextstrain.wdl" as nextstrain
+
+workflow filter_sequences {
+    meta {
+        description: "Filter and subsample a sequence set. See https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/filter.html"
+    }
+
+    # Thin wrapper: runs the single task filter_subsample_sequences under the alias "filter".
+    # No inputs are bound here and no output section is declared.
+    # NOTE(review): presumably the task inputs are meant to be supplied as call-level
+    # inputs when the workflow is launched -- confirm with the execution engine in use.
+    call nextstrain.filter_subsample_sequences as filter
+}
diff --git a/pipes/WDL/workflows/mafft_iqtree.wdl b/pipes/WDL/workflows/mafft_iqtree.wdl
@@ -0,0 +1,59 @@
+version 1.0
+
+import "../tasks/tasks_nextstrain.wdl" as nextstrain
+
+workflow mafft_iqtree {
+    meta {
+        description: "Align assemblies, mask sites, build tree."
+        author: "Broad Viral Genomics"
+        email:  "viral-ngs@broadinstitute.org"
+    }
+
+    input {
+        Array[File]     assembly_fastas
+        File            ref_fasta
+    }
+
+    parameter_meta {
+        assembly_fastas: {
+          description: "Set of assembled genomes to align and build trees. These must represent a single chromosome/segment of a genome only. Fastas may be one-sequence-per-individual or a concatenated multi-fasta (unaligned) or a mixture of the two.",
+          patterns: ["*.fasta", "*.fa"]
+        }
+        ref_fasta: {
+          description: "A reference assembly (not included in assembly_fastas) to align assembly_fastas against. Typically from NCBI RefSeq or similar.",
+          patterns: ["*.fasta", "*.fa"]
+        }
+    }
+
+    # 1. merge all input fastas into one multi-fasta
+    call nextstrain.concatenate {
+        input:
+            infiles     = assembly_fastas,
+            output_name = "all_samples_combined_assembly.fasta"
+    }
+    # 2. align the merged sequences against the reference
+    call nextstrain.mafft_one_chr as mafft {
+        input:
+            sequences = concatenate.combined,
+            ref_fasta = ref_fasta,
+            # mafft_one_chr appends "_aligned.fasta" to this prefix, so only the bare
+            # prefix is passed; the alignment is written as all_samples_aligned.fasta
+            basename  = "all_samples"
+    }
+    # 3. the unmasked alignment feeds both snp_sites and augur_mask_sites
+    call nextstrain.snp_sites {
+        input:
+            msa_fasta = mafft.aligned_sequences
+    }
+    call nextstrain.augur_mask_sites {
+        input:
+            sequences = mafft.aligned_sequences
+    }
+    # 4. the tree is built from the masked alignment
+    call nextstrain.draft_augur_tree {
+        input:
+            msa_or_vcf = augur_mask_sites.masked_sequences
+    }
+
+    output {
+        File  combined_assemblies = concatenate.combined
+        File  multiple_alignment  = mafft.aligned_sequences
+        File  unmasked_snps       = snp_sites.snps_vcf
+        File  masked_alignment    = augur_mask_sites.masked_sequences
+        File  ml_tree             = draft_augur_tree.aligned_tree
+    }
+}