Skip to content

Commit

Permalink
Merge pull request #132 from broadinstitute/dp-nextstrain
Browse files Browse the repository at this point in the history
nextstrain wdl updates
  • Loading branch information
dpark01 authored Jun 26, 2020
2 parents 803aaad + 9af37a1 commit e09a206
Show file tree
Hide file tree
Showing 5 changed files with 216 additions and 8 deletions.
134 changes: 131 additions & 3 deletions pipes/WDL/tasks/tasks_nextstrain.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@ task filter_subsample_sequences {
Boolean non_nucleotide=true
String? min_date
String? max_date
Float? min_date
Float? max_date
Int? min_length
File? priority
Int? subsample_seed
Expand Down Expand Up @@ -149,6 +149,134 @@ task filter_subsample_sequences {
}
}
task filter_sequences_to_list {
    meta {
        description: "Filter and subsample a sequence set to a specific list of ids in a text file (one id per line)."
    }
    input {
        File     sequences
        File?    keep_list

        String   docker = "nextstrain/base"
    }
    parameter_meta {
        sequences: {
            description: "Set of sequences (unaligned fasta or aligned fasta -- one sequence per genome) or variants (vcf format) to subsample using augur filter.",
            patterns: ["*.fasta", "*.fa", "*.vcf", "*.vcf.gz"]
        }
        keep_list: {
            description: "List of strain ids.",
            patterns: ["*.txt", "*.tsv"]
        }
    }

    # Output file name: the input's basename with ".filtered" inserted before the
    # recognised extension (.vcf / .vcf.gz, .fasta, .fa). The dots are escaped
    # because sub() takes a regular expression, so a bare "." would match any character.
    String out_fname = sub(sub(sub(basename(sequences), "\\.vcf", ".filtered.vcf"), "\\.fasta$", ".filtered.fasta"), "\\.fa$", ".filtered.fa")

    command <<<
        # pipefail: without it, a failing `augur filter` is masked by the exit
        # status of `tee`, and the task only breaks later with an obscure error.
        set -e -o pipefail
        augur version > VERSION

        if [ -f "~{keep_list}" ]; then
            # augur filter takes its ids from a metadata table, so turn the plain
            # id list into a one-column table with a "strain" header.
            echo "strain" > keep_list.txt
            cat "~{keep_list}" >> keep_list.txt
            augur filter \
                --sequences "~{sequences}" \
                --metadata keep_list.txt \
                --output "~{out_fname}" | tee STDOUT
            # The counts are scraped from augur's own report lines; if a line is
            # missing, grep fails and the task stops here.
            grep "sequences were dropped during filtering" STDOUT | cut -f 1 -d ' ' > DROP_COUNT
            grep "sequences have been written out to" STDOUT | cut -f 1 -d ' ' > OUT_COUNT
        else
            # No keep list supplied: pass the input through untouched.
            # OUT_COUNT of -1 marks "not counted".
            cp "~{sequences}" "~{out_fname}"
            echo "0" > DROP_COUNT
            echo "-1" > OUT_COUNT
        fi

        # profiling and stats
        cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC
        cat /proc/loadavg > CPU_LOAD
        # Peak memory lives at different paths under cgroup v1 and v2; fall back
        # to 0 rather than failing a finished job because the file is absent.
        if [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then
            cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES
        elif [ -f /sys/fs/cgroup/memory.peak ]; then
            cat /sys/fs/cgroup/memory.peak > MEM_BYTES
        else
            echo "0" > MEM_BYTES
        fi
    >>>
    runtime {
        docker: docker
        memory: "3 GB"
        cpu :   2
        disks:  "local-disk 100 HDD"
        dx_instance_type: "mem1_ssd1_v2_x2"
        preemptible: 1
    }
    output {
        File   filtered_fasta    = out_fname
        String augur_version     = read_string("VERSION")
        Int    sequences_dropped = read_int("DROP_COUNT")
        Int    sequences_out     = read_int("OUT_COUNT")
        Int    max_ram_gb        = ceil(read_float("MEM_BYTES")/1000000000)
        Int    runtime_sec       = ceil(read_float("UPTIME_SEC"))
        String cpu_load          = read_string("CPU_LOAD")
    }
}
task mafft_one_chr {
    meta {
        description: "Align multiple sequences from FASTA. Only appropriate for closely related (within 99% nucleotide conservation) genomes. See https://mafft.cbrc.jp/alignment/software/closelyrelatedviralgenomes.html"
    }
    input {
        File     sequences
        File?    ref_fasta
        String   basename
        Boolean  remove_reference = false
        Boolean  keep_length = true

        String   docker = "quay.io/broadinstitute/viral-phylo"
    }
    command <<<
        set -e

        # Positional arguments for mafft are collected one per line in args.txt
        # and handed over through xargs, so file paths are passed verbatim.
        touch args.txt

        # if ref_fasta is specified, use "closely related" mode
        # see https://mafft.cbrc.jp/alignment/software/closelyrelatedviralgenomes.html
        if [ -f "~{ref_fasta}" ]; then
            echo --addfragments >> args.txt
            echo "~{sequences}" >> args.txt
            echo "~{ref_fasta}" >> args.txt
        else
            echo "~{sequences}" >> args.txt
        fi

        # mafft align to reference in "closely related" mode
        cat args.txt | xargs -d '\n' mafft --auto --thread -1 \
            ~{true='--keeplength --mapout' false='' keep_length} \
            > msa.fasta

        # Write a copy of the alignment without its first record. This always
        # runs; the copy is only used below when the reference is to be removed.
        # NOTE(review): assumes the reference is the first record mafft emits in
        # --addfragments mode -- confirm.
        python3 <<CODE
        import Bio.SeqIO
        seq_it = Bio.SeqIO.parse('msa.fasta', 'fasta')
        print("dumping " + str(next(seq_it).id))
        Bio.SeqIO.write(seq_it, 'msa_drop_one.fasta', 'fasta')
        CODE

        # Drop the reference only if asked to AND a reference was actually given.
        REMOVE_REF="~{true='--remove-reference' false='' remove_reference}"
        if [ -n "$REMOVE_REF" ] && [ -f "~{ref_fasta}" ]; then
            mv msa_drop_one.fasta "~{basename}_aligned.fasta"
        else
            mv msa.fasta "~{basename}_aligned.fasta"
        fi

        # profiling and stats
        cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC
        cat /proc/loadavg > CPU_LOAD
        # Peak memory lives at different paths under cgroup v1 and v2; fall back
        # to 0 rather than failing a finished alignment because the file is absent.
        if [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then
            cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES
        elif [ -f /sys/fs/cgroup/memory.peak ]; then
            cat /sys/fs/cgroup/memory.peak > MEM_BYTES
        else
            echo "0" > MEM_BYTES
        fi
    >>>
    runtime {
        docker: docker
        memory: "60 GB"
        cpu :   32
        disks:  "local-disk 100 HDD"
        preemptible: 0
        dx_instance_type: "mem1_ssd1_v2_x36"
    }
    output {
        # The output name is "<basename>_aligned.fasta", so callers should pass
        # a bare prefix as `basename`, without an extension.
        File   aligned_sequences = "~{basename}_aligned.fasta"
        Int    max_ram_gb        = ceil(read_float("MEM_BYTES")/1000000000)
        Int    runtime_sec       = ceil(read_float("UPTIME_SEC"))
        String cpu_load          = read_string("CPU_LOAD")
    }
}
task augur_mafft_align {
meta {
description: "Align multiple sequences from FASTA. See https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/align.html"
Expand Down Expand Up @@ -589,7 +717,7 @@ task assign_clades_to_nodes {
preemptible: 1
}
output {
File node_clade_data_json = "~{out_basename}_node-clade-assignments.json"
File node_clade_data_json = "~{out_basename}_clades.json"
Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000)
String augur_version = read_string("VERSION")
}
Expand Down
8 changes: 4 additions & 4 deletions pipes/WDL/workflows/augur_from_assemblies.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -58,19 +58,19 @@ workflow augur_from_assemblies {
sequences_fasta = concatenate.combined,
sample_metadata_tsv = sample_metadata
}
call nextstrain.augur_mafft_align {
call nextstrain.mafft_one_chr as mafft {
input:
sequences = filter_subsample_sequences.filtered_fasta,
ref_fasta = ref_fasta,
basename = virus
}
call nextstrain.snp_sites {
input:
msa_fasta = augur_mafft_align.aligned_sequences
msa_fasta = mafft.aligned_sequences
}
call nextstrain.augur_mask_sites {
input:
sequences = augur_mafft_align.aligned_sequences
sequences = mafft.aligned_sequences
}
call nextstrain.draft_augur_tree {
input:
Expand Down Expand Up @@ -125,7 +125,7 @@ workflow augur_from_assemblies {

output {
File combined_assemblies = concatenate.combined
File multiple_alignment = augur_mafft_align.aligned_sequences
File multiple_alignment = mafft.aligned_sequences
File unmasked_snps = snp_sites.snps_vcf
File masked_alignment = augur_mask_sites.masked_sequences
File ml_tree = draft_augur_tree.aligned_tree
Expand Down
12 changes: 11 additions & 1 deletion pipes/WDL/workflows/augur_from_msa.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ workflow augur_from_msa {

input {
File msa_or_vcf
File? sequence_ids_to_keep
File sample_metadata
File ref_fasta
File genbank_gb
Expand All @@ -23,6 +24,10 @@ workflow augur_from_msa {
description: "Multiple sequence alignment (aligned fasta) or variants (vcf format).",
patterns: ["*.fasta", "*.fa", "*.vcf", "*.vcf.gz"]
}
sequence_ids_to_keep: {
description: "Optional list of sequence IDs (one per line) to filter the msa_or_vcf to at the beginning (otherwise we compute on all sequences in msa_or_vcf).",
patterns: ["*.txt", "*.tsv"]
}
sample_metadata: {
description: "Metadata in tab-separated text format. See https://nextstrain-augur.readthedocs.io/en/stable/faq/metadata.html for details.",
patterns: ["*.txt", "*.tsv"]
Expand All @@ -44,9 +49,14 @@ workflow augur_from_msa {
}
}

call nextstrain.filter_sequences_to_list {
input:
sequences = msa_or_vcf,
keep_list = sequence_ids_to_keep
}
call nextstrain.augur_mask_sites {
input:
sequences = msa_or_vcf
sequences = filter_sequences_to_list.filtered_fasta
}
call nextstrain.draft_augur_tree {
input:
Expand Down
11 changes: 11 additions & 0 deletions pipes/WDL/workflows/filter_sequences.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
version 1.0

import "../tasks/tasks_nextstrain.wdl" as nextstrain

# Thin wrapper workflow that runs the filter_subsample_sequences task from
# tasks_nextstrain.wdl on its own.
workflow filter_sequences {
meta {
description: "Filter and subsample a sequence set. See https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/filter.html"
}

# No inputs are bound here and no workflow-level output section is declared;
# the call is aliased as "filter", so the task's inputs and outputs are
# addressed under that name.
call nextstrain.filter_subsample_sequences as filter
}
59 changes: 59 additions & 0 deletions pipes/WDL/workflows/mafft_iqtree.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
version 1.0

import "../tasks/tasks_nextstrain.wdl" as nextstrain

workflow mafft_iqtree {
    meta {
        description: "Align assemblies, mask sites, build tree."
        author: "Broad Viral Genomics"
        email:  "viral-ngs@broadinstitute.org"
    }

    input {
        Array[File]  assembly_fastas
        File         ref_fasta
    }

    parameter_meta {
        assembly_fastas: {
            description: "Set of assembled genomes to align and build trees. These must represent a single chromosome/segment of a genome only. Fastas may be one-sequence-per-individual or a concatenated multi-fasta (unaligned) or a mixture of the two.",
            patterns: ["*.fasta", "*.fa"]
        }
        ref_fasta: {
            description: "A reference assembly (not included in assembly_fastas) to align assembly_fastas against. Typically from NCBI RefSeq or similar.",
            patterns: ["*.fasta", "*.fa"]
        }
    }

    # Step 1: merge all input assemblies into a single multi-fasta.
    call nextstrain.concatenate {
        input:
            infiles     = assembly_fastas,
            output_name = "all_samples_combined_assembly.fasta"
    }

    # Step 2: align the merged sequences against the reference.
    # mafft_one_chr names its output "<basename>_aligned.fasta", so pass a bare
    # prefix here; passing a full file name would yield a doubled suffix
    # ("all_samples_aligned.fasta_aligned.fasta").
    call nextstrain.mafft_one_chr as mafft {
        input:
            sequences = concatenate.combined,
            ref_fasta = ref_fasta,
            basename  = "all_samples"
    }

    # Step 3: derive SNP calls from the unmasked alignment.
    call nextstrain.snp_sites {
        input:
            msa_fasta = mafft.aligned_sequences
    }

    # Step 4: mask sites in the alignment before tree building.
    call nextstrain.augur_mask_sites {
        input:
            sequences = mafft.aligned_sequences
    }

    # Step 5: build the tree from the masked alignment.
    call nextstrain.draft_augur_tree {
        input:
            msa_or_vcf = augur_mask_sites.masked_sequences
    }

    output {
        File  combined_assemblies = concatenate.combined
        File  multiple_alignment  = mafft.aligned_sequences
        File  unmasked_snps       = snp_sites.snps_vcf
        File  masked_alignment    = augur_mask_sites.masked_sequences
        File  ml_tree             = draft_augur_tree.aligned_tree
    }
}

0 comments on commit e09a206

Please sign in to comment.