Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

nextstrain wdl updates #132

Merged
merged 10 commits into from
Jun 26, 2020
134 changes: 131 additions & 3 deletions pipes/WDL/tasks/tasks_nextstrain.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@ task filter_subsample_sequences {

Boolean non_nucleotide=true

String? min_date
String? max_date
Float? min_date
Float? max_date
Int? min_length
File? priority
Int? subsample_seed
Expand Down Expand Up @@ -149,6 +149,134 @@ task filter_subsample_sequences {
}
}

task filter_sequences_to_list {
  meta {
    description: "Filter and subsample a sequence set to a specific list of ids in a text file (one id per line)."
  }
  input {
    File    sequences
    File?   keep_list

    String  docker = "nextstrain/base"
  }
  parameter_meta {
    sequences: {
      description: "Set of sequences (unaligned fasta or aligned fasta -- one sequence per genome) or variants (vcf format) to subsample using augur filter.",
      patterns: ["*.fasta", "*.fa", "*.vcf", "*.vcf.gz"]
    }
    keep_list: {
      description: "List of strain ids.",
      patterns: ["*.txt", "*.tsv"]
    }
  }

  # Output file name: the input basename with ".filtered" inserted ahead of a
  # ".vcf" extension (anywhere, so ".vcf.gz" also works) or a trailing ".fasta".
  # sub() takes a regular expression, so the dots are escaped to match a
  # literal "." rather than any character. Other extensions (e.g. ".fa") are
  # left unchanged.
  String out_fname = sub(sub(basename(sequences), "\\.vcf", ".filtered.vcf"), "\\.fasta$", ".filtered.fasta")

  command {
    # pipefail: without it the exit status of "augur filter | tee" is the
    # status of tee, and a failed augur run would go unnoticed by set -e.
    set -e -o pipefail
    augur version > VERSION

    if [ -f "~{keep_list}" ]; then
      # Build a one-column table with a "strain" header from the id list and
      # hand it to augur filter as its --metadata file.
      echo "strain" > keep_list.txt
      cat "~{keep_list}" >> keep_list.txt
      augur filter \
        --sequences "~{sequences}" \
        --metadata keep_list.txt \
        --output "~{out_fname}" | tee STDOUT
      # Counts are scraped from the first space-delimited field of the
      # matching augur log lines.
      grep "sequences were dropped during filtering" STDOUT | cut -f 1 -d ' ' > DROP_COUNT
      grep "sequences have been written out to" STDOUT | cut -f 1 -d ' ' > OUT_COUNT
    else
      # No keep list supplied: pass the input through untouched.
      # OUT_COUNT of -1 is a sentinel, the sequences are not counted here.
      cp "~{sequences}" "~{out_fname}"
      echo "0" > DROP_COUNT
      echo "-1" > OUT_COUNT
    fi

    # profiling and stats
    cut -f 1 -d ' ' /proc/uptime > UPTIME_SEC
    cat /proc/loadavg > CPU_LOAD
    # Peak memory: try the cgroup v1 path first, then the cgroup v2 file, and
    # fall back to 0 so a missing counter cannot fail an otherwise finished task.
    cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES 2>/dev/null \
      || cat /sys/fs/cgroup/memory.peak > MEM_BYTES 2>/dev/null \
      || echo 0 > MEM_BYTES
  }
  runtime {
    docker: docker
    memory: "3 GB"
    cpu :   2
    disks:  "local-disk 100 HDD"
    dx_instance_type: "mem1_ssd1_v2_x2"
    preemptible: 1
  }
  output {
    # filtered_fasta holds whatever format the input had (fasta or vcf).
    File   filtered_fasta    = out_fname
    String augur_version     = read_string("VERSION")
    Int    sequences_dropped = read_int("DROP_COUNT")
    # -1 when no keep_list was given (pass-through mode).
    Int    sequences_out     = read_int("OUT_COUNT")
    Int    max_ram_gb        = ceil(read_float("MEM_BYTES")/1000000000)
    Int    runtime_sec       = ceil(read_float("UPTIME_SEC"))
    String cpu_load          = read_string("CPU_LOAD")
  }
}

task mafft_one_chr {
  meta {
    description: "Align multiple sequences from FASTA. Only appropriate for closely related (within 99% nucleotide conservation) genomes. See https://mafft.cbrc.jp/alignment/software/closelyrelatedviralgenomes.html"
  }
  input {
    File     sequences
    File?    ref_fasta
    String   basename
    Boolean  remove_reference = false
    Boolean  keep_length = true

    String   docker = "quay.io/broadinstitute/viral-phylo"
  }
  command {
    set -e

    # Positional arguments for mafft are collected one per line in args.txt
    # and fed through xargs, so file paths are passed as single arguments.
    touch args.txt

    # if ref_fasta is specified, use "closely related" mode
    # see https://mafft.cbrc.jp/alignment/software/closelyrelatedviralgenomes.html
    if [ -f "~{ref_fasta}" ]; then
      echo --addfragments >> args.txt
      echo "~{sequences}" >> args.txt
      echo "~{ref_fasta}" >> args.txt
    else
      echo "~{sequences}" >> args.txt
    fi

    # mafft align to reference in "closely related" mode
    xargs -d '\n' mafft --auto --thread -1 \
      ~{true='--keeplength --mapout' false='' keep_length} \
      < args.txt > msa.fasta

    # Pick the final alignment. The first record of msa.fasta is dropped only
    # when remove_reference is set AND a reference file was actually supplied;
    # otherwise the alignment is used as is and the python step is skipped
    # entirely, so it cannot fail on an alignment with no records.
    REMOVE_REF="~{true='--remove-reference' false='' remove_reference}"
    if [ -n "$REMOVE_REF" ] && [ -f "~{ref_fasta}" ]; then
      python3 -c 'import Bio.SeqIO; seq_it = Bio.SeqIO.parse("msa.fasta", "fasta"); print("dumping " + str(next(seq_it).id)); Bio.SeqIO.write(seq_it, "msa_drop_one.fasta", "fasta")'
      mv msa_drop_one.fasta "~{basename}_aligned.fasta"
    else
      mv msa.fasta "~{basename}_aligned.fasta"
    fi

    # profiling and stats
    cut -f 1 -d ' ' /proc/uptime > UPTIME_SEC
    cat /proc/loadavg > CPU_LOAD
    # Peak memory: try the cgroup v1 path first, then the cgroup v2 file, and
    # fall back to 0 so a missing counter cannot fail an otherwise finished task.
    cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES 2>/dev/null \
      || cat /sys/fs/cgroup/memory.peak > MEM_BYTES 2>/dev/null \
      || echo 0 > MEM_BYTES
  }
  runtime {
    docker: docker
    memory: "60 GB"
    cpu :   32
    disks:  "local-disk 100 HDD"
    preemptible: 0
    dx_instance_type: "mem1_ssd1_v2_x36"
  }
  output {
    # Note the fixed "_aligned.fasta" suffix: callers should pass a bare
    # prefix as basename, not a full file name.
    File   aligned_sequences = "~{basename}_aligned.fasta"
    Int    max_ram_gb        = ceil(read_float("MEM_BYTES")/1000000000)
    Int    runtime_sec       = ceil(read_float("UPTIME_SEC"))
    String cpu_load          = read_string("CPU_LOAD")
  }
}

task augur_mafft_align {
meta {
description: "Align multiple sequences from FASTA. See https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/align.html"
Expand Down Expand Up @@ -589,7 +717,7 @@ task assign_clades_to_nodes {
preemptible: 1
}
output {
File node_clade_data_json = "~{out_basename}_node-clade-assignments.json"
File node_clade_data_json = "~{out_basename}_clades.json"
Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000)
String augur_version = read_string("VERSION")
}
Expand Down
8 changes: 4 additions & 4 deletions pipes/WDL/workflows/augur_from_assemblies.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -58,19 +58,19 @@ workflow augur_from_assemblies {
sequences_fasta = concatenate.combined,
sample_metadata_tsv = sample_metadata
}
call nextstrain.augur_mafft_align {
call nextstrain.mafft_one_chr as mafft {
input:
sequences = filter_subsample_sequences.filtered_fasta,
ref_fasta = ref_fasta,
basename = virus
}
call nextstrain.snp_sites {
input:
msa_fasta = augur_mafft_align.aligned_sequences
msa_fasta = mafft.aligned_sequences
}
call nextstrain.augur_mask_sites {
input:
sequences = augur_mafft_align.aligned_sequences
sequences = mafft.aligned_sequences
}
call nextstrain.draft_augur_tree {
input:
Expand Down Expand Up @@ -125,7 +125,7 @@ workflow augur_from_assemblies {

output {
File combined_assemblies = concatenate.combined
File multiple_alignment = augur_mafft_align.aligned_sequences
File multiple_alignment = mafft.aligned_sequences
File unmasked_snps = snp_sites.snps_vcf
File masked_alignment = augur_mask_sites.masked_sequences
File ml_tree = draft_augur_tree.aligned_tree
Expand Down
12 changes: 11 additions & 1 deletion pipes/WDL/workflows/augur_from_msa.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ workflow augur_from_msa {

input {
File msa_or_vcf
File? sequence_ids_to_keep
File sample_metadata
File ref_fasta
File genbank_gb
Expand All @@ -23,6 +24,10 @@ workflow augur_from_msa {
description: "Multiple sequence alignment (aligned fasta) or variants (vcf format).",
patterns: ["*.fasta", "*.fa", "*.vcf", "*.vcf.gz"]
}
sequence_ids_to_keep: {
description: "Optional list of sequence IDs (one per line) to filter the msa_or_vcf to at the beginning (otherwise we compute on all sequences in msa_or_vcf).",
patterns: ["*.txt", "*.tsv"]
}
sample_metadata: {
description: "Metadata in tab-separated text format. See https://nextstrain-augur.readthedocs.io/en/stable/faq/metadata.html for details.",
patterns: ["*.txt", "*.tsv"]
Expand All @@ -44,9 +49,14 @@ workflow augur_from_msa {
}
}

call nextstrain.filter_sequences_to_list {
input:
sequences = msa_or_vcf,
keep_list = sequence_ids_to_keep
}
call nextstrain.augur_mask_sites {
input:
sequences = msa_or_vcf
sequences = filter_sequences_to_list.filtered_fasta
}
call nextstrain.draft_augur_tree {
input:
Expand Down
11 changes: 11 additions & 0 deletions pipes/WDL/workflows/filter_sequences.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
version 1.0

import "../tasks/tasks_nextstrain.wdl" as nextstrain

# Thin wrapper workflow around a single task: it runs
# filter_subsample_sequences from the imported "nextstrain" task namespace.
workflow filter_sequences {
meta {
description: "Filter and subsample a sequence set. See https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/filter.html"
}

# The call is aliased to "filter". No inputs are wired here, so every input of
# the task has to be supplied by whoever launches the workflow.
# NOTE(review): no output section is declared; how the call outputs are
# surfaced is left to the execution engine -- confirm this is intended.
call nextstrain.filter_subsample_sequences as filter
}
59 changes: 59 additions & 0 deletions pipes/WDL/workflows/mafft_iqtree.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
version 1.0

import "../tasks/tasks_nextstrain.wdl" as nextstrain

workflow mafft_iqtree {
  meta {
    description: "Align assemblies, mask sites, build tree."
    author: "Broad Viral Genomics"
    email:  "viral-ngs@broadinstitute.org"
  }

  input {
    Array[File]  assembly_fastas
    File         ref_fasta
  }

  parameter_meta {
    assembly_fastas: {
      description: "Set of assembled genomes to align and build trees. These must represent a single chromosome/segment of a genome only. Fastas may be one-sequence-per-individual or a concatenated multi-fasta (unaligned) or a mixture of the two.",
      patterns: ["*.fasta", "*.fa"]
    }
    ref_fasta: {
      description: "A reference assembly (not included in assembly_fastas) to align assembly_fastas against. Typically from NCBI RefSeq or similar.",
      patterns: ["*.fasta", "*.fa"]
    }
  }

  # Step 1: merge all input fastas into one multi-fasta.
  call nextstrain.concatenate {
    input:
      infiles     = assembly_fastas,
      output_name = "all_samples_combined_assembly.fasta"
  }

  # Step 2: align the merged sequences against the reference.
  # mafft_one_chr appends "_aligned.fasta" to basename itself, so only the
  # bare prefix is passed here (output: all_samples_aligned.fasta).
  call nextstrain.mafft_one_chr as mafft {
    input:
      sequences = concatenate.combined,
      ref_fasta = ref_fasta,
      basename  = "all_samples"
  }

  # Step 3a: SNP calls from the unmasked alignment.
  call nextstrain.snp_sites {
    input:
      msa_fasta = mafft.aligned_sequences
  }

  # Step 3b: mask sites in the alignment.
  call nextstrain.augur_mask_sites {
    input:
      sequences = mafft.aligned_sequences
  }

  # Step 4: build the tree from the masked alignment.
  call nextstrain.draft_augur_tree {
    input:
      msa_or_vcf = augur_mask_sites.masked_sequences
  }

  output {
    File combined_assemblies = concatenate.combined
    File multiple_alignment  = mafft.aligned_sequences
    File unmasked_snps       = snp_sites.snps_vcf
    File masked_alignment    = augur_mask_sites.masked_sequences
    File ml_tree             = draft_augur_tree.aligned_tree
  }
}