Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add new workflow scaffold_and_refine_multitaxa #506

Merged
merged 18 commits into from
Feb 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,11 @@ workflows:
primaryDescriptorPath: /pipes/WDL/workflows/scaffold_and_refine.wdl
testParameterFiles:
- empty.json
- name: scaffold_and_refine_multitaxa
subclass: WDL
primaryDescriptorPath: /pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl
testParameterFiles:
- empty.json
- name: subsample_by_casecounts
subclass: WDL
primaryDescriptorPath: /pipes/WDL/workflows/subsample_by_casecounts.wdl
Expand Down
75 changes: 50 additions & 25 deletions pipes/WDL/tasks/tasks_assembly.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ task assemble {
String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt")

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-assemble:2.1.33.0"
String docker = "quay.io/broadinstitute/viral-assemble:2.2.4.0"
}
parameter_meta{
reads_unmapped_bam: {
Expand Down Expand Up @@ -113,6 +113,7 @@ task scaffold {
Float? min_length_fraction
Float? min_unambig
Int replace_length=55
Boolean allow_incomplete_output = false

Int? nucmer_max_gap
Int? nucmer_min_match
Expand All @@ -121,7 +122,7 @@ task scaffold {
Float? scaffold_min_pct_contig_aligned

Int? machine_mem_gb
String docker="quay.io/broadinstitute/viral-assemble:2.1.33.0"
String docker="quay.io/broadinstitute/viral-assemble:2.2.4.0"

# do this in multiple steps in case the input doesn't actually have "assembly1-x" in the name
String sample_name = basename(basename(contigs_fasta, ".fasta"), ".assembly1-spades")
Expand Down Expand Up @@ -168,7 +169,7 @@ task scaffold {
description: "When scaffolding contigs to the reference via nucmer, this specifies the -l parameter to nucmer (the minimal size of a maximal exact match). Our default is 10 (down from nucmer default of 20) to allow for more divergence.",
category: "advanced"
}
nucmer_min_cluster:{
nucmer_min_cluster:{
description: "When scaffolding contigs to the reference via nucmer, this specifies the -c parameter to nucmer (minimum cluster length). Our default is the nucmer default of 65 bp.",
category: "advanced"
}
Expand Down Expand Up @@ -214,6 +215,7 @@ task scaffold {
--outReference ~{sample_name}.scaffolding_chosen_ref.fasta \
--outStats ~{sample_name}.scaffolding_stats.txt \
--outAlternateContigs ~{sample_name}.scaffolding_alt_contigs.fasta \
~{true='--allow_incomplete_output' false="" allow_incomplete_output} \
--loglevel=DEBUG

grep '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | cut -c 2- | cut -f 1 -d ' ' > ~{sample_name}.scaffolding_chosen_refs.txt
Expand All @@ -226,8 +228,10 @@ task scaffold {
--maskErrors \
--loglevel=DEBUG

set +e +o pipefail
grep -v '^>' ~{sample_name}.intermediate_gapfill.fasta | tr -d '\n' | wc -c | tee assembly_preimpute_length
grep -v '^>' ~{sample_name}.intermediate_gapfill.fasta | tr -d '\nNn' | wc -c | tee assembly_preimpute_length_unambiguous
set -e -o pipefail

#Input assembly/contigs, FASTA, already ordered, oriented, and merged with the reference genome (FASTA)
assembly.py impute_from_reference \
Expand Down Expand Up @@ -428,7 +432,7 @@ task align_reads {

String aligner = "minimap2"
String? aligner_options
Boolean? skip_mark_dupes = false
Boolean skip_mark_dupes = false

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
Expand Down Expand Up @@ -566,7 +570,7 @@ task refine_assembly_with_aligned_reads {
Int min_coverage = 3

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-assemble:2.1.33.0"
String docker = "quay.io/broadinstitute/viral-assemble:2.2.4.0"
}

Int disk_size = 375
Expand Down Expand Up @@ -691,7 +695,7 @@ task refine_2x_and_plot {
String? plot_coverage_novoalign_options = "-r Random -l 40 -g 40 -x 20 -t 100 -k"

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-assemble:2.1.33.0"
String docker = "quay.io/broadinstitute/viral-assemble:2.2.4.0"

# do this in two steps in case the input doesn't actually have "cleaned" in the name
String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".cleaned")
Expand Down Expand Up @@ -852,8 +856,8 @@ task run_discordance {

read_utils.py --version | tee VERSION

# create 2-col table with read group ids in both cols
python3 <<CODE
# create 2-col table with read group ids in both cols
import tools.samtools
header = tools.samtools.SamtoolsTool().getHeader("~{reads_aligned_bam}")
rgids = [[x[3:] for x in h if x.startswith('ID:')][0] for h in header if h[0]=='@RG']
Expand All @@ -866,26 +870,47 @@ task run_discordance {
outf.write(str(n_rgs)+'\n')
with open('num_libraries', 'wt') as outf:
outf.write(str(n_lbs)+'\n')

# detect empty fasta situation and manually create empty VCF
import os.path
if (os.path.getsize('~{reference_fasta}') == 0):
sample_name = [[x[3:] for x in h if x.startswith('SM:')][0] for h in header if h[0]=='@RG'][0]
with open('everything.vcf', 'wt') as outf:
outf.write('##fileformat=VCFv4.3')
outf.write('##ALT=<ID=*,Description="Represents allele(s) other than observed.">')
outf.write('##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw read depth">')
outf.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">')
outf.write('##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Number of high-quality bases">')
outf.write('\t'.join(('#CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT',sample_name))+'\n')
CODE

# bcftools call snps while treating each RG as a separate sample
bcftools mpileup \
-G readgroups.txt -d 10000 -a "FORMAT/DP,FORMAT/AD" \
-q 1 -m 2 -Ou \
-f "~{reference_fasta}" "~{reads_aligned_bam}" \
| bcftools call \
-P 0 -m --ploidy 1 \
--threads $(nproc) \
-Ov -o everything.vcf

# mask all GT calls when less than 3 reads
cat everything.vcf | bcftools filter -e "FMT/DP<~{min_coverage}" -S . > filtered.vcf
cat filtered.vcf | bcftools filter -i "MAC>0" > "~{out_basename}.discordant.vcf"

# tally outputs
bcftools filter -i 'MAC=0' filtered.vcf | bcftools query -f '%POS\n' | wc -l | tee num_concordant
bcftools filter -i 'TYPE="snp"' "~{out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_snps
bcftools filter -i 'TYPE!="snp"' "~{out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_indels
if [ ! -f everything.vcf ]; then
# bcftools call snps while treating each RG as a separate sample
bcftools mpileup \
-G readgroups.txt -d 10000 -a "FORMAT/DP,FORMAT/AD" \
tomkinsc marked this conversation as resolved.
Show resolved Hide resolved
-q 1 -m 2 -Ou \
-f "~{reference_fasta}" "~{reads_aligned_bam}" \
| bcftools call \
-P 0 -m --ploidy 1 \
--threads $(nproc) \
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe reserve a core for the system, writes, etc. ($(nproc --ignore=1))?

-Ov -o everything.vcf

# mask all GT calls when less than 3 reads
cat everything.vcf | bcftools filter -e "FMT/DP<~{min_coverage}" -S . > filtered.vcf
cat filtered.vcf | bcftools filter -i "MAC>0" > "~{out_basename}.discordant.vcf"

# tally outputs
bcftools filter -i 'MAC=0' filtered.vcf | bcftools query -f '%POS\n' | wc -l | tee num_concordant
bcftools filter -i 'TYPE="snp"' "~{out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_snps
bcftools filter -i 'TYPE!="snp"' "~{out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_indels

else
# handle empty case
cp everything.vcf "~{out_basename}.discordant.vcf"
echo 0 > num_concordant
echo 0 > num_discordant_snps
echo 0 > num_discordant_indels
fi
}

output {
Expand Down
55 changes: 30 additions & 25 deletions pipes/WDL/tasks/tasks_reports.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ task alignment_metrics {
File? primers_bed
String? amplicon_set
Int? min_coverage
Int? max_amp_len=5000
Int? max_amplicons=500
Int max_amp_len=5000
Int max_amplicons=500

Int? machine_mem_gb
Int machine_mem_gb=13
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
}

Expand All @@ -31,25 +31,30 @@ task alignment_metrics {
cp "~{ref_fasta}" reference.fasta
picard $XMX CreateSequenceDictionary -R reference.fasta

# get Picard metrics and clean up the junky outputs
picard $XMX CollectRawWgsMetrics \
-R reference.fasta \
-I "~{aligned_bam}" \
-O picard_raw.raw_wgs_metrics.txt
grep -v \# picard_raw.raw_wgs_metrics.txt | grep . | head -2 > picard_clean.raw_wgs_metrics.txt

picard $XMX CollectAlignmentSummaryMetrics \
-R reference.fasta \
-I "~{aligned_bam}" \
-O picard_raw.alignment_metrics.txt
grep -v \# picard_raw.alignment_metrics.txt | grep . | head -4 > picard_clean.alignment_metrics.txt

picard $XMX CollectInsertSizeMetrics \
-I "~{aligned_bam}" \
-O picard_raw.insert_size_metrics.txt \
-H picard_raw.insert_size_metrics.pdf \
--INCLUDE_DUPLICATES true
grep -v \# picard_raw.insert_size_metrics.txt | grep . | head -2 > picard_clean.insert_size_metrics.txt
if [ -s "~{ref_fasta}" ]; then
# get Picard metrics and clean up the junky outputs
picard $XMX CollectRawWgsMetrics \
-R reference.fasta \
-I "~{aligned_bam}" \
-O picard_raw.raw_wgs_metrics.txt
grep -v \# picard_raw.raw_wgs_metrics.txt | grep . | head -2 > picard_clean.raw_wgs_metrics.txt

picard $XMX CollectAlignmentSummaryMetrics \
-R reference.fasta \
-I "~{aligned_bam}" \
-O picard_raw.alignment_metrics.txt
grep -v \# picard_raw.alignment_metrics.txt | grep . | head -4 > picard_clean.alignment_metrics.txt

picard $XMX CollectInsertSizeMetrics \
-I "~{aligned_bam}" \
-O picard_raw.insert_size_metrics.txt \
-H picard_raw.insert_size_metrics.pdf \
--INCLUDE_DUPLICATES true
grep -v \# picard_raw.insert_size_metrics.txt | grep . | head -2 > picard_clean.insert_size_metrics.txt
else
# ref_fasta is empty -> Picard will fail
touch picard_clean.raw_wgs_metrics.txt picard_clean.alignment_metrics.txt picard_clean.insert_size_metrics.txt
fi

# prepend the sample name in order to facilitate tsv joining later
SAMPLE=$(samtools view -H "~{aligned_bam}" | grep ^@RG | perl -lape 's/^@RG.*SM:(\S+).*$/$1/' | sort | uniq)
Expand Down Expand Up @@ -100,8 +105,8 @@ task alignment_metrics {
}

runtime {
docker: "~{docker}"
memory: select_first([machine_mem_gb, 13]) + " GB"
docker: docker
memory: machine_mem_gb + " GB"
cpu: 2
disks: "local-disk " + disk_size + " HDD"
disk: disk_size + " GB" # TES
Expand Down Expand Up @@ -630,7 +635,7 @@ task compare_two_genomes {
File genome_two
String out_basename

String docker = "quay.io/broadinstitute/viral-assemble:2.1.33.0"
String docker = "quay.io/broadinstitute/viral-assemble:2.2.4.0"
}

Int disk_size = 50
Expand Down
1 change: 1 addition & 0 deletions pipes/WDL/workflows/scaffold_and_refine.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ workflow scaffold_and_refine {
Int read_pairs_aligned = refine.align_to_self_merged_read_pairs_aligned
Float bases_aligned = refine.align_to_self_merged_bases_aligned

String assembly_method = "viral-ngs/scaffold_and_refine"
String scaffold_viral_assemble_version = scaffold.viralngs_version
String refine_viral_assemble_version = refine.viral_assemble_version
}
Expand Down
122 changes: 122 additions & 0 deletions pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
version 1.0

import "../tasks/tasks_assembly.wdl" as assembly
import "../tasks/tasks_ncbi.wdl" as ncbi
import "../tasks/tasks_utils.wdl" as utils
import "assemble_refbased.wdl" as assemble_refbased

workflow scaffold_and_refine_multitaxa {
meta {
description: "Scaffold de novo contigs against a set of possible references and subsequently polish with reads."
author: "Broad Viral Genomics"
email: "viral-ngs@broadinstitute.org"
allowNestedInputs: true
}

input {
String sample_id
File reads_unmapped_bam

Array[Pair[Int,Array[String]+]] taxid_to_ref_accessions = [
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about HAdV?
There are quite a few, but if we do want to include them:

            # HAdV reference genomes, via https://www.ncbi.nlm.nih.gov/genomes/GenomesGroup.cgi?taxid=10509&host=human
            (129875,  ["NC_001460.1"]),  # Human mastadenovirus A strain:Huie; serotype:Human adenovirus 12; culture-collection:ATCC:VR-863
            (108098,  ["NC_011203.1"]),  # Human adenovirus B1
            (108098,  ["NC_011202.1"]),  # Human adenovirus B2
            (129951,  ["NC_001405.1"]),  # Human mastadenovirus C serotype:Human adenovirus 2
            (130310,  ["NC_010956.1"]),  # Human mastadenovirus D strain:Hicks; NIAID V-209-003-014; serotype:Human adenovirus 9
            (130308,  ["NC_003266.2"]),  # Human mastadenovirus E strain:vaccine (CL 68578); serotype:human adenovirus 4
            (130309,  ["NC_001454.1"]),  # Human mastadenovirus F strain:Dugan; serotype:Human adenovirus 40
            (310540,  ["NC_006879.1"]),  # Simian adenovirus 1 strain:ATCC VR-195
            (1123958, ["NC_017825.1"]),  # Chimpanzee adenovirus Y25

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll get some proposed edits to this default list from Jillian soon

(208893, ["KY654518.1"]), # RSV-A
(208895, ["MZ516105.1"]), # RSV-B
(573824, ["NC_038311.1"]), # Rhino A1
(185900, ["ON311191.1"]), # Rhino B27
(1418033, ["ON311169.1"]), # Rhino C15
(463676, ["JN837686.2"]), # Rhino C45
(11137, ["NC_002645.1"]), # HCoV 229E
(290028, ["NC_006577.2"]), # HCoV HKU1
(277944, ["NC_005831.2"]), # HCoV NL63
(31631, ["NC_006213.1"]), # HCoV OC43
(2697049, ["NC_045512.2"]), # SARS-CoV-2 Wuhan Hu-1
(641809, ["NC_026438.1", "NC_026435.1", "NC_026437.1", "NC_026433.1", "NC_026436.1", "NC_026434.1", "NC_026431.1", "NC_026432.1"]), # Flu A/California/07/2009 H1N1
(335341, ["NC_007373.1", "NC_007372.1", "NC_007371.1", "NC_007366.1", "NC_007369.1", "NC_007368.1", "NC_007367.1", "NC_007370.1"]), # Flu A/New York/392/2004 H3N2
(518987, ["NC_002204.1", "NC_002205.1", "NC_002206.1", "NC_002207.1", "NC_002208.1", "NC_002209.1", "NC_002210.1", "NC_002211.1"]), # Flu B/Lee/1940
(162145, ["NC_039199.1"]), # metapneumo
(12730, ["NC_003461.1"]), # paraflu 1
(2560525, ["NC_003443.1"]), # paraflu 2
(11216, ["NC_001796.2"]), # paraflu 3
(11224, ["NC_021928.1"]), # paraflu 4
(129951, ["NC_001405.1"]) # adenovirus C
]

# Float min_pct_reference_covered = 0.1
}

Array[String] assembly_header = ["sample_id", "taxid", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "percent_reference_covered", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned"]

scatter(taxon in taxid_to_ref_accessions) {
call ncbi.download_annotations {
input:
accessions = taxon.right,
combined_out_prefix = taxon.left
}
call assembly.scaffold {
input:
reads_bam = reads_unmapped_bam,
reference_genome_fasta = [download_annotations.combined_fasta],
min_length_fraction = 0,
min_unambig = 0,
allow_incomplete_output = true
}
call assemble_refbased.assemble_refbased as refine {
input:
reads_unmapped_bams = [reads_unmapped_bam],
reference_fasta = scaffold.scaffold_fasta,
sample_name = sample_id
}
# to do: if percent_reference_covered > some threshold, run ncbi.rename_fasta_header and ncbi.align_and_annot_transfer_single
# to do: if biosample attributes file provided, run ncbi.biosample_to_genbank

Map[String, String] stats_by_taxon = {
"sample_id" : sample_id,
"taxid" : taxon.left,

"assembly_fasta" : refine.assembly_fasta,
"aligned_only_reads_bam" : refine.align_to_self_merged_aligned_only_bam,
"coverage_plot" : refine.align_to_self_merged_coverage_plot,
"assembly_length" : refine.assembly_length,
"assembly_length_unambiguous" : refine.assembly_length_unambiguous,
"reads_aligned" : refine.align_to_self_merged_reads_aligned,
"mean_coverage" : refine.align_to_self_merged_mean_coverage,
"percent_reference_covered" : 1.0 * refine.assembly_length_unambiguous / refine.reference_genome_length,

"intermediate_gapfill_fasta" : scaffold.intermediate_gapfill_fasta,
"assembly_preimpute_length_unambiguous" : scaffold.assembly_preimpute_length_unambiguous,

"replicate_concordant_sites" : refine.replicate_concordant_sites,
"replicate_discordant_snps" : refine.replicate_discordant_snps,
"replicate_discordant_indels" : refine.replicate_discordant_indels,
"replicate_discordant_vcf" : refine.replicate_discordant_vcf,

"isnvsFile" : refine.align_to_self_isnvs_vcf,
"aligned_bam" : refine.align_to_self_merged_aligned_only_bam,
"coverage_tsv" : refine.align_to_self_merged_coverage_tsv,
"read_pairs_aligned" : refine.align_to_self_merged_read_pairs_aligned,
"bases_aligned" : refine.align_to_self_merged_bases_aligned
}

scatter(h in assembly_header) {
tomkinsc marked this conversation as resolved.
Show resolved Hide resolved
String stat_by_taxon = stats_by_taxon[h]
}
}

### summary stats
call utils.concatenate {
input:
infiles = [write_tsv([assembly_header]), write_tsv(stat_by_taxon)],
output_name = "assembly_metadata-~{sample_id}.tsv"
}

output {
Array[Map[String,String]] assembly_stats_by_taxon = stats_by_taxon
File assembly_stats_by_taxon_tsv = concatenate.combined

Int num_read_groups = refine.num_read_groups[0]
Int num_libraries = refine.num_libraries[0]

String assembly_method = "viral-ngs/scaffold_and_refine_multitaxa"
String scaffold_viral_assemble_version = scaffold.viralngs_version[0]
String refine_viral_assemble_version = refine.viral_assemble_version[0]
}
}
2 changes: 1 addition & 1 deletion requirements-modules.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
broadinstitute/viral-core=2.2.4
broadinstitute/viral-assemble=2.1.33.0
broadinstitute/viral-assemble=2.2.4.0
broadinstitute/viral-classify=2.2.3.0
broadinstitute/viral-phylo=2.1.20.2
broadinstitute/py3-bio=0.1.2
Expand Down
Loading