From 7064708f27a0268943b93c8ad49e2d5407cd1e3c Mon Sep 17 00:00:00 2001 From: Danny Park Date: Mon, 5 Feb 2024 16:22:28 -0500 Subject: [PATCH 01/20] defend against rather common empty output scenario --- pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl index eac47dc1f..9c8ab9f01 100644 --- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -79,7 +79,6 @@ workflow scaffold_and_refine_multitaxa { "assembly_length_unambiguous" : refine.assembly_length_unambiguous, "reads_aligned" : refine.align_to_self_merged_reads_aligned, "mean_coverage" : refine.align_to_self_merged_mean_coverage, - "percent_reference_covered" : 1.0 * refine.assembly_length_unambiguous / refine.reference_genome_length, "intermediate_gapfill_fasta" : scaffold.intermediate_gapfill_fasta, "assembly_preimpute_length_unambiguous" : scaffold.assembly_preimpute_length_unambiguous, @@ -96,6 +95,13 @@ workflow scaffold_and_refine_multitaxa { "bases_aligned" : refine.align_to_self_merged_bases_aligned } + if (refine.reference_genome_length > 0) { + stats_by_taxon["percent_reference_covered"] = 1.0 * refine.assembly_length_unambiguous / refine.reference_genome_length + } + if (refine.reference_genome_length <= 0) { + stats_by_taxon["percent_reference_covered"] = 0.0 + } + scatter(h in assembly_header) { String stat_by_taxon = stats_by_taxon[h] } From 059308345222cc73be01a5312269a91ad577d01f Mon Sep 17 00:00:00 2001 From: Danny Park Date: Mon, 5 Feb 2024 16:34:09 -0500 Subject: [PATCH 02/20] more compliant wdl --- pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl index 9c8ab9f01..7ab4d3476 100644 --- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -68,6 +68,10 @@ workflow scaffold_and_refine_multitaxa { # to do: if percent_reference_covered > some threshold, run ncbi.rename_fasta_header and ncbi.align_and_annot_transfer_single # to do: if biosample attributes file provided, run ncbi.biosample_to_genbank + if (refine.reference_genome_length > 0) { + Float percent_reference_covered = 1.0 * refine.assembly_length_unambiguous / refine.reference_genome_length + } + Map[String, String] stats_by_taxon = { "sample_id" : sample_id, "taxid" : taxon.left, @@ -79,6 +83,7 @@ workflow scaffold_and_refine_multitaxa { "assembly_length_unambiguous" : refine.assembly_length_unambiguous, "reads_aligned" : refine.align_to_self_merged_reads_aligned, "mean_coverage" : refine.align_to_self_merged_mean_coverage, + "percent_reference_covered" : select_first([percent_reference_covered, 0.0]), "intermediate_gapfill_fasta" : scaffold.intermediate_gapfill_fasta, "assembly_preimpute_length_unambiguous" : scaffold.assembly_preimpute_length_unambiguous, @@ -95,12 +100,6 @@ workflow scaffold_and_refine_multitaxa { "bases_aligned" : refine.align_to_self_merged_bases_aligned } - if (refine.reference_genome_length > 0) { - stats_by_taxon["percent_reference_covered"] = 1.0 * refine.assembly_length_unambiguous / refine.reference_genome_length - } - if (refine.reference_genome_length <= 0) { - stats_by_taxon["percent_reference_covered"] = 0.0 - } scatter(h in assembly_header) { String stat_by_taxon = stats_by_taxon[h] From 0bedc963357089db2895c25c7e0c79f4e238adf1 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Wed, 7 Feb 2024 15:36:14 -0500 Subject: [PATCH 03/20] add new wdl task report_primary_kraken_taxa --- pipes/WDL/tasks/tasks_metagenomics.wdl | 46 ++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/pipes/WDL/tasks/tasks_metagenomics.wdl b/pipes/WDL/tasks/tasks_metagenomics.wdl index 58354426e..414abbcb1 100644 --- a/pipes/WDL/tasks/tasks_metagenomics.wdl +++ b/pipes/WDL/tasks/tasks_metagenomics.wdl @@ -326,6 +326,52 @@ task kraken2 { } } +task report_primary_kraken_taxa { + meta { + description: "Interprets a kraken (or kraken2 or krakenuniq) summary report file and emits the primary contributing taxa under a focal taxon of interest." + } + input { + File kraken_summary_report + String focal_taxon = "Viruses" + + String docker = "quay.io/broadinstitute/viral-classify:dp-ksummary" #skip-global-version-pin + } + String out_basename = basename(kraken_summary_report, '.txt') + Int disk_size = 50 + Int machine_mem_gb = 2 + + command <<< + set -e + metagenomics.py taxlevel_plurality "~{kraken_summary_report}" "~{focal_taxon}" "~{out_basename}.ranked_focal_report.tsv" + cat "~{out_basename}.ranked_focal_report.tsv" | head -2 | tail +2 > TOPROW + cut -f 4 TOPROW > PCT_OF_FOCAL + cut -f 7 TOPROW > NUM_READS + cut -f 8 TOPROW > TAX_RANK + cut -f 9 TOPROW > TAX_ID + cut -f 10 TOPROW > TAX_NAME + >>> + + output { + File ranked_focal_report = "~{out_basename}.ranked_focal_report.tsv" + Float percent_of_focal = read_float("PCT_OF_FOCAL") + Int num_reads = read_int("NUM_READS") + String tax_rank = read_string("TAX_RANK") + String tax_id = read_string("TAX_ID") + String tax_name = read_string("TAX_NAME") + } + + runtime { + docker: docker + memory: machine_mem_gb + " GB" + cpu: 1 + disks: "local-disk " + disk_size + " LOCAL" + disk: disk_size + " GB" # TESs + dx_instance_type: "mem1_ssd1_v2_x2" + preemptible: 2 + maxRetries: 2 + } +} + task build_kraken2_db { meta { description: "Builds a custom kraken2 database. Outputs tar.zst tarballs of kraken2 database, associated krona taxonomy db, and an ncbi taxdump.tar.gz. Requires live internet access if any standard_libraries are specified or if taxonomy_db_tgz is absent." From 98f9bbd7852d5f4cbde606fd4bb983be5bec000f Mon Sep 17 00:00:00 2001 From: Danny Park Date: Wed, 7 Feb 2024 16:28:19 -0500 Subject: [PATCH 04/20] add report_primary_kraken_taxa wdl task and add to classify_single --- pipes/WDL/workflows/classify_single.wdl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pipes/WDL/workflows/classify_single.wdl b/pipes/WDL/workflows/classify_single.wdl index 1bfd793ae..9f1168c1b 100644 --- a/pipes/WDL/workflows/classify_single.wdl +++ b/pipes/WDL/workflows/classify_single.wdl @@ -121,6 +121,10 @@ workflow classify_single { trim_clip_db = trim_clip_db, always_succeed = true } + call metagenomics.report_primary_kraken_taxa { + input: + kraken_summary_report = kraken2.kraken2_summary_report + } output { File cleaned_reads_unaligned_bam = deplete.bam_filtered_to_taxa @@ -134,6 +138,13 @@ workflow classify_single { File kraken2_summary_report = kraken2.kraken2_summary_report File kraken2_krona_plot = kraken2.krona_report_html + File kraken2_top_taxa_report = report_primary_kraken_taxa.ranked_focal_report + String kraken2_top_taxon_id = report_primary_kraken_taxa.tax_id + String kraken2_top_taxon_name = report_primary_kraken_taxa.tax_name + String kraken2_top_taxon_rank = report_primary_kraken_taxa.tax_rank + Int kraken2_top_taxon_num_reads = report_primary_kraken_taxa.num_reads + Float kraken2_top_taxon_pct_of_focal = report_primary_kraken_taxa.percent_of_focal + File raw_fastqc = merge_raw_reads.fastqc File cleaned_fastqc = fastqc_cleaned.fastqc_html File spikein_report = spikein.report From e39919b0a360436fa8fb8107b261a73cc013f4ba Mon Sep 17 00:00:00 2001 From: Danny Park Date: Wed, 7 Feb 2024 17:03:12 -0500 Subject: [PATCH 05/20] add a few more outputs --- pipes/WDL/tasks/tasks_metagenomics.wdl | 3 +++ pipes/WDL/workflows/classify_single.wdl | 2 ++ 2 files changed, 5 insertions(+) diff --git a/pipes/WDL/tasks/tasks_metagenomics.wdl b/pipes/WDL/tasks/tasks_metagenomics.wdl index 414abbcb1..457a75819 100644 --- a/pipes/WDL/tasks/tasks_metagenomics.wdl +++ b/pipes/WDL/tasks/tasks_metagenomics.wdl @@ -344,6 +344,7 @@ task report_primary_kraken_taxa { set -e metagenomics.py taxlevel_plurality "~{kraken_summary_report}" "~{focal_taxon}" "~{out_basename}.ranked_focal_report.tsv" cat "~{out_basename}.ranked_focal_report.tsv" | head -2 | tail +2 > TOPROW + cut -f 2 TOPROW > NUM_FOCAL cut -f 4 TOPROW > PCT_OF_FOCAL cut -f 7 TOPROW > NUM_READS cut -f 8 TOPROW > TAX_RANK @@ -352,7 +353,9 @@ task report_primary_kraken_taxa { >>> output { + String focal_tax_name = focal_taxon File ranked_focal_report = "~{out_basename}.ranked_focal_report.tsv" + Int total_focal_reads = read_int("NUM_FOCAL") Float percent_of_focal = read_float("PCT_OF_FOCAL") Int num_reads = read_int("NUM_READS") String tax_rank = read_string("TAX_RANK") diff --git a/pipes/WDL/workflows/classify_single.wdl b/pipes/WDL/workflows/classify_single.wdl index 9f1168c1b..bea728228 100644 --- a/pipes/WDL/workflows/classify_single.wdl +++ b/pipes/WDL/workflows/classify_single.wdl @@ -139,6 +139,8 @@ workflow classify_single { File kraken2_summary_report = kraken2.kraken2_summary_report File kraken2_krona_plot = kraken2.krona_report_html File kraken2_top_taxa_report = report_primary_kraken_taxa.ranked_focal_report + String kraken2_focal_taxon_name = report_primary_kraken_taxa.focal_tax_name + Int kraken2_focal_total_reads = report_primary_kraken_taxa.total_focal_reads String kraken2_top_taxon_id = report_primary_kraken_taxa.tax_id String kraken2_top_taxon_name = report_primary_kraken_taxa.tax_name String kraken2_top_taxon_rank = report_primary_kraken_taxa.tax_rank From 9e12088342828cb13aa07835594898a5280852b3 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 8 Feb 2024 15:34:04 -0500 Subject: [PATCH 06/20] try wdl 1.1 and see what happens --- github_actions_ci/install-wdl.sh | 4 +- .../scaffold_and_refine_multitaxa.wdl | 58 ++++++++++--------- 2 files changed, 33 insertions(+), 29 deletions(-) diff --git a/github_actions_ci/install-wdl.sh b/github_actions_ci/install-wdl.sh index 5a7bf3fc1..092a8cd16 100755 --- a/github_actions_ci/install-wdl.sh +++ b/github_actions_ci/install-wdl.sh @@ -12,8 +12,8 @@ fetch_jar_from_github () { ln -s $_jar_fname $_tool_name.jar } -fetch_jar_from_github broadinstitute cromwell womtool 61 -fetch_jar_from_github broadinstitute cromwell cromwell 61 +fetch_jar_from_github broadinstitute cromwell womtool 86 +fetch_jar_from_github broadinstitute cromwell cromwell 86 fetch_jar_from_github dnanexus dxWDL dxWDL v1.50 TGZ=dx-toolkit-v0.311.0-ubuntu-20.04-amd64.tar.gz diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl index 7ab4d3476..06de459d9 100644 --- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -1,4 +1,4 @@ -version 1.0 +version 1.1 import "../tasks/tasks_assembly.wdl" as assembly import "../tasks/tasks_ncbi.wdl" as ncbi @@ -17,39 +17,43 @@ workflow scaffold_and_refine_multitaxa { String sample_id File reads_unmapped_bam - Array[Pair[Int,Array[String]+]] taxid_to_ref_accessions = [ - (208893, ["KY654518.1"]), # RSV-A - (208895, ["MZ516105.1"]), # RSV-B - (573824, ["NC_038311.1"]), # Rhino A1 - (185900, ["ON311191.1"]), # Rhino B27 - (1418033, ["ON311169.1"]), # Rhino C15 - (463676, ["JN837686.2"]), # Rhino C45 - (11137, ["NC_002645.1"]), # HCoV 229E - (290028, ["NC_006577.2"]), # HCoV HKU1 - (277944, ["NC_005831.2"]), # HCoV NL63 - (31631, ["NC_006213.1"]), # HCoV OC43 - (2697049, ["NC_045512.2"]), # SARS-CoV-2 Wuhan Hu-1 - (641809, ["NC_026438.1", "NC_026435.1", "NC_026437.1", "NC_026433.1", "NC_026436.1", "NC_026434.1", "NC_026431.1", "NC_026432.1"]), # Flu A/California/07/2009 H1N1 - (335341, ["NC_007373.1", "NC_007372.1", "NC_007371.1", "NC_007366.1", "NC_007369.1", "NC_007368.1", "NC_007367.1", "NC_007370.1"]), # Flu A/New York/392/2004 H3N2 - (518987, ["NC_002204.1", "NC_002205.1", "NC_002206.1", "NC_002207.1", "NC_002208.1", "NC_002209.1", "NC_002210.1", "NC_002211.1"]), # Flu B/Lee/1940 - (162145, ["NC_039199.1"]), # metapneumo - (12730, ["NC_003461.1"]), # paraflu 1 - (2560525, ["NC_003443.1"]), # paraflu 2 - (11216, ["NC_001796.2"]), # paraflu 3 - (11224, ["NC_021928.1"]), # paraflu 4 - (129951, ["NC_001405.1"]) # adenovirus C - ] + #File taxid_to_ref_accessions_json + Map[Int,Array[String]+] taxid_to_ref_accessions = { + 208893: ["KY654518.1"], # RSV-A + 208895: ["MZ516105.1"], # RSV-B + 573824: ["NC_038311.1"], # Rhino A1 + 185900: ["ON311191.1"], # Rhino B27 + 1418033: ["ON311169.1"], # Rhino C15 + 463676: ["JN837686.2"], # Rhino C45 + 11137: ["NC_002645.1"], # HCoV 229E + 290028: ["NC_006577.2"], # HCoV HKU1 + 277944: ["NC_005831.2"], # HCoV NL63 + 31631: ["NC_006213.1"], # HCoV OC43 + 2697049: ["NC_045512.2"], # SARS-CoV-2 Wuhan Hu-1 + 641809: ["NC_026438.1", "NC_026435.1", "NC_026437.1", "NC_026433.1", "NC_026436.1", "NC_026434.1", "NC_026431.1", "NC_026432.1"], # Flu A/California/07/2009 H1N1 + 335341: ["NC_007373.1", "NC_007372.1", "NC_007371.1", "NC_007366.1", "NC_007369.1", "NC_007368.1", "NC_007367.1", "NC_007370.1"], # Flu A/New York/392/2004 H3N2 + 518987: ["NC_002204.1", "NC_002205.1", "NC_002206.1", "NC_002207.1", "NC_002208.1", "NC_002209.1", "NC_002210.1", "NC_002211.1"], # Flu B/Lee/1940 + 162145: ["NC_039199.1"], # metapneumo + 12730: ["NC_003461.1"], # paraflu 1 + 2560525: ["NC_003443.1"], # paraflu 2 + 11216: ["NC_001796.2"], # paraflu 3 + 11224: ["NC_021928.1"], # paraflu 4 + 565302: ["NC_011203.1"], # adenovirus B1 + 565303: ["NC_011202.1"], # adenovirus B2 + 129951: ["NC_001405.1"] # adenovirus C + } # Float min_pct_reference_covered = 0.1 } + #Array[Pair[Int,Array[String]+]] taxid_to_ref_accessions = read_json(taxid_to_ref_accessions_json) Array[String] assembly_header = ["sample_id", "taxid", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "percent_reference_covered", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned"] - scatter(taxon in taxid_to_ref_accessions) { + scatter(taxid in keys(taxid_to_ref_accessions)) { call ncbi.download_annotations { input: - accessions = taxon.right, - combined_out_prefix = taxon.left + accessions = taxid_to_ref_accessions[taxid], + combined_out_prefix = taxid } call assembly.scaffold { input: @@ -74,7 +78,7 @@ workflow scaffold_and_refine_multitaxa { Map[String, String] stats_by_taxon = { "sample_id" : sample_id, - "taxid" : taxon.left, + "taxid" : taxid, "assembly_fasta" : refine.assembly_fasta, "aligned_only_reads_bam" : refine.align_to_self_merged_aligned_only_bam, From 02cf671893bac41bf60257b9afb98b0de3bb8364 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 8 Feb 2024 15:38:52 -0500 Subject: [PATCH 07/20] try wdl development and see what happens --- pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl index 06de459d9..849f88f98 100644 --- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -1,4 +1,4 @@ -version 1.1 +version development import "../tasks/tasks_assembly.wdl" as assembly import "../tasks/tasks_ncbi.wdl" as ncbi From d824518413d773276b07f8e7e1ddb3ba632b87c7 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 8 Feb 2024 16:34:53 -0500 Subject: [PATCH 08/20] update to take tsv instead of json input for reference/tax map --- pipes/WDL/tasks/tasks_utils.wdl | 29 ++++++++++++ .../scaffold_and_refine_multitaxa.wdl | 47 ++++++------------- 2 files changed, 44 insertions(+), 32 deletions(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index cbe618a1e..bbef0044c 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -712,6 +712,35 @@ task s3_copy { } } +task string_split { + meta { + description: "split a string by a delimiter" + } + input { + String joined_string + String delimiter + } + command <<< + set -e + python3<>> + output { + Array[String] tokens = read_lines("TOKENS") + } + runtime { + docker: "python:slim" + memory: "1 GB" + cpu: 1 + disks: "local-disk 50 SSD" + disk: "50 GB" # TES + maxRetries: 2 + } +} + task filter_sequences_by_length { meta { description: "Filter sequences in a fasta file to enforce a minimum count of non-N bases." diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl index 849f88f98..b613d7ce4 100644 --- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -1,4 +1,4 @@ -version development +version 1.0 import "../tasks/tasks_assembly.wdl" as assembly import "../tasks/tasks_ncbi.wdl" as ncbi @@ -17,43 +17,25 @@ workflow scaffold_and_refine_multitaxa { String sample_id File reads_unmapped_bam - #File taxid_to_ref_accessions_json - Map[Int,Array[String]+] taxid_to_ref_accessions = { - 208893: ["KY654518.1"], # RSV-A - 208895: ["MZ516105.1"], # RSV-B - 573824: ["NC_038311.1"], # Rhino A1 - 185900: ["ON311191.1"], # Rhino B27 - 1418033: ["ON311169.1"], # Rhino C15 - 463676: ["JN837686.2"], # Rhino C45 - 11137: ["NC_002645.1"], # HCoV 229E - 290028: ["NC_006577.2"], # HCoV HKU1 - 277944: ["NC_005831.2"], # HCoV NL63 - 31631: ["NC_006213.1"], # HCoV OC43 - 2697049: ["NC_045512.2"], # SARS-CoV-2 Wuhan Hu-1 - 641809: ["NC_026438.1", "NC_026435.1", "NC_026437.1", "NC_026433.1", "NC_026436.1", "NC_026434.1", "NC_026431.1", "NC_026432.1"], # Flu A/California/07/2009 H1N1 - 335341: ["NC_007373.1", "NC_007372.1", "NC_007371.1", "NC_007366.1", "NC_007369.1", "NC_007368.1", "NC_007367.1", "NC_007370.1"], # Flu A/New York/392/2004 H3N2 - 518987: ["NC_002204.1", "NC_002205.1", "NC_002206.1", "NC_002207.1", "NC_002208.1", "NC_002209.1", "NC_002210.1", "NC_002211.1"], # Flu B/Lee/1940 - 162145: ["NC_039199.1"], # metapneumo - 12730: ["NC_003461.1"], # paraflu 1 - 2560525: ["NC_003443.1"], # paraflu 2 - 11216: ["NC_001796.2"], # paraflu 3 - 11224: ["NC_021928.1"], # paraflu 4 - 565302: ["NC_011203.1"], # adenovirus B1 - 565303: ["NC_011202.1"], # adenovirus B2 - 129951: ["NC_001405.1"] # adenovirus C - } + File taxid_to_ref_accessions_tsv # Float min_pct_reference_covered = 0.1 } - #Array[Pair[Int,Array[String]+]] taxid_to_ref_accessions = read_json(taxid_to_ref_accessions_json) - Array[String] assembly_header = ["sample_id", "taxid", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "percent_reference_covered", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned"] + Array[Array[String]] taxid_to_ref_accessions = read_tsv(taxid_to_ref_accessions_tsv) + Array[String] assembly_header = ["sample_id", "taxid", "tax_name", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "percent_reference_covered", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned"] - scatter(taxid in keys(taxid_to_ref_accessions)) { + scatter(taxon in taxid_to_ref_accessions) { + # taxon = [taxid, taxname, semicolon_delim_accession_list] + call utils.string_split { + input: + joined_string = taxon[2], + delimiter = ";" + } call ncbi.download_annotations { input: - accessions = taxid_to_ref_accessions[taxid], - combined_out_prefix = taxid + accessions = string_split.tokens, + combined_out_prefix = taxon[0] } call assembly.scaffold { input: @@ -78,7 +60,8 @@ workflow scaffold_and_refine_multitaxa { Map[String, String] stats_by_taxon = { "sample_id" : sample_id, - "taxid" : taxid, + "taxid" : taxon[0], + "tax_name" : taxon[1], "assembly_fasta" : refine.assembly_fasta, "aligned_only_reads_bam" : refine.align_to_self_merged_aligned_only_bam, From fa07252ac73f3f03e4d48a8f13e52865a76b51c2 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Tue, 13 Feb 2024 10:02:14 -0500 Subject: [PATCH 09/20] attempt to not fail in scaffolding when some but not all segments of multi-segment genome are recovered --- pipes/WDL/tasks/tasks_assembly.wdl | 34 ++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index d2c45fb6b..7120cc702 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -231,19 +231,29 @@ task scaffold { set +e +o pipefail grep -v '^>' ~{sample_name}.intermediate_gapfill.fasta | tr -d '\n' | wc -c | tee assembly_preimpute_length grep -v '^>' ~{sample_name}.intermediate_gapfill.fasta | tr -d '\nNn' | wc -c | tee assembly_preimpute_length_unambiguous + grep '^>' ~{sample_name}.intermediate_gapfill.fasta | wc -l | tee assembly_num_segments_recovered + grep '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | wc -l | tee reference_num_segments_required set -e -o pipefail - #Input assembly/contigs, FASTA, already ordered oriented and merged with the reference gneome (FASTA) - assembly.py impute_from_reference \ - ~{sample_name}.intermediate_gapfill.fasta \ - ~{sample_name}.scaffolding_chosen_ref.fasta \ - ~{sample_name}.scaffolded_imputed.fasta \ - --newName ~{sample_name} \ - ~{'--replaceLength=' + replace_length} \ - ~{'--minLengthFraction=' + min_length_fraction} \ - ~{'--minUnambig=' + min_unambig} \ - ~{'--aligner=' + aligner} \ - --loglevel=DEBUG + if [[ ~{true='1' false='0' allow_incomplete_output} && (cmp -s assembly_num_segments_recovered reference_num_segments_required) ]]; then + # draft assembly does not have enough segments--and that's okay + file_utils.py rename_fasta_sequences \ + ~{sample_name}.intermediate_gapfill.fasta \ + ~{sample_name}.scaffolded_imputed.fasta \ + "~{sample_name}" --suffix_always --loglevel=DEBUG + else + # draft assembly must have the right number of segments (fail if not) + assembly.py impute_from_reference \ + ~{sample_name}.intermediate_gapfill.fasta \ + ~{sample_name}.scaffolding_chosen_ref.fasta \ + ~{sample_name}.scaffolded_imputed.fasta \ + --newName ~{sample_name} \ + ~{'--replaceLength=' + replace_length} \ + ~{'--minLengthFraction=' + min_length_fraction} \ + ~{'--minUnambig=' + min_unambig} \ + ~{'--aligner=' + aligner} \ + --loglevel=DEBUG + fi } output { @@ -252,6 +262,8 @@ task scaffold { File intermediate_gapfill_fasta = "~{sample_name}.intermediate_gapfill.fasta" Int assembly_preimpute_length = read_int("assembly_preimpute_length") Int assembly_preimpute_length_unambiguous = read_int("assembly_preimpute_length_unambiguous") + Int assembly_num_segments_recovered = read_int("assembly_num_segments_recovered") + Int reference_num_segments_required = read_int("reference_num_segments_required") Array[String] scaffolding_chosen_ref_names = read_lines("~{sample_name}.scaffolding_chosen_refs.txt") File scaffolding_chosen_ref = "~{sample_name}.scaffolding_chosen_ref.fasta" File scaffolding_stats = "~{sample_name}.scaffolding_stats.txt" From 031a294c6e000fd888fe5516cc0467efc3f834a1 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Tue, 13 Feb 2024 10:23:03 -0500 Subject: [PATCH 10/20] forgot $ --- pipes/WDL/tasks/tasks_assembly.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index 7120cc702..93456d5dd 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -235,7 +235,7 @@ task scaffold { grep '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | wc -l | tee reference_num_segments_required set -e -o pipefail - if [[ ~{true='1' false='0' allow_incomplete_output} && (cmp -s assembly_num_segments_recovered reference_num_segments_required) ]]; then + if [[ ~{true='1' false='0' allow_incomplete_output} && $(cmp -s assembly_num_segments_recovered reference_num_segments_required) ]]; then # draft assembly does not have enough segments--and that's okay file_utils.py rename_fasta_sequences \ ~{sample_name}.intermediate_gapfill.fasta \ From 8a9b26f6dfc2c58f40ce6c04badf4ee9fc29f3fe Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Tue, 13 Feb 2024 10:29:45 -0500 Subject: [PATCH 11/20] remove random empty newline introduced in this branch --- pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl | 1 - 1 file changed, 1 deletion(-) diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl index b613d7ce4..e1238dd2a 100644 --- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -87,7 +87,6 @@ workflow scaffold_and_refine_multitaxa { "bases_aligned" : refine.align_to_self_merged_bases_aligned } - scatter(h in assembly_header) { String stat_by_taxon = stats_by_taxon[h] } From 165eb6669db113150a77c31a589018a395cbf2d1 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Wed, 14 Feb 2024 09:14:01 -0500 Subject: [PATCH 12/20] fix bash logical construction --- pipes/WDL/tasks/tasks_assembly.wdl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index 93456d5dd..7754eba16 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -235,8 +235,9 @@ task scaffold { grep '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | wc -l | tee reference_num_segments_required set -e -o pipefail - if [[ ~{true='1' false='0' allow_incomplete_output} && $(cmp -s assembly_num_segments_recovered reference_num_segments_required) ]]; then - # draft assembly does not have enough segments--and that's okay + if ~{true='true' false='false' allow_incomplete_output} && ! cmp -s assembly_num_segments_recovered reference_num_segments_required + then + # draft assembly does not have enough segments--and that's okay (allow_incomplete_output=true) file_utils.py rename_fasta_sequences \ ~{sample_name}.intermediate_gapfill.fasta \ ~{sample_name}.scaffolded_imputed.fasta \ From 1080d494e7c39da5d43d7bbce446dcad847443ca Mon Sep 17 00:00:00 2001 From: Danny Park Date: Wed, 14 Feb 2024 16:40:45 -0500 Subject: [PATCH 13/20] initial draft of task for filtering reference list --- pipes/WDL/tasks/tasks_metagenomics.wdl | 37 ++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/pipes/WDL/tasks/tasks_metagenomics.wdl b/pipes/WDL/tasks/tasks_metagenomics.wdl index 457a75819..5e1f23c77 100644 --- a/pipes/WDL/tasks/tasks_metagenomics.wdl +++ b/pipes/WDL/tasks/tasks_metagenomics.wdl @@ -375,6 +375,43 @@ task report_primary_kraken_taxa { } } +task filter_refs_to_found_taxa { + meta { + description: "Filters a taxid_to_ref_accessions_tsv to the set of taxa found in a focal_report." + } + input { + File taxid_to_ref_accessions_tsv + File focal_report_tsv + File taxdump_tgz + Int min_read_count = 100 + + String docker = "quay.io/broadinstitute/viral-classify:dp-ksummary" #skip-global-version-pin + } + String ref_basename = basename(taxid_to_ref_accessions_tsv, '.tsv') + String hits_basename = basename(focal_report_tsv, '.tsv') + Int disk_size = 50 + + command <<< + set -e + metagenomics.py filter_taxids_to_focal_hits "~{taxid_to_ref_accessions_tsv}" "~{focal_report_tsv}" "~{taxdump_tgz}" ~{min_read_count} "~{ref_basename}-~{hits_basename}.tsv" + >>> + + output { + File filtered_taxid_to_ref_accessions_tsv = "~{ref_basename}-~{hits_basename}.tsv" + } + + runtime { + docker: docker + memory: "2 GB" + cpu: 1 + disks: "local-disk " + disk_size + " LOCAL" + disk: disk_size + " GB" # TESs + dx_instance_type: "mem1_ssd1_v2_x2" + preemptible: 2 + maxRetries: 2 + } +} + task build_kraken2_db { meta { description: "Builds a custom kraken2 database. Outputs tar.zst tarballs of kraken2 database, associated krona taxonomy db, and an ncbi taxdump.tar.gz. Requires live internet access if any standard_libraries are specified or if taxonomy_db_tgz is absent." From 1a77bf7dc5f805598498c0751346a3cf3c112e18 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Wed, 14 Feb 2024 16:44:29 -0500 Subject: [PATCH 14/20] pre-extract taxdump tarball --- pipes/WDL/tasks/tasks_metagenomics.wdl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipes/WDL/tasks/tasks_metagenomics.wdl b/pipes/WDL/tasks/tasks_metagenomics.wdl index 5e1f23c77..095a4369c 100644 --- a/pipes/WDL/tasks/tasks_metagenomics.wdl +++ b/pipes/WDL/tasks/tasks_metagenomics.wdl @@ -393,7 +393,9 @@ task filter_refs_to_found_taxa { command <<< set -e - metagenomics.py filter_taxids_to_focal_hits "~{taxid_to_ref_accessions_tsv}" "~{focal_report_tsv}" "~{taxdump_tgz}" ~{min_read_count} "~{ref_basename}-~{hits_basename}.tsv" + mkdir -p taxdump + read_utils.py extract_tarball "~{taxdump_tgz}" taxdump + metagenomics.py filter_taxids_to_focal_hits "~{taxid_to_ref_accessions_tsv}" "~{focal_report_tsv}" taxdump ~{min_read_count} "~{ref_basename}-~{hits_basename}.tsv" >>> output { From d31c14a686c2885fef8b9cb9336c22097c3867bb Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 15 Feb 2024 10:02:42 -0500 Subject: [PATCH 15/20] add optional kraken-based reference selection to multitaxa --- .../scaffold_and_refine_multitaxa.wdl | 31 ++++++++++++------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl index e1238dd2a..d95eea937 100644 --- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -1,6 +1,7 @@ version 1.0 import "../tasks/tasks_assembly.wdl" as assembly +import "../tasks/tasks_metagenomics.wdl" as metagenomics import "../tasks/tasks_ncbi.wdl" as ncbi import "../tasks/tasks_utils.wdl" as utils import "assemble_refbased.wdl" as assemble_refbased @@ -18,11 +19,23 @@ workflow scaffold_and_refine_multitaxa { File reads_unmapped_bam File taxid_to_ref_accessions_tsv + File? focal_report_tsv + File? ncbi_taxdump_tgz # Float min_pct_reference_covered = 0.1 } - Array[Array[String]] taxid_to_ref_accessions = read_tsv(taxid_to_ref_accessions_tsv) + # if kraken reports are available, filter scaffold list to observed hits (output might be empty!) + if(defined(focal_report_tsv) && defined(ncbi_taxdump_tgz)) { + call metagenomics.filter_refs_to_found_taxa { + input: + taxid_to_ref_accessions_tsv = taxid_to_ref_accessions_tsv, + taxdump_tgz = select_first([ncbi_taxdump_tgz]), + focal_report_tsv = select_first([focal_report_tsv]) + } + } + + Array[Array[String]] taxid_to_ref_accessions = read_tsv(select_first([filter_refs_to_found_taxa.filtered_taxid_to_ref_accessions_tsv, taxid_to_ref_accessions_tsv])) Array[String] assembly_header = ["sample_id", "taxid", "tax_name", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "percent_reference_covered", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned"] scatter(taxon in taxid_to_ref_accessions) { @@ -51,8 +64,8 @@ workflow scaffold_and_refine_multitaxa { reference_fasta = scaffold.scaffold_fasta, sample_name = sample_id } - # to do: if percent_reference_covered > some threshold, run ncbi.rename_fasta_header and ncbi.align_and_annot_transfer_single - # to do: if biosample attributes file provided, run ncbi.biosample_to_genbank + # TO DO: if percent_reference_covered > some threshold, run ncbi.rename_fasta_header and ncbi.align_and_annot_transfer_single + # TO DO: if biosample attributes file provided, run ncbi.biosample_to_genbank if (refine.reference_genome_length > 0) { Float percent_reference_covered = 1.0 * refine.assembly_length_unambiguous / refine.reference_genome_length @@ -100,14 +113,10 @@ workflow scaffold_and_refine_multitaxa { } output { - Array[Map[String,String]] assembly_stats_by_taxon = stats_by_taxon - File assembly_stats_by_taxon_tsv = concatenate.combined - - Int num_read_groups = refine.num_read_groups[0] - Int num_libraries = refine.num_libraries[0] + Array[Map[String,String]] assembly_stats_by_taxon = stats_by_taxon + File assembly_stats_by_taxon_tsv = concatenate.combined + String assembly_method = "viral-ngs/scaffold_and_refine_multitaxa" - String assembly_method = "viral-ngs/scaffold_and_refine_multitaxa" - String scaffold_viral_assemble_version = scaffold.viralngs_version[0] - String refine_viral_assemble_version = refine.viral_assemble_version[0] + # TO DO: some summary stats on stats_by_taxon: how many rows, numbers from the best row, etc } } From 526cece756923953beb55e30b63de8b0eccbd8ac Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Fri, 16 Feb 2024 10:18:55 -0500 Subject: [PATCH 16/20] why cromwell do you behave poorly on edge cases --- .../scaffold_and_refine_multitaxa.wdl | 128 +++++++++--------- 1 file changed, 65 insertions(+), 63 deletions(-) diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl index d95eea937..56f371cf0 100644 --- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -40,75 +40,77 @@ workflow scaffold_and_refine_multitaxa { scatter(taxon in taxid_to_ref_accessions) { # taxon = [taxid, taxname, semicolon_delim_accession_list] - call utils.string_split { - input: - joined_string = taxon[2], - delimiter = ";" - } - call ncbi.download_annotations { - input: - accessions = string_split.tokens, - combined_out_prefix = taxon[0] - } - call assembly.scaffold { - input: - reads_bam = reads_unmapped_bam, - reference_genome_fasta = [download_annotations.combined_fasta], - min_length_fraction = 0, - min_unambig = 0, - allow_incomplete_output = true - } - call assemble_refbased.assemble_refbased as refine { - input: - reads_unmapped_bams = [reads_unmapped_bam], - reference_fasta = scaffold.scaffold_fasta, - sample_name = sample_id - } - # TO DO: if percent_reference_covered > some threshold, run ncbi.rename_fasta_header and ncbi.align_and_annot_transfer_single - # TO DO: if biosample attributes file provided, run ncbi.biosample_to_genbank - - if (refine.reference_genome_length > 0) { - Float percent_reference_covered = 1.0 * refine.assembly_length_unambiguous / refine.reference_genome_length - } - - Map[String, String] stats_by_taxon = { - "sample_id" : sample_id, - "taxid" : taxon[0], - "tax_name" : taxon[1], - - "assembly_fasta" : refine.assembly_fasta, - "aligned_only_reads_bam" : refine.align_to_self_merged_aligned_only_bam, - "coverage_plot" : refine.align_to_self_merged_coverage_plot, - "assembly_length" : refine.assembly_length, - "assembly_length_unambiguous" : refine.assembly_length_unambiguous, - "reads_aligned" : refine.align_to_self_merged_reads_aligned, - "mean_coverage" : refine.align_to_self_merged_mean_coverage, - "percent_reference_covered" : select_first([percent_reference_covered, 0.0]), - - "intermediate_gapfill_fasta" : scaffold.intermediate_gapfill_fasta, - "assembly_preimpute_length_unambiguous" : scaffold.assembly_preimpute_length_unambiguous, - - "replicate_concordant_sites" : refine.replicate_concordant_sites, - "replicate_discordant_snps" : refine.replicate_discordant_snps, - "replicate_discordant_indels" : refine.replicate_discordant_indels, - "replicate_discordant_vcf" : refine.replicate_discordant_vcf, - - "isnvsFile" : refine.align_to_self_isnvs_vcf, - "aligned_bam" : refine.align_to_self_merged_aligned_only_bam, - "coverage_tsv" : refine.align_to_self_merged_coverage_tsv, - "read_pairs_aligned" : refine.align_to_self_merged_read_pairs_aligned, - "bases_aligned" : refine.align_to_self_merged_bases_aligned - } - - scatter(h in assembly_header) { - String stat_by_taxon = stats_by_taxon[h] + if(length(taxon)>1) { # <-- workaround for serious bug in cromwell's read_tsv on empty files + call utils.string_split { + input: + joined_string = taxon[2], + delimiter = ";" + } + call ncbi.download_annotations { + input: + accessions = string_split.tokens, + combined_out_prefix = taxon[0] + } + call assembly.scaffold { + input: + reads_bam = reads_unmapped_bam, + reference_genome_fasta = [download_annotations.combined_fasta], + min_length_fraction = 0, + min_unambig = 0, + allow_incomplete_output = true + } + call assemble_refbased.assemble_refbased as refine { + input: + reads_unmapped_bams = [reads_unmapped_bam], + reference_fasta = scaffold.scaffold_fasta, + sample_name = sample_id + } + # TO DO: if percent_reference_covered > some threshold, run ncbi.rename_fasta_header and ncbi.align_and_annot_transfer_single + # TO DO: if biosample attributes file provided, run ncbi.biosample_to_genbank + + if (refine.reference_genome_length > 0) { + Float percent_reference_covered = 1.0 * refine.assembly_length_unambiguous / refine.reference_genome_length + } + + Map[String, String] stats_by_taxon = { + "sample_id" : sample_id, + "taxid" : taxon[0], + "tax_name" : taxon[1], + + "assembly_fasta" : refine.assembly_fasta, + "aligned_only_reads_bam" : refine.align_to_self_merged_aligned_only_bam, + "coverage_plot" : refine.align_to_self_merged_coverage_plot, + "assembly_length" : refine.assembly_length, + "assembly_length_unambiguous" : refine.assembly_length_unambiguous, + "reads_aligned" : refine.align_to_self_merged_reads_aligned, + "mean_coverage" : refine.align_to_self_merged_mean_coverage, + "percent_reference_covered" : select_first([percent_reference_covered, 0.0]), + + "intermediate_gapfill_fasta" : scaffold.intermediate_gapfill_fasta, + "assembly_preimpute_length_unambiguous" : scaffold.assembly_preimpute_length_unambiguous, + + "replicate_concordant_sites" : refine.replicate_concordant_sites, + "replicate_discordant_snps" : refine.replicate_discordant_snps, + "replicate_discordant_indels" : refine.replicate_discordant_indels, + "replicate_discordant_vcf" : refine.replicate_discordant_vcf, + + "isnvsFile" : refine.align_to_self_isnvs_vcf, + "aligned_bam" : refine.align_to_self_merged_aligned_only_bam, + "coverage_tsv" : refine.align_to_self_merged_coverage_tsv, + "read_pairs_aligned" : refine.align_to_self_merged_read_pairs_aligned, + "bases_aligned" : refine.align_to_self_merged_bases_aligned + } + + scatter(h in assembly_header) { + String stat_by_taxon = stats_by_taxon[h] + } } } ### summary stats call utils.concatenate { input: - infiles = [write_tsv([assembly_header]), write_tsv(stat_by_taxon)], + infiles = [write_tsv([assembly_header]), write_tsv(select_all(stat_by_taxon))], output_name = "assembly_metadata-~{sample_id}.tsv" } From f02a58bd114524ce78f95834433a4f7f00898ab0 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Fri, 16 Feb 2024 11:00:10 -0500 Subject: [PATCH 17/20] more stats and outputs, revert to refbased if cant denovo, dont polish if cant denovo --- pipes/WDL/tasks/tasks_assembly.wdl | 2 + .../scaffold_and_refine_multitaxa.wdl | 95 ++++++++++++------- 2 files changed, 64 insertions(+), 33 deletions(-) diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index 7754eba16..a0ff7e0fe 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -233,6 +233,7 @@ task scaffold { grep -v '^>' ~{sample_name}.intermediate_gapfill.fasta | tr -d '\nNn' | wc -c | tee assembly_preimpute_length_unambiguous grep '^>' ~{sample_name}.intermediate_gapfill.fasta | wc -l | tee assembly_num_segments_recovered grep '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | wc -l | tee reference_num_segments_required + grep -v '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | tr -d '\n' | wc -c | tee reference_length set -e -o pipefail if ~{true='true' false='false' allow_incomplete_output} && ! cmp -s assembly_num_segments_recovered reference_num_segments_required @@ -265,6 +266,7 @@ task scaffold { Int assembly_preimpute_length_unambiguous = read_int("assembly_preimpute_length_unambiguous") Int assembly_num_segments_recovered = read_int("assembly_num_segments_recovered") Int reference_num_segments_required = read_int("reference_num_segments_required") + Int reference_length = read_int("reference_length") Array[String] scaffolding_chosen_ref_names = read_lines("~{sample_name}.scaffolding_chosen_refs.txt") File scaffolding_chosen_ref = "~{sample_name}.scaffolding_chosen_ref.fasta" File scaffolding_stats = "~{sample_name}.scaffolding_stats.txt" diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl index 56f371cf0..ebc7e51f2 100644 --- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -25,6 +25,8 @@ workflow scaffold_and_refine_multitaxa { # Float min_pct_reference_covered = 0.1 } + Int min_scaffold_unambig = 10 + # if kraken reports are available, filter scaffold list to observed hits (output might be empty!) if(defined(focal_report_tsv) && defined(ncbi_taxdump_tgz)) { call metagenomics.filter_refs_to_found_taxa { @@ -59,46 +61,64 @@ workflow scaffold_and_refine_multitaxa { min_unambig = 0, allow_incomplete_output = true } - call assemble_refbased.assemble_refbased as refine { - input: - reads_unmapped_bams = [reads_unmapped_bam], - reference_fasta = scaffold.scaffold_fasta, - sample_name = sample_id + if (scaffold.assembly_preimpute_length_unambiguous > min_scaffold_unambig) { + # polish de novo assembly with reads + call assemble_refbased.assemble_refbased as refine { + input: + reads_unmapped_bams = [reads_unmapped_bam], + reference_fasta = scaffold.scaffold_fasta, + sample_name = sample_id + } + String assembly_method_denovo = "viral-ngs/assemble_denovo" } + if (scaffold.assembly_preimpute_length_unambiguous <= min_scaffold_unambig) { + # fall back to refbased assembly if de novo fails + call assemble_refbased.assemble_refbased as ref_based { + input: + reads_unmapped_bams = [reads_unmapped_bam], + reference_fasta = download_annotations.combined_fasta, + sample_name = sample_id + } + String assembly_method_refbased = "viral-ngs/assemble_refbased" + } + # TO DO: if percent_reference_covered > some threshold, run ncbi.rename_fasta_header and ncbi.align_and_annot_transfer_single # TO DO: if biosample attributes file provided, run ncbi.biosample_to_genbank - if (refine.reference_genome_length > 0) { - Float percent_reference_covered = 1.0 * refine.assembly_length_unambiguous / refine.reference_genome_length - } - + String taxid = taxon[0] + String tax_name = taxon[1] + Int assembly_length_unambiguous = select_first([refine.assembly_length_unambiguous, ref_based.assembly_length_unambiguous]) + Float percent_reference_covered = 1.0 * assembly_length_unambiguous / scaffold.reference_length + File assembly_fasta = select_first([refine.assembly_fasta, ref_based.assembly_fasta]) Map[String, String] stats_by_taxon = { "sample_id" : sample_id, - "taxid" : taxon[0], - "tax_name" : taxon[1], - - "assembly_fasta" : refine.assembly_fasta, - "aligned_only_reads_bam" : refine.align_to_self_merged_aligned_only_bam, - "coverage_plot" : refine.align_to_self_merged_coverage_plot, - "assembly_length" : refine.assembly_length, - "assembly_length_unambiguous" : refine.assembly_length_unambiguous, - "reads_aligned" : refine.align_to_self_merged_reads_aligned, - "mean_coverage" : refine.align_to_self_merged_mean_coverage, - "percent_reference_covered" : select_first([percent_reference_covered, 0.0]), - - "intermediate_gapfill_fasta" : scaffold.intermediate_gapfill_fasta, + "taxid" : taxid, + "tax_name" : tax_name, + + "assembly_fasta" : assembly_fasta, + "aligned_only_reads_bam" : select_first([refine.align_to_self_merged_aligned_only_bam, ref_based.align_to_self_merged_aligned_only_bam]), + "coverage_plot" : select_first([refine.align_to_self_merged_coverage_plot, ref_based.align_to_self_merged_coverage_plot]), + "assembly_length" : select_first([refine.assembly_length, ref_based.assembly_length]), + "assembly_length_unambiguous" : assembly_length_unambiguous, + "reads_aligned" : select_first([refine.align_to_self_merged_reads_aligned, ref_based.align_to_self_merged_reads_aligned]), + "mean_coverage" : select_first([refine.align_to_self_merged_mean_coverage, ref_based.align_to_self_merged_mean_coverage]), + "percent_reference_covered" : percent_reference_covered, + + "intermediate_gapfill_fasta" : scaffold.intermediate_gapfill_fasta, "assembly_preimpute_length_unambiguous" : scaffold.assembly_preimpute_length_unambiguous, - "replicate_concordant_sites" : refine.replicate_concordant_sites, - "replicate_discordant_snps" : refine.replicate_discordant_snps, - "replicate_discordant_indels" : refine.replicate_discordant_indels, - "replicate_discordant_vcf" : refine.replicate_discordant_vcf, + "replicate_concordant_sites" : select_first([refine.replicate_concordant_sites, ref_based.replicate_concordant_sites]), + "replicate_discordant_snps" : select_first([refine.replicate_discordant_snps, ref_based.replicate_discordant_snps]), + "replicate_discordant_indels" : select_first([refine.replicate_discordant_indels, ref_based.replicate_discordant_indels]), + "replicate_discordant_vcf" : select_first([refine.replicate_discordant_vcf, ref_based.replicate_discordant_vcf]), + + "isnvsFile" : select_first([refine.align_to_self_isnvs_vcf, ref_based.align_to_self_isnvs_vcf]), + "aligned_bam" : select_first([refine.align_to_self_merged_aligned_only_bam, ref_based.align_to_self_merged_aligned_only_bam]), + "coverage_tsv" : select_first([refine.align_to_self_merged_coverage_tsv, ref_based.align_to_self_merged_coverage_tsv]), + "read_pairs_aligned" : select_first([refine.align_to_self_merged_read_pairs_aligned, ref_based.align_to_self_merged_read_pairs_aligned]), + "bases_aligned" : select_first([refine.align_to_self_merged_bases_aligned, ref_based.align_to_self_merged_bases_aligned]), - "isnvsFile" : refine.align_to_self_isnvs_vcf, - "aligned_bam" : refine.align_to_self_merged_aligned_only_bam, - "coverage_tsv" : refine.align_to_self_merged_coverage_tsv, - "read_pairs_aligned" : refine.align_to_self_merged_read_pairs_aligned, - "bases_aligned" : refine.align_to_self_merged_bases_aligned + "assembly_method" : select_first([assembly_method_denovo, assembly_method_refbased]) } scatter(h in assembly_header) { @@ -115,10 +135,19 @@ workflow scaffold_and_refine_multitaxa { } output { - Array[Map[String,String]] assembly_stats_by_taxon = stats_by_taxon + Array[Map[String,String]] assembly_stats_by_taxon = select_all(stats_by_taxon) File assembly_stats_by_taxon_tsv = concatenate.combined String assembly_method = "viral-ngs/scaffold_and_refine_multitaxa" - # TO DO: some summary stats on stats_by_taxon: how many rows, numbers from the best row, etc + ## TO DO: some summary stats on stats_by_taxon: how many rows, numbers from the best row, etc + #String assembly_top_taxon_id = + #String assembly_top_length_unambiguous = + #Float assembly_top_pct_ref_cov = + #File assembly_top_fasta = + Array[String] assembly_all_taxids = select_all(taxid) + Array[String] assembly_all_taxnames = select_all(tax_name) + Array[Int] assembly_all_lengths_unambig = select_all(assembly_length_unambiguous) + Array[Float] assembly_all_pct_ref_cov = select_all(percent_reference_covered) + Array[File] assembly_all_fastas = select_all(assembly_fasta) } } From bc6bee7b5755afe6ade790bf9fbd7ed752d046de Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Fri, 16 Feb 2024 12:24:35 -0500 Subject: [PATCH 18/20] simplify cromwell fix --- .../scaffold_and_refine_multitaxa.wdl | 174 +++++++++--------- 1 file changed, 89 insertions(+), 85 deletions(-) diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl index ebc7e51f2..a2dd28723 100644 --- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -37,105 +37,109 @@ workflow scaffold_and_refine_multitaxa { } } - Array[Array[String]] taxid_to_ref_accessions = read_tsv(select_first([filter_refs_to_found_taxa.filtered_taxid_to_ref_accessions_tsv, taxid_to_ref_accessions_tsv])) Array[String] assembly_header = ["sample_id", "taxid", "tax_name", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "percent_reference_covered", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned"] - + Array[Array[String]] taxid_to_ref_accessions = read_tsv(select_first([filter_refs_to_found_taxa.filtered_taxid_to_ref_accessions_tsv, taxid_to_ref_accessions_tsv])) scatter(taxon in taxid_to_ref_accessions) { + # cromwell's read_tsv emits [[""]] on empty (0-byte) file input, turn it into [] + if(length(taxon)>1) { + Array[String] taxid_to_ref_accessions_fix_cromwell_read_tsv_bug = taxon + } + } + + scatter(taxon in select_all(taxid_to_ref_accessions_fix_cromwell_read_tsv_bug)) { # taxon = [taxid, taxname, semicolon_delim_accession_list] - if(length(taxon)>1) { # <-- workaround for serious bug in cromwell's read_tsv on empty files - call utils.string_split { - input: - joined_string = taxon[2], - delimiter = ";" - } - call ncbi.download_annotations { + call utils.string_split { + input: + joined_string = taxon[2], + delimiter = ";" + } + call ncbi.download_annotations { + input: + accessions = string_split.tokens, + combined_out_prefix = taxon[0] + } + call assembly.scaffold { + input: + reads_bam = reads_unmapped_bam, + reference_genome_fasta = [download_annotations.combined_fasta], + min_length_fraction = 0, + min_unambig = 0, + allow_incomplete_output = true + } + if (scaffold.assembly_preimpute_length_unambiguous > min_scaffold_unambig) { + # polish de novo assembly with reads + call assemble_refbased.assemble_refbased as refine { input: - accessions = string_split.tokens, - combined_out_prefix = taxon[0] + reads_unmapped_bams = [reads_unmapped_bam], + reference_fasta = scaffold.scaffold_fasta, + sample_name = sample_id } - call assembly.scaffold { + String assembly_method_denovo = "viral-ngs/assemble_denovo" + } + if (scaffold.assembly_preimpute_length_unambiguous <= min_scaffold_unambig) { + # fall back to refbased assembly if de novo fails + call assemble_refbased.assemble_refbased as ref_based { input: - reads_bam = reads_unmapped_bam, - reference_genome_fasta = [download_annotations.combined_fasta], - min_length_fraction = 0, - min_unambig = 0, - allow_incomplete_output = true - } - if (scaffold.assembly_preimpute_length_unambiguous > min_scaffold_unambig) { - # polish de novo assembly with reads - call assemble_refbased.assemble_refbased as refine { - input: - reads_unmapped_bams = [reads_unmapped_bam], - reference_fasta = scaffold.scaffold_fasta, - sample_name = sample_id - } - String assembly_method_denovo = "viral-ngs/assemble_denovo" - } - if (scaffold.assembly_preimpute_length_unambiguous <= min_scaffold_unambig) { - # fall back to refbased assembly if de novo fails - call assemble_refbased.assemble_refbased as ref_based { - input: - reads_unmapped_bams = [reads_unmapped_bam], - reference_fasta = download_annotations.combined_fasta, - sample_name = sample_id - } - String assembly_method_refbased = "viral-ngs/assemble_refbased" + reads_unmapped_bams = [reads_unmapped_bam], + reference_fasta = download_annotations.combined_fasta, + sample_name = sample_id } + String assembly_method_refbased = "viral-ngs/assemble_refbased" + } - # TO DO: if percent_reference_covered > some threshold, run ncbi.rename_fasta_header and ncbi.align_and_annot_transfer_single - # TO DO: if biosample attributes file provided, run ncbi.biosample_to_genbank - - String taxid = taxon[0] - String tax_name = taxon[1] - Int assembly_length_unambiguous = select_first([refine.assembly_length_unambiguous, ref_based.assembly_length_unambiguous]) - Float percent_reference_covered = 1.0 * assembly_length_unambiguous / scaffold.reference_length - File assembly_fasta = select_first([refine.assembly_fasta, ref_based.assembly_fasta]) - Map[String, String] stats_by_taxon = { - "sample_id" : sample_id, - "taxid" : taxid, - "tax_name" : tax_name, - - "assembly_fasta" : assembly_fasta, - "aligned_only_reads_bam" : select_first([refine.align_to_self_merged_aligned_only_bam, ref_based.align_to_self_merged_aligned_only_bam]), - "coverage_plot" : select_first([refine.align_to_self_merged_coverage_plot, ref_based.align_to_self_merged_coverage_plot]), - "assembly_length" : select_first([refine.assembly_length, ref_based.assembly_length]), - "assembly_length_unambiguous" : assembly_length_unambiguous, - "reads_aligned" : select_first([refine.align_to_self_merged_reads_aligned, ref_based.align_to_self_merged_reads_aligned]), - "mean_coverage" : select_first([refine.align_to_self_merged_mean_coverage, ref_based.align_to_self_merged_mean_coverage]), - "percent_reference_covered" : percent_reference_covered, - - "intermediate_gapfill_fasta" : scaffold.intermediate_gapfill_fasta, - "assembly_preimpute_length_unambiguous" : scaffold.assembly_preimpute_length_unambiguous, - - "replicate_concordant_sites" : select_first([refine.replicate_concordant_sites, ref_based.replicate_concordant_sites]), - "replicate_discordant_snps" : select_first([refine.replicate_discordant_snps, ref_based.replicate_discordant_snps]), - "replicate_discordant_indels" : select_first([refine.replicate_discordant_indels, ref_based.replicate_discordant_indels]), - "replicate_discordant_vcf" : select_first([refine.replicate_discordant_vcf, ref_based.replicate_discordant_vcf]), - - "isnvsFile" : select_first([refine.align_to_self_isnvs_vcf, ref_based.align_to_self_isnvs_vcf]), - "aligned_bam" : select_first([refine.align_to_self_merged_aligned_only_bam, ref_based.align_to_self_merged_aligned_only_bam]), - "coverage_tsv" : select_first([refine.align_to_self_merged_coverage_tsv, ref_based.align_to_self_merged_coverage_tsv]), - "read_pairs_aligned" : select_first([refine.align_to_self_merged_read_pairs_aligned, ref_based.align_to_self_merged_read_pairs_aligned]), - "bases_aligned" : select_first([refine.align_to_self_merged_bases_aligned, ref_based.align_to_self_merged_bases_aligned]), - - "assembly_method" : select_first([assembly_method_denovo, assembly_method_refbased]) - } + # TO DO: if percent_reference_covered > some threshold, run ncbi.rename_fasta_header and ncbi.align_and_annot_transfer_single + # TO DO: if biosample attributes file provided, run ncbi.biosample_to_genbank + + String taxid = taxon[0] + String tax_name = taxon[1] + Int assembly_length_unambiguous = select_first([refine.assembly_length_unambiguous, ref_based.assembly_length_unambiguous]) + Float percent_reference_covered = 1.0 * assembly_length_unambiguous / scaffold.reference_length + File assembly_fasta = select_first([refine.assembly_fasta, ref_based.assembly_fasta]) + Map[String, String] stats_by_taxon = { + "sample_id" : sample_id, + "taxid" : taxid, + "tax_name" : tax_name, + + "assembly_fasta" : assembly_fasta, + "aligned_only_reads_bam" : select_first([refine.align_to_self_merged_aligned_only_bam, ref_based.align_to_self_merged_aligned_only_bam]), + "coverage_plot" : select_first([refine.align_to_self_merged_coverage_plot, ref_based.align_to_self_merged_coverage_plot]), + "assembly_length" : select_first([refine.assembly_length, ref_based.assembly_length]), + "assembly_length_unambiguous" : assembly_length_unambiguous, + "reads_aligned" : select_first([refine.align_to_self_merged_reads_aligned, ref_based.align_to_self_merged_reads_aligned]), + "mean_coverage" : select_first([refine.align_to_self_merged_mean_coverage, ref_based.align_to_self_merged_mean_coverage]), + "percent_reference_covered" : percent_reference_covered, + + "intermediate_gapfill_fasta" : scaffold.intermediate_gapfill_fasta, + "assembly_preimpute_length_unambiguous" : scaffold.assembly_preimpute_length_unambiguous, + + "replicate_concordant_sites" : select_first([refine.replicate_concordant_sites, ref_based.replicate_concordant_sites]), + "replicate_discordant_snps" : select_first([refine.replicate_discordant_snps, ref_based.replicate_discordant_snps]), + "replicate_discordant_indels" : select_first([refine.replicate_discordant_indels, ref_based.replicate_discordant_indels]), + "replicate_discordant_vcf" : select_first([refine.replicate_discordant_vcf, ref_based.replicate_discordant_vcf]), + + "isnvsFile" : select_first([refine.align_to_self_isnvs_vcf, ref_based.align_to_self_isnvs_vcf]), + "aligned_bam" : select_first([refine.align_to_self_merged_aligned_only_bam, ref_based.align_to_self_merged_aligned_only_bam]), + "coverage_tsv" : select_first([refine.align_to_self_merged_coverage_tsv, ref_based.align_to_self_merged_coverage_tsv]), + "read_pairs_aligned" : select_first([refine.align_to_self_merged_read_pairs_aligned, ref_based.align_to_self_merged_read_pairs_aligned]), + "bases_aligned" : select_first([refine.align_to_self_merged_bases_aligned, ref_based.align_to_self_merged_bases_aligned]), + + "assembly_method" : select_first([assembly_method_denovo, assembly_method_refbased]) + } - scatter(h in assembly_header) { - String stat_by_taxon = stats_by_taxon[h] - } + scatter(h in assembly_header) { + String stat_by_taxon = stats_by_taxon[h] } } ### summary stats call utils.concatenate { input: - infiles = [write_tsv([assembly_header]), write_tsv(select_all(stat_by_taxon))], + infiles = [write_tsv([assembly_header]), write_tsv(stat_by_taxon)], output_name = "assembly_metadata-~{sample_id}.tsv" } output { - Array[Map[String,String]] assembly_stats_by_taxon = select_all(stats_by_taxon) + Array[Map[String,String]] assembly_stats_by_taxon = stats_by_taxon File assembly_stats_by_taxon_tsv = concatenate.combined String assembly_method = "viral-ngs/scaffold_and_refine_multitaxa" @@ -144,10 +148,10 @@ workflow scaffold_and_refine_multitaxa { #String assembly_top_length_unambiguous = #Float assembly_top_pct_ref_cov = #File assembly_top_fasta = - Array[String] assembly_all_taxids = select_all(taxid) - Array[String] assembly_all_taxnames = select_all(tax_name) - Array[Int] assembly_all_lengths_unambig = select_all(assembly_length_unambiguous) - Array[Float] assembly_all_pct_ref_cov = select_all(percent_reference_covered) - Array[File] assembly_all_fastas = select_all(assembly_fasta) + Array[String] assembly_all_taxids = taxid + Array[String] assembly_all_taxnames = tax_name + Array[Int] assembly_all_lengths_unambig = assembly_length_unambiguous + Array[Float] assembly_all_pct_ref_cov = percent_reference_covered + Array[File] assembly_all_fastas = assembly_fasta } } From 93d455fa2443039f3b4f7daf6e0865eadfa7826d Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Fri, 16 Feb 2024 17:26:37 -0500 Subject: [PATCH 19/20] bump viral-classify 2.2.3.0 to 2.2.4.1 --- pipes/WDL/tasks/tasks_metagenomics.wdl | 20 ++++++++++---------- pipes/WDL/tasks/tasks_reports.wdl | 2 +- pipes/WDL/tasks/tasks_taxon_filter.wdl | 6 +++--- requirements-modules.txt | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/pipes/WDL/tasks/tasks_metagenomics.wdl b/pipes/WDL/tasks/tasks_metagenomics.wdl index 095a4369c..12f5e738b 100644 --- a/pipes/WDL/tasks/tasks_metagenomics.wdl +++ b/pipes/WDL/tasks/tasks_metagenomics.wdl @@ -11,7 +11,7 @@ task krakenuniq { File krona_taxonomy_db_tgz # taxonomy.tab Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0" #skip-global-version-pin + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" #skip-global-version-pin } Int disk_size = 750 @@ -143,7 +143,7 @@ task build_krakenuniq_db { Int? zstd_compression_level Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0" #skip-global-version-pin + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" #skip-global-version-pin } Int disk_size = 750 @@ -213,7 +213,7 @@ task kraken2 { Int? min_base_qual Int machine_mem_gb = 72 - String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0" + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" } parameter_meta { @@ -334,7 +334,7 @@ task report_primary_kraken_taxa { File kraken_summary_report String focal_taxon = "Viruses" - String docker = "quay.io/broadinstitute/viral-classify:dp-ksummary" #skip-global-version-pin + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" } String out_basename = basename(kraken_summary_report, '.txt') Int disk_size = 50 @@ -385,7 +385,7 @@ task filter_refs_to_found_taxa { File taxdump_tgz Int min_read_count = 100 - String docker = "quay.io/broadinstitute/viral-classify:dp-ksummary" #skip-global-version-pin + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" } String ref_basename = basename(taxid_to_ref_accessions_tsv, '.tsv') String hits_basename = basename(focal_report_tsv, '.tsv') @@ -436,7 +436,7 @@ task build_kraken2_db { Int? zstd_compression_level Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0" + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" } Int disk_size = 750 @@ -578,7 +578,7 @@ task blastx { File krona_taxonomy_db_tgz Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0" + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" } parameter_meta { @@ -668,7 +668,7 @@ task krona { Int? magnitude_column Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0" + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" } Int disk_size = 50 @@ -775,7 +775,7 @@ task filter_bam_to_taxa { String out_filename_suffix = "filtered" Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0" + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" } String out_basename = basename(classified_bam, ".bam") + "." + out_filename_suffix @@ -862,7 +862,7 @@ task kaiju { File krona_taxonomy_db_tgz # taxonomy/taxonomy.tab Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0" + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" } String input_basename = basename(reads_unmapped_bam, ".bam") diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl index 1b4188e11..25e303ed1 100644 --- a/pipes/WDL/tasks/tasks_reports.wdl +++ b/pipes/WDL/tasks/tasks_reports.wdl @@ -488,7 +488,7 @@ task aggregate_metagenomics_reports { String aggregate_taxlevel_focus = "species" Int aggregate_top_N_hits = 5 - String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0" + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" } parameter_meta { diff --git a/pipes/WDL/tasks/tasks_taxon_filter.wdl b/pipes/WDL/tasks/tasks_taxon_filter.wdl index 18394306e..d96d4db8b 100644 --- a/pipes/WDL/tasks/tasks_taxon_filter.wdl +++ b/pipes/WDL/tasks/tasks_taxon_filter.wdl @@ -14,7 +14,7 @@ task deplete_taxa { Int? cpu=8 Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0" + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" } parameter_meta { @@ -113,7 +113,7 @@ task filter_to_taxon { String? neg_control_prefixes_space_separated = "neg water NTC" Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0" + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" } # do this in two steps in case the input doesn't actually have "cleaned" in the name @@ -172,7 +172,7 @@ task build_lastal_db { File sequences_fasta Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0" + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" } String db_name = basename(sequences_fasta, ".fasta") diff --git a/requirements-modules.txt b/requirements-modules.txt index 2cf41a97a..13125958f 100644 --- a/requirements-modules.txt +++ b/requirements-modules.txt @@ -1,6 +1,6 @@ broadinstitute/viral-core=2.2.4 broadinstitute/viral-assemble=2.2.4.0 -broadinstitute/viral-classify=2.2.3.0 +broadinstitute/viral-classify=2.2.4.1 broadinstitute/viral-phylo=2.1.20.2 broadinstitute/py3-bio=0.1.2 broadinstitute/beast-beagle-cuda=1.10.5pre From 88ca4d192b217803f9d4315ef8aa243ef4941896 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Fri, 16 Feb 2024 17:46:41 -0500 Subject: [PATCH 20/20] revert version --- pipes/WDL/tasks/tasks_metagenomics.wdl | 20 ++++++++++---------- pipes/WDL/tasks/tasks_reports.wdl | 2 +- pipes/WDL/tasks/tasks_taxon_filter.wdl | 6 +++--- requirements-modules.txt | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/pipes/WDL/tasks/tasks_metagenomics.wdl b/pipes/WDL/tasks/tasks_metagenomics.wdl index 12f5e738b..078dc4bb9 100644 --- a/pipes/WDL/tasks/tasks_metagenomics.wdl +++ b/pipes/WDL/tasks/tasks_metagenomics.wdl @@ -11,7 +11,7 @@ task krakenuniq { File krona_taxonomy_db_tgz # taxonomy.tab Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" #skip-global-version-pin + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" #skip-global-version-pin } Int disk_size = 750 @@ -143,7 +143,7 @@ task build_krakenuniq_db { Int? zstd_compression_level Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" #skip-global-version-pin + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" #skip-global-version-pin } Int disk_size = 750 @@ -213,7 +213,7 @@ task kraken2 { Int? min_base_qual Int machine_mem_gb = 72 - String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" } parameter_meta { @@ -334,7 +334,7 @@ task report_primary_kraken_taxa { File kraken_summary_report String focal_taxon = "Viruses" - String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" } String out_basename = basename(kraken_summary_report, '.txt') Int disk_size = 50 @@ -385,7 +385,7 @@ task filter_refs_to_found_taxa { File taxdump_tgz Int min_read_count = 100 - String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" } String ref_basename = basename(taxid_to_ref_accessions_tsv, '.tsv') String hits_basename = basename(focal_report_tsv, '.tsv') @@ -436,7 +436,7 @@ task build_kraken2_db { Int? zstd_compression_level Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" } Int disk_size = 750 @@ -578,7 +578,7 @@ task blastx { File krona_taxonomy_db_tgz Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" } parameter_meta { @@ -668,7 +668,7 @@ task krona { Int? magnitude_column Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" } Int disk_size = 50 @@ -775,7 +775,7 @@ task filter_bam_to_taxa { String out_filename_suffix = "filtered" Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" } String out_basename = basename(classified_bam, ".bam") + "." + out_filename_suffix @@ -862,7 +862,7 @@ task kaiju { File krona_taxonomy_db_tgz # taxonomy/taxonomy.tab Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" } String input_basename = basename(reads_unmapped_bam, ".bam") diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl index 25e303ed1..d38faa7ce 100644 --- a/pipes/WDL/tasks/tasks_reports.wdl +++ b/pipes/WDL/tasks/tasks_reports.wdl @@ -488,7 +488,7 @@ task aggregate_metagenomics_reports { String aggregate_taxlevel_focus = "species" Int aggregate_top_N_hits = 5 - String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" } parameter_meta { diff --git a/pipes/WDL/tasks/tasks_taxon_filter.wdl b/pipes/WDL/tasks/tasks_taxon_filter.wdl index d96d4db8b..d57528a8b 100644 --- a/pipes/WDL/tasks/tasks_taxon_filter.wdl +++ b/pipes/WDL/tasks/tasks_taxon_filter.wdl @@ -14,7 +14,7 @@ task deplete_taxa { Int? cpu=8 Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" } parameter_meta { @@ -113,7 +113,7 @@ task filter_to_taxon { String? neg_control_prefixes_space_separated = "neg water NTC" Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" } # do this in two steps in case the input doesn't actually have "cleaned" in the name @@ -172,7 +172,7 @@ task build_lastal_db { File sequences_fasta Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-classify:2.2.4.1" + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" } String db_name = basename(sequences_fasta, ".fasta") diff --git a/requirements-modules.txt b/requirements-modules.txt index 13125958f..d0f852682 100644 --- a/requirements-modules.txt +++ b/requirements-modules.txt @@ -1,6 +1,6 @@ broadinstitute/viral-core=2.2.4 broadinstitute/viral-assemble=2.2.4.0 -broadinstitute/viral-classify=2.2.4.1 +broadinstitute/viral-classify=2.2.4.0 broadinstitute/viral-phylo=2.1.20.2 broadinstitute/py3-bio=0.1.2 broadinstitute/beast-beagle-cuda=1.10.5pre