diff --git a/github_actions_ci/install-wdl.sh b/github_actions_ci/install-wdl.sh
index 5a7bf3fc1..092a8cd16 100755
--- a/github_actions_ci/install-wdl.sh
+++ b/github_actions_ci/install-wdl.sh
@@ -12,8 +12,8 @@ fetch_jar_from_github () {
     ln -s $_jar_fname $_tool_name.jar
 }
 
-fetch_jar_from_github broadinstitute cromwell womtool 61
-fetch_jar_from_github broadinstitute cromwell cromwell 61
+fetch_jar_from_github broadinstitute cromwell womtool 86
+fetch_jar_from_github broadinstitute cromwell cromwell 86
 fetch_jar_from_github dnanexus dxWDL dxWDL v1.50
 
 TGZ=dx-toolkit-v0.311.0-ubuntu-20.04-amd64.tar.gz
diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl
index d2c45fb6b..a0ff7e0fe 100644
--- a/pipes/WDL/tasks/tasks_assembly.wdl
+++ b/pipes/WDL/tasks/tasks_assembly.wdl
@@ -231,19 +231,31 @@ task scaffold {
     set +e +o pipefail
     grep -v '^>' ~{sample_name}.intermediate_gapfill.fasta | tr -d '\n' | wc -c | tee assembly_preimpute_length
     grep -v '^>' ~{sample_name}.intermediate_gapfill.fasta | tr -d '\nNn' | wc -c | tee assembly_preimpute_length_unambiguous
+    grep '^>' ~{sample_name}.intermediate_gapfill.fasta | wc -l | tee assembly_num_segments_recovered
+    grep '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | wc -l | tee reference_num_segments_required
+    grep -v '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | tr -d '\n' | wc -c | tee reference_length
     set -e -o pipefail
 
-    #Input assembly/contigs, FASTA, already ordered oriented and merged with the reference gneome (FASTA)
-    assembly.py impute_from_reference \
-      ~{sample_name}.intermediate_gapfill.fasta \
-      ~{sample_name}.scaffolding_chosen_ref.fasta \
-      ~{sample_name}.scaffolded_imputed.fasta \
-      --newName ~{sample_name} \
-      ~{'--replaceLength=' + replace_length} \
-      ~{'--minLengthFraction=' + min_length_fraction} \
-      ~{'--minUnambig=' + min_unambig} \
-      ~{'--aligner=' + aligner} \
-      --loglevel=DEBUG
+    if ~{true='true' false='false' allow_incomplete_output} && ! cmp -s assembly_num_segments_recovered reference_num_segments_required
+    then
+      # draft assembly does not have enough segments--and that's okay (allow_incomplete_output=true)
+      file_utils.py rename_fasta_sequences \
+        ~{sample_name}.intermediate_gapfill.fasta \
+        ~{sample_name}.scaffolded_imputed.fasta \
+        "~{sample_name}" --suffix_always --loglevel=DEBUG
+    else
+      # draft assembly must have the right number of segments (fail if not)
+      assembly.py impute_from_reference \
+        ~{sample_name}.intermediate_gapfill.fasta \
+        ~{sample_name}.scaffolding_chosen_ref.fasta \
+        ~{sample_name}.scaffolded_imputed.fasta \
+        --newName ~{sample_name} \
+        ~{'--replaceLength=' + replace_length} \
+        ~{'--minLengthFraction=' + min_length_fraction} \
+        ~{'--minUnambig=' + min_unambig} \
+        ~{'--aligner=' + aligner} \
+        --loglevel=DEBUG
+    fi
   }
 
   output {
@@ -252,6 +264,9 @@ task scaffold {
     File intermediate_gapfill_fasta = "~{sample_name}.intermediate_gapfill.fasta"
     Int  assembly_preimpute_length = read_int("assembly_preimpute_length")
     Int  assembly_preimpute_length_unambiguous = read_int("assembly_preimpute_length_unambiguous")
+    Int  assembly_num_segments_recovered = read_int("assembly_num_segments_recovered")
+    Int  reference_num_segments_required = read_int("reference_num_segments_required")
+    Int  reference_length = read_int("reference_length")
     Array[String] scaffolding_chosen_ref_names = read_lines("~{sample_name}.scaffolding_chosen_refs.txt")
     File scaffolding_chosen_ref = "~{sample_name}.scaffolding_chosen_ref.fasta"
     File scaffolding_stats = "~{sample_name}.scaffolding_stats.txt"
diff --git a/pipes/WDL/tasks/tasks_metagenomics.wdl b/pipes/WDL/tasks/tasks_metagenomics.wdl
index 58354426e..078dc4bb9 100644
--- a/pipes/WDL/tasks/tasks_metagenomics.wdl
+++ b/pipes/WDL/tasks/tasks_metagenomics.wdl
@@ -11,7 +11,7 @@ task krakenuniq {
     File krona_taxonomy_db_tgz # taxonomy.tab
 
     Int? machine_mem_gb
-    String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0" #skip-global-version-pin
+    String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" #skip-global-version-pin
   }
 
   Int disk_size = 750
@@ -143,7 +143,7 @@ task build_krakenuniq_db {
    Int? zstd_compression_level
 
    Int? machine_mem_gb
-   String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0" #skip-global-version-pin
+   String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" #skip-global-version-pin
  }
 
  Int disk_size = 750
@@ -213,7 +213,7 @@ task kraken2 {
    Int? min_base_qual
 
    Int machine_mem_gb = 72
-   String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+   String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
  }
 
  parameter_meta {
@@ -326,6 +326,94 @@ task kraken2 {
   }
 }
 
+task report_primary_kraken_taxa {
+  meta {
+    description: "Interprets a kraken (or kraken2 or krakenuniq) summary report file and emits the primary contributing taxa under a focal taxon of interest."
+  }
+  input {
+    File   kraken_summary_report
+    String focal_taxon = "Viruses"
+
+    String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
+  }
+  String out_basename = basename(kraken_summary_report, '.txt')
+  Int    disk_size = 50
+  Int    machine_mem_gb = 2
+
+  command <<<
+    set -e
+    metagenomics.py taxlevel_plurality "~{kraken_summary_report}" "~{focal_taxon}" "~{out_basename}.ranked_focal_report.tsv"
+    cat "~{out_basename}.ranked_focal_report.tsv" | head -2 | tail -n +2 > TOPROW
+    cut -f 2 TOPROW > NUM_FOCAL
+    cut -f 4 TOPROW > PCT_OF_FOCAL
+    cut -f 7 TOPROW > NUM_READS
+    cut -f 8 TOPROW > TAX_RANK
+    cut -f 9 TOPROW > TAX_ID
+    cut -f 10 TOPROW > TAX_NAME
+  >>>
+
+  output {
+    String focal_tax_name = focal_taxon
+    File   ranked_focal_report = "~{out_basename}.ranked_focal_report.tsv"
+    Int    total_focal_reads = read_int("NUM_FOCAL")
+    Float  percent_of_focal = read_float("PCT_OF_FOCAL")
+    Int    num_reads = read_int("NUM_READS")
+    String tax_rank = read_string("TAX_RANK")
+    String tax_id = read_string("TAX_ID")
+    String tax_name = read_string("TAX_NAME")
+  }
+
+  runtime {
+    docker: docker
+    memory: machine_mem_gb + " GB"
+    cpu: 1
+    disks: "local-disk " + disk_size + " LOCAL"
+    disk: disk_size + " GB" # TESs
+    dx_instance_type: "mem1_ssd1_v2_x2"
+    preemptible: 2
+    maxRetries: 2
+  }
+}
+
+task filter_refs_to_found_taxa {
+  meta {
+    description: "Filters a taxid_to_ref_accessions_tsv to the set of taxa found in a focal_report."
+  }
+  input {
+    File taxid_to_ref_accessions_tsv
+    File focal_report_tsv
+    File taxdump_tgz
+    Int  min_read_count = 100
+
+    String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
+  }
+  String ref_basename = basename(taxid_to_ref_accessions_tsv, '.tsv')
+  String hits_basename = basename(focal_report_tsv, '.tsv')
+  Int disk_size = 50
+
+  command <<<
+    set -e
+    mkdir -p taxdump
+    read_utils.py extract_tarball "~{taxdump_tgz}" taxdump
+    metagenomics.py filter_taxids_to_focal_hits "~{taxid_to_ref_accessions_tsv}" "~{focal_report_tsv}" taxdump ~{min_read_count} "~{ref_basename}-~{hits_basename}.tsv"
+  >>>
+
+  output {
+    File filtered_taxid_to_ref_accessions_tsv = "~{ref_basename}-~{hits_basename}.tsv"
+  }
+
+  runtime {
+    docker: docker
+    memory: "2 GB"
+    cpu: 1
+    disks: "local-disk " + disk_size + " LOCAL"
+    disk: disk_size + " GB" # TESs
+    dx_instance_type: "mem1_ssd1_v2_x2"
+    preemptible: 2
+    maxRetries: 2
+  }
+}
+
 task build_kraken2_db {
   meta {
     description: "Builds a custom kraken2 database. Outputs tar.zst tarballs of kraken2 database, associated krona taxonomy db, and an ncbi taxdump.tar.gz. Requires live internet access if any standard_libraries are specified or if taxonomy_db_tgz is absent."
@@ -348,7 +436,7 @@ task build_kraken2_db {
    Int? zstd_compression_level
 
    Int? machine_mem_gb
-   String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+   String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
  }
 
  Int disk_size = 750
@@ -490,7 +578,7 @@ task blastx {
    File krona_taxonomy_db_tgz
 
    Int? machine_mem_gb
-   String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+   String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
  }
 
  parameter_meta {
@@ -580,7 +668,7 @@ task krona {
    Int? magnitude_column
 
    Int? machine_mem_gb
-   String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+   String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
  }
 
  Int disk_size = 50
@@ -687,7 +775,7 @@ task filter_bam_to_taxa {
    String out_filename_suffix = "filtered"
 
    Int? machine_mem_gb
-   String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+   String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
  }
 
  String out_basename = basename(classified_bam, ".bam") + "." + out_filename_suffix
@@ -774,7 +862,7 @@ task kaiju {
    File krona_taxonomy_db_tgz # taxonomy/taxonomy.tab
 
    Int? machine_mem_gb
-   String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+   String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
  }
 
  String input_basename = basename(reads_unmapped_bam, ".bam")
diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl
index 1b4188e11..d38faa7ce 100644
--- a/pipes/WDL/tasks/tasks_reports.wdl
+++ b/pipes/WDL/tasks/tasks_reports.wdl
@@ -488,7 +488,7 @@ task aggregate_metagenomics_reports {
    String aggregate_taxlevel_focus = "species"
 
    Int aggregate_top_N_hits = 5
-   String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+   String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
  }
 
  parameter_meta {
diff --git a/pipes/WDL/tasks/tasks_taxon_filter.wdl b/pipes/WDL/tasks/tasks_taxon_filter.wdl
index 18394306e..d57528a8b 100644
--- a/pipes/WDL/tasks/tasks_taxon_filter.wdl
+++ b/pipes/WDL/tasks/tasks_taxon_filter.wdl
@@ -14,7 +14,7 @@ task deplete_taxa {
    Int? cpu=8
 
    Int? machine_mem_gb
-   String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+   String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
  }
 
  parameter_meta {
@@ -113,7 +113,7 @@ task filter_to_taxon {
    String? neg_control_prefixes_space_separated = "neg water NTC"
 
    Int? machine_mem_gb
-   String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+   String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
  }
 
  # do this in two steps in case the input doesn't actually have "cleaned" in the name
@@ -172,7 +172,7 @@ task build_lastal_db {
    File sequences_fasta
 
    Int? machine_mem_gb
-   String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+   String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
  }
 
  String db_name = basename(sequences_fasta, ".fasta")
diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl
index cbe618a1e..bbef0044c 100644
--- a/pipes/WDL/tasks/tasks_utils.wdl
+++ b/pipes/WDL/tasks/tasks_utils.wdl
@@ -712,6 +712,35 @@ task s3_copy {
   }
 }
 
+task string_split {
+  meta {
+    description: "split a string by a delimiter"
+  }
+  input {
+    String joined_string
+    String delimiter
+  }
+  command <<<
+    set -e
+    python3 <<CODE
+with open('TOKENS', 'wt') as outf:
+    for token in '~{joined_string}'.split('~{delimiter}'):
+        outf.write(token + '\n')
+CODE
+  >>>
+  output {
+    Array[String] tokens = read_lines("TOKENS")
+  }
+  runtime {
+    docker: "python:slim"
+    memory: "1 GB"
+    cpu: 1
+    disks: "local-disk 50 SSD"
+    disk: "50 GB" # TES
+    maxRetries: 2
+  }
+}
+
 task filter_sequences_by_length {
   meta {
     description: "Filter sequences in a fasta file to enforce a minimum count of non-N bases."
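Note on the new string_split task: WDL 1.0 has no built-in string-split, so this task expands one TSV cell holding a delimited accession list into an Array[String]; the heredoc body above is a reconstruction (the original was garbled in transit) that writes one token per line so read_lines() can load the result. A minimal usage sketch follows — the demo workflow, its import path, and the literal inputs are hypothetical and not part of this changeset:

    version 1.0
    import "pipes/WDL/tasks/tasks_utils.wdl" as utils

    workflow string_split_demo {
        call utils.string_split {
            input:
                joined_string = "NC_026438.1;NC_026435.1;NC_026437.1",
                delimiter = ";"
        }
        output {
            # expected: ["NC_026438.1", "NC_026435.1", "NC_026437.1"]
            Array[String] accessions = string_split.tokens
        }
    }
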
diff --git a/pipes/WDL/workflows/classify_single.wdl b/pipes/WDL/workflows/classify_single.wdl
index 1bfd793ae..bea728228 100644
--- a/pipes/WDL/workflows/classify_single.wdl
+++ b/pipes/WDL/workflows/classify_single.wdl
@@ -121,6 +121,10 @@ workflow classify_single {
            trim_clip_db = trim_clip_db,
            always_succeed = true
    }
+    call metagenomics.report_primary_kraken_taxa {
+        input:
+            kraken_summary_report = kraken2.kraken2_summary_report
+    }
 
    output {
        File cleaned_reads_unaligned_bam = deplete.bam_filtered_to_taxa
@@ -134,6 +138,15 @@ workflow classify_single {
        File kraken2_summary_report = kraken2.kraken2_summary_report
        File kraken2_krona_plot = kraken2.krona_report_html
 
+        File   kraken2_top_taxa_report = report_primary_kraken_taxa.ranked_focal_report
+        String kraken2_focal_taxon_name = report_primary_kraken_taxa.focal_tax_name
+        Int    kraken2_focal_total_reads = report_primary_kraken_taxa.total_focal_reads
+        String kraken2_top_taxon_id = report_primary_kraken_taxa.tax_id
+        String kraken2_top_taxon_name = report_primary_kraken_taxa.tax_name
+        String kraken2_top_taxon_rank = report_primary_kraken_taxa.tax_rank
+        Int    kraken2_top_taxon_num_reads = report_primary_kraken_taxa.num_reads
+        Float  kraken2_top_taxon_pct_of_focal = report_primary_kraken_taxa.percent_of_focal
+
        File raw_fastqc = merge_raw_reads.fastqc
        File cleaned_fastqc = fastqc_cleaned.fastqc_html
        File spikein_report = spikein.report
diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl
index eac47dc1f..a2dd28723 100644
--- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl
+++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl
@@ -1,6 +1,7 @@
 version 1.0
 
 import "../tasks/tasks_assembly.wdl" as assembly
+import "../tasks/tasks_metagenomics.wdl" as metagenomics
 import "../tasks/tasks_ncbi.wdl" as ncbi
 import "../tasks/tasks_utils.wdl" as utils
 import "assemble_refbased.wdl" as assemble_refbased
@@ -17,39 +18,45 @@ workflow scaffold_and_refine_multitaxa {
        String sample_id
        File   reads_unmapped_bam
 
-        Array[Pair[Int,Array[String]+]] taxid_to_ref_accessions = [
-            (208893, ["KY654518.1"]), # RSV-A
-            (208895, ["MZ516105.1"]), # RSV-B
-            (573824, ["NC_038311.1"]), # Rhino A1
-            (185900, ["ON311191.1"]), # Rhino B27
-            (1418033, ["ON311169.1"]), # Rhino C15
-            (463676, ["JN837686.2"]), # Rhino C45
-            (11137, ["NC_002645.1"]), # HCoV 229E
-            (290028, ["NC_006577.2"]), # HCoV HKU1
-            (277944, ["NC_005831.2"]), # HCoV NL63
-            (31631, ["NC_006213.1"]), # HCoV OC43
-            (2697049, ["NC_045512.2"]), # SARS-CoV-2 Wuhan Hu-1
-            (641809, ["NC_026438.1", "NC_026435.1", "NC_026437.1", "NC_026433.1", "NC_026436.1", "NC_026434.1", "NC_026431.1", "NC_026432.1"]), # Flu A/California/07/2009 H1N1
-            (335341, ["NC_007373.1", "NC_007372.1", "NC_007371.1", "NC_007366.1", "NC_007369.1", "NC_007368.1", "NC_007367.1", "NC_007370.1"]), # Flu A/New York/392/2004 H3N2
-            (518987, ["NC_002204.1", "NC_002205.1", "NC_002206.1", "NC_002207.1", "NC_002208.1", "NC_002209.1", "NC_002210.1", "NC_002211.1"]), # Flu B/Lee/1940
-            (162145, ["NC_039199.1"]), # metapneumo
-            (12730, ["NC_003461.1"]), # paraflu 1
-            (2560525, ["NC_003443.1"]), # paraflu 2
-            (11216, ["NC_001796.2"]), # paraflu 3
-            (11224, ["NC_021928.1"]), # paraflu 4
-            (129951, ["NC_001405.1"]) # adenovirus C
-        ]
+        File  taxid_to_ref_accessions_tsv
+        File? focal_report_tsv
+        File? ncbi_taxdump_tgz
 
        # Float min_pct_reference_covered = 0.1
    }
 
-    Array[String] assembly_header = ["sample_id", "taxid", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "percent_reference_covered", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned"]
+    Int min_scaffold_unambig = 10
+
+    # if kraken reports are available, filter scaffold list to observed hits (output might be empty!)
+    if(defined(focal_report_tsv) && defined(ncbi_taxdump_tgz)) {
+        call metagenomics.filter_refs_to_found_taxa {
+            input:
+                taxid_to_ref_accessions_tsv = taxid_to_ref_accessions_tsv,
+                taxdump_tgz = select_first([ncbi_taxdump_tgz]),
+                focal_report_tsv = select_first([focal_report_tsv])
+        }
+    }
+
+    Array[String] assembly_header = ["sample_id", "taxid", "tax_name", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "percent_reference_covered", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned"]
+    Array[Array[String]] taxid_to_ref_accessions = read_tsv(select_first([filter_refs_to_found_taxa.filtered_taxid_to_ref_accessions_tsv, taxid_to_ref_accessions_tsv]))
 
    scatter(taxon in taxid_to_ref_accessions) {
+        # cromwell's read_tsv emits [[""]] on empty (0-byte) file input, turn it into []
+        if(length(taxon)>1) {
+            Array[String] taxid_to_ref_accessions_fix_cromwell_read_tsv_bug = taxon
+        }
+    }
+
+    scatter(taxon in select_all(taxid_to_ref_accessions_fix_cromwell_read_tsv_bug)) {
+        # taxon = [taxid, taxname, semicolon_delim_accession_list]
+        call utils.string_split {
+            input:
+                joined_string = taxon[2],
+                delimiter = ";"
+        }
        call ncbi.download_annotations {
            input:
-                accessions = taxon.right,
-                combined_out_prefix = taxon.left
+                accessions = string_split.tokens,
+                combined_out_prefix = taxon[0]
        }
        call assembly.scaffold {
            input:
@@ -59,41 +66,64 @@ workflow scaffold_and_refine_multitaxa {
                min_unambig = 0,
                allow_incomplete_output = true
        }
-        call assemble_refbased.assemble_refbased as refine {
-            input:
-                reads_unmapped_bams = [reads_unmapped_bam],
-                reference_fasta = scaffold.scaffold_fasta,
-                sample_name = sample_id
+        if (scaffold.assembly_preimpute_length_unambiguous > min_scaffold_unambig) {
+            # polish de novo assembly with reads
+            call assemble_refbased.assemble_refbased as refine {
+                input:
+                    reads_unmapped_bams = [reads_unmapped_bam],
+                    reference_fasta = scaffold.scaffold_fasta,
+                    sample_name = sample_id
+            }
+            String assembly_method_denovo = "viral-ngs/assemble_denovo"
        }
-        # to do: if percent_reference_covered > some threshold, run ncbi.rename_fasta_header and ncbi.align_and_annot_transfer_single
-        # to do: if biosample attributes file provided, run ncbi.biosample_to_genbank
+        if (scaffold.assembly_preimpute_length_unambiguous <= min_scaffold_unambig) {
+            # fall back to refbased assembly if de novo fails
+            call assemble_refbased.assemble_refbased as ref_based {
+                input:
+                    reads_unmapped_bams = [reads_unmapped_bam],
+                    reference_fasta = download_annotations.combined_fasta,
+                    sample_name = sample_id
+            }
+            String assembly_method_refbased = "viral-ngs/assemble_refbased"
+        }
+
+        # TO DO: if percent_reference_covered > some threshold, run ncbi.rename_fasta_header and ncbi.align_and_annot_transfer_single
+        # TO DO: if biosample attributes file provided, run ncbi.biosample_to_genbank
+        String taxid = taxon[0]
+        String tax_name = taxon[1]
+        Int   assembly_length_unambiguous = select_first([refine.assembly_length_unambiguous, ref_based.assembly_length_unambiguous])
+        Float percent_reference_covered = 1.0 * assembly_length_unambiguous / scaffold.reference_length
+        File  assembly_fasta = select_first([refine.assembly_fasta, ref_based.assembly_fasta])
 
        Map[String, String] stats_by_taxon = {
            "sample_id" : sample_id,
-            "taxid" : taxon.left,
-
-            "assembly_fasta" : refine.assembly_fasta,
-            "aligned_only_reads_bam" : refine.align_to_self_merged_aligned_only_bam,
-            "coverage_plot" : refine.align_to_self_merged_coverage_plot,
-            "assembly_length" : refine.assembly_length,
-            "assembly_length_unambiguous" : refine.assembly_length_unambiguous,
-            "reads_aligned" : refine.align_to_self_merged_reads_aligned,
-            "mean_coverage" : refine.align_to_self_merged_mean_coverage,
-            "percent_reference_covered" : 1.0 * refine.assembly_length_unambiguous / refine.reference_genome_length,
-
-            "intermediate_gapfill_fasta" : scaffold.intermediate_gapfill_fasta,
+            "taxid" : taxid,
+            "tax_name" : tax_name,
+
+            "assembly_fasta" : assembly_fasta,
+            "aligned_only_reads_bam" : select_first([refine.align_to_self_merged_aligned_only_bam, ref_based.align_to_self_merged_aligned_only_bam]),
+            "coverage_plot" : select_first([refine.align_to_self_merged_coverage_plot, ref_based.align_to_self_merged_coverage_plot]),
+            "assembly_length" : select_first([refine.assembly_length, ref_based.assembly_length]),
+            "assembly_length_unambiguous" : assembly_length_unambiguous,
+            "reads_aligned" : select_first([refine.align_to_self_merged_reads_aligned, ref_based.align_to_self_merged_reads_aligned]),
+            "mean_coverage" : select_first([refine.align_to_self_merged_mean_coverage, ref_based.align_to_self_merged_mean_coverage]),
+            "percent_reference_covered" : percent_reference_covered,
+
+            "intermediate_gapfill_fasta" : scaffold.intermediate_gapfill_fasta,
            "assembly_preimpute_length_unambiguous" : scaffold.assembly_preimpute_length_unambiguous,
-            "replicate_concordant_sites" : refine.replicate_concordant_sites,
-            "replicate_discordant_snps" : refine.replicate_discordant_snps,
-            "replicate_discordant_indels" : refine.replicate_discordant_indels,
-            "replicate_discordant_vcf" : refine.replicate_discordant_vcf,
+            "replicate_concordant_sites" : select_first([refine.replicate_concordant_sites, ref_based.replicate_concordant_sites]),
+            "replicate_discordant_snps" : select_first([refine.replicate_discordant_snps, ref_based.replicate_discordant_snps]),
+            "replicate_discordant_indels" : select_first([refine.replicate_discordant_indels, ref_based.replicate_discordant_indels]),
+            "replicate_discordant_vcf" : select_first([refine.replicate_discordant_vcf, ref_based.replicate_discordant_vcf]),
 
-            "isnvsFile" : refine.align_to_self_isnvs_vcf,
-            "aligned_bam" : refine.align_to_self_merged_aligned_only_bam,
-            "coverage_tsv" : refine.align_to_self_merged_coverage_tsv,
-            "read_pairs_aligned" : refine.align_to_self_merged_read_pairs_aligned,
-            "bases_aligned" : refine.align_to_self_merged_bases_aligned
+            "isnvsFile" : select_first([refine.align_to_self_isnvs_vcf, ref_based.align_to_self_isnvs_vcf]),
+            "aligned_bam" : select_first([refine.align_to_self_merged_aligned_only_bam, ref_based.align_to_self_merged_aligned_only_bam]),
+            "coverage_tsv" : select_first([refine.align_to_self_merged_coverage_tsv, ref_based.align_to_self_merged_coverage_tsv]),
+            "read_pairs_aligned" : select_first([refine.align_to_self_merged_read_pairs_aligned, ref_based.align_to_self_merged_read_pairs_aligned]),
+            "bases_aligned" : select_first([refine.align_to_self_merged_bases_aligned, ref_based.align_to_self_merged_bases_aligned]),
+
+            "assembly_method" : select_first([assembly_method_denovo, assembly_method_refbased])
        }
 
        scatter(h in assembly_header) {
@@ -109,14 +139,19 @@ workflow scaffold_and_refine_multitaxa {
    }
 
    output {
-        Array[Map[String,String]] assembly_stats_by_taxon = stats_by_taxon
-        File assembly_stats_by_taxon_tsv = concatenate.combined
-
-        Int num_read_groups = refine.num_read_groups[0]
-        Int num_libraries = refine.num_libraries[0]
-
-        String assembly_method = "viral-ngs/scaffold_and_refine_multitaxa"
-        String scaffold_viral_assemble_version = scaffold.viralngs_version[0]
-        String refine_viral_assemble_version = refine.viral_assemble_version[0]
+        Array[Map[String,String]] assembly_stats_by_taxon = stats_by_taxon
+        File   assembly_stats_by_taxon_tsv = concatenate.combined
+        String assembly_method = "viral-ngs/scaffold_and_refine_multitaxa"
+
+        ## TO DO: some summary stats on stats_by_taxon: how many rows, numbers from the best row, etc
+        #String assembly_top_taxon_id =
+        #String assembly_top_length_unambiguous =
+        #Float  assembly_top_pct_ref_cov =
+        #File   assembly_top_fasta =
+        Array[String] assembly_all_taxids = taxid
+        Array[String] assembly_all_taxnames = tax_name
+        Array[Int]    assembly_all_lengths_unambig = assembly_length_unambiguous
+        Array[Float]  assembly_all_pct_ref_cov = percent_reference_covered
+        Array[File]   assembly_all_fastas = assembly_fasta
    }
 }
diff --git a/requirements-modules.txt b/requirements-modules.txt
index 2cf41a97a..d0f852682 100644
--- a/requirements-modules.txt
+++ b/requirements-modules.txt
@@ -1,6 +1,6 @@
 broadinstitute/viral-core=2.2.4
 broadinstitute/viral-assemble=2.2.4.0
-broadinstitute/viral-classify=2.2.3.0
+broadinstitute/viral-classify=2.2.4.0
 broadinstitute/viral-phylo=2.1.20.2
 broadinstitute/py3-bio=0.1.2
 broadinstitute/beast-beagle-cuda=1.10.5pre
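
Note on the double scatter in scaffold_and_refine_multitaxa.wdl: as the in-line comment says, Cromwell's read_tsv yields [[""]] for a 0-byte file — which filter_refs_to_found_taxa can legitimately produce when no taxa are found — so the first scatter marks only rows with more than one column as optional values, and select_all then drops the unset ones. A self-contained sketch of the idiom, with a hypothetical toy workflow name and input:

    version 1.0

    workflow drop_empty_rows_demo {
        input {
            File tsv   # may be 0 bytes; read_tsv then yields [[""]] under Cromwell
        }
        scatter (row in read_tsv(tsv)) {
            if (length(row) > 1) {
                Array[String] kept_row = row   # defined only for real, multi-column rows
            }
        }
        output {
            # an empty input file now maps to [] instead of one bogus [""] row
            Array[Array[String]] clean_rows = select_all(kept_row)
        }
    }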
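
Note on the refine/ref_based branching: it uses the standard WDL fallback idiom — two mutually exclusive if blocks each declare values that become optional at workflow scope, and select_first collapses each pair to whichever branch actually ran. A reduced sketch with toy inputs (the threshold mirrors min_scaffold_unambig = 10 above; the demo workflow itself is hypothetical):

    version 1.0

    workflow fallback_demo {
        input {
            Int preimpute_length_unambiguous = 12
            Int min_scaffold_unambig = 10
        }
        if (preimpute_length_unambiguous > min_scaffold_unambig) {
            String method_denovo = "viral-ngs/assemble_denovo"
        }
        if (preimpute_length_unambiguous <= min_scaffold_unambig) {
            String method_refbased = "viral-ngs/assemble_refbased"
        }
        output {
            # exactly one branch evaluated, so select_first always finds a value
            String assembly_method = select_first([method_denovo, method_refbased])
        }
    }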