From d3cfaa0b0bbf26054a40daf4b00865cf968272bb Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 20 Jun 2020 08:49:09 -0400 Subject: [PATCH 01/13] remove file ext --- pipes/WDL/workflows/diff_genome_sets.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/workflows/diff_genome_sets.wdl b/pipes/WDL/workflows/diff_genome_sets.wdl index 6d9b3c9aa..c30a02991 100644 --- a/pipes/WDL/workflows/diff_genome_sets.wdl +++ b/pipes/WDL/workflows/diff_genome_sets.wdl @@ -21,7 +21,7 @@ workflow diff_genome_sets { call reports.tsv_stack { input: input_tsvs = compare_two_genomes.comparison_table, - out_basename = "diff_genome_sets.txt" + out_basename = "diff_genome_sets" } output { From 60cbad2e1c889ba1c6f9d653b1c7a876fdd476dd Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 20 Jun 2020 10:42:50 -0400 Subject: [PATCH 02/13] remove task align_and_count_summary and convert to tsv_stack --- pipes/WDL/tasks/tasks_reports.wdl | 30 ------------------- .../align_and_count_multiple_report.wdl | 16 +++++----- pipes/WDL/workflows/classify_multi.wdl | 7 +++-- pipes/WDL/workflows/demux_metag.wdl | 7 +++-- pipes/WDL/workflows/demux_plus.wdl | 7 +++-- 5 files changed, 20 insertions(+), 47 deletions(-) diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl index c16319367..caa92079e 100644 --- a/pipes/WDL/tasks/tasks_reports.wdl +++ b/pipes/WDL/tasks/tasks_reports.wdl @@ -187,36 +187,6 @@ task align_and_count { } } -task align_and_count_summary { - input { - Array[File]+ counts_txt - - String? output_prefix="count_summary" - - String docker="quay.io/broadinstitute/viral-core" - } - - command { - set -ex -o pipefail - - reports.py --version | tee VERSION - reports.py aggregate_alignment_counts ${sep=' ' counts_txt} "${output_prefix}".tsv --loglevel=DEBUG - } - - output { - File count_summary = "${output_prefix}.tsv" - String viralngs_version = read_string("VERSION") - } - - runtime { - memory: "3 GB" - cpu: 2 - docker: "${docker}" - disks: "local-disk 50 HDD" - dx_instance_type: "mem1_ssd1_v2_x2" - } -} - task aggregate_metagenomics_reports { input { Array[File]+ kraken_summary_reports diff --git a/pipes/WDL/workflows/align_and_count_multiple_report.wdl b/pipes/WDL/workflows/align_and_count_multiple_report.wdl index d963b3f45..5620c0cef 100644 --- a/pipes/WDL/workflows/align_and_count_multiple_report.wdl +++ b/pipes/WDL/workflows/align_and_count_multiple_report.wdl @@ -33,20 +33,20 @@ workflow align_and_count_multiple_report { } } - call reports.align_and_count_summary { + call reports.tsv_stack as align_and_count_summary { input: - counts_txt = align_and_count.report + input_tsvs = align_and_count.report, + out_basename = "count_summary" } - call reports.align_and_count_summary as align_and_count_summary_top_hits { + call reports.tsv_stack as align_and_count_summary_top_hits { input: - counts_txt = align_and_count.report_top_hits, - output_prefix = "count_summary_top_hits" + input_tsvs = align_and_count.report_top_hits, + out_basename = "count_summary_top_hits" } output { - File report = align_and_count_summary.count_summary - File report_top_hits = align_and_count_summary_top_hits.count_summary - String viral_core_version = align_and_count_summary.viralngs_version + File report = align_and_count_summary.out_tsv + File report_top_hits = align_and_count_summary_top_hits.out_tsv } } diff --git a/pipes/WDL/workflows/classify_multi.wdl b/pipes/WDL/workflows/classify_multi.wdl index a8634cb5f..59f6569be 100644 --- a/pipes/WDL/workflows/classify_multi.wdl +++ b/pipes/WDL/workflows/classify_multi.wdl @@ -126,9 +126,10 @@ workflow classify_multi { file_name = "multiqc-dedup.html" } - call reports.align_and_count_summary as spike_summary { + call reports.tsv_stack as spike_summary { input: - counts_txt = spikein.report + input_tsvs = spikein.report, + out_basename = "spikein_summary" } call reports.aggregate_metagenomics_reports as metag_summary_report { @@ -157,7 +158,7 @@ workflow classify_multi { File multiqc_report_raw = multiqc_raw.multiqc_report File multiqc_report_cleaned = multiqc_cleaned.multiqc_report File multiqc_report_dedup = multiqc_dedup.multiqc_report - File spikein_counts = spike_summary.count_summary + File spikein_counts = spike_summary.out_tsv File kraken2_merged_krona = krona_merge_kraken2.krona_report_html File kraken2_summary = metag_summary_report.krakenuniq_aggregate_taxlevel_summary diff --git a/pipes/WDL/workflows/demux_metag.wdl b/pipes/WDL/workflows/demux_metag.wdl index 02bb802a0..870d35ea2 100644 --- a/pipes/WDL/workflows/demux_metag.wdl +++ b/pipes/WDL/workflows/demux_metag.wdl @@ -79,9 +79,10 @@ workflow demux_metag { file_name = "multiqc-dedup.html" } - call reports.align_and_count_summary as spike_summary { + call reports.tsv_stack as spike_summary { input: - counts_txt = spikein.report + input_tsvs = spikein.report, + out_basename = "spikein_summary" } call reports.aggregate_metagenomics_reports as metag_summary_report { @@ -119,7 +120,7 @@ workflow demux_metag { File multiqc_report_raw = multiqc_raw.multiqc_report File multiqc_report_cleaned = multiqc_cleaned.multiqc_report File multiqc_report_dedup = multiqc_dedup.multiqc_report - File spikein_counts = spike_summary.count_summary + File spikein_counts = spike_summary.out_tsv File kraken2_merged_krona = krona_merge_kraken2.krona_report_html File kraken2_summary = metag_summary_report.krakenuniq_aggregate_taxlevel_summary File blastx_merged_krona = krona_merge_blastx.krona_report_html diff --git a/pipes/WDL/workflows/demux_plus.wdl b/pipes/WDL/workflows/demux_plus.wdl index 7092d615e..b22ffbb6a 100644 --- a/pipes/WDL/workflows/demux_plus.wdl +++ b/pipes/WDL/workflows/demux_plus.wdl @@ -58,9 +58,10 @@ workflow demux_plus { reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams } - call reports.align_and_count_summary as spike_summary { + call reports.tsv_stack as spike_summary { input: - counts_txt = spikein.report + input_tsvs = spikein.report, + out_basename = "spikein_summary" } call reports.aggregate_metagenomics_reports as metag_summary_report { @@ -83,7 +84,7 @@ workflow demux_plus { File multiqc_report_raw = multiqc_raw.multiqc_report File multiqc_report_cleaned = multiqc_cleaned.multiqc_report - File spikein_counts = spike_summary.count_summary + File spikein_counts = spike_summary.out_tsv File metagenomics_krona = krakenuniq.krona_report_merged_html File metagenomics_summary = metag_summary_report.krakenuniq_aggregate_taxlevel_summary Array[File] krakenuniq_classified_reads = krakenuniq.krakenuniq_classified_reads From bc9c723f71f1345a3686615107fb18eaf2dd41a4 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 20 Jun 2020 10:48:37 -0400 Subject: [PATCH 03/13] convert align_and_count to minimap2 from bwa, convert tsv_stack and tsv_join from 3rd party docker to viral-core --- pipes/WDL/tasks/tasks_reports.wdl | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl index caa92079e..bd9378d71 100644 --- a/pipes/WDL/tasks/tasks_reports.wdl +++ b/pipes/WDL/tasks/tasks_reports.wdl @@ -145,7 +145,6 @@ task align_and_count { input { File reads_bam File ref_db - Int? minScoreToFilter Int? topNHits = 3 Int? machine_mem_gb @@ -161,11 +160,10 @@ task align_and_count { read_utils.py --version | tee VERSION ln -s ${reads_bam} ${reads_basename}.bam - read_utils.py bwamem_idxstats \ + read_utils.py minimap2_idxstats \ ${reads_basename}.bam \ ${ref_db} \ --outStats ${reads_basename}.count.${ref_basename}.txt.unsorted \ - ${'--minScoreToFilter=' + minScoreToFilter} \ --loglevel=DEBUG sort -b -r -n -k3 ${reads_basename}.count.${ref_basename}.txt.unsorted > ${reads_basename}.count.${ref_basename}.txt @@ -179,7 +177,7 @@ task align_and_count { } runtime { - memory: select_first([machine_mem_gb, 7]) + " GB" + memory: select_first([machine_mem_gb, 3]) + " GB" cpu: 4 docker: "${docker}" disks: "local-disk 375 LOCAL" @@ -337,7 +335,7 @@ task tsv_join { String join_type="inner" String out_basename - String docker="stratdat/csvkit" + String docker="quay.io/broadinstitute/viral-core" } command { @@ -378,7 +376,7 @@ task tsv_stack { input { Array[File]+ input_tsvs String out_basename - String docker="stratdat/csvkit" + String docker="quay.io/broadinstitute/viral-core" } command { From 14371b7bf8d44e7c0a380a114230bb0d44e59c15 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 20 Jun 2020 10:58:57 -0400 Subject: [PATCH 04/13] bump viral-core to 2.1.4 --- requirements-modules.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-modules.txt b/requirements-modules.txt index 37dd691e2..a7cc0b358 100644 --- a/requirements-modules.txt +++ b/requirements-modules.txt @@ -1,4 +1,4 @@ -broadinstitute/viral-core=2.1.3 +broadinstitute/viral-core=2.1.4 broadinstitute/viral-assemble=2.1.3.1 broadinstitute/viral-classify=2.1.3.1 broadinstitute/viral-phylo=2.1.3.1 From 65e98c0f9c98ce7a9881e1d3920f64ed473c5c42 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 20 Jun 2020 13:01:05 -0400 Subject: [PATCH 05/13] bump upstream modules --- requirements-modules.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-modules.txt b/requirements-modules.txt index a7cc0b358..98cd6e8f8 100644 --- a/requirements-modules.txt +++ b/requirements-modules.txt @@ -1,7 +1,7 @@ broadinstitute/viral-core=2.1.4 -broadinstitute/viral-assemble=2.1.3.1 +broadinstitute/viral-assemble=2.1.4.0 broadinstitute/viral-classify=2.1.3.1 -broadinstitute/viral-phylo=2.1.3.1 +broadinstitute/viral-phylo=2.1.4.0 broadinstitute/beast-beagle-cuda=1.10.5pre broadinstitute/ncbi-tools=2.10.7.0 nextstrain/base=build-20200608T223413Z From c75c06efe0c1386c5f9df607c86ed849356fbe7d Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 20 Jun 2020 13:27:55 -0400 Subject: [PATCH 06/13] bump upstream --- requirements-modules.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-modules.txt b/requirements-modules.txt index 98cd6e8f8..df4d77273 100644 --- a/requirements-modules.txt +++ b/requirements-modules.txt @@ -1,6 +1,6 @@ broadinstitute/viral-core=2.1.4 broadinstitute/viral-assemble=2.1.4.0 -broadinstitute/viral-classify=2.1.3.1 +broadinstitute/viral-classify=2.1.4.0 broadinstitute/viral-phylo=2.1.4.0 broadinstitute/beast-beagle-cuda=1.10.5pre broadinstitute/ncbi-tools=2.10.7.0 From fae99c1bcb5abf1bc93074541480bed512cba6af Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 20 Jun 2020 16:49:19 -0400 Subject: [PATCH 07/13] add task run_discordance --- pipes/WDL/tasks/tasks_assembly.wdl | 62 +++++++++++++++++++ pipes/WDL/workflows/assemble_refbased.wdl | 12 ++++ .../test_outputs-assemble_refbased-local.json | 4 +- 3 files changed, 77 insertions(+), 1 deletion(-) diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index 878ed920c..9a5c93a17 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -648,3 +648,65 @@ task refine_2x_and_plot { dx_instance_type: "mem1_ssd1_v2_x8" } } + +task run_discordance { + meta { + description: "This step evaluates discordance between sequencing runs of the same sample. The input is a merged, aligned BAM file for a single sample. If multiple runs (read groups) exist, we split the aligned reads by read group and separately evaluate consensus calls per read group using bcftools mpileup and call. A VCF is emitted that describes variation between runs." + } + + input { + File reads_aligned_bam + File reference_fasta + String out_basename = "run" + + String docker="quay.io/broadinstitute/viral-core" + } + + command { + set -ex -o pipefail + + read_utils.py --version | tee VERSION + + # create 2-col table with read group ids in both cols + python3 <0' > "${out_basename}.discordant.vcf" + cat "${out_basename}.discordant.vcf" | bcftools filter -i 'TYPE="snp"' | grep -v '^#' | wc -l | tee num_discordant_snps + cat "${out_basename}.discordant.vcf" | bcftools filter -i 'TYPE!="snp"' | grep -v '^#' | wc -l | tee num_discordant_indels + } + + output { + File discordant_sites_vcf = "${out_basename}.discordant.vcf" + Int concordant_sites = read_int("num_concordant") + Int discordant_snps = read_int("num_discordant_snps") + Int discordant_indels = read_int("num_discordant_indels") + String viralngs_version = read_string("VERSION") + } + + runtime { + docker: "${docker}" + memory: "3 GB" + cpu: 2 + disks: "local-disk 100 HDD" + dx_instance_type: "mem1_ssd1_v2_x2" + preemptible: 1 + } +} diff --git a/pipes/WDL/workflows/assemble_refbased.wdl b/pipes/WDL/workflows/assemble_refbased.wdl index 10f027543..46ad34a05 100644 --- a/pipes/WDL/workflows/assemble_refbased.wdl +++ b/pipes/WDL/workflows/assemble_refbased.wdl @@ -94,6 +94,13 @@ workflow assemble_refbased { out_basename = "${sample_name}.align_to_ref.trimmed" } + call assembly.run_discordance { + input: + reads_aligned_bam = merge_align_to_ref.out_bam, + reference_fasta = reference_fasta, + out_basename = sample_name + } + call reports.plot_coverage as plot_ref_coverage { input: aligned_reads_bam = merge_align_to_ref.out_bam, @@ -140,6 +147,11 @@ workflow assemble_refbased { Int reference_genome_length = plot_ref_coverage.assembly_length Float assembly_mean_coverage = plot_ref_coverage.mean_coverage + Int replicate_concordant_sites = run_discordance.concordant_sites + Int replicate_discordant_snps = run_discordance.discordant_snps + Int replicate_discordant_indels = run_discordance.discordant_indels + File replicate_discordant_vcf = run_discordance.discordant_sites_vcf + Array[File] align_to_ref_per_input_aligned_flagstat = align_to_ref.aligned_bam_flagstat Array[Int] align_to_ref_per_input_reads_provided = align_to_ref.reads_provided Array[Int] align_to_ref_per_input_reads_aligned = align_to_ref.reads_aligned diff --git a/test/input/WDL/test_outputs-assemble_refbased-local.json b/test/input/WDL/test_outputs-assemble_refbased-local.json index 5eb7bef8f..829701192 100644 --- a/test/input/WDL/test_outputs-assemble_refbased-local.json +++ b/test/input/WDL/test_outputs-assemble_refbased-local.json @@ -7,5 +7,7 @@ "assemble_refbased.align_to_self_merged_reads_aligned": 18409, "assemble_refbased.reference_genome_length": 18959, "assemble_refbased.assembly_length_unambiguous": 18889, - "assemble_refbased.assembly_length": 18889 + "assemble_refbased.assembly_length": 18889, + "assemble_refbased.replicate_discordant_indels": 0, + "assemble_refbased.replicate_discordant_snps": 0 } From 3962cd4ec3065f2e20ec7eb4831488af25818425 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 20 Jun 2020 17:04:18 -0400 Subject: [PATCH 08/13] Revert "remove task align_and_count_summary and convert to tsv_stack" This reverts commit 60cbad2e1c889ba1c6f9d653b1c7a876fdd476dd. --- pipes/WDL/tasks/tasks_reports.wdl | 30 +++++++++++++++++++ .../align_and_count_multiple_report.wdl | 16 +++++----- pipes/WDL/workflows/classify_multi.wdl | 7 ++--- pipes/WDL/workflows/demux_metag.wdl | 7 ++--- pipes/WDL/workflows/demux_plus.wdl | 7 ++--- 5 files changed, 47 insertions(+), 20 deletions(-) diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl index bd9378d71..03e178824 100644 --- a/pipes/WDL/tasks/tasks_reports.wdl +++ b/pipes/WDL/tasks/tasks_reports.wdl @@ -185,6 +185,36 @@ task align_and_count { } } +task align_and_count_summary { + input { + Array[File]+ counts_txt + + String? output_prefix="count_summary" + + String docker="quay.io/broadinstitute/viral-core" + } + + command { + set -ex -o pipefail + + reports.py --version | tee VERSION + reports.py aggregate_alignment_counts ${sep=' ' counts_txt} "${output_prefix}".tsv --loglevel=DEBUG + } + + output { + File count_summary = "${output_prefix}.tsv" + String viralngs_version = read_string("VERSION") + } + + runtime { + memory: "3 GB" + cpu: 2 + docker: "${docker}" + disks: "local-disk 50 HDD" + dx_instance_type: "mem1_ssd1_v2_x2" + } +} + task aggregate_metagenomics_reports { input { Array[File]+ kraken_summary_reports diff --git a/pipes/WDL/workflows/align_and_count_multiple_report.wdl b/pipes/WDL/workflows/align_and_count_multiple_report.wdl index 5620c0cef..d963b3f45 100644 --- a/pipes/WDL/workflows/align_and_count_multiple_report.wdl +++ b/pipes/WDL/workflows/align_and_count_multiple_report.wdl @@ -33,20 +33,20 @@ workflow align_and_count_multiple_report { } } - call reports.tsv_stack as align_and_count_summary { + call reports.align_and_count_summary { input: - input_tsvs = align_and_count.report, - out_basename = "count_summary" + counts_txt = align_and_count.report } - call reports.tsv_stack as align_and_count_summary_top_hits { + call reports.align_and_count_summary as align_and_count_summary_top_hits { input: - input_tsvs = align_and_count.report_top_hits, - out_basename = "count_summary_top_hits" + counts_txt = align_and_count.report_top_hits, + output_prefix = "count_summary_top_hits" } output { - File report = align_and_count_summary.out_tsv - File report_top_hits = align_and_count_summary_top_hits.out_tsv + File report = align_and_count_summary.count_summary + File report_top_hits = align_and_count_summary_top_hits.count_summary + String viral_core_version = align_and_count_summary.viralngs_version } } diff --git a/pipes/WDL/workflows/classify_multi.wdl b/pipes/WDL/workflows/classify_multi.wdl index 59f6569be..a8634cb5f 100644 --- a/pipes/WDL/workflows/classify_multi.wdl +++ b/pipes/WDL/workflows/classify_multi.wdl @@ -126,10 +126,9 @@ workflow classify_multi { file_name = "multiqc-dedup.html" } - call reports.tsv_stack as spike_summary { + call reports.align_and_count_summary as spike_summary { input: - input_tsvs = spikein.report, - out_basename = "spikein_summary" + counts_txt = spikein.report } call reports.aggregate_metagenomics_reports as metag_summary_report { @@ -158,7 +157,7 @@ workflow classify_multi { File multiqc_report_raw = multiqc_raw.multiqc_report File multiqc_report_cleaned = multiqc_cleaned.multiqc_report File multiqc_report_dedup = multiqc_dedup.multiqc_report - File spikein_counts = spike_summary.out_tsv + File spikein_counts = spike_summary.count_summary File kraken2_merged_krona = krona_merge_kraken2.krona_report_html File kraken2_summary = metag_summary_report.krakenuniq_aggregate_taxlevel_summary diff --git a/pipes/WDL/workflows/demux_metag.wdl b/pipes/WDL/workflows/demux_metag.wdl index 870d35ea2..02bb802a0 100644 --- a/pipes/WDL/workflows/demux_metag.wdl +++ b/pipes/WDL/workflows/demux_metag.wdl @@ -79,10 +79,9 @@ workflow demux_metag { file_name = "multiqc-dedup.html" } - call reports.tsv_stack as spike_summary { + call reports.align_and_count_summary as spike_summary { input: - input_tsvs = spikein.report, - out_basename = "spikein_summary" + counts_txt = spikein.report } call reports.aggregate_metagenomics_reports as metag_summary_report { @@ -120,7 +119,7 @@ workflow demux_metag { File multiqc_report_raw = multiqc_raw.multiqc_report File multiqc_report_cleaned = multiqc_cleaned.multiqc_report File multiqc_report_dedup = multiqc_dedup.multiqc_report - File spikein_counts = spike_summary.out_tsv + File spikein_counts = spike_summary.count_summary File kraken2_merged_krona = krona_merge_kraken2.krona_report_html File kraken2_summary = metag_summary_report.krakenuniq_aggregate_taxlevel_summary File blastx_merged_krona = krona_merge_blastx.krona_report_html diff --git a/pipes/WDL/workflows/demux_plus.wdl b/pipes/WDL/workflows/demux_plus.wdl index b22ffbb6a..7092d615e 100644 --- a/pipes/WDL/workflows/demux_plus.wdl +++ b/pipes/WDL/workflows/demux_plus.wdl @@ -58,10 +58,9 @@ workflow demux_plus { reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams } - call reports.tsv_stack as spike_summary { + call reports.align_and_count_summary as spike_summary { input: - input_tsvs = spikein.report, - out_basename = "spikein_summary" + counts_txt = spikein.report } call reports.aggregate_metagenomics_reports as metag_summary_report { @@ -84,7 +83,7 @@ workflow demux_plus { File multiqc_report_raw = multiqc_raw.multiqc_report File multiqc_report_cleaned = multiqc_cleaned.multiqc_report - File spikein_counts = spike_summary.out_tsv + File spikein_counts = spike_summary.count_summary File metagenomics_krona = krakenuniq.krona_report_merged_html File metagenomics_summary = metag_summary_report.krakenuniq_aggregate_taxlevel_summary Array[File] krakenuniq_classified_reads = krakenuniq.krakenuniq_classified_reads From ee3a445ae31483145cf273dc43a5efed9b7113e9 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 20 Jun 2020 17:08:39 -0400 Subject: [PATCH 09/13] dedent python inline code a bit --- pipes/WDL/tasks/tasks_assembly.wdl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index 9a5c93a17..7dfd192b7 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -669,12 +669,12 @@ task run_discordance { # create 2-col table with read group ids in both cols python3 < Date: Sat, 20 Jun 2020 17:31:12 -0400 Subject: [PATCH 10/13] huh, discordance in the test data! --- test/input/WDL/test_outputs-assemble_refbased-local.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/input/WDL/test_outputs-assemble_refbased-local.json b/test/input/WDL/test_outputs-assemble_refbased-local.json index 829701192..0b23c65e9 100644 --- a/test/input/WDL/test_outputs-assemble_refbased-local.json +++ b/test/input/WDL/test_outputs-assemble_refbased-local.json @@ -8,6 +8,6 @@ "assemble_refbased.reference_genome_length": 18959, "assemble_refbased.assembly_length_unambiguous": 18889, "assemble_refbased.assembly_length": 18889, - "assemble_refbased.replicate_discordant_indels": 0, - "assemble_refbased.replicate_discordant_snps": 0 + "assemble_refbased.replicate_discordant_indels": 3, + "assemble_refbased.replicate_discordant_snps": 18 } From a9d57fcaef726c12b76716c169f84daff407b578 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 20 Jun 2020 18:30:02 -0400 Subject: [PATCH 11/13] add allele depth to discrepancy vcf --- pipes/WDL/tasks/tasks_assembly.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index 7dfd192b7..cbfdb7ee9 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -679,10 +679,10 @@ task run_discordance { # bcftools call snps while treating each RG as a separate sample bcftools mpileup \ - -G readgroups.txt -d 10000 -Ou \ + -G readgroups.txt -d 10000 -a "FORMAT/AD" -q 1 -m 2 -Ou \ -f "${reference_fasta}" "${reads_aligned_bam}" \ | bcftools call \ - -m -P 0 --ploidy 1 \ + -m --ploidy 1 \ --threads `nproc` \ -Ov -o everything.vcf From 56751cf943dc23e390613312e2a1519482ac0ade Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 20 Jun 2020 19:33:18 -0400 Subject: [PATCH 12/13] fix --- pipes/WDL/tasks/tasks_assembly.wdl | 1 + test/input/WDL/test_outputs-assemble_refbased-local.json | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index cbfdb7ee9..3717b76ab 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -687,6 +687,7 @@ task run_discordance { -Ov -o everything.vcf # tally outputs + set +o pipefail # to handle empty grep cat everything.vcf | bcftools filter -i 'MAC=0' | grep -v '^#' | wc -l | tee num_concordant cat everything.vcf | bcftools filter -i 'MAC>0' > "${out_basename}.discordant.vcf" cat "${out_basename}.discordant.vcf" | bcftools filter -i 'TYPE="snp"' | grep -v '^#' | wc -l | tee num_discordant_snps diff --git a/test/input/WDL/test_outputs-assemble_refbased-local.json b/test/input/WDL/test_outputs-assemble_refbased-local.json index 0b23c65e9..3c8fdaf20 100644 --- a/test/input/WDL/test_outputs-assemble_refbased-local.json +++ b/test/input/WDL/test_outputs-assemble_refbased-local.json @@ -8,6 +8,6 @@ "assemble_refbased.reference_genome_length": 18959, "assemble_refbased.assembly_length_unambiguous": 18889, "assemble_refbased.assembly_length": 18889, - "assemble_refbased.replicate_discordant_indels": 3, - "assemble_refbased.replicate_discordant_snps": 18 + "assemble_refbased.replicate_discordant_indels": 0, + "assemble_refbased.replicate_discordant_snps": 7 } From 68e8900ccf17c0337b37dfe4718ccb6504a8e308 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Sat, 20 Jun 2020 21:42:23 -0400 Subject: [PATCH 13/13] remove spurious snps and indels by adding minimum read depth filter --- pipes/WDL/tasks/tasks_assembly.wdl | 14 +++++++++----- .../WDL/test_outputs-assemble_refbased-local.json | 2 +- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index 3717b76ab..ae5715e40 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -679,17 +679,21 @@ task run_discordance { # bcftools call snps while treating each RG as a separate sample bcftools mpileup \ - -G readgroups.txt -d 10000 -a "FORMAT/AD" -q 1 -m 2 -Ou \ + -G readgroups.txt -d 10000 -a "FORMAT/DP,FORMAT/AD" \ + -q 1 -m 2 -Ou \ -f "${reference_fasta}" "${reads_aligned_bam}" \ | bcftools call \ - -m --ploidy 1 \ - --threads `nproc` \ + -P 0 -m --ploidy 1 \ + --threads $(nproc) \ -Ov -o everything.vcf + # mask all GT calls when less than 3 reads + cat everything.vcf | bcftools filter -e 'FMT/DP<3' -S . > filtered.vcf + cat filtered.vcf | bcftools filter -i 'MAC>0' > "${out_basename}.discordant.vcf" + # tally outputs set +o pipefail # to handle empty grep - cat everything.vcf | bcftools filter -i 'MAC=0' | grep -v '^#' | wc -l | tee num_concordant - cat everything.vcf | bcftools filter -i 'MAC>0' > "${out_basename}.discordant.vcf" + cat filtered.vcf | bcftools filter -i 'MAC=0' | grep -v '^#' | wc -l | tee num_concordant cat "${out_basename}.discordant.vcf" | bcftools filter -i 'TYPE="snp"' | grep -v '^#' | wc -l | tee num_discordant_snps cat "${out_basename}.discordant.vcf" | bcftools filter -i 'TYPE!="snp"' | grep -v '^#' | wc -l | tee num_discordant_indels } diff --git a/test/input/WDL/test_outputs-assemble_refbased-local.json b/test/input/WDL/test_outputs-assemble_refbased-local.json index 3c8fdaf20..829701192 100644 --- a/test/input/WDL/test_outputs-assemble_refbased-local.json +++ b/test/input/WDL/test_outputs-assemble_refbased-local.json @@ -9,5 +9,5 @@ "assemble_refbased.assembly_length_unambiguous": 18889, "assemble_refbased.assembly_length": 18889, "assemble_refbased.replicate_discordant_indels": 0, - "assemble_refbased.replicate_discordant_snps": 7 + "assemble_refbased.replicate_discordant_snps": 0 }