Skip to content

Commit

Permalink
Add VCF input option to Vapor WDL (#771)
Browse files Browse the repository at this point in the history
  • Loading branch information
mwalker174 authored Feb 4, 2025
1 parent d9a905e commit c91ae82
Show file tree
Hide file tree
Showing 6 changed files with 125 additions and 20 deletions.
9 changes: 9 additions & 0 deletions .github/.dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,15 @@ workflows:
tags:
- /.*/

- subclass: WDL
name: Vapor
primaryDescriptorPath: /wdl/Vapor.wdl
filters:
branches:
- main
tags:
- /.*/

- subclass: WDL
name: VisualizeCnvs
primaryDescriptorPath: /wdl/VisualizeCnvs.wdl
Expand Down
1 change: 0 additions & 1 deletion inputs/templates/test/Vapor/Vapor.json.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
"Vapor.ref_dict": {{ reference_resources.reference_dict | tojson }},
"Vapor.save_plots": "false",

"Vapor.prefix": {{ test_batch.example_pacbio_sample_id | tojson }},
"Vapor.sample_id": {{ test_batch.example_pacbio_sample_id | tojson }},
"Vapor.bam_or_cram_file": {{ test_batch.example_pacbio_cram | tojson }},
"Vapor.bam_or_cram_index": {{ test_batch.example_pacbio_cram_index | tojson }},
Expand Down
6 changes: 3 additions & 3 deletions src/sv-pipeline/scripts/preprocess_bed_for_vapor.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,13 @@ def handle_header(line, columns, fields, default_num_columns, sample_to_extract)
for i, name in enumerate(fields[default_num_columns:]):
columns[name] = default_num_columns + i
if "SVLEN" not in columns:
logging.warning("SVLEN column not found. Will not be able to add SVLEN info to INS events")
raise ValueError("SVLEN column not found in header")
else:
logging.warning("Header not found. Will not be able to add SVLEN info to INS events")
raise ValueError("Header not found. Header must exist and start with #")
if len(fields) >= default_num_columns:
columns['samples'] = default_num_columns # if no header but extra fields, assume samples is next column
if sample_to_extract is not None and "samples" not in columns:
logging.warning("Sample to extract provided but no samples column found")
raise ValueError("Sample to extract provided but no samples column found")


def reformat(bed_in, bed_out, contig, sample_to_extract):
Expand Down
66 changes: 66 additions & 0 deletions wdl/Utils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -783,6 +783,72 @@ task SubsetVcfBySamplesList {
}
}
# Subset a VCF to a specific sample.
# Uses `bcftools view -s` to keep (default) or remove a single sample, and can
# optionally pipe through a second bcftools pass that drops sites left with no
# non-ref genotypes among the remaining samples (CNV records are always kept,
# since their evidence is not genotype-based in the same way).
# Emits a bgzipped VCF plus its tabix (.tbi) index.
task SubsetVcfToSample {
input {
File vcf # Input VCF; expected to be bgzipped (output name is derived by stripping ".vcf.gz")
File? vcf_idx # Optional index for the input VCF; localized alongside it but not referenced directly in the command
String sample # Sample ID to keep (or remove, depending on remove_sample)
String? outfile_name # Optional output filename; defaults to "<input basename>.subset.vcf.gz"
Boolean remove_sample = false # If false (default), keep the sample. If true, remove it.
Boolean remove_private_sites = true # If true (default), remove sites that are private to excluded samples. If false, keep sites even if no remaining samples are non-ref.
Boolean keep_af = true # If true (default), do not recalculate allele frequencies (AC/AF/AN)
String sv_base_mini_docker # Docker image providing bcftools and tabix
RuntimeAttr? runtime_attr_override
}
String vcf_subset_filename = select_first([outfile_name, basename(vcf, ".vcf.gz") + ".subset.vcf.gz"])
String vcf_subset_idx_filename = vcf_subset_filename + ".tbi"
# NOTE: not a plain flag — this interpolates an entire shell pipe stage
# (" | bcftools view -e ...") after the input VCF, so the trailing -O/-o
# options in the command below apply to the LAST bcftools in the pipeline.
String remove_private_sites_flag = if remove_private_sites then " | bcftools view -e 'SVTYPE!=\"CNV\" && COUNT(GT=\"alt\")==0' " else ""
String keep_af_flag = if keep_af then "--no-update" else ""
# bcftools interprets a leading "^" on -s as complement, i.e. exclude the listed sample.
String complement_flag = if remove_sample then "^" else ""
# Disk must be scaled proportionally to the size of the VCF
Float input_size = size(vcf, "GiB")
RuntimeAttr default_attr = object {
mem_gb: 3.75,
disk_gb: ceil(10.0 + (input_size * 2)),
cpu_cores: 1,
preemptible_tries: 3,
max_retries: 1,
boot_disk_gb: 10
}
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
command <<<
set -euo pipefail
bcftools view \
-s ~{complement_flag}~{sample} \
--force-samples \
~{keep_af_flag} \
~{vcf} \
~{remove_private_sites_flag} \
-O z \
-o ~{vcf_subset_filename}
tabix -f -p vcf ~{vcf_subset_filename}
>>>
output {
File vcf_subset = vcf_subset_filename
File vcf_subset_index = vcf_subset_idx_filename
}

runtime {
cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
docker: sv_base_mini_docker
preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
}
}
task VcfToBed {

input {
Expand Down
54 changes: 43 additions & 11 deletions wdl/Vapor.wdl
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
version 1.0

import "Utils.wdl" as utils
import "Structs.wdl"

workflow Vapor {
input {
String prefix
String sample_id
File bam_or_cram_file
File bam_or_cram_index
File bed_file
String sample_id

# One of the following must be specified. May be single- or multi-sample.
File? bed_file
File? vcf_file

Boolean save_plots # Control whether plots are final output
Expand All @@ -21,31 +24,53 @@ workflow Vapor {
String sv_base_mini_docker
String sv_pipeline_docker

RuntimeAttr? runtime_attr_subset_sample
RuntimeAttr? runtime_attr_vcf_to_bed
RuntimeAttr? runtime_attr_vapor
RuntimeAttr? runtime_attr_bcf2vcf
RuntimeAttr? runtime_attr_vcf2bed
RuntimeAttr? runtime_attr_split_vcf
RuntimeAttr? runtime_attr_concat_beds
RuntimeAttr? runtime_attr_LocalizeCram
File? NONE_FILE_ # Create a null file - do not use this input
}
# Convert vcf to bed if provided
if (defined(vcf_file) && !defined(bed_file)) {

call utils.SubsetVcfToSample {
input:
vcf=select_first([vcf_file]),
vcf_idx=select_first([vcf_file]) + ".tbi",
sample=sample_id,
outfile_name=sample_id,
sv_base_mini_docker=sv_base_mini_docker,
runtime_attr_override = runtime_attr_subset_sample
}
call utils.VcfToBed {
input:
vcf_file = SubsetVcfToSample.vcf_subset,
args = "-i SVLEN",
variant_interpretation_docker = sv_pipeline_docker,
runtime_attr_override = runtime_attr_vcf_to_bed
}
}
scatter (contig in read_lines(contigs)) {

call PreprocessBedForVapor {
input:
prefix = "~{prefix}.~{contig}.preprocess",
prefix = "~{sample_id}.~{contig}.preprocess",
contig = contig,
sample_to_extract = sample_id,
bed_file = bed_file,
bed_file = select_first([bed_file, VcfToBed.bed_output]),
sv_pipeline_docker = sv_pipeline_docker,
runtime_attr_override = runtime_attr_split_vcf
}
call RunVaporWithCram {
input:
prefix = "~{prefix}.~{contig}",
prefix = "~{sample_id}.~{contig}",
contig = contig,
bam_or_cram_file = bam_or_cram_file,
bam_or_cram_index = bam_or_cram_index,
Expand All @@ -62,7 +87,7 @@ workflow Vapor {
input:
shard_bed_files = RunVaporWithCram.vapor,
shard_plots = RunVaporWithCram.vapor_plot,
prefix = prefix,
prefix = sample_id,
sv_base_mini_docker = sv_base_mini_docker,
runtime_attr_override = runtime_attr_concat_beds
}
Expand Down Expand Up @@ -101,9 +126,16 @@ task PreprocessBedForVapor {

command <<<
set -euo pipefail

if [[ ~{bed_file} == *.gz ]]; then
gunzip -c ~{bed_file} > in.bed
else
cp ~{bed_file} in.bed
fi

python /opt/sv-pipeline/scripts/preprocess_bed_for_vapor.py \
--contig ~{contig} \
--bed-in ~{bed_file} \
--bed-in in.bed \
--bed-out ~{prefix}.bed \
~{"-s " + sample_to_extract}
>>>
Expand Down
9 changes: 4 additions & 5 deletions wdl/VaporBatch.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,16 @@ workflow VaporBatch {

Boolean save_plots

RuntimeAttr? runtime_attr_subset_sample
RuntimeAttr? runtime_attr_vcf_to_bed
RuntimeAttr? runtime_attr_vapor
RuntimeAttr? runtime_attr_bcf2vcf
RuntimeAttr? runtime_attr_vcf2bed
RuntimeAttr? runtime_attr_split_vcf
RuntimeAttr? runtime_attr_concat_beds
}
scatter (i in range(length(bam_or_cram_files))) {
call vapor_bed.Vapor {
input:
prefix = samples[i],
bam_or_cram_file = bam_or_cram_files[i],
bam_or_cram_index = bam_or_cram_indexes[i],
bed_file = bed_file,
Expand All @@ -45,8 +44,8 @@ workflow VaporBatch {
sv_base_mini_docker = sv_base_mini_docker,
sv_pipeline_docker = sv_pipeline_docker,
runtime_attr_vapor = runtime_attr_vapor,
runtime_attr_bcf2vcf = runtime_attr_bcf2vcf,
runtime_attr_vcf2bed = runtime_attr_vcf2bed,
runtime_attr_subset_sample = runtime_attr_subset_sample,
runtime_attr_vcf_to_bed = runtime_attr_vcf_to_bed,
runtime_attr_split_vcf = runtime_attr_split_vcf,
runtime_attr_concat_beds = runtime_attr_concat_beds
}
Expand Down

0 comments on commit c91ae82

Please sign in to comment.