Skip to content

Commit

Permalink
Add VCF input option to Vapor WDL (#771)
Browse files Browse the repository at this point in the history
  • Loading branch information
mwalker174 authored Feb 4, 2025
1 parent d9a905e commit c91ae82
Show file tree
Hide file tree
Showing 6 changed files with 125 additions and 20 deletions.
9 changes: 9 additions & 0 deletions .github/.dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,15 @@ workflows:
tags:
- /.*/

- subclass: WDL
name: Vapor
primaryDescriptorPath: /wdl/Vapor.wdl
filters:
branches:
- main
tags:
- /.*/

- subclass: WDL
name: VisualizeCnvs
primaryDescriptorPath: /wdl/VisualizeCnvs.wdl
Expand Down
1 change: 0 additions & 1 deletion inputs/templates/test/Vapor/Vapor.json.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
"Vapor.ref_dict": {{ reference_resources.reference_dict | tojson }},
"Vapor.save_plots": "false",

"Vapor.prefix": {{ test_batch.example_pacbio_sample_id | tojson }},
"Vapor.sample_id": {{ test_batch.example_pacbio_sample_id | tojson }},
"Vapor.bam_or_cram_file": {{ test_batch.example_pacbio_cram | tojson }},
"Vapor.bam_or_cram_index": {{ test_batch.example_pacbio_cram_index | tojson }},
Expand Down
6 changes: 3 additions & 3 deletions src/sv-pipeline/scripts/preprocess_bed_for_vapor.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,13 @@ def handle_header(line, columns, fields, default_num_columns, sample_to_extract)
for i, name in enumerate(fields[default_num_columns:]):
columns[name] = default_num_columns + i
if "SVLEN" not in columns:
logging.warning("SVLEN column not found. Will not be able to add SVLEN info to INS events")
raise ValueError("SVLEN column not found in header")
else:
logging.warning("Header not found. Will not be able to add SVLEN info to INS events")
raise ValueError("Header not found. Header must exist and start with #")
if len(fields) >= default_num_columns:
columns['samples'] = default_num_columns # if no header but extra fields, assume samples is next column
if sample_to_extract is not None and "samples" not in columns:
logging.warning("Sample to extract provided but no samples column found")
raise ValueError("Sample to extract provided but no samples column found")


def reformat(bed_in, bed_out, contig, sample_to_extract):
Expand Down
66 changes: 66 additions & 0 deletions wdl/Utils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -783,6 +783,72 @@ task SubsetVcfBySamplesList {
}
}
# Subset a VCF to a specific sample.
# Uses `bcftools view -s` to keep (default) or remove a single sample, and can
# optionally pipe through a second bcftools pass that drops sites left with no
# non-ref genotypes among the remaining samples (CNV records are always kept,
# since their evidence is not genotype-based in the same way).
# Emits a bgzipped VCF plus its tabix (.tbi) index.
task SubsetVcfToSample {
input {
File vcf # Input VCF; expected to be bgzipped (output name is derived by stripping ".vcf.gz")
File? vcf_idx # Optional index for the input VCF; localized alongside it but not referenced directly in the command
String sample # Sample ID to keep (or remove, depending on remove_sample)
String? outfile_name # Optional output filename; defaults to "<input basename>.subset.vcf.gz"
Boolean remove_sample = false # If false (default), keep the sample. If true, remove it.
Boolean remove_private_sites = true # If true (default), remove sites that are private to excluded samples. If false, keep sites even if no remaining samples are non-ref.
Boolean keep_af = true # If true (default), do not recalculate allele frequencies (AC/AF/AN)
String sv_base_mini_docker # Docker image providing bcftools and tabix
RuntimeAttr? runtime_attr_override
}
String vcf_subset_filename = select_first([outfile_name, basename(vcf, ".vcf.gz") + ".subset.vcf.gz"])
String vcf_subset_idx_filename = vcf_subset_filename + ".tbi"
# NOTE: not a plain flag — this interpolates an entire shell pipe stage
# (" | bcftools view -e ...") after the input VCF, so the trailing -O/-o
# options in the command below apply to the LAST bcftools in the pipeline.
String remove_private_sites_flag = if remove_private_sites then " | bcftools view -e 'SVTYPE!=\"CNV\" && COUNT(GT=\"alt\")==0' " else ""
String keep_af_flag = if keep_af then "--no-update" else ""
# bcftools interprets a leading "^" on -s as complement, i.e. exclude the listed sample.
String complement_flag = if remove_sample then "^" else ""
# Disk must be scaled proportionally to the size of the VCF
Float input_size = size(vcf, "GiB")
RuntimeAttr default_attr = object {
mem_gb: 3.75,
disk_gb: ceil(10.0 + (input_size * 2)),
cpu_cores: 1,
preemptible_tries: 3,
max_retries: 1,
boot_disk_gb: 10
}
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
command <<<
set -euo pipefail
bcftools view \
-s ~{complement_flag}~{sample} \
--force-samples \
~{keep_af_flag} \
~{vcf} \
~{remove_private_sites_flag} \
-O z \
-o ~{vcf_subset_filename}
tabix -f -p vcf ~{vcf_subset_filename}
>>>
output {
File vcf_subset = vcf_subset_filename
File vcf_subset_index = vcf_subset_idx_filename
}

runtime {
cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
docker: sv_base_mini_docker
preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
}
}
task VcfToBed {

input {
Expand Down
54 changes: 43 additions & 11 deletions wdl/Vapor.wdl
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
version 1.0

import "Utils.wdl" as utils
import "Structs.wdl"

workflow Vapor {
input {
String prefix
String sample_id
File bam_or_cram_file
File bam_or_cram_index
File bed_file
String sample_id

# One of the following must be specified. May be single- or multi-sample.
File? bed_file
File? vcf_file

Boolean save_plots # Control whether plots are final output
Expand All @@ -21,31 +24,53 @@ workflow Vapor {
String sv_base_mini_docker
String sv_pipeline_docker

RuntimeAttr? runtime_attr_subset_sample
RuntimeAttr? runtime_attr_vcf_to_bed
RuntimeAttr? runtime_attr_vapor
RuntimeAttr? runtime_attr_bcf2vcf
RuntimeAttr? runtime_attr_vcf2bed
RuntimeAttr? runtime_attr_split_vcf
RuntimeAttr? runtime_attr_concat_beds
RuntimeAttr? runtime_attr_LocalizeCram
File? NONE_FILE_ # Create a null file - do not use this input
}
# Convert vcf to bed if provided
if (defined(vcf_file) && !defined(bed_file)) {

call utils.SubsetVcfToSample {
input:
vcf=select_first([vcf_file]),
vcf_idx=select_first([vcf_file]) + ".tbi",
sample=sample_id,
outfile_name=sample_id,
sv_base_mini_docker=sv_base_mini_docker,
runtime_attr_override = runtime_attr_subset_sample
}
call utils.VcfToBed {
input:
vcf_file = SubsetVcfToSample.vcf_subset,
args = "-i SVLEN",
variant_interpretation_docker = sv_pipeline_docker,
runtime_attr_override = runtime_attr_vcf_to_bed
}
}
scatter (contig in read_lines(contigs)) {

call PreprocessBedForVapor {
input:
prefix = "~{prefix}.~{contig}.preprocess",
prefix = "~{sample_id}.~{contig}.preprocess",
contig = contig,
sample_to_extract = sample_id,
bed_file = bed_file,
bed_file = select_first([bed_file, VcfToBed.bed_output]),
sv_pipeline_docker = sv_pipeline_docker,
runtime_attr_override = runtime_attr_split_vcf
}
call RunVaporWithCram {
input:
prefix = "~{prefix}.~{contig}",
prefix = "~{sample_id}.~{contig}",
contig = contig,
bam_or_cram_file = bam_or_cram_file,
bam_or_cram_index = bam_or_cram_index,
Expand All @@ -62,7 +87,7 @@ workflow Vapor {
input:
shard_bed_files = RunVaporWithCram.vapor,
shard_plots = RunVaporWithCram.vapor_plot,
prefix = prefix,
prefix = sample_id,
sv_base_mini_docker = sv_base_mini_docker,
runtime_attr_override = runtime_attr_concat_beds
}
Expand Down Expand Up @@ -101,9 +126,16 @@ task PreprocessBedForVapor {

command <<<
set -euo pipefail

if [[ ~{bed_file} == *.gz ]]; then
gunzip -c ~{bed_file} > in.bed
else
cp ~{bed_file} in.bed
fi

python /opt/sv-pipeline/scripts/preprocess_bed_for_vapor.py \
--contig ~{contig} \
--bed-in ~{bed_file} \
--bed-in in.bed \
--bed-out ~{prefix}.bed \
~{"-s " + sample_to_extract}
>>>
Expand Down
9 changes: 4 additions & 5 deletions wdl/VaporBatch.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,16 @@ workflow VaporBatch {

Boolean save_plots

RuntimeAttr? runtime_attr_subset_sample
RuntimeAttr? runtime_attr_vcf_to_bed
RuntimeAttr? runtime_attr_vapor
RuntimeAttr? runtime_attr_bcf2vcf
RuntimeAttr? runtime_attr_vcf2bed
RuntimeAttr? runtime_attr_split_vcf
RuntimeAttr? runtime_attr_concat_beds
}
scatter (i in range(length(bam_or_cram_files))) {
call vapor_bed.Vapor {
input:
prefix = samples[i],
bam_or_cram_file = bam_or_cram_files[i],
bam_or_cram_index = bam_or_cram_indexes[i],
bed_file = bed_file,
Expand All @@ -45,8 +44,8 @@ workflow VaporBatch {
sv_base_mini_docker = sv_base_mini_docker,
sv_pipeline_docker = sv_pipeline_docker,
runtime_attr_vapor = runtime_attr_vapor,
runtime_attr_bcf2vcf = runtime_attr_bcf2vcf,
runtime_attr_vcf2bed = runtime_attr_vcf2bed,
runtime_attr_subset_sample = runtime_attr_subset_sample,
runtime_attr_vcf_to_bed = runtime_attr_vcf_to_bed,
runtime_attr_split_vcf = runtime_attr_split_vcf,
runtime_attr_concat_beds = runtime_attr_concat_beds
}
Expand Down

0 comments on commit c91ae82

Please sign in to comment.