used constants; implemented non-AS transformation (#7718)
* used constants; implemented non-AS transformation

* per Laura, using zero for ref MQ in fallback case

* added support to force non-AS loading path

* Calculate SUM(AD) and make available

* fixed tests, addressed PR comments

* bump JAR
kcibul authored Mar 24, 2022
1 parent 91cef1c commit 55b9ead
Showing 12 changed files with 586 additions and 215 deletions.
1 change: 1 addition & 0 deletions .dockstore.yml
@@ -124,6 +124,7 @@ workflows:
branches:
- master
- ah_var_store
- kc_vqsr_magic
- name: GvsPrepareCallset
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsPrepareCallset.wdl
269 changes: 269 additions & 0 deletions scripts/variantstore/wdl/GvsBenchmarkExtractTask.wdl
@@ -0,0 +1,269 @@
version 1.0

workflow GvsBenchmarkExtractTask {
input {
String data_project
String default_dataset

File wgs_intervals

File reference
File reference_index
File reference_dict

# For reblocking v1, the default is "SIXTY" instead of "FORTY"
String? drop_state = "FORTY"

# NOTE: this is just the cohort table prefix, not including project or dataset qualifiers
# without a default value, RANGES-mode users would be forced to specify a value even though it is meaningless for them
String extract_table_prefix = ""
String query_project = data_project
String fq_ranges_dataset = "~{data_project}.~{default_dataset}"

Boolean do_not_filter_override = false
String? filter_set_name
Boolean? vqslod_filter_by_site
String fq_filter_set_info_table = "~{data_project}.~{default_dataset}.filter_set_info"
String fq_filter_set_site_table = "~{data_project}.~{default_dataset}.filter_set_sites"
String fq_filter_set_tranches_table = "~{data_project}.~{default_dataset}.filter_set_tranches"

# if these are unset, default sensitivity levels will be used
Float? snps_truth_sensitivity_filter_level_override
Float? indels_truth_sensitivity_filter_level_override

File? excluded_intervals
Boolean? emit_pls = false

Int? extract_cpu_override = 2
String? extract_memory_override = "12 GB"

Int? extract_preemptible_override
Int? extract_maxretries_override
Int? split_intervals_disk_size_override

String mode = "RANGES"

String? service_account_json_path

String output_file_base_name
String? output_gcs_dir
File? gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/kc_vqsr_magic_20220324/gatk-package-4.2.0.0-478-gd0e381c-SNAPSHOT-local.jar"
Int local_disk_for_extract = 150

String fq_samples_to_extract_table = "~{data_project}.~{default_dataset}.~{extract_table_prefix}__SAMPLES"
String fq_cohort_extract_table = "~{data_project}.~{default_dataset}.~{extract_table_prefix}__DATA"
}


call ExtractTask {
input:
gatk_override = gatk_override,
reference = reference,
reference_index = reference_index,
reference_dict = reference_dict,
fq_samples_to_extract_table = fq_samples_to_extract_table,
intervals = wgs_intervals,
fq_cohort_extract_table = fq_cohort_extract_table,
read_project_id = query_project,
do_not_filter_override = do_not_filter_override,
fq_ranges_dataset = fq_ranges_dataset,
fq_filter_set_info_table = fq_filter_set_info_table,
fq_filter_set_site_table = fq_filter_set_site_table,
fq_filter_set_tranches_table = fq_filter_set_tranches_table,
filter_set_name = filter_set_name,
vqslod_filter_by_site = vqslod_filter_by_site,
snps_truth_sensitivity_filter_level = snps_truth_sensitivity_filter_level_override,
indels_truth_sensitivity_filter_level = indels_truth_sensitivity_filter_level_override,
excluded_intervals = excluded_intervals,
emit_pls = emit_pls,
service_account_json_path = service_account_json_path,
drop_state = drop_state,
output_file = "${output_file_base_name}.vcf.gz",
local_disk = local_disk_for_extract,
extract_preemptible_override = extract_preemptible_override,
extract_cpu_override = extract_cpu_override,
extract_memory_override = extract_memory_override,
extract_maxretries_override = extract_maxretries_override
}
}

################################################################################
task ExtractTask {
# indicates that this task should NOT be call cached
meta {
volatile: true
}

input {
# ------------------------------------------------
# Input args:
File reference
File reference_index
File reference_dict

String fq_samples_to_extract_table

File intervals
String? drop_state

String fq_cohort_extract_table
String read_project_id
String output_file

Boolean do_not_filter_override
String fq_ranges_dataset
String fq_filter_set_info_table
String fq_filter_set_site_table
String fq_filter_set_tranches_table
String? filter_set_name
Boolean? vqslod_filter_by_site
Float? snps_truth_sensitivity_filter_level
Float? indels_truth_sensitivity_filter_level

File? excluded_intervals
Boolean? emit_pls

# Runtime Options:
String? service_account_json_path
File? gatk_override
Int? extract_preemptible_override
Int? extract_maxretries_override
String? extract_memory_override
Int? extract_cpu_override

Int? local_sort_max_records_in_ram = 10000000
Int local_disk

}

String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'

# ------------------------------------------------
# Run our command:
command <<<
set -e
export GATK_LOCAL_JAR="~{default="/root/gatk.jar" gatk_override}"

if [ ~{has_service_account_file} = 'true' ]; then
gsutil cp ~{service_account_json_path} local.service_account.json
export GOOGLE_APPLICATION_CREDENTIALS=local.service_account.json
gcloud auth activate-service-account --key-file=local.service_account.json
gcloud config set project ~{read_project_id}
fi

df -h

if [ ~{do_not_filter_override} = 'true' ]; then
FILTERING_ARGS=''
else
FILTERING_ARGS='--filter-set-info-table ~{fq_filter_set_info_table}
--filter-set-site-table ~{fq_filter_set_site_table}
--tranches-table ~{fq_filter_set_tranches_table}
--filter-set-name ~{filter_set_name}
~{true='--vqslod-filter-by-site' false='' vqslod_filter_by_site}
~{"--snps-truth-sensitivity-filter-level " + snps_truth_sensitivity_filter_level}
~{"--indels-truth-sensitivity-filter-level " + indels_truth_sensitivity_filter_level}'
fi

gatk --java-options "-Xmx9g" \
ExtractCohort \
--mode RANGES --vet-ranges-fq-dataset ~{fq_ranges_dataset} \
--ref-version 38 \
-R ~{reference} \
-O ~{output_file} \
--local-sort-max-records-in-ram ~{local_sort_max_records_in_ram} \
--sample-table ~{fq_samples_to_extract_table} \
~{"--inferred-reference-state " + drop_state} \
-L ~{intervals} \
~{"-XL " + excluded_intervals} \
--project-id ~{read_project_id} \
~{true='--emit-pls' false='' emit_pls} \
${FILTERING_ARGS}
>>>

# ------------------------------------------------
# Runtime settings:
runtime {
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_d8a72b825eab2d979c8877448c0ca948fd9b34c7_change_to_hwe"
memory: select_first([extract_memory_override, "12 GB"])
disks: "local-disk ~{local_disk} HDD"
bootDiskSizeGb: 15
preemptible: select_first([extract_preemptible_override, "2"])
maxRetries: select_first([extract_maxretries_override, "3"])
cpu: select_first([extract_cpu_override, "2"])
}

output {
File output_vcf = "~{output_file}"
File output_vcf_index = "~{output_file}.tbi"
}
}

task SumBytes {

input {
Array[Float] file_sizes_bytes
}

command <<<
set -e
echo "~{sep=" " file_sizes_bytes}" | tr " " "\n" | python -c "
import sys;
total_bytes = sum(float(i.strip()) for i in sys.stdin);
total_mb = total_bytes/10**6;
print(total_mb);"
>>>

output {
Float total_mb = read_float(stdout())
}

runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:305.0.0"
memory: "3 GB"
disks: "local-disk 10 HDD"
preemptible: 3
cpu: 1
}
}
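
The SumBytes command can be sanity-checked outside of Cromwell; a standalone run of the same pipeline (byte counts below are made up for illustration) shows the expected MB conversion:

# Same pipeline as the task command above, with made-up sizes:
# 1,500,000 + 2,500,000 bytes should print 4.0 (MB).
echo "1500000 2500000" | tr " " "\n" | python -c "
import sys
total_bytes = sum(float(i.strip()) for i in sys.stdin)
print(total_bytes / 10**6)"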

task CreateManifest {

input {
Array[String] manifest_lines
String? output_gcs_dir
String? service_account_json_path
}

String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'

command <<<
set -e
MANIFEST_LINES_TXT=~{write_lines(manifest_lines)}
echo "vcf_file_location, vcf_file_bytes, vcf_index_location, vcf_index_bytes" >> manifest.txt
sort -n ${MANIFEST_LINES_TXT} | cut -d',' -f 2- >> manifest.txt

# Drop trailing slash if one exists
OUTPUT_GCS_DIR=$(echo ~{output_gcs_dir} | sed 's/\/$//')

if [ -n "$OUTPUT_GCS_DIR" ]; then
if [ ~{has_service_account_file} = 'true' ]; then
gsutil cp ~{service_account_json_path} local.service_account.json
gcloud auth activate-service-account --key-file=local.service_account.json
fi
gsutil cp manifest.txt ${OUTPUT_GCS_DIR}/
fi
>>>

output {
File manifest = "manifest.txt"
}

runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:305.0.0"
memory: "3 GB"
disks: "local-disk 10 HDD"
preemptible: 3
cpu: 1
}
}
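
For anyone trying the new benchmark workflow, a minimal local-run sketch with Cromwell follows; every value below is a placeholder, not something taken from this commit, and only the inputs without defaults are listed:

# Hypothetical local run; project, dataset, bucket, and jar paths are placeholders.
cat > benchmark_inputs.json <<'EOF'
{
  "GvsBenchmarkExtractTask.data_project": "my-gcp-project",
  "GvsBenchmarkExtractTask.default_dataset": "my_dataset",
  "GvsBenchmarkExtractTask.wgs_intervals": "gs://my-bucket/wgs_calling_regions.hg38.interval_list",
  "GvsBenchmarkExtractTask.reference": "gs://my-bucket/Homo_sapiens_assembly38.fasta",
  "GvsBenchmarkExtractTask.reference_index": "gs://my-bucket/Homo_sapiens_assembly38.fasta.fai",
  "GvsBenchmarkExtractTask.reference_dict": "gs://my-bucket/Homo_sapiens_assembly38.dict",
  "GvsBenchmarkExtractTask.output_file_base_name": "benchmark_extract"
}
EOF
java -jar cromwell.jar run scripts/variantstore/wdl/GvsBenchmarkExtractTask.wdl --inputs benchmark_inputs.json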
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -25,7 +25,7 @@ workflow GvsCreateFilterSet {
# (SNPsVariantRecalibratorClassic vs. SNPsVariantRecalibratorCreateModel and SNPsVariantRecalibratorScattered)
Int snps_variant_recalibration_threshold = 20000

File gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/kc_extract_perf_20220111/gatk-package-4.2.0.0-455-g40a40bc-SNAPSHOT-local.jar"
File gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/kc_vqsr_magic_20220324/gatk-package-4.2.0.0-478-gd0e381c-SNAPSHOT-local.jar"

Array[String] snp_recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.8", "99.6", "99.5", "99.4", "99.3", "99.0", "98.0", "97.0", "90.0" ]
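
Since several workflows in this commit bump the override JAR to the same snapshot, a quick existence check before launching can save a failed run (illustrative; requires gsutil):

# Confirm the new snapshot jar referenced above actually exists.
gsutil ls gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/kc_vqsr_magic_20220324/gatk-package-4.2.0.0-478-gd0e381c-SNAPSHOT-local.jar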

5 changes: 4 additions & 1 deletion scripts/variantstore/wdl/GvsImportGenomes.wdl
@@ -12,7 +12,7 @@ workflow GvsImportGenomes {

File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
Int? load_data_preemptible_override
-  File? load_data_gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/kc_ranges_prepare_20220118/gatk-package-4.2.0.0-462-gc0e684c-SNAPSHOT-local.jar"
+  File? load_data_gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/kc_vqsr_magic_20220324/gatk-package-4.2.0.0-478-gd0e381c-SNAPSHOT-local.jar"
String? service_account_json_path
}

@@ -200,6 +200,8 @@ task LoadData {

String? drop_state
Boolean? drop_state_includes_greater_than = false
Boolean force_loading_from_non_allele_specific = false

File? gatk_override
Int? load_data_preemptible_override
String? service_account_json_path
@@ -261,6 +263,7 @@ task LoadData {
-V ${updated_input_vcf} \
-L ~{interval_list} \
~{"-IG " + drop_state} \
--force-loading-from-non-allele-specific ~{force_loading_from_non_allele_specific} \
--ignore-above-gq-threshold ~{drop_state_includes_greater_than} \
--project-id ~{project_id} \
--dataset-name ~{dataset_name} \
@@ -54,6 +54,7 @@ public class SchemaUtils {
public static final String AS_SB_TABLE = AS_FIELD_PREFIX + "SB_TABLE";
public static final String AS_VarDP = AS_FIELD_PREFIX + "VarDP";
public static final String CALL_AD = GENOTYPE_FIELD_PREFIX + "AD";
public static final String SUM_AD = "SUM_AD";
public static final String RAW_AD = "RAW_AD";
public static final String CALL_PGT = GENOTYPE_FIELD_PREFIX + "PGT";
public static final String CALL_PID = GENOTYPE_FIELD_PREFIX + "PID";
@@ -92,7 +93,7 @@ public class SchemaUtils {
public static final List<String> VET_FIELDS = Arrays.asList(SAMPLE_ID_FIELD_NAME, LOCATION_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, AS_RAW_MQ,
AS_RAW_MQRankSum, QUALapprox, AS_QUALapprox, AS_RAW_ReadPosRankSum, AS_SB_TABLE, AS_VarDP, CALL_GT, CALL_AD, CALL_GQ, CALL_PGT, CALL_PID, CALL_PL);
public static final List<String> ALT_ALLELE_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, REF_ALLELE_FIELD_NAME, "allele", ALT_ALLELE_FIELD_NAME, "allele_pos", CALL_GT, AS_RAW_MQ, RAW_MQ, AS_RAW_MQRankSum, "raw_mqranksum_x_10", AS_QUALapprox, "qual", AS_RAW_ReadPosRankSum, "raw_readposranksum_x_10", AS_SB_TABLE, "SB_REF_PLUS","SB_REF_MINUS","SB_ALT_PLUS","SB_ALT_MINUS", CALL_AD, "ref_ad", "ad");
-    public static final List<String> FEATURE_EXTRACT_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, REF_ALLELE_FIELD_NAME, "allele", RAW_QUAL, "ref_ad", AS_MQRankSum, "AS_MQRankSum_ft", AS_ReadPosRankSum, "AS_ReadPosRankSum_ft", RAW_MQ, RAW_AD, "RAW_AD_GT_1", "SB_REF_PLUS","SB_REF_MINUS","SB_ALT_PLUS","SB_ALT_MINUS","num_het_samples","num_homvar_samples","distinct_alleles","hq_genotype_samples", "sum_qualapprox", "num_snp_alleles");
+    public static final List<String> FEATURE_EXTRACT_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, REF_ALLELE_FIELD_NAME, "allele", RAW_QUAL, "ref_ad", AS_MQRankSum, "AS_MQRankSum_ft", AS_ReadPosRankSum, "AS_ReadPosRankSum_ft", RAW_MQ, SUM_AD, RAW_AD, "RAW_AD_GT_1", "SB_REF_PLUS","SB_REF_MINUS","SB_ALT_PLUS","SB_ALT_MINUS","num_het_samples","num_homvar_samples","distinct_alleles","hq_genotype_samples", "sum_qualapprox", "num_snp_alleles");

public static final String LOAD_STATUS_FIELD_NAME = "status";
public static final String LOAD_STATUS_EVENT_TIMESTAMP_NAME = "event_timestamp";
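
The new SUM_AD column implements the "Calculate SUM(AD) and make available" item from the commit message. As a rough sketch only — the table and column names below are assumptions for illustration, not the actual feature-extract SQL — the aggregation has this shape:

# Hypothetical; table and column names are assumed, not from this commit.
bq query --use_legacy_sql=false '
SELECT location, SUM(ad) AS SUM_AD
FROM `my-project.my_dataset.alt_allele`
GROUP BY location'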
@@ -22,6 +22,7 @@ public class ExtractFeaturesRecord implements Locatable {
private final String asReadPosRankSumFreqTable; // nullable
private final Double rawMQ;
private final Double rawAD;
private final Double sumAD;
private final Double rawADGT1;
private final int sbRefPlus;
private final int sbRefMinus;
@@ -79,6 +80,10 @@ public ExtractFeaturesRecord(GenericRecord genericRecord) {
Object refADNullable = genericRecord.get("ref_ad");
this.refAD = ( refADNullable == null ) ? 0 : Double.valueOf(Objects.toString(refADNullable));

// if SUM_AD is not defined, default it to zero
Object sumADNullable = genericRecord.get(SchemaUtils.SUM_AD);
this.sumAD = ( sumADNullable == null ) ? 0 : Double.valueOf(Objects.toString(sumADNullable));

}

@Override
@@ -112,6 +117,8 @@ public ExtractFeaturesRecord(GenericRecord genericRecord) {

public Double getRawAD() { return this.rawAD; }

public Double getSumAD() { return this.sumAD; }

public Double getRawADGT1() { return this.rawADGT1; }

public Double getQualApprox() { return this.qualApprox; }
@@ -149,6 +149,13 @@ public final class CreateVariantIngestFiles extends VariantWalker {
protected String datasetName = null;


@Argument(
fullName = "force-loading-from-non-allele-specific",
doc = "Even if there are allele-specific (AS) annotations, use backwards compatibility mode",
optional = true
)
protected boolean forceLoadingFromNonAlleleSpecific = false;

// getGenotypes() returns list of lists for all samples at variant
// assuming one sample per gvcf, getGenotype(0) retrieves GT for sample at index 0
public static boolean isNoCall(VariantContext variant) {
@@ -212,7 +219,7 @@ public void onTraversalStart() {
}

if (enableVet) {
-    vetCreator = new VetCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, outputDir, outputType, projectID, datasetName);
+    vetCreator = new VetCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, outputDir, outputType, projectID, datasetName, forceLoadingFromNonAlleleSpecific);
}

// check the load status table to see if this sample has already been loaded...
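
A sketch of exercising the new flag by invoking the tool directly (tool name taken from the class above; most required arguments are omitted and all values are placeholders):

# Hypothetical direct invocation; most required arguments omitted.
# The flag forces the backwards-compatible non-allele-specific ingest
# path even when AS annotations are present in the input GVCF.
gatk CreateVariantIngestFiles \
  -V sample.g.vcf.gz \
  --force-loading-from-non-allele-specific true \
  --project-id my-gcp-project \
  --dataset-name my_dataset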