used constants; implemented non-AS transformation (#7718)
* used constants; implemented non-AS transformation

* per Laura, using zero for ref MQ in fallback case

* added support to force non-AS loading path

* Calculate SUM(AD) and make available

* fixed tests, addressed PR comments

* bump JAR
kcibul authored Mar 24, 2022
1 parent 91cef1c commit 55b9ead
Showing 12 changed files with 586 additions and 215 deletions.
1 change: 1 addition & 0 deletions .dockstore.yml
@@ -124,6 +124,7 @@ workflows:
branches:
- master
- ah_var_store
- kc_vqsr_magic
- name: GvsPrepareCallset
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsPrepareCallset.wdl
269 changes: 269 additions & 0 deletions scripts/variantstore/wdl/GvsBenchmarkExtractTask.wdl
@@ -0,0 +1,269 @@
version 1.0

workflow GvsBenchmarkExtractTask {
input {
String data_project
String default_dataset

File wgs_intervals

File reference
File reference_index
File reference_dict

# For reblocking v1, the default is "SIXTY" instead of "FORTY"
String? drop_state = "FORTY"

# NOTE: this is just the cohort table prefix, not including project or dataset qualifiers
# without a default value, RANGES-mode users would be forced to specify a value even though it is meaningless for them
String extract_table_prefix = ""
String query_project = data_project
String fq_ranges_dataset = "~{data_project}.~{default_dataset}"

Boolean do_not_filter_override = false
String? filter_set_name
Boolean? vqslod_filter_by_site
String fq_filter_set_info_table = "~{data_project}.~{default_dataset}.filter_set_info"
String fq_filter_set_site_table = "~{data_project}.~{default_dataset}.filter_set_sites"
String fq_filter_set_tranches_table = "~{data_project}.~{default_dataset}.filter_set_tranches"

# if these are unset, default sensitivity levels will be used
Float? snps_truth_sensitivity_filter_level_override
Float? indels_truth_sensitivity_filter_level_override

File? excluded_intervals
Boolean? emit_pls = false

Int? extract_cpu_override = 2
String? extract_memory_override = "12 GB"

Int? extract_preemptible_override
Int? extract_maxretries_override
Int? split_intervals_disk_size_override

String mode = "RANGES"

String? service_account_json_path

String output_file_base_name
String? output_gcs_dir
File? gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/kc_vqsr_magic_20220324/gatk-package-4.2.0.0-478-gd0e381c-SNAPSHOT-local.jar"
Int local_disk_for_extract = 150

String fq_samples_to_extract_table = "~{data_project}.~{default_dataset}.~{extract_table_prefix}__SAMPLES"
String fq_cohort_extract_table = "~{data_project}.~{default_dataset}.~{extract_table_prefix}__DATA"
}


call ExtractTask {
input:
gatk_override = gatk_override,
reference = reference,
reference_index = reference_index,
reference_dict = reference_dict,
fq_samples_to_extract_table = fq_samples_to_extract_table,
intervals = wgs_intervals,
fq_cohort_extract_table = fq_cohort_extract_table,
read_project_id = query_project,
do_not_filter_override = do_not_filter_override,
fq_ranges_dataset = fq_ranges_dataset,
fq_filter_set_info_table = fq_filter_set_info_table,
fq_filter_set_site_table = fq_filter_set_site_table,
fq_filter_set_tranches_table = fq_filter_set_tranches_table,
filter_set_name = filter_set_name,
vqslod_filter_by_site = vqslod_filter_by_site,
snps_truth_sensitivity_filter_level = snps_truth_sensitivity_filter_level_override,
indels_truth_sensitivity_filter_level = indels_truth_sensitivity_filter_level_override,
excluded_intervals = excluded_intervals,
emit_pls = emit_pls,
service_account_json_path = service_account_json_path,
drop_state = drop_state,
output_file = "${output_file_base_name}.vcf.gz",
local_disk = local_disk_for_extract,
extract_preemptible_override = extract_preemptible_override,
extract_cpu_override = extract_cpu_override,
extract_memory_override = extract_memory_override,
extract_maxretries_override = extract_maxretries_override
}
}

################################################################################
task ExtractTask {
# indicates that this task should NOT be call cached
meta {
volatile: true
}

input {
# ------------------------------------------------
# Input args:
File reference
File reference_index
File reference_dict

String fq_samples_to_extract_table

File intervals
String? drop_state

String fq_cohort_extract_table
String read_project_id
String output_file

Boolean do_not_filter_override
String fq_ranges_dataset
String fq_filter_set_info_table
String fq_filter_set_site_table
String fq_filter_set_tranches_table
String? filter_set_name
Boolean? vqslod_filter_by_site
Float? snps_truth_sensitivity_filter_level
Float? indels_truth_sensitivity_filter_level

File? excluded_intervals
Boolean? emit_pls

# Runtime Options:
String? service_account_json_path
File? gatk_override
Int? extract_preemptible_override
Int? extract_maxretries_override
String? extract_memory_override
Int? extract_cpu_override

Int? local_sort_max_records_in_ram = 10000000
Int local_disk

}

String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'

# ------------------------------------------------
# Run our command:
command <<<
set -e
export GATK_LOCAL_JAR="~{default="/root/gatk.jar" gatk_override}"

if [ ~{has_service_account_file} = 'true' ]; then
gsutil cp ~{service_account_json_path} local.service_account.json
export GOOGLE_APPLICATION_CREDENTIALS=local.service_account.json
gcloud auth activate-service-account --key-file=local.service_account.json
gcloud config set project ~{read_project_id}
fi

df -h

if [ ~{do_not_filter_override} = 'true' ]; then
FILTERING_ARGS=''
else
FILTERING_ARGS='--filter-set-info-table ~{fq_filter_set_info_table}
--filter-set-site-table ~{fq_filter_set_site_table}
--tranches-table ~{fq_filter_set_tranches_table}
--filter-set-name ~{filter_set_name}
~{true='--vqslod-filter-by-site' false='' vqslod_filter_by_site}
~{"--snps-truth-sensitivity-filter-level " + snps_truth_sensitivity_filter_level}
~{"--indels-truth-sensitivity-filter-level " + indels_truth_sensitivity_filter_level}'
fi

gatk --java-options "-Xmx9g" \
ExtractCohort \
--mode RANGES --vet-ranges-fq-dataset ~{fq_ranges_dataset} \
--ref-version 38 \
-R ~{reference} \
-O ~{output_file} \
--local-sort-max-records-in-ram ~{local_sort_max_records_in_ram} \
--sample-table ~{fq_samples_to_extract_table} \
~{"--inferred-reference-state " + drop_state} \
-L ~{intervals} \
~{"-XL " + excluded_intervals} \
--project-id ~{read_project_id} \
~{true='--emit-pls' false='' emit_pls} \
${FILTERING_ARGS}
>>>

# ------------------------------------------------
# Runtime settings:
runtime {
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_d8a72b825eab2d979c8877448c0ca948fd9b34c7_change_to_hwe"
memory: select_first([extract_memory_override, "12 GB"])
disks: "local-disk ~{local_disk} HDD"
bootDiskSizeGb: 15
preemptible: select_first([extract_preemptible_override, "2"])
maxRetries: select_first([extract_maxretries_override, "3"])
cpu: select_first([extract_cpu_override, "2"])
}

output {
File output_vcf = "~{output_file}"
File output_vcf_index = "~{output_file}.tbi"
}
}

task SumBytes {

input {
Array[Float] file_sizes_bytes
}

command <<<
set -e
echo "~{sep=" " file_sizes_bytes}" | tr " " "\n" | python -c "
import sys;
total_bytes = sum(float(i.strip()) for i in sys.stdin);
total_mb = total_bytes/10**6;
print(total_mb);"
>>>

output {
Float total_mb = read_float(stdout())
}

runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:305.0.0"
memory: "3 GB"
disks: "local-disk 10 HDD"
preemptible: 3
cpu: 1
}
}
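
The SumBytes command can be sanity-checked outside of Cromwell; a standalone run of the same pipeline (byte counts below are made up for illustration) shows the expected MB conversion:

# Same pipeline as the task command above, with made-up sizes:
# 1,500,000 + 2,500,000 bytes should print 4.0 (MB).
echo "1500000 2500000" | tr " " "\n" | python -c "
import sys
total_bytes = sum(float(i.strip()) for i in sys.stdin)
print(total_bytes / 10**6)"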

task CreateManifest {

input {
Array[String] manifest_lines
String? output_gcs_dir
String? service_account_json_path
}

String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'

command <<<
set -e
MANIFEST_LINES_TXT=~{write_lines(manifest_lines)}
echo "vcf_file_location, vcf_file_bytes, vcf_index_location, vcf_index_bytes" >> manifest.txt
sort -n ${MANIFEST_LINES_TXT} | cut -d',' -f 2- >> manifest.txt

# Drop trailing slash if one exists
OUTPUT_GCS_DIR=$(echo ~{output_gcs_dir} | sed 's/\/$//')

if [ -n "$OUTPUT_GCS_DIR" ]; then
if [ ~{has_service_account_file} = 'true' ]; then
gsutil cp ~{service_account_json_path} local.service_account.json
gcloud auth activate-service-account --key-file=local.service_account.json
fi
gsutil cp manifest.txt ${OUTPUT_GCS_DIR}/
fi
>>>

output {
File manifest = "manifest.txt"
}

runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:305.0.0"
memory: "3 GB"
disks: "local-disk 10 HDD"
preemptible: 3
cpu: 1
}
}
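
For anyone trying the new benchmark workflow, a minimal local-run sketch with Cromwell follows; every value below is a placeholder, not something taken from this commit, and only the inputs without defaults are listed:

# Hypothetical local run; project, dataset, bucket, and jar paths are placeholders.
cat > benchmark_inputs.json <<'EOF'
{
  "GvsBenchmarkExtractTask.data_project": "my-gcp-project",
  "GvsBenchmarkExtractTask.default_dataset": "my_dataset",
  "GvsBenchmarkExtractTask.wgs_intervals": "gs://my-bucket/wgs_calling_regions.hg38.interval_list",
  "GvsBenchmarkExtractTask.reference": "gs://my-bucket/Homo_sapiens_assembly38.fasta",
  "GvsBenchmarkExtractTask.reference_index": "gs://my-bucket/Homo_sapiens_assembly38.fasta.fai",
  "GvsBenchmarkExtractTask.reference_dict": "gs://my-bucket/Homo_sapiens_assembly38.dict",
  "GvsBenchmarkExtractTask.output_file_base_name": "benchmark_extract"
}
EOF
java -jar cromwell.jar run scripts/variantstore/wdl/GvsBenchmarkExtractTask.wdl --inputs benchmark_inputs.json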
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -25,7 +25,7 @@ workflow GvsCreateFilterSet {
# (SNPsVariantRecalibratorClassic vs. SNPsVariantRecalibratorCreateModel and SNPsVariantRecalibratorScattered)
Int snps_variant_recalibration_threshold = 20000

File gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/kc_extract_perf_20220111/gatk-package-4.2.0.0-455-g40a40bc-SNAPSHOT-local.jar"
File gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/kc_vqsr_magic_20220324/gatk-package-4.2.0.0-478-gd0e381c-SNAPSHOT-local.jar"

Array[String] snp_recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.8", "99.6", "99.5", "99.4", "99.3", "99.0", "98.0", "97.0", "90.0" ]
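
Since several workflows in this commit bump the override JAR to the same snapshot, a quick existence check before launching can save a failed run (illustrative; requires gsutil):

# Confirm the new snapshot jar referenced above actually exists.
gsutil ls gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/kc_vqsr_magic_20220324/gatk-package-4.2.0.0-478-gd0e381c-SNAPSHOT-local.jar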

5 changes: 4 additions & 1 deletion scripts/variantstore/wdl/GvsImportGenomes.wdl
@@ -12,7 +12,7 @@ workflow GvsImportGenomes {

File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
Int? load_data_preemptible_override
-  File? load_data_gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/kc_ranges_prepare_20220118/gatk-package-4.2.0.0-462-gc0e684c-SNAPSHOT-local.jar"
+  File? load_data_gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/kc_vqsr_magic_20220324/gatk-package-4.2.0.0-478-gd0e381c-SNAPSHOT-local.jar"
String? service_account_json_path
}

@@ -200,6 +200,8 @@ task LoadData {

String? drop_state
Boolean? drop_state_includes_greater_than = false
Boolean force_loading_from_non_allele_specific = false

File? gatk_override
Int? load_data_preemptible_override
String? service_account_json_path
@@ -261,6 +263,7 @@ task LoadData {
-V ${updated_input_vcf} \
-L ~{interval_list} \
~{"-IG " + drop_state} \
--force-loading-from-non-allele-specific ~{force_loading_from_non_allele_specific} \
--ignore-above-gq-threshold ~{drop_state_includes_greater_than} \
--project-id ~{project_id} \
--dataset-name ~{dataset_name} \
@@ -54,6 +54,7 @@ public class SchemaUtils {
public static final String AS_SB_TABLE = AS_FIELD_PREFIX + "SB_TABLE";
public static final String AS_VarDP = AS_FIELD_PREFIX + "VarDP";
public static final String CALL_AD = GENOTYPE_FIELD_PREFIX + "AD";
public static final String SUM_AD = "SUM_AD";
public static final String RAW_AD = "RAW_AD";
public static final String CALL_PGT = GENOTYPE_FIELD_PREFIX + "PGT";
public static final String CALL_PID = GENOTYPE_FIELD_PREFIX + "PID";
@@ -92,7 +93,7 @@ public class SchemaUtils {
public static final List<String> VET_FIELDS = Arrays.asList(SAMPLE_ID_FIELD_NAME, LOCATION_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, AS_RAW_MQ,
AS_RAW_MQRankSum, QUALapprox, AS_QUALapprox, AS_RAW_ReadPosRankSum, AS_SB_TABLE, AS_VarDP, CALL_GT, CALL_AD, CALL_GQ, CALL_PGT, CALL_PID, CALL_PL);
public static final List<String> ALT_ALLELE_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, REF_ALLELE_FIELD_NAME, "allele", ALT_ALLELE_FIELD_NAME, "allele_pos", CALL_GT, AS_RAW_MQ, RAW_MQ, AS_RAW_MQRankSum, "raw_mqranksum_x_10", AS_QUALapprox, "qual", AS_RAW_ReadPosRankSum, "raw_readposranksum_x_10", AS_SB_TABLE, "SB_REF_PLUS","SB_REF_MINUS","SB_ALT_PLUS","SB_ALT_MINUS", CALL_AD, "ref_ad", "ad");
-    public static final List<String> FEATURE_EXTRACT_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, REF_ALLELE_FIELD_NAME, "allele", RAW_QUAL, "ref_ad", AS_MQRankSum, "AS_MQRankSum_ft", AS_ReadPosRankSum, "AS_ReadPosRankSum_ft", RAW_MQ, RAW_AD, "RAW_AD_GT_1", "SB_REF_PLUS","SB_REF_MINUS","SB_ALT_PLUS","SB_ALT_MINUS","num_het_samples","num_homvar_samples","distinct_alleles","hq_genotype_samples", "sum_qualapprox", "num_snp_alleles");
+    public static final List<String> FEATURE_EXTRACT_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, REF_ALLELE_FIELD_NAME, "allele", RAW_QUAL, "ref_ad", AS_MQRankSum, "AS_MQRankSum_ft", AS_ReadPosRankSum, "AS_ReadPosRankSum_ft", RAW_MQ, SUM_AD, RAW_AD, "RAW_AD_GT_1", "SB_REF_PLUS","SB_REF_MINUS","SB_ALT_PLUS","SB_ALT_MINUS","num_het_samples","num_homvar_samples","distinct_alleles","hq_genotype_samples", "sum_qualapprox", "num_snp_alleles");

public static final String LOAD_STATUS_FIELD_NAME = "status";
public static final String LOAD_STATUS_EVENT_TIMESTAMP_NAME = "event_timestamp";
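
The new SUM_AD column implements the "Calculate SUM(AD) and make available" item from the commit message. As a rough sketch only — the table and column names below are assumptions for illustration, not the actual feature-extract SQL — the aggregation has this shape:

# Hypothetical; table and column names are assumed, not from this commit.
bq query --use_legacy_sql=false '
SELECT location, SUM(ad) AS SUM_AD
FROM `my-project.my_dataset.alt_allele`
GROUP BY location'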
@@ -22,6 +22,7 @@ public class ExtractFeaturesRecord implements Locatable {
private final String asReadPosRankSumFreqTable; // nullable
private final Double rawMQ;
private final Double rawAD;
private final Double sumAD;
private final Double rawADGT1;
private final int sbRefPlus;
private final int sbRefMinus;
@@ -79,6 +80,10 @@ public ExtractFeaturesRecord(GenericRecord genericRecord) {
Object refADNullable = genericRecord.get("ref_ad");
this.refAD = ( refADNullable == null ) ? 0 : Double.valueOf(Objects.toString(refADNullable));

// if SUM_AD is not defined, default it to zero
Object sumADNullable = genericRecord.get(SchemaUtils.SUM_AD);
this.sumAD = ( sumADNullable == null ) ? 0 : Double.valueOf(Objects.toString(sumADNullable));

}

@Override
@@ -112,6 +117,8 @@ public ExtractFeaturesRecord(GenericRecord genericRecord) {

public Double getRawAD() { return this.rawAD; }

public Double getSumAD() { return this.sumAD; }

public Double getRawADGT1() { return this.rawADGT1; }

public Double getQualApprox() { return this.qualApprox; }
@@ -149,6 +149,13 @@ public final class CreateVariantIngestFiles extends VariantWalker {
protected String datasetName = null;


@Argument(
fullName = "force-loading-from-non-allele-specific",
doc = "Even if there are allele-specific (AS) annotations, use backwards compatibility mode",
optional = true
)
protected boolean forceLoadingFromNonAlleleSpecific = false;

// getGenotypes() returns list of lists for all samples at variant
// assuming one sample per gvcf, getGenotype(0) retrieves GT for sample at index 0
public static boolean isNoCall(VariantContext variant) {
@@ -212,7 +219,7 @@ public void onTraversalStart() {
}

if (enableVet) {
-    vetCreator = new VetCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, outputDir, outputType, projectID, datasetName);
+    vetCreator = new VetCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, outputDir, outputType, projectID, datasetName, forceLoadingFromNonAlleleSpecific);
}

// check the load status table to see if this sample has already been loaded...
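
A sketch of exercising the new flag by invoking the tool directly (tool name taken from the class above; most required arguments are omitted and all values are placeholders):

# Hypothetical direct invocation; most required arguments omitted.
# The flag forces the backwards-compatible non-allele-specific ingest
# path even when AS annotations are present in the input GVCF.
gatk CreateVariantIngestFiles \
  -V sample.g.vcf.gz \
  --force-loading-from-non-allele-specific true \
  --project-id my-gcp-project \
  --dataset-name my_dataset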