Add task for VAT validation #4 (#7363)
rsasch authored Jul 23, 2021
1 parent d336359 commit 8d440b3
Showing 2 changed files with 103 additions and 34 deletions.
1 change: 1 addition & 0 deletions .dockstore.yml
@@ -106,6 +106,7 @@ workflows:
     filters:
       branches:
         - ah_var_store
+        - rsa_add_vat_val_4
   - name: MitochondriaPipeline
     subclass: WDL
     primaryDescriptorPath: /scripts/mitochondria_m2_wdl/MitochondriaPipeline.wdl
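(Dockstore reads these branch filters to decide which branches of the repository are published as versions of each workflow, so adding the rsa_add_vat_val_4 feature branch presumably makes the in-progress workflow runnable from Dockstore before it merges to ah_var_store.)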
136 changes: 102 additions & 34 deletions scripts/variantstore/wdl/GvsValidateVAT.wdl
@@ -35,11 +35,69 @@ workflow GvsValidateVatTable {
         last_modified_timestamp = GetBQTableLastModifiedDatetime.last_modified_timestamp
     }

-    # once there is more than one check, they will be gathered into this workflow output, in the format
-    # [{ValidationRule1: "PASS/FAIL Extra info from this test"},
-    # {ValidationRule2: "PASS/FAIL Extra from this test"}]
+    call SchemaOnlyOneRowPerNullTranscript {
+        input:
+            query_project_id = query_project_id,
+            fq_vat_table = fq_vat_table,
+            service_account_json_path = service_account_json_path,
+            last_modified_timestamp = GetBQTableLastModifiedDatetime.last_modified_timestamp
+    }

     output {
-        Array[Map[String, String]] validation_results = [EnsureVatTableHasVariants.result, SpotCheckForExpectedTranscripts.result]
+        Array[Map[String, String]] validation_results = [EnsureVatTableHasVariants.result, SpotCheckForExpectedTranscripts.result, SchemaOnlyOneRowPerNullTranscript.result]
     }
 }
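For reference, each validation task returns a one-entry map, so the gathered workflow output takes roughly the shape sketched below (hypothetical values, following the format documented in the comment that this commit removes):

    [{"EnsureVatTableHasVariants": "PASS: ..."},
     {"SpotCheckForExpectedTranscripts": "PASS: ..."},
     {"SchemaOnlyOneRowPerNullTranscript": "PASS: The VAT table my-project.my_dataset.vat_table only has 1 row per vid with a null transcript"}]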


+task GetBQTableLastModifiedDatetime {
+    # because this is being used to determine if the data has changed, never use call cache
+    meta {
+        volatile: true
+    }
+
+    input {
+        String query_project
+        String fq_table
+        String? service_account_json_path
+    }
+
+    String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
+
+    # ------------------------------------------------
+    # try to get the last modified date for the table in question; fail if something comes back from BigQuery
+    # that isn't in the right format (e.g. an error)
+    command <<<
+        set -e
+
+        if [ ~{has_service_account_file} = 'true' ]; then
+            gsutil cp ~{service_account_json_path} local.service_account.json
+            gcloud auth activate-service-account --key-file=local.service_account.json
+            gcloud config set project ~{query_project}
+        fi
+
+        echo "project_id = ~{query_project}" > ~/.bigqueryrc
+
+        # bq needs the project name to be separated from the dataset by a colon
+        DATASET_TABLE_COLON=$(echo ~{fq_table} | sed 's/\./:/')
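+        # e.g. (hypothetical name): my-project.my_dataset.vat_table becomes my-project:my_dataset.vat_table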

+        LASTMODIFIED=$(bq --location=US --project_id=~{query_project} --format=json show ${DATASET_TABLE_COLON} | python3 -c "import sys, json; print(json.load(sys.stdin)['lastModifiedTime']);")
+        if [[ $LASTMODIFIED =~ ^[0-9]+$ ]]; then
+            echo $LASTMODIFIED
+        else
+            exit 1
+        fi
+    >>>
+
+    output {
+        String last_modified_timestamp = read_string(stdout())
+    }
+
+    runtime {
+        docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:305.0.0"
+        memory: "3 GB"
+        disks: "local-disk 10 HDD"
+        preemptible: 3
+        cpu: 1
+    }
+}
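For context, the core of this task can be reproduced from any shell with the Cloud SDK installed; the table and project names below are hypothetical stand-ins:

    # fetch the table metadata as JSON and extract the last-modified epoch timestamp (milliseconds)
    bq --location=US --project_id=my-project --format=json show my-project:my_dataset.vat_table \
        | python3 -c "import sys, json; print(json.load(sys.stdin)['lastModifiedTime'])"
    # prints something like: 1626992029000

Since the task is marked volatile and the validation tasks all take last_modified_timestamp as an input, any write to the table changes this value, which appears intended to invalidate the downstream call-cache entries as well.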

@@ -54,6 +112,7 @@ task EnsureVatTableHasVariants {
     String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'

     command <<<
+        set -e
         if [ ~{has_service_account_file} = 'true' ]; then
             gsutil cp ~{service_account_json_path} local.service_account.json
             gcloud auth activate-service-account --key-file=local.service_account.json
@@ -104,6 +163,8 @@ task SpotCheckForExpectedTranscripts {
     String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'

     command <<<
+        set -e
+
         if [ ~{has_service_account_file} = 'true' ]; then
             gsutil cp ~{service_account_json_path} local.service_account.json
             gcloud auth activate-service-account --key-file=local.service_account.json
@@ -135,10 +196,11 @@ task SpotCheckForExpectedTranscripts {
         if [[ $NUMRESULTS = "0" ]]; then
             echo "PASS: The VAT table ~{fq_vat_table} only has the expected transcripts at the tested location ('IGFLR1' and 'AD000671.2' in chromosome 19, between positions 35,740,407 - 35,740,469)." > validation_results.txt
         else
-            echo "FAIL: The VAT table ~{fq_vat_table} had unexpected transcripts at the tested location: [csv output follows] " > validation_results.txt
+            echo "FAIL: The VAT table ~{fq_vat_table} had unexpected transcripts at the tested location: [csv output follows] " > validation_results.txt
             cat bq_query_output.csv >> validation_results.txt
         fi
     >>>
+
     # ------------------------------------------------
     # Runtime settings:
     runtime {
@@ -148,61 +210,67 @@ task SpotCheckForExpectedTranscripts {
cpu: "1"
disks: "local-disk 100 HDD"
}
# ------------------------------------------------
# Output: {"Name of validation rule": "PASS/FAIL plus additional validation results"}

output {
Map[String, String] result = {"SpotCheckForExpectedTranscripts": read_string('validation_results.txt')}
}
}
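The spot-check query itself sits in the collapsed portion of this task; the visible PASS/FAIL handling implies a probe of roughly the following shape. This is a sketch only: the column names (contig, position, gene_symbol) and table name are assumptions, not taken from this diff.

    # sketch: list rows at the probed locus whose gene symbol is unexpected (assumed schema)
    bq query --nouse_legacy_sql --project_id=my-project --format=csv 'SELECT
        contig, position, gene_symbol
    FROM
        `my-project.my_dataset.vat_table`
    WHERE
        contig = "chr19"
        AND position BETWEEN 35740407 AND 35740469
        AND gene_symbol NOT IN ("IGFLR1", "AD000671.2")' > bq_query_output.csv

An empty bq_query_output.csv then takes the PASS branch shown above.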

-task GetBQTableLastModifiedDatetime {
-    # because this is being used to determine if the data has changed, never use call cache
-    meta {
-        volatile: true
-    }
-
+task SchemaOnlyOneRowPerNullTranscript {
     input {
-        String query_project
-        String fq_table
+        String query_project_id
+        String fq_vat_table
         String? service_account_json_path
+        String last_modified_timestamp
     }

     String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'

-    # ------------------------------------------------
-    # try to get the last modified date for the table in question; fail if something comes back from BigQuery
-    # that isn't in the right format (e.g. an error)
     command <<<
+        set -e

         if [ ~{has_service_account_file} = 'true' ]; then
             gsutil cp ~{service_account_json_path} local.service_account.json
             gcloud auth activate-service-account --key-file=local.service_account.json
-            gcloud config set project ~{query_project}
+            gcloud config set project ~{query_project_id}
         fi
+        echo "project_id = ~{query_project_id}" > ~/.bigqueryrc

-        echo "project_id = ~{query_project}" > ~/.bigqueryrc
+        bq query --nouse_legacy_sql --project_id=~{query_project_id} --format=csv 'SELECT
+            vid,
+            COUNT(vid) AS num_rows
+        FROM
+            ~{fq_vat_table}
+        WHERE
+            transcript_source is NULL AND
+            transcript is NULL
+        GROUP BY vid
+        HAVING num_rows > 1' > bq_variant_count.csv

-        # bq needs the project name to be separate by a colon
-        DATASET_TABLE_COLON=$(echo ~{fq_table} | sed 's/\./:/')
+        # get number of lines in bq query output
+        NUMRESULTS=$(awk 'END{print NR}' bq_variant_count.csv)

-        LASTMODIFIED=$(bq --location=US --project_id=~{query_project} --format=json show ${DATASET_TABLE_COLON} | python3 -c "import sys, json; print(json.load(sys.stdin)['lastModifiedTime']);")
-        if [[ $LASTMODIFIED =~ ^[0-9]+$ ]]; then
-            echo $LASTMODIFIED
+        # if the result of the query has any rows, that means there were vids with null transcripts and multiple
+        # rows in the VAT, which should not be the case
+        if [[ $NUMRESULTS = "0" ]]; then
+            echo "PASS: The VAT table ~{fq_vat_table} only has 1 row per vid with a null transcript" > validation_results.txt
         else
-            exit 1
+            echo "FAIL: The VAT table ~{fq_vat_table} had at least one vid with a null transcript and more than one row: [csv output follows] " > validation_results.txt
+            cat bq_variant_count.csv >> validation_results.txt
         fi
     >>>

-    output {
-        String last_modified_timestamp = read_string(stdout())
-    }
-
+    # ------------------------------------------------
+    # Runtime settings:
     runtime {
         docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:305.0.0"
-        memory: "3 GB"
-        disks: "local-disk 10 HDD"
+        memory: "1 GB"
         preemptible: 3
-        cpu: 1
+        cpu: "1"
+        disks: "local-disk 100 HDD"
     }
+    # ------------------------------------------------
+    # Output: {"Name of validation rule": "PASS/FAIL plus additional validation results"}
+    output {
+        Map[String, String] result = {"SchemaOnlyOneRowPerNullTranscript": read_string('validation_results.txt')}
+    }
 }
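As a worked example of the HAVING filter: if vid 100 had two rows with NULL transcript and vid 200 had one, the query would return only vid 100; bq_variant_count.csv would then hold a header line plus one data row, NUMRESULTS would be 2, and the task would write the FAIL line. The same probe can be run by hand (hypothetical table and project names):

    # list vids that violate the one-row-per-null-transcript rule
    bq query --nouse_legacy_sql --project_id=my-project --format=csv 'SELECT
        vid,
        COUNT(vid) AS num_rows
    FROM
        `my-project.my_dataset.vat_table`
    WHERE
        transcript_source IS NULL AND
        transcript IS NULL
    GROUP BY vid
    HAVING num_rows > 1'
    # empty output means the table passes; any returned rows are the offending vids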
