From ea0057f803dfa4cccec3c354026bd44dd6bbc1b4 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Mon, 13 Jun 2022 17:12:55 -0400 Subject: [PATCH 01/15] Calculate size / cost of Core Storage Model [VS-473] --- .dockstore.yml | 1 + scripts/variantstore/wdl/GvsCallsetCost.wdl | 104 ++++++++++++++++++-- 2 files changed, 95 insertions(+), 10 deletions(-) diff --git a/.dockstore.yml b/.dockstore.yml index c33ca5575e2..8d8f82420af 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -218,6 +218,7 @@ workflows: - master - ah_var_store - vs_472_workflow_compute_costs + - vs_473_core_storage_model_cost - name: MitochondriaPipeline subclass: WDL primaryDescriptorPath: /scripts/mitochondria_m2_wdl/MitochondriaPipeline.wdl diff --git a/scripts/variantstore/wdl/GvsCallsetCost.wdl b/scripts/variantstore/wdl/GvsCallsetCost.wdl index 5a6d964bdb9..7d3b64b81f7 100644 --- a/scripts/variantstore/wdl/GvsCallsetCost.wdl +++ b/scripts/variantstore/wdl/GvsCallsetCost.wdl @@ -2,19 +2,25 @@ version 1.0 workflow GvsCallsetCost { input { -# String project_id -# String dataset_name - String workspace_namespace - String workspace_name + String project_name + String dataset_name +# String workspace_namespace +# String workspace_name # String callset_name Array[String] excluded_submission_ids = [] } - call WorkflowComputeCosts { + # call WorkflowComputeCosts { + # input: + # workspace_namespace = workspace_namespace, + # workspace_name = workspace_name, + # excluded_submission_ids = excluded_submission_ids + # } + + call CoreStorageModelSizes { input: - workspace_namespace = workspace_namespace, - workspace_name = workspace_name, - excluded_submission_ids = excluded_submission_ids + project_name = project_name, + dataset_name = dataset_name } # call BigQueryWriteAPICost { @@ -38,8 +44,11 @@ workflow GvsCallsetCost { # } output { - File workflow_compute_costs = WorkflowComputeCosts.costs - File workflow_compute_costs_log = WorkflowComputeCosts.log + # File workflow_compute_costs = WorkflowComputeCosts.costs + # File workflow_compute_costs_log = WorkflowComputeCosts.log + String vet_gib = CoreStorageModelSizes.vet_gib + String ref_ranges_gib = CoreStorageModelSizes.ref_ranges_gib + String alt_allele_gib = CoreStorageModelSizes.alt_allele_gib } } @@ -75,6 +84,81 @@ task WorkflowComputeCosts { } } +task CoreStorageModelSizes { + input { + String project_name + String dataset_name + } + meta { + description: "Read sizes of vet_%, ref_ranges_%, and alt_allele tables from `INFORMATION_SCHEMA.PARTITIONS`." + # Definitely don't cache this, the values will change while the inputs to this task will not! + volatile: true + } + command <<< + get_billable_bytes_in_gib() { + local table_pattern="$1" + local output_file_name="$2" + + bq query --location=US --project_id='~{project_name}' --format=csv --use_legacy_sql=false \ + "SELECT round(sum(total_billable_bytes) / (1024*1024*1024),2) \ + FROM \`~{project_name}.~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS\` \ + WHERE table_name LIKE '${table_pattern}'" > ${output_file_name} + } + + fail=0 + valid='-_0-9a-zA-Z' + # Technically single quotes and exclamation points are allowed but c'mon. + # https://cloud.google.com/resource-manager/docs/creating-managing-projects#:~:text=A%20project%20name%20can%20contain,between%204%20and%2030%20characters. + if [[ ~{project_name} =~ [^$valid] ]] + then + echo "Invalid project name '~{project_name}': contains disallowed characters" + fail=1 + fi + + project_name='~{project_name}' + project_name_length=${#project_name} + + if [[ $project_name_length -lt 4 ]] || [[ $project_name_length -gt 30 ]] + then + echo "Invalid project name '~{project_name}', length must be between 4 and 30 characters inclusive." + fail=1 + fi + + valid="0-9A-Za-z_" + if [[ ~{dataset_name} =~ [^$valid] ]] + then + echo "Invalid dataset name '~{dataset_name}': must contains only letters, numbers, or underscores." + fail=1 + fi + + dataset_name='~{dataset_name}' + dataset_name_length=${#dataset_name} + + if [[ $dataset_name_length -lt 1 ]] || [[ $dataset_name_length -gt 1024 ]] + then + echo "Invalid dataset name '~{dataset_name}': must be no more than 1024 characters." + fail=1 + fi + + if [[ $fail -eq 1 ]] + then + exit 1 + fi + + get_billable_bytes_in_gib "vet_%" vet_gib.txt + get_billable_bytes_in_gib "ref_ranges_%" ref_ranges_gib.txt + get_billable_bytes_in_gib "alt_allele" alt_allele_gib.txt + >>> + runtime { + docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:390.0.0" + } + output { + Float vet_gib = read_float("vet_gib.txt") + Float ref_ranges_gib = read_float("ref_ranges_gib.txt") + Float alt_allele_gib = read_float("alt_allele_gib.txt") + } +} + #task BigQueryWriteAPICost { # meta { # description: "Estimate GvsImportGenomes use of the BQ Write API via core storage costs from the sizes of vet_% and ref_ranges_% tables." From e802c223f5608df90c58887a1945509d47be59cc Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Wed, 22 Jun 2022 17:34:25 -0400 Subject: [PATCH 02/15] lose header --- scripts/variantstore/wdl/GvsCallsetCost.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsCallsetCost.wdl b/scripts/variantstore/wdl/GvsCallsetCost.wdl index 7d3b64b81f7..a60b4a745f7 100644 --- a/scripts/variantstore/wdl/GvsCallsetCost.wdl +++ b/scripts/variantstore/wdl/GvsCallsetCost.wdl @@ -102,7 +102,7 @@ task CoreStorageModelSizes { bq query --location=US --project_id='~{project_name}' --format=csv --use_legacy_sql=false \ "SELECT round(sum(total_billable_bytes) / (1024*1024*1024),2) \ FROM \`~{project_name}.~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS\` \ - WHERE table_name LIKE '${table_pattern}'" > ${output_file_name} + WHERE table_name LIKE '${table_pattern}'" | tail -1 > ${output_file_name} } fail=0 From a6432d6f859936b383d57e26a01153d6ac9d137d Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Wed, 22 Jun 2022 18:08:26 -0400 Subject: [PATCH 03/15] maybe final form --- scripts/variantstore/wdl/GvsCallsetCost.wdl | 54 +++++---------------- 1 file changed, 12 insertions(+), 42 deletions(-) diff --git a/scripts/variantstore/wdl/GvsCallsetCost.wdl b/scripts/variantstore/wdl/GvsCallsetCost.wdl index a60b4a745f7..bd9be13953c 100644 --- a/scripts/variantstore/wdl/GvsCallsetCost.wdl +++ b/scripts/variantstore/wdl/GvsCallsetCost.wdl @@ -4,18 +4,18 @@ workflow GvsCallsetCost { input { String project_name String dataset_name -# String workspace_namespace -# String workspace_name -# String callset_name + String workspace_namespace + String workspace_name + String callset_name Array[String] excluded_submission_ids = [] } - # call WorkflowComputeCosts { - # input: - # workspace_namespace = workspace_namespace, - # workspace_name = workspace_name, - # excluded_submission_ids = excluded_submission_ids - # } + call WorkflowComputeCosts { + input: + workspace_namespace = workspace_namespace, + workspace_name = workspace_name, + excluded_submission_ids = excluded_submission_ids + } call CoreStorageModelSizes { input: @@ -23,12 +23,6 @@ workflow GvsCallsetCost { dataset_name = dataset_name } -# call BigQueryWriteAPICost { -# input: -# project_id = project_id, -# dataset_name = dataset_name -# } -# # call BigQueryScannedCost { # input: # project_id = project_id, @@ -44,8 +38,8 @@ workflow GvsCallsetCost { # } output { - # File workflow_compute_costs = WorkflowComputeCosts.costs - # File workflow_compute_costs_log = WorkflowComputeCosts.log + File workflow_compute_costs = WorkflowComputeCosts.costs + File workflow_compute_costs_log = WorkflowComputeCosts.log String vet_gib = CoreStorageModelSizes.vet_gib String ref_ranges_gib = CoreStorageModelSizes.ref_ranges_gib String alt_allele_gib = CoreStorageModelSizes.alt_allele_gib @@ -136,7 +130,7 @@ task CoreStorageModelSizes { if [[ $dataset_name_length -lt 1 ]] || [[ $dataset_name_length -gt 1024 ]] then - echo "Invalid dataset name '~{dataset_name}': must be no more than 1024 characters." + echo "Invalid dataset name '~{dataset_name}': must be at least one but no more than 1024 characters." fail=1 fi @@ -159,30 +153,6 @@ task CoreStorageModelSizes { } } -#task BigQueryWriteAPICost { -# meta { -# description: "Estimate GvsImportGenomes use of the BQ Write API via core storage costs from the sizes of vet_% and ref_ranges_% tables." -# volatile: true -# } -# -# input { -# String project_id -# String dataset_name -# } -# command <<< -# >>> -# -# runtime { -# docker: "" -# } -# -# output { -# Float vet_gib = read_float("") -# Float ref_ranges_gib = read_float("") -# Float import_genomes_cost = 3 -# } -#} - #task BigQueryScannedCost { # meta { # description: "Determine BigQuery scanned cost for GVSCreateAltAllele, GVSCreateFilterSet, and GVSPrepareRanges" From 13fbbcff3862a3bedbac4376640cc50ea4eefdf1 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Wed, 22 Jun 2022 18:22:58 -0400 Subject: [PATCH 04/15] cleanup --- scripts/variantstore/wdl/GvsCallsetCost.wdl | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/scripts/variantstore/wdl/GvsCallsetCost.wdl b/scripts/variantstore/wdl/GvsCallsetCost.wdl index bd9be13953c..b38a8a5a09a 100644 --- a/scripts/variantstore/wdl/GvsCallsetCost.wdl +++ b/scripts/variantstore/wdl/GvsCallsetCost.wdl @@ -101,7 +101,7 @@ task CoreStorageModelSizes { fail=0 valid='-_0-9a-zA-Z' - # Technically single quotes and exclamation points are allowed but c'mon. + # Technically single quotes and exclamation points are allowed but none of that nonsense here. # https://cloud.google.com/resource-manager/docs/creating-managing-projects#:~:text=A%20project%20name%20can%20contain,between%204%20and%2030%20characters. if [[ ~{project_name} =~ [^$valid] ]] then @@ -111,7 +111,6 @@ task CoreStorageModelSizes { project_name='~{project_name}' project_name_length=${#project_name} - if [[ $project_name_length -lt 4 ]] || [[ $project_name_length -gt 30 ]] then echo "Invalid project name '~{project_name}', length must be between 4 and 30 characters inclusive." @@ -121,16 +120,15 @@ task CoreStorageModelSizes { valid="0-9A-Za-z_" if [[ ~{dataset_name} =~ [^$valid] ]] then - echo "Invalid dataset name '~{dataset_name}': must contains only letters, numbers, or underscores." + echo "Invalid dataset name '~{dataset_name}': must contain only letters, numbers, or underscores." fail=1 fi dataset_name='~{dataset_name}' dataset_name_length=${#dataset_name} - if [[ $dataset_name_length -lt 1 ]] || [[ $dataset_name_length -gt 1024 ]] then - echo "Invalid dataset name '~{dataset_name}': must be at least one but no more than 1024 characters." + echo "Invalid dataset name '~{dataset_name}': length must be between 1 and 1024 characters inclusive." fail=1 fi From 28391fb7fdf20dffc03c5f86bc346c075226df3d Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Thu, 23 Jun 2022 16:58:28 -0400 Subject: [PATCH 05/15] cleanup --- scripts/variantstore/wdl/GvsCallsetCost.wdl | 79 ++++++++++++--------- 1 file changed, 47 insertions(+), 32 deletions(-) diff --git a/scripts/variantstore/wdl/GvsCallsetCost.wdl b/scripts/variantstore/wdl/GvsCallsetCost.wdl index b38a8a5a09a..09bcb35e3e8 100644 --- a/scripts/variantstore/wdl/GvsCallsetCost.wdl +++ b/scripts/variantstore/wdl/GvsCallsetCost.wdl @@ -89,48 +89,63 @@ task CoreStorageModelSizes { volatile: true } command <<< + + sanity_check_project() { + local -n outfail="fail" + + # Technically single quotes and exclamation points are allowed but none of that nonsense here. + # https://cloud.google.com/resource-manager/docs/creating-managing-projects#:~:text=A%20project%20name%20can%20contain,between%204%20and%2030%20characters. + valid='-_0-9a-zA-Z' + + if [[ "~{project_name}" =~ [^$valid] ]] + then + echo "Invalid project name '~{project_name}': contains disallowed characters" + outfail=1 + fi + + project_name='~{project_name}' + project_name_length=${#project_name} + if [[ $project_name_length -lt 4 ]] || [[ $project_name_length -gt 30 ]] + then + echo "Invalid project name '~{project_name}', length must be between 4 and 30 characters inclusive." + outfail=1 + fi + } + + sanity_check_dataset_name() { + local -n outfail="fail" + + valid="0-9A-Za-z_" + + if [[ "~{dataset_name}" =~ [^$valid] ]] + then + echo "Invalid dataset name '~{dataset_name}': must contain only letters, numbers, or underscores." + outfail=1 + fi + + dataset_name='~{dataset_name}' + dataset_name_length=${#dataset_name} + if [[ $dataset_name_length -lt 1 ]] || [[ $dataset_name_length -gt 1024 ]] + then + echo "Invalid dataset name '~{dataset_name}': length must be between 1 and 1024 characters inclusive." + outfail=1 + fi + } + get_billable_bytes_in_gib() { local table_pattern="$1" local output_file_name="$2" bq query --location=US --project_id='~{project_name}' --format=csv --use_legacy_sql=false \ "SELECT round(sum(total_billable_bytes) / (1024*1024*1024),2) \ - FROM \`~{project_name}.~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS\` \ - WHERE table_name LIKE '${table_pattern}'" | tail -1 > ${output_file_name} + FROM \`~{project_name}.~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS\` \ + WHERE table_name LIKE '${table_pattern}'" | tail -1 > ${output_file_name} } fail=0 - valid='-_0-9a-zA-Z' - # Technically single quotes and exclamation points are allowed but none of that nonsense here. - # https://cloud.google.com/resource-manager/docs/creating-managing-projects#:~:text=A%20project%20name%20can%20contain,between%204%20and%2030%20characters. - if [[ ~{project_name} =~ [^$valid] ]] - then - echo "Invalid project name '~{project_name}': contains disallowed characters" - fail=1 - fi - project_name='~{project_name}' - project_name_length=${#project_name} - if [[ $project_name_length -lt 4 ]] || [[ $project_name_length -gt 30 ]] - then - echo "Invalid project name '~{project_name}', length must be between 4 and 30 characters inclusive." - fail=1 - fi - - valid="0-9A-Za-z_" - if [[ ~{dataset_name} =~ [^$valid] ]] - then - echo "Invalid dataset name '~{dataset_name}': must contain only letters, numbers, or underscores." - fail=1 - fi - - dataset_name='~{dataset_name}' - dataset_name_length=${#dataset_name} - if [[ $dataset_name_length -lt 1 ]] || [[ $dataset_name_length -gt 1024 ]] - then - echo "Invalid dataset name '~{dataset_name}': length must be between 1 and 1024 characters inclusive." - fail=1 - fi + sanity_check_project + sanity_check_dataset_name if [[ $fail -eq 1 ]] then From daf39459b052985572ecf39f44f53665cd990c81 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Thu, 23 Jun 2022 17:17:24 -0400 Subject: [PATCH 06/15] improve error message --- scripts/variantstore/wdl/GvsCallsetCost.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/wdl/GvsCallsetCost.wdl b/scripts/variantstore/wdl/GvsCallsetCost.wdl index 09bcb35e3e8..2fb809b82a6 100644 --- a/scripts/variantstore/wdl/GvsCallsetCost.wdl +++ b/scripts/variantstore/wdl/GvsCallsetCost.wdl @@ -99,7 +99,7 @@ task CoreStorageModelSizes { if [[ "~{project_name}" =~ [^$valid] ]] then - echo "Invalid project name '~{project_name}': contains disallowed characters" + echo "Invalid project name '~{project_name}': contains invalid characters, valid characters in [$valid]." outfail=1 fi @@ -119,7 +119,7 @@ task CoreStorageModelSizes { if [[ "~{dataset_name}" =~ [^$valid] ]] then - echo "Invalid dataset name '~{dataset_name}': must contain only letters, numbers, or underscores." + echo "Invalid dataset name '~{dataset_name}': contains invalid characters, valid characters in [$valid]." outfail=1 fi From 45da5596a9acce25b8267b6e40518c83871cee4b Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Mon, 27 Jun 2022 12:55:10 -0400 Subject: [PATCH 07/15] PR feedback --- scripts/variantstore/wdl/GvsCallsetCost.wdl | 32 ++++++++++----------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/scripts/variantstore/wdl/GvsCallsetCost.wdl b/scripts/variantstore/wdl/GvsCallsetCost.wdl index 2fb809b82a6..f78facbc448 100644 --- a/scripts/variantstore/wdl/GvsCallsetCost.wdl +++ b/scripts/variantstore/wdl/GvsCallsetCost.wdl @@ -2,11 +2,11 @@ version 1.0 workflow GvsCallsetCost { input { - String project_name + String project_identifier String dataset_name String workspace_namespace String workspace_name - String callset_name + String callset_identifier Array[String] excluded_submission_ids = [] } @@ -19,7 +19,7 @@ workflow GvsCallsetCost { call CoreStorageModelSizes { input: - project_name = project_name, + project_identifier = project_identifier, dataset_name = dataset_name } @@ -27,14 +27,14 @@ workflow GvsCallsetCost { # input: # project_id = project_id, # dataset_name = dataset_name, -# callset_name = callset_name +# callset_identifier = callset_identifier # } # # call BigQueryStorageAPIScannedCost { # input: # project_id = project_id, # dataset_name = dataset_name, -# callset_name = callset_name +# callset_identifier = callset_identifier # } output { @@ -80,7 +80,7 @@ task WorkflowComputeCosts { task CoreStorageModelSizes { input { - String project_name + String project_identifier String dataset_name } meta { @@ -97,17 +97,17 @@ task CoreStorageModelSizes { # https://cloud.google.com/resource-manager/docs/creating-managing-projects#:~:text=A%20project%20name%20can%20contain,between%204%20and%2030%20characters. valid='-_0-9a-zA-Z' - if [[ "~{project_name}" =~ [^$valid] ]] + if [[ "~{project_identifier}" =~ [^$valid] ]] then - echo "Invalid project name '~{project_name}': contains invalid characters, valid characters in [$valid]." + echo "Invalid project name '~{project_identifier}': contains invalid characters, valid characters in [$valid]." outfail=1 fi - project_name='~{project_name}' - project_name_length=${#project_name} - if [[ $project_name_length -lt 4 ]] || [[ $project_name_length -gt 30 ]] + project_identifier='~{project_identifier}' + project_identifier_length=${#project_identifier} + if [[ $project_identifier_length -lt 4 ]] || [[ $project_identifier_length -gt 30 ]] then - echo "Invalid project name '~{project_name}', length must be between 4 and 30 characters inclusive." + echo "Invalid project name '~{project_identifier}', length must be between 4 and 30 characters inclusive." outfail=1 fi } @@ -136,9 +136,9 @@ task CoreStorageModelSizes { local table_pattern="$1" local output_file_name="$2" - bq query --location=US --project_id='~{project_name}' --format=csv --use_legacy_sql=false \ + bq query --location=US --project_id='~{project_identifier}' --format=csv --use_legacy_sql=false \ "SELECT round(sum(total_billable_bytes) / (1024*1024*1024),2) \ - FROM \`~{project_name}.~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS\` \ + FROM \`~{project_identifier}.~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS\` \ WHERE table_name LIKE '${table_pattern}'" | tail -1 > ${output_file_name} } @@ -175,7 +175,7 @@ task CoreStorageModelSizes { # input { # String project_id # String dataset_name -# String callset_name +# String callset_identifier # } # # command <<< @@ -202,7 +202,7 @@ task CoreStorageModelSizes { # input { # String project_id # String dataset_name -# String callset_name +# String callset_identifier # } # # command <<< From 9e2bb238dd93ef0f8971076c1094e9abec7be209 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Mon, 27 Jun 2022 13:24:58 -0400 Subject: [PATCH 08/15] and again --- scripts/variantstore/wdl/GvsCallsetCost.wdl | 22 ++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/scripts/variantstore/wdl/GvsCallsetCost.wdl b/scripts/variantstore/wdl/GvsCallsetCost.wdl index f78facbc448..6a49e6ee554 100644 --- a/scripts/variantstore/wdl/GvsCallsetCost.wdl +++ b/scripts/variantstore/wdl/GvsCallsetCost.wdl @@ -2,7 +2,7 @@ version 1.0 workflow GvsCallsetCost { input { - String project_identifier + String project_id String dataset_name String workspace_namespace String workspace_name @@ -19,7 +19,7 @@ workflow GvsCallsetCost { call CoreStorageModelSizes { input: - project_identifier = project_identifier, + project_id = project_id, dataset_name = dataset_name } @@ -80,7 +80,7 @@ task WorkflowComputeCosts { task CoreStorageModelSizes { input { - String project_identifier + String project_id String dataset_name } meta { @@ -97,17 +97,17 @@ task CoreStorageModelSizes { # https://cloud.google.com/resource-manager/docs/creating-managing-projects#:~:text=A%20project%20name%20can%20contain,between%204%20and%2030%20characters. valid='-_0-9a-zA-Z' - if [[ "~{project_identifier}" =~ [^$valid] ]] + if [[ "~{project_id}" =~ [^$valid] ]] then - echo "Invalid project name '~{project_identifier}': contains invalid characters, valid characters in [$valid]." + echo "Invalid project name '~{project_id}': contains invalid characters, valid characters in [$valid]." outfail=1 fi - project_identifier='~{project_identifier}' - project_identifier_length=${#project_identifier} - if [[ $project_identifier_length -lt 4 ]] || [[ $project_identifier_length -gt 30 ]] + project_id='~{project_id}' + project_id_length=${#project_id} + if [[ $project_id_length -lt 4 ]] || [[ $project_id_length -gt 30 ]] then - echo "Invalid project name '~{project_identifier}', length must be between 4 and 30 characters inclusive." + echo "Invalid project name '~{project_id}', length must be between 4 and 30 characters inclusive." outfail=1 fi } @@ -136,9 +136,9 @@ task CoreStorageModelSizes { local table_pattern="$1" local output_file_name="$2" - bq query --location=US --project_id='~{project_identifier}' --format=csv --use_legacy_sql=false \ + bq query --location=US --project_id='~{project_id}' --format=csv --use_legacy_sql=false \ "SELECT round(sum(total_billable_bytes) / (1024*1024*1024),2) \ - FROM \`~{project_identifier}.~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS\` \ + FROM \`~{project_id}.~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS\` \ WHERE table_name LIKE '${table_pattern}'" | tail -1 > ${output_file_name} } From 3615581afc7c89ad5c076145afeb0b7d98b82604 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Mon, 27 Jun 2022 16:11:00 -0400 Subject: [PATCH 09/15] PR feedback --- scripts/variantstore/wdl/GvsCallsetCost.wdl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/variantstore/wdl/GvsCallsetCost.wdl b/scripts/variantstore/wdl/GvsCallsetCost.wdl index 6a49e6ee554..bd48cc2b10a 100644 --- a/scripts/variantstore/wdl/GvsCallsetCost.wdl +++ b/scripts/variantstore/wdl/GvsCallsetCost.wdl @@ -6,7 +6,7 @@ workflow GvsCallsetCost { String dataset_name String workspace_namespace String workspace_name - String callset_identifier + String call_set_identifier Array[String] excluded_submission_ids = [] } @@ -27,14 +27,14 @@ workflow GvsCallsetCost { # input: # project_id = project_id, # dataset_name = dataset_name, -# callset_identifier = callset_identifier +# call_set_identifier = call_set_identifier # } # # call BigQueryStorageAPIScannedCost { # input: # project_id = project_id, # dataset_name = dataset_name, -# callset_identifier = callset_identifier +# call_set_identifier = call_set_identifier # } output { @@ -175,7 +175,7 @@ task CoreStorageModelSizes { # input { # String project_id # String dataset_name -# String callset_identifier +# String call_set_identifier # } # # command <<< @@ -202,7 +202,7 @@ task CoreStorageModelSizes { # input { # String project_id # String dataset_name -# String callset_identifier +# String call_set_identifier # } # # command <<< From 863abf9bfa679c2c9f2fa386d1c9cab314f39691 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Mon, 27 Jun 2022 16:28:10 -0400 Subject: [PATCH 10/15] bring validation from in-progress work here --- scripts/variantstore/wdl/GvsCallsetCost.wdl | 115 +++++++++++++++++++- 1 file changed, 110 insertions(+), 5 deletions(-) diff --git a/scripts/variantstore/wdl/GvsCallsetCost.wdl b/scripts/variantstore/wdl/GvsCallsetCost.wdl index bd48cc2b10a..bae7b84a967 100644 --- a/scripts/variantstore/wdl/GvsCallsetCost.wdl +++ b/scripts/variantstore/wdl/GvsCallsetCost.wdl @@ -10,8 +10,18 @@ workflow GvsCallsetCost { Array[String] excluded_submission_ids = [] } + call ValidateInputs { + input: + project_id = project_id, + dataset_name = dataset_name, + workspace_namespace = workspace_namespace, + workspace_name = workspace_name, + call_set_identifier = call_set_identifier + } + call WorkflowComputeCosts { input: + go = ValidateInputs.done, workspace_namespace = workspace_namespace, workspace_name = workspace_name, excluded_submission_ids = excluded_submission_ids @@ -19,6 +29,7 @@ workflow GvsCallsetCost { call CoreStorageModelSizes { input: + go = ValidateInputs.done, project_id = project_id, dataset_name = dataset_name } @@ -46,13 +57,106 @@ workflow GvsCallsetCost { } } +task ValidateInputs { + meta { + description: "Sanity check inputs before running anything" + # OK for this to call cache so it's not `volatile`. + } + input { + String project_id + String dataset_name + String workspace_namespace + String workspace_name + String call_set_identifier + } + command <<< + + sanity_check_input() { + local -n outfail="fail" + local description="$1" + local input="$2" + local valid_characters="$3" + local minimum_length="$4" + local maximum_length="$5" + + + if [[ ${#valid_characters} -gt 0 ]] + then + if [[ "${input}" =~ [^${valid_characters}] ]] + then + echo "Invalid ${description} '${input}': contains invalid characters, valid characters in [$valid]." + outfail=1 + fi + fi + + local input_length=${#input} + if [[ ${input_length} -lt ${minimum_length} ]] || [[ ${input_length} -gt ${maximum_length} ]] + then + echo "Invalid ${description} '$input', length must be between ${minimum_length} and ${maximum_length} characters inclusive." + outfail=1 + fi + } + + fail=0 + + # Technically single quotes and exclamation points are allowed in project names but none of that nonsense here. + # https://cloud.google.com/resource-manager/docs/creating-managing-projects#:~:text=A%20project%20name%20can%20contain,between%204%20and%2030%20characters. + sanity_check_input \ + "project id" \ + "~{project_id}" \ + '-_0-9a-zA-Z' \ + 4 \ + 30 + + sanity_check_input \ + "dataset name" \ + "~{dataset_name}" \ + "0-9A-Za-z_" \ + 1 \ + 1024 + + # The following non-Google restrictions are arbitrary but comforting, relax if necessary. + sanity_check_input \ + "call set identifier" \ + "~{call_set_identifier}" \ + '-_0-9a-zA-Z' \ + 1 \ + 100 + + sanity_check_input \ + "workspace namespace" \ + "~{workspace_namespace}" \ + '' \ + 1 \ + 100 + + sanity_check_input \ + "workspace name" \ + "~{workspace_name}" \ + '' \ + 1 \ + 100 + + if [[ $fail -eq 1 ]] + then + exit 1 + fi + >>> + runtime { + docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:390.0.0" + } + output { + Boolean done = true + } +} + task WorkflowComputeCosts { meta { description: "Calculate workflow compute costs by calling Firecloud APIs for submissions in the specified workspace" volatile: true } - input { + Boolean go = true String workspace_namespace String workspace_name Array[String] excluded_submission_ids @@ -79,15 +183,16 @@ task WorkflowComputeCosts { } task CoreStorageModelSizes { - input { - String project_id - String dataset_name - } meta { description: "Read sizes of vet_%, ref_ranges_%, and alt_allele tables from `INFORMATION_SCHEMA.PARTITIONS`." # Definitely don't cache this, the values will change while the inputs to this task will not! volatile: true } + input { + Boolean go = true + String project_id + String dataset_name + } command <<< sanity_check_project() { From 556cd010fb2c8105d12e79605d8d5a78dd9b1e1f Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Mon, 27 Jun 2022 16:30:48 -0400 Subject: [PATCH 11/15] comments --- scripts/variantstore/wdl/GvsCallsetCost.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/wdl/GvsCallsetCost.wdl b/scripts/variantstore/wdl/GvsCallsetCost.wdl index bae7b84a967..1ff350ac927 100644 --- a/scripts/variantstore/wdl/GvsCallsetCost.wdl +++ b/scripts/variantstore/wdl/GvsCallsetCost.wdl @@ -79,7 +79,7 @@ task ValidateInputs { local minimum_length="$4" local maximum_length="$5" - + # Do not check for valid characters if the `valid_characters` variable is empty. if [[ ${#valid_characters} -gt 0 ]] then if [[ "${input}" =~ [^${valid_characters}] ]] @@ -115,7 +115,7 @@ task ValidateInputs { 1 \ 1024 - # The following non-Google restrictions are arbitrary but comforting, relax if necessary. + # The following non-Google restrictions are arbitrary but comforting and could be relaxed. sanity_check_input \ "call set identifier" \ "~{call_set_identifier}" \ From 711ba6e8058ed88f372813ae2af67a998463aff4 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Mon, 27 Jun 2022 16:36:32 -0400 Subject: [PATCH 12/15] uh whoops --- scripts/variantstore/wdl/GvsCallsetCost.wdl | 52 --------------------- 1 file changed, 52 deletions(-) diff --git a/scripts/variantstore/wdl/GvsCallsetCost.wdl b/scripts/variantstore/wdl/GvsCallsetCost.wdl index 1ff350ac927..457875759f7 100644 --- a/scripts/variantstore/wdl/GvsCallsetCost.wdl +++ b/scripts/variantstore/wdl/GvsCallsetCost.wdl @@ -195,48 +195,6 @@ task CoreStorageModelSizes { } command <<< - sanity_check_project() { - local -n outfail="fail" - - # Technically single quotes and exclamation points are allowed but none of that nonsense here. - # https://cloud.google.com/resource-manager/docs/creating-managing-projects#:~:text=A%20project%20name%20can%20contain,between%204%20and%2030%20characters. - valid='-_0-9a-zA-Z' - - if [[ "~{project_id}" =~ [^$valid] ]] - then - echo "Invalid project name '~{project_id}': contains invalid characters, valid characters in [$valid]." - outfail=1 - fi - - project_id='~{project_id}' - project_id_length=${#project_id} - if [[ $project_id_length -lt 4 ]] || [[ $project_id_length -gt 30 ]] - then - echo "Invalid project name '~{project_id}', length must be between 4 and 30 characters inclusive." - outfail=1 - fi - } - - sanity_check_dataset_name() { - local -n outfail="fail" - - valid="0-9A-Za-z_" - - if [[ "~{dataset_name}" =~ [^$valid] ]] - then - echo "Invalid dataset name '~{dataset_name}': contains invalid characters, valid characters in [$valid]." - outfail=1 - fi - - dataset_name='~{dataset_name}' - dataset_name_length=${#dataset_name} - if [[ $dataset_name_length -lt 1 ]] || [[ $dataset_name_length -gt 1024 ]] - then - echo "Invalid dataset name '~{dataset_name}': length must be between 1 and 1024 characters inclusive." - outfail=1 - fi - } - get_billable_bytes_in_gib() { local table_pattern="$1" local output_file_name="$2" @@ -247,16 +205,6 @@ task CoreStorageModelSizes { WHERE table_name LIKE '${table_pattern}'" | tail -1 > ${output_file_name} } - fail=0 - - sanity_check_project - sanity_check_dataset_name - - if [[ $fail -eq 1 ]] - then - exit 1 - fi - get_billable_bytes_in_gib "vet_%" vet_gib.txt get_billable_bytes_in_gib "ref_ranges_%" ref_ranges_gib.txt get_billable_bytes_in_gib "alt_allele" alt_allele_gib.txt From efdadbcc502dc609de722508c70d6ca28d19678c Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Mon, 27 Jun 2022 16:46:00 -0400 Subject: [PATCH 13/15] whoops --- scripts/variantstore/wdl/GvsCallsetCost.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsCallsetCost.wdl b/scripts/variantstore/wdl/GvsCallsetCost.wdl index 457875759f7..20fe222b840 100644 --- a/scripts/variantstore/wdl/GvsCallsetCost.wdl +++ b/scripts/variantstore/wdl/GvsCallsetCost.wdl @@ -84,7 +84,7 @@ task ValidateInputs { then if [[ "${input}" =~ [^${valid_characters}] ]] then - echo "Invalid ${description} '${input}': contains invalid characters, valid characters in [$valid]." + echo "Invalid ${description} '${input}': contains invalid characters, valid characters in [${valid_characters}]." outfail=1 fi fi From 68c9e2b5a21921548029b5f0bb9f2987f2ad47f7 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Mon, 27 Jun 2022 17:39:36 -0400 Subject: [PATCH 14/15] gah --- scripts/variantstore/wdl/GvsCallsetCost.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsCallsetCost.wdl b/scripts/variantstore/wdl/GvsCallsetCost.wdl index 20fe222b840..d62d45c9790 100644 --- a/scripts/variantstore/wdl/GvsCallsetCost.wdl +++ b/scripts/variantstore/wdl/GvsCallsetCost.wdl @@ -99,7 +99,7 @@ task ValidateInputs { fail=0 - # Technically single quotes and exclamation points are allowed in project names but none of that nonsense here. + # Technically single quotes and exclamation points are allowed in project ids but none of that nonsense here. # https://cloud.google.com/resource-manager/docs/creating-managing-projects#:~:text=A%20project%20name%20can%20contain,between%204%20and%2030%20characters. sanity_check_input \ "project id" \ From aad4ef85e358f31e847dcbf6a4b5c77b020d7285 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Tue, 28 Jun 2022 16:20:45 -0400 Subject: [PATCH 15/15] remove undesired stuff --- scripts/variantstore/wdl/GvsCallsetCost.wdl | 105 -------------------- 1 file changed, 105 deletions(-) diff --git a/scripts/variantstore/wdl/GvsCallsetCost.wdl b/scripts/variantstore/wdl/GvsCallsetCost.wdl index d62d45c9790..3d7f4f5d852 100644 --- a/scripts/variantstore/wdl/GvsCallsetCost.wdl +++ b/scripts/variantstore/wdl/GvsCallsetCost.wdl @@ -10,18 +10,8 @@ workflow GvsCallsetCost { Array[String] excluded_submission_ids = [] } - call ValidateInputs { - input: - project_id = project_id, - dataset_name = dataset_name, - workspace_namespace = workspace_namespace, - workspace_name = workspace_name, - call_set_identifier = call_set_identifier - } - call WorkflowComputeCosts { input: - go = ValidateInputs.done, workspace_namespace = workspace_namespace, workspace_name = workspace_name, excluded_submission_ids = excluded_submission_ids @@ -29,7 +19,6 @@ workflow GvsCallsetCost { call CoreStorageModelSizes { input: - go = ValidateInputs.done, project_id = project_id, dataset_name = dataset_name } @@ -57,98 +46,6 @@ workflow GvsCallsetCost { } } -task ValidateInputs { - meta { - description: "Sanity check inputs before running anything" - # OK for this to call cache so it's not `volatile`. - } - input { - String project_id - String dataset_name - String workspace_namespace - String workspace_name - String call_set_identifier - } - command <<< - - sanity_check_input() { - local -n outfail="fail" - local description="$1" - local input="$2" - local valid_characters="$3" - local minimum_length="$4" - local maximum_length="$5" - - # Do not check for valid characters if the `valid_characters` variable is empty. - if [[ ${#valid_characters} -gt 0 ]] - then - if [[ "${input}" =~ [^${valid_characters}] ]] - then - echo "Invalid ${description} '${input}': contains invalid characters, valid characters in [${valid_characters}]." - outfail=1 - fi - fi - - local input_length=${#input} - if [[ ${input_length} -lt ${minimum_length} ]] || [[ ${input_length} -gt ${maximum_length} ]] - then - echo "Invalid ${description} '$input', length must be between ${minimum_length} and ${maximum_length} characters inclusive." - outfail=1 - fi - } - - fail=0 - - # Technically single quotes and exclamation points are allowed in project ids but none of that nonsense here. - # https://cloud.google.com/resource-manager/docs/creating-managing-projects#:~:text=A%20project%20name%20can%20contain,between%204%20and%2030%20characters. - sanity_check_input \ - "project id" \ - "~{project_id}" \ - '-_0-9a-zA-Z' \ - 4 \ - 30 - - sanity_check_input \ - "dataset name" \ - "~{dataset_name}" \ - "0-9A-Za-z_" \ - 1 \ - 1024 - - # The following non-Google restrictions are arbitrary but comforting and could be relaxed. - sanity_check_input \ - "call set identifier" \ - "~{call_set_identifier}" \ - '-_0-9a-zA-Z' \ - 1 \ - 100 - - sanity_check_input \ - "workspace namespace" \ - "~{workspace_namespace}" \ - '' \ - 1 \ - 100 - - sanity_check_input \ - "workspace name" \ - "~{workspace_name}" \ - '' \ - 1 \ - 100 - - if [[ $fail -eq 1 ]] - then - exit 1 - fi - >>> - runtime { - docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:390.0.0" - } - output { - Boolean done = true - } -} task WorkflowComputeCosts { meta { @@ -156,7 +53,6 @@ task WorkflowComputeCosts { volatile: true } input { - Boolean go = true String workspace_namespace String workspace_name Array[String] excluded_submission_ids @@ -189,7 +85,6 @@ task CoreStorageModelSizes { volatile: true } input { - Boolean go = true String project_id String dataset_name }