From c05bc7a0b791fe6f1a7534aaa5b0c72e0e1bd984 Mon Sep 17 00:00:00 2001 From: bradmiro Date: Mon, 18 Sep 2023 15:54:26 -0400 Subject: [PATCH 01/20] changing dataplex tables to managed by default, removing manual 'upgrade' logic --- dataplex.tf | 10 +++-- src/yaml/project-setup.yaml | 84 ------------------------------------- 2 files changed, 6 insertions(+), 88 deletions(-) diff --git a/dataplex.tf b/dataplex.tf index 958eb3e..69e375d 100644 --- a/dataplex.tf +++ b/dataplex.tf @@ -114,8 +114,9 @@ resource "google_dataplex_asset" "gcp_primary_textocr" { } resource_spec { - name = "projects/${module.project-services.project_id}/buckets/${google_storage_bucket.textocr_images_bucket.name}" - type = "STORAGE_BUCKET" + name = "projects/${module.project-services.project_id}/buckets/${google_storage_bucket.textocr_images_bucket.name}" + type = "STORAGE_BUCKET" + read_access_mode = "MANAGED" } project = module.project-services.project_id @@ -136,8 +137,9 @@ resource "google_dataplex_asset" "gcp_primary_ga4_obfuscated_sample_ecommerce" { } resource_spec { - name = "projects/${module.project-services.project_id}/buckets/${google_storage_bucket.ga4_images_bucket.name}" - type = "STORAGE_BUCKET" + name = "projects/${module.project-services.project_id}/buckets/${google_storage_bucket.ga4_images_bucket.name}" + type = "STORAGE_BUCKET" + read_access_mode = "MANAGED" } project = module.project-services.project_id diff --git a/src/yaml/project-setup.yaml b/src/yaml/project-setup.yaml index 935cd2d..7bb702e 100644 --- a/src/yaml/project-setup.yaml +++ b/src/yaml/project-setup.yaml @@ -33,9 +33,6 @@ main: - dataproc_service_account_name: ${dataproc_service_account} - provisioner_bucket_name: ${provisioner_bucket} - warehouse_bucket_name: ${warehouse_bucket} - - sub_upgrade_dataplex_assets: - call: upgrade_dataplex_assets - result: upgrade_dataplex_assets_output # TODO: change this to poll for BigQuery table creation - sub_wait_for_dataplex_discovery: call: sys.sleep @@ -56,87 +53,6 @@ main: call: create_taxonomy result: create_taxonomy_output -# Subworkflow to upgrade all Dataplex Assets to Managed -# Subworkflow gets all lakes, then all zones within each lake, then all assets within each zone and upgrades -upgrade_dataplex_assets: - steps: - - init: - assign: - - project_id: $${sys.get_env("GOOGLE_CLOUD_PROJECT_ID")} - - location: $${sys.get_env("GOOGLE_CLOUD_LOCATION")} - - zones: [] - - get_lakes: - call: http.get - args: - url: $${"https://dataplex.googleapis.com/v1/projects/"+project_id+"/locations/"+location+"/lakes"} - auth: - type: OAuth2 - result: Response - - assign_lakes: - assign: - - response_lakes: $${Response.body.lakes} - - get_zones: - for: - value: lake - index: i - in: $${response_lakes} - steps: - - get_zones_in_lake: - call: http.get - args: - url: $${"https://dataplex.googleapis.com/v1/"+lake.name+"/zones"} - auth: - type: OAuth2 - result: Response - - assign_zones: - assign: - - response_zones: $${Response.body.zones} - - - save_zones: - for: - value: zone - index: j - in: $${response_zones} - steps: - - save_to_list: - assign: - - zones: $${list.concat(zones, zone)} - - get_and_upgrade_all_assets: - for: - value: zone - index: i - in: $${zones} - steps: - - get_assets_in_zone: - call: http.get - args: - url: $${"https://dataplex.googleapis.com/v1/"+zone.name+"/assets"} - auth: - type: OAuth2 - result: Response - - check_for_assets: - switch: - - condition: $${not("assets" in Response.body)} - next: continue - - assign_assets: - assign: - - response_assets: $${Response.body.assets} - - upgrade_all_assets: - for: - value: asset - index: j - in: $${response_assets} - steps: - - upgrade_asset: - call: http.patch - args: - url: $${"https://dataplex.googleapis.com/v1/"+asset.name+"?updateMask=resourceSpec.readAccessMode"} - auth: - type: OAuth2 - body: - resourceSpec: - readAccessMode: "MANAGED" - # Subworkflow to create BigQuery views create_tables: steps: From 308f2dc6b178eeb53b735844722ce0603fabc5f9 Mon Sep 17 00:00:00 2001 From: bradmiro Date: Mon, 18 Sep 2023 16:14:26 -0400 Subject: [PATCH 02/20] scrubbing unnecessary explicit 'depends_on' --- dataplex.tf | 6 +++--- dataproc.tf | 16 ---------------- workflows.tf | 2 -- 3 files changed, 3 insertions(+), 21 deletions(-) diff --git a/dataplex.tf b/dataplex.tf index 69e375d..6e3e121 100644 --- a/dataplex.tf +++ b/dataplex.tf @@ -160,13 +160,13 @@ resource "google_dataplex_asset" "gcp_primary_tables" { } resource_spec { - name = "projects/${module.project-services.project_id}/buckets/${google_storage_bucket.tables_bucket.name}" - type = "STORAGE_BUCKET" + name = "projects/${module.project-services.project_id}/buckets/${google_storage_bucket.tables_bucket.name}" + type = "STORAGE_BUCKET" + read_access_mode = "MANAGED" } project = module.project-services.project_id depends_on = [time_sleep.wait_after_all_resources, google_project_iam_member.dataplex_bucket_access] - } diff --git a/dataproc.tf b/dataproc.tf index 91609b0..d51e62b 100644 --- a/dataproc.tf +++ b/dataproc.tf @@ -31,10 +31,6 @@ resource "google_compute_subnetwork" "subnet" { region = var.region network = google_compute_network.default_network.id private_ip_google_access = true - - depends_on = [ - google_compute_network.default_network, - ] } # Firewall rule for dataproc cluster @@ -83,10 +79,6 @@ resource "google_project_iam_member" "dataproc_sa_roles" { project = module.project-services.project_id role = each.key member = "serviceAccount:${google_service_account.dataproc_service_account.email}" - - depends_on = [ - google_service_account.dataproc_service_account - ] } # # Create a BigQuery connection @@ -103,10 +95,6 @@ resource "google_project_iam_member" "bq_connection_iam_object_viewer" { project = module.project-services.project_id role = "roles/storage.objectViewer" member = "serviceAccount:${google_bigquery_connection.ds_connection.cloud_resource[0].service_account_id}" - - depends_on = [ - google_bigquery_connection.ds_connection - ] } # # Grant IAM access to the BigQuery Connection account for BigLake Metastore @@ -114,10 +102,6 @@ resource "google_project_iam_member" "bq_connection_iam_biglake" { project = module.project-services.project_id role = "roles/biglake.admin" member = "serviceAccount:${google_bigquery_connection.ds_connection.cloud_resource[0].service_account_id}" - - depends_on = [ - google_bigquery_connection.ds_connection - ] } # # Create a BigQuery external table. diff --git a/workflows.tf b/workflows.tf index ec50d71..75316fd 100644 --- a/workflows.tf +++ b/workflows.tf @@ -120,7 +120,6 @@ data "http" "call_workflows_copy_data" { Accept = "application/json" Authorization = "Bearer ${data.google_client_config.current.access_token}" } depends_on = [ - google_workflows_workflow.copy_data, google_storage_bucket.textocr_images_bucket, google_storage_bucket.ga4_images_bucket, google_storage_bucket.tables_bucket @@ -135,7 +134,6 @@ data "http" "call_workflows_project_setup" { Accept = "application/json" Authorization = "Bearer ${data.google_client_config.current.access_token}" } depends_on = [ - google_workflows_workflow.project_setup, google_dataplex_asset.gcp_primary_textocr, google_dataplex_asset.gcp_primary_ga4_obfuscated_sample_ecommerce, google_dataplex_asset.gcp_primary_tables From 752e109b86377e16fcb525d8338fd63b5c101bb6 Mon Sep 17 00:00:00 2001 From: bradmiro Date: Mon, 18 Sep 2023 16:39:32 -0400 Subject: [PATCH 03/20] remove creating bq external table --- dataplex.tf | 15 ++++---- dataproc.tf | 100 +--------------------------------------------------- 2 files changed, 8 insertions(+), 107 deletions(-) diff --git a/dataplex.tf b/dataplex.tf index 6e3e121..db72c2e 100644 --- a/dataplex.tf +++ b/dataplex.tf @@ -20,6 +20,13 @@ resource "google_project_service_identity" "dataplex_sa" { service = "dataplex.googleapis.com" } +#give dataplex access to biglake bucket +resource "google_project_iam_member" "dataplex_bucket_access" { + project = module.project-services.project_id + role = "roles/dataplex.serviceAgent" + member = "serviceAccount:${google_project_service_identity.dataplex_sa.email}" +} + resource "google_dataplex_lake" "gcp_primary" { location = var.region name = "gcp-primary-lake" @@ -168,11 +175,3 @@ resource "google_dataplex_asset" "gcp_primary_tables" { project = module.project-services.project_id depends_on = [time_sleep.wait_after_all_resources, google_project_iam_member.dataplex_bucket_access] } - - -#give dataplex access to biglake bucket -resource "google_project_iam_member" "dataplex_bucket_access" { - project = module.project-services.project_id - role = "roles/dataplex.serviceAgent" - member = "serviceAccount:${google_project_service_identity.dataplex_sa.email}" -} diff --git a/dataproc.tf b/dataproc.tf index d51e62b..323eed7 100644 --- a/dataproc.tf +++ b/dataproc.tf @@ -102,102 +102,4 @@ resource "google_project_iam_member" "bq_connection_iam_biglake" { project = module.project-services.project_id role = "roles/biglake.admin" member = "serviceAccount:${google_bigquery_connection.ds_connection.cloud_resource[0].service_account_id}" -} - -# # Create a BigQuery external table. -resource "google_bigquery_table" "tbl_thelook_events" { - dataset_id = google_bigquery_dataset.gcp_lakehouse_ds.dataset_id - table_id = "gcp_tbl_events" - project = module.project-services.project_id - deletion_protection = var.deletion_protection - - external_data_configuration { - autodetect = true - connection_id = google_bigquery_connection.ds_connection.name #TODO: Change other solutions to remove hardcoded reference - source_format = "PARQUET" - source_uris = ["gs://${var.public_data_bucket}/thelook_ecommerce/events-*.Parquet"] - - } - - schema = < Date: Mon, 18 Sep 2023 16:47:20 -0400 Subject: [PATCH 04/20] adding options for stage and nonstage table prefix --- src/bigquery.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/bigquery.py b/src/bigquery.py index 3ad9e09..bade034 100644 --- a/src/bigquery.py +++ b/src/bigquery.py @@ -44,9 +44,14 @@ # Load data from BigQuery. -events = spark.read.format("bigquery") \ - .option("table", "gcp_primary_staging.stage_thelook_ecommerce_events") \ - .load() +try: + events = spark.read.format("bigquery") \ + .option("table", "gcp_primary_staging.thelook_ecommerce_events") \ + .load() +except: + events = spark.read.format("bigquery") \ + .option("table", "gcp_primary_staging.stage_thelook_ecommerce_events") \ + .load() events.createOrReplaceTempView("events") # Create Iceberg Table if not exists From 642073fcd4d1e25bee787b34499ed4631524cc1b Mon Sep 17 00:00:00 2001 From: bradmiro Date: Thu, 21 Sep 2023 18:34:14 -0400 Subject: [PATCH 05/20] remove wait_after_all_resources and move dependencies to more appropriate spots --- dataplex.tf | 6 +++--- dataproc.tf | 4 ++++ main.tf | 18 ------------------ workflows.tf | 23 +++++++++++++++++++++-- 4 files changed, 28 insertions(+), 23 deletions(-) diff --git a/dataplex.tf b/dataplex.tf index db72c2e..07c41df 100644 --- a/dataplex.tf +++ b/dataplex.tf @@ -127,7 +127,7 @@ resource "google_dataplex_asset" "gcp_primary_textocr" { } project = module.project-services.project_id - depends_on = [time_sleep.wait_after_all_resources, google_project_iam_member.dataplex_bucket_access] + depends_on = [time_sleep.wait_after_copy_data, google_project_iam_member.dataplex_bucket_access] } @@ -150,7 +150,7 @@ resource "google_dataplex_asset" "gcp_primary_ga4_obfuscated_sample_ecommerce" { } project = module.project-services.project_id - depends_on = [time_sleep.wait_after_all_resources, google_project_iam_member.dataplex_bucket_access] + depends_on = [time_sleep.wait_after_copy_data, google_project_iam_member.dataplex_bucket_access] } @@ -173,5 +173,5 @@ resource "google_dataplex_asset" "gcp_primary_tables" { } project = module.project-services.project_id - depends_on = [time_sleep.wait_after_all_resources, google_project_iam_member.dataplex_bucket_access] + depends_on = [time_sleep.wait_after_copy_data, google_project_iam_member.dataplex_bucket_access] } diff --git a/dataproc.tf b/dataproc.tf index 323eed7..8075d04 100644 --- a/dataproc.tf +++ b/dataproc.tf @@ -102,4 +102,8 @@ resource "google_project_iam_member" "bq_connection_iam_biglake" { project = module.project-services.project_id role = "roles/biglake.admin" member = "serviceAccount:${google_bigquery_connection.ds_connection.cloud_resource[0].service_account_id}" +} + +resource "google_dataproc_cluster" "phs" { + } \ No newline at end of file diff --git a/main.tf b/main.tf index 26cc876..01d2d81 100644 --- a/main.tf +++ b/main.tf @@ -161,21 +161,3 @@ resource "google_storage_bucket" "dataplex_bucket" { uniform_bucket_level_access = true force_destroy = var.force_destroy } - -# Resources are dependent on one another. We will ensure the following set of resources are created before proceeding. -resource "time_sleep" "wait_after_all_resources" { - create_duration = "120s" - depends_on = [ - module.project-services, - google_storage_bucket.provisioning_bucket, - google_bigquery_dataset.gcp_lakehouse_ds, - google_bigquery_connection.gcp_lakehouse_connection, - google_project_iam_member.connectionPermissionGrant, - google_workflows_workflow.project_setup, - google_dataplex_zone.gcp_primary_raw, - google_dataplex_zone.gcp_primary_staging, - google_dataplex_zone.gcp_primary_curated_bi, - data.google_storage_project_service_account.gcs_account, - data.http.call_workflows_copy_data - ] -} diff --git a/workflows.tf b/workflows.tf index 75316fd..910f9a0 100644 --- a/workflows.tf +++ b/workflows.tf @@ -126,7 +126,15 @@ data "http" "call_workflows_copy_data" { ] } -# # execute the other project setup workflow +resource "time_sleep" "wait_after_copy_data" { + create_duration = "30s" + depends_on = [ + data.google_storage_project_service_account.gcs_account, + data.http.call_workflows_copy_data + ] +} + +# execute the other project setup workflow data "http" "call_workflows_project_setup" { url = "https://workflowexecutions.googleapis.com/v1/projects/${module.project-services.project_id}/locations/${var.region}/workflows/${google_workflows_workflow.project_setup.name}/executions" method = "POST" @@ -134,9 +142,20 @@ data "http" "call_workflows_project_setup" { Accept = "application/json" Authorization = "Bearer ${data.google_client_config.current.access_token}" } depends_on = [ + google_storage_bucket.temp_bucket, + google_storage_bucket.provisioning_bucket, + google_storage_bucket.warehouse_bucket, + google_storage_bucket.dataproc_service_account, + google_bigquery_dataset.gcp_lakehouse_ds, + google_bigquery_connection.gcp_lakehouse_connection, google_dataplex_asset.gcp_primary_textocr, google_dataplex_asset.gcp_primary_ga4_obfuscated_sample_ecommerce, - google_dataplex_asset.gcp_primary_tables + google_dataplex_asset.gcp_primary_tables, + google_project_iam_member.connectionPermissionGrant, + google_project_iam_member.dataproc_sa_roles, + + google_project_iam_member.connectionPermissionGrant, + time_sleep.wait_after_copy_data ] } From e83e8d9396604ae3cf2e37cc669c7be0dd2825f5 Mon Sep 17 00:00:00 2001 From: bradmiro Date: Fri, 22 Sep 2023 15:46:34 -0400 Subject: [PATCH 06/20] fixes --- dataproc.tf | 4 ---- workflows.tf | 13 ++++++------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/dataproc.tf b/dataproc.tf index 8075d04..fd86315 100644 --- a/dataproc.tf +++ b/dataproc.tf @@ -103,7 +103,3 @@ resource "google_project_iam_member" "bq_connection_iam_biglake" { role = "roles/biglake.admin" member = "serviceAccount:${google_bigquery_connection.ds_connection.cloud_resource[0].service_account_id}" } - -resource "google_dataproc_cluster" "phs" { - -} \ No newline at end of file diff --git a/workflows.tf b/workflows.tf index 910f9a0..3b25bf9 100644 --- a/workflows.tf +++ b/workflows.tf @@ -142,19 +142,18 @@ data "http" "call_workflows_project_setup" { Accept = "application/json" Authorization = "Bearer ${data.google_client_config.current.access_token}" } depends_on = [ - google_storage_bucket.temp_bucket, - google_storage_bucket.provisioning_bucket, - google_storage_bucket.warehouse_bucket, - google_storage_bucket.dataproc_service_account, google_bigquery_dataset.gcp_lakehouse_ds, google_bigquery_connection.gcp_lakehouse_connection, - google_dataplex_asset.gcp_primary_textocr, google_dataplex_asset.gcp_primary_ga4_obfuscated_sample_ecommerce, google_dataplex_asset.gcp_primary_tables, + google_dataplex_asset.gcp_primary_textocr, google_project_iam_member.connectionPermissionGrant, - google_project_iam_member.dataproc_sa_roles, - google_project_iam_member.connectionPermissionGrant, + google_project_iam_member.dataproc_sa_roles, + google_storage_bucket.temp_bucket, + google_storage_bucket.provisioning_bucket, + google_storage_bucket.warehouse_bucket, + google_storage_bucket.dataproc_service_account, time_sleep.wait_after_copy_data ] } From 869d5aeabc9e9904839f0bef3845260270462414 Mon Sep 17 00:00:00 2001 From: bradmiro Date: Tue, 26 Sep 2023 11:42:31 -0400 Subject: [PATCH 07/20] updating terraform version --- versions.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/versions.tf b/versions.tf index d9e3288..95afc51 100644 --- a/versions.tf +++ b/versions.tf @@ -18,7 +18,7 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "<= 4.69.0, != 4.65.0, != 4.65.1" + version = ">= 4.83.0, <= 4.89.0" } google-beta = { source = "hashicorp/google-beta" From c14106da8c789d7b93abe3161077a38a38b1e455 Mon Sep 17 00:00:00 2001 From: bradmiro Date: Tue, 26 Sep 2023 12:23:21 -0400 Subject: [PATCH 08/20] removing temporary bucket --- src/bigquery.py | 4 ++-- workflows.tf | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/bigquery.py b/src/bigquery.py index bade034..69d81d3 100644 --- a/src/bigquery.py +++ b/src/bigquery.py @@ -25,14 +25,14 @@ catalog = os.getenv("lakehouse_catalog", "lakehouse_catalog") database = os.getenv("lakehouse_db", "lakehouse_db") -bucket = os.getenv("temp_bucket", "gcp-lakehouse-provisioner-8a68acad") +# bucket = os.getenv("temp_bucket", "gcp-lakehouse-provisioner-8a68acad") bq_dataset = os.getenv("bq_dataset", "gcp_lakehouse_ds") bq_connection = os.getenv("bq_gcs_connection", "us-central1.gcp_gcs_connection") # Use the Cloud Storage bucket for temporary BigQuery export data # used by the connector. -spark.conf.set("temporaryGcsBucket", bucket) +# spark.conf.set("temporaryGcsBucket", bucket) # Delete the BigLake Catalog if it currently exists to ensure proper setup. spark.sql(f"DROP NAMESPACE IF EXISTS {catalog} CASCADE;") diff --git a/workflows.tf b/workflows.tf index 3b25bf9..a3c9948 100644 --- a/workflows.tf +++ b/workflows.tf @@ -150,7 +150,7 @@ data "http" "call_workflows_project_setup" { google_project_iam_member.connectionPermissionGrant, google_project_iam_member.connectionPermissionGrant, google_project_iam_member.dataproc_sa_roles, - google_storage_bucket.temp_bucket, + # google_storage_bucket.temp_bucket, google_storage_bucket.provisioning_bucket, google_storage_bucket.warehouse_bucket, google_storage_bucket.dataproc_service_account, From 0865d5879fa0787e06b0afb51a50be8e07f571c2 Mon Sep 17 00:00:00 2001 From: bradmiro Date: Tue, 26 Sep 2023 12:28:45 -0400 Subject: [PATCH 09/20] lint fixes --- workflows.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows.tf b/workflows.tf index a3c9948..e121169 100644 --- a/workflows.tf +++ b/workflows.tf @@ -150,10 +150,10 @@ data "http" "call_workflows_project_setup" { google_project_iam_member.connectionPermissionGrant, google_project_iam_member.connectionPermissionGrant, google_project_iam_member.dataproc_sa_roles, + google_service_account.dataproc_service_account, # google_storage_bucket.temp_bucket, google_storage_bucket.provisioning_bucket, google_storage_bucket.warehouse_bucket, - google_storage_bucket.dataproc_service_account, time_sleep.wait_after_copy_data ] } From 23b335cfa13e832ce1095517c7dd1bb95645a971 Mon Sep 17 00:00:00 2001 From: bradmiro Date: Tue, 26 Sep 2023 15:18:24 -0400 Subject: [PATCH 10/20] lint cleanup --- src/bigquery.py | 4 +++- variables.tf | 12 ------------ 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/src/bigquery.py b/src/bigquery.py index 69d81d3..62d33c1 100644 --- a/src/bigquery.py +++ b/src/bigquery.py @@ -14,6 +14,8 @@ # limitations under the License. """BigQuery I/O with BigLake Iceberg PySpark example.""" +from py4j.protocol import Py4JJavaError + from pyspark.sql import SparkSession import os @@ -48,7 +50,7 @@ events = spark.read.format("bigquery") \ .option("table", "gcp_primary_staging.thelook_ecommerce_events") \ .load() -except: +except Py4JJavaError: events = spark.read.format("bigquery") \ .option("table", "gcp_primary_staging.stage_thelook_ecommerce_events") \ .load() diff --git a/variables.tf b/variables.tf index d48efd1..21b8177 100644 --- a/variables.tf +++ b/variables.tf @@ -48,20 +48,8 @@ variable "force_destroy" { default = false } -variable "deletion_protection" { - type = string - description = "Whether or not to protect GCS resources from deletion when solution is modified or changed." - default = true -} - variable "use_case_short" { type = string description = "Short name for use case" default = "lakehouse" } - -variable "public_data_bucket" { - type = string - description = "Public Data bucket for access" - default = "data-analytics-demos" -} From af117fd893c0a31e5afd6553c11fce456dc38c73 Mon Sep 17 00:00:00 2001 From: bradmiro Date: Tue, 26 Sep 2023 15:42:55 -0400 Subject: [PATCH 11/20] remove 'delete_protection' from example --- examples/analytics_lakehouse/main.tf | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/analytics_lakehouse/main.tf b/examples/analytics_lakehouse/main.tf index bd65a98..057e705 100644 --- a/examples/analytics_lakehouse/main.tf +++ b/examples/analytics_lakehouse/main.tf @@ -17,9 +17,8 @@ module "analytics_lakehouse" { source = "../.." - project_id = var.project_id - region = "us-central1" - deletion_protection = false - force_destroy = true + project_id = var.project_id + region = "us-central1" + force_destroy = true } From 9c9b1ec49c9d75c464fb2de45ef19bea19893edd Mon Sep 17 00:00:00 2001 From: bradmiro Date: Wed, 4 Oct 2023 12:24:33 -0400 Subject: [PATCH 12/20] resolving merge conflict --- examples/analytics_lakehouse/main.tf | 7 ++++--- src/bigquery.py | 3 ++- variables.tf | 6 ++++++ 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/examples/analytics_lakehouse/main.tf b/examples/analytics_lakehouse/main.tf index 057e705..bd65a98 100644 --- a/examples/analytics_lakehouse/main.tf +++ b/examples/analytics_lakehouse/main.tf @@ -17,8 +17,9 @@ module "analytics_lakehouse" { source = "../.." - project_id = var.project_id - region = "us-central1" - force_destroy = true + project_id = var.project_id + region = "us-central1" + deletion_protection = false + force_destroy = true } diff --git a/src/bigquery.py b/src/bigquery.py index 62d33c1..11b62d4 100644 --- a/src/bigquery.py +++ b/src/bigquery.py @@ -52,7 +52,8 @@ .load() except Py4JJavaError: events = spark.read.format("bigquery") \ - .option("table", "gcp_primary_staging.stage_thelook_ecommerce_events") \ + .option("table", + "gcp_primary_staging.stage_thelook_ecommerce_events") \ .load() events.createOrReplaceTempView("events") diff --git a/variables.tf b/variables.tf index 21b8177..169f26e 100644 --- a/variables.tf +++ b/variables.tf @@ -48,6 +48,12 @@ variable "force_destroy" { default = false } +variable "deletion_protection" { + type = string + description = "Whether or not to protect GCS resources from deletion when solution is modified or changed." + default = true +} + variable "use_case_short" { type = string description = "Short name for use case" From 28ec8c713f2991a3e66b4e42866dbffa0d47a1e3 Mon Sep 17 00:00:00 2001 From: bradmiro Date: Wed, 27 Sep 2023 17:20:11 -0400 Subject: [PATCH 13/20] removing delete_protection, fixing docstrings, adding public_data_bucket to workflow --- README.md | 1 - bigquery.tf | 2 -- src/yaml/copy-data.yaml | 2 +- variables.tf | 14 +++++++------- versions.tf | 2 +- workflows.tf | 1 + 6 files changed, 10 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index e23acde..8168ee6 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,6 @@ Functional examples are included in the | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| deletion\_protection | Whether or not to protect GCS resources from deletion when solution is modified or changed. | `string` | `true` | no | | enable\_apis | Whether or not to enable underlying apis in this solution. . | `string` | `true` | no | | force\_destroy | Whether or not to protect BigQuery resources from deletion when solution is modified or changed. | `string` | `false` | no | | labels | A map of labels to apply to contained resources. | `map(string)` |
{
"analytics-lakehouse": true
}
| no | diff --git a/bigquery.tf b/bigquery.tf index cd2a34d..dd86d63 100644 --- a/bigquery.tf +++ b/bigquery.tf @@ -26,8 +26,6 @@ resource "google_bigquery_dataset" "gcp_lakehouse_ds" { delete_contents_on_destroy = var.force_destroy } - - # # Create a BigQuery connection resource "google_bigquery_connection" "gcp_lakehouse_connection" { project = module.project-services.project_id diff --git a/src/yaml/copy-data.yaml b/src/yaml/copy-data.yaml index 9901d7e..75dd0a0 100644 --- a/src/yaml/copy-data.yaml +++ b/src/yaml/copy-data.yaml @@ -18,7 +18,7 @@ main: - init: # Define local variables from terraform env variables assign: - - source_bucket_name: "data-analytics-demos" + - source_bucket_name: ${public_data_bucket} - dest_ga4_images_bucket_name: ${ga4_images_bucket} - dest_textocr_images_bucket_name: ${textocr_images_bucket} - dest_tables_bucket_name: ${tables_bucket} diff --git a/variables.tf b/variables.tf index 169f26e..f8fa017 100644 --- a/variables.tf +++ b/variables.tf @@ -43,15 +43,9 @@ variable "enable_apis" { } variable "force_destroy" { - type = string - description = "Whether or not to protect BigQuery resources from deletion when solution is modified or changed." - default = false -} - -variable "deletion_protection" { type = string description = "Whether or not to protect GCS resources from deletion when solution is modified or changed." - default = true + default = false } variable "use_case_short" { @@ -59,3 +53,9 @@ variable "use_case_short" { description = "Short name for use case" default = "lakehouse" } + +variable "public_data_bucket" { + type = string + description = "Public Data bucket for access" + default = "data-analytics-demos" +} diff --git a/versions.tf b/versions.tf index 95afc51..9a48b03 100644 --- a/versions.tf +++ b/versions.tf @@ -41,7 +41,7 @@ terraform { version = ">= 3.2.1" } } - required_version = ">= 0.13" + required_version = ">= 0.13, <= 1.2.3" provider_meta "google" { module_name = "blueprints/terraform/terraform-google-analytics-lakehouse/v0.3.0" diff --git a/workflows.tf b/workflows.tf index e121169..424307b 100644 --- a/workflows.tf +++ b/workflows.tf @@ -98,6 +98,7 @@ resource "google_workflows_workflow" "project_setup" { provisioner_bucket = google_storage_bucket.provisioning_bucket.name, warehouse_bucket = google_storage_bucket.warehouse_bucket.name, temp_bucket = google_storage_bucket.warehouse_bucket.name, + public_data_bucket = var.public_data_bucket }) depends_on = [ From b60add1b5da15a4a779ed1b5ac86b6e790a71ccd Mon Sep 17 00:00:00 2001 From: bradmiro Date: Wed, 27 Sep 2023 17:27:33 -0400 Subject: [PATCH 14/20] revert version pinning --- versions.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/versions.tf b/versions.tf index 9a48b03..95afc51 100644 --- a/versions.tf +++ b/versions.tf @@ -41,7 +41,7 @@ terraform { version = ">= 3.2.1" } } - required_version = ">= 0.13, <= 1.2.3" + required_version = ">= 0.13" provider_meta "google" { module_name = "blueprints/terraform/terraform-google-analytics-lakehouse/v0.3.0" From b2131cce0b959e2d579dcb627e83e26e923baaad Mon Sep 17 00:00:00 2001 From: bradmiro Date: Wed, 27 Sep 2023 17:42:55 -0400 Subject: [PATCH 15/20] updating examples --- examples/analytics_lakehouse/main.tf | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/analytics_lakehouse/main.tf b/examples/analytics_lakehouse/main.tf index bd65a98..057e705 100644 --- a/examples/analytics_lakehouse/main.tf +++ b/examples/analytics_lakehouse/main.tf @@ -17,9 +17,8 @@ module "analytics_lakehouse" { source = "../.." - project_id = var.project_id - region = "us-central1" - deletion_protection = false - force_destroy = true + project_id = var.project_id + region = "us-central1" + force_destroy = true } From a89b7b687f578d6d8cd3f6188dd0d183f21431c4 Mon Sep 17 00:00:00 2001 From: bradmiro Date: Wed, 27 Sep 2023 17:50:31 -0400 Subject: [PATCH 16/20] moving vars around --- workflows.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflows.tf b/workflows.tf index 424307b..eca9444 100644 --- a/workflows.tf +++ b/workflows.tf @@ -66,6 +66,7 @@ resource "google_workflows_workflow" "copy_data" { description = "Copies data and performs project setup" service_account = google_service_account.workflows_sa.email source_contents = templatefile("${path.module}/src/yaml/copy-data.yaml", { + public_data_bucket = var.public_data_bucket, textocr_images_bucket = google_storage_bucket.textocr_images_bucket.name, ga4_images_bucket = google_storage_bucket.ga4_images_bucket.name, tables_bucket = google_storage_bucket.tables_bucket.name, @@ -97,8 +98,7 @@ resource "google_workflows_workflow" "project_setup" { dataproc_service_account = google_service_account.dataproc_service_account.email, provisioner_bucket = google_storage_bucket.provisioning_bucket.name, warehouse_bucket = google_storage_bucket.warehouse_bucket.name, - temp_bucket = google_storage_bucket.warehouse_bucket.name, - public_data_bucket = var.public_data_bucket + temp_bucket = google_storage_bucket.warehouse_bucket.name }) depends_on = [ @@ -169,4 +169,4 @@ resource "time_sleep" "wait_after_all_workflows" { depends_on = [ data.http.call_workflows_project_setup, ] -} +} \ No newline at end of file From 1909f2cb2321aba48c5ba6769bd838287a1d35b1 Mon Sep 17 00:00:00 2001 From: bradmiro Date: Wed, 27 Sep 2023 18:08:50 -0400 Subject: [PATCH 17/20] lint fix --- workflows.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows.tf b/workflows.tf index eca9444..f56f0eb 100644 --- a/workflows.tf +++ b/workflows.tf @@ -169,4 +169,4 @@ resource "time_sleep" "wait_after_all_workflows" { depends_on = [ data.http.call_workflows_project_setup, ] -} \ No newline at end of file +} From cbc5ee1441912713ad5ebafb4783fa9345873cb7 Mon Sep 17 00:00:00 2001 From: bradmiro Date: Wed, 27 Sep 2023 18:16:14 -0400 Subject: [PATCH 18/20] adding docs updates --- README.md | 2 +- metadata.yaml | 115 ++++++++++++++++++++++++-------------------------- 2 files changed, 57 insertions(+), 60 deletions(-) diff --git a/README.md b/README.md index 8168ee6..22d23d3 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ Functional examples are included in the | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | enable\_apis | Whether or not to enable underlying apis in this solution. . | `string` | `true` | no | -| force\_destroy | Whether or not to protect BigQuery resources from deletion when solution is modified or changed. | `string` | `false` | no | +| force\_destroy | Whether or not to protect GCS resources from deletion when solution is modified or changed. | `string` | `false` | no | | labels | A map of labels to apply to contained resources. | `map(string)` |
{
"analytics-lakehouse": true
}
| no | | project\_id | Google Cloud Project ID | `string` | n/a | yes | | public\_data\_bucket | Public Data bucket for access | `string` | `"data-analytics-demos"` | no | diff --git a/metadata.yaml b/metadata.yaml index fc57108..4b7f003 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -24,74 +24,71 @@ spec: source: repo: https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse.git sourceType: git - version: 0.1.0 + version: 0.3.0 actuationTool: flavor: Terraform - version: '>= 0.13' + version: ">= 0.13" description: {} content: documentation: - - title: Create an Analytics Lakehouse - url: https://cloud.google.com/architecture/big-data-analytics/analytics-lakehouse + - title: Create an Analytics Lakehouse + url: https://cloud.google.com/architecture/big-data-analytics/analytics-lakehouse examples: - - name: analytics_lakehouse - location: examples/analytics_lakehouse + - name: analytics_lakehouse + location: examples/analytics_lakehouse interfaces: variables: - - name: deletion_protection - description: Whether or not to protect GCS resources from deletion when solution is modified or changed. - varType: string - defaultValue: true - - name: enable_apis - description: Whether or not to enable underlying apis in this solution. . - varType: string - defaultValue: true - - name: force_destroy - description: Whether or not to protect BigQuery resources from deletion when solution is modified or changed. - varType: string - defaultValue: false - - name: labels - description: A map of labels to apply to contained resources. - varType: map(string) - defaultValue: - analytics-lakehouse: true - - name: project_id - description: Google Cloud Project ID - varType: string - required: true - - name: public_data_bucket - description: Public Data bucket for access - varType: string - defaultValue: data-analytics-demos - - name: region - description: Google Cloud Region - varType: string - defaultValue: us-central1 - - name: use_case_short - description: Short name for use case - varType: string - defaultValue: lakehouse + - name: enable_apis + description: Whether or not to enable underlying apis in this solution. . + varType: string + defaultValue: true + - name: force_destroy + description: Whether or not to protect GCS resources from deletion when solution is modified or changed. + varType: string + defaultValue: false + - name: labels + description: A map of labels to apply to contained resources. + varType: map(string) + defaultValue: + analytics-lakehouse: true + - name: project_id + description: Google Cloud Project ID + varType: string + defaultValue: null + required: true + - name: public_data_bucket + description: Public Data bucket for access + varType: string + defaultValue: data-analytics-demos + - name: region + description: Google Cloud Region + varType: string + defaultValue: us-central1 + - name: use_case_short + description: Short name for use case + varType: string + defaultValue: lakehouse outputs: - - name: bigquery_editor_url - description: The URL to launch the BigQuery editor - - name: lakehouse_colab_url - description: The URL to launch the in-console tutorial for the Analytics Lakehouse solution - - name: lookerstudio_report_url - description: The URL to create a new Looker Studio report displays a sample dashboard for data analysis - - name: neos_tutorial_url - description: The URL to launch the in-console tutorial for the Analytics Lakehouse solution - - name: workflow_return_project_setup - description: Output of the project setup workflow + - name: bigquery_editor_url + description: The URL to launch the BigQuery editor + - name: lakehouse_colab_url + description: The URL to launch the in-console tutorial for the Analytics Lakehouse solution + - name: lookerstudio_report_url + description: The URL to create a new Looker Studio report displays a sample dashboard for data analysis + - name: neos_tutorial_url + description: The URL to launch the in-console tutorial for the Analytics Lakehouse solution + - name: workflow_return_project_setup + description: Output of the project setup workflow requirements: roles: - - level: Project - roles: - - roles/owner + - level: Project + roles: + - roles/owner services: - - cloudkms.googleapis.com - - cloudresourcemanager.googleapis.com - - bigquery.googleapis.com - - bigquerystorage.googleapis.com - - bigqueryconnection.googleapis.com - - serviceusage.googleapis.com - - iam.googleapis.com + - cloudkms.googleapis.com + - cloudresourcemanager.googleapis.com + - bigquery.googleapis.com + - bigquerystorage.googleapis.com + - bigqueryconnection.googleapis.com + - serviceusage.googleapis.com + - iam.googleapis.com From f8771b7ecf19520e9e5763f546b662be40b997fe Mon Sep 17 00:00:00 2001 From: bradmiro Date: Wed, 27 Sep 2023 22:26:01 -0400 Subject: [PATCH 19/20] remove stage_ references --- src/bigquery.py | 14 +++----------- src/sql/view_ecommerce.sql | 10 +++++----- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/src/bigquery.py b/src/bigquery.py index 11b62d4..1069923 100644 --- a/src/bigquery.py +++ b/src/bigquery.py @@ -14,8 +14,6 @@ # limitations under the License. """BigQuery I/O with BigLake Iceberg PySpark example.""" -from py4j.protocol import Py4JJavaError - from pyspark.sql import SparkSession import os @@ -46,15 +44,9 @@ # Load data from BigQuery. -try: - events = spark.read.format("bigquery") \ - .option("table", "gcp_primary_staging.thelook_ecommerce_events") \ - .load() -except Py4JJavaError: - events = spark.read.format("bigquery") \ - .option("table", - "gcp_primary_staging.stage_thelook_ecommerce_events") \ - .load() +events = spark.read.format("bigquery") \ + .option("table", "gcp_primary_staging.thelook_ecommerce_events") \ + .load() events.createOrReplaceTempView("events") # Create Iceberg Table if not exists diff --git a/src/sql/view_ecommerce.sql b/src/sql/view_ecommerce.sql index a83dfba..fa104fb 100644 --- a/src/sql/view_ecommerce.sql +++ b/src/sql/view_ecommerce.sql @@ -51,21 +51,21 @@ SELECT u.longitude user_long, u.traffic_source user_traffic_source FROM - gcp_primary_staging.stage_thelook_ecommerce_orders o + gcp_primary_staging.thelook_ecommerce_orders o INNER JOIN - gcp_primary_staging.stage_thelook_ecommerce_order_items i + gcp_primary_staging.thelook_ecommerce_order_items i ON o.order_id = i.order_id INNER JOIN - `gcp_primary_staging.stage_thelook_ecommerce_products` p + `gcp_primary_staging.thelook_ecommerce_products` p ON i.product_id = p.id INNER JOIN - `gcp_primary_staging.stage_thelook_ecommerce_distribution_centers` d + `gcp_primary_staging.thelook_ecommerce_distribution_centers` d ON p.distribution_center_id = d.id INNER JOIN - `gcp_primary_staging.stage_thelook_ecommerce_users` u + `gcp_primary_staging.thelook_ecommerce_users` u ON o.user_id = u.id ; From b893a5828128b987d5a72fd989ea1cd2e9f3ec58 Mon Sep 17 00:00:00 2001 From: bradmiro Date: Wed, 4 Oct 2023 12:25:50 -0400 Subject: [PATCH 20/20] added comment to dataplex bucket --- main.tf | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/main.tf b/main.tf index 01d2d81..f703699 100644 --- a/main.tf +++ b/main.tf @@ -144,6 +144,15 @@ resource "google_storage_bucket" "tables_bucket" { force_destroy = var.force_destroy } +# Bucket used to store BI data in Dataplex +resource "google_storage_bucket" "dataplex_bucket" { + name = "gcp-${var.use_case_short}-dataplex-${random_id.id.hex}" + project = module.project-services.project_id + location = var.region + uniform_bucket_level_access = true + force_destroy = var.force_destroy +} + resource "google_storage_bucket_object" "pyspark_file" { bucket = google_storage_bucket.provisioning_bucket.name name = "bigquery.py" @@ -153,11 +162,3 @@ resource "google_storage_bucket_object" "pyspark_file" { google_storage_bucket.provisioning_bucket ] } - -resource "google_storage_bucket" "dataplex_bucket" { - name = "gcp-${var.use_case_short}-dataplex-${random_id.id.hex}" - project = module.project-services.project_id - location = var.region - uniform_bucket_level_access = true - force_destroy = var.force_destroy -}