Merge pull request #190 from dup05/pipeline_performance
Automated Performance testing framework
Showing 10 changed files with 713 additions and 0 deletions.
@@ -0,0 +1,35 @@
name: "Cleanup resources"
description: "Cleanup BQ and Dataflow job"
inputs:
  job_id:
    description: "JobId"
    required: false
  project_id:
    description: "project_id"
    required: true
  dataset:
    description: "dataset"
    required: true
  input_gcs_bucket:
    description: "Bucket with run time created files"
    required: true
  job_type:
    description: "Batch/Streaming"
    required: true

runs:
  using: "composite"
  steps:
    - name: Cleanup BQ Tables
      shell: bash
      run: bq rm -r -f -d ${{inputs.project_id}}:${{inputs.dataset}}

    - name: Cleanup GCS files
      if: always() && inputs.job_type == 'streaming'
      shell: bash
      run: gcloud storage rm gs://${{inputs.input_gcs_bucket}}/*

    - name: Delete pub/sub notification config
      if: always() && inputs.job_type == 'streaming'
      shell: bash
      run: gsutil notification delete gs://${{inputs.input_gcs_bucket}}
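
For reference, this composite action is invoked by path from the performance-testing workflow later in this commit. A minimal call step might look like the sketch below; the literal values are illustrative (drawn from the defaults elsewhere in this PR), and job_id is omitted because it is the only optional input:

    - name: Cleanup test resources
      if: always()
      uses: ./.github/workflows/cleanup
      with:
        project_id: dlp-dataflow-load-test        # illustrative values
        dataset: dataset_1a2b3c4d_Batch1_csv      # hypothetical dataset name following the workflow's naming pattern
        input_gcs_bucket: input_dlp_load_test_2
        job_type: batch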
@@ -0,0 +1,71 @@
[
  {
    "name": "Streaming1_csv",
    "type": "streaming",
    "file_type": "CSV",
    "file_size": "100MB/min",
    "gcs_file_path": "gs://input_load_test_streaming_job/*.csv",
    "deid_template": "projects/dlp-dataflow-load-test/deidentifyTemplates/dlp-demo-deid-latest-1706594483019",
    "raw_file_pattern": "gs://input_dlp_load_test_2/largecsv100MB.csv"
  },
  {
    "name": "Streaming2_csv",
    "type": "streaming",
    "file_type": "CSV",
    "file_size": "500MB/min",
    "gcs_file_path": "gs://input_load_test_streaming_job/*.csv",
    "deid_template": "projects/dlp-dataflow-load-test/deidentifyTemplates/dlp-demo-deid-latest-1706594483019",
    "raw_file_pattern": "gs://input_dlp_load_test_2/largecsv500MB.csv"
  },
  {
    "name": "Batch1_csv",
    "type": "batch",
    "file_type": "CSV",
    "file_size": "500MB",
    "gcs_file_path": "gs://input_dlp_load_test_2/largecsv500MB.csv",
    "deid_template": "projects/dlp-dataflow-load-test/deidentifyTemplates/dlp-demo-deid-latest-1706594483019"
  },
  {
    "name": "Batch2_csv",
    "type": "batch",
    "file_type": "CSV",
    "file_size": "1GB",
    "gcs_file_path": "gs://input_dlp_load_test_2/largecsv1GB.csv",
    "deid_template": "projects/dlp-dataflow-load-test/deidentifyTemplates/dlp-demo-deid-latest-1706594483019"
  },
  {
    "name": "Batch3_csv",
    "type": "batch",
    "file_type": "CSV",
    "file_size": "2GB",
    "gcs_file_path": "gs://input_dlp_load_test_2/largecsv2GB.csv",
    "deid_template": "projects/dlp-dataflow-load-test/deidentifyTemplates/dlp-demo-deid-latest-1706594483019"
  },
  {
    "name": "Batch1_avro",
    "type": "batch",
    "file_type": "AVRO",
    "file_size": "500MB",
    "gcs_file_path": "gs://input_dlp_load_test_2/largeavro500MB.avro",
    "deid_template": "projects/dlp-dataflow-load-test/deidentifyTemplates/dlp-demo-deid-latest-1706594483019"
  },
  {
    "name": "Batch2_avro",
    "type": "batch",
    "file_type": "AVRO",
    "file_size": "750MB",
    "gcs_file_path": "gs://input_dlp_load_test_2/largeavro750MB.avro",
    "deid_template": "projects/dlp-dataflow-load-test/deidentifyTemplates/dlp-demo-deid-latest-1706594483019"
  },
  {
    "name": "Batch3_avro",
    "type": "batch",
    "file_type": "AVRO",
    "file_size": "1500MB",
    "gcs_file_path": "gs://input_dlp_load_test_2/largeavro1500MB.avro",
    "deid_template": "projects/dlp-dataflow-load-test/deidentifyTemplates/dlp-demo-deid-latest-1706594483019"
  }
]
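
This JSON is the default test matrix: the pre-processing job in the performance-testing workflow reads it with jq and expands it into a strategy matrix. A trimmed-down array in the same shape can also be pasted into the workflow's test_config_json dispatch input to run a subset of scenarios; for example, this one-entry list (copied from the batch defaults above) would exercise only the 500MB CSV test:

[
  {
    "name": "Batch1_csv",
    "type": "batch",
    "file_type": "CSV",
    "file_size": "500MB",
    "gcs_file_path": "gs://input_dlp_load_test_2/largecsv500MB.csv",
    "deid_template": "projects/dlp-dataflow-load-test/deidentifyTemplates/dlp-demo-deid-latest-1706594483019"
  }
]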
@@ -0,0 +1,48 @@
name: "Execute copy workflow"
description: "Copies files from raw bucket to specified input bucket"

inputs:
  raw_bucket:
    description: "GCS Raw bucket name"
    required: true
  raw_file_pattern:
    description: "File name pattern"
    required: true
  input_gcs_bucket:
    description: "GCS bucket name"
    required: true
  job_id:
    description: "Job ID"
    required: true
  workflow_name:
    description: "Workflow name"
    required: true
  region:
    description: "Region"
    required: true

runs:
  using: "composite"
  steps:
    - name: Execute the workflow
      shell: bash
      run: |
        raw_file_pattern=$(echo "${{inputs.raw_file_pattern}}" | awk -F "/" '{print $NF}')
        raw_bucket=$(echo "${{inputs.raw_file_pattern}}" | awk -F "/" '{print $3}')
        not_finished=true
        num_executions=1
        while [ $num_executions -le 10 ];
        do
          echo "Executing workflow: $num_executions"
          gcloud workflows run ${{inputs.workflow_name}} \
            --call-log-level=log-errors-only \
            --data="{\"input_bucket\": \"${{inputs.input_gcs_bucket}}\",\"raw_bucket\": \"$raw_bucket\",\"source_file\": \"$raw_file_pattern\"}"
          num_executions=$((num_executions+1))
          sleep 60s
        done
    - name: Drain the pipeline
      shell: bash
      run: |
        gcloud dataflow jobs drain ${{inputs.job_id}} --region ${{inputs.region}}
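
In the performance-testing workflow this action only runs for streaming scenarios, after the Dataflow job has been submitted. Note that the first step re-derives the bucket name from raw_file_pattern, so the raw_bucket input is effectively unused by the shell script. A sketch of the calling step, with illustrative literal values taken from the streaming defaults in this PR:

    - name: Execute copy files workflow
      if: always() && matrix.type == 'streaming'
      uses: ./.github/workflows/execute-copy-workflow
      with:
        raw_bucket: input_dlp_load_test_2              # illustrative values
        raw_file_pattern: gs://input_dlp_load_test_2/largecsv100MB.csv
        input_gcs_bucket: input_load_test_streaming_job
        job_id: ${{ steps.submit-dataflow-job.outputs.job_id }}
        workflow_name: generate_files_workflow
        region: us-central1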
@@ -0,0 +1,28 @@
name: "Fetch job metrics"
description: "Fetch the metrics of Dataflow job"
inputs:
  job_id:
    description: "JobId"
    required: true
  project_id:
    description: "gcp project id"
    required: true
  test_uuid:
    description: "test uuid"
    required: true
  test_name:
    description: "Test name"
    required: true
  test_details:
    description: "Test configuration details"
    required: true

runs:
  using: "composite"
  steps:
    - name: Execute script
      shell: bash
      run: python3 .github/workflows/scripts/fetchJobMetrics.py ${{inputs.project_id}} ${{inputs.job_id}} ${{inputs.test_uuid}} ${{inputs.test_name}} '${{inputs.test_details}}'
@@ -0,0 +1,144 @@
name: Performance testing

on:
  workflow_dispatch:
    inputs:
      test_config_json:
        description: test configs
        type: string

env:
  PROJECT_ID: "dlp-dataflow-load-test"
  REGION: "us-central1"
  INSPECT_TEMPLATE: "projects/dlp-dataflow-load-test/inspectTemplates/dlp-demo-inspect-latest-1706594483019"
  DEID_TEMPLATE: "projects/dlp-dataflow-load-test/deidentifyTemplates/dlp-demo-deid-latest-1706594483019"
  PUB_SUB_TOPIC: "projects/dlp-dataflow-load-test/topics/load_test_pub_sub_topic"
  CLOUD_WORKFLOW: "generate_files_workflow"

jobs:
  generate-uuid:
    runs-on: ubuntu-latest
    timeout-minutes: 5
    outputs:
      uuid: ${{ steps.gen-uuid.outputs.uuid }}
    steps:
      - name: Generate UUID for workflow
        id: gen-uuid
        run: |
          new_uuid=$(uuidgen)
          modified_uuid=$(echo "$new_uuid" | cut -c1-8 )
          echo "uuid=$modified_uuid" >> "$GITHUB_OUTPUT"

  pre-processing:
    needs: generate-uuid
    runs-on: [self-hosted, load-testing]
    timeout-minutes: 5
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - uses: actions/checkout@v2

      - name: Read test details
        id: set-matrix
        run: |
          matrix=$(jq -c . < .github/workflows/configs/tests_config.json)
          if [[ "${{github.event.inputs.test_config_json}}" != "" ]]; then
            matrix=$(echo '${{github.event.inputs.test_config_json}}' | jq .)
          fi
          echo "matrix={\"include\":$(echo $matrix)}" >> $GITHUB_OUTPUT
          echo $matrix

  run-test:
    needs:
      - generate-uuid
      - pre-processing
    runs-on: [self-hosted, load-testing]
    continue-on-error: true
    strategy:
      max-parallel: 1
      matrix: ${{ fromJSON(needs.pre-processing.outputs.matrix) }}
    steps:
      - name: Set job parameters
        id: set-job-params
        run: |
          uuid=${{needs.generate-uuid.outputs.uuid}}
          test_name=$(echo "${{matrix.name}}" | tr '_' '-')
          echo "job_name=load-test-${{needs.generate-uuid.outputs.uuid}}-test-$test_name" >> $GITHUB_OUTPUT
          echo "dataset=dataset_${{needs.generate-uuid.outputs.uuid}}_${{matrix.name}}" >> $GITHUB_OUTPUT
          echo "Test details: ${{matrix.name}}"
          echo "job_name=load-test-${{needs.generate-uuid.outputs.uuid}}-test-$test_name"
          echo "dataset=dataset_${{needs.generate-uuid.outputs.uuid}}_${{matrix.name}}"
          input_gcs_bucket=$(echo "${{ matrix.gcs_file_path }}" | awk -F "/" '{print $3}')
          echo "input_gcs_bucket=$input_gcs_bucket" >> $GITHUB_OUTPUT

      - name: Submit dataflow job
        id: submit-dataflow-job
        uses: ./.github/workflows/submit-dataflow-job
        with:
          project_id: ${{env.PROJECT_ID}}
          input_gcs_bucket: ${{ steps.set-job-params.outputs.input_gcs_bucket }}
          gcs_file_path: ${{ matrix.gcs_file_path }}
          dataset: ${{ steps.set-job-params.outputs.dataset }}
          inspect_template: ${{env.INSPECT_TEMPLATE}}
          deid_template: ${{ matrix.deid_template }}
          job_name: ${{steps.set-job-params.outputs.job_name}}
          job_type: ${{ matrix.type }}
          gcs_notification_topic: ${{env.PUB_SUB_TOPIC}}

      - name: Execute copy files workflow for streaming jobs
        id: copy-files
        if: always() && matrix.type == 'streaming'
        uses: ./.github/workflows/execute-copy-workflow
        with:
          raw_bucket: ${{ matrix.source_file_bucket }}
          raw_file_pattern: ${{ matrix.raw_file_pattern }}
          input_gcs_bucket: ${{ steps.set-job-params.outputs.input_gcs_bucket }}
          job_id: ${{steps.submit-dataflow-job.outputs.job_id}}
          workflow_name: ${{env.CLOUD_WORKFLOW}}
          region: ${{env.REGION}}

      - name: Poll till job finishes
        uses: ./.github/workflows/poll-job
        with:
          job_id: ${{steps.submit-dataflow-job.outputs.job_id}}
          region: ${{env.REGION}}

      - name: Fetch metrics
        uses: ./.github/workflows/fetch-metrics
        with:
          job_id: ${{steps.submit-dataflow-job.outputs.job_id}}
          project_id: ${{env.PROJECT_ID}}
          test_uuid: ${{needs.generate-uuid.outputs.uuid}}
          test_name: ${{ matrix.name }}
          test_details: ${{ toJSON(matrix) }}

      - name: Cleanup
        if: always()
        uses: ./.github/workflows/cleanup
        with:
          project_id: ${{env.PROJECT_ID}}
          job_id: ${{steps.submit-dataflow-job.outputs.job_id}}
          dataset: ${{steps.set-job-params.outputs.dataset}}
          input_gcs_bucket: ${{ steps.set-job-params.outputs.input_gcs_bucket }}
          job_type: ${{ matrix.type }}

  publish-test-results:
    needs:
      - generate-uuid
      - pre-processing
      - run-test
    runs-on: [self-hosted, load-testing]
    steps:
      - uses: actions/checkout@v2

      - name: Execute publishMetrics script
        run: |
          python3 .github/workflows/scripts/publishTestReport.py ${{env.PROJECT_ID}} ${{ needs.generate-uuid.outputs.uuid }}
@@ -0,0 +1,39 @@
name: "Poll for job"
description: "Poll till job completes"
inputs:
  job_id:
    description: "JobId"
    required: true
  region:
    description: "Region"
    required: true

runs:
  using: "composite"
  steps:
    - name: Poll
      shell: bash
      run: |
        not_finished=true
        while $not_finished; do
          echo "Polling for job status"
          status=$(gcloud dataflow jobs show ${{inputs.job_id}} --region ${{inputs.region}} | grep "state:" | awk '{print $2}')
          echo "Job status: $status"
          if [[ "$status" == "Done" ]]; then
            echo "BATCH JOB PASSED";
            not_finished=false;
          elif [[ "$status" == "Drained" ]]; then
            echo "STREAMING JOB PASSED";
            not_finished=false;
          elif [[ "$status" == "Failed" ]]; then
            echo "JOB FAILED";
            not_finished=false;
          elif [[ "$status" == "Cancelled" ]]; then
            echo "JOB CANCELLED";
            not_finished=false;
          else
            sleep 60s
          fi
        done
        sleep 150s
        echo "Job with id ${{inputs.job_id}} $status"