Merge pull request #190 from dup05/pipeline_performance
Automated Performance testing framework
dup05 authored Apr 1, 2024
2 parents 1d32b05 + 4088505 commit 4213e27
Showing 10 changed files with 713 additions and 0 deletions.
35 changes: 35 additions & 0 deletions .github/workflows/cleanup/action.yml
@@ -0,0 +1,35 @@
name: "Cleanup resources"
description: "Cleanup BQ and Dataflow job"
inputs:
job_id:
description: "JobId"
required: false
project_id:
description: "project_id"
required: true
dataset:
description: "dataset"
required: true
input_gcs_bucket:
description: "Bucket with run time created files"
required: true
job_type:
description: "Batch/Streaming"
required: true

runs:
using: "composite"
steps:
- name: Cleanup BQ Tables
shell: bash
run: bq rm -r -f -d ${{inputs.project_id}}:${{inputs.dataset}}

- name: Cleanup GCS files
if: always() && inputs.job_type == 'streaming'
shell: bash
run: gcloud storage rm gs://${{inputs.input_gcs_bucket}}/*

- name: Delete pub/sub notification config
if: always() && inputs.job_type == 'streaming'
shell: bash
run: gsutil notification delete gs://${{inputs.input_gcs_bucket}}
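
The three cleanup steps map onto standard Google Cloud CLI calls: bq rm -r -f -d deletes the per-test BigQuery dataset recursively without prompting, gcloud storage rm clears the objects staged in the input bucket during a streaming run, and gsutil notification delete removes the Pub/Sub notification configuration attached to that bucket. A minimal sketch of the equivalent manual cleanup, using assumed placeholder values rather than the real per-run names:

# Hypothetical values; a real run derives these from the test UUID and matrix entry.
PROJECT_ID="dlp-dataflow-load-test"
DATASET="dataset_abcd1234_Batch1_csv"
INPUT_BUCKET="input_load_test_streaming_job"

# Delete the BigQuery dataset and every table in it (-r recursive, -f no prompt, -d dataset).
bq rm -r -f -d "${PROJECT_ID}:${DATASET}"

# Remove the files copied into the streaming input bucket during the run.
gcloud storage rm "gs://${INPUT_BUCKET}/*"

# Drop the GCS-to-Pub/Sub notification configuration created for streaming tests.
gsutil notification delete "gs://${INPUT_BUCKET}"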
71 changes: 71 additions & 0 deletions .github/workflows/configs/tests_config.json
@@ -0,0 +1,71 @@
[
  {
    "name": "Streaming1_csv",
    "type": "streaming",
    "file_type": "CSV",
    "file_size": "100MB/min",
    "gcs_file_path": "gs://input_load_test_streaming_job/*.csv",
    "deid_template": "projects/dlp-dataflow-load-test/deidentifyTemplates/dlp-demo-deid-latest-1706594483019",
    "raw_file_pattern": "gs://input_dlp_load_test_2/largecsv100MB.csv"
  },
  {
    "name": "Streaming2_csv",
    "type": "streaming",
    "file_type": "CSV",
    "file_size": "500MB/min",
    "gcs_file_path": "gs://input_load_test_streaming_job/*.csv",
    "deid_template": "projects/dlp-dataflow-load-test/deidentifyTemplates/dlp-demo-deid-latest-1706594483019",
    "raw_file_pattern": "gs://input_dlp_load_test_2/largecsv500MB.csv"
  },
  {
    "name": "Batch1_csv",
    "type": "batch",
    "file_type": "CSV",
    "file_size": "500MB",
    "gcs_file_path": "gs://input_dlp_load_test_2/largecsv500MB.csv",
    "deid_template": "projects/dlp-dataflow-load-test/deidentifyTemplates/dlp-demo-deid-latest-1706594483019"
  },
  {
    "name": "Batch2_csv",
    "type": "batch",
    "file_type": "CSV",
    "file_size": "1GB",
    "gcs_file_path": "gs://input_dlp_load_test_2/largecsv1GB.csv",
    "deid_template": "projects/dlp-dataflow-load-test/deidentifyTemplates/dlp-demo-deid-latest-1706594483019"
  },
  {
    "name": "Batch3_csv",
    "type": "batch",
    "file_type": "CSV",
    "file_size": "2GB",
    "gcs_file_path": "gs://input_dlp_load_test_2/largecsv2GB.csv",
    "deid_template": "projects/dlp-dataflow-load-test/deidentifyTemplates/dlp-demo-deid-latest-1706594483019"
  },
  {
    "name": "Batch1_avro",
    "type": "batch",
    "file_type": "AVRO",
    "file_size": "500MB",
    "gcs_file_path": "gs://input_dlp_load_test_2/largeavro500MB.avro",
    "deid_template": "projects/dlp-dataflow-load-test/deidentifyTemplates/dlp-demo-deid-latest-1706594483019"
  },
  {
    "name": "Batch2_avro",
    "type": "batch",
    "file_type": "AVRO",
    "file_size": "750MB",
    "gcs_file_path": "gs://input_dlp_load_test_2/largeavro750MB.avro",
    "deid_template": "projects/dlp-dataflow-load-test/deidentifyTemplates/dlp-demo-deid-latest-1706594483019"
  },
  {
    "name": "Batch3_avro",
    "type": "batch",
    "file_type": "AVRO",
    "file_size": "1500MB",
    "gcs_file_path": "gs://input_dlp_load_test_2/largeavro1500MB.avro",
    "deid_template": "projects/dlp-dataflow-load-test/deidentifyTemplates/dlp-demo-deid-latest-1706594483019"
  }
]
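
Each object in this array becomes one matrix entry for the run-test job in the performance-testing workflow further down in this diff; the pre-processing job reads the whole file with jq and wraps it in an "include" list. A small sketch, assuming jq is installed, of how the config can be previewed or filtered locally the same way the workflow reads it:

# Read the config exactly as the pre-processing step does (compact, single line).
jq -c . < .github/workflows/configs/tests_config.json

# Preview just the batch test cases with their input sizes and paths.
jq -r '.[] | select(.type == "batch") | "\(.name)\t\(.file_size)\t\(.gcs_file_path)"' \
  .github/workflows/configs/tests_config.json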
48 changes: 48 additions & 0 deletions .github/workflows/execute-copy-workflow/action.yml
@@ -0,0 +1,48 @@
name: "Execute copy workflow"
description: "Copies files from raw bucket to specified input bucket"

inputs:
raw_bucket:
description: "GCS Raw bucket name"
required: true
raw_file_pattern:
description: "File name pattern"
required: true
input_gcs_bucket:
description: "GCS bucket name"
required: true
job_id:
description: "Job ID"
required: true
workflow_name:
description: "Workflow name"
required: true
region:
description: "Region"
required: true


runs:
using: "composite"
steps:
- name: Execute the workflow
shell: bash
run: |
raw_file_pattern=$(echo "${{inputs.raw_file_pattern}}" | awk -F "/" '{print $NF}')
raw_bucket=$(echo "${{inputs.raw_file_pattern}}" | awk -F "/" '{print $3}')
not_finished=true
num_executions=1
while [ $num_executions -le 10 ];
do
echo "Executing workflow: $num_executions"
gcloud workflows run ${{inputs.workflow_name}} \
--call-log-level=log-errors-only \
--data="{\"input_bucket\": \"${{inputs.input_gcs_bucket}}\",\"raw_bucket\": \"$raw_bucket\",\"source_file\": \"$raw_file_pattern\"}"
num_executions=$((num_executions+1))
sleep 60s
done
- name: Drain the pipeline
shell: bash
run: |
gcloud dataflow jobs drain ${{inputs.job_id}} --region ${{inputs.region}}
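
The run block splits raw_file_pattern on "/" to recover both the source bucket (field 3 of a gs:// URL) and the bare file name (last field), then triggers the Cloud Workflow once a minute for ten iterations to keep feeding the streaming pipeline before draining the Dataflow job. A quick check of the awk field extraction, using one of the paths from tests_config.json:

# awk -F "/" splits "gs://input_dlp_load_test_2/largecsv100MB.csv" into
#   $1="gs:", $2="", $3="input_dlp_load_test_2", $NF="largecsv100MB.csv"
echo "gs://input_dlp_load_test_2/largecsv100MB.csv" | awk -F "/" '{print $3}'   # -> input_dlp_load_test_2
echo "gs://input_dlp_load_test_2/largecsv100MB.csv" | awk -F "/" '{print $NF}'  # -> largecsv100MB.csv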
28 changes: 28 additions & 0 deletions .github/workflows/fetch-metrics/action.yml
@@ -0,0 +1,28 @@
name: "Fetch job metrics"
description: "Fetch the metrics of Dataflow job"
inputs:
job_id:
description: "JobId"
required: true
project_id:
description: "gcp project id"
required: true
test_uuid:
description: "test uuid"
required: true
test_name:
description: "Test name"
required: true
test_details:
description: "Test configuration details"
required: true


runs:
using: "composite"
steps:
- name: Execute script
shell: bash
run: python3 .github/workflows/scripts/fetchJobMetrics.py ${{inputs.project_id}} ${{inputs.job_id}} ${{inputs.test_uuid}} ${{inputs.test_name}} '${{inputs.test_details}}'


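
The action delegates entirely to fetchJobMetrics.py (not shown in this diff), passing the project, job ID, test UUID, test name, and the serialized matrix entry as positional arguments. For a manual spot check of a job's service metrics from the CLI, one option is the Dataflow metrics command; this is only an assumed equivalent, not necessarily what the script itself queries:

# Hypothetical manual check with an example job ID; substitute a real one from a run.
JOB_ID="2024-04-01_00_00_00-1234567890123456789"
gcloud dataflow metrics list "${JOB_ID}" \
  --project=dlp-dataflow-load-test \
  --region=us-central1 \
  --source=service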
144 changes: 144 additions & 0 deletions .github/workflows/performance-testing-main.yml
@@ -0,0 +1,144 @@
name: Performance testing

on:
  workflow_dispatch:
    inputs:
      test_config_json:
        description: test configs
        type: string

env:
  PROJECT_ID: "dlp-dataflow-load-test"
  REGION: "us-central1"
  INSPECT_TEMPLATE: "projects/dlp-dataflow-load-test/inspectTemplates/dlp-demo-inspect-latest-1706594483019"
  DEID_TEMPLATE: "projects/dlp-dataflow-load-test/deidentifyTemplates/dlp-demo-deid-latest-1706594483019"
  PUB_SUB_TOPIC: "projects/dlp-dataflow-load-test/topics/load_test_pub_sub_topic"
  CLOUD_WORKFLOW: "generate_files_workflow"

jobs:
  generate-uuid:
    runs-on: ubuntu-latest
    timeout-minutes: 5
    outputs:
      uuid: ${{ steps.gen-uuid.outputs.uuid }}
    steps:
      - name: Generate UUID for workflow
        id: gen-uuid
        run: |
          new_uuid=$(uuidgen)
          modified_uuid=$(echo "$new_uuid" | cut -c1-8)
          echo "uuid=$modified_uuid" >> "$GITHUB_OUTPUT"

  pre-processing:
    needs: generate-uuid
    runs-on: [self-hosted, load-testing]
    timeout-minutes: 5
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - uses: actions/checkout@v2

      - name: Read test details
        id: set-matrix
        run: |
          matrix=$(jq -c . < .github/workflows/configs/tests_config.json)
          if [[ "${{github.event.inputs.test_config_json}}" != "" ]]; then
            matrix=$(echo '${{github.event.inputs.test_config_json}}' | jq .)
          fi
          echo "matrix={\"include\":$(echo $matrix)}" >> $GITHUB_OUTPUT
          echo $matrix

  run-test:
    needs:
      - generate-uuid
      - pre-processing
    runs-on: [self-hosted, load-testing]
    continue-on-error: true
    strategy:
      max-parallel: 1
      matrix: ${{ fromJSON(needs.pre-processing.outputs.matrix) }}
    steps:
      - name: Set job parameters
        id: set-job-params
        run: |
          uuid=${{needs.generate-uuid.outputs.uuid}}
          test_name=$(echo "${{matrix.name}}" | tr '_' '-')
          echo "job_name=load-test-${{needs.generate-uuid.outputs.uuid}}-test-$test_name" >> $GITHUB_OUTPUT
          echo "dataset=dataset_${{needs.generate-uuid.outputs.uuid}}_${{matrix.name}}" >> $GITHUB_OUTPUT
          echo "Test details: ${{matrix.name}}"
          echo "job_name=load-test-${{needs.generate-uuid.outputs.uuid}}-test-$test_name"
          echo "dataset=dataset_${{needs.generate-uuid.outputs.uuid}}_${{matrix.name}}"
          input_gcs_bucket=$(echo "${{ matrix.gcs_file_path }}" | awk -F "/" '{print $3}')
          echo "input_gcs_bucket=$input_gcs_bucket" >> $GITHUB_OUTPUT

      - name: Submit dataflow job
        id: submit-dataflow-job
        uses: ./.github/workflows/submit-dataflow-job
        with:
          project_id: ${{env.PROJECT_ID}}
          input_gcs_bucket: ${{ steps.set-job-params.outputs.input_gcs_bucket }}
          gcs_file_path: ${{ matrix.gcs_file_path }}
          dataset: ${{ steps.set-job-params.outputs.dataset }}
          inspect_template: ${{env.INSPECT_TEMPLATE}}
          deid_template: ${{ matrix.deid_template }}
          job_name: ${{steps.set-job-params.outputs.job_name}}
          job_type: ${{ matrix.type }}
          gcs_notification_topic: ${{env.PUB_SUB_TOPIC}}

      - name: execute copy files workflow for streaming jobs
        id: copy-files
        if: always() && matrix.type == 'streaming'
        uses: ./.github/workflows/execute-copy-workflow
        with:
          raw_bucket: ${{ matrix.source_file_bucket }}
          raw_file_pattern: ${{ matrix.raw_file_pattern }}
          input_gcs_bucket: ${{ steps.set-job-params.outputs.input_gcs_bucket }}
          job_id: ${{steps.submit-dataflow-job.outputs.job_id}}
          workflow_name: ${{env.CLOUD_WORKFLOW}}
          region: ${{env.REGION}}

      - name: Poll till job finishes
        uses: ./.github/workflows/poll-job
        with:
          job_id: ${{steps.submit-dataflow-job.outputs.job_id}}
          region: ${{env.REGION}}

      - name: Fetch metrics
        uses: ./.github/workflows/fetch-metrics
        with:
          job_id: ${{steps.submit-dataflow-job.outputs.job_id}}
          project_id: ${{env.PROJECT_ID}}
          test_uuid: ${{needs.generate-uuid.outputs.uuid}}
          test_name: ${{ matrix.name }}
          test_details: ${{ toJSON(matrix) }}

      - name: Cleanup
        if: always()
        uses: ./.github/workflows/cleanup
        with:
          project_id: ${{env.PROJECT_ID}}
          job_id: ${{steps.submit-dataflow-job.outputs.job_id}}
          dataset: ${{steps.set-job-params.outputs.dataset}}
          input_gcs_bucket: ${{ steps.set-job-params.outputs.input_gcs_bucket }}
          job_type: ${{ matrix.type }}

  publish-test-results:
    needs:
      - generate-uuid
      - pre-processing
      - run-test
    runs-on: [self-hosted, load-testing]
    steps:
      - uses: actions/checkout@v2

      - name: Execute publishMetrics Script
        run: |
          python3 .github/workflows/scripts/publishTestReport.py ${{env.PROJECT_ID}} ${{ needs.generate-uuid.outputs.uuid }}
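
The workflow only runs on manual dispatch; when the test_config_json input is supplied it replaces the checked-in tests_config.json, so a single ad-hoc test can be run without editing the repository. A sketch of such a dispatch, assuming the GitHub CLI is authenticated against this repository:

# Hypothetical one-off run with a single batch case; the JSON shape mirrors tests_config.json.
gh workflow run performance-testing-main.yml \
  -f test_config_json='[{"name":"Batch1_csv","type":"batch","file_type":"CSV","file_size":"500MB","gcs_file_path":"gs://input_dlp_load_test_2/largecsv500MB.csv","deid_template":"projects/dlp-dataflow-load-test/deidentifyTemplates/dlp-demo-deid-latest-1706594483019"}]'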
39 changes: 39 additions & 0 deletions .github/workflows/poll-job/action.yml
@@ -0,0 +1,39 @@
name: "Poll for job"
description: "Poll till job completes"
inputs:
job_id:
description: "JobId"
required: true
region:
description: "Region"
required: true

runs:
using: "composite"
steps:
- name: Poll
shell: bash
run: |
not_finished=true
while $not_finished; do
echo "Polling for job status"
status=$(gcloud dataflow jobs show ${{inputs.job_id}} --region ${{inputs.region}} | grep "state:" | awk '{print $2}')
echo "Job status: $status"
if [[ "$status" == "Done" ]]; then
echo "BATCH JOB PASSED";
not_finished=false;
elif [[ "$status" == "Drained" ]]; then
echo "STREAMING JOB PASSED";
not_finished=false;
elif [[ "$status" == "Failed" ]]; then
echo "JOB FAILED";
not_finished=false;
elif [[ "$status" == "Cancelled" ]]; then
echo "JOB CANCELLED";
not_finished=false;
else
sleep 60s
fi
done
sleep 150s
echo "Job with id ${{inputs.job_id}} $status"