Gitlab->Github phase 2 - extract_load (pre-merge)
jaceksan committed Jan 3, 2024
1 parent 57844d8 commit 2bdfd5c
Showing 10 changed files with 271 additions and 88 deletions.
66 changes: 0 additions & 66 deletions .github/variables/elta.env

This file was deleted.

6 changes: 6 additions & 0 deletions .github/variables/elta_dev.env
@@ -0,0 +1,6 @@
ELT_ENVIRONMENT="gdp_dev"
DB_NAME="GDP_DEV"
GOODDATA_ENVIRONMENT_ID="development"
DB_USER="gdp_dev"
DB_WAREHOUSE="${SNOWFLAKE_WAREHOUSE}"
DB_ACCOUNT="${SNOWFLAKE_ACCOUNT}"
16 changes: 16 additions & 0 deletions .github/variables/elta_shared.env
@@ -0,0 +1,16 @@
DBT_PROFILES_DIR="profile"
MELTANO_TARGET="target-snowflake"
DBT_TARGET="snowflake"
GOODDATA_PROFILES="demo_cicd"
# TODO - uncomment once the GitHub pipeline is fully ready
# GOODDATA_PROFILES="demo_cicd closed_beta closed_beta_se labs_se"
# Snowflake objects are upper-case by default. We use Snowflake in most jobs.
GOODDATA_UPPER_CASE="--gooddata-upper-case"
SNOWFLAKE_ACCOUNT="gooddata"
SNOWFLAKE_WAREHOUSE="DEMO_WH"
VERTICA_HOST="140.236.88.151"
VERTICA_PORT="5433"
VERTICA_USER="gooddata"
VERTICA_DBNAME="PartPub80DB"
OUTPUT_SCHEMA="cicd_output_stage"
MELTANO_STATE_AWS_BUCKET="jacek-blueprint-data-pipeline"
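
The shared file above is combined with an environment-specific file by the setvars composite action referenced in the workflows below, and values such as DB_WAREHOUSE="${SNOWFLAKE_WAREHOUSE}" suggest the loader expands ${VAR} references while reading the files. The action itself is not part of this diff; a minimal sketch of what it could look like, assuming it simply exports each KEY="value" pair to $GITHUB_ENV in file order (so shared values load first and can be referenced or overridden by the environment-specific file):

# .github/actions/setvars/action.yml - hypothetical sketch, not part of this commit
name: Set environment variables from var files
inputs:
  varFilePath:
    description: Space-separated list of var files; later files override earlier ones
    required: true
runs:
  using: composite
  steps:
    - shell: bash
      run: |
        for var_file in ${{ inputs.varFilePath }}; do
          while IFS= read -r line; do
            # skip blank lines and comments
            [[ -z "$line" || "$line" == \#* ]] && continue
            key="${line%%=*}"
            raw="${line#*=}"; raw="${raw%\"}"; raw="${raw#\"}"
            # expand ${VAR} references against values loaded so far
            value=$(eval echo "\"$raw\"")
            export "$key=$value"
            echo "$key=$value" >> "$GITHUB_ENV"
          done < "$var_file"
        done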
58 changes: 58 additions & 0 deletions .github/workflows/elta_dev.yml
@@ -0,0 +1,58 @@
name: Extract, Load, Transform, and Analytics (Dev)
# TODO - add analytics

on:
pull_request:
branches:
- main
paths:
# TODO - we cannot define paths for extract_load and transform separately,
# because the INPUT_SCHEMA_* variables need to be defined only once;
# they must be inputs because env vars are not evaluated in the matrix (used in the reusable workflow)
- .github/workflows/elta_dev.yml
# Extract load
- data_pipeline/meltano.yml
- data_pipeline/meltano_conf/**/*
- data_pipeline/requirements-meltano.txt
- .github/workflows/reusable_extract_load.yml
- .github/variables/elta_dev.env
- .github/variables/elta_shared.env
# Transform
- data_pipeline/macros/**/*
- data_pipeline/models/**/*
- data_pipeline/profile/**/*
- data_pipeline/dbt_project.yml
- data_pipeline/packages.yml
- data_pipeline/requirements-dbt.txt
- data_pipeline/requirements-gooddata.txt
- .github/workflows/reusable_transform.yml

env:
INPUT_SCHEMA_FAA: "faa_input_stage"
INPUT_SCHEMA_GITHUB: "github_input_stage"
INPUT_SCHEMA_EXCHANGERATEHOST: "exchangeratehost_input_stage"
INPUT_SCHEMA_ECOMMERCE_DEMO: "ecommerce_demo_input_stage"
INPUT_SCHEMA_DATA_SCIENCE: "data_science_input_stage"

jobs:
extract-load-dev:
uses: ./.github/workflows/reusable_extract_load.yml
with:
ENV_FILE: ./.github/variables/elta_dev.env
INPUT_SCHEMA_FAA: ${{ env.INPUT_SCHEMA_FAA }}
INPUT_SCHEMA_GITHUB: ${{ env.INPUT_SCHEMA_GITHUB }}
INPUT_SCHEMA_EXCHANGERATEHOST: ${{ env.INPUT_SCHEMA_EXCHANGERATEHOST }}
INPUT_SCHEMA_ECOMMERCE_DEMO: ${{ env.INPUT_SCHEMA_ECOMMERCE_DEMO }}
INPUT_SCHEMA_DATA_SCIENCE: ${{ env.INPUT_SCHEMA_DATA_SCIENCE }}
secrets: inherit
transform-dev:
needs: extract-load-dev
uses: ./.github/workflows/reusable_transform.yml
with:
ENV_FILE: ./.github/variables/elta_dev.env
INPUT_SCHEMA_FAA: ${{ env.INPUT_SCHEMA_FAA }}
INPUT_SCHEMA_GITHUB: ${{ env.INPUT_SCHEMA_GITHUB }}
INPUT_SCHEMA_EXCHANGERATEHOST: ${{ env.INPUT_SCHEMA_EXCHANGERATEHOST }}
INPUT_SCHEMA_ECOMMERCE_DEMO: ${{ env.INPUT_SCHEMA_ECOMMERCE_DEMO }}
INPUT_SCHEMA_DATA_SCIENCE: ${{ env.INPUT_SCHEMA_DATA_SCIENCE }}
secrets: inherit
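
The TODO at the top of this file concerns a GitHub Actions limitation: the env context is not available in strategy.matrix, and it is likewise unavailable in jobs.<job_id>.with, where only contexts such as github, needs, vars, and inputs resolve. If the ${{ env.INPUT_SCHEMA_* }} references in the with: blocks above evaluate to empty strings, one workaround that still defines each value exactly once is a small setup job that republishes the values as outputs. A hedged sketch (job, step, and output names are illustrative, and only one schema is shown):

# sketch of a workaround, not part of this commit
jobs:
  vars:
    runs-on: ubuntu-latest
    outputs:
      input_schema_faa: ${{ steps.set.outputs.input_schema_faa }}
    steps:
      - id: set
        run: echo "input_schema_faa=faa_input_stage" >> "$GITHUB_OUTPUT"
  extract-load-dev:
    needs: vars
    uses: ./.github/workflows/reusable_extract_load.yml
    with:
      # the needs context IS available in jobs.<job_id>.with
      INPUT_SCHEMA_FAA: ${{ needs.vars.outputs.input_schema_faa }}
    secrets: inherit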
73 changes: 73 additions & 0 deletions .github/workflows/reusable_extract_load.yml
@@ -0,0 +1,73 @@
on:
workflow_call:
inputs:
ENV_FILE:
required: true
type: string
INPUT_SCHEMA_FAA:
required: true
type: string
INPUT_SCHEMA_GITHUB:
required: true
type: string
INPUT_SCHEMA_EXCHANGERATEHOST:
required: true
type: string
INPUT_SCHEMA_ECOMMERCE_DEMO:
required: true
type: string
INPUT_SCHEMA_DATA_SCIENCE:
required: true
type: string

jobs:
reusable_extract_load:
strategy:
matrix:
MELTANO_SOURCE: [tap-github-repo, tap-github-org, tap-s3-csv-faa, tap-s3-csv-ecommerce-demo, tap-s3-csv-data-science]
include:
- MELTANO_SOURCE: tap-github-repo
TARGET_SCHEMA: "${{ inputs.INPUT_SCHEMA_GITHUB }}"
- MELTANO_SOURCE: tap-github-org
TARGET_SCHEMA: "${{ inputs.INPUT_SCHEMA_GITHUB }}"
- MELTANO_SOURCE: tap-s3-csv-faa
TARGET_SCHEMA: "${{ inputs.INPUT_SCHEMA_FAA }}"
- MELTANO_SOURCE: tap-s3-csv-ecommerce-demo
TARGET_SCHEMA: "${{ inputs.INPUT_SCHEMA_ECOMMERCE_DEMO }}"
- MELTANO_SOURCE: tap-s3-csv-data-science
TARGET_SCHEMA: "${{ inputs.INPUT_SCHEMA_DATA_SCIENCE }}"
# TODO - uncomment once https://github.com/anelendata/tap-exchangeratehost/issues/3 is fixed
# - MELTANO_SOURCE: tap-exchangeratehost
# TARGET_SCHEMA: "${{ inputs.INPUT_SCHEMA_EXCHANGERATEHOST }}"
env:
GIT_DEPTH: "0"
runs-on: ubuntu-latest
environment: ${{ github.ref == 'refs/heads/prod' && 'prod' || github.ref == 'refs/heads/master' && 'staging' || 'dev' }}
steps:
- name: Checkout Repository
uses: actions/checkout@v4

- name: Set Environment Variables
uses: ./.github/actions/setvars
with:
varFilePath: "./.github/variables/elta_shared.env ${{ inputs.ENV_FILE }}"

- name: Setup Environment
run: |
ln -s ${{ env.IMAGES_WORKDIR }}/.meltano .meltano
- name: Run Extract and Load
timeout-minutes: 15
env:
FR_ARG: ${{ "${{ env.FULL_REFRESH }}" == "true" && "--full-refresh" || "" }}
TARGET_SCHEMA: "${{ matrix.TARGET_SCHEMA }}"
DB_PASS: "${{ secrets.DB_PASS }}"
MELTANO_STATE_AWS_ACCESS_KEY_ID: "${{ secrets.MELTANO_STATE_AWS_ACCESS_KEY_ID }}"
MELTANO_STATE_AWS_SECRET_ACCESS_KEY: "${{ secrets.MELTANO_STATE_AWS_SECRET_ACCESS_KEY }}"
TAP_GITHUB_AUTH_TOKEN: "${{ secrets.TAP_GITHUB_AUTH_TOKEN }}"
# TODO - move it to deploy_analytics workflow
# GOODDATA_PROFILES_FILE: "${{ secrets.GOODDATA_PROFILES_FILE }}"
run: |
# TODO - move it to deploy_analytics workflow
# echo $GOODDATA_PROFILES_FILE | base64 --decode > ~/.gooddata/profiles.yaml
meltano --environment ${{ env.ELT_ENVIRONMENT }} run ${{ matrix.MELTANO_SOURCE }} ${{ env.MELTANO_TARGET }} ${{ env.FR_ARG }}
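
The MELTANO_STATE_AWS_* variables suggest that incremental-replication state is persisted in Meltano's S3 state backend, so that runs on fresh CI runners resume where the previous run ended instead of re-extracting everything. The meltano.yml configuring this is not part of this diff; assuming the standard Meltano state-backend settings, the relevant fragment could look like:

# data_pipeline/meltano.yml (fragment) - a sketch, the actual file is not in this diff
state_backend:
  uri: "s3://${MELTANO_STATE_AWS_BUCKET}/state"
  s3:
    aws_access_key_id: "${MELTANO_STATE_AWS_ACCESS_KEY_ID}"
    aws_secret_access_key: "${MELTANO_STATE_AWS_SECRET_ACCESS_KEY}"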
83 changes: 83 additions & 0 deletions .github/workflows/reusable_transform.yml
@@ -0,0 +1,83 @@
on:
workflow_call:
inputs:
ENV_FILE:
required: true
type: string
INPUT_SCHEMA_FAA:
required: true
type: string
INPUT_SCHEMA_GITHUB:
required: true
type: string
INPUT_SCHEMA_EXCHANGERATEHOST:
required: true
type: string
INPUT_SCHEMA_ECOMMERCE_DEMO:
required: true
type: string
INPUT_SCHEMA_DATA_SCIENCE:
required: true
type: string

jobs:
reusable_transform:
env:
GIT_DEPTH: "0"
runs-on: ubuntu-latest
environment: ${{ github.ref == 'refs/heads/prod' && 'prod' || github.ref == 'refs/heads/master' && 'staging' || 'dev' }}
steps:
- name: Checkout Repository
uses: actions/checkout@v4

- name: Set Environment Variables
uses: ./.github/actions/setvars
with:
varFilePath: "./.github/variables/elta_shared.env ${{ inputs.ENV_FILE }}"

- name: Setup Environment
env:
GOODDATA_PROFILES_FILE: "${{ secrets.GOODDATA_PROFILES_FILE }}"
run: |
mkdir -p ~/.gooddata
# the secret holds the base64-encoded profiles file; decode it into place
echo $GOODDATA_PROFILES_FILE | base64 --decode > ~/.gooddata/profiles.yaml
cd $SRC_DATA_PIPELINE
# dbt packages are installed during the docker image build into the workdir
ln -s ${IMAGES_WORKDIR}/dbt_packages dbt_packages
- name: Run Transform
timeout-minutes: 15
env:
FR_ARG: ${{ "${{ env.FULL_REFRESH }}" == "true" && "--full-refresh" || "" }}
# dbt Cloud insists that env variables contain the DBT_ prefix, so we have to duplicate them here.
# The dbt profiles.yml file in this repo relies on the DBT_ prefix,
# which means that even jobs not running against dbt Cloud rely on it.
# More variables are duplicated later in this file based on which database is used.
DBT_OUTPUT_SCHEMA: "${{ env.OUTPUT_SCHEMA }}"
DBT_INPUT_SCHEMA_GITHUB: "${{ inputs.INPUT_SCHEMA_GITHUB }}"
DBT_INPUT_SCHEMA_FAA: "${{ inputs.INPUT_SCHEMA_FAA }}"
DBT_INPUT_SCHEMA_EXCHANGERATEHOST: "${{ inputs.INPUT_SCHEMA_EXCHANGERATEHOST }}"
DBT_INPUT_SCHEMA_ECOMMERCE_DEMO: "${{ inputs.INPUT_SCHEMA_ECOMMERCE_DEMO }}"
DBT_INPUT_SCHEMA_DATA_SCIENCE: "${{ inputs.INPUT_SCHEMA_DATA_SCIENCE }}"
DBT_DB_USER: "${{ env.DB_USER }}"
DBT_DB_WAREHOUSE: "${{ env.DB_WAREHOUSE }}"
DBT_DB_ACCOUNT: "${{ env.DB_ACCOUNT }}"
DBT_DB_HOST: "${{ env.DB_HOST }}"
DBT_DB_PORT: "${{ env.DB_PORT }}"
DBT_DB_NAME: "${{ env.DB_NAME }}"
DBT_DB_PASS: "${{ secrets.DB_PASS }}"
# TODO - move this to a separate job dedicated to dbt Cloud
# Notify by commenting on the merge request
# if the duration of a dbt model exceeds the average duration of the last X runs by DBT_ALLOWED_DEGRADATION percent
DBT_ALLOWED_DEGRADATION: 20
DBT_INCREMENTAL_STRATEGY: "merge"
run: |
dbt run --profiles-dir $DBT_PROFILES_DIR --profile $ELT_ENVIRONMENT --target $DBT_TARGET $FR_ARG
dbt test --profiles-dir $DBT_PROFILES_DIR --profile $ELT_ENVIRONMENT --target $DBT_TARGET
gooddata-dbt provision_workspaces
gooddata-dbt register_data_sources $GOODDATA_UPPER_CASE --profile $ELT_ENVIRONMENT --target $DBT_TARGET
gooddata-dbt deploy_ldm $GOODDATA_UPPER_CASE --profile $ELT_ENVIRONMENT --target $DBT_TARGET
# Invalidates GoodData caches
gooddata-dbt upload_notification --profile $ELT_ENVIRONMENT --target $DBT_TARGET
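
The comment in the Run Transform step explains why every connection variable is duplicated with a DBT_ prefix: the dbt profiles file under profile/ (DBT_PROFILES_DIR) reads them via env_var(), and the profile is selected with --profile $ELT_ENVIRONMENT (e.g. gdp_dev). That file is not part of this diff; a sketch of what its Snowflake target could look like, consistent with the variables above (the threads value is an assumption):

# profile/profiles.yml (fragment) - a sketch, the actual file is not in this diff
gdp_dev:                  # profile name matches ELT_ENVIRONMENT
  target: snowflake       # matches DBT_TARGET
  outputs:
    snowflake:
      type: snowflake
      account: "{{ env_var('DBT_DB_ACCOUNT') }}"
      user: "{{ env_var('DBT_DB_USER') }}"
      password: "{{ env_var('DBT_DB_PASS') }}"
      warehouse: "{{ env_var('DBT_DB_WAREHOUSE') }}"
      database: "{{ env_var('DBT_DB_NAME') }}"
      schema: "{{ env_var('DBT_OUTPUT_SCHEMA') }}"
      threads: 4          # assumption, not visible in this diff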
21 changes: 11 additions & 10 deletions .gitlab-ci/env_vars/elta.yml
@@ -7,7 +7,6 @@
 # Snowflake objects are upper-case by default. We use Snowflake in most jobs.
 GOODDATA_UPPER_CASE: "--gooddata-upper-case"
 SNOWFLAKE_ACCOUNT: "gooddata"
-SNOWFLAKE_USER: "cicd"
 SNOWFLAKE_WAREHOUSE: "DEMO_WH"
 VERTICA_HOST: "140.236.88.151"
 VERTICA_PORT: "5433"
@@ -24,31 +23,33 @@
 # Environment-specific vars. Defined here so we can reuse them in extract/load and transform phases
 ############################################################################################################
 # DEV
-DEV_ELT_ENVIRONMENT: "cicd_dev"
-DEV_SNOWFLAKE_DBNAME: 'CICD_DEV'
+DEV_ELT_ENVIRONMENT: "gdp_dev"
+DEV_SNOWFLAKE_DBNAME: 'GDP_DEV'
 DEV_GOODDATA_ENVIRONMENT_ID: "development"
 # STAGING
-STAGING_ELT_ENVIRONMENT: "cicd_staging"
-STAGING_SNOWFLAKE_DBNAME: "CICD_STAGING"
+STAGING_ELT_ENVIRONMENT: "gdp_staging"
+STAGING_SNOWFLAKE_DBNAME: "GDP_STAGING"
+STAGING_SNOWFLAKE_USER: "gdp_staging"
 STAGING_GOODDATA_ENVIRONMENT_ID: "staging"
 STAGING_GOODDATA_ENVIRONMENT_ID_VERTICA: "staging_vertica"
 # PROD
-PROD_ELT_ENVIRONMENT: "cicd_prod"
-PROD_SNOWFLAKE_DBNAME: 'CICD_PROD'
+PROD_ELT_ENVIRONMENT: "gdp_prod"
+PROD_SNOWFLAKE_DBNAME: 'GDP_PROD'
+PROD_SNOWFLAKE_USER: "gdp_prod"
 PROD_GOODDATA_ENVIRONMENT_ID: "production"
 ############################################################
 # For version running against cloud service, e.g. dbt cloud
 # DEV
-CLOUD_DEV_SNOWFLAKE_DBNAME: 'CICD_CLOUD_DEV'
+CLOUD_DEV_SNOWFLAKE_DBNAME: 'GDP_CLOUD_DEV'
 CLOUD_DEV_GOODDATA_ENVIRONMENT_ID: "cloud_development"
 CLOUD_DEV_DBT_JOB_ID: 406899
 # STAGING
-CLOUD_STAGING_SNOWFLAKE_DBNAME: "CICD_CLOUD_STAGING"
+CLOUD_STAGING_SNOWFLAKE_DBNAME: "GDP_CLOUD_STAGING"
 CLOUD_STAGING_GOODDATA_ENVIRONMENT_ID: "cloud_staging"
 CLOUD_STAGING_GOODDATA_ENVIRONMENT_ID_VERTICA: "cloud_staging_vertica"
 CLOUD_STAGING_DBT_JOB_ID: 408385
 # PROD
-CLOUD_PROD_SNOWFLAKE_DBNAME: 'CICD_CLOUD_PROD'
+CLOUD_PROD_SNOWFLAKE_DBNAME: 'GDP_CLOUD_PROD'
 CLOUD_PROD_GOODDATA_ENVIRONMENT_ID: "cloud_production"
 CLOUD_PROD_DBT_JOB_ID: 408386

