# Skip to content

# Extract, transform and load #384

# Extract, transform and load

# Extract, transform and load #384

# Workflow file for this run

# Extract, transform and load workflow.
#
# Jobs, in dependency order:
#   start-runner  boots an EC2 instance and registers it as a self-hosted runner
#   etl           runs the Django management commands on that runner against a
#                 Postgres service container and uploads the outputs as artifacts
#   upload        downloads the raw/processed artifacts and publishes them to
#                 biglocalnews.org
#   stop-runner   always runs after etl to stop the EC2 instance
name: Extract, transform and load

on:
  workflow_dispatch:
  schedule:
    # 14:00 every Monday; GitHub evaluates schedule crons in UTC.
    - cron: "0 14 * * Mon"

jobs:
  start-runner:
    name: Start self-hosted EC2 runner
    runs-on: ubuntu-latest
    # Read by `etl` (runs-on label) and `stop-runner` (label + instance id).
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        # v1 is built on a Node.js runtime GitHub has retired; v4 accepts the
        # same static-credential inputs.
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}
      - name: Start EC2 runner
        id: start-ec2-runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ami-0134997ea21a525a5
          ec2-instance-type: t2.2xlarge
          subnet-id: subnet-0add5e99d1a0f11e1
          security-group-id: sg-055bdb11da346a139

  etl:
    name: Extract, transform and load
    needs: start-runner
    # Runs on the EC2 instance registered by `start-runner`.
    runs-on: ${{ needs.start-runner.outputs.label }}
    services:
      postgres:
        image: postgres
        env:
          POSTGRES_PASSWORD: postgres
          POSTGRES_DB: calaccess_website
          POSTGRES_USER: postgres
        # Hold the job's steps until the database answers pg_isready.
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
        ports:
          - "5432:5432"
    steps:
      - id: checkout
        name: Checkout the `django-calaccess-downloads-website` repository
        uses: actions/checkout@v4
      - id: setup-python
        name: Install Python 3.9
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'
          cache: 'pipenv'
      - id: install-pipenv
        name: Install pipenv
        # Install from PyPI with the interpreter set up above instead of piping
        # an unpinned script from a branch head into python.
        run: python -m pip install --upgrade pipenv
      - id: install-python-deps
        name: Install Python packages
        run: pipenv install
      - id: django-settings
        name: Configure Django settings
        run: cp project/settings_actions.py.template project/settings_local.py
        shell: bash
      - id: migrate
        name: Migrate database
        run: pipenv run python -W ignore manage.py migrate
        shell: bash
        env:
          PGPASSWORD: postgres
      - id: load-scraped
        name: Load scraped data
        run: pipenv run python -W ignore manage.py loadcalaccessscrapeddata --verbosity=3
        shell: bash
        env:
          PGPASSWORD: postgres
      - id: update
        name: Load and process CAL-ACCESS daily extract
        run: pipenv run python -W ignore manage.py updatedownloadswebsite --keep-files --verbosity=3
        shell: bash
        env:
          PGPASSWORD: postgres
      # The artifact actions are on v4 because v3 is deprecated and no longer
      # runs on github.com. Upload and download must share a major version:
      # download-artifact@v3 cannot read artifacts written by upload-artifact@v4.
      - id: upload-zip
        name: Upload CAL-ACCESS ZIP
        uses: actions/upload-artifact@v4
        with:
          name: source
          path: data/download
          if-no-files-found: error
      - id: upload-raw
        name: Upload raw CAL-ACCESS files
        uses: actions/upload-artifact@v4
        with:
          name: raw-data
          path: data/csv
          if-no-files-found: error
      - id: upload-processed
        name: Upload processed CAL-ACCESS files
        uses: actions/upload-artifact@v4
        with:
          name: processed-data
          path: data/processed
          if-no-files-found: error
      - id: upload-logs
        name: Upload logs
        # Logs matter most when an earlier step failed, so upload them unless
        # the run was cancelled.
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: logs
          path: data/log
          if-no-files-found: error
      - name: Datestamp
        run: date > timestamp.txt
        shell: bash
      - name: Commit results
        run: |
          git config --global user.name "github-actions[bot]"
          git config --global user.email "actions@github.com"
          git config pull.rebase false
          git status
          git pull origin "$GITHUB_REF"
          git add ./timestamp.txt
          # Best-effort: a failed commit or push does not fail the job, but it
          # is surfaced as a workflow warning instead of being discarded.
          if ! { git commit -m "Scrape" && git push; }; then
            echo "::warning::Could not commit or push timestamp.txt"
          fi
        shell: bash

  upload:
    name: Upload data to Big Local News
    needs:
      - start-runner
      - etl
    runs-on: ubuntu-latest
    steps:
      # Fetch the artifacts produced by `etl` into the same relative paths,
      # so ./data/ holds both csv/ and processed/ for publishing.
      - id: download-raw
        name: Download raw CAL-ACCESS files
        uses: actions/download-artifact@v4
        with:
          name: raw-data
          path: data/csv
      - id: download-processed
        name: Download processed CAL-ACCESS files
        uses: actions/download-artifact@v4
        with:
          name: processed-data
          path: data/processed
      - id: publish-to-bln
        name: Publish files to biglocalnews.org
        uses: biglocalnews/upload-files@v2
        with:
          api-key: ${{ secrets.BLN_API_TOKEN }}
          project-id: ${{ secrets.BLN_PROJECT_ID }}
          path: ./data/

  stop-runner:
    name: Stop self-hosted EC2 runner
    needs:
      - start-runner
      - etl
    runs-on: ubuntu-latest
    # Run even when `etl` fails or is cancelled so the instance is not left up.
    if: ${{ always() }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}
      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}