# Extract, transform and load #384
# Viewer notice: this file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters.
# Scheduled extract/transform/load workflow.
#
# Job graph:
#   start-runner -> etl -> upload       (publish artifacts to Big Local News)
#                      \-> stop-runner  (always runs, even if etl fails)
name: Extract, transform and load

on:
  # Allow manual runs from the Actions tab.
  workflow_dispatch:
  schedule:
    # Every Monday at 14:00. GitHub evaluates cron schedules in UTC.
    - cron: "0 14 * * Mon"

jobs:
  # Boots an EC2 instance and registers it as a self-hosted runner. The
  # runner label and instance id are exported for the `etl` and
  # `stop-runner` jobs.
  start-runner:
    name: Start self-hosted EC2 runner
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}
      - name: Start EC2 runner
        id: start-ec2-runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ami-0134997ea21a525a5
          ec2-instance-type: t2.2xlarge
          subnet-id: subnet-0add5e99d1a0f11e1
          security-group-id: sg-055bdb11da346a139

  # Runs the Django management commands on the EC2 runner against a
  # throwaway Postgres service container, then stores the outputs as
  # workflow artifacts.
  etl:
    name: Extract, transform and load
    needs: start-runner
    runs-on: ${{ needs.start-runner.outputs.label }}
    services:
      postgres:
        image: postgres
        env:
          POSTGRES_PASSWORD: postgres
          POSTGRES_DB: calaccess_website
          POSTGRES_USER: postgres
        # Hold the job until Postgres reports it is accepting connections.
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
        ports:
          - '5432:5432'
    steps:
      - id: checkout
        name: Checkout the `django-calaccess-downloads-website` repository
        uses: actions/checkout@v4
      - id: setup-python
        name: Install Python 3.9
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'
          cache: 'pipenv'
      - id: install-pipenv
        name: Install pipenv
        # `curl -f` turns HTTP errors into a non-zero exit, and an explicit
        # `shell: bash` enables `pipefail`, so a failed download fails this
        # step instead of piping an empty script into python.
        run: curl -fsSL https://raw.githubusercontent.com/pypa/pipenv/master/get-pipenv.py | python
        shell: bash
      - id: install-python-deps
        name: Install Python packages
        run: pipenv install
      - id: django-settings
        name: Configure Django settings
        run: cp project/settings_actions.py.template project/settings_local.py
        shell: bash
      - id: migrate
        name: Migrate database
        run: pipenv run python -W ignore manage.py migrate
        shell: bash
        env:
          PGPASSWORD: postgres
      - id: load-scraped
        name: Load scraped data
        run: pipenv run python -W ignore manage.py loadcalaccessscrapeddata --verbosity=3
        shell: bash
        env:
          PGPASSWORD: postgres
      - id: update
        name: Load and process CAL-ACCESS daily extract
        run: pipenv run python -W ignore manage.py updatedownloadswebsite --keep-files --verbosity=3
        shell: bash
        env:
          PGPASSWORD: postgres
      # upload-artifact v3 is retired; v4 artifacts can only be read back by
      # download-artifact v4, so the `upload` job below uses v4 as well.
      - id: upload-zip
        name: Upload CAL-ACCESS ZIP
        uses: actions/upload-artifact@v4
        with:
          name: source
          path: data/download
          if-no-files-found: error
      - id: upload-raw
        name: Upload raw CAL-ACCESS files
        uses: actions/upload-artifact@v4
        with:
          name: raw-data
          path: data/csv
          if-no-files-found: error
      - id: upload-processed
        name: Upload processed CAL-ACCESS files
        uses: actions/upload-artifact@v4
        with:
          name: processed-data
          path: data/processed
          if-no-files-found: error
      - id: upload-logs
        name: Upload logs
        uses: actions/upload-artifact@v4
        with:
          name: logs
          path: data/log
          if-no-files-found: error
      - name: Datestamp
        run: date > timestamp.txt
        shell: bash
      # Commits the refreshed timestamp back to the triggering ref. The final
      # `|| true` makes the commit/push best effort: a failed commit or push
      # does not fail the job.
      - name: Commit results
        run: |
          git config --global user.name "github-actions[bot]"
          git config --global user.email "actions@github.com"
          git config pull.rebase false
          git status
          git pull origin "$GITHUB_REF"
          git add ./timestamp.txt
          git commit -m "Scrape" && git push || true
        shell: bash

  # Pulls the raw and processed artifacts produced by `etl` onto a hosted
  # runner and publishes everything under ./data/ to Big Local News.
  upload:
    name: Upload data to Big Local News
    needs:
      - start-runner
      - etl
    runs-on: ubuntu-latest
    steps:
      - id: download-raw
        name: Download raw CAL-ACCESS files
        uses: actions/download-artifact@v4
        with:
          name: raw-data
          path: data/csv
      - id: download-processed
        name: Download processed CAL-ACCESS files
        uses: actions/download-artifact@v4
        with:
          name: processed-data
          path: data/processed
      - id: publish-to-bln
        name: Publish files to biglocalnews.org
        uses: biglocalnews/upload-files@v2
        with:
          api-key: ${{ secrets.BLN_API_TOKEN }}
          project-id: ${{ secrets.BLN_PROJECT_ID }}
          path: ./data/

  # Terminates the EC2 runner. `always()` keeps this job running when `etl`
  # fails or is cancelled, so the instance is not left running.
  stop-runner:
    name: Stop self-hosted EC2 runner
    needs:
      - start-runner
      - etl
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}
      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}