Run Scraper #303
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Heavily based on https://jacobian.org/til/github-actions-poetry/
name: Run Scraper

# Triggers: a daily schedule, plus manual runs started from the UI or the API.
on:
  schedule:
    # Every day at 15:40 (GitHub evaluates cron schedules in UTC).
    - cron: "40 15 * * *"
  workflow_dispatch:
    # Inputs accepted by a manual run.
    inputs:
      source:
        # Friendly description shown in the UI instead of the input's name.
        description: 'Source System'
        # No default is defined, so a manual run must supply a value.
        required: true
        # The data type of the input.
        type: string
jobs:
  # Scrapes one source system per matrix leg and publishes the generated data
  # files back to the repository.
  scrape:
    strategy:
      matrix:
        # Scheduled runs fan out over every source. A manual run collapses the
        # matrix to the single requested source, so that source is scraped
        # once rather than once per entry of the scheduled list.
        # toJSON() escapes the input so it always forms a valid JSON array.
        source: >-
          ${{ fromJSON(
                github.event_name == 'workflow_dispatch'
                && format('[{0}]', toJSON(inputs.source))
                || '["cfpb", "doc", "doj", "dol", "fec", "ftc", "hhs"]'
              ) }}
      # Let the remaining sources finish even if one of them fails.
      fail-fast: false
    # The type of runner that the job will run on.
    runs-on: ubuntu-latest
    environment: scraper
    # The publish step at the end pushes commits with GITHUB_TOKEN, which
    # needs write access to the repository contents.
    permissions:
      contents: write
    # Steps represent a sequence of tasks that will be executed as part of the job.
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      # Cache the installation of Poetry itself, i.e. the next step. This keeps
      # the workflow from installing Poetry on every run, which can be slow.
      # The Poetry version number and the "-0" suffix in the cache key let you
      # invalidate the cache by hand when upgrading Poetry or when something
      # goes wrong. Keep the version here in sync with the install step below.
      - name: cache poetry install
        uses: actions/cache@v4
        with:
          path: ~/.local
          key: poetry-1.6.1-0
      - uses: snok/install-poetry@v1
        with:
          version: '1.6.1'
          virtualenvs-create: true
          virtualenvs-in-project: true
      # Cache the in-project virtualenv, keyed on the lock file so the cache is
      # rebuilt whenever the locked dependencies change.
      - name: cache deps
        id: cache-deps
        uses: actions/cache@v4
        with:
          path: .venv
          key: pydeps-${{ hashFiles('**/poetry.lock') }}
      # Install dependencies. `--no-root` means "install all dependencies but
      # not the project itself", which avoids caching _your_ code. The `if`
      # ensures this only runs on a cache miss.
      - name: install dependencies
        if: steps.cache-deps.outputs.cache-hit != 'true'
        run: poetry install --no-interaction --no-root
      # Always install the project itself on top of the dependencies.
      - name: install project
        run: poetry install --no-interaction
      # The source is passed through an environment variable instead of being
      # interpolated into the script text, so a manually supplied value cannot
      # inject shell commands. The quotes keep it a single argument.
      - name: run scraper
        run: poetry run scrape "$SOURCE"
        env:
          SOURCE: ${{ matrix.source }}
          FEC_API_KEY: ${{ secrets.FEC_API_KEY }}
      # Publish the generated data files to GitHub.
      # NOTE(review): `@master` is a mutable ref on a third-party action that
      # receives a write-capable token; consider pinning it to a commit SHA.
      - uses: mikeal/publish-to-github-action@master
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}  # GitHub sets this for you
          BRANCH_NAME: 'main'