nextstrain · j23414 · Aug 2, 2024 · Jul 9, 2024 · Jul 9, 2024 · Jul 9, 2024
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -1,9 +1,16 @@
 name: CI
 
 on:
-  - push
-  - pull_request
+  push:
+    branches:
+      - main  # pushes to other branches do not trigger this workflow
+  pull_request:  # no filters, so GitHub's default pull_request activity types apply
+  workflow_dispatch:  # allows manual runs
+  # Routinely check that we continue to work in the face of external changes.
+  schedule:
+    # Every day at 18:37 UTC / 10:37 Seattle (winter) / 11:37 Seattle (summer)
+    - cron: "37 18 * * *"
 
 jobs:
   ci:
-    uses: nextstrain/.github/.github/workflows/pathogen-repo-ci.yaml@v0
+    uses: nextstrain/.github/.github/workflows/pathogen-repo-ci.yaml@master  # follows the master ref instead of v0
diff --git a/.github/workflows/ingest-to-phylogenetic.yaml b/.github/workflows/ingest-to-phylogenetic.yaml
@@ -0,0 +1,102 @@
+name: Ingest to phylogenetic
+
+defaults:
+  run:
+    # This is the same as GitHub Actions' `bash` keyword as of 20 June 2023:
+    # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell
+    #
+    # Completely spelling it out here so that GitHub can't change it out from under us
+    # and we don't have to refer to the docs to know the expected behavior.
+    shell: bash --noprofile --norc -eo pipefail {0}  # {0} is GitHub's placeholder for the step's script file
+
+on:
+  schedule:
+    # Note times are in UTC, which is 1 hour behind CET and 2 hours behind CEST.
+    #
+    # Note the actual runs might be late.
+    # Numerous people were confused about that, including me:
+    #  - https://github.community/t/scheduled-action-running-consistently-late/138025/11
+    #  - https://github.com/github/docs/issues/3059
+    #
+    # Note, '*' is a special character in YAML, so you have to quote this string.
+    #
+    # Docs:
+    #  - https://docs.github.com/en/actions/learn-github-actions/events-that-trigger-workflows#schedule
+    #
+    # Tool that deciphers this particular format of crontab string:
+    #  - https://crontab.guru/
+    #
+    # Runs at 5pm UTC (1pm EDT/10am PDT) since curation by NCBI happens on the East Coast.
+    # We were running into invalid zip archive errors at 9am PDT, so hoping an hour
+    # delay will lower the error frequency.
+    - cron: '0 17 * * *'
+
+  workflow_dispatch:  # manual runs may pin a container image for each workflow
+    inputs:
+      ingest_image:
+        description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")'
+        required: false
+      phylogenetic_image:
+        description: 'Specific container image to use for phylogenetic workflow (will override the default of "nextstrain build")'
+        required: false
+
+jobs:
+  ingest:  # runs this repository's ingest workflow as a reusable workflow
+    permissions:
+      id-token: write  # lets the called workflow request a GitHub OIDC token
+    uses: ./.github/workflows/ingest.yaml
+    secrets: inherit  # pass this workflow's secrets through to the called workflow
+    with:
+      image: ${{ inputs.ingest_image }}  # empty unless set via workflow_dispatch
+
+  # Check whether ingest produced new data by looking for a cache entry keyed on
+  # the results' Metadata.sha256sum values (which upload-to-s3 should have set).
+  # GitHub removes cache entries that have not been accessed in over 7 days, so
+  # if this workflow has not run for over 7 days, phylogenetic is triggered anyway.
+  check-new-data:
+    needs: [ingest]
+    runs-on: ubuntu-latest
+    outputs:
+      cache-hit: ${{ steps.check-cache.outputs.cache-hit }}  # read by the phylogenetic job's `if`
+    steps:
+      - name: Get sha256sum
+        id: get-sha256sum
+        env:
+          AWS_DEFAULT_REGION: ${{ vars.AWS_DEFAULT_REGION }}
+        run: |
+          s3_urls=(
+            "s3://nextstrain-data/files/workflows/lassa/all/metadata.tsv.zst"
+            "s3://nextstrain-data/files/workflows/lassa/all/sequences.fasta.zst"
+          )
+
+          # Code below is modified from ingest/upload-to-s3
+          # https://github.com/nextstrain/ingest/blob/c0b4c6bb5e6ccbba86374d2c09b42077768aac23/upload-to-s3#L23-L29
+          # If the head-object call fails (e.g. the object is missing), the all-zero placeholder is recorded.
+          no_hash=0000000000000000000000000000000000000000000000000000000000000000
+
+          for s3_url in "${s3_urls[@]}"; do
+            s3path="${s3_url#s3://}"
+            bucket="${s3path%%/*}"
+            key="${s3path#*/}"
+
+            s3_hash="$(aws s3api head-object --no-sign-request --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")"
+            echo "${s3_hash}" | tee -a ingest-output-sha256sum
+          done
+
+      - name: Check cache
+        id: check-cache
+        uses: actions/cache@v4
+        with:
+          path: ingest-output-sha256sum
+          key: ingest-output-sha256sum-${{ hashFiles('ingest-output-sha256sum') }}
+          lookup-only: true  # only check whether the entry exists; skip the download
+
+  phylogenetic:  # runs this repository's phylogenetic workflow as a reusable workflow
+    needs: [check-new-data]
+    if: ${{ needs.check-new-data.outputs.cache-hit != 'true' }}  # skip when the hash file is already cached
+    permissions:
+      id-token: write  # lets the called workflow request a GitHub OIDC token
+    uses: ./.github/workflows/phylogenetic.yaml
+    secrets: inherit  # pass this workflow's secrets through to the called workflow
+    with:
+      image: ${{ inputs.phylogenetic_image }}  # empty unless set via workflow_dispatch
diff --git a/.github/workflows/phylogenetic.yaml b/.github/workflows/phylogenetic.yaml
@@ -0,0 +1,107 @@
+name: Phylogenetic
+
+defaults:
+  run:
+    # This is the same as GitHub Actions' `bash` keyword as of 20 June 2023:
+    # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell
+    #
+    # Completely spelling it out here so that GitHub can't change it out from under us
+    # and we don't have to refer to the docs to know the expected behavior.
+    shell: bash --noprofile --norc -eo pipefail {0}  # {0} is GitHub's placeholder for the step's script file
+
+on:
+  workflow_call:  # lets other workflows invoke this one via `uses:`
+    inputs:
+      image:
+        description: 'Specific container image to use for phylogenetic workflow (will override the default of "nextstrain build")'
+        required: false
+        type: string
+
+  workflow_dispatch:  # manual runs; also accept trial and data-source overrides
+    inputs:
+      image:
+        description: 'Specific container image to use for phylogenetic workflow (will override the default of "nextstrain build")'
+        required: false
+        type: string
+      trial_name:
+        description: |
+          Trial name for deploying builds.
+          If not set, builds will overwrite existing builds at s3://nextstrain-data/lassa*
+          If set, builds will be deployed to s3://nextstrain-staging/lassa_trials_<trial_name>_*
+        required: false
+        type: string
+      sequences_url:
+        description: |
+          URL for the sequences.fasta.zst file
+          If not provided, will use default sequences_url from phylogenetic/defaults/config.yaml
+        required: false
+        type: string
+      metadata_url:
+        description: |
+          URL for the metadata.tsv.zst file
+          If not provided, will use default metadata_url from phylogenetic/defaults/config.yaml
+        required: false
+        type: string
+
+jobs:
+  set_config_overrides:  # assembles the optional `--config ...` string consumed by the phylogenetic job
+    runs-on: ubuntu-latest
+    steps:
+      - id: config
+        name: Set config overrides
+        env:  # the script reads the inputs from these variables
+          TRIAL_NAME: ${{ inputs.trial_name }}
+          SEQUENCES_URL: ${{ inputs.sequences_url }}
+          METADATA_URL: ${{ inputs.metadata_url }}
+        run: |
+          config=""
+
+          if [[ "$TRIAL_NAME" ]]; then
+            config+=" deploy_url='s3://nextstrain-staging/lassa_trials_"$TRIAL_NAME"_'"
+          fi
+
+          if [[ "$SEQUENCES_URL" ]]; then
+            config+=" sequences_url='"$SEQUENCES_URL"'"
+          fi
+
+          if [[ "$METADATA_URL" ]]; then
+            config+=" metadata_url='"$METADATA_URL"'"
+          fi
+
+          if [[ $config ]]; then
+            config="--config $config"
+          fi
+
+          echo "config=$config" >> "$GITHUB_OUTPUT"
+    outputs:
+      config_overrides: ${{ steps.config.outputs.config }}  # empty when no override input was given
+
+  phylogenetic:  # runs the build through the shared pathogen-repo-build reusable workflow
+    needs: [set_config_overrides]
+    permissions:
+      id-token: write  # lets the called workflow request a GitHub OIDC token
+    uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
+    secrets: inherit  # pass this workflow's secrets through to the called workflow
+    with:
+      # Starting with the default docker runtime.
+      # We can migrate to AWS Batch when/if we need more resources or if
+      # the job runs longer than the GH Actions limit of 6 hours.
+      runtime: docker
+      env: |
+        NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.image }}
+        CONFIG_OVERRIDES: ${{ needs.set_config_overrides.outputs.config_overrides }}
+      run: |
+        nextstrain build \
+          phylogenetic \
+            deploy_all \
+            --configfile build-configs/nextstrain-automation/config.yaml \
+            $CONFIG_OVERRIDES
+      # Specifying the artifact name to differentiate ingest build outputs from
+      # the phylogenetic build outputs.
+      artifact-name: phylogenetic-build-output
+      artifact-paths: |
+        phylogenetic/auspice/
+        phylogenetic/results/
+        phylogenetic/benchmarks/
+        phylogenetic/logs/
+        phylogenetic/.snakemake/log/