TorchBench Optim Regression Detector on A100 #698
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Nightly workflow that runs the TorchBench "optim" user benchmark on an A100
# runner, checks the resulting metrics for regressions, and uploads the result
# JSON files to Scribe, S3, and the workflow artifacts.
name: TorchBench Optim Regression Detector on A100

on:
  schedule:
    # 04:00 UTC daily (midnight US Eastern during daylight saving time).
    - cron: '0 4 * * *'
  workflow_dispatch:
    inputs:
      # NOTE(review): userbenchmark_name is declared but not referenced by any
      # step below; only the optim benchmark is run.
      userbenchmark_name:
        description: "Name of the user benchmark to run"
      userbenchmark_options:
        description: "Option of the user benchmark to run"

jobs:
  run-userbenchmark:
    runs-on: linux.aws.a100
    timeout-minutes: 1440  # 24 hours
    environment: docker-s3-upload
    env:
      CONDA_ENV: "optim"
      # NOTE(review): the platform is labelled "gcp_a100" although the job runs
      # on linux.aws.a100. The value is passed to the regression detector and
      # the upload scripts, so it is left unchanged here; confirm it is
      # intentional before renaming.
      PLATFORM_NAME: "gcp_a100"
      TORCHBENCH_USERBENCHMARK_SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.TORCHBENCH_USERBENCHMARK_SCRIBE_GRAPHQL_ACCESS_TOKEN }}
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
    steps:
      - name: Checkout TorchBench
        uses: actions/checkout@v3
        with:
          # Check out into ./benchmark so results can live next to it in
          # ./benchmark-output.
          path: benchmark

      - name: Install Conda
        run: |
          set -x
          pushd benchmark
          bash ./.ci/torchbench/install-conda.sh

      - name: Install TorchBench
        run: |
          set -x
          . "${HOME}"/miniconda3/etc/profile.d/conda.sh
          conda activate "${CONDA_ENV}"
          pushd benchmark
          # only install the subset of models currently running.
          python install.py BERT_pytorch DALLE2_pytorch hf_GPT2_large hf_T5_large resnet50 timm_vision_transformer_large yolov3

      - name: Print torch.version.git_version
        run: |
          set -x
          . "${HOME}"/miniconda3/etc/profile.d/conda.sh
          conda activate "${CONDA_ENV}"
          python -c "import torch; print(torch.version.git_version)"

      - name: Run optim user benchmark
        env:
          # Pass the user-supplied options through the environment instead of
          # interpolating the expression into the script text, so the input
          # cannot inject shell syntax. Empty on scheduled runs.
          USERBENCHMARK_OPTIONS: ${{ github.event.inputs.userbenchmark_options }}
        run: |
          set -x
          . "${HOME}"/miniconda3/etc/profile.d/conda.sh
          conda activate "${CONDA_ENV}"
          # remove old results
          if [ -d benchmark-output ]; then rm -Rf benchmark-output; fi
          pushd benchmark
          if [ -d .userbenchmark ]; then rm -Rf .userbenchmark; fi
          # TODO: scale this to run other benchmarks, but let's start with optim
          # Only run a subset, see if this fixes the workflow
          # USERBENCHMARK_OPTIONS is intentionally unquoted so that it is
          # word-split into separate arguments (and vanishes when empty).
          # shellcheck disable=SC2086
          python -m userbenchmark.optim.run_optim_benchmarks -s -c ${USERBENCHMARK_OPTIONS}
          cp -r ./.userbenchmark/optim ../benchmark-output

      - name: Detect potential regressions
        # A detector failure must not prevent the uploads below.
        continue-on-error: true
        run: |
          set -x
          . "${HOME}"/miniconda3/etc/profile.d/conda.sh
          conda activate "${CONDA_ENV}"
          pushd benchmark
          # Collect metrics files, reverse-sorted by path. -maxdepth goes before
          # the -name test to avoid GNU find's option-ordering warning.
          mapfile -t RESULTS < <(find "${PWD}/../benchmark-output" -maxdepth 2 -name "metrics-*.json" | sort -r)
          # TODO: the following assumes only one metrics-*.json is found. It will keep
          # overwriting gh-issue.md if multiple are found. Scaling this up is a potential next step.
          for r in "${RESULTS[@]}"; do
            python regression_detector.py --platform "${PLATFORM_NAME}" --treatment "${r}" --owner @janeyx99 \
              --gh-issue-path gh-issue.md --errors-path errors.txt
          done

      - name: Create the github issue
        continue-on-error: true
        # Issue creation is currently switched off.
        if: false
        uses: peter-evans/create-issue-from-file@v4
        with:
          # NOTE(review): TORCHBENCH_REGRESSION_DETECTED is not set anywhere in
          # this workflow; presumably an earlier step exports it - confirm
          # before re-enabling this step.
          title: Optim Perf Signal Detected by TorchBench CI on ${{ env.TORCHBENCH_REGRESSION_DETECTED }}
          token: ${{ secrets.TORCHBENCH_ACCESS_TOKEN }}
          content-filepath: ./benchmark/gh-issue.md
          labels: |
            torchbench-perf-report

      - name: Upload result jsons to Scribe and S3
        run: |
          . "${HOME}"/miniconda3/etc/profile.d/conda.sh
          conda activate "${CONDA_ENV}"
          pushd benchmark
          mapfile -t RESULTS < <(find "${PWD}/../benchmark-output" -maxdepth 2 -name "metrics-*.json" | sort -r)
          # [*] prints every element; a bare ${RESULTS} would print only the first.
          echo "Uploading result jsons: ${RESULTS[*]}"
          for r in "${RESULTS[@]}"; do
            python ./scripts/userbenchmark/upload_scribe.py --userbenchmark_json "${r}" --userbenchmark_platform "${PLATFORM_NAME}"
            python ./scripts/userbenchmark/upload_s3.py --upload-file "${r}" --userbenchmark_platform "${PLATFORM_NAME}"
          done

      - name: Upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: TorchBench result
          path: benchmark-output/

      - name: Finally, error if errors.txt exists
        if: always()
        run: |
          set -x
          # Do not error earlier as we want all artifacts and regressions to be reported first
          # TODO: potentially move errors.txt to benchmark-output so it gets uploaded to S3
          pushd benchmark
          # Fail whenever errors.txt exists, even if printing it fails.
          if [ -e errors.txt ]; then cat errors.txt; exit 1; fi

      - name: Remove conda environment
        if: always()
        run: |
          . "${HOME}"/miniconda3/etc/profile.d/conda.sh
          # -y: never wait for interactive confirmation in CI.
          conda remove -y -n "${CONDA_ENV}" --all