(3/n) Support 2D Parallelism - Efficient loading of full-state checkpoints #20278
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Test PyTorch - TPU

# Triggers: pushes to master or any release/* branch, and pull-request
# activity (via `pull_request_target`) against those same branches.
on:
  push:
    branches:
      - master
      - "release/*"
  pull_request_target:
    branches:
      - master
      - "release/*"
    types:
      - opened
      - reopened
      - ready_for_review
      - labeled
      - synchronize
# One concurrency group per workflow + ref + PR head branch.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
  # The PR trigger of this workflow is `pull_request_target`, so the event name
  # is never exactly 'pull_request'; match on the prefix so that superseded PR
  # runs are actually cancelled, while push runs are left to finish.
  cancel-in-progress: ${{ startsWith(github.event_name, 'pull_request') }}
# Every `run:` step executes with bash unless the step sets its own `shell:`.
defaults:
  run:
    shell: bash
jobs:
  # For each matrix entry: create a preemptible TPU VM, copy the repository to
  # it, run that package's TPU test script there, then delete the VM.
  test-on-tpus:
    runs-on: ubuntu-22.04
    # run only for pushes to master, or for PRs that carry the 'run TPU' label
    if: |
      (github.event_name == 'push' && github.ref == 'refs/heads/master') ||
      (startsWith(github.event_name, 'pull_request') && contains(github.event.pull_request.labels.*.name, 'run TPU'))
    strategy:
      fail-fast: false
      matrix:
        pkg-name: ["fabric", "pytorch"]
        accelerator_type: ["v4-8"]
    timeout-minutes: 30
    env:
      XLA_VER: "2.0"
      # PR number on pull-request events, the literal 'master' otherwise
      PR_NUMBER: ${{ github.event.pull_request.number || 'master' }}
      # PR head commit on pull-request events, the triggering commit otherwise
      SHA: ${{ github.event.pull_request.head.sha || github.sha }}
      CLOUDSDK_CORE_DISABLE_PROMPTS: "1" # default to --quiet
    steps:
      - name: Set env
        run: |
          # define --zone: https://cloud.google.com/tpu/docs/regions-zones
          if [[ "${{ matrix.accelerator_type }}" == v4* ]]; then
            echo "CLOUDSDK_COMPUTE_ZONE=us-central2-b" >> "$GITHUB_ENV"
          else
            echo "CLOUDSDK_COMPUTE_ZONE=us-west4-a" >> "$GITHUB_ENV"
          fi

      - uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.head.sha }}

      - uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.GKE_SA_KEY_BASE64 }}

      - uses: "google-github-actions/setup-gcloud@v2"

      # Deletes TPU VMs older than 35 minutes (just above this job's 30-minute
      # timeout) unless their name contains "keepalive".
      - name: Time-based job cleanup
        if: always()
        run: |
          gcloud compute tpus tpu-vm list --format='value(name,createTime)' > creation_times.txt
          cat creation_times.txt

          if [ ! -s "creation_times.txt" ]; then
            echo "No existing jobs"
            exit 0
          fi

          jobs_deleted=false
          while read -r job_name created_at; do
            # Skip jobs with "keepalive" in the name
            if [[ "$job_name" == *"keepalive"* ]]; then
              echo "Skipping $job_name, has keepalive in name"
              continue
            fi

            # Convert the creation time to Unix timestamp
            created_timestamp=$(date -d "${created_at}" +%s)
            # Calculate the difference between the current time and the creation time
            current_timestamp=$(date +%s)
            age=$((current_timestamp - created_timestamp))

            # Check if the age has surpassed a timeout
            if ((age > 35 * 60)); then
              # delete the job
              gcloud compute tpus tpu-vm delete "$job_name" --async
              jobs_deleted=true
            else
              echo "Skipping $job_name, alive for $age seconds"
            fi
          done < creation_times.txt

          if [ "$jobs_deleted" = true ]; then
            sleep 5
            # diagnostics
            gcloud compute tpus tpu-vm list
          fi

      # Substitutes the {PYTORCH_VERSION} placeholder in the package's TPU test
      # script with the value of XLA_VER, and prints the resulting script.
      - name: Update script
        run: |
          import os
          fname = 'tests/tests_${{ matrix.pkg-name }}/run_tpu_tests.sh'
          with open(fname) as fopen:
              data = fopen.read()
          data = data.replace('{PYTORCH_VERSION}', os.environ["XLA_VER"])
          print(data)
          with open(fname, "w") as fopen:
              fopen.write(data)
        shell: python

      - name: Create node
        id: tpu-create
        # TPU capacity is very limited so this workflow's success is optional. continue normally if creation fails
        continue-on-error: true
        env:
          JOB_NAME: ${{ env.PR_NUMBER }}-${{ matrix.pkg-name }}-${{ matrix.accelerator_type }}-${{ env.SHA }}
        run: |
          if [[ "${{ matrix.accelerator_type }}" == v4* ]]; then
            gcloud compute tpus tpu-vm create "$JOB_NAME" \
              --accelerator-type=${{ matrix.accelerator_type }} \
              --version="tpu-vm-v4-pt-$XLA_VER" \
              --preemptible
          fi

      - name: Run tests
        # skipped when the TPU VM could not be created
        if: steps.tpu-create.outcome == 'success'
        env:
          JOB_NAME: ${{ env.PR_NUMBER }}-${{ matrix.pkg-name }}-${{ matrix.accelerator_type }}-${{ env.SHA }}
        run: |
          set -uex

          # zip-copy-unzip the repository
          zip -q -r repo.zip . -x .git/
          gcloud compute tpus tpu-vm scp --worker=all repo.zip "$JOB_NAME":~
          gcloud compute tpus tpu-vm ssh "$JOB_NAME" --worker=all --command="cd ~; unzip -q -o repo.zip"

          # run script
          # `set -e` is active, so the status has to be captured with `||`;
          # otherwise a test failure would abort here and the coverage file
          # would never be pulled.
          exit_code=0
          gcloud compute tpus tpu-vm ssh "$JOB_NAME" --worker=all --command="cd ~; bash tests/tests_${{ matrix.pkg-name }}/run_tpu_tests.sh" || exit_code=$?

          # pull out the coverage file
          gcloud compute tpus tpu-vm scp "$JOB_NAME":~/coverage.xml .

          # report the status of the test run itself
          exit $exit_code

      - name: Cleanup job
        # runs regardless of the outcome of the previous steps
        if: always()
        env:
          JOB_NAME: ${{ env.PR_NUMBER }}-${{ matrix.pkg-name }}-${{ matrix.accelerator_type }}-${{ env.SHA }}
        run: |
          if ! gcloud compute tpus tpu-vm list | grep -q "$JOB_NAME"; then
            echo "$JOB_NAME wasn't created"
            exit 0
          fi

          # diagnostics
          gcloud compute tpus tpu-vm describe "$JOB_NAME"

          # delete the job
          gcloud compute tpus tpu-vm delete "$JOB_NAME" --async
          sleep 5

          # diagnostics
          gcloud compute tpus tpu-vm list

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        # a failed upload must not fail the job
        continue-on-error: true
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: coverage.xml
          flags: tpu,pytest
          name: TPU-coverage
          fail_ci_if_error: false