Skip to content

Commit

Permalink
Add CI for Habana Labs HPU/Gaudi2 (#5244)
Browse files Browse the repository at this point in the history
Add basic workflow that tests on hpu-gaudi2. Currently, ops are not
implemented, so the full unit tests are not yet enabled.
  • Loading branch information
loadams authored Mar 12, 2024
1 parent 49359d0 commit 2989cf7
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 0 deletions.
48 changes: 48 additions & 0 deletions .github/workflows/hpu-gaudi2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
name: hpu-gaudi2

on:
workflow_dispatch:
schedule:
- cron: "0 0 * * *"
pull_request:
paths:
- ".github/workflows/hpu-gaudi2.yml"

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

permissions:
contents: read
issues: write

jobs:
unit-tests:
# The type of runner that the job will run on
runs-on: [self-hosted, intel, gaudi2]
container:
image: vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
ports:
- 80
options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice

# Steps represent a sequence of tasks that will be executed as part of the job
steps:
# Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
- uses: actions/checkout@v3

- name: Check container state
run: |
ldd --version
hl-smi
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Install deepspeed
run: |
pip install .[dev]
ds_report
- name: Python environment
run: |
pip list
3 changes: 3 additions & 0 deletions tests/unit/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,14 @@ def bf16_required_version_check(accelerator_check=True):
cuda_version_available = CUDA_MAJOR >= 11
nccl_version_available = NCCL_MAJOR > 2 or (NCCL_MAJOR == 2 and NCCL_MINOR >= 10)
npu_available = get_accelerator().device_name() == 'npu'
hpu_available = get_accelerator().device_name() == 'hpu'

if torch_version_available and cuda_version_available and nccl_version_available and accelerator_pass:
return True
elif npu_available:
return True
elif hpu_available:
return True
else:
return False

Expand Down

0 comments on commit 2989cf7

Please sign in to comment.