Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
a736c41
enable xpu ci test
DiweiSun Aug 19, 2025
7c96ad4
Revert "enable xpu ci test"
DiweiSun Aug 20, 2025
d1122fc
enable ci test for xpu
DiweiSun Aug 20, 2025
4593d95
Create ci_test_xpu.sh
DiweiSun Aug 20, 2025
7d90b8c
Update .github/workflows/pr-test-xpu.yml
DiweiSun Aug 22, 2025
e6bc407
Update .github/workflows/pr-test-xpu.yml
DiweiSun Aug 22, 2025
c34601f
fix for trigger scenarios
DiweiSun Sep 2, 2025
3085c2b
port from pytorch repo
DiweiSun Sep 2, 2025
d9ab09e
Rename action.yml to xpu-action.yml
DiweiSun Sep 2, 2025
f87892a
update to align with pytorch
DiweiSun Sep 2, 2025
c6f07b5
Revert "Rename action.yml to xpu-action.yml"
DiweiSun Sep 4, 2025
544593a
Revert "port from pytorch repo"
DiweiSun Sep 4, 2025
2e1dc50
Update .github/workflows/pr-test-xpu.yml
DiweiSun Sep 4, 2025
188a0f8
debug for runner
DiweiSun Sep 5, 2025
421d02c
lint format fix
DiweiSun Sep 8, 2025
6f6cd17
format fix
DiweiSun Sep 8, 2025
bae7000
format fix
DiweiSun Sep 8, 2025
7bd3d29
format fix
DiweiSun Sep 9, 2025
4fd2909
format fix
DiweiSun Sep 9, 2025
4a7d9af
format fix
DiweiSun Sep 9, 2025
c3f4384
format fix
DiweiSun Sep 9, 2025
e8936cb
format fix
DiweiSun Sep 10, 2025
50e56ec
trigger by tag only
DiweiSun Sep 15, 2025
5a46341
add xpu label for xpuci
DiweiSun Sep 16, 2025
030121f
fix docker path
DiweiSun Sep 17, 2025
965677d
debug for permission issue
DiweiSun Sep 17, 2025
419d824
debug for permission issue
DiweiSun Sep 17, 2025
ac3ebb9
debug for permission issue
DiweiSun Sep 17, 2025
7cd2475
format fix
DiweiSun Sep 18, 2025
f1dc7f0
fix for pytorch checkout
DiweiSun Sep 23, 2025
01cc8c9
bugfix
DiweiSun Sep 23, 2025
fae9339
bugfix
DiweiSun Sep 23, 2025
b4e67c7
bugfix for pytorch path
DiweiSun Sep 23, 2025
7e432ce
Update pr-test-xpu.yml
DiweiSun Sep 23, 2025
c330476
Update pr-test-xpu.yml
DiweiSun Sep 23, 2025
b18b7fd
update docker image for xpu ci test
DiweiSun Sep 23, 2025
c16e289
fix script path
DiweiSun Sep 24, 2025
ad3db33
Update pr-test-xpu.yml
DiweiSun Sep 24, 2025
d9f621b
fix path typo
DiweiSun Sep 24, 2025
6a2be3c
xpu ci test cases update
DiweiSun Sep 24, 2025
fdf8963
Merge branch 'pytorch:main' into ci/xpu_enabling
DiweiSun Sep 24, 2025
c00fbdd
Update ci_test_xpu.sh
DiweiSun Sep 24, 2025
687da70
Update ci_test_xpu.sh
DiweiSun Sep 24, 2025
264e5b2
utilize conda env to xpu ci test
DiweiSun Sep 24, 2025
3027dac
w/a sccache issue
DiweiSun Sep 25, 2025
b46ca63
refine ci test script
DiweiSun Sep 26, 2025
1d9339f
refine ci workflow for xpu
DiweiSun Sep 26, 2025
6481606
Update pr-test-xpu.yml
DiweiSun Sep 26, 2025
a42e14b
typo fix
DiweiSun Sep 26, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/pytorch-probot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ ciflow_push_tags:
- ciflow/tutorials
- ciflow/rocm
- ciflow/4xh100
- ciflow/xpu
17 changes: 17 additions & 0 deletions .github/scripts/ci_test_xpu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
# Builds torchao against the nightly XPU PyTorch wheels inside a fresh conda
# environment and runs the XPU int4 tensor test.
# Must be started from the directory that contains the `torchao` checkout,
# because both the build step and the pytest path are relative to it.

# Stop at the first failing step so a broken env/install/build is reported as
# such instead of surfacing later as an unrelated pytest error.
# NOTE(review): -u is intentionally left off; it has not been verified that
# conda's activation scripts are safe under nounset.
set -eo pipefail

conda create -yn xpu_ao_ci python=3.10
source activate xpu_ao_ci

export CC=/usr/bin/gcc
export CXX=/usr/bin/g++
export SCCACHE_DISABLE=1

python3 -m pip install torch torchvision torchaudio pytorch-triton-xpu --index-url https://download.pytorch.org/whl/nightly/xpu --force-reinstall --no-cache-dir

# Build in a subshell: the working directory of this script never changes, so
# the relative pytest path below stays valid whatever happens in the build.
(cd torchao && python3 setup.py install)

# Sanity check that both packages import before spending time on the tests.
python3 -c "import torch; import torchao; print(f'Torch version: {torch.__version__}')"

# Use the interpreter's own pip so test deps land in the activated env.
python3 -m pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0'

pytest -v -s torchao/test/quantization/quantize_/workflows/int4/test_int4_plain_int32_tensor.py
231 changes: 231 additions & 0 deletions .github/workflows/pr-test-xpu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
# XPU test workflow; it only starts when a ciflow/xpu/* tag is pushed.
# TODO: this overlaps a lot with _linux-test, but reusing it would need an
# `if` in roughly a dozen places, so a dedicated workflow is probably the
# better option.

name: xpu-test

on:
  push:
    tags:
      - 'ciflow/xpu/*'

permissions:
  contents: read
  id-token: write

concurrency:
  # On refs/heads/main the group is keyed by run number, so every run gets its
  # own group; on any other ref the group is keyed by the ref, so a newer run
  # for the same ref cancels the one still in progress.
  cancel-in-progress: true
  group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}

jobs:
  test:
    # Don't run on forked repos. The original guard also checked a
    # `test-matrix` input, which this workflow does not define, so only the
    # repository-owner half of the condition is restored.
    if: github.repository_owner == 'pytorch'
    timeout-minutes: 60
    runs-on: linux.idc.xpu
    env:
      # Image name that the "Calculate docker image" step resolves to a full tag.
      DOCKER_IMAGE: ci-image:pytorch-linux-jammy-xpu-n-py3
      # Environment values are strings; quote them so the YAML type is explicit.
      PYTORCH_RETRY_TEST_CASES: '1'
      PYTORCH_OVERRIDE_FLAKY_SIGNAL: '1'
      # NOTE(review): no step in this workflow reads this variable directly;
      # confirm whether it is still needed.
      XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
    steps:
# Shallow checkout of the PyTorch nightly branch into ./pytorch; that directory
# is the working directory of the later "Calculate docker image" step.
- name: Checkout PyTorch
  uses: actions/checkout@v4
  with:
    repository: pytorch/pytorch
    path: pytorch
    ref: nightly
    submodules: false
    fetch-depth: 1

# Shallow checkout of the triggering repository and ref into ./torchao,
# including submodules; the test script is run from this tree.
- name: Checkout Torchao (ao)
  uses: actions/checkout@v4
  with:
    repository: ${{ github.repository }}
    path: torchao
    ref: ${{ github.head_ref || github.ref }}
    submodules: recursive
    fetch-depth: 1

- name: Clean all stopped docker containers
  shell: bash
  if: always()
  run: |
    # Remove stopped containers, unless another runner on this node is
    # already pruning. The grep process itself matches the pattern, so a
    # count of exactly 1 means nobody else is running a prune.
    prune_procs=$(ps -ef | grep -c "docker container prune")
    if (( prune_procs == 1 )); then
      docker container prune -f
    fi

# Purely informational: every `cat` is allowed to fail so a missing file
# never fails the job.
- name: Runner health check system info
  shell: bash
  if: always()
  run: |
    for info_file in \
      /etc/os-release \
      /etc/apt/sources.list.d/oneAPI.list \
      /etc/apt/sources.list.d/intel-gpu-jammy.list
    do
      cat "${info_file}" || true
    done
    whoami

# Prints the xpu-smi device listing; bounded to 30 seconds and never fatal.
- name: Runner health check xpu-smi
  shell: bash
  if: always()
  run: |
    timeout 30 xpu-smi discovery || true

# Fails the job when xpu-smi reports no device. The step name carries the
# tool so it can be told apart from the later clinfo-based check, and the
# message points at pytorch/ao like that check does.
- name: Runner health check GPU count (xpu-smi)
  if: always()
  shell: bash
  run: |
    # `|| true` keeps a zero match count (grep exits 1) from aborting the step
    # before the explicit error below is printed.
    ngpu=$(timeout 30 xpu-smi discovery | grep -c -E 'Device Name' || true)
    msg="Please file an issue on pytorch/ao reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
    if [[ $ngpu -eq 0 ]]; then
      echo "Error: Failed to detect any GPUs on the runner"
      echo "$msg"
      exit 1
    fi

- name: Runner diskspace health check
  if: always()
  uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main

# Only runs when an earlier step has failed.
# NOTE(review): killing runsvc.sh presumably takes this runner offline so it
# stops picking up jobs; confirm against the runner setup.
- name: Runner health check disconnect on failure
  shell: bash
  if: failure()
  run: |
    killall runsvc.sh

- name: Preserve github env variables for use in docker
  shell: bash
  run: |
    # Append every GITHUB* variable, then every CI* variable, to a per-run
    # file under /tmp.
    env_file="/tmp/github_env_${GITHUB_RUN_ID}"
    for prefix in GITHUB CI; do
      env | grep "^${prefix}" >> "${env_file}"
    done

# Exports GPU_FLAG (docker device and group options) for the Test step.
- name: XPU set GPU_FLAG
  shell: bash
  run: |
    # Add render group for container creation.
    # Look the group up by exact name: a plain `grep render` over /etc/group
    # also matches other lines containing "render" and could return several
    # gids, which would corrupt the GITHUB_ENV entry below.
    render_gid=$(getent group render | cut -d: -f3)
    if [[ -z "${render_gid}" ]]; then
      echo "Error: could not determine the gid of the 'render' group"
      exit 1
    fi
    echo "GPU_FLAG=--device=/dev/mem --device=/dev/dri --group-add video --group-add ${render_gid}" >> "${GITHUB_ENV}"

# Assumes the role below (relies on the workflow's `id-token: write`
# permission), then logs docker in to Amazon ECR.
- name: configure aws credentials
  id: aws_creds
  uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
  with:
    aws-region: us-east-1
    role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only

- name: Login to Amazon ECR
  id: login-ecr
  uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1

# Resolves the job-level DOCKER_IMAGE name to a full image reference, using
# the ./pytorch checkout as working directory.
- name: Calculate docker image
  id: calculate-docker-image
  uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
  with:
    repo-name: pytorch
    working-directory: pytorch
    docker-image-name: ${{ env.DOCKER_IMAGE }}

# Log-only helper: prints a `docker pull` line for the ghcr.io mirror.
- name: Use following to pull public copy of the image
  id: print-ghcr-mirror
  shell: bash
  env:
    ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
  run: |
    # Keep only what follows the last ':' of the image reference.
    tag=${ECR_DOCKER_IMAGE##*:}
    echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"

- name: Pull docker image
  uses: pytorch/test-infra/.github/actions/pull-docker-image@main
  with:
    docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

# Fails the job when `clinfo -l` lists no device on the runner.
- name: Runner health check GPU count
  shell: bash
  if: always()
  run: |
    msg="Please file an issue on pytorch/ao reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
    # grep -c exits 1 on a zero count; `|| true` lets the check below report it.
    ngpu=$(timeout 30 clinfo -l | grep -c -E 'Device' || true)
    if (( ngpu == 0 )); then
      echo "Error: Failed to detect any GPUs on the runner"
      echo "$msg"
      exit 1
    fi

# Starts the CI image as a detached container with the workspace mounted and
# runs TEST_COMMAND inside it. The container id is exported as CONTAINER_NAME
# for the later backtrace and stop steps.
- name: Test
  id: test
  env:
    TEST_COMMAND: torchao/.github/scripts/ci_test_xpu.sh
    DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
    PR_NUMBER: ${{ github.event.pull_request.number }}
    GITHUB_REPOSITORY: ${{ github.repository }}
    GITHUB_WORKFLOW: ${{ github.workflow }}
    GITHUB_JOB: ${{ github.job }}
    GITHUB_RUN_ID: ${{ github.run_id }}
    GITHUB_RUN_NUMBER: ${{ github.run_number }}
    GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
    SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
  timeout-minutes: 60
  run: |
    set -x

    # The detached container is stopped by the "Stop container before exit"
    # step. `docker run --detach` prints the new container's id, which is
    # what ends up in container_name.
    #
    # Two flags were dropped because they had no effect:
    #   --name="${container_name}"  read the variable this very command
    #                               assigns, so it was always empty;
    #   --user $(id -u):$(id -g)    was followed by a second `--user jenkins`
    #                               on the same command line.
    # NOTE(review): `--user jenkins` is kept as the intended user; confirm it
    # can write to the mounted workspace, since the test script builds there.
    #
    # GPU_FLAG is left unquoted on purpose so it splits into separate options.
    # shellcheck disable=SC2086,SC2090
    container_name=$(docker run \
      ${GPU_FLAG:-} \
      -e PR_NUMBER \
      -e GITHUB_ACTIONS \
      -e GITHUB_REPOSITORY \
      -e GITHUB_WORKFLOW \
      -e GITHUB_JOB \
      -e GITHUB_RUN_ID \
      -e GITHUB_RUN_NUMBER \
      -e GITHUB_RUN_ATTEMPT \
      -e JOB_ID \
      -e BRANCH \
      -e SHA1 \
      --ulimit stack=10485760:83886080 \
      --ulimit core=0 \
      --security-opt seccomp=unconfined \
      --cap-add=SYS_PTRACE \
      --shm-size="8g" \
      --tty \
      --detach \
      --user jenkins \
      --privileged \
      -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
      -w /var/lib/jenkins/workspace \
      "${DOCKER_IMAGE}"
    )
    # save container name for later step
    echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
    # TEST_COMMAND is relative to the mounted workspace, which is the
    # container's working directory.
    docker exec -t "${container_name}" sh -c "bash ${TEST_COMMAND}"

# For every core file found under the workspace, print a gdb backtrace from
# inside the test container.
- name: Collect backtraces from coredumps (if any)
  if: always()
  run: |
    # shellcheck disable=SC2156
    find . -iname "core.[1-9]*" \
      -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

- name: Stop container before exit
  if: always()
  shell: bash
  run: |
    # Workaround for multiple runners on same IDC node
    # CONTAINER_NAME is only exported once the Test step has started the
    # container; skip the stop when the job failed before that point, since
    # `docker stop ""` would add a second, misleading error.
    if [[ -n "${CONTAINER_NAME:-}" ]]; then
      docker stop "${CONTAINER_NAME}"
    else
      echo "No container was started; nothing to stop"
    fi

# Uploads any core files found under the workspace when the job has failed.
- name: Store Core dumps on GitHub
  uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
  if: failure()
  with:
    # This job defines no matrix, so the former matrix.* fields all rendered
    # empty ("coredumps----"). Name the artifact after the job, run and
    # attempt instead so it is identifiable and unique per attempt.
    name: coredumps-${{ github.job }}-${{ github.run_id }}-${{ github.run_attempt }}
    retention-days: 14
    if-no-files-found: ignore
    path: ./**/core.[1-9]*

- name: Teardown XPU
  # Without an explicit condition a step is skipped once any earlier step
  # has failed, which is exactly when the teardown is needed most.
  if: always()
  uses: pytorch/pytorch/.github/actions/teardown-xpu@main
Loading