# Captured from the GitHub Actions run page for pull request #181,
# "[CI] add ray ci in github action".
# Workflow file for this run:

# Ray CI: triggered by pull requests that target `main`.
name: Ray CI

on:
  pull_request:
    branches: [main]

# One active run per workflow/ref pair; a newer run cancels the one in flight.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

# Workflow-level environment, exported to every job and step.
env:
  PYTHON_VERSION: '3.11'
  DOCKER_BUILDKIT: '1'
  RAY_CI_POST_WHEEL_TESTS: '1'

jobs:
# Job (child of the workflow's `jobs:` mapping): starts the Aliyun ECS instance
# (presumably the host of the self-hosted runner - confirm) and attaches its
# disk. `check-permissions` waits on this job.
start-ecs-runner:
  name: Start ECS Runner
  runs-on: ubuntu-latest
  environment: aliyun
  steps:
    - name: Configure Aliyun CLI
      # Secrets are handed to the script as environment variables rather than
      # being expanded into the script text, so unusual characters in a value
      # cannot change how the shell parses the commands.
      env:
        ALIYUN_ACCESS_KEY: ${{ secrets.ALIYUN_ACCESS_KEY }}
        ALIYUN_ACCESS_SECRET: ${{ secrets.ALIYUN_ACCESS_SECRET }}
        ALIYUN_ECS_INSTANCE_ID: ${{ secrets.ALIYUN_ECS_INSTANCE_ID }}
        ALIYUN_DISK_ID: ${{ secrets.ALIYUN_DISK_ID }}
      run: |
        region=us-west-1
        max_attempts=3   # tries for each mutating API call
        max_polls=60     # 60 polls x 10 s = at most ~10 minutes of waiting

        wget https://aliyuncli.alicdn.com/aliyun-cli-linux-latest-amd64.tgz
        tar -xvf aliyun-cli-linux-latest-amd64.tgz
        ./aliyun configure set --profile default \
          --access-key-id "$ALIYUN_ACCESS_KEY" \
          --access-key-secret "$ALIYUN_ACCESS_SECRET" \
          --region "$region"

        # Retry StartInstance, pausing 10 s between attempts.
        for i in $(seq 1 "$max_attempts"); do
          if ./aliyun ecs StartInstance --InstanceId "$ALIYUN_ECS_INSTANCE_ID" --region "$region"; then
            echo "StartInstance succeeded on attempt $i"
            break
          else
            echo "StartInstance failed on attempt $i"
            if [ "$i" -eq "$max_attempts" ]; then
              echo "All attempts failed - exiting"
              exit 1
            fi
            sleep 10
          fi
        done

        # Wait for the instance to report "Running". The wait is bounded so a
        # stuck instance fails this job instead of spinning until the job
        # timeout.
        for poll in $(seq 1 "$max_polls"); do
          if ./aliyun ecs DescribeInstanceStatus --InstanceId "$ALIYUN_ECS_INSTANCE_ID" --region "$region" | grep "Running"; then
            break
          fi
          if [ "$poll" -eq "$max_polls" ]; then
            echo "Instance did not reach Running state in time - exiting"
            exit 1
          fi
          sleep 10
        done

        # Retry AttachDisk, pausing 10 s between attempts.
        for i in $(seq 1 "$max_attempts"); do
          if ./aliyun ecs AttachDisk --InstanceId "$ALIYUN_ECS_INSTANCE_ID" --DiskId "$ALIYUN_DISK_ID" --region "$region"; then
            echo "AttachDisk succeeded on attempt $i"
            break
          else
            echo "AttachDisk failed on attempt $i"
            if [ "$i" -eq "$max_attempts" ]; then
              echo "All attempts failed - exiting"
              exit 1
            fi
            sleep 10
          fi
        done
# Job (child of the workflow's `jobs:` mapping): decides whether the account
# that triggered this run may use the self-hosted runner. The result is
# published as the `allowed` output ('true' / 'false'), which later jobs gate on.
check-permissions:
  name: Check User Permissions
  needs: start-ecs-runner
  runs-on: self-hosted
  outputs:
    allowed: ${{ steps.check_user.outputs.allowed }}
  steps:
    - name: Check if user is allowed
      id: check_user
      run: |
        config_file=/root/.workflow.cfg
        allowed=false

        if [ -f "$config_file" ]; then
          # The config holds a line `allowed_users=<name>,<name>,...`.
          # GITHUB_ACTOR (the account that triggered the run) must equal one
          # comma-separated entry exactly. Fixed-string, whole-line matching
          # (-F -x) keeps characters such as `[` in an actor name from being
          # interpreted as regular-expression syntax.
          if sed -n 's/^allowed_users=//p' "$config_file" | tr ',' '\n' | grep -Fxq -- "${GITHUB_ACTOR}"; then
            allowed=true
          else
            echo "User ${GITHUB_ACTOR} is not authorized to trigger this workflow"
          fi
        else
          # Fail closed: without a config file nobody is allowed.
          echo "Config file ${config_file} not found - denying access"
        fi

        echo "allowed=${allowed}" >> "$GITHUB_OUTPUT"
# Job (child of the workflow's `jobs:` mapping): makes sure the two base CI
# images exist in GitHub Container Registry, building and pushing whichever one
# is missing. Runs only when `check-permissions` reported allowed == 'true'.
build-base-images:
  name: Build Base Images
  needs: check-permissions
  if: needs.check-permissions.outputs.allowed == 'true'
  runs-on: self-hosted
  # Pushing to ghcr.io with GITHUB_TOKEN needs `packages: write`; declaring it
  # here avoids depending on the repository's default token permissions.
  permissions:
    contents: read
    packages: write
  strategy:
    matrix:
      python: ['3.11']
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Login to GitHub Container Registry
      uses: docker/login-action@v2
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v2

    # `docker manifest inspect` succeeds only if the tag can be read from the
    # registry; the result is exposed as the step output `exists`.
    - name: Check Test Image Existence
      id: check_test_image
      run: |
        if docker manifest inspect ghcr.io/${{ github.repository }}/oss-ci-base_test-py${{ matrix.python }} > /dev/null 2>&1; then
          echo "exists=true" >> "$GITHUB_OUTPUT"
        else
          echo "exists=false" >> "$GITHUB_OUTPUT"
        fi

    - name: Build Base Test Image
      if: steps.check_test_image.outputs.exists != 'true'
      uses: docker/build-push-action@v3
      with:
        context: .
        file: ci/docker/base.test.Dockerfile
        tags: |
          ghcr.io/${{ github.repository }}/oss-ci-base_test-py${{ matrix.python }}
        push: true

    - name: Check Build Image Existence
      id: check_build_image
      run: |
        if docker manifest inspect ghcr.io/${{ github.repository }}/oss-ci-base_build-py${{ matrix.python }} > /dev/null 2>&1; then
          echo "exists=true" >> "$GITHUB_OUTPUT"
        else
          echo "exists=false" >> "$GITHUB_OUTPUT"
        fi

    # The build image is layered on the test image, passed in through the
    # DOCKER_IMAGE_BASE_TEST build argument.
    - name: Build OSS CI Base
      if: steps.check_build_image.outputs.exists != 'true'
      uses: docker/build-push-action@v3
      with:
        context: .
        file: ci/docker/base.build.Dockerfile
        tags: ghcr.io/${{ github.repository }}/oss-ci-base_build-py${{ matrix.python }}
        build-args: |
          DOCKER_IMAGE_BASE_TEST=ghcr.io/${{ github.repository }}/oss-ci-base_test-py${{ matrix.python }}
        push: true
# Job (child of the workflow's `jobs:` mapping): builds the core and manylinux
# CI images into the local registry at localhost:5000, runs the Ray test suites
# through //ci/ray_ci:test_in_docker, then generates and uploads test reports.
# Runs only when `check-permissions` reported allowed == 'true'.
core-tests:
  name: Core Tests
  env:
    # Per-commit Bazel directory located next to the workspace checkout.
    BAZEL_DIR: ${{ github.workspace }}/../tmp/bazel/${{ github.sha }}
    # Fixed build id; it is part of the local image tags used below.
    RAYCI_BUILD_ID: main
    RAY_DIR: ${{ github.workspace }}
    RAYCI_CHECKOUT_DIR: ${{ github.workspace }}
    COMMIT_HASH: ${{ github.sha }}
  needs: [build-base-images, check-permissions]
  if: needs.check-permissions.outputs.allowed == 'true'
  runs-on: self-hosted
  strategy:
    matrix:
      python: ['3.11']
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Login to GitHub Container Registry
      uses: docker/login-action@v2
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v2

    # Looks for the tag among the images already present on the runner and
    # exposes the result as the step output `exists`.
    - name: Check Core Image Existence
      id: check_core_image
      run: |
        if docker image ls --format '{{.Repository}}:{{.Tag}}' | grep -q "localhost:5000/citemp:${{ env.RAYCI_BUILD_ID }}-corebuild"; then
          echo "exists=true" >> "$GITHUB_OUTPUT"
        else
          echo "exists=false" >> "$GITHUB_OUTPUT"
        fi

    - name: Build core CI Base
      if: steps.check_core_image.outputs.exists != 'true'
      uses: docker/build-push-action@v3
      with:
        context: .
        file: ci/docker/core.build.Dockerfile
        tags: |
          localhost:5000/citemp:${{ env.RAYCI_BUILD_ID }}-corebuild
        build-args: |
          RAYCI_IS_GPU_BUILD=false
          BUILDKITE=true
          DOCKER_IMAGE_BASE_BUILD=ghcr.io/${{ github.repository }}/oss-ci-base_build-py${{ matrix.python }}
        push: true

    # Runs even when the build step above was skipped, so an image that only
    # exists locally is still pushed to the local registry.
    - name: Explicitly push core build image to local registry
      if: '!cancelled()'
      run: |
        docker push localhost:5000/citemp:${{ env.RAYCI_BUILD_ID }}-corebuild

    - name: Check Manylinux Image Existence
      id: check_manylinux_image
      run: |
        if docker image ls --format '{{.Repository}}:{{.Tag}}' | grep -q "localhost:5000/citemp:${{ env.RAYCI_BUILD_ID }}-manylinux"; then
          echo "exists=true" >> "$GITHUB_OUTPUT"
        else
          echo "exists=false" >> "$GITHUB_OUTPUT"
        fi

    # NOTE(review): BUILDKITE_BAZEL_CACHE_URL is listed without a value and is
    # not set anywhere in the visible workflow; presumably it is meant to be
    # taken from the runner's environment - confirm.
    - name: Build Manylinux Image
      if: steps.check_manylinux_image.outputs.exists != 'true'
      uses: docker/build-push-action@v3
      with:
        context: .
        file: ci/docker/manylinux.Dockerfile
        tags: |
          localhost:5000/citemp:${{ env.RAYCI_BUILD_ID }}-manylinux
        push: true
        build-args: |
          BUILDKITE_BAZEL_CACHE_URL
          HOSTTYPE=x86_64

    # Every test step below uses `if: '!cancelled()'`, so a failing suite does
    # not prevent the remaining suites from running.
    - name: Run Core Python Tests (1)
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        echo "Running core: python tests..."
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... core \
          --workers 1 --worker-id 0 --parallelism-per-worker 12 \
          --except-tags debug_tests,asan_tests,post_wheel_build,ha_integration,mem_pressure,tmpfs,container,manual,use_all_core,multi_gpu,large_size_python_tests_shard_2 \
          --test-env=BAZEL_DIR="$(realpath "${{ env.BAZEL_DIR }}")"

    # `--skip-ray-installation` is passed once (it used to appear twice here).
    - name: Run Core Python Tests (2)
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        echo "Running core: python tests..."
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/dag/... python/ray/autoscaler/v2/... core \
          --workers 1 --worker-id 0 --parallelism-per-worker 12 --skip-ray-installation \
          --except-tags debug_tests,asan_tests,post_wheel_build,ha_integration,mem_pressure,tmpfs,container,manual,use_all_core,multi_gpu,large_size_python_tests_shard_2 \
          --test-env=BAZEL_DIR="$(realpath "${{ env.BAZEL_DIR }}")"

    # Only the tests tagged `use_all_core`, which steps (1) and (2) exclude.
    - name: Run Core Python Tests (3)
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        #echo "Running core: python tests..."
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... //python/ray/dag/... python/ray/autoscaler/v2/... core \
          --workers 1 --worker-id 0 --parallelism-per-worker 12 --skip-ray-installation \
          --only-tags use_all_core \
          --test-env=BAZEL_DIR="$(realpath "${{ env.BAZEL_DIR }}")"

    # Rebuilds the corebuild tag from core.build.ant.Dockerfile, using the
    # current corebuild image as its base. The result is not pushed here.
    - name: Update Test Image
      if: '!cancelled()'
      run: |
        docker build -t localhost:5000/citemp:${{ env.RAYCI_BUILD_ID }}-corebuild -f ci/docker/core.build.ant.Dockerfile \
          --build-arg DOCKER_IMAGE_BASE=localhost:5000/citemp:${{ env.RAYCI_BUILD_ID }}-corebuild \
          .

    - name: Run Core Cpp Tests
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        echo "Running core: cpp tests..."
        bazel run //ci/ray_ci:test_in_docker -- //:all //src/... core --build-type clang \
          --workers 1 --worker-id 0 --parallelism-per-worker 12 --skip-ray-installation \
          --test-env=BAZEL_DIR="$(realpath "${{ env.BAZEL_DIR }}")"

    - name: Run Dashboard Tests
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        echo "Running dashboard tests..."
        bazel run //ci/ray_ci:test_in_docker -- python/ray/dashboard/... core \
          --parallelism-per-worker 12 --skip-ray-installation \
          --test-env=BAZEL_DIR="$(realpath "${{ env.BAZEL_DIR }}")"

    # Two passes over the workflow tests: everything except `use_all_core`
    # first, then only `use_all_core`.
    - name: Run Workflow Tests
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        echo "Running workflow tests..."
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/workflow/... core \
          --workers 1 --worker-id 0 \
          --except-tags use_all_core \
          --parallelism-per-worker 12 \
          --skip-ray-installation \
          --test-env=BAZEL_DIR="$(realpath "${{ env.BAZEL_DIR }}")"
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/workflow/... core \
          --workers 1 --worker-id 0 \
          --skip-ray-installation \
          --only-tags use_all_core \
          --test-env=BAZEL_DIR="$(realpath "${{ env.BAZEL_DIR }}")"

    - name: Run Debug Tests
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        echo "Running debug tests..."
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... core \
          --build-type debug \
          --parallelism-per-worker 12 \
          --only-tags debug_tests \
          --except-tags kubernetes,manual \
          --test-env=BAZEL_DIR="$(realpath "${{ env.BAZEL_DIR }}")" \
          --skip-ray-installation

    - name: Run ASAN Tests
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        echo "Running ASAN tests..."
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... core \
          --build-type asan \
          --parallelism-per-worker 12 \
          --only-tags asan_tests \
          --except-tags kubernetes,manual \
          --skip-ray-installation \
          --test-env=BAZEL_DIR="$(realpath "${{ env.BAZEL_DIR }}")"

    # Exports the resolved BAZEL_DIR as `artifacts_path` for later steps.
    # NOTE(review): nothing in the visible workflow reads `artifacts_path`;
    # presumably ci/ray_ci/report_gen.py does - confirm.
    - name: Workaround
      if: '!cancelled()'
      run: |
        echo "artifacts_path=$(realpath "${{ env.BAZEL_DIR }}")" >> "$GITHUB_ENV"

    - name: Generate Report
      if: '!cancelled()'
      run: |
        python ci/ray_ci/report_gen.py

    # Best-effort cleanup: runs even after cancellation and never fails the job.
    - name: Cleanup Containers
      if: always()
      continue-on-error: true
      run: |
        docker ps -a --filter ancestor=localhost:5000/citemp:${{ env.RAYCI_BUILD_ID }}-corebuild -q | xargs -r docker rm --force || true
        # Clean up any dangling containers
        docker ps -a --filter status=exited -q | xargs -r docker rm || true

    - name: Upload Test Reports
      if: '!cancelled()'
      continue-on-error: true
      uses: actions/upload-artifact@v4
      with:
        name: test-reports-${{ github.sha }}
        path: ${{ env.BAZEL_DIR }}/reports/
        retention-days: 7
        if-no-files-found: ignore
# Job (child of the workflow's `jobs:` mapping): force-stops the Aliyun ECS
# instance once every other job has finished. `if: always()` makes it run even
# when earlier jobs failed, were skipped or were cancelled.
stop-ecs-runner:
  name: Stop ECS Instance
  environment: aliyun
  needs:
    - start-ecs-runner
    - check-permissions
    - build-base-images
    - core-tests
  if: always()
  runs-on: ubuntu-latest
  steps:
    - name: Configure Aliyun CLI
      # Secrets are handed to the script as environment variables rather than
      # being expanded into the script text, so unusual characters in a value
      # cannot change how the shell parses the commands.
      env:
        ALIYUN_ACCESS_KEY: ${{ secrets.ALIYUN_ACCESS_KEY }}
        ALIYUN_ACCESS_SECRET: ${{ secrets.ALIYUN_ACCESS_SECRET }}
        ALIYUN_ECS_INSTANCE_ID: ${{ secrets.ALIYUN_ECS_INSTANCE_ID }}
      run: |
        region=us-west-1
        max_attempts=3   # tries for the StopInstance call
        max_polls=60     # 60 polls x 10 s = at most ~10 minutes of waiting

        wget https://aliyuncli.alicdn.com/aliyun-cli-linux-latest-amd64.tgz
        tar -xvf aliyun-cli-linux-latest-amd64.tgz
        ./aliyun configure set --profile default \
          --access-key-id "$ALIYUN_ACCESS_KEY" \
          --access-key-secret "$ALIYUN_ACCESS_SECRET" \
          --region "$region"

        # Retry StopInstance, pausing 10 s between attempts.
        for i in $(seq 1 "$max_attempts"); do
          if ./aliyun ecs StopInstance --InstanceId "$ALIYUN_ECS_INSTANCE_ID" --ForceStop true --region "$region"; then
            echo "StopInstance succeeded on attempt $i"
            break
          else
            echo "StopInstance failed on attempt $i"
            if [ "$i" -eq "$max_attempts" ]; then
              echo "All attempts failed - exiting"
              exit 1
            fi
            sleep 10
          fi
        done

        # Wait for the instance to report "Stopped". The wait is bounded so a
        # stuck instance fails this job instead of spinning until the job
        # timeout.
        for poll in $(seq 1 "$max_polls"); do
          if ./aliyun ecs DescribeInstanceStatus --InstanceId "$ALIYUN_ECS_INSTANCE_ID" --region "$region" | grep "Stopped"; then
            break
          fi
          if [ "$poll" -eq "$max_polls" ]; then
            echo "Instance did not reach Stopped state in time - exiting"
            exit 1
          fi
          sleep 10
        done