[CI] add ray ci in github action #170
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Ray CI: runs for pull requests that target the `main` branch.
name: Ray CI

on:
  pull_request:
    branches:
      - main

# Workflow-wide environment, inherited by every job and step.
# NOTE(review): PYTHON_VERSION is not referenced by the jobs visible in this
# file (they use `matrix.python`) - confirm whether a script reads it.
env:
  PYTHON_VERSION: '3.11'
  DOCKER_BUILDKIT: '1'
  RAY_CI_POST_WHEEL_TESTS: '1'

jobs:
# Starts the Aliyun ECS instance and blocks until its status is `Running`.
# `check-permissions` (a self-hosted job) declares this job in its `needs`.
start-ecs-runner:
  name: Start ECS Runner
  runs-on: ubuntu-latest
  environment: aliyun
  steps:
    - name: Configure Aliyun CLI
      # Secrets reach the script as environment variables instead of being
      # spliced into the script text, so their contents are never re-parsed
      # by the shell.
      env:
        ALIYUN_ACCESS_KEY: ${{ secrets.ALIYUN_ACCESS_KEY }}
        ALIYUN_ACCESS_SECRET: ${{ secrets.ALIYUN_ACCESS_SECRET }}
        ALIYUN_ECS_INSTANCE_ID: ${{ secrets.ALIYUN_ECS_INSTANCE_ID }}
        ALIYUN_REGION: us-west-1
      run: |
        wget https://aliyuncli.alicdn.com/aliyun-cli-linux-latest-amd64.tgz
        tar -xvf aliyun-cli-linux-latest-amd64.tgz
        ./aliyun configure set --profile default \
          --access-key-id "${ALIYUN_ACCESS_KEY}" \
          --access-key-secret "${ALIYUN_ACCESS_SECRET}" \
          --region "${ALIYUN_REGION}"
        # Keep `--region` on the same logical command: without the line
        # continuation it would be executed as a separate (failing) command.
        ./aliyun ecs StartInstance --InstanceId "${ALIYUN_ECS_INSTANCE_ID}" \
          --region "${ALIYUN_REGION}"
        # Poll every 10 seconds until the status output contains "Running".
        until ./aliyun ecs DescribeInstanceStatus --InstanceId "${ALIYUN_ECS_INSTANCE_ID}" --region "${ALIYUN_REGION}" | grep "Running"; do
          sleep 10
        done
# Permission gate for the self-hosted jobs. Publishes the job output
# `allowed`, which is `true` only when the user that triggered the run
# (GITHUB_ACTOR) appears in the `allowed_users=` list of /root/.workflow.cfg
# on the runner. Downstream jobs test this output in their `if:` condition.
check-permissions:
  name: Check User Permissions
  needs: start-ecs-runner
  runs-on: self-hosted
  outputs:
    allowed: ${{ steps.check_user.outputs.allowed }}
  steps:
    - name: Check if user is allowed
      id: check_user
      # Process substitution below needs bash.
      shell: bash
      run: |
        config=/root/.workflow.cfg
        allowed=false
        if [ -f "${config}" ]; then
          # Line format: allowed_users=alice,bob,carol
          # Each entry is compared literally (quoted text in a `case` pattern
          # is not a glob), so a name such as `dependabot[bot]` cannot be
          # misread as pattern syntax.
          while IFS= read -r entry; do
            case ",${entry#allowed_users=}," in
              *",${GITHUB_ACTOR},"*) allowed=true ;;
            esac
          done < <(grep '^allowed_users=' "${config}")
          if [ "${allowed}" != "true" ]; then
            echo "User ${GITHUB_ACTOR} is not authorized to trigger this workflow"
          fi
        else
          # Fail closed: when the config file is missing, nobody is allowed.
          echo "Permission config ${config} not found; denying by default"
        fi
        echo "allowed=${allowed}" >> "$GITHUB_OUTPUT"
# Makes sure both base images are present in GHCR. Each image is built and
# pushed only when `docker manifest inspect` cannot find it.
build-base-images:
  name: Build Base Images
  needs: check-permissions
  # Runs only when the permission gate reported `allowed == 'true'`.
  if: needs.check-permissions.outputs.allowed == 'true'
  runs-on: self-hosted
  strategy:
    matrix:
      python: ['3.11']
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Login to GitHub Container Registry
      uses: docker/login-action@v2
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v2

    # --- base test image -------------------------------------------------
    - name: Check Test Image Existence
      id: check_test_image
      run: |
        # Default to "missing"; flip only when the manifest lookup succeeds.
        exists=false
        if docker manifest inspect ghcr.io/${{ github.repository }}/oss-ci-base_test-py${{ matrix.python }} > /dev/null 2>&1; then
          exists=true
        fi
        echo "exists=${exists}" >> $GITHUB_OUTPUT

    - name: Build Base Test Image
      if: steps.check_test_image.outputs.exists != 'true'
      uses: docker/build-push-action@v3
      with:
        context: .
        file: ci/docker/base.test.Dockerfile
        tags: |
          ghcr.io/${{ github.repository }}/oss-ci-base_test-py${{ matrix.python }}
        push: true

    # --- base build image (uses the test image as its base) --------------
    - name: Check Build Image Existence
      id: check_build_image
      run: |
        exists=false
        if docker manifest inspect ghcr.io/${{ github.repository }}/oss-ci-base_build-py${{ matrix.python }} > /dev/null 2>&1; then
          exists=true
        fi
        echo "exists=${exists}" >> $GITHUB_OUTPUT

    - name: Build OSS CI Base
      if: steps.check_build_image.outputs.exists != 'true'
      uses: docker/build-push-action@v3
      with:
        context: .
        file: ci/docker/base.build.Dockerfile
        tags: ghcr.io/${{ github.repository }}/oss-ci-base_build-py${{ matrix.python }}
        build-args: |
          DOCKER_IMAGE_BASE_TEST=ghcr.io/${{ github.repository }}/oss-ci-base_test-py${{ matrix.python }}
        push: true
# Builds the core and manylinux CI images when they are not present locally,
# runs the test suites through `//ci/ray_ci:test_in_docker`, then generates
# the report and copies it to /data1/static/ci/<sha> on the runner.
# Steps after the image builds use `if: '!cancelled()'`, so a failing suite
# does not prevent the remaining suites and the report steps from running.
core-tests:
  name: Core Tests
  env:
    BAZEL_DIR: ${{ github.workspace }}/../tmp/bazel/${{ github.sha }}
    RAYCI_BUILD_ID: main
    RAY_DIR: ${{ github.workspace }}
    RAYCI_CHECKOUT_DIR: ${{ github.workspace }}
    COMMIT_HASH: ${{ github.sha }}
  needs: [build-base-images, check-permissions]
  if: needs.check-permissions.outputs.allowed == 'true'
  runs-on: self-hosted
  strategy:
    matrix:
      python: ['3.11']
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Login to GitHub Container Registry
      uses: docker/login-action@v2
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v2

    # --- CI images -------------------------------------------------------
    - name: Check Core Image Existence
      id: check_core_image
      run: |
        if docker image ls --format '{{.Repository}}:{{.Tag}}' | grep -q "localhost:5000/citemp:${{ env.RAYCI_BUILD_ID }}-corebuild"; then
          echo "exists=true" >> "$GITHUB_OUTPUT"
        else
          echo "exists=false" >> "$GITHUB_OUTPUT"
        fi

    - name: Build core CI Base
      if: steps.check_core_image.outputs.exists != 'true'
      uses: docker/build-push-action@v3
      with:
        context: .
        file: ci/docker/core.build.Dockerfile
        tags: |
          localhost:5000/citemp:${{ env.RAYCI_BUILD_ID }}-corebuild
        build-args: |
          RAYCI_IS_GPU_BUILD=false
          BUILDKITE=true
          DOCKER_IMAGE_BASE_BUILD=ghcr.io/${{ github.repository }}/oss-ci-base_build-py${{ matrix.python }}
        push: true

    - name: Check Manylinux Image Existence
      id: check_manylinux_image
      run: |
        if docker image ls --format '{{.Repository}}:{{.Tag}}' | grep -q "localhost:5000/citemp:${{ env.RAYCI_BUILD_ID }}-manylinux"; then
          echo "exists=true" >> "$GITHUB_OUTPUT"
        else
          echo "exists=false" >> "$GITHUB_OUTPUT"
        fi

    - name: Build Manylinux Image
      if: steps.check_manylinux_image.outputs.exists != 'true'
      uses: docker/build-push-action@v3
      with:
        context: .
        file: ci/docker/manylinux.Dockerfile
        tags: |
          localhost:5000/citemp:${{ env.RAYCI_BUILD_ID }}-manylinux
        push: true
        # BUILDKITE_BAZEL_CACHE_URL is listed without a value.
        # NOTE(review): presumably meant to be taken from the environment -
        # confirm it is set on the runner.
        build-args: |
          BUILDKITE_BAZEL_CACHE_URL
          HOSTTYPE=x86_64

    # --- Python test suites ----------------------------------------------
    - name: Run Core Python Tests (1)
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        echo "Running core: python tests..."
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... core \
          --workers 1 --worker-id 0 --parallelism-per-worker 12 \
          --except-tags debug_tests,asan_tests,post_wheel_build,ha_integration,mem_pressure,tmpfs,container,manual,use_all_core,multi_gpu,large_size_python_tests_shard_2 \
          --test-env=BAZEL_DIR=$(realpath ${{ env.BAZEL_DIR }})

    - name: Run Core Python Tests (2)
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        echo "Running core: python tests..."
        # `--skip-ray-installation` is passed once; the original repeated it.
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/dag/... python/ray/autoscaler/v2/... core \
          --workers 1 --worker-id 0 --parallelism-per-worker 12 --skip-ray-installation \
          --except-tags debug_tests,asan_tests,post_wheel_build,ha_integration,mem_pressure,tmpfs,container,manual,use_all_core,multi_gpu,large_size_python_tests_shard_2 \
          --test-env=BAZEL_DIR=$(realpath ${{ env.BAZEL_DIR }})

    - name: Run Core Python Tests (3)
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        #echo "Running core: python tests..."
        # Only the tests tagged `use_all_core`, which steps (1) and (2) exclude.
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... //python/ray/dag/... python/ray/autoscaler/v2/... core \
          --workers 1 --worker-id 0 --parallelism-per-worker 12 --skip-ray-installation \
          --only-tags use_all_core \
          --test-env=BAZEL_DIR=$(realpath ${{ env.BAZEL_DIR }})

    # Rebuilds the `-corebuild` tag from core.build.ant.Dockerfile, using the
    # existing `-corebuild` image as its base. This step only tags the result
    # locally; it does not push.
    - name: Update Test Image
      if: '!cancelled()'
      run: |
        docker build -t localhost:5000/citemp:${{ env.RAYCI_BUILD_ID }}-corebuild -f ci/docker/core.build.ant.Dockerfile \
          --build-arg DOCKER_IMAGE_BASE=localhost:5000/citemp:${{ env.RAYCI_BUILD_ID }}-corebuild \
          .

    # --- Remaining suites ------------------------------------------------
    - name: Run Core Cpp Tests
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        echo "Running core: cpp tests..."
        bazel run //ci/ray_ci:test_in_docker -- //:all //src/... core --build-type clang \
          --workers 1 --worker-id 0 --parallelism-per-worker 12 --skip-ray-installation \
          --test-env=BAZEL_DIR=$(realpath ${{ env.BAZEL_DIR }})

    - name: Run Dashboard Tests
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        echo "Running dashboard tests..."
        bazel run //ci/ray_ci:test_in_docker -- python/ray/dashboard/... core \
          --parallelism-per-worker 12 --skip-ray-installation \
          --test-env=BAZEL_DIR=$(realpath ${{ env.BAZEL_DIR }})

    - name: Run Workflow Tests
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        echo "Running workflow tests..."
        # First pass: everything except `use_all_core`, 12-way parallel.
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/workflow/... core \
          --workers 1 --worker-id 0 \
          --except-tags use_all_core \
          --parallelism-per-worker 12 \
          --skip-ray-installation \
          --test-env=BAZEL_DIR=$(realpath ${{ env.BAZEL_DIR }})
        # Second pass: only `use_all_core`, with no parallelism flag.
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/workflow/... core \
          --workers 1 --worker-id 0 \
          --skip-ray-installation \
          --only-tags use_all_core \
          --test-env=BAZEL_DIR=$(realpath ${{ env.BAZEL_DIR }})

    - name: Run Debug Tests
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        echo "Running debug tests..."
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... core \
          --build-type debug \
          --parallelism-per-worker 12 \
          --only-tags debug_tests \
          --except-tags kubernetes,manual \
          --test-env=BAZEL_DIR=$(realpath ${{ env.BAZEL_DIR }}) \
          --skip-ray-installation

    - name: Run ASAN Tests
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        echo "Running ASAN tests..."
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... core \
          --build-type asan \
          --parallelism-per-worker 12 \
          --only-tags asan_tests \
          --except-tags kubernetes,manual \
          --skip-ray-installation \
          --test-env=BAZEL_DIR=$(realpath ${{ env.BAZEL_DIR }})

    # --- Reporting and cleanup -------------------------------------------
    # Exports the resolved BAZEL_DIR as `artifacts_path` for later steps.
    # NOTE(review): presumably read by ci/ray_ci/report_gen.py - confirm.
    - name: Workaround
      if: '!cancelled()'
      run: |
        echo "artifacts_path=$(realpath ${{ env.BAZEL_DIR }})" >> "$GITHUB_ENV"

    - name: Generate Report
      if: '!cancelled()'
      run: |
        python ci/ray_ci/report_gen.py

    - name: Cleanup Containers
      if: always()
      continue-on-error: true
      run: |
        docker ps -a --filter ancestor=localhost:5000/citemp:${{ env.RAYCI_BUILD_ID }}-corebuild -q | xargs -r docker rm --force || true
        # Clean up any dangling containers
        docker ps -a --filter status=exited -q | xargs -r docker rm || true

    - name: Copy Report Files
      if: '!cancelled()'
      continue-on-error: true
      run: |
        mkdir -p /data1/static/ci/${{ github.sha }}
        cp -r ${{ env.BAZEL_DIR }}/reports/* /data1/static/ci/${{ github.sha }}/ || true

    - name: Get Public IP
      if: '!cancelled()'
      continue-on-error: true
      run: |
        # Take the fallback from curl's own exit status. Chaining `||` after
        # `echo "...$(curl ...)"` never fires, because `echo` succeeds even
        # when the command substitution fails.
        public_ip="$(curl -fsS https://api.ipify.org)" || public_ip=unknown
        echo "RUNNER_PUBLIC_IP=${public_ip:-unknown}" >> "$GITHUB_ENV"

    - name: Show Report Location
      if: '!cancelled()'
      run: |
        if [[ -n "${RUNNER_PUBLIC_IP}" && "${RUNNER_PUBLIC_IP}" != "unknown" ]]; then
          echo "Report files are available at: http://${RUNNER_PUBLIC_IP}:8000/${{ github.sha }}/index.html"
        else
          echo "Could not determine report location due to IP resolution failure"
        fi
# Force-stops the Aliyun ECS instance and blocks until its status is
# `Stopped`. `if: always()` makes this run whether the jobs listed in `needs`
# succeeded, failed, or were skipped.
stop-ecs-runner:
  name: Stop ECS Instance
  environment: aliyun
  needs:
    - start-ecs-runner
    - check-permissions
    - build-base-images
    - core-tests
  if: always()
  runs-on: ubuntu-latest
  steps:
    - name: Configure Aliyun CLI
      # Secrets reach the script as environment variables instead of being
      # spliced into the script text, so their contents are never re-parsed
      # by the shell.
      env:
        ALIYUN_ACCESS_KEY: ${{ secrets.ALIYUN_ACCESS_KEY }}
        ALIYUN_ACCESS_SECRET: ${{ secrets.ALIYUN_ACCESS_SECRET }}
        ALIYUN_ECS_INSTANCE_ID: ${{ secrets.ALIYUN_ECS_INSTANCE_ID }}
        ALIYUN_REGION: us-west-1
      run: |
        wget https://aliyuncli.alicdn.com/aliyun-cli-linux-latest-amd64.tgz
        tar -xvf aliyun-cli-linux-latest-amd64.tgz
        ./aliyun configure set --profile default \
          --access-key-id "${ALIYUN_ACCESS_KEY}" \
          --access-key-secret "${ALIYUN_ACCESS_SECRET}" \
          --region "${ALIYUN_REGION}"
        ./aliyun ecs StopInstance --InstanceId "${ALIYUN_ECS_INSTANCE_ID}" --ForceStop true --region "${ALIYUN_REGION}"
        # Poll every 10 seconds until the status output contains "Stopped".
        until ./aliyun ecs DescribeInstanceStatus --InstanceId "${ALIYUN_ECS_INSTANCE_ID}" --region "${ALIYUN_REGION}" | grep "Stopped"; do
          sleep 10
        done