[CI] use self-hosted longrun runner #256
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Ray CI workflow: triggered by pull requests that target the main branch.
name: Ray CI

on:
  pull_request:
    branches: [main]

# Environment variables visible to every job in this workflow.
env:
  PYTHON_VERSION: '3.11'
  DOCKER_BUILDKIT: '1'
  RAY_CI_POST_WHEEL_TESTS: '1'

# One concurrency group per workflow + ref; a newer run cancels the one
# that is still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
# Leases one stopped ECS instance from the configured pool, starts it,
# attaches its disk and publishes both ids as job outputs.
start-ecs-runner:
  name: Start ECS Runner
  runs-on: longrun  # this is only used to trigger ecs runner
  environment: aliyun
  # All runs of this workflow share one group for this job, and an
  # in-progress lease is never cancelled by a newer run.
  concurrency:
    group: ${{ github.workflow }}
    cancel-in-progress: false
  outputs:
    leased_instance_id: ${{ steps.lease_instance.outputs.instance_id }}
    leased_disk_id: ${{ steps.lease_instance.outputs.disk_id }}
  steps:
    - name: Lease and Start Instance
      id: lease_instance
      env:
        # Comma-separated "instance_id:disk_id" pairs. Passed through the
        # environment so the secret is never spliced into the script text.
        ALIYUN_ECS_INSTANCE_IDS: ${{ secrets.ALIYUN_ECS_INSTANCE_IDS }}
      run: |
        cd ~/actions-runner

        region=us-west-1
        max_attempts=30      # retries for StartInstance and for AttachDisk
        max_status_polls=60  # upper bound on polls while waiting for "Running"
        poll_interval=10     # seconds between retries / polls

        # Print the current status string of the instance given as $1.
        instance_status() {
          ./aliyun ecs DescribeInstances --InstanceIds "[\"$1\"]" --region "$region" \
            | jq -r '.Instances.Instance[0].Status'
        }

        # Poll until instance $1 reports "Running"; give up (return 1) after
        # max_status_polls polls instead of blocking the runner forever.
        wait_until_running() {
          local poll
          for poll in $(seq 1 "$max_status_polls"); do
            if [ "$(instance_status "$1")" = "Running" ]; then
              return 0
            fi
            sleep "$poll_interval"
          done
          return 1
        }

        # Parse instance:disk pairs
        IFS=',' read -ra INSTANCE_PAIRS <<< "$ALIYUN_ECS_INSTANCE_IDS"

        # Keep scanning the pool until one instance has been leased.
        while true; do
          for pair in "${INSTANCE_PAIRS[@]}"; do
            IFS=':' read -r instance_id disk_id <<< "$pair"

            # Only instances that are currently stopped are free to lease.
            if [ "$(instance_status "$instance_id")" != "Stopped" ]; then
              continue
            fi

            # Try to start this instance
            for i in $(seq 1 "$max_attempts"); do
              if ! ./aliyun ecs StartInstance --InstanceId "$instance_id" --region "$region"; then
                echo "StartInstance failed on attempt $i"
                if [ "$i" -eq "$max_attempts" ]; then
                  echo "All StartInstance attempts failed - trying next instance"
                else
                  sleep "$poll_interval"
                fi
                continue
              fi
              echo "StartInstance succeeded on attempt $i"

              # Wait for instance to be running
              if ! wait_until_running "$instance_id"; then
                echo "Instance did not reach Running in time - trying next instance"
                break
              fi

              # Try to attach disk
              for j in $(seq 1 "$max_attempts"); do
                if ./aliyun ecs AttachDisk --InstanceId "$instance_id" --DiskId "$disk_id" --region "$region"; then
                  echo "AttachDisk succeeded on attempt $j"
                  # Set outputs for use in subsequent jobs
                  echo "instance_id=$instance_id" >> "$GITHUB_OUTPUT"
                  echo "disk_id=$disk_id" >> "$GITHUB_OUTPUT"
                  exit 0
                fi
                echo "AttachDisk failed on attempt $j"
                if [ "$j" -eq "$max_attempts" ]; then
                  # NOTE(review): the instance stays started in this case and
                  # nothing here stops it again - confirm that is intended.
                  echo "All AttachDisk attempts failed - trying next instance"
                  break
                fi
                sleep "$poll_interval"
              done
              break
            done
          done
          echo "No available instances found in this iteration, waiting before retrying..."
          sleep 60  # Wait 1 minute before trying again
        done
# Decides whether the actor that triggered the run may use the leased runner.
# Publishes `allowed` ("true"/"false"), which later jobs gate on.
check-permissions:
  needs: start-ecs-runner
  name: Check User Permissions
  runs-on: ${{ needs.start-ecs-runner.outputs.leased_instance_id }}
  outputs:
    allowed: ${{ steps.check_user.outputs.allowed }}
  steps:
    - name: Check if user is allowed
      id: check_user
      run: |
        cfg=/home/runner/.workflow.cfg
        allowed=false

        # The config is read only after the existence check, so a missing
        # file yields allowed=false instead of aborting the step. Its
        # contents are deliberately not echoed to the log.
        if [ -f "$cfg" ]; then
          # Value(s) of every "allowed_users=<a>,<b>,..." line.
          allowed_users=$(sed -n 's/^allowed_users=//p' "$cfg")
          # Exact, fixed-string comparison per entry: the actor name is never
          # interpreted as a regular expression.
          if tr ',' '\n' <<< "$allowed_users" | grep -Fxq -- "$GITHUB_ACTOR"; then
            allowed=true
          else
            echo "User ${GITHUB_ACTOR} is not authorized to trigger this workflow"
          fi
        else
          echo "Permission config $cfg not found - denying by default"
        fi

        echo "allowed=$allowed" >> "$GITHUB_OUTPUT"
# Builds and pushes the base test/build images to GHCR, skipping each image
# whose manifest can already be inspected in the registry.
build-base-images:
  name: Build Base Images
  needs:
    - check-permissions
    - start-ecs-runner
  if: needs.check-permissions.outputs.allowed == 'true'
  runs-on: ${{ needs.start-ecs-runner.outputs.leased_instance_id }}
  strategy:
    matrix:
      python:
        - '3.11'
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Login to GitHub Container Registry
      uses: docker/login-action@v2
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v2

    # Record whether the base test image is already published.
    - name: Check Test Image Existence
      id: check_test_image
      run: |
        exists=false
        if docker manifest inspect ghcr.io/${{ github.repository }}/oss-ci-base_test-py${{ matrix.python }} > /dev/null 2>&1; then
          exists=true
        fi
        echo "exists=$exists" >> $GITHUB_OUTPUT

    - name: Build Base Test Image
      if: steps.check_test_image.outputs.exists != 'true'
      uses: docker/build-push-action@v3
      with:
        context: .
        file: ci/docker/base.test.Dockerfile
        push: true
        tags: |
          ghcr.io/${{ github.repository }}/oss-ci-base_test-py${{ matrix.python }}

    # Record whether the base build image is already published.
    - name: Check Build Image Existence
      id: check_build_image
      run: |
        exists=false
        if docker manifest inspect ghcr.io/${{ github.repository }}/oss-ci-base_build-py${{ matrix.python }} > /dev/null 2>&1; then
          exists=true
        fi
        echo "exists=$exists" >> $GITHUB_OUTPUT

    # The build image is layered on the base test image via a build arg.
    - name: Build OSS CI Base
      if: steps.check_build_image.outputs.exists != 'true'
      uses: docker/build-push-action@v3
      with:
        context: .
        file: ci/docker/base.build.Dockerfile
        push: true
        tags: ghcr.io/${{ github.repository }}/oss-ci-base_build-py${{ matrix.python }}
        build-args: |
          DOCKER_IMAGE_BASE_TEST=ghcr.io/${{ github.repository }}/oss-ci-base_test-py${{ matrix.python }}
# Builds the core CI images, runs every core test suite through
# //ci/ray_ci:test_in_docker, writes one report per suite plus a summary,
# and uploads the reports as an artifact.
core-tests:
  name: Core Tests
  env:
    BAZEL_DIR: ${{ github.workspace }}/../tmp/bazel/${{ github.sha }}
    RAYCI_BUILD_ID: main
    RAY_DIR: ${{ github.workspace }}
    RAYCI_CHECKOUT_DIR: ${{ github.workspace }}
    COMMIT_HASH: ${{ github.sha }}
    # One report label per test suite; all are passed to the summary step.
    REPORT_LABEL_1: core_python_tests_1
    REPORT_LABEL_2: core_python_tests_2
    REPORT_LABEL_3: core_python_tests_3
    REPORT_LABEL_4: core_cpp_tests
    REPORT_LABEL_5: dashboard_tests
    REPORT_LABEL_6: workflow_tests
    REPORT_LABEL_7: debug_tests
    REPORT_LABEL_8: asan_tests
  needs: [build-base-images, check-permissions, start-ecs-runner]
  if: needs.check-permissions.outputs.allowed == 'true'
  runs-on: ${{ needs.start-ecs-runner.outputs.leased_instance_id }}
  strategy:
    matrix:
      python: ['3.11']
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Login to GitHub Container Registry
      uses: docker/login-action@v2
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v2

    # The registry must be up before any step pushes to localhost:5000,
    # so it is started ahead of the image builds.
    - name: Start local registry
      if: '!cancelled()'
      run: |
        # Check if registry container exists and handle accordingly
        if ! docker ps -a --format '{{.Names}}' | grep -q '^local-registry$'; then
          docker run -d -p 5000:5000 --name local-registry registry:2
        elif ! docker ps --format '{{.Names}}' | grep -q '^local-registry$'; then
          docker start local-registry
        fi

    - name: Check Core Image Existence
      id: check_core_image
      run: |
        if docker image ls --format '{{.Repository}}:{{.Tag}}' | grep -q "localhost:5000/citemp:${{ env.RAYCI_BUILD_ID }}-corebuild"; then
          echo "exists=true" >> $GITHUB_OUTPUT
        else
          echo "exists=false" >> $GITHUB_OUTPUT
        fi

    - name: Build core CI Base
      if: steps.check_core_image.outputs.exists != 'true'
      uses: docker/build-push-action@v3
      with:
        context: .
        file: ci/docker/core.build.Dockerfile
        tags: |
          localhost:5000/citemp:${{ env.RAYCI_BUILD_ID }}-corebuild
        build-args: |
          RAYCI_IS_GPU_BUILD=false
          BUILDKITE=true
          DOCKER_IMAGE_BASE_BUILD=ghcr.io/${{ github.repository }}/oss-ci-base_build-py${{ matrix.python }}
        push: true

    # Up to three push attempts, ten seconds apart; the step fails only
    # when every attempt fails.
    - name: Push core build image to local registry
      if: '!cancelled()'
      run: |
        max_attempts=3
        for i in $(seq 1 $max_attempts); do
          if docker push localhost:5000/citemp:${{ env.RAYCI_BUILD_ID }}-corebuild; then
            echo "Docker push succeeded on attempt $i"
            break
          else
            echo "Docker push failed on attempt $i"
            if [ $i -eq $max_attempts ]; then
              echo "All docker push attempts failed - exiting"
              exit 1
            fi
            sleep 10
          fi
        done

    - name: Install bazel
      if: '!cancelled()'
      run: |
        bash ci/env/install-bazel.sh

    - name: Check Manylinux Image Existence
      id: check_manylinux_image
      run: |
        if docker image ls --format '{{.Repository}}:{{.Tag}}' | grep -q "localhost:5000/citemp:${{ env.RAYCI_BUILD_ID }}-manylinux"; then
          echo "exists=true" >> $GITHUB_OUTPUT
        else
          echo "exists=false" >> $GITHUB_OUTPUT
        fi

    - name: Build Manylinux Image
      if: steps.check_manylinux_image.outputs.exists != 'true'
      uses: docker/build-push-action@v3
      with:
        context: .
        file: ci/docker/manylinux.Dockerfile
        tags: |
          localhost:5000/citemp:${{ env.RAYCI_BUILD_ID }}-manylinux
        push: true
        build-args: |
          BUILDKITE_BAZEL_CACHE_URL
          HOSTTYPE=x86_64

    # Each suite below is followed by the same two steps: open up BAZEL_DIR
    # permissions, then generate that suite's report. All of them use
    # '!cancelled()' so a failing suite does not skip the later ones.
    - name: Run Core Python Tests (1)
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        echo "Running core: python tests..."
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... core \
          --workers 1 --worker-id 0 --parallelism-per-worker 12 \
          --except-tags debug_tests,asan_tests,post_wheel_build,ha_integration,mem_pressure,tmpfs,container,manual,use_all_core,multi_gpu,large_size_python_tests_shard_2 \
          --test-env=BAZEL_DIR=$(realpath ${{ env.BAZEL_DIR }})

    - name: Grant Permissions
      if: '!cancelled()'
      run: |
        sudo chmod -R 777 ${{ env.BAZEL_DIR }}

    - name: Generate Report For Label
      if: '!cancelled()'
      env:
        REPORT_LABEL: ${{ env.REPORT_LABEL_1 }}
      run: |
        python ci/ray_ci/report_gen.py

    - name: Run Core Python Tests (2)
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        echo "Running core: python tests..."
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/dag/... python/ray/autoscaler/v2/... core \
          --workers 1 --worker-id 0 --parallelism-per-worker 12 --skip-ray-installation \
          --except-tags debug_tests,asan_tests,post_wheel_build,ha_integration,mem_pressure,tmpfs,container,manual,use_all_core,multi_gpu,large_size_python_tests_shard_2 \
          --test-env=BAZEL_DIR=$(realpath ${{ env.BAZEL_DIR }})

    - name: Grant Permissions
      if: '!cancelled()'
      run: |
        sudo chmod -R 777 ${{ env.BAZEL_DIR }}

    - name: Generate Report For Label
      if: '!cancelled()'
      env:
        REPORT_LABEL: ${{ env.REPORT_LABEL_2 }}
      run: |
        python ci/ray_ci/report_gen.py

    # Runs only the tests tagged use_all_core from the two suites above.
    - name: Run Core Python Tests (3)
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        #echo "Running core: python tests..."
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... //python/ray/dag/... python/ray/autoscaler/v2/... core \
          --workers 1 --worker-id 0 --parallelism-per-worker 12 --skip-ray-installation \
          --only-tags use_all_core \
          --test-env=BAZEL_DIR=$(realpath ${{ env.BAZEL_DIR }})

    - name: Grant Permissions
      if: '!cancelled()'
      run: |
        sudo chmod -R 777 ${{ env.BAZEL_DIR }}

    - name: Generate Report For Label
      if: '!cancelled()'
      env:
        REPORT_LABEL: ${{ env.REPORT_LABEL_3 }}
      run: |
        python ci/ray_ci/report_gen.py

    # Rebuilds the corebuild tag on top of itself using the ant Dockerfile.
    - name: Update Test Image
      if: '!cancelled()'
      run: |
        docker build -t localhost:5000/citemp:${{ env.RAYCI_BUILD_ID }}-corebuild -f ci/docker/core.build.ant.Dockerfile \
          --build-arg DOCKER_IMAGE_BASE=localhost:5000/citemp:${{ env.RAYCI_BUILD_ID }}-corebuild \
          .

    - name: Run Core Cpp Tests
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        echo "Running core: cpp tests..."
        bazel run //ci/ray_ci:test_in_docker -- //:all //src/... core --build-type clang \
          --workers 1 --worker-id 0 --parallelism-per-worker 12 --skip-ray-installation \
          --test-env=BAZEL_DIR=$(realpath ${{ env.BAZEL_DIR }})

    - name: Grant Permissions
      if: '!cancelled()'
      run: |
        sudo chmod -R 777 ${{ env.BAZEL_DIR }}

    - name: Generate Report For Label
      if: '!cancelled()'
      env:
        REPORT_LABEL: ${{ env.REPORT_LABEL_4 }}
      run: |
        python ci/ray_ci/report_gen.py

    - name: Run Dashboard Tests
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        echo "Running dashboard tests..."
        bazel run //ci/ray_ci:test_in_docker -- python/ray/dashboard/... core \
          --parallelism-per-worker 12 --skip-ray-installation \
          --test-env=BAZEL_DIR=$(realpath ${{ env.BAZEL_DIR }})

    - name: Grant Permissions
      if: '!cancelled()'
      run: |
        sudo chmod -R 777 ${{ env.BAZEL_DIR }}

    - name: Generate Report For Label
      if: '!cancelled()'
      env:
        REPORT_LABEL: ${{ env.REPORT_LABEL_5 }}
      run: |
        python ci/ray_ci/report_gen.py

    # Two passes: everything except use_all_core, then only use_all_core.
    - name: Run Workflow Tests
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        echo "Running workflow tests..."
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/workflow/... core \
          --workers 1 --worker-id 0 \
          --except-tags use_all_core \
          --parallelism-per-worker 12 \
          --skip-ray-installation \
          --test-env=BAZEL_DIR=$(realpath ${{ env.BAZEL_DIR }})
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/workflow/... core \
          --workers 1 --worker-id 0 \
          --skip-ray-installation \
          --only-tags use_all_core \
          --test-env=BAZEL_DIR=$(realpath ${{ env.BAZEL_DIR }})

    - name: Grant Permissions
      if: '!cancelled()'
      run: |
        sudo chmod -R 777 ${{ env.BAZEL_DIR }}

    - name: Generate Report For Label
      if: '!cancelled()'
      env:
        REPORT_LABEL: ${{ env.REPORT_LABEL_6 }}
      run: |
        python ci/ray_ci/report_gen.py

    - name: Run Debug Tests
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        echo "Running debug tests..."
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... core \
          --build-type debug \
          --parallelism-per-worker 12 \
          --only-tags debug_tests \
          --except-tags kubernetes,manual \
          --test-env=BAZEL_DIR=$(realpath ${{ env.BAZEL_DIR }}) \
          --skip-ray-installation

    - name: Grant Permissions
      if: '!cancelled()'
      run: |
        sudo chmod -R 777 ${{ env.BAZEL_DIR }}

    - name: Generate Report For Label
      if: '!cancelled()'
      env:
        REPORT_LABEL: ${{ env.REPORT_LABEL_7 }}
      run: |
        python ci/ray_ci/report_gen.py

    - name: Run ASAN Tests
      if: '!cancelled()'
      env:
        RAYCI_WORK_REPO: localhost:5000/citemp
      run: |
        echo "Running ASAN tests..."
        bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... core \
          --build-type asan \
          --parallelism-per-worker 12 \
          --only-tags asan_tests \
          --except-tags kubernetes,manual \
          --skip-ray-installation \
          --test-env=BAZEL_DIR=$(realpath ${{ env.BAZEL_DIR }})

    - name: Grant Permissions
      if: '!cancelled()'
      run: |
        sudo chmod -R 777 ${{ env.BAZEL_DIR }}

    - name: Generate Report For Label
      if: '!cancelled()'
      env:
        REPORT_LABEL: ${{ env.REPORT_LABEL_8 }}
      run: |
        python ci/ray_ci/report_gen.py

    # Exports the resolved BAZEL_DIR so the upload step can reference it
    # through the env context.
    - name: Workaround
      if: '!cancelled()'
      run: |
        echo "artifacts_path=$(realpath ${{ env.BAZEL_DIR }})" >> $GITHUB_ENV

    - name: Generate Report
      if: '!cancelled()'
      run: |
        python ci/ray_ci/report_gen.py summary ${{ env.REPORT_LABEL_1 }} ${{ env.REPORT_LABEL_2 }} ${{ env.REPORT_LABEL_3 }} ${{ env.REPORT_LABEL_4 }} ${{ env.REPORT_LABEL_5 }} ${{ env.REPORT_LABEL_6 }} ${{ env.REPORT_LABEL_7 }} ${{ env.REPORT_LABEL_8 }}

    # Best-effort cleanup: runs even on cancellation and never fails the job.
    - name: Cleanup Containers
      if: always()
      continue-on-error: true
      run: |
        docker ps -a --filter ancestor=localhost:5000/citemp:${{ env.RAYCI_BUILD_ID }}-corebuild -q | xargs -r docker rm --force || true
        # Clean up any dangling containers
        docker ps -a --filter status=exited -q | xargs -r docker rm || true

    - name: Upload Test Reports
      id: artifact-upload-step
      continue-on-error: true
      if: '!cancelled()'
      uses: actions/upload-artifact@v4
      with:
        name: test-reports-${{ github.sha }}
        path: ${{ env.artifacts_path }}/reports/
        retention-days: 90

    - name: Create Summary
      if: '!cancelled()'
      run: |
        echo "### 📊 Test Reports Available" >> $GITHUB_STEP_SUMMARY
        echo "" >> $GITHUB_STEP_SUMMARY
        echo "View detailed test results at:" >> $GITHUB_STEP_SUMMARY
        echo "🔗 [Ant Ray Dashboard](https://ant-ray.streamlit.app?run_id=${{ github.run_id }}&artifact_id=${{ steps.artifact-upload-step.outputs.artifact-id }})" >> $GITHUB_STEP_SUMMARY
        echo "" >> $GITHUB_STEP_SUMMARY
        echo "This link contains test reports and analysis for commit \`${{ github.sha }}\`" >> $GITHUB_STEP_SUMMARY
# Stops the leased ECS instance once all other jobs have finished.
# `if: always()` makes this run even when earlier jobs failed or were
# cancelled.
stop-ecs-runner:
  name: Stop ECS Instance
  environment: aliyun
  needs:
    - start-ecs-runner
    - check-permissions
    - build-base-images
    - core-tests
  if: always()
  runs-on: longrun
  steps:
    - name: Stop Instance
      env:
        # Empty when the lease job never published an instance id.
        INSTANCE_ID: ${{ needs.start-ecs-runner.outputs.leased_instance_id }}
      run: |
        # Nothing to release if no instance was leased.
        if [ -z "$INSTANCE_ID" ]; then
          echo "No instance was leased - nothing to stop"
          exit 0
        fi

        cd ~/actions-runner

        # Request the stop, retrying up to max_attempts times.
        max_attempts=3
        for i in $(seq 1 $max_attempts); do
          if ./aliyun ecs StopInstance --InstanceId "$INSTANCE_ID" --ForceStop true --region us-west-1; then
            echo "StopInstance succeeded on attempt $i"
            break
          else
            echo "StopInstance failed on attempt $i"
            if [ $i -eq $max_attempts ]; then
              echo "All attempts failed - exiting"
              exit 1
            fi
            sleep 10
          fi
        done

        # Wait for the instance to report "Stopped", polling every 10 seconds
        # for a bounded number of polls rather than forever.
        max_status_polls=60
        for poll in $(seq 1 $max_status_polls); do
          current_status=$(./aliyun ecs DescribeInstances --InstanceIds "[\"$INSTANCE_ID\"]" --region us-west-1 | jq -r '.Instances.Instance[0].Status')
          if [ "$current_status" = "Stopped" ]; then
            exit 0
          fi
          sleep 10
        done
        echo "Instance did not reach Stopped after $max_status_polls polls"
        exit 1