From 6dc3d4e35001ea16e6f7cf70a0ead64884c17ae3 Mon Sep 17 00:00:00 2001 From: hbelmiro Date: Mon, 2 Sep 2024 15:52:22 -0300 Subject: [PATCH] Improvements to wait_for_pods function Signed-off-by: hbelmiro --- .github/workflows/backend.yml | 8 +- .github/workflows/e2e-test.yml | 30 ++++++++ .../kfp-kubernetes-execution-tests.yml | 11 +-- .github/workflows/kfp-samples.yml | 3 +- .../kubeflow-pipelines-integration-v2.yml | 1 + .github/workflows/periodic.yml | 4 + .github/workflows/sdk-execution.yml | 11 +-- .github/workflows/upgrade-test.yml | 6 ++ scripts/deploy/github/deploy-kfp-tekton.sh | 6 +- scripts/deploy/github/deploy-kfp.sh | 6 +- scripts/deploy/github/helper-functions.sh | 54 +------------ .../github/kfp-readiness/wait_for_pods.py | 76 +++++++++++++++++++ 12 files changed, 142 insertions(+), 74 deletions(-) create mode 100644 scripts/deploy/github/kfp-readiness/wait_for_pods.py diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index e11c177026d..faef43a0042 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -34,17 +34,17 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v4 - - name: Create KFP cluster - uses: ./.github/actions/kfp-tekton-cluster - - name: Set up Python 3.10 + - name: Set up Python 3.9 uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: '3.9' - name: Install sdk run: | python3 -m venv .venv . .venv/bin/activate pip install -e sdk/python + - name: Create KFP cluster + uses: ./.github/actions/kfp-tekton-cluster - name: "flip coin test" run: | . 
.venv/bin/activate diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index 9bb0ae250a7..bc7783bafda 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -23,6 +23,11 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Create KFP cluster uses: ./.github/actions/kfp-cluster @@ -46,6 +51,11 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Create KFP cluster uses: ./.github/actions/kfp-cluster @@ -69,6 +79,11 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Create KFP cluster uses: ./.github/actions/kfp-cluster @@ -92,6 +107,11 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Create KFP cluster uses: ./.github/actions/kfp-cluster @@ -115,6 +135,11 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Create KFP cluster uses: ./.github/actions/kfp-cluster @@ -144,6 +169,11 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Create KFP cluster uses: ./.github/actions/kfp-cluster diff --git a/.github/workflows/kfp-kubernetes-execution-tests.yml b/.github/workflows/kfp-kubernetes-execution-tests.yml index f382c1d8324..92c60ccf43d 100644 --- a/.github/workflows/kfp-kubernetes-execution-tests.yml +++ b/.github/workflows/kfp-kubernetes-execution-tests.yml @@ -7,6 +7,7 @@ on: pull_request: paths: - '.github/workflows/kfp-kubernetes-execution-tests.yml' + - 'scripts/deploy/github/**' - 'sdk/python/**' - 
'api/v2alpha1/**' - 'kubernetes_platform/**' @@ -18,17 +19,17 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' + - name: Create KFP cluster uses: ./.github/actions/kfp-cluster - name: Forward API port run: ./scripts/deploy/github/forward-port.sh "kubeflow" "ml-pipeline" 8888 8888 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.9' - - name: apt-get update run: sudo apt-get update diff --git a/.github/workflows/kfp-samples.yml b/.github/workflows/kfp-samples.yml index 08cbed5d14a..e63619f2574 100644 --- a/.github/workflows/kfp-samples.yml +++ b/.github/workflows/kfp-samples.yml @@ -6,6 +6,7 @@ on: - master pull_request: paths: + - 'scripts/deploy/github/**' - 'samples/**' - 'backend/src/v2/**' - '.github/workflows/kfp-samples.yml' @@ -21,7 +22,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: 3.8 + python-version: 3.9 - name: Create KFP cluster uses: ./.github/actions/kfp-cluster diff --git a/.github/workflows/kubeflow-pipelines-integration-v2.yml b/.github/workflows/kubeflow-pipelines-integration-v2.yml index 5de0b55b937..0afa4161701 100644 --- a/.github/workflows/kubeflow-pipelines-integration-v2.yml +++ b/.github/workflows/kubeflow-pipelines-integration-v2.yml @@ -7,6 +7,7 @@ on: pull_request: paths: - '.github/workflows/kubeflow-pipelines-integration-v2.yml' + - 'scripts/deploy/github/**' - 'samples' - 'core' - 'backend' diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 2f81dde8347..be95ad737e5 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -10,6 +10,10 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.9 - name: Create KFP cluster uses: ./.github/actions/kfp-cluster - name: Port forward kfp apiserver diff --git 
a/.github/workflows/sdk-execution.yml b/.github/workflows/sdk-execution.yml index 22f605fca9f..b646534fa68 100644 --- a/.github/workflows/sdk-execution.yml +++ b/.github/workflows/sdk-execution.yml @@ -7,6 +7,7 @@ on: pull_request: paths: - '.github/workflows/sdk-execution.yml' + - 'scripts/deploy/github/**' - 'sdk/python/**' - 'api/v2alpha1/**' @@ -17,17 +18,17 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Create KFP cluster uses: ./.github/actions/kfp-cluster - name: Forward API port run: ./scripts/deploy/github/forward-port.sh "kubeflow" "ml-pipeline" 8888 8888 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: 3.8 - - name: apt-get update run: sudo apt-get update diff --git a/.github/workflows/upgrade-test.yml b/.github/workflows/upgrade-test.yml index 83b13c9af5c..bd40b868549 100644 --- a/.github/workflows/upgrade-test.yml +++ b/.github/workflows/upgrade-test.yml @@ -7,6 +7,7 @@ on: pull_request: paths: - '.github/workflows/upgrade-test.yml' + - 'scripts/deploy/github/**' - 'backend/**' - 'manifests/kustomize/**' @@ -17,6 +18,11 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Create KFP cluster uses: ./.github/actions/kfp-cluster diff --git a/scripts/deploy/github/deploy-kfp-tekton.sh b/scripts/deploy/github/deploy-kfp-tekton.sh index c4e39633394..17df6278df4 100755 --- a/scripts/deploy/github/deploy-kfp-tekton.sh +++ b/scripts/deploy/github/deploy-kfp-tekton.sh @@ -40,16 +40,14 @@ then exit 1 fi -# Check if all pods are running - allow 20 retries (10 minutes) -wait_for_pods kubeflow 40 30 || EXIT_CODE=$? +# Check if all pods are running - (10 minutes) +wait_for_pods || EXIT_CODE=$? if [[ $EXIT_CODE -ne 0 ]] then echo "Deploy unsuccessful. Not all pods running." 
exit 1 fi -echo "List Kubeflow: " -kubectl get pod -n kubeflow collect_artifacts kubeflow echo "List Tekton control plane: " diff --git a/scripts/deploy/github/deploy-kfp.sh b/scripts/deploy/github/deploy-kfp.sh index 5da00d75219..6acd46293ba 100755 --- a/scripts/deploy/github/deploy-kfp.sh +++ b/scripts/deploy/github/deploy-kfp.sh @@ -41,16 +41,14 @@ then exit 1 fi -# Check if all pods are running - allow 20 retries (10 minutes) -wait_for_pods kubeflow 40 30 || EXIT_CODE=$? +# Check if all pods are running - (10 minutes) +wait_for_pods || EXIT_CODE=$? if [[ $EXIT_CODE -ne 0 ]] then echo "Deploy unsuccessful. Not all pods running." exit 1 fi -echo "List Kubeflow: " -kubectl get pod -n kubeflow collect_artifacts kubeflow echo "Finished KFP deployment." diff --git a/scripts/deploy/github/helper-functions.sh b/scripts/deploy/github/helper-functions.sh index 8c42a923f46..f7780feca3a 100644 --- a/scripts/deploy/github/helper-functions.sh +++ b/scripts/deploy/github/helper-functions.sh @@ -56,57 +56,9 @@ wait_for_namespace () { } wait_for_pods () { - if [[ $# -ne 3 ]] - then - echo "Usage: wait_for_pods namespace max_retries sleep_time" - return 1 - fi - - local namespace=$1 - local max_retries=$2 - local sleep_time=$3 - - local i=0 - - while [[ $i -lt $max_retries ]] - do - local pods - local statuses - local num_pods - local num_running - pods=$(kubectl get pod -n "$namespace") - # echo "$pods" - # kubectl get pvc -n "$namespace" - - if [[ -z $pods ]] - then - echo "no pod is up yet" - else - # Using quotations around variables to keep column format in echo - # Remove 1st line (header line) -> trim whitespace -> cut statuses column (3rd column) - # Might be overkill to parse down to specific columns :). 
-      statuses=$(echo "$pods" | tail -n +2 | tr -s ' ' | cut -d ' ' -f 3)
-      num_pods=$(echo "$statuses" | wc -l | xargs)
-      num_running=$(echo "$statuses" | grep -ow "Running\|Completed" | wc -l | xargs)
-
-      local msg="${num_running}/${num_pods} pods running in \"${namespace}\"."
-
-      if [[ $num_running -ne $num_pods ]]
-      then
-        # for debugging
-        # kubectl get pod -n "$namespace" | grep '0/1' | awk '{print $1}' | xargs kubectl describe pod -n "$namespace"
-        echo "$msg Checking again in ${sleep_time}s."
-      else
-        echo "$msg"
-        return 0
-      fi
-    fi
-
-    sleep "$sleep_time"
-    i=$((i+1))
-  done
-
-  return 1
+  C_DIR="${BASH_SOURCE%/*}"
+  pip install -r "${C_DIR}"/../../../sdk/python/requirements.txt
+  python "${C_DIR}"/kfp-readiness/wait_for_pods.py
 }
 
 deploy_with_retries () {
diff --git a/scripts/deploy/github/kfp-readiness/wait_for_pods.py b/scripts/deploy/github/kfp-readiness/wait_for_pods.py
new file mode 100644
index 00000000000..3a61086afbb
--- /dev/null
+++ b/scripts/deploy/github/kfp-readiness/wait_for_pods.py
@@ -0,0 +1,110 @@
+"""Block until every pod in the ``kubeflow`` namespace is ready and stable."""
+import logging
+import time
+import urllib3
+import sys
+from kubernetes import client, config
+
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+namespace = 'kubeflow'
+
+config.load_kube_config()
+v1 = client.CoreV1Api()
+
+
+def get_pod_statuses():
+    """Take a snapshot of the pods currently listed in ``namespace``.
+
+    Returns:
+        dict: pod name -> ``(phase, ready_containers, total_containers)``.
+    """
+    pods = v1.list_namespaced_pod(namespace=namespace)
+    statuses = {}
+    for pod in pods.items:
+        # container_statuses can be None; count that as zero containers.
+        container_statuses = pod.status.container_statuses or []
+        ready_containers = sum(1 for status in container_statuses if status.ready)
+        statuses[pod.metadata.name] = (pod.status.phase, ready_containers, len(container_statuses))
+    return statuses
+
+
+def is_pod_ready(pod_status, ready, total):
+    """Tell whether a single pod no longer has to be waited for.
+
+    A ``Running`` pod needs all of its containers ready. A ``Succeeded`` pod
+    has run to completion, so it is accepted as well; otherwise one finished
+    pod would keep the whole check waiting until the timeout.
+    """
+    if pod_status == 'Succeeded':
+        return True
+    return pod_status == 'Running' and ready == total
+
+
+def all_pods_ready(statuses):
+    """Return True if ``statuses`` is non-empty and every pod in it is ready.
+
+    An empty snapshot means no pod exists yet. ``all()`` over nothing is True,
+    so that case is rejected explicitly instead of being reported as ready.
+    """
+    if not statuses:
+        return False
+    return all(is_pod_ready(*status) for status in statuses.values())
+
+
+def check_pods(calm_time=10, timeout=600, retries_after_ready=5):
+    """Poll the namespace until all pods are ready and have stopped changing.
+
+    Args:
+        calm_time: Seconds to sleep between two polls.
+        timeout: Maximum number of seconds to keep polling.
+        retries_after_ready: Number of consecutive polls that must return an
+            unchanged, fully ready snapshot before the pods count as stable.
+
+    Raises:
+        TimeoutError: If the pods did not stabilize within ``timeout`` seconds.
+    """
+    # monotonic() is immune to wall-clock adjustments during the wait.
+    start_time = time.monotonic()
+    stable_count = 0
+    previous_statuses = {}
+
+    while time.monotonic() - start_time < timeout:
+        current_statuses = get_pod_statuses()
+
+        logging.info("Checking pod statuses...")
+        if not current_statuses:
+            logging.info(f"No pods found in namespace {namespace} yet.")
+        for pod_name, (pod_status, ready, total) in current_statuses.items():
+            logging.info(f"Pod {pod_name} - Status: {pod_status}, Ready: {ready}/{total}")
+
+        if current_statuses == previous_statuses and all_pods_ready(current_statuses):
+            stable_count += 1
+            if stable_count >= retries_after_ready:
+                logging.info("All pods are calm and fully ready.")
+                break
+            logging.info(
+                f"Pods are calm but have only been stable for {stable_count}/{retries_after_ready} retries.")
+        else:
+            # Any change, or any pod that is not ready, restarts the count.
+            stable_count = 0
+
+        previous_statuses = current_statuses
+        logging.info(f"Pods are still stabilizing. Retrying in {calm_time} seconds...")
+        time.sleep(calm_time)
+    else:
+        # The loop ran out of time without reaching the break above.
+        raise TimeoutError("Pods did not stabilize within the timeout period.")
+
+    logging.info("Final pod statuses:")
+    for pod_name, (pod_status, ready, total) in previous_statuses.items():
+        if is_pod_ready(pod_status, ready, total):
+            logging.info(f"Pod {pod_name} is fully ready ({ready}/{total})")
+        else:
+            logging.info(f"Pod {pod_name} is not ready (Status: {pod_status}, Ready: {ready}/{total})")
+
+
+if __name__ == "__main__":
+    check_pods()