Skip to content

Commit

Permalink
Backport GPU provisioning updates to releases/2.8.0 (red-hat-data-ser…
Browse files Browse the repository at this point in the history
…vices#1581)

Generalize GPU provisioning script to ease the addition of new providers (red-hat-data-services#1568)

* refactor gpu operators directories

* generalize GPU script away from AWS provider

* minor changes in kustomize yaml

* fix aws gpu overlay

* fix filepath

* add missing end lines
  • Loading branch information
bdattoma authored Jul 2, 2024
1 parent 68d0863 commit a7b44a7
Show file tree
Hide file tree
Showing 14 changed files with 336 additions and 46 deletions.
28 changes: 28 additions & 0 deletions ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_gpu_install.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Manifests to install the community AMD GPU operator via OLM:
# a dedicated namespace, an OperatorGroup scoping it, and the Subscription.
apiVersion: v1
kind: Namespace
metadata:
  name: openshift-amd-gpu

---

apiVersion: operators.coreos.com/v1
kind: OperatorGroup
metadata:
  name: openshift-amd-gpu-operator-group
  namespace: openshift-amd-gpu
spec: {}

---

apiVersion: operators.coreos.com/v1alpha1
kind: Subscription
metadata:
  name: amd-gpu-operator
  namespace: openshift-amd-gpu
spec:
  channel: alpha
  installPlanApproval: Automatic  # approve install plans without manual review
  name: amd-gpu-operator
  source: community-operators  # catalog source providing this operator
  sourceNamespace: openshift-marketplace

196 changes: 196 additions & 0 deletions ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
#!/bin/bash
set -e

GPU_INSTALL_DIR="$(dirname "$0")"

function create_registry_network() {
  # Back the internal image registry with ephemeral (emptyDir) storage and
  # switch the registry operator to Managed so the registry gets deployed.
  local patch
  for patch in \
    '{"spec":{"storage":{"emptyDir":{}}}}' \
    '{"spec":{"managementState":"Managed"}}'; do
    oc patch configs.imageregistry.operator.openshift.io cluster --type merge --patch "$patch"
  done
  echo "Internal registry network created."
}

function check_registry() {
  # Return 0 when the internal image registry pod is running; otherwise
  # enable the registry (see create_registry_network) and return 1 so the
  # caller knows it was not yet available.
  registry_pod=$(oc get pod -l docker-registry=default -n openshift-image-registry --no-headers -o custom-columns=":metadata.name")
  if [ -z "$registry_pod" ]; then
    echo "Internal registry pod is not present."
    create_registry_network
    return 1 # Failure
  fi
  echo "Internal registry pod ($registry_pod) is present."
  return 0 # Success
}
function wait_while {
  # Poll a condition every 2 seconds until it stops holding or a timeout is hit.
  #   $1   - timeout in seconds
  #   $@   - condition command line; looping continues while it succeeds
  # Prints a dot per iteration; returns 0 when the condition stopped holding,
  # 1 when the timeout was exceeded.
  local limit elapsed step
  step=2
  elapsed=0
  limit=$1
  shift
  while eval "$*"; do
    elapsed=$(( elapsed + step ))
    sleep $step
    echo -n '.'
    if [[ $elapsed -gt $limit ]]; then
      echo "Time out of ${limit} exceeded"
      return 1
    fi
  done
  # Terminate the dot line with a newline only if we actually waited.
  if [[ "$elapsed" != '0' ]]; then
    echo ''
  fi
  return 0
}

has_csv_succeeded() {
  # Check whether the CSV installed by a Subscription reached phase "Succeeded".
  #   $1 - namespace of the subscription
  #   $2 - subscription name
  # Returns 0 once the CSV exists and its phase is Succeeded, 1 otherwise.
  local ns=$1
  local subscription=$2
  local csv phase  # 'phase' is local too (the original leaked it globally)
  csv=$(oc get subscriptions.operators.coreos.com "${subscription}" -n "${ns}" -o=custom-columns=CURRENT_CSV:.status.currentCSV --no-headers=true)
  # "<none>" is what custom-columns prints while .status.currentCSV is unset.
  if [[ -n "$csv" && "$csv" != "<none>" ]]; then
    phase=$(oc get clusterserviceversions.operators.coreos.com -n "${ns}" "${csv}" -o=custom-columns=PHASE:.status.phase --no-headers=true)
    if [[ "$phase" == "Succeeded" ]]; then
      return 0
    fi
  fi

  return 1
}

function create_devconfig() {
  # Create the AMD DeviceConfig that triggers the driver-image build via KMM,
  # unless a DeviceConfig with the expected name already exists.
  local dc_name="dc-internal-registry"
  local dc
  dc=$(oc get DeviceConfig "$dc_name" -n openshift-amd-gpu -oname --ignore-not-found)
  if [[ -n $dc ]]; then
    # Fixed quoting: the original left ". Skipping creation" outside the
    # quoted string, passing stray words to echo.
    echo "AMD DeviceConfig $dc_name already exists. Skipping creation"
  else
    echo "Creating AMD DeviceConfig..."
    oc create -f - <<EOF
kind: DeviceConfig
apiVersion: amd.io/v1alpha1
metadata:
  name: $dc_name
  namespace: openshift-amd-gpu
EOF
  fi
}


function wait_until_pod_is_created() {
  # Wait until a pod matching a label selector appears in a namespace.
  #   $1 - label selector
  #   $2 - namespace
  #   $3 - timeout in seconds
  # Returns 0 as soon as a matching pod is found, 1 on timeout.
  local label=$1
  local namespace=$2
  local timeout=$3
  local start_time podName
  start_time=$(date +%s)
  while [ $(($(date +%s) - start_time)) -lt "$timeout" ]; do
    # Use the named locals consistently (the original mixed $1/$2 back in here)
    podName=$(oc get pods -n "$namespace" -l "$label" -oname)
    if [[ -n $podName ]]; then
      echo "Pod $podName found!"
      return 0
    fi
    echo "waiting for pod with label $label"
    sleep 2
  done
  echo "Timeout exceeded, pod with label $label not found"
  return 1
}

function machineconfig_updates {
  # Succeed only when every MachineConfigPool reports Updated=True
  # (and at least one pool exists), i.e. uniq collapses to the single
  # word "True".
  local statuses
  statuses="$(oc get machineconfigpool --no-headers=true '-o=custom-columns=UPDATED:.status.conditions[?(@.type=="Updated")].status' | uniq)"
  [ True = "$statuses" ]
}

function monitor_logs() {
  # Stream logs of a container and report every line containing a search text.
  #   $1 - pod name
  #   $2 - namespace
  #   $3 - container name
  #   $@ (rest) - text to look for in the log lines
  local pod_name=$1
  local ns=$2
  local c_name=$3
  shift 3
  # Join the remaining words verbatim. The original used printf "%q ", which
  # shell-escaped the pattern and appended a trailing space, so multi-word
  # searches could never match a raw log line.
  local search_text="$*"
  echo "Monitoring logs for pod $pod_name..."
  oc logs "$pod_name" -c "$c_name" -n "$ns" | while read -r line; do
    if [[ $line == *"$search_text"* ]]; then
      echo "Found \"$search_text\" in pod logs: $line"
    fi
  done
}

function wait_until_driver_image_is_built() {
  # Wait for the KMM builder pod to become ready and then disappear (build
  # finished), and verify the resulting ImageStream was created.
  #   $1 - builder-pod startup timeout (seconds)
  #   $2 - image build timeout (seconds)
  # Exits the whole script when the ImageStream is missing afterwards.
  local startup_timeout=$1
  local build_timeout=$2
  local name image
  name=$(oc get pod -n openshift-amd-gpu -l openshift.io/build.name -oname)
  echo "Builder pod name: $name"
  oc wait --timeout="${startup_timeout}s" --for=condition=ready pod -n openshift-amd-gpu -l openshift.io/build.name
  echo "Wait for the image build to finish"
  oc wait --timeout="${build_timeout}s" --for=delete pod -n openshift-amd-gpu -l openshift.io/build.name
  echo "Checking the image stream got created"
  # --ignore-not-found keeps a missing ImageStream from tripping 'set -e':
  # the original checked $? after the assignment, a branch set -e never let
  # it reach on failure.
  image=$(oc get is amd_gpu_kmm_modules -n openshift-amd-gpu -oname --ignore-not-found)
  if [[ -n $image ]]; then
    echo ".Image Stream $image found!"
  else
    echo ".Image Stream amd_gpu_kmm_modules not found. Check the cluster"
    exit 1
  fi
}

function create_acceleratorprofile() {
  # Create the Dashboard AcceleratorProfile so AMD GPUs are selectable in the
  # dashboard, then show the created resource.
  echo "Creating an Accelerator Profile for Dashboard"
  # Guard the apply with 'if' instead of checking $? afterwards: under
  # 'set -e' a failed apply would abort the script before the check ran.
  if oc apply -f - <<EOF
apiVersion: dashboard.opendatahub.io/v1
kind: AcceleratorProfile
metadata:
  name: ods-ci-amd-gpu
  namespace: redhat-ods-applications
spec:
  displayName: AMD GPU
  enabled: true
  identifier: amd.com/gpu
  tolerations:
    - effect: NoSchedule
      key: amd.com/gpu
      operator: Exists
EOF
  then
    echo "Verifying that an AcceleratorProfiles resource was created in redhat-ods-applications"
    oc describe AcceleratorProfiles -n redhat-ods-applications
  fi
}

# --- Main flow -------------------------------------------------------------

# The driver image build pushes to the internal registry, so make sure it is
# available first (check_registry enables it when missing). Guarding with
# 'if' is required: the original 'check_registry; status=$?' aborted under
# 'set -e' before status was ever read, and its else branch used 'return 1',
# which is invalid outside a function.
if check_registry; then
  # Blacklist the inbox amdgpu driver with a MachineConfig
  oc apply -f "$GPU_INSTALL_DIR/blacklist_driver.yaml"
else
  exit 1
fi

# Give the MachineConfigPools time to start rolling out the blacklist, then
# wait until every pool reports Updated=True again.
sleep 120
wait_while 1800 ! machineconfig_updates

echo "Installing NFD operator"
oc apply -f "$GPU_INSTALL_DIR/../nfd_operator.yaml"
wait_while 360 ! has_csv_succeeded openshift-nfd nfd
oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"
echo "Installing KMM operator"
oc apply -f "$GPU_INSTALL_DIR/kmm_operator_install.yaml"
wait_while 360 ! has_csv_succeeded openshift-kmm kernel-module-management
echo "Installing AMD operator"
oc apply -f "$GPU_INSTALL_DIR/amd_gpu_install.yaml"
wait_while 360 ! has_csv_succeeded openshift-amd-gpu amd-gpu-operator
create_devconfig
# Skip waiting for the builder pod when the driver image was already built.
image=$(oc get is amd_gpu_kmm_modules -n openshift-amd-gpu -oname --ignore-not-found)
if [[ -n $image ]]; then
  echo ".Image Stream amd_gpu_kmm_modules already present! Skipping waiting for builder pod"
else
  wait_until_pod_is_created openshift.io/build.name openshift-amd-gpu 180
  wait_until_driver_image_is_built 60 1200
fi
echo "Configuration of AMD GPU node and Operators completed"
# The "Successfully pushed" message appears in the build logs, but the builder
# pod may be deleted before the next poll iteration checks the logs again,
# which makes the pod unreachable — so log monitoring stays disabled:
# wait_while 1200 monitor_logs "$name" openshift-amd-gpu docker-build "Successfully pushed image-registry.openshift-image-registry.svc:5000/openshift-amd-gpu"
create_acceleratorprofile
17 changes: 17 additions & 0 deletions ods_ci/tasks/Resources/Provisioning/GPU/AMD/blacklist_driver.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# MachineConfig that blacklists the inbox amdgpu kernel module on all worker
# nodes, so the out-of-tree driver built by KMM can be loaded instead.
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfig
metadata:
  labels:
    machineconfiguration.openshift.io/role: worker
  name: amdgpu-module-blacklist
spec:
  config:
    ignition:
      version: 3.2.0
    storage:
      files:
        - path: "/etc/modprobe.d/amdgpu-blacklist.conf"
          mode: 420  # decimal for octal 0644
          overwrite: true
          contents:
            # base64 of "blacklist amdgpu\n"
            source: "data:text/plain;base64,YmxhY2tsaXN0IGFtZGdwdQo="
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Manifests to install the Kernel Module Management (KMM) operator via OLM:
# namespace, OperatorGroup and Subscription.
apiVersion: v1
kind: Namespace
metadata:
  name: openshift-kmm

---

apiVersion: operators.coreos.com/v1
kind: OperatorGroup
metadata:
  name: openshift-kmm-operator-group
  namespace: openshift-kmm
spec: {}

---
apiVersion: operators.coreos.com/v1alpha1
kind: Subscription
metadata:
  name: kernel-module-management
  namespace: openshift-kmm
spec:
  channel: stable
  installPlanApproval: Automatic  # approve install plans without manual review
  name: kernel-module-management
  source: redhat-operators  # catalog source providing this operator
  sourceNamespace: openshift-marketplace
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@ CHANNEL="$(oc get packagemanifest gpu-operator-certified -n openshift-marketplac

CSVNAME="$(oc get packagemanifests/gpu-operator-certified -n openshift-marketplace -o json | jq -r '.status.channels[] | select(.name == "'$CHANNEL'") | .currentCSV')"

sed -i -e "0,/v1.11/s//$CHANNEL/g" -e "s/gpu-operator-certified.v1.11.0/$CSVNAME/g" "$GPU_INSTALL_DIR/gpu_install.yaml"
sed -i'' -e "0,/v1.11/s//$CHANNEL/g" -e "s/gpu-operator-certified.v1.11.0/$CSVNAME/g" "$GPU_INSTALL_DIR/gpu_install.yaml"

oc apply -f "$GPU_INSTALL_DIR/gpu_install.yaml"

oc apply -f "$GPU_INSTALL_DIR/../nfd_operator.yaml"
echo "Wait for Nvidia GPU Operator Subscription, InstallPlan and Deployment to complete"

oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n nvidia-gpu-operator sub nfd
oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n openshift-nfd sub nfd

oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n nvidia-gpu-operator sub gpu-operator-certified

Expand Down Expand Up @@ -80,7 +80,7 @@ function rerun_accelerator_migration() {
}

wait_until_pod_ready_status "gpu-operator"
oc apply -f "$GPU_INSTALL_DIR/nfd_deploy.yaml"
oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"
oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq .[0] > clusterpolicy.json
oc apply -f clusterpolicy.json
wait_until_pod_ready_status "nvidia-device-plugin-daemonset"
Expand Down
34 changes: 34 additions & 0 deletions ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash
set -e

# Create a GPU machineset by cloning an existing worker machineset and
# applying a provider-specific kustomize overlay.
# Optional params:
#   $1 - instance type / node flavor for the GPU nodes (default: g4dn.xlarge)
#   $2 - cloud provider overlay directory to use (default: AWS)
INSTANCE_TYPE=${1:-"g4dn.xlarge"}
PROVIDER=${2:-"AWS"}
KUSTOMIZE_PATH="$PWD/tasks/Resources/Provisioning/Hive/GPU"
MACHINESET_PATH="$KUSTOMIZE_PATH/base/source-machineset.yaml"

# Check if a machineset with GPUs already exists (GPU annotation > 0)
EXISTING_GPU_MACHINESET="$(oc get machineset -n openshift-machine-api -o jsonpath="{.items[?(@.metadata.annotations['machine\.openshift\.io/GPU']>'0')].metadata.name}")"
if [[ -n "$EXISTING_GPU_MACHINESET" ]] ; then
  echo "Machine-set for GPU already exists"
  oc get machinesets -A --show-labels
  exit 0
fi

# Select the first machineset as a template for the GPU machineset.
# All expansions are quoted (SC2086): the original left them unquoted,
# which breaks on paths containing spaces.
SOURCE_MACHINESET=$(oc get machineset -n openshift-machine-api -o name | head -n1)
oc get -o yaml -n openshift-machine-api "$SOURCE_MACHINESET" > "$MACHINESET_PATH"

# Rename the machineset in the template file (worker -> gpu)
OLD_MACHINESET_NAME=$(yq '.metadata.name' "$MACHINESET_PATH")
NEW_MACHINESET_NAME=${OLD_MACHINESET_NAME/worker/gpu}
sed -i'' -e "s/$OLD_MACHINESET_NAME/$NEW_MACHINESET_NAME/g" "$MACHINESET_PATH"

# Set the desired node flavor in the kustomize overlay
sed -i'' -e "s/INSTANCE_TYPE/$INSTANCE_TYPE/g" "$KUSTOMIZE_PATH/overlays/$PROVIDER/gpu.yaml"

# Create the new MachineSet using kustomize
oc apply --kustomize "$KUSTOMIZE_PATH/overlays/$PROVIDER"

# Add GPU label to the new machine-set
oc patch machinesets -n openshift-machine-api "$NEW_MACHINESET_NAME" -p '{"metadata":{"labels":{"gpu-machineset":"true"}}}' --type=merge
42 changes: 0 additions & 42 deletions ods_ci/tasks/Resources/Provisioning/Hive/AWS/provision-gpu.sh

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Base kustomization: wraps the dynamically generated machineset manifest
# (written by ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh).
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - source-machineset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# PLACEHOLDER - the content is dynamically generated by ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# JSON6902 patch: set the machineset's instance type.
# The INSTANCE_TYPE placeholder is substituted by provision-gpu.sh before
# 'oc apply --kustomize' is run.
- op: replace
  path: /spec/template/spec/providerSpec/value/instanceType
  value: INSTANCE_TYPE
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Provider overlay: applies the instance-type patch (gpu.yaml) on top of the
# base machineset manifest.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
metadata:
  name: add-gpu
resources:
  - ../../base/
patches:
  - path: gpu.yaml
    target:
      kind: MachineSet
Loading

0 comments on commit a7b44a7

Please sign in to comment.