From a7b44a7e32433dc8e437a7df1c83afeeb9d080fb Mon Sep 17 00:00:00 2001
From: Berto D'Attoma <88311595+bdattoma@users.noreply.github.com>
Date: Tue, 2 Jul 2024 10:20:30 +0200
Subject: [PATCH] Backport GPU provisioning updates to releases/2.8.0 (#1581)

Generalize GPU provisioning script to ease the addition of new providers
(#1568)

* refactor gpu operators directories
* generalize GPU script away from AWS provider
* minor changes in kustomize yaml
* fix aws gpu overlay
* fix filepath
* add missing end lines
---
 .../Provisioning/GPU/AMD/amd_gpu_install.yaml |  28 +++
 .../Provisioning/GPU/AMD/amd_operator.sh      | 196 ++++++++++++++++++
 .../GPU/AMD/blacklist_driver.yaml             |  17 ++
 .../GPU/AMD/kmm_operator_install.yaml         |  26 +++
 .../GPU/{ => NVIDIA}/gpu_deploy.sh            |   8 +-
 .../GPU/{ => NVIDIA}/gpu_install.yaml         |   0
 .../Provisioning/GPU/provision-gpu.sh         |  34 +++
 .../Provisioning/Hive/AWS/provision-gpu.sh    |  42 ----
 .../Hive/GPU/base/kustomization.yaml          |   4 +
 .../Hive/GPU/base/source-machineset.yaml      |   1 +
 .../Hive/GPU/overlays/AWS/gpu.yaml            |   3 +
 .../Hive/GPU/overlays/AWS/kustomization.yaml  |  10 +
 .../Hive/GPU/overlays/IBM/gpu.yaml            |   3 +
 .../Hive/GPU/overlays/IBM/kustomization.yaml  |  10 +
 14 files changed, 336 insertions(+), 46 deletions(-)
 create mode 100644 ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_gpu_install.yaml
 create mode 100755 ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh
 create mode 100644 ods_ci/tasks/Resources/Provisioning/GPU/AMD/blacklist_driver.yaml
 create mode 100644 ods_ci/tasks/Resources/Provisioning/GPU/AMD/kmm_operator_install.yaml
 rename ods_ci/tasks/Resources/Provisioning/GPU/{ => NVIDIA}/gpu_deploy.sh (94%)
 rename ods_ci/tasks/Resources/Provisioning/GPU/{ => NVIDIA}/gpu_install.yaml (100%)
 create mode 100755 ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh
 delete mode 100755 ods_ci/tasks/Resources/Provisioning/Hive/AWS/provision-gpu.sh
 create mode 100644 ods_ci/tasks/Resources/Provisioning/Hive/GPU/base/kustomization.yaml
 create mode 100644 ods_ci/tasks/Resources/Provisioning/Hive/GPU/base/source-machineset.yaml
 create mode 100644 ods_ci/tasks/Resources/Provisioning/Hive/GPU/overlays/AWS/gpu.yaml
 create mode 100644 ods_ci/tasks/Resources/Provisioning/Hive/GPU/overlays/AWS/kustomization.yaml
 create mode 100644 ods_ci/tasks/Resources/Provisioning/Hive/GPU/overlays/IBM/gpu.yaml
 create mode 100644 ods_ci/tasks/Resources/Provisioning/Hive/GPU/overlays/IBM/kustomization.yaml

diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_gpu_install.yaml b/ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_gpu_install.yaml
new file mode 100644
index 000000000..e98c3ece4
--- /dev/null
+++ b/ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_gpu_install.yaml
@@ -0,0 +1,28 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: openshift-amd-gpu
+
+---
+
+apiVersion: operators.coreos.com/v1
+kind: OperatorGroup
+metadata:
+  name: openshift-amd-gpu-operator-group
+  namespace: openshift-amd-gpu
+spec: {}
+
+---
+
+apiVersion: operators.coreos.com/v1alpha1
+kind: Subscription
+metadata:
+  name: amd-gpu-operator
+  namespace: openshift-amd-gpu
+spec:
+  channel: alpha
+  installPlanApproval: Automatic
+  name: amd-gpu-operator
+  source: community-operators
+  sourceNamespace: openshift-marketplace
+
diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh b/ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh
new file mode 100755
index 000000000..4ff4b4d88
--- /dev/null
+++ b/ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh
@@ -0,0 +1,196 @@
+#!/bin/bash
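+#
+# amd_operator.sh: sets up the AMD GPU operator stack (internal image
+# registry, KMM operator, AMD GPU operator subscription, DeviceConfig) on
+# the current OpenShift cluster. Assumed usage (no positional arguments;
+# requires a cluster-admin `oc` login):
+#   ./amd_operator.sh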
+set -e
+
+GPU_INSTALL_DIR="$(dirname "$0")"
+
+function create_registry_network() {
+  oc patch configs.imageregistry.operator.openshift.io cluster --type merge --patch '{"spec":{"storage":{"emptyDir":{}}}}'
+  oc patch configs.imageregistry.operator.openshift.io cluster --type merge --patch '{"spec":{"managementState":"Managed"}}'
+  echo "Internal registry network created."
+}
+
+function check_registry() {
+  registry_pod=$(oc get pod -l docker-registry=default -n openshift-image-registry --no-headers -o custom-columns=":metadata.name")
+  if [ -n "$registry_pod" ]; then
+    echo "Internal registry pod ($registry_pod) is present."
+    return 0 # Success
+  else
+    echo "Internal registry pod is not present."
+    create_registry_network
+    return 1 # Failure
+  fi
+}
+
+# wait_while TIMEOUT COMMAND...: re-evaluate COMMAND every 2 seconds for as
+# long as it succeeds; return 1 if TIMEOUT seconds elapse first.
+function wait_while {
+  local seconds timeout interval
+  interval=2
+  seconds=0
+  timeout=$1
+  shift
+  while eval "$*"; do
+    seconds=$(( seconds + interval ))
+    sleep $interval
+    echo -n '.'
+    [[ $seconds -gt $timeout ]] && echo "Timeout of ${timeout}s exceeded" && return 1
+  done
+  if [[ "$seconds" != '0' ]]; then
+    echo ''
+  fi
+  return 0
+}
+
+# has_csv_succeeded NAMESPACE SUBSCRIPTION: return 0 once the subscription's
+# current CSV reports phase "Succeeded".
+has_csv_succeeded() {
+  local ns=$1
+  local subscription=$2
+  local csv
+  csv=$(oc get subscriptions.operators.coreos.com "${subscription}" -n "${ns}" -o=custom-columns=CURRENT_CSV:.status.currentCSV --no-headers=true)
+  if [ -n "$csv" ]; then
+    phase=$(oc get clusterserviceversions.operators.coreos.com -n "${ns}" "${csv}" -o=custom-columns=PHASE:.status.phase --no-headers=true)
+    if [ "$phase" = "Succeeded" ]; then
+      return 0
+    fi
+  fi
+
+  return 1
+}
+
+function create_devconfig() {
+  dc_name="dc-internal-registry"
+  dc=$(oc get DeviceConfig $dc_name -n openshift-amd-gpu -oname --ignore-not-found)
+  if [[ -n $dc ]]; then
+    echo "AMD DeviceConfig $dc_name already exists. Skipping creation."
+  else
+    echo "Creating AMD DeviceConfig..."
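+    # Assumption: the DeviceConfig created below points the AMD GPU operator
+    # at the in-cluster image registry prepared by check_registry, so the
+    # KMM-built driver image can be stored locally.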
+    oc create -f - < clusterpolicy.json
 oc apply -f clusterpolicy.json
 wait_until_pod_ready_status "nvidia-device-plugin-daemonset"
diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_install.yaml b/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_install.yaml
similarity index 100%
rename from ods_ci/tasks/Resources/Provisioning/GPU/gpu_install.yaml
rename to ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_install.yaml
diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh b/ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh
new file mode 100755
index 000000000..dfd6aaa40
--- /dev/null
+++ b/ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+set -e
+
+# Optional params
+INSTANCE_TYPE=${1:-"g4dn.xlarge"}
+PROVIDER=${2:-"AWS"}
+KUSTOMIZE_PATH="$PWD/tasks/Resources/Provisioning/Hive/GPU"
+MACHINESET_PATH="$KUSTOMIZE_PATH/base/source-machineset.yaml"
+
+# Exit early if a GPU machineset already exists
+EXISTING_GPU_MACHINESET="$(oc get machineset -n openshift-machine-api -o jsonpath="{.items[?(@.metadata.annotations['machine\.openshift\.io/GPU']>'0')].metadata.name}")"
+if [[ -n "$EXISTING_GPU_MACHINESET" ]] ; then
+  echo "Machine-set for GPU already exists"
+  oc get machinesets -A --show-labels
+  exit 0
+fi
+
+# Select the first machineset as a template for the GPU machineset
+SOURCE_MACHINESET=$(oc get machineset -n openshift-machine-api -o name | head -n1)
+oc get -o yaml -n openshift-machine-api $SOURCE_MACHINESET > $MACHINESET_PATH
+
+# Rename the machineset in the template file
+OLD_MACHINESET_NAME=$(yq '.metadata.name' $MACHINESET_PATH)
+NEW_MACHINESET_NAME=${OLD_MACHINESET_NAME/worker/gpu}
+sed -i'' -e "s/$OLD_MACHINESET_NAME/$NEW_MACHINESET_NAME/g" $MACHINESET_PATH
+
+# Set the desired node flavor in the kustomize overlay
+sed -i'' -e "s/INSTANCE_TYPE/$INSTANCE_TYPE/g" $KUSTOMIZE_PATH/overlays/$PROVIDER/gpu.yaml
+
+# Create the new MachineSet using kustomize
+oc apply --kustomize $KUSTOMIZE_PATH/overlays/$PROVIDER
+
+# Add GPU label to the new machine-set
+oc patch machinesets -n openshift-machine-api "$NEW_MACHINESET_NAME" -p '{"metadata":{"labels":{"gpu-machineset":"true"}}}' --type=merge
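+
+# Example invocations (flavor values are illustrative, not defaults):
+#   ./provision-gpu.sh                       # AWS, g4dn.xlarge
+#   ./provision-gpu.sh g5.2xlarge AWS
+#   ./provision-gpu.sh gx2-8x64x1v100 IBM    # IBM Cloud VPC GPU profile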
diff --git a/ods_ci/tasks/Resources/Provisioning/Hive/AWS/provision-gpu.sh b/ods_ci/tasks/Resources/Provisioning/Hive/AWS/provision-gpu.sh
deleted file mode 100755
index 8455ff405..000000000
--- a/ods_ci/tasks/Resources/Provisioning/Hive/AWS/provision-gpu.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/bin/bash
-set -e
-
-# Optional params
-INSTANCE_TYPE=${1:-"g4dn.xlarge"}
-
-# Check if existing machineset GPU already exists
-EXISTING_GPU_MACHINESET="$(oc get machineset -n openshift-machine-api -o jsonpath="{.items[?(@.metadata.annotations['machine\.openshift\.io/GPU']>'0')].metadata.name}")"
-if [[ -n "$EXISTING_GPU_MACHINESET" ]] ; then
-  echo "Machine-set for GPU already exists"
-  oc get machinesets -A --show-labels
-  exit 0
-fi
-
-# Select the first machineset as a template for the GPU machineset
-SOURCE_MACHINESET=$(oc get machineset -n openshift-machine-api -o name | head -n1)
-
-# Reformat with jq, for better diff result.
-oc get -o json -n openshift-machine-api $SOURCE_MACHINESET | jq -r > /tmp/source-machineset.json
-
-OLD_MACHINESET_NAME=$(jq '.metadata.name' -r /tmp/source-machineset.json )
-NEW_MACHINESET_NAME=${OLD_MACHINESET_NAME/worker/gpu}
-
-
-# Change instanceType and delete some stuff
-jq -r --arg INSTANCE_TYPE "$INSTANCE_TYPE" '.spec.template.spec.providerSpec.value.instanceType=$INSTANCE_TYPE
- | del(.metadata.selfLink)
- | del(.metadata.uid)
- | del(.metadata.creationTimestamp)
- | del(.metadata.resourceVersion)
- | .spec.template.spec.taints += [{"effect": "NoSchedule" , "key": "nvidia.com/gpu" , "value": "None"}]
-' /tmp/source-machineset.json > /tmp/gpu-machineset.json
-
-# Change machineset name
-sed -i "s/$OLD_MACHINESET_NAME/$NEW_MACHINESET_NAME/g" /tmp/gpu-machineset.json
-# Create new machineset
-oc apply -f /tmp/gpu-machineset.json
-rm /tmp/source-machineset.json
-rm /tmp/gpu-machineset.json
-
-# Add GPU label to the new machine-set
-oc patch machinesets -n openshift-machine-api "$NEW_MACHINESET_NAME" -p '{"metadata":{"labels":{"gpu-machineset":"true"}}}' --type=merge
diff --git a/ods_ci/tasks/Resources/Provisioning/Hive/GPU/base/kustomization.yaml b/ods_ci/tasks/Resources/Provisioning/Hive/GPU/base/kustomization.yaml
new file mode 100644
index 000000000..40b6be9ae
--- /dev/null
+++ b/ods_ci/tasks/Resources/Provisioning/Hive/GPU/base/kustomization.yaml
@@ -0,0 +1,4 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - source-machineset.yaml
diff --git a/ods_ci/tasks/Resources/Provisioning/Hive/GPU/base/source-machineset.yaml b/ods_ci/tasks/Resources/Provisioning/Hive/GPU/base/source-machineset.yaml
new file mode 100644
index 000000000..c9faddb83
--- /dev/null
+++ b/ods_ci/tasks/Resources/Provisioning/Hive/GPU/base/source-machineset.yaml
@@ -0,0 +1 @@
+# PLACEHOLDER - the content is dynamically generated by ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh
diff --git a/ods_ci/tasks/Resources/Provisioning/Hive/GPU/overlays/AWS/gpu.yaml b/ods_ci/tasks/Resources/Provisioning/Hive/GPU/overlays/AWS/gpu.yaml
new file mode 100644
index 000000000..44f7c2dec
--- /dev/null
+++ b/ods_ci/tasks/Resources/Provisioning/Hive/GPU/overlays/AWS/gpu.yaml
@@ -0,0 +1,3 @@
+- op: replace
+  path: /spec/template/spec/providerSpec/value/instanceType
+  value: INSTANCE_TYPE
diff --git a/ods_ci/tasks/Resources/Provisioning/Hive/GPU/overlays/AWS/kustomization.yaml b/ods_ci/tasks/Resources/Provisioning/Hive/GPU/overlays/AWS/kustomization.yaml
new file mode 100644
index 000000000..5405485c9
--- /dev/null
+++ b/ods_ci/tasks/Resources/Provisioning/Hive/GPU/overlays/AWS/kustomization.yaml
@@ -0,0 +1,10 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+metadata:
+  name: add-gpu
+resources:
+  - ../../base/
+patches:
+  - path: gpu.yaml
+    target:
+      kind: MachineSet
diff --git a/ods_ci/tasks/Resources/Provisioning/Hive/GPU/overlays/IBM/gpu.yaml b/ods_ci/tasks/Resources/Provisioning/Hive/GPU/overlays/IBM/gpu.yaml
new file mode 100644
index 000000000..c69a8c810
--- /dev/null
+++ b/ods_ci/tasks/Resources/Provisioning/Hive/GPU/overlays/IBM/gpu.yaml
@@ -0,0 +1,3 @@
+- op: replace
+  path: /spec/template/spec/providerSpec/value/profile
+  value: INSTANCE_TYPE
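+# Note on the two overlays in this patch: AWS machinesets carry the node
+# flavor at /spec/template/spec/providerSpec/value/instanceType, while IBM
+# Cloud VPC machinesets carry it at /spec/template/spec/providerSpec/value/profile;
+# in both cases provision-gpu.sh substitutes the INSTANCE_TYPE placeholder above.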
diff --git a/ods_ci/tasks/Resources/Provisioning/Hive/GPU/overlays/IBM/kustomization.yaml b/ods_ci/tasks/Resources/Provisioning/Hive/GPU/overlays/IBM/kustomization.yaml
new file mode 100644
index 000000000..5405485c9
--- /dev/null
+++ b/ods_ci/tasks/Resources/Provisioning/Hive/GPU/overlays/IBM/kustomization.yaml
@@ -0,0 +1,10 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+metadata:
+  name: add-gpu
+resources:
+  - ../../base/
+patches:
+  - path: gpu.yaml
+    target:
+      kind: MachineSet
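+# This overlay is consumed by provision-gpu.sh via
+#   oc apply --kustomize $KUSTOMIZE_PATH/overlays/IBM
+# once base/source-machineset.yaml has been populated. To preview the
+# rendered MachineSet first, `oc kustomize <overlay dir>` should work with a
+# kustomize-aware oc client (an assumption; the script itself only applies).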