Skip to content

Commit

Permalink
Backport GPU provisioning updates to releases/2.8.0 (red-hat-data-ser…
Browse files Browse the repository at this point in the history
…vices#1581)

Generalize GPU provisioning script to ease the addition of new providers (red-hat-data-services#1568)

* refactor gpu operators directories

* generalize GPU script away from AWS provider

* minor changes in kustomize yaml

* fix aws gpu overlay

* fix filepath

* add missing end lines
  • Loading branch information
bdattoma authored Jul 2, 2024
1 parent 68d0863 commit a7b44a7
Show file tree
Hide file tree
Showing 14 changed files with 336 additions and 46 deletions.
28 changes: 28 additions & 0 deletions ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_gpu_install.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Manifests to install the community AMD GPU operator via OLM:
# a dedicated namespace, an OperatorGroup scoping it, and the Subscription.
apiVersion: v1
kind: Namespace
metadata:
  name: openshift-amd-gpu

---

apiVersion: operators.coreos.com/v1
kind: OperatorGroup
metadata:
  name: openshift-amd-gpu-operator-group
  namespace: openshift-amd-gpu
spec: {}

---

apiVersion: operators.coreos.com/v1alpha1
kind: Subscription
metadata:
  name: amd-gpu-operator
  namespace: openshift-amd-gpu
spec:
  channel: alpha
  installPlanApproval: Automatic  # approve install plans without manual review
  name: amd-gpu-operator
  source: community-operators  # catalog source providing this operator
  sourceNamespace: openshift-marketplace

196 changes: 196 additions & 0 deletions ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
#!/bin/bash
set -e

GPU_INSTALL_DIR="$(dirname "$0")"

function create_registry_network() {
  # Back the internal image registry with ephemeral (emptyDir) storage and
  # switch the registry operator to Managed so the registry gets deployed.
  local patch
  for patch in \
    '{"spec":{"storage":{"emptyDir":{}}}}' \
    '{"spec":{"managementState":"Managed"}}'; do
    oc patch configs.imageregistry.operator.openshift.io cluster --type merge --patch "$patch"
  done
  echo "Internal registry network created."
}

function check_registry() {
  # Return 0 when the internal image registry pod is running; otherwise
  # enable the registry (see create_registry_network) and return 1 so the
  # caller knows it was not yet available.
  registry_pod=$(oc get pod -l docker-registry=default -n openshift-image-registry --no-headers -o custom-columns=":metadata.name")
  if [ -z "$registry_pod" ]; then
    echo "Internal registry pod is not present."
    create_registry_network
    return 1 # Failure
  fi
  echo "Internal registry pod ($registry_pod) is present."
  return 0 # Success
}
function wait_while {
  # Poll a condition every 2 seconds until it stops holding or a timeout is hit.
  #   $1   - timeout in seconds
  #   $@   - condition command line; looping continues while it succeeds
  # Prints a dot per iteration; returns 0 when the condition stopped holding,
  # 1 when the timeout was exceeded.
  local limit elapsed step
  step=2
  elapsed=0
  limit=$1
  shift
  while eval "$*"; do
    elapsed=$(( elapsed + step ))
    sleep $step
    echo -n '.'
    if [[ $elapsed -gt $limit ]]; then
      echo "Time out of ${limit} exceeded"
      return 1
    fi
  done
  # Terminate the dot line with a newline only if we actually waited.
  if [[ "$elapsed" != '0' ]]; then
    echo ''
  fi
  return 0
}

has_csv_succeeded() {
  # Check whether the CSV installed by a Subscription reached phase "Succeeded".
  #   $1 - namespace of the subscription
  #   $2 - subscription name
  # Returns 0 once the CSV exists and its phase is Succeeded, 1 otherwise.
  local ns=$1
  local subscription=$2
  local csv phase  # 'phase' is local too (the original leaked it globally)
  csv=$(oc get subscriptions.operators.coreos.com "${subscription}" -n "${ns}" -o=custom-columns=CURRENT_CSV:.status.currentCSV --no-headers=true)
  # "<none>" is what custom-columns prints while .status.currentCSV is unset.
  if [[ -n "$csv" && "$csv" != "<none>" ]]; then
    phase=$(oc get clusterserviceversions.operators.coreos.com -n "${ns}" "${csv}" -o=custom-columns=PHASE:.status.phase --no-headers=true)
    if [[ "$phase" == "Succeeded" ]]; then
      return 0
    fi
  fi

  return 1
}

function create_devconfig() {
  # Create the AMD DeviceConfig that triggers the driver-image build via KMM,
  # unless a DeviceConfig with the expected name already exists.
  local dc_name="dc-internal-registry"
  local dc
  dc=$(oc get DeviceConfig "$dc_name" -n openshift-amd-gpu -oname --ignore-not-found)
  if [[ -n $dc ]]; then
    # Fixed quoting: the original left ". Skipping creation" outside the
    # quoted string, passing stray words to echo.
    echo "AMD DeviceConfig $dc_name already exists. Skipping creation"
  else
    echo "Creating AMD DeviceConfig..."
    oc create -f - <<EOF
kind: DeviceConfig
apiVersion: amd.io/v1alpha1
metadata:
  name: $dc_name
  namespace: openshift-amd-gpu
EOF
  fi
}


function wait_until_pod_is_created() {
  # Wait until a pod matching a label selector appears in a namespace.
  #   $1 - label selector
  #   $2 - namespace
  #   $3 - timeout in seconds
  # Returns 0 as soon as a matching pod is found, 1 on timeout.
  local label=$1
  local namespace=$2
  local timeout=$3
  local start_time podName
  start_time=$(date +%s)
  while [ $(($(date +%s) - start_time)) -lt "$timeout" ]; do
    # Use the named locals consistently (the original mixed $1/$2 back in here)
    podName=$(oc get pods -n "$namespace" -l "$label" -oname)
    if [[ -n $podName ]]; then
      echo "Pod $podName found!"
      return 0
    fi
    echo "waiting for pod with label $label"
    sleep 2
  done
  echo "Timeout exceeded, pod with label $label not found"
  return 1
}

function machineconfig_updates {
  # Succeed only when every MachineConfigPool reports Updated=True
  # (and at least one pool exists), i.e. uniq collapses to the single
  # word "True".
  local statuses
  statuses="$(oc get machineconfigpool --no-headers=true '-o=custom-columns=UPDATED:.status.conditions[?(@.type=="Updated")].status' | uniq)"
  [ True = "$statuses" ]
}

function monitor_logs() {
  # Stream logs of a container and report every line containing a search text.
  #   $1 - pod name
  #   $2 - namespace
  #   $3 - container name
  #   $@ (rest) - text to look for in the log lines
  local pod_name=$1
  local ns=$2
  local c_name=$3
  shift 3
  # Join the remaining words verbatim. The original used printf "%q ", which
  # shell-escaped the pattern and appended a trailing space, so multi-word
  # searches could never match a raw log line.
  local search_text="$*"
  echo "Monitoring logs for pod $pod_name..."
  oc logs "$pod_name" -c "$c_name" -n "$ns" | while read -r line; do
    if [[ $line == *"$search_text"* ]]; then
      echo "Found \"$search_text\" in pod logs: $line"
    fi
  done
}

function wait_until_driver_image_is_built() {
  # Wait for the KMM builder pod to become ready and then disappear (build
  # finished), and verify the resulting ImageStream was created.
  #   $1 - builder-pod startup timeout (seconds)
  #   $2 - image build timeout (seconds)
  # Exits the whole script when the ImageStream is missing afterwards.
  local startup_timeout=$1
  local build_timeout=$2
  local name image
  name=$(oc get pod -n openshift-amd-gpu -l openshift.io/build.name -oname)
  echo "Builder pod name: $name"
  oc wait --timeout="${startup_timeout}s" --for=condition=ready pod -n openshift-amd-gpu -l openshift.io/build.name
  echo "Wait for the image build to finish"
  oc wait --timeout="${build_timeout}s" --for=delete pod -n openshift-amd-gpu -l openshift.io/build.name
  echo "Checking the image stream got created"
  # --ignore-not-found keeps a missing ImageStream from tripping 'set -e':
  # the original checked $? after the assignment, a branch set -e never let
  # it reach on failure.
  image=$(oc get is amd_gpu_kmm_modules -n openshift-amd-gpu -oname --ignore-not-found)
  if [[ -n $image ]]; then
    echo ".Image Stream $image found!"
  else
    echo ".Image Stream amd_gpu_kmm_modules not found. Check the cluster"
    exit 1
  fi
}

function create_acceleratorprofile() {
  # Create the Dashboard AcceleratorProfile so AMD GPUs are selectable in the
  # dashboard, then show the created resource.
  echo "Creating an Accelerator Profile for Dashboard"
  # Guard the apply with 'if' instead of checking $? afterwards: under
  # 'set -e' a failed apply would abort the script before the check ran.
  if oc apply -f - <<EOF
apiVersion: dashboard.opendatahub.io/v1
kind: AcceleratorProfile
metadata:
  name: ods-ci-amd-gpu
  namespace: redhat-ods-applications
spec:
  displayName: AMD GPU
  enabled: true
  identifier: amd.com/gpu
  tolerations:
    - effect: NoSchedule
      key: amd.com/gpu
      operator: Exists
EOF
  then
    echo "Verifying that an AcceleratorProfiles resource was created in redhat-ods-applications"
    oc describe AcceleratorProfiles -n redhat-ods-applications
  fi
}

# --- Main flow -------------------------------------------------------------

# The driver image build pushes to the internal registry, so make sure it is
# available first (check_registry enables it when missing). Guarding with
# 'if' is required: the original 'check_registry; status=$?' aborted under
# 'set -e' before status was ever read, and its else branch used 'return 1',
# which is invalid outside a function.
if check_registry; then
  # Blacklist the inbox amdgpu driver with a MachineConfig
  oc apply -f "$GPU_INSTALL_DIR/blacklist_driver.yaml"
else
  exit 1
fi

# Give the MachineConfigPools time to start rolling out the blacklist, then
# wait until every pool reports Updated=True again.
sleep 120
wait_while 1800 ! machineconfig_updates

echo "Installing NFD operator"
oc apply -f "$GPU_INSTALL_DIR/../nfd_operator.yaml"
wait_while 360 ! has_csv_succeeded openshift-nfd nfd
oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"
echo "Installing KMM operator"
oc apply -f "$GPU_INSTALL_DIR/kmm_operator_install.yaml"
wait_while 360 ! has_csv_succeeded openshift-kmm kernel-module-management
echo "Installing AMD operator"
oc apply -f "$GPU_INSTALL_DIR/amd_gpu_install.yaml"
wait_while 360 ! has_csv_succeeded openshift-amd-gpu amd-gpu-operator
create_devconfig
# Skip waiting for the builder pod when the driver image was already built.
image=$(oc get is amd_gpu_kmm_modules -n openshift-amd-gpu -oname --ignore-not-found)
if [[ -n $image ]]; then
  echo ".Image Stream amd_gpu_kmm_modules already present! Skipping waiting for builder pod"
else
  wait_until_pod_is_created openshift.io/build.name openshift-amd-gpu 180
  wait_until_driver_image_is_built 60 1200
fi
echo "Configuration of AMD GPU node and Operators completed"
# The "Successfully pushed" message appears in the build logs, but the builder
# pod may be deleted before the next poll iteration checks the logs again,
# which makes the pod unreachable — so log monitoring stays disabled:
# wait_while 1200 monitor_logs "$name" openshift-amd-gpu docker-build "Successfully pushed image-registry.openshift-image-registry.svc:5000/openshift-amd-gpu"
create_acceleratorprofile
17 changes: 17 additions & 0 deletions ods_ci/tasks/Resources/Provisioning/GPU/AMD/blacklist_driver.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# MachineConfig that blacklists the inbox amdgpu kernel module on all worker
# nodes, so the out-of-tree driver built by KMM can be loaded instead.
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfig
metadata:
  labels:
    machineconfiguration.openshift.io/role: worker
  name: amdgpu-module-blacklist
spec:
  config:
    ignition:
      version: 3.2.0
    storage:
      files:
        - path: "/etc/modprobe.d/amdgpu-blacklist.conf"
          mode: 420  # decimal for octal 0644
          overwrite: true
          contents:
            # base64 of "blacklist amdgpu\n"
            source: "data:text/plain;base64,YmxhY2tsaXN0IGFtZGdwdQo="
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Manifests to install the Kernel Module Management (KMM) operator via OLM:
# namespace, OperatorGroup and Subscription.
apiVersion: v1
kind: Namespace
metadata:
  name: openshift-kmm

---

apiVersion: operators.coreos.com/v1
kind: OperatorGroup
metadata:
  name: openshift-kmm-operator-group
  namespace: openshift-kmm
spec: {}

---
apiVersion: operators.coreos.com/v1alpha1
kind: Subscription
metadata:
  name: kernel-module-management
  namespace: openshift-kmm
spec:
  channel: stable
  installPlanApproval: Automatic  # approve install plans without manual review
  name: kernel-module-management
  source: redhat-operators  # catalog source providing this operator
  sourceNamespace: openshift-marketplace
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@ CHANNEL="$(oc get packagemanifest gpu-operator-certified -n openshift-marketplac

CSVNAME="$(oc get packagemanifests/gpu-operator-certified -n openshift-marketplace -o json | jq -r '.status.channels[] | select(.name == "'$CHANNEL'") | .currentCSV')"

sed -i -e "0,/v1.11/s//$CHANNEL/g" -e "s/gpu-operator-certified.v1.11.0/$CSVNAME/g" "$GPU_INSTALL_DIR/gpu_install.yaml"
sed -i'' -e "0,/v1.11/s//$CHANNEL/g" -e "s/gpu-operator-certified.v1.11.0/$CSVNAME/g" "$GPU_INSTALL_DIR/gpu_install.yaml"

oc apply -f "$GPU_INSTALL_DIR/gpu_install.yaml"

oc apply -f "$GPU_INSTALL_DIR/../nfd_operator.yaml"
echo "Wait for Nvidia GPU Operator Subscription, InstallPlan and Deployment to complete"

oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n nvidia-gpu-operator sub nfd
oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n openshift-nfd sub nfd

oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n nvidia-gpu-operator sub gpu-operator-certified

Expand Down Expand Up @@ -80,7 +80,7 @@ function rerun_accelerator_migration() {
}

wait_until_pod_ready_status "gpu-operator"
oc apply -f "$GPU_INSTALL_DIR/nfd_deploy.yaml"
oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"
oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq .[0] > clusterpolicy.json
oc apply -f clusterpolicy.json
wait_until_pod_ready_status "nvidia-device-plugin-daemonset"
Expand Down
34 changes: 34 additions & 0 deletions ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash
set -e

# Create a GPU machineset by cloning an existing worker machineset and
# applying a provider-specific kustomize overlay.
# Optional params:
#   $1 - instance type / node flavor for the GPU nodes (default: g4dn.xlarge)
#   $2 - cloud provider overlay directory to use (default: AWS)
INSTANCE_TYPE=${1:-"g4dn.xlarge"}
PROVIDER=${2:-"AWS"}
KUSTOMIZE_PATH="$PWD/tasks/Resources/Provisioning/Hive/GPU"
MACHINESET_PATH="$KUSTOMIZE_PATH/base/source-machineset.yaml"

# Check if a machineset with GPUs already exists (GPU annotation > 0)
EXISTING_GPU_MACHINESET="$(oc get machineset -n openshift-machine-api -o jsonpath="{.items[?(@.metadata.annotations['machine\.openshift\.io/GPU']>'0')].metadata.name}")"
if [[ -n "$EXISTING_GPU_MACHINESET" ]] ; then
  echo "Machine-set for GPU already exists"
  oc get machinesets -A --show-labels
  exit 0
fi

# Select the first machineset as a template for the GPU machineset.
# All expansions are quoted (SC2086): the original left them unquoted,
# which breaks on paths containing spaces.
SOURCE_MACHINESET=$(oc get machineset -n openshift-machine-api -o name | head -n1)
oc get -o yaml -n openshift-machine-api "$SOURCE_MACHINESET" > "$MACHINESET_PATH"

# Rename the machineset in the template file (worker -> gpu)
OLD_MACHINESET_NAME=$(yq '.metadata.name' "$MACHINESET_PATH")
NEW_MACHINESET_NAME=${OLD_MACHINESET_NAME/worker/gpu}
sed -i'' -e "s/$OLD_MACHINESET_NAME/$NEW_MACHINESET_NAME/g" "$MACHINESET_PATH"

# Set the desired node flavor in the kustomize overlay
sed -i'' -e "s/INSTANCE_TYPE/$INSTANCE_TYPE/g" "$KUSTOMIZE_PATH/overlays/$PROVIDER/gpu.yaml"

# Create the new MachineSet using kustomize
oc apply --kustomize "$KUSTOMIZE_PATH/overlays/$PROVIDER"

# Add GPU label to the new machine-set
oc patch machinesets -n openshift-machine-api "$NEW_MACHINESET_NAME" -p '{"metadata":{"labels":{"gpu-machineset":"true"}}}' --type=merge
42 changes: 0 additions & 42 deletions ods_ci/tasks/Resources/Provisioning/Hive/AWS/provision-gpu.sh

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Base kustomization: wraps the dynamically generated machineset manifest
# (written by ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh).
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - source-machineset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# PLACEHOLDER - the content is dynamically generated by ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# JSON6902 patch: set the machineset's instance type.
# The INSTANCE_TYPE placeholder is substituted by provision-gpu.sh before
# 'oc apply --kustomize' is run.
- op: replace
  path: /spec/template/spec/providerSpec/value/instanceType
  value: INSTANCE_TYPE
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Provider overlay: applies the instance-type patch (gpu.yaml) on top of the
# base machineset manifest.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
metadata:
  name: add-gpu
resources:
  - ../../base/
patches:
  - path: gpu.yaml
    target:
      kind: MachineSet
Loading

0 comments on commit a7b44a7

Please sign in to comment.