From c308bb41396cf9e00e41deddccb44451e05e095b Mon Sep 17 00:00:00 2001
From: Pradipta Banerjee
Date: Sun, 26 Nov 2023 19:34:33 +0530
Subject: [PATCH] podvm: An example "gpu" addon

This is an example addon to configure NVIDIA GPU support when building the
podvm image using packer.

Signed-off-by: Pradipta Banerjee
---
 podvm/addons/gpu/README.md | 75 ++++++++++++++++++++++++++++++++++++++
 podvm/addons/gpu/setup.sh  | 50 +++++++++++++++++++++++++
 2 files changed, 125 insertions(+)
 create mode 100644 podvm/addons/gpu/README.md
 create mode 100755 podvm/addons/gpu/setup.sh

diff --git a/podvm/addons/gpu/README.md b/podvm/addons/gpu/README.md
new file mode 100644
index 0000000000..7df4b7f856
--- /dev/null
+++ b/podvm/addons/gpu/README.md
@@ -0,0 +1,75 @@
+## Introduction
+
+This addon enables NVIDIA GPU support in the podvm image.
+
+You need to specify the GPU instance types in the cloud-api-adaptor configMap (peer-pods-cm).
+
+Here is an example. Replace the values as appropriate for your provider and region:
+
+```
+# For AWS
+PODVM_INSTANCE_TYPES: "t3.small,c5.xlarge,p3.2xlarge"
+
+# For Azure
+AZURE_INSTANCE_SIZES: "Standard_D8as_v5,Standard_D4as_v5,Standard_NC6s_v3,Standard_NC4as_T4_v3"
+```
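+
+The values are read by cloud-api-adaptor from the peer-pods-cm configMap, so the list can
+also be updated on a running cluster. A minimal sketch, assuming the operator's default
+`confidential-containers-system` namespace and `cloud-api-adaptor-daemonset` name (adjust
+both for your deployment):
+
+```
+# Namespace and daemonset name below are assumed defaults; adjust for your deployment.
+kubectl -n confidential-containers-system patch configmap peer-pods-cm --type merge \
+  -p '{"data":{"AZURE_INSTANCE_SIZES":"Standard_D8as_v5,Standard_D4as_v5,Standard_NC6s_v3,Standard_NC4as_T4_v3"}}'
+kubectl -n confidential-containers-system rollout restart daemonset cloud-api-adaptor-daemonset
+```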
+
+Example pod definition:
+```
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpu-test
+  labels:
+    app: test
+  annotations:
+    io.katacontainers.config.hypervisor.machine_type: Standard_NC4as_T4_v3
+spec:
+  runtimeClassName: kata-remote
+  containers:
+  - name: ubuntu
+    image: ubuntu
+    command: ["sleep"]
+    args: ["infinity"]
+    env:
+    - name: NVIDIA_VISIBLE_DEVICES
+      value: "all"
+```
+
+You can verify the GPU devices by exec-ing a shell into the pod as shown below:
+
+```
+$ kubectl exec -it gpu-test -- bash
+root@gpu-test:/# nvidia-smi
+Thu Nov 23 17:30:58 2023
++---------------------------------------------------------------------------------------+
+| NVIDIA-SMI 535.129.03    Driver Version: 535.129.03    CUDA Version: 12.2             |
+|-----------------------------------------+----------------------+----------------------+
+| GPU  Name        Persistence-M          | Bus-Id        Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp  Perf  Pwr:Usage/Cap          |         Memory-Usage | GPU-Util  Compute M. |
+|                                         |                      |               MIG M. |
+|=========================================+======================+======================|
+|   0  Tesla T4                       Off | 00000001:00:00.0 Off |                  Off |
+| N/A   36C  P8    9W / 70W               |      2MiB / 16384MiB |      0%      Default |
+|                                         |                      |                  N/A |
++-----------------------------------------+----------------------+----------------------+
+
++---------------------------------------------------------------------------------------+
+| Processes:                                                                             |
+|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
+|        ID   ID                                                             Usage      |
+|=======================================================================================|
+|  No running processes found                                                           |
++---------------------------------------------------------------------------------------+
+
+root@gpu-test:/# nvidia-smi -L
+GPU 0: Tesla T4 (UUID: GPU-2b9a9945-a56c-fcf3-7156-8e380cf1d0cc)
+
+root@gpu-test:/# ls -l /dev/nvidia*
+crw-rw-rw- 1 root root 235,   0 Nov 23 17:27 /dev/nvidia-uvm
+crw-rw-rw- 1 root root 235,   1 Nov 23 17:27 /dev/nvidia-uvm-tools
+crw-rw-rw- 1 root root 195,   0 Nov 23 17:27 /dev/nvidia0
+crw-rw-rw- 1 root root 195, 255 Nov 23 17:27 /dev/nvidiactl
+```
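+
+The devices listed above are injected by the nvidia-container-toolkit prestart hook that
+`setup.sh` installs, based on the pod's `NVIDIA_VISIBLE_DEVICES` environment variable.
+Following standard NVIDIA container toolkit semantics (not specific to this addon), the
+variable also accepts GPU indices or UUIDs instead of `all`. For example, to expose only
+the first GPU, change the `env` entry in the pod definition above to:
+
+```
+    env:
+    - name: NVIDIA_VISIBLE_DEVICES
+      value: "0"   # a GPU index, or a UUID as reported by nvidia-smi -L
+```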
diff --git a/podvm/addons/gpu/setup.sh b/podvm/addons/gpu/setup.sh
new file mode 100755
index 0000000000..29797c48ed
--- /dev/null
+++ b/podvm/addons/gpu/setup.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+# Create the prestart hook directory
+mkdir -p /usr/share/oci/hooks/prestart
+
+# Add the hook script that runs the NVIDIA container toolkit for each container
+cat <<'END' > /usr/share/oci/hooks/prestart/nvidia-container-toolkit.sh
+#!/bin/bash -x
+
+/usr/bin/nvidia-container-toolkit -debug "$@"
+END
+
+# Make the hook script executable
+chmod +x /usr/share/oci/hooks/prestart/nvidia-container-toolkit.sh
+
+# The PODVM_DISTRO variable is set as part of the podvm image build process
+# and is available inside the packer VM.
+# Add the NVIDIA packages.
+if [[ "$PODVM_DISTRO" == "ubuntu" ]]; then
+    export DEBIAN_FRONTEND=noninteractive
+    distribution=$(. /etc/os-release; echo $ID$VERSION_ID)
+    curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+    curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+    apt-get -q update -y
+    apt-get -q install -y nvidia-container-toolkit
+    apt-get -q install -y wget build-essential pkg-config
+    apt-get -q install -y nvidia-driver-530
+
+    # Configure the NVIDIA container runtime for the peer-pod environment:
+    # enable debug logging under /var/log/nvidia-kata-container and disable
+    # cgroup setup, pivot_root and the runtime requirement checks.
+    sed -i "s/#debug/debug/g" /etc/nvidia-container-runtime/config.toml
+    sed -i "s|/var/log|/var/log/nvidia-kata-container|g" /etc/nvidia-container-runtime/config.toml
+    sed -i "s/#no-cgroups = false/no-cgroups = true/g" /etc/nvidia-container-runtime/config.toml
+    sed -i "/\[nvidia-container-cli\]/a no-pivot = true" /etc/nvidia-container-runtime/config.toml
+    sed -i "s/disable-require = false/disable-require = true/g" /etc/nvidia-container-runtime/config.toml
+
+    # build-essential was only needed while installing the driver
+    apt remove -q -y build-essential
+fi
+
+if [[ "$PODVM_DISTRO" == "rhel" ]]; then
+    dnf config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
+    # Kernel headers matching the running kernel are needed to build the NVIDIA modules
+    dnf install -q -y kernel-devel-"$(uname -r)" kernel-headers-"$(uname -r)"
+
+    dnf install -q -y nvidia-container-toolkit
+    dnf -q -y module install nvidia-driver:latest
+
+    # Same NVIDIA container runtime configuration as for Ubuntu above
+    sed -i "s/#debug/debug/g" /etc/nvidia-container-runtime/config.toml
+    sed -i "s|/var/log|/var/log/nvidia-kata-container|g" /etc/nvidia-container-runtime/config.toml
+    sed -i "s/#no-cgroups = false/no-cgroups = true/g" /etc/nvidia-container-runtime/config.toml
+    sed -i "/\[nvidia-container-cli\]/a no-pivot = true" /etc/nvidia-container-runtime/config.toml
+    sed -i "s/disable-require = false/disable-require = true/g" /etc/nvidia-container-runtime/config.toml
+fi
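+
+# Optional sanity check (illustrative; not required by the addon): warn during
+# the image build if the toolkit binary invoked by the prestart hook is missing.
+if ! command -v nvidia-container-toolkit >/dev/null 2>&1; then
+    echo "WARNING: nvidia-container-toolkit not found after package installation" >&2
+fi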