From 677f14fd7dd280b511c707ff9fb1eb7092e9cd52 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 31 Oct 2023 18:08:24 -0500 Subject: [PATCH] Updated script --- scripts/nvidia-device-plugin-ds-staging.yaml | 41 ++++++++++++++++++++ scripts/nvidia-device-plugin-ds.yaml | 41 +++++++++----------- 2 files changed, 60 insertions(+), 22 deletions(-) create mode 100644 scripts/nvidia-device-plugin-ds-staging.yaml diff --git a/scripts/nvidia-device-plugin-ds-staging.yaml b/scripts/nvidia-device-plugin-ds-staging.yaml new file mode 100644 index 0000000..00b3151 --- /dev/null +++ b/scripts/nvidia-device-plugin-ds-staging.yaml @@ -0,0 +1,41 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-device-plugin-daemonset + namespace: staging +spec: + selector: + matchLabels: + name: nvidia-device-plugin-ds + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: nvidia-device-plugin-ds + spec: + # Use a nodeSelector to only schedule on GPU nodes + nodeSelector: + accelerator: nvidia + tolerations: + - key: CriticalAddonsOnly + operator: Exists + - key: "hub.jupyter.org_dedicated" + operator: "Equal" + value: "user" + effect: "NoSchedule" + priorityClassName: "system-node-critical" + containers: + - image: mcr.microsoft.com/oss/nvidia/k8s-device-plugin:v0.14.1 + name: nvidia-device-plugin-ctr + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins diff --git a/scripts/nvidia-device-plugin-ds.yaml b/scripts/nvidia-device-plugin-ds.yaml index 8e65ed8..f24c1d3 100644 --- a/scripts/nvidia-device-plugin-ds.yaml +++ b/scripts/nvidia-device-plugin-ds.yaml @@ -14,31 +14,28 @@ spec: labels: name: nvidia-device-plugin-ds spec: - priorityClassName: "system-cluster-critical" + # Use a nodeSelector to only schedule on GPU nodes + nodeSelector: + 
accelerator: nvidia tolerations: - # Allow this pod to be rescheduled while the node is in "critical add-ons only" mode. - # This, along with the annotation above marks this pod as a critical add-on. - - key: CriticalAddonsOnly - operator: Exists - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule - - key: "sku" - operator: "Equal" - value: "gpu" - effect: "NoSchedule" + - key: CriticalAddonsOnly + operator: Exists + - key: "hub.jupyter.org_dedicated" + operator: "Equal" + value: "user" + effect: "NoSchedule" + priorityClassName: "system-node-critical" containers: - - image: mcr.microsoft.com/oss/nvidia/k8s-device-plugin:v0.14.1 - name: nvidia-device-plugin-ctr - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: ["ALL"] - volumeMounts: - - name: device-plugin - mountPath: /var/lib/kubelet/device-plugins + - image: mcr.microsoft.com/oss/nvidia/k8s-device-plugin:v0.14.1 + name: nvidia-device-plugin-ctr + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins volumes: - name: device-plugin hostPath: path: /var/lib/kubelet/device-plugins -