Skip to content

Commit

Permalink
Create and configure the Nvidia daemon set (#4371)
Browse files Browse the repository at this point in the history
* stuff to get it working

* fix lint issues

* fix lint issues 2

* fix lint issues 3

* fix linting pt.4

---------

Co-authored-by: Anthony Fitzroy <anthony.fitzroy@justice.gov.uk>
Co-authored-by: Anthony Fitzroy <101649764+AntFMoJ@users.noreply.github.com>
  • Loading branch information
3 people authored May 21, 2024
1 parent c6fc2be commit b6d7210
Show file tree
Hide file tree
Showing 6 changed files with 124 additions and 0 deletions.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/*
This is commented out because it didn't work.
It wouldn't schedule properly, so we switched to applying the "simple" option without taints and tolerations.
resource "helm_release" "nvidia_device_plugin" {
name = "nvidia-device-plugin"
repository = "https://nvidia.github.io/k8s-device-plugin"
chart = "nvidia-device-plugin"
version = "0.15.0"
namespace = "kube-system"
values = [
templatefile(
"${path.module}/src/helm/nvidia/values.yml.tftpl", {}
)
]
}
*/
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Applies the NVIDIA device plugin manifest (a single YAML document) to the
# cluster as a raw Kubernetes object.
resource "kubernetes_manifest" "nvidia_device_plugin" {
  # Anchor the path to this module's directory. A bare relative path is
  # resolved against the directory Terraform is invoked from, so it breaks if
  # this configuration is ever called as a child module. This also matches the
  # "${path.module}/src/..." convention used for the Helm values template.
  manifest = yamldecode(file("${path.module}/src/kubernetes/nvidia-device-plugin.yml"))
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
# Scheduling overrides (presumably Helm chart values for the NVIDIA device
# plugin; confirm against the chart that consumes this file).
# Clear any affinity rules; placement is driven by nodeSelector alone.
affinity: {}
# Only schedule onto nodes labelled gpu-compute=true.
nodeSelector:
  gpu-compute: "true"
# Tolerate NoSchedule taints keyed either nvidia.com/gpu or gpu-compute,
# whatever their value (operator: Exists).
tolerations:
  - key: nvidia.com/gpu
    operator: Exists
    effect: NoSchedule
  - key: gpu-compute
    operator: Exists
    effect: NoSchedule
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# DaemonSet that runs the NVIDIA device plugin container on GPU nodes.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        # Must match spec.selector.matchLabels above.
        name: nvidia-device-plugin-ds
    spec:
      # @ministryofjustice/analytical-platform: We added this nodeSelector
      nodeSelector:
        gpu-compute: "true"
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
        # @ministryofjustice/analytical-platform: We added this toleration
        - key: gpu-compute
          operator: Exists
          effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      containers:
        - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
          name: nvidia-device-plugin-ctr
          env:
            - name: FAIL_ON_INIT_ERROR
              value: "false"
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            # Mounts the host path declared under `volumes` below.
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
12 changes: 12 additions & 0 deletions terraform/aws/analytical-platform-development/cluster/terraform.tf
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ terraform {
source = "hashicorp/random"
version = "3.6.1"
}
helm = {
source = "hashicorp/helm"
version = "2.13.2"
}
}
required_version = "~> 1.7"
}
Expand Down Expand Up @@ -73,4 +77,12 @@ provider "kubernetes" {
token = data.aws_eks_cluster_auth.cluster.token
}

# Helm provider: connects to the cluster described by the
# `aws_eks_cluster.cluster` / `aws_eks_cluster_auth.cluster` data sources.
provider "helm" {
  kubernetes {
    # Bearer token from the EKS auth data source.
    token = data.aws_eks_cluster_auth.cluster.token

    # The CA bundle is stored base64-encoded on the cluster data source, so
    # decode it before handing it to the provider.
    cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority[0].data)

    # API server endpoint of the cluster.
    host = data.aws_eks_cluster.cluster.endpoint
  }
}

# The random provider is declared with an empty configuration block.
provider "random" {}

0 comments on commit b6d7210

Please sign in to comment.