Skip to content

Commit

Permalink
Add gpu node pool and nvidia-driver daemonset in prod (#4387)
Browse files Browse the repository at this point in the history
add gpu node pool and nvidia-driver daemonset in prod
  • Loading branch information
AntFMoJ authored May 23, 2024
1 parent b8300c7 commit 8689550
Show file tree
Hide file tree
Showing 7 changed files with 175 additions and 0 deletions.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,37 @@ module "eks" {
metadata_http_tokens = "required"
metadata_http_put_response_hop_limit = 1
}
# GPU node group. Nodes carry a gpu-compute=true label and a gpu-compute=true
# NO_SCHEDULE taint.
gpu_node_pool = {
  create_launch_template = true
  name_prefix            = var.eks_node_group_name_prefix

  # Instance shape and AMI flavour are supplied by the *_gpu_node variables.
  instance_types = var.eks_node_group_instance_types_gpu_node
  ami_type       = var.eks_node_group_ami_type_gpu_node

  # Scaling bounds, read from the "min" / "desired" / "max" keys of the map.
  min_capacity     = var.eks_node_group_capacities_gpu_node["min"]
  desired_capacity = var.eks_node_group_capacities_gpu_node["desired"]
  max_capacity     = var.eks_node_group_capacities_gpu_node["max"]

  # Instance metadata options: endpoint enabled, tokens required, hop limit 1.
  metadata_http_endpoint               = "enabled"
  metadata_http_tokens                 = "required"
  metadata_http_put_response_hop_limit = 1

  update_config = {
    max_unavailable = 1
  }

  k8s_labels = {
    gpu-compute = "true"
  }

  taints = [
    {
      key    = "gpu-compute"
      value  = "true"
      effect = "NO_SCHEDULE"
    }
  ]
}
}

workers_additional_policies = ["arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Applies the NVIDIA device plugin manifest stored under src/kubernetes/ to the
# cluster as a single Kubernetes object.
resource "kubernetes_manifest" "nvidia_device_plugin" {
  # Anchor the path to the directory containing this file rather than the
  # process working directory, so the manifest still resolves when this
  # directory is consumed as a child module.
  manifest = yamldecode(file("${path.module}/src/kubernetes/nvidia-device-plugin.yml"))
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# DaemonSet running the NVIDIA device plugin container on nodes labelled
# gpu-compute: "true".
apiVersion: apps/v1
kind: DaemonSet
metadata:
  namespace: kube-system
  name: nvidia-device-plugin-daemonset
spec:
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-ds
    spec:
      # Scheduled as a node-critical add-on. See
      # https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: system-node-critical
      # @ministryofjustice/analytical-platform addition: restrict the plugin to
      # nodes carrying the gpu-compute label.
      nodeSelector:
        gpu-compute: "true"
      tolerations:
        - effect: NoSchedule
          key: nvidia.com/gpu
          operator: Exists
        # @ministryofjustice/analytical-platform addition: tolerate the
        # gpu-compute taint.
        - effect: NoSchedule
          key: gpu-compute
          operator: Exists
      containers:
        - name: nvidia-device-plugin-ctr
          image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
          env:
            - name: FAIL_ON_INIT_ERROR
              value: "false"
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
          volumeMounts:
            # Host directory shared with the container at the same path.
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
12 changes: 12 additions & 0 deletions terraform/aws/analytical-platform-production/cluster/terraform.tf
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ terraform {
source = "hashicorp/tls"
version = "4.0.5"
}
# Helm provider requirement: hashicorp/helm pinned to the exact release 2.13.2.
helm = {
source = "hashicorp/helm"
version = "2.13.2"
}
}
required_version = "~> 1.5"
}
Expand Down Expand Up @@ -88,4 +92,12 @@ provider "kubernetes" {
token = data.aws_eks_cluster_auth.cluster.token
}

# Helm provider configuration. Connection details (API endpoint, CA bundle and
# bearer token) are read from the aws_eks_cluster / aws_eks_cluster_auth data
# sources named "cluster".
provider "helm" {
  kubernetes {
    token                  = data.aws_eks_cluster_auth.cluster.token
    cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority[0].data)
    host                   = data.aws_eks_cluster.cluster.endpoint
  }
}

# Random provider, declared with an empty configuration block.
provider "random" {}
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,16 @@ eks_node_group_ami_type = "AL2_x86_64"
eks_node_group_disk_size = 250
eks_node_group_instance_types = ["r5.2xlarge"]

### GPU-enabled node group
# AMI type and instance types for the GPU node group.
eks_node_group_ami_type_gpu_node       = "AL2_x86_64_GPU"
eks_node_group_instance_types_gpu_node = ["p3.2xlarge"]

# Desired size 0, minimum 0, maximum 2.
eks_node_group_capacities_gpu_node = {
  min     = 0
  desired = 0
  max     = 2
}

##################################################
# Control Panel
##################################################
Expand Down
15 changes: 15 additions & 0 deletions terraform/aws/analytical-platform-production/cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,21 @@ variable "eks_node_group_capacities" {
description = "The desired capacities for the EKS node group"
}

# Scaling bounds for the EKS GPU node group. The node group definition indexes
# this map with the keys "desired", "max" and "min".
variable "eks_node_group_capacities_gpu_node" {
  type        = map(number)
  description = "The desired capacities for the EKS GPU node group"

  # Reject maps missing a required key up front, with a clear message, instead
  # of failing later with an invalid-index error where the map is read.
  validation {
    condition = alltrue([
      for k in ["desired", "max", "min"] : contains(keys(var.eks_node_group_capacities_gpu_node), k)
    ])
    error_message = "The map must contain the keys \"desired\", \"max\" and \"min\"."
  }

  # try() turns a missing-key error into a plain validation failure.
  validation {
    condition = try(
      var.eks_node_group_capacities_gpu_node["min"] <= var.eks_node_group_capacities_gpu_node["desired"] &&
      var.eks_node_group_capacities_gpu_node["desired"] <= var.eks_node_group_capacities_gpu_node["max"],
      false
    )
    error_message = "Capacities must satisfy min <= desired <= max."
  }
}

# List of instance type names for the EKS GPU node group.
variable "eks_node_group_instance_types_gpu_node" {
  description = "The instance types for the EKS GPU node group"
  type        = list(string)
}

# AMI type identifier for the EKS GPU node group.
variable "eks_node_group_ami_type_gpu_node" {
  description = "The type of AMI to use for the EKS GPU node group"
  type        = string
}

##################################################
# AWS SSO
##################################################
Expand Down

0 comments on commit 8689550

Please sign in to comment.