Add GPU (CUDA) support #15

Merged · 7 commits · Apr 10, 2024
17 changes: 11 additions & 6 deletions paka/cluster/aws/eks.py
@@ -14,6 +14,7 @@
from paka.cluster.keda import create_keda
from paka.cluster.knative import create_knative_and_istio
from paka.cluster.namespace import create_namespace
from paka.cluster.nvidia_device_plugin import install_nvidia_device_plugin
from paka.cluster.prometheus import create_prometheus
from paka.cluster.qdrant import create_qdrant
from paka.cluster.redis import create_redis
@@ -79,10 +80,6 @@ def create_node_group_for_model_group(
        node_group_name=f"{project}-{kubify_name(model_group.name)}-group",
        cluster=cluster,
        instance_types=[model_group.nodeType],
        # Set the desired size of the node group to the minimum number of instances
        # specified for the model group.
        # Note: Scaling down to 0 is not supported, since cold starting time is
        # too long for model group services.
        scaling_config=aws.eks.NodeGroupScalingConfigArgs(
            desired_size=model_group.minInstances,
            min_size=model_group.minInstances,
Expand All @@ -95,8 +92,6 @@ def create_node_group_for_model_group(
},
node_role_arn=worker_role.arn,
subnet_ids=vpc.private_subnet_ids,
# Apply taints to ensure that only pods belonging to the same model group
# can be scheduled on this node group.
taints=[
aws.eks.NodeGroupTaintArgs(
effect="NO_SCHEDULE", key="app", value="model-group"
@@ -105,6 +100,13 @@
                effect="NO_SCHEDULE", key="model", value=model_group.name
            ),
        ],
        # Supported AMI types: https://docs.aws.amazon.com/eks/latest/APIReference/API_Nodegroup.html#AmazonEKS-Type-Nodegroup-amiType
        ami_type=("AL2_x86_64_GPU" if model_group.awsGpu else None),
        disk_size=(
            model_group.awsGpu.diskSize
            if model_group.awsGpu
            else model_group.diskSize
        ),
    )


@@ -301,6 +303,9 @@ def create_eks_resources(kubeconfig_json: str) -> None:
    enable_cloudwatch(config, k8s_provider)
    create_prometheus(config, k8s_provider)
    create_zipkin(config, k8s_provider)
    # Install the NVIDIA device plugin for GPU support.
    # Even if the cluster doesn't have GPUs, this won't cause any issues.
    install_nvidia_device_plugin(k8s_provider)

    # TODO: Set timeout to be the one used by knative
    update_elb_idle_timeout(kubeconfig_json, 300)
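These taints repel ordinary pods from the model-group nodes; only pods that tolerate both keys can be scheduled there. As a minimal sketch (not part of this diff; the model name is illustrative), the matching tolerations with the Kubernetes Python client would look like:

    from kubernetes import client

    # Tolerations matching the NO_SCHEDULE taints applied to the node group above.
    tolerations = [
        client.V1Toleration(
            key="app", operator="Equal", value="model-group", effect="NoSchedule"
        ),
        client.V1Toleration(
            key="model", operator="Equal", value="llama2-7b", effect="NoSchedule"
        ),
    ]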
87 changes: 87 additions & 0 deletions paka/cluster/nvidia_device_plugin.py
@@ -0,0 +1,87 @@
import pulumi
import pulumi_kubernetes as k8s


def install_nvidia_device_plugin(
    k8s_provider: k8s.Provider, version: str = "v0.15.0-rc.2"
) -> None:
    """
    Installs the NVIDIA device plugin for GPU support in the cluster.

    This function deploys the NVIDIA device plugin to the cluster as a DaemonSet.
    The device plugin allows Kubernetes to discover and manage GPU resources on the nodes.

    Args:
        k8s_provider (k8s.Provider): The Kubernetes provider to use for deploying the device plugin.
        version (str): The device plugin image tag to deploy. Defaults to "v0.15.0-rc.2".

    Returns:
        None
    """

    k8s.apps.v1.DaemonSet(
        "nvidia-device-plugin-daemonset",
        metadata=k8s.meta.v1.ObjectMetaArgs(
            namespace="kube-system",
        ),
        spec=k8s.apps.v1.DaemonSetSpecArgs(
            selector=k8s.meta.v1.LabelSelectorArgs(
                match_labels={
                    "name": "nvidia-device-plugin-ds",
                },
            ),
            update_strategy=k8s.apps.v1.DaemonSetUpdateStrategyArgs(
                type="RollingUpdate",
            ),
            template=k8s.core.v1.PodTemplateSpecArgs(
                metadata=k8s.meta.v1.ObjectMetaArgs(
                    labels={
                        "name": "nvidia-device-plugin-ds",
                    },
                ),
                spec=k8s.core.v1.PodSpecArgs(
                    tolerations=[
                        k8s.core.v1.TolerationArgs(
                            key="nvidia.com/gpu",
                            operator="Exists",
                            effect="NoSchedule",
                        ),
                        # Tolerate all taints so the plugin can run on every node.
                        k8s.core.v1.TolerationArgs(operator="Exists"),
                    ],
                    priority_class_name="system-node-critical",
                    containers=[
                        k8s.core.v1.ContainerArgs(
                            image=f"nvcr.io/nvidia/k8s-device-plugin:{version}",
                            name="nvidia-device-plugin-ctr",
                            env=[
                                k8s.core.v1.EnvVarArgs(
                                    name="FAIL_ON_INIT_ERROR",
                                    value="false",
                                )
                            ],
                            security_context=k8s.core.v1.SecurityContextArgs(
                                allow_privilege_escalation=False,
                                capabilities=k8s.core.v1.CapabilitiesArgs(
                                    drop=["ALL"],
                                ),
                            ),
                            volume_mounts=[
                                k8s.core.v1.VolumeMountArgs(
                                    name="device-plugin",
                                    mount_path="/var/lib/kubelet/device-plugins",
                                )
                            ],
                        )
                    ],
                    volumes=[
                        k8s.core.v1.VolumeArgs(
                            name="device-plugin",
                            host_path=k8s.core.v1.HostPathVolumeSourceArgs(
                                path="/var/lib/kubelet/device-plugins",
                            ),
                        )
                    ],
                ),
            ),
        ),
        opts=pulumi.ResourceOptions(provider=k8s_provider),
    )
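For reference, a minimal usage sketch of the new helper (the kubeconfig value is a placeholder and this call site is not part of the diff):

    import pulumi_kubernetes as k8s

    from paka.cluster.nvidia_device_plugin import install_nvidia_device_plugin

    kubeconfig_json = "..."  # placeholder: the cluster's kubeconfig as JSON
    provider = k8s.Provider("k8s-provider", kubeconfig=kubeconfig_json)
    install_nvidia_device_plugin(provider)  # or pin a tag: version="v0.15.0-rc.2"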
67 changes: 65 additions & 2 deletions paka/config.py
@@ -1,5 +1,5 @@
import re
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Union

from pydantic import BaseModel, field_validator, model_validator
from ruamel.yaml import YAML
@@ -33,10 +33,12 @@ class ResourceRequest(BaseModel):
    Attributes:
        cpu (str): The amount of CPU to request.
        memory (str): The amount of memory to request.
        gpu (Optional[int]): The number of GPUs to request. Defaults to None.
    """

    cpu: str
    memory: str
    gpu: Optional[int] = None

    @field_validator("cpu", mode="before")
    def validate_cpu(cls, v: str) -> str:
@@ -72,17 +74,78 @@ def validate_memory(cls, v: str) -> str:
        """
        return validate_size(v, "Invalid memory format")

    @field_validator("gpu")
    def validate_gpu(cls, v: Optional[int]) -> Optional[int]:
        """
        Validates the value of the gpu field.

        Args:
            v (Optional[int]): The value of the gpu field.

        Returns:
            Optional[int]: The input value if validation is successful.

        Raises:
            ValueError: If the value is less than 0.
        """
        if v is not None and v < 0:
            raise ValueError("GPU count cannot be less than 0")
        return v


class AwsGpuNode(BaseModel):
    """
    Represents a configuration for an AWS GPU node.

    Attributes:
        diskSize (int): The size of the disk for the GPU node in GB.
    """

    diskSize: int


class GcpGpuNode(BaseModel):
    """
    Represents a Google Cloud Platform GPU node.

    Attributes:
        imageType (str): The type of image used for the GPU node.
        acceleratorType (str): The type of accelerator used for the GPU node.
        acceleratorCount (int): The number of accelerators attached to the GPU node.
        diskType (str): The type of disk used for the GPU node.
        diskSize (int): The size of the disk attached to the GPU node in GB.
    """

    imageType: str
    acceleratorType: str
    acceleratorCount: int
    diskType: str
    diskSize: int


class CloudNode(BaseModel):
    """
    Represents a node in the cloud cluster.

    Attributes:
        nodeType (str): The type of the node.
        diskSize (int): The size of the disk attached to the node in GB.
        awsGpu (Optional[AwsGpuNode]): The AWS GPU node configuration, if applicable.
        gcpGpu (Optional[GcpGpuNode]): The GCP GPU node configuration, if applicable.
    """

    nodeType: str
    diskSize: int = 20
    awsGpu: Optional[AwsGpuNode] = None
    gcpGpu: Optional[GcpGpuNode] = None

    @model_validator(mode="before")
    def validate_gpu(
        cls, values: Dict[str, Union[AwsGpuNode, GcpGpuNode]]
    ) -> Dict[str, Union[AwsGpuNode, GcpGpuNode]]:
        if values.get("awsGpu") and values.get("gcpGpu"):
            raise ValueError("At most one of awsGpu or gcpGpu can exist")
        return values


class ModelGroup(BaseModel):
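The new validator makes awsGpu and gcpGpu mutually exclusive on a node. A short sketch of the resulting behavior (instance type and disk size are illustrative, assuming these classes are imported from paka.config):

    from paka.config import AwsGpuNode, CloudNode

    # Valid: exactly one cloud-specific GPU block.
    node = CloudNode(nodeType="g4dn.xlarge", awsGpu=AwsGpuNode(diskSize=100))

    # Invalid: supplying both awsGpu and gcpGpu fails validation with
    # "At most one of awsGpu or gcpGpu can exist".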
29 changes: 25 additions & 4 deletions paka/kube_resources/model_group/service.py
@@ -8,9 +8,12 @@
from paka.kube_resources.model_group.model import MODEL_PATH_PREFIX, download_model
from paka.utils import kubify_name, read_cluster_data

# We hardcode the image here for now
# `latest` will be stale because of the `IfNotPresent` policy
# We hardcode the image here for now, we can make it configurable later
LLAMA_CPP_PYTHON_IMAGE = "ghcr.io/abetlen/llama-cpp-python:latest"

LLAMA_CPP_PYTHON_CUDA = "jijunleng/llama-cpp-python-cuda:latest"

try_load_kubeconfig()


@@ -115,8 +118,9 @@ def create_pod(
        ],
        "env": [
            client.V1EnvVar(
                name="USE_MLOCK",  # Model weights are locked in RAM or not
                value="0",
                name="N_GPU_LAYERS",
                # -1 means all layers are GPU layers, 0 means no GPU layers
                value=("-1" if model_group.awsGpu else "0"),
            ),
            client.V1EnvVar(
                name="MODEL",
@@ -159,6 +163,17 @@ def create_pod(
        },
    )

    if model_group.awsGpu:
        if "resources" not in container_args:
            container_args["resources"] = client.V1ResourceRequirements()
        if container_args["resources"].limits is None:
            container_args["resources"].limits = {}
        gpu_count = 1
        if model_group.resourceRequest and model_group.resourceRequest.gpu:
            gpu_count = model_group.resourceRequest.gpu
        # We only support NVIDIA GPUs for now.
        container_args["resources"].limits["nvidia.com/gpu"] = gpu_count

    return client.V1Pod(
        metadata=client.V1ObjectMeta(
            name=f"{kubify_name(model_group.name)}",
@@ -488,7 +503,13 @@ def create_model_group_service(

    port = 8000

    pod = create_pod(namespace, config, model_group, LLAMA_CPP_PYTHON_IMAGE, port)
    pod = create_pod(
        namespace,
        config,
        model_group,
        (LLAMA_CPP_PYTHON_CUDA if model_group.awsGpu else LLAMA_CPP_PYTHON_IMAGE),
        port,
    )

    deployment = create_deployment(namespace, model_group, pod)
    apply_resource(deployment)
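Condensed, the GPU wiring above comes down to three decisions: which image to run, how many layers llama.cpp offloads, and the GPU limit on the container. A self-contained sketch of that selection logic (a simplification for illustration, not code from this diff):

    from typing import Optional, Tuple

    LLAMA_CPP_PYTHON_IMAGE = "ghcr.io/abetlen/llama-cpp-python:latest"
    LLAMA_CPP_PYTHON_CUDA = "jijunleng/llama-cpp-python-cuda:latest"

    def select_runtime(
        has_aws_gpu: bool, requested_gpus: Optional[int]
    ) -> Tuple[str, str, int]:
        """Return (image, N_GPU_LAYERS value, nvidia.com/gpu limit)."""
        image = LLAMA_CPP_PYTHON_CUDA if has_aws_gpu else LLAMA_CPP_PYTHON_IMAGE
        n_gpu_layers = "-1" if has_aws_gpu else "0"  # -1 offloads all layers
        gpu_count = requested_gpus if requested_gpus else 1  # default to one GPU
        return image, n_gpu_layers, gpu_count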
@@ -9,6 +9,7 @@ aws:
  logRetentionDays: 14
  modelGroups:
    - nodeType: t2.micro
      diskSize: 20
      name: test-model-group
      minInstances: 1
      maxInstances: 2
32 changes: 32 additions & 0 deletions tests/config/test_config.py
@@ -52,6 +52,11 @@ def test_invalid_memory_resource_request() -> None:
        ResourceRequest(cpu="500m", memory="2G")


def test_invalid_gpu_resource_request() -> None:
    with pytest.raises(ValueError, match="GPU count cannot be less than 0"):
        ResourceRequest(cpu="500m", memory="2Gi", gpu=-1)


def test_model_group() -> None:
    # Test with valid minInstances and maxInstances
    model_group = ModelGroup(name="test", minInstances=1, maxInstances=2)
@@ -190,6 +195,7 @@ def test_parse_yaml() -> None:
      minInstances: 1
      maxInstances: 1
      name: llama2-7b
      awsGpu:
  vectorStore:
    nodeType: t2.small
    replicas: 2
@@ -211,10 +217,36 @@
    assert model_group.minInstances == 1
    assert model_group.maxInstances == 1
    assert model_group.name == "llama2-7b"
    assert model_group.awsGpu is None
    assert config.aws.vectorStore is not None
    assert config.aws.vectorStore.nodeType == "t2.small"
    assert config.aws.vectorStore.replicas == 2

    yaml_str = """
aws:
  cluster:
    name: test_cluster
    region: us-west-2
    nodeType: t2.medium
    minNodes: 2
    maxNodes: 4
  modelGroups:
    - nodeType: c7a.xlarge
      minInstances: 1
      maxInstances: 1
      name: llama2-7b
      awsGpu:
        diskSize: 100
"""
    config = parse_yaml(yaml_str)
    assert isinstance(config, Config)
    assert config.aws is not None
    assert config.aws.modelGroups is not None
    assert len(config.aws.modelGroups) == 1
    model_group = config.aws.modelGroups[0]
    assert model_group.awsGpu is not None
    assert model_group.awsGpu.diskSize == 100


def test_round_trip() -> None:
    original_config = Config(aws=cloud_config)