From 95aa895b021820d4e846680a1505882e08a49d46 Mon Sep 17 00:00:00 2001 From: Jijun Leng Date: Tue, 26 Mar 2024 14:49:42 -0700 Subject: [PATCH 1/7] [wip] gpu support --- paka/cluster/aws/eks.py | 10 ++++++++- paka/config.py | 45 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/paka/cluster/aws/eks.py b/paka/cluster/aws/eks.py index f65a268..9344e92 100644 --- a/paka/cluster/aws/eks.py +++ b/paka/cluster/aws/eks.py @@ -1,10 +1,11 @@ -from typing import Optional +from typing import Dict, Optional, Union import pulumi import pulumi_aws as aws import pulumi_awsx as awsx import pulumi_eks as eks import pulumi_kubernetes as k8s +from pulumi import ResourceOptions from paka.cluster.aws.cloudwatch import enable_cloudwatch from paka.cluster.aws.cluster_autoscaler import create_cluster_autoscaler @@ -73,6 +74,12 @@ def create_node_group_for_model_group( project = config.cluster.name for model_group in config.modelGroups: + additional_args: Dict[ + str, Union[str, int, float, bool, ResourceOptions, None] + ] = {} + if model_group.awsGpu is not None: + additional_args["ami_type"] = model_group.awsGpu.amiId + # Create a managed node group for our cluster eks.ManagedNodeGroup( f"{project}-{kubify_name(model_group.name)}-group", @@ -105,6 +112,7 @@ def create_node_group_for_model_group( effect="NO_SCHEDULE", key="model", value=model_group.name ), ], + **additional_args, ) diff --git a/paka/config.py b/paka/config.py index 45c61ef..a9af819 100644 --- a/paka/config.py +++ b/paka/config.py @@ -1,5 +1,5 @@ import re -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union from pydantic import BaseModel, field_validator, model_validator from ruamel.yaml import YAML @@ -73,16 +73,57 @@ def validate_memory(cls, v: str) -> str: return validate_size(v, "Invalid memory format") +class AwsGpuNode(BaseModel): + """ + Represents an AWS GPU node. + + Attributes: + amiId (str): The ID of the Amazon Machine Image (AMI) for the GPU node. + """ + + amiId: str + + +class GcpGpuNode(BaseModel): + """ + Represents a Google Cloud Platform GPU node. + + Attributes: + imageType (str): The type of image used for the GPU node. + acceleratorType (str): The type of accelerator used for the GPU node. + acceleratorCount (int): The number of accelerators attached to the GPU node. + diskType (str): The type of disk used for the GPU node. + diskSize (int): The size of the disk attached to the GPU node. + """ + + imageType: str + acceleratorType: str + acceleratorCount: int + diskType: str + diskSize: int + + class CloudNode(BaseModel): """ Represents a node in the cloud cluster. Attributes: nodeType (str): The type of the node. - + awsGpu (Optional[AwsGpuNode]): The AWS GPU node configuration. + gcpGpu (Optional[GcpGpuNode]): The GCP GPU node configuration. 
""" nodeType: str + awsGpu: Optional[AwsGpuNode] = None + gcpGpu: Optional[GcpGpuNode] = None + + @model_validator(mode="before") + def validate_gpu( + cls, values: Dict[str, Union[AwsGpuNode, GcpGpuNode]] + ) -> Dict[str, Union[AwsGpuNode, GcpGpuNode]]: + if values.get("awsGpu") and values.get("gcpGpu"): + raise ValueError("At most one of awsGpu or gcpGpu can exist") + return values class ModelGroup(BaseModel): From b60a057e9f52231142b42255f4d2a89d873d5d9e Mon Sep 17 00:00:00 2001 From: Jijun Leng Date: Fri, 5 Apr 2024 16:48:15 -0700 Subject: [PATCH 2/7] feat(gpu): run models on cuda GPUs --- paka/cluster/aws/eks.py | 26 +++++++++------------ paka/cluster/nvidia_device_plugin.py | 25 ++++++++++++++++++++ paka/config.py | 6 ++--- paka/kube_resources/model_group/service.py | 27 +++++++++++++++++++--- 4 files changed, 63 insertions(+), 21 deletions(-) create mode 100644 paka/cluster/nvidia_device_plugin.py diff --git a/paka/cluster/aws/eks.py b/paka/cluster/aws/eks.py index 9344e92..4bb87bf 100644 --- a/paka/cluster/aws/eks.py +++ b/paka/cluster/aws/eks.py @@ -1,11 +1,10 @@ -from typing import Dict, Optional, Union +from typing import Optional import pulumi import pulumi_aws as aws import pulumi_awsx as awsx import pulumi_eks as eks import pulumi_kubernetes as k8s -from pulumi import ResourceOptions from paka.cluster.aws.cloudwatch import enable_cloudwatch from paka.cluster.aws.cluster_autoscaler import create_cluster_autoscaler @@ -15,6 +14,7 @@ from paka.cluster.keda import create_keda from paka.cluster.knative import create_knative_and_istio from paka.cluster.namespace import create_namespace +from paka.cluster.nvidia_device_plugin import install_nvidia_device_plugin from paka.cluster.prometheus import create_prometheus from paka.cluster.qdrant import create_qdrant from paka.cluster.redis import create_redis @@ -74,22 +74,12 @@ def create_node_group_for_model_group( project = config.cluster.name for model_group in config.modelGroups: - additional_args: Dict[ - str, Union[str, int, float, bool, ResourceOptions, None] - ] = {} - if model_group.awsGpu is not None: - additional_args["ami_type"] = model_group.awsGpu.amiId - # Create a managed node group for our cluster eks.ManagedNodeGroup( f"{project}-{kubify_name(model_group.name)}-group", node_group_name=f"{project}-{kubify_name(model_group.name)}-group", cluster=cluster, instance_types=[model_group.nodeType], - # Set the desired size of the node group to the minimum number of instances - # specified for the model group. - # Note: Scaling down to 0 is not supported, since cold starting time is - # too long for model group services. scaling_config=aws.eks.NodeGroupScalingConfigArgs( desired_size=model_group.minInstances, min_size=model_group.minInstances, @@ -102,8 +92,6 @@ def create_node_group_for_model_group( }, node_role_arn=worker_role.arn, subnet_ids=vpc.private_subnet_ids, - # Apply taints to ensure that only pods belonging to the same model group - # can be scheduled on this node group. 
taints=[ aws.eks.NodeGroupTaintArgs( effect="NO_SCHEDULE", key="app", value="model-group" @@ -112,7 +100,12 @@ def create_node_group_for_model_group( effect="NO_SCHEDULE", key="model", value=model_group.name ), ], - **additional_args, + # Supported AMI types https://docs.aws.amazon.com/eks/latest/APIReference/API_Nodegroup.html#AmazonEKS-Type-Nodegroup-amiType + ami_type=( + "AL2_x86_64_GPU" + if model_group.awsGpu and model_group.awsGpu.enabled + else None + ), ) @@ -309,6 +302,9 @@ def create_eks_resources(kubeconfig_json: str) -> None: enable_cloudwatch(config, k8s_provider) create_prometheus(config, k8s_provider) create_zipkin(config, k8s_provider) + # Install the NVIDIA device plugin for GPU support + # Even if the cluster doesn't have GPUs, this won't cause any issues + install_nvidia_device_plugin(k8s_provider) # TODO: Set timeout to be the one used by knative update_elb_idle_timeout(kubeconfig_json, 300) diff --git a/paka/cluster/nvidia_device_plugin.py b/paka/cluster/nvidia_device_plugin.py new file mode 100644 index 0000000..2b93f3f --- /dev/null +++ b/paka/cluster/nvidia_device_plugin.py @@ -0,0 +1,25 @@ +import pulumi +import pulumi_kubernetes as k8s + + +def install_nvidia_device_plugin( + k8s_provider: k8s.Provider, version: str = "main" +) -> None: + """ + Installs the NVIDIA device plugin for GPU support in the cluster. + + This function deploys the NVIDIA device plugin to the cluster using a DaemonSet. + The device plugin allows Kubernetes to discover and manage GPU resources on the nodes. + + Args: + k8s_provider (k8s.Provider): The Kubernetes provider to use for deploying the device plugin. + + Returns: + None + """ + # This will install a DaemonSet in the kube-system namespace + k8s.yaml.ConfigFile( + "nvidia-device-plugin", + file=f"https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/{version}/nvidia-device-plugin.yml", + opts=pulumi.ResourceOptions(provider=k8s_provider), + ) diff --git a/paka/config.py b/paka/config.py index a9af819..c386cf2 100644 --- a/paka/config.py +++ b/paka/config.py @@ -75,13 +75,13 @@ def validate_memory(cls, v: str) -> str: class AwsGpuNode(BaseModel): """ - Represents an AWS GPU node. + Represents a configuration for an AWS GPU node. Attributes: - amiId (str): The ID of the Amazon Machine Image (AMI) for the GPU node. + enabled (bool): Indicates whether the GPU node is enabled or not. 
""" - amiId: str + enabled: bool = False class GcpGpuNode(BaseModel): diff --git a/paka/kube_resources/model_group/service.py b/paka/kube_resources/model_group/service.py index 804ff95..b64c0a9 100644 --- a/paka/kube_resources/model_group/service.py +++ b/paka/kube_resources/model_group/service.py @@ -8,9 +8,12 @@ from paka.kube_resources.model_group.model import MODEL_PATH_PREFIX, download_model from paka.utils import kubify_name, read_cluster_data -# We hardcode the image here for now +# `latest` will be stale because of the `IfNotPresent` policy +# We hardcode the image here for now, we can make it configurable later LLAMA_CPP_PYTHON_IMAGE = "ghcr.io/abetlen/llama-cpp-python:latest" +LLAMA_CPP_PYTHON_CUDA = "jijunleng/llama-cpp-python-cuda:latest" + try_load_kubeconfig() @@ -116,7 +119,7 @@ def create_pod( "env": [ client.V1EnvVar( name="USE_MLOCK", # Model weights are locked in RAM or not - value="0", + value="1", ), client.V1EnvVar( name="MODEL", @@ -159,6 +162,14 @@ def create_pod( }, ) + if model_group.awsGpu and model_group.awsGpu.enabled: + if "resources" not in container_args: + container_args["resources"] = client.V1ResourceRequirements( + requests={}, + ) + # Ah, we only support nvidia GPUs for now + container_args["resources"].requests["nvidia.com/gpu"] = 1 + return client.V1Pod( metadata=client.V1ObjectMeta( name=f"{kubify_name(model_group.name)}", @@ -488,7 +499,17 @@ def create_model_group_service( port = 8000 - pod = create_pod(namespace, config, model_group, LLAMA_CPP_PYTHON_IMAGE, port) + pod = create_pod( + namespace, + config, + model_group, + ( + LLAMA_CPP_PYTHON_CUDA + if model_group.awsGpu and model_group.awsGpu.enabled + else LLAMA_CPP_PYTHON_IMAGE + ), + port, + ) deployment = create_deployment(namespace, model_group, pod) apply_resource(deployment) From 195ed1e276827f9af26ee11839cb83bb7f4bf728 Mon Sep 17 00:00:00 2001 From: Jijun Leng Date: Fri, 5 Apr 2024 22:28:58 -0700 Subject: [PATCH 3/7] feat(gpu): make nvidia device plugin tolerate model group taints --- paka/cluster/nvidia_device_plugin.py | 72 ++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 5 deletions(-) diff --git a/paka/cluster/nvidia_device_plugin.py b/paka/cluster/nvidia_device_plugin.py index 2b93f3f..ce0c919 100644 --- a/paka/cluster/nvidia_device_plugin.py +++ b/paka/cluster/nvidia_device_plugin.py @@ -3,7 +3,7 @@ def install_nvidia_device_plugin( - k8s_provider: k8s.Provider, version: str = "main" + k8s_provider: k8s.Provider, version: str = "v0.15.0-rc.2" ) -> None: """ Installs the NVIDIA device plugin for GPU support in the cluster. 
@@ -17,9 +17,71 @@ def install_nvidia_device_plugin( Returns: None """ - # This will install a DaemonSet in the kube-system namespace - k8s.yaml.ConfigFile( - "nvidia-device-plugin", - file=f"https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/{version}/nvidia-device-plugin.yml", + + k8s.apps.v1.DaemonSet( + "nvidia-device-plugin-daemonset", + metadata=k8s.meta.v1.ObjectMetaArgs( + namespace="kube-system", + ), + spec=k8s.apps.v1.DaemonSetSpecArgs( + selector=k8s.meta.v1.LabelSelectorArgs( + match_labels={ + "name": "nvidia-device-plugin-ds", + }, + ), + update_strategy=k8s.apps.v1.DaemonSetUpdateStrategyArgs( + type="RollingUpdate", + ), + template=k8s.core.v1.PodTemplateSpecArgs( + metadata=k8s.meta.v1.ObjectMetaArgs( + labels={ + "name": "nvidia-device-plugin-ds", + }, + ), + spec=k8s.core.v1.PodSpecArgs( + tolerations=[ + k8s.core.v1.TolerationArgs( + key="nvidia.com/gpu", + operator="Exists", + effect="NoSchedule", + ), + k8s.core.v1.TolerationArgs(operator="Exists"), + ], + priority_class_name="system-node-critical", + containers=[ + k8s.core.v1.ContainerArgs( + image=f"nvcr.io/nvidia/k8s-device-plugin:{version}", + name="nvidia-device-plugin-ctr", + env=[ + k8s.core.v1.EnvVarArgs( + name="FAIL_ON_INIT_ERROR", + value="false", + ) + ], + security_context=k8s.core.v1.SecurityContextArgs( + allow_privilege_escalation=False, + capabilities=k8s.core.v1.CapabilitiesArgs( + drop=["ALL"], + ), + ), + volume_mounts=[ + k8s.core.v1.VolumeMountArgs( + name="device-plugin", + mount_path="/var/lib/kubelet/device-plugins", + ) + ], + ) + ], + volumes=[ + k8s.core.v1.VolumeArgs( + name="device-plugin", + host_path=k8s.core.v1.HostPathVolumeSourceArgs( + path="/var/lib/kubelet/device-plugins", + ), + ) + ], + ), + ), + ), opts=pulumi.ResourceOptions(provider=k8s_provider), ) From 81c034ee3e30155f6d02aa09a7d01ff1da1fdcc9 Mon Sep 17 00:00:00 2001 From: Jijun Leng Date: Tue, 9 Apr 2024 15:59:49 -0700 Subject: [PATCH 4/7] feat(gpu): set n_gpu_layers to offload work to gpu for the llama.cpp runtime --- paka/kube_resources/model_group/service.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/paka/kube_resources/model_group/service.py b/paka/kube_resources/model_group/service.py index b64c0a9..6965116 100644 --- a/paka/kube_resources/model_group/service.py +++ b/paka/kube_resources/model_group/service.py @@ -118,8 +118,11 @@ def create_pod( ], "env": [ client.V1EnvVar( - name="USE_MLOCK", # Model weights are locked in RAM or not - value="1", + name="N_GPU_LAYERS", + # -1 means all layers are GPU layers, 0 means no GPU layers + value=( + "-1" if model_group.awsGpu and model_group.awsGpu.enabled else "0" + ), ), client.V1EnvVar( name="MODEL", @@ -164,11 +167,11 @@ def create_pod( if model_group.awsGpu and model_group.awsGpu.enabled: if "resources" not in container_args: - container_args["resources"] = client.V1ResourceRequirements( - requests={}, - ) + container_args["resources"] = client.V1ResourceRequirements() + if container_args["resources"].limits is None: + container_args["resources"].limits = {} # Ah, we only support nvidia GPUs for now - container_args["resources"].requests["nvidia.com/gpu"] = 1 + container_args["resources"].limits["nvidia.com/gpu"] = 1 return client.V1Pod( metadata=client.V1ObjectMeta( From b1b39f7e9a449e44ad4640d39e65963d43853d0f Mon Sep 17 00:00:00 2001 From: Jijun Leng Date: Tue, 9 Apr 2024 16:03:30 -0700 Subject: [PATCH 5/7] feat(gpu): larger disk for gpu nodes --- paka/cluster/aws/eks.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/paka/cluster/aws/eks.py b/paka/cluster/aws/eks.py
index 4bb87bf..16af57e 100644
--- a/paka/cluster/aws/eks.py
+++ b/paka/cluster/aws/eks.py
@@ -106,6 +106,7 @@ def create_node_group_for_model_group(
             if model_group.awsGpu and model_group.awsGpu.enabled
             else None
         ),
+        disk_size=40 if model_group.awsGpu and model_group.awsGpu.enabled else 20,
     )
 
 
From 864397c86357d9fe96310867f6f2b0727b8d5dfe Mon Sep 17 00:00:00 2001
From: Jijun Leng
Date: Tue, 9 Apr 2024 17:24:33 -0700
Subject: [PATCH 6/7] feat(gpu): make model group node disk size configurable

---
 paka/cluster/aws/eks.py                    | 10 +++----
 paka/config.py                             | 12 +++++----
 paka/kube_resources/model_group/service.py | 12 +++------
 .../test_config/test_aws_yaml/aws_yaml.txt  |  1 +
 tests/config/test_config.py                | 27 +++++++++++++++++++
 5 files changed, 43 insertions(+), 19 deletions(-)

diff --git a/paka/cluster/aws/eks.py b/paka/cluster/aws/eks.py
index 16af57e..c10eef8 100644
--- a/paka/cluster/aws/eks.py
+++ b/paka/cluster/aws/eks.py
@@ -101,12 +101,12 @@ def create_node_group_for_model_group(
             ),
         ],
         # Supported AMI types https://docs.aws.amazon.com/eks/latest/APIReference/API_Nodegroup.html#AmazonEKS-Type-Nodegroup-amiType
-        ami_type=(
-            "AL2_x86_64_GPU"
-            if model_group.awsGpu and model_group.awsGpu.enabled
-            else None
+        ami_type=("AL2_x86_64_GPU" if model_group.awsGpu else None),
+        disk_size=(
+            model_group.awsGpu.diskSize
+            if model_group.awsGpu
+            else model_group.diskSize
         ),
-        disk_size=40 if model_group.awsGpu and model_group.awsGpu.enabled else 20,
     )
 
 
diff --git a/paka/config.py b/paka/config.py
index c386cf2..e8b8d85 100644
--- a/paka/config.py
+++ b/paka/config.py
@@ -78,10 +78,10 @@ class AwsGpuNode(BaseModel):
     Represents a configuration for an AWS GPU node.
 
     Attributes:
-        enabled (bool): Indicates whether the GPU node is enabled or not.
+        diskSize (int): The size of the disk for the GPU node in GB.
     """
 
-    enabled: bool = False
+    diskSize: int
 
 
 class GcpGpuNode(BaseModel):
@@ -93,7 +93,7 @@ class GcpGpuNode(BaseModel):
         acceleratorType (str): The type of accelerator used for the GPU node.
         acceleratorCount (int): The number of accelerators attached to the GPU node.
         diskType (str): The type of disk used for the GPU node.
-        diskSize (int): The size of the disk attached to the GPU node.
+        diskSize (int): The size of the disk attached to the GPU node in GB.
     """
 
     imageType: str
@@ -109,11 +109,13 @@ class CloudNode(BaseModel):
 
     Attributes:
         nodeType (str): The type of the node.
-        awsGpu (Optional[AwsGpuNode]): The AWS GPU node configuration.
-        gcpGpu (Optional[GcpGpuNode]): The GCP GPU node configuration.
+        diskSize (int): The size of the disk attached to the node in GB.
+        awsGpu (Optional[AwsGpuNode]): The AWS GPU node configuration, if applicable.
+        gcpGpu (Optional[GcpGpuNode]): The GCP GPU node configuration, if applicable.
""" nodeType: str + diskSize: int = 20 awsGpu: Optional[AwsGpuNode] = None gcpGpu: Optional[GcpGpuNode] = None diff --git a/paka/kube_resources/model_group/service.py b/paka/kube_resources/model_group/service.py index 6965116..73a0246 100644 --- a/paka/kube_resources/model_group/service.py +++ b/paka/kube_resources/model_group/service.py @@ -120,9 +120,7 @@ def create_pod( client.V1EnvVar( name="N_GPU_LAYERS", # -1 means all layers are GPU layers, 0 means no GPU layers - value=( - "-1" if model_group.awsGpu and model_group.awsGpu.enabled else "0" - ), + value=("-1" if model_group.awsGpu else "0"), ), client.V1EnvVar( name="MODEL", @@ -165,7 +163,7 @@ def create_pod( }, ) - if model_group.awsGpu and model_group.awsGpu.enabled: + if model_group.awsGpu: if "resources" not in container_args: container_args["resources"] = client.V1ResourceRequirements() if container_args["resources"].limits is None: @@ -506,11 +504,7 @@ def create_model_group_service( namespace, config, model_group, - ( - LLAMA_CPP_PYTHON_CUDA - if model_group.awsGpu and model_group.awsGpu.enabled - else LLAMA_CPP_PYTHON_IMAGE - ), + (LLAMA_CPP_PYTHON_CUDA if model_group.awsGpu else LLAMA_CPP_PYTHON_IMAGE), port, ) diff --git a/tests/config/snapshots/test_config/test_aws_yaml/aws_yaml.txt b/tests/config/snapshots/test_config/test_aws_yaml/aws_yaml.txt index 917e48e..8a0bfa2 100644 --- a/tests/config/snapshots/test_config/test_aws_yaml/aws_yaml.txt +++ b/tests/config/snapshots/test_config/test_aws_yaml/aws_yaml.txt @@ -9,6 +9,7 @@ aws: logRetentionDays: 14 modelGroups: - nodeType: t2.micro + diskSize: 20 name: test-model-group minInstances: 1 maxInstances: 2 diff --git a/tests/config/test_config.py b/tests/config/test_config.py index 401675d..ddcfa9d 100644 --- a/tests/config/test_config.py +++ b/tests/config/test_config.py @@ -190,6 +190,7 @@ def test_parse_yaml() -> None: minInstances: 1 maxInstances: 1 name: llama2-7b + awsGpu: vectorStore: nodeType: t2.small replicas: 2 @@ -211,10 +212,36 @@ def test_parse_yaml() -> None: assert model_group.minInstances == 1 assert model_group.maxInstances == 1 assert model_group.name == "llama2-7b" + assert model_group.awsGpu is None assert config.aws.vectorStore is not None assert config.aws.vectorStore.nodeType == "t2.small" assert config.aws.vectorStore.replicas == 2 + yaml_str = """ + aws: + cluster: + name: test_cluster + region: us-west-2 + nodeType: t2.medium + minNodes: 2 + maxNodes: 4 + modelGroups: + - nodeType: c7a.xlarge + minInstances: 1 + maxInstances: 1 + name: llama2-7b + awsGpu: + diskSize: 100 + """ + config = parse_yaml(yaml_str) + assert isinstance(config, Config) + assert config.aws is not None + assert config.aws.modelGroups is not None + assert len(config.aws.modelGroups) == 1 + model_group = config.aws.modelGroups[0] + assert model_group.awsGpu is not None + assert model_group.awsGpu.diskSize == 100 + def test_round_trip() -> None: original_config = Config(aws=cloud_config) From 6a255ba2c637cd0f7ece08224a89599973c7c27a Mon Sep 17 00:00:00 2001 From: Jijun Leng Date: Tue, 9 Apr 2024 23:15:42 -0700 Subject: [PATCH 7/7] feat(gpu): be able to request a number of GPUs through config --- paka/config.py | 20 +++++ paka/kube_resources/model_group/service.py | 5 +- tests/config/test_config.py | 5 ++ tests/model_group/test_service.py | 98 ++++++++++++++++++++++ 4 files changed, 127 insertions(+), 1 deletion(-) create mode 100644 tests/model_group/test_service.py diff --git a/paka/config.py b/paka/config.py index e8b8d85..2808827 100644 --- a/paka/config.py +++ b/paka/config.py 
@@ -33,10 +33,12 @@ class ResourceRequest(BaseModel): Attributes: cpu (str): The amount of CPU to request. memory (str): The amount of memory to request. + gpu (Optional[int]): The number of GPUs to request. Defaults to None. """ cpu: str memory: str + gpu: Optional[int] = None @field_validator("cpu", mode="before") def validate_cpu(cls, v: str) -> str: @@ -72,6 +74,24 @@ def validate_memory(cls, v: str) -> str: """ return validate_size(v, "Invalid memory format") + @field_validator("gpu") + def validate_gpu(cls, v: Optional[int]) -> Optional[int]: + """ + Validates the value of the gpu field. + + Args: + v (Optional[int]): The value of the gpu field. + + Returns: + Optional[int]: The input value if validation is successful. + + Raises: + ValueError: If the value is less than 0. + """ + if v is not None and v < 0: + raise ValueError("GPU count cannot be less than 0") + return v + class AwsGpuNode(BaseModel): """ diff --git a/paka/kube_resources/model_group/service.py b/paka/kube_resources/model_group/service.py index 73a0246..2e2d92c 100644 --- a/paka/kube_resources/model_group/service.py +++ b/paka/kube_resources/model_group/service.py @@ -168,8 +168,11 @@ def create_pod( container_args["resources"] = client.V1ResourceRequirements() if container_args["resources"].limits is None: container_args["resources"].limits = {} + gpu_count = 1 + if model_group.resourceRequest and model_group.resourceRequest.gpu: + gpu_count = model_group.resourceRequest.gpu # Ah, we only support nvidia GPUs for now - container_args["resources"].limits["nvidia.com/gpu"] = 1 + container_args["resources"].limits["nvidia.com/gpu"] = gpu_count return client.V1Pod( metadata=client.V1ObjectMeta( diff --git a/tests/config/test_config.py b/tests/config/test_config.py index ddcfa9d..1275813 100644 --- a/tests/config/test_config.py +++ b/tests/config/test_config.py @@ -52,6 +52,11 @@ def test_invalid_memory_resource_request() -> None: ResourceRequest(cpu="500m", memory="2G") +def test_invalid_gpu_resource_request() -> None: + with pytest.raises(ValueError, match="GPU count cannot be less than 0"): + ResourceRequest(cpu="500m", memory="2Gi", gpu=-1) + + def test_model_group() -> None: # Test with valid minInstances and maxInstances model_group = ModelGroup(name="test", minInstances=1, maxInstances=2) diff --git a/tests/model_group/test_service.py b/tests/model_group/test_service.py new file mode 100644 index 0000000..bac7d3b --- /dev/null +++ b/tests/model_group/test_service.py @@ -0,0 +1,98 @@ +from kubernetes.client import V1Pod + +from paka.config import ( + AwsGpuNode, + CloudConfig, + CloudModelGroup, + ClusterConfig, + Config, + ResourceRequest, +) +from paka.kube_resources.model_group.service import create_pod + + +def test_create_pod() -> None: + model_group = CloudModelGroup( + nodeType="c7a.xlarge", + minInstances=1, + maxInstances=1, + name="llama2-7b", + resourceRequest=ResourceRequest(cpu="100m", memory="256Mi", gpu=2), + awsGpu=AwsGpuNode(diskSize=100), + ) + + config = Config( + aws=CloudConfig( + cluster=ClusterConfig( + name="test_cluster", + region="us-west-2", + nodeType="t2.medium", + minNodes=2, + maxNodes=4, + ), + modelGroups=[model_group], + ) + ) + + pod = create_pod("test_namespace", config, model_group, "runtime_image", 8080) + + assert isinstance(pod, V1Pod) + + assert pod.metadata.name == "llama2-7b" + assert pod.metadata.namespace == "test_namespace" + assert len(pod.spec.containers) == 1 + container = pod.spec.containers[0] + assert container.name == "llama2-7b" + assert container.image == 
"runtime_image" + assert container.resources.requests["cpu"] == "100m" + assert container.resources.requests["memory"] == "256Mi" + assert container.resources.limits["nvidia.com/gpu"] == 2 + assert len(container.volume_mounts) == 1 + assert container.volume_mounts[0].name == "model-data" + assert container.volume_mounts[0].mount_path == "/data" + assert len(container.env) == 3 + assert container.env[0].name == "N_GPU_LAYERS" + assert container.env[0].value == "-1" # Offload all layers to GPU + assert container.env[1].name == "MODEL" + assert container.env[1].value == "/data/my_model.gguf" + assert container.env[2].name == "PORT" + assert container.env[2].value == "8080" + + model_group = CloudModelGroup( + nodeType="c7a.xlarge", + minInstances=1, + maxInstances=1, + name="llama2-7b", + ) + + config = Config( + aws=CloudConfig( + cluster=ClusterConfig( + name="test_cluster", + region="us-west-2", + nodeType="t2.medium", + minNodes=2, + maxNodes=4, + ), + modelGroups=[model_group], + ) + ) + + pod = create_pod("test_namespace", config, model_group, "runtime_image", 8080) + + assert isinstance(pod, V1Pod) + + assert len(pod.spec.containers) == 1 + container = pod.spec.containers[0] + assert container.name == "llama2-7b" + assert container.image == "runtime_image" + assert len(container.volume_mounts) == 1 + assert container.volume_mounts[0].name == "model-data" + assert container.volume_mounts[0].mount_path == "/data" + assert len(container.env) == 3 + assert container.env[0].name == "N_GPU_LAYERS" + assert container.env[0].value == "0" # Offload no layers to GPU + assert container.env[1].name == "MODEL" + assert container.env[1].value == "/data/my_model.gguf" + assert container.env[2].name == "PORT" + assert container.env[2].value == "8080"