From 95aa895b021820d4e846680a1505882e08a49d46 Mon Sep 17 00:00:00 2001 From: Jijun Leng Date: Tue, 26 Mar 2024 14:49:42 -0700 Subject: [PATCH 1/7] [wip] gpu support --- paka/cluster/aws/eks.py | 10 ++++++++- paka/config.py | 45 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/paka/cluster/aws/eks.py b/paka/cluster/aws/eks.py index f65a268..9344e92 100644 --- a/paka/cluster/aws/eks.py +++ b/paka/cluster/aws/eks.py @@ -1,10 +1,11 @@ -from typing import Optional +from typing import Dict, Optional, Union import pulumi import pulumi_aws as aws import pulumi_awsx as awsx import pulumi_eks as eks import pulumi_kubernetes as k8s +from pulumi import ResourceOptions from paka.cluster.aws.cloudwatch import enable_cloudwatch from paka.cluster.aws.cluster_autoscaler import create_cluster_autoscaler @@ -73,6 +74,12 @@ def create_node_group_for_model_group( project = config.cluster.name for model_group in config.modelGroups: + additional_args: Dict[ + str, Union[str, int, float, bool, ResourceOptions, None] + ] = {} + if model_group.awsGpu is not None: + additional_args["ami_type"] = model_group.awsGpu.amiId + # Create a managed node group for our cluster eks.ManagedNodeGroup( f"{project}-{kubify_name(model_group.name)}-group", @@ -105,6 +112,7 @@ def create_node_group_for_model_group( effect="NO_SCHEDULE", key="model", value=model_group.name ), ], + **additional_args, ) diff --git a/paka/config.py b/paka/config.py index 45c61ef..a9af819 100644 --- a/paka/config.py +++ b/paka/config.py @@ -1,5 +1,5 @@ import re -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union from pydantic import BaseModel, field_validator, model_validator from ruamel.yaml import YAML @@ -73,16 +73,57 @@ def validate_memory(cls, v: str) -> str: return validate_size(v, "Invalid memory format") +class AwsGpuNode(BaseModel): + """ + Represents an AWS GPU node. + + Attributes: + amiId (str): The ID of the Amazon Machine Image (AMI) for the GPU node. + """ + + amiId: str + + +class GcpGpuNode(BaseModel): + """ + Represents a Google Cloud Platform GPU node. + + Attributes: + imageType (str): The type of image used for the GPU node. + acceleratorType (str): The type of accelerator used for the GPU node. + acceleratorCount (int): The number of accelerators attached to the GPU node. + diskType (str): The type of disk used for the GPU node. + diskSize (int): The size of the disk attached to the GPU node. + """ + + imageType: str + acceleratorType: str + acceleratorCount: int + diskType: str + diskSize: int + + class CloudNode(BaseModel): """ Represents a node in the cloud cluster. Attributes: nodeType (str): The type of the node. - + awsGpu (Optional[AwsGpuNode]): The AWS GPU node configuration. + gcpGpu (Optional[GcpGpuNode]): The GCP GPU node configuration. 
""" nodeType: str + awsGpu: Optional[AwsGpuNode] = None + gcpGpu: Optional[GcpGpuNode] = None + + @model_validator(mode="before") + def validate_gpu( + cls, values: Dict[str, Union[AwsGpuNode, GcpGpuNode]] + ) -> Dict[str, Union[AwsGpuNode, GcpGpuNode]]: + if values.get("awsGpu") and values.get("gcpGpu"): + raise ValueError("At most one of awsGpu or gcpGpu can exist") + return values class ModelGroup(BaseModel): From b60a057e9f52231142b42255f4d2a89d873d5d9e Mon Sep 17 00:00:00 2001 From: Jijun Leng Date: Fri, 5 Apr 2024 16:48:15 -0700 Subject: [PATCH 2/7] feat(gpu): run models on cuda GPUs --- paka/cluster/aws/eks.py | 26 +++++++++------------ paka/cluster/nvidia_device_plugin.py | 25 ++++++++++++++++++++ paka/config.py | 6 ++--- paka/kube_resources/model_group/service.py | 27 +++++++++++++++++++--- 4 files changed, 63 insertions(+), 21 deletions(-) create mode 100644 paka/cluster/nvidia_device_plugin.py diff --git a/paka/cluster/aws/eks.py b/paka/cluster/aws/eks.py index 9344e92..4bb87bf 100644 --- a/paka/cluster/aws/eks.py +++ b/paka/cluster/aws/eks.py @@ -1,11 +1,10 @@ -from typing import Dict, Optional, Union +from typing import Optional import pulumi import pulumi_aws as aws import pulumi_awsx as awsx import pulumi_eks as eks import pulumi_kubernetes as k8s -from pulumi import ResourceOptions from paka.cluster.aws.cloudwatch import enable_cloudwatch from paka.cluster.aws.cluster_autoscaler import create_cluster_autoscaler @@ -15,6 +14,7 @@ from paka.cluster.keda import create_keda from paka.cluster.knative import create_knative_and_istio from paka.cluster.namespace import create_namespace +from paka.cluster.nvidia_device_plugin import install_nvidia_device_plugin from paka.cluster.prometheus import create_prometheus from paka.cluster.qdrant import create_qdrant from paka.cluster.redis import create_redis @@ -74,22 +74,12 @@ def create_node_group_for_model_group( project = config.cluster.name for model_group in config.modelGroups: - additional_args: Dict[ - str, Union[str, int, float, bool, ResourceOptions, None] - ] = {} - if model_group.awsGpu is not None: - additional_args["ami_type"] = model_group.awsGpu.amiId - # Create a managed node group for our cluster eks.ManagedNodeGroup( f"{project}-{kubify_name(model_group.name)}-group", node_group_name=f"{project}-{kubify_name(model_group.name)}-group", cluster=cluster, instance_types=[model_group.nodeType], - # Set the desired size of the node group to the minimum number of instances - # specified for the model group. - # Note: Scaling down to 0 is not supported, since cold starting time is - # too long for model group services. scaling_config=aws.eks.NodeGroupScalingConfigArgs( desired_size=model_group.minInstances, min_size=model_group.minInstances, @@ -102,8 +92,6 @@ def create_node_group_for_model_group( }, node_role_arn=worker_role.arn, subnet_ids=vpc.private_subnet_ids, - # Apply taints to ensure that only pods belonging to the same model group - # can be scheduled on this node group. 
taints=[ aws.eks.NodeGroupTaintArgs( effect="NO_SCHEDULE", key="app", value="model-group" @@ -112,7 +100,12 @@ def create_node_group_for_model_group( effect="NO_SCHEDULE", key="model", value=model_group.name ), ], - **additional_args, + # Supported AMI types https://docs.aws.amazon.com/eks/latest/APIReference/API_Nodegroup.html#AmazonEKS-Type-Nodegroup-amiType + ami_type=( + "AL2_x86_64_GPU" + if model_group.awsGpu and model_group.awsGpu.enabled + else None + ), ) @@ -309,6 +302,9 @@ def create_eks_resources(kubeconfig_json: str) -> None: enable_cloudwatch(config, k8s_provider) create_prometheus(config, k8s_provider) create_zipkin(config, k8s_provider) + # Install the NVIDIA device plugin for GPU support + # Even if the cluster doesn't have GPUs, this won't cause any issues + install_nvidia_device_plugin(k8s_provider) # TODO: Set timeout to be the one used by knative update_elb_idle_timeout(kubeconfig_json, 300) diff --git a/paka/cluster/nvidia_device_plugin.py b/paka/cluster/nvidia_device_plugin.py new file mode 100644 index 0000000..2b93f3f --- /dev/null +++ b/paka/cluster/nvidia_device_plugin.py @@ -0,0 +1,25 @@ +import pulumi +import pulumi_kubernetes as k8s + + +def install_nvidia_device_plugin( + k8s_provider: k8s.Provider, version: str = "main" +) -> None: + """ + Installs the NVIDIA device plugin for GPU support in the cluster. + + This function deploys the NVIDIA device plugin to the cluster using a DaemonSet. + The device plugin allows Kubernetes to discover and manage GPU resources on the nodes. + + Args: + k8s_provider (k8s.Provider): The Kubernetes provider to use for deploying the device plugin. + + Returns: + None + """ + # This will install a DaemonSet in the kube-system namespace + k8s.yaml.ConfigFile( + "nvidia-device-plugin", + file=f"https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/{version}/nvidia-device-plugin.yml", + opts=pulumi.ResourceOptions(provider=k8s_provider), + ) diff --git a/paka/config.py b/paka/config.py index a9af819..c386cf2 100644 --- a/paka/config.py +++ b/paka/config.py @@ -75,13 +75,13 @@ def validate_memory(cls, v: str) -> str: class AwsGpuNode(BaseModel): """ - Represents an AWS GPU node. + Represents a configuration for an AWS GPU node. Attributes: - amiId (str): The ID of the Amazon Machine Image (AMI) for the GPU node. + enabled (bool): Indicates whether the GPU node is enabled or not. 
""" - amiId: str + enabled: bool = False class GcpGpuNode(BaseModel): diff --git a/paka/kube_resources/model_group/service.py b/paka/kube_resources/model_group/service.py index 804ff95..b64c0a9 100644 --- a/paka/kube_resources/model_group/service.py +++ b/paka/kube_resources/model_group/service.py @@ -8,9 +8,12 @@ from paka.kube_resources.model_group.model import MODEL_PATH_PREFIX, download_model from paka.utils import kubify_name, read_cluster_data -# We hardcode the image here for now +# `latest` will be stale because of the `IfNotPresent` policy +# We hardcode the image here for now, we can make it configurable later LLAMA_CPP_PYTHON_IMAGE = "ghcr.io/abetlen/llama-cpp-python:latest" +LLAMA_CPP_PYTHON_CUDA = "jijunleng/llama-cpp-python-cuda:latest" + try_load_kubeconfig() @@ -116,7 +119,7 @@ def create_pod( "env": [ client.V1EnvVar( name="USE_MLOCK", # Model weights are locked in RAM or not - value="0", + value="1", ), client.V1EnvVar( name="MODEL", @@ -159,6 +162,14 @@ def create_pod( }, ) + if model_group.awsGpu and model_group.awsGpu.enabled: + if "resources" not in container_args: + container_args["resources"] = client.V1ResourceRequirements( + requests={}, + ) + # Ah, we only support nvidia GPUs for now + container_args["resources"].requests["nvidia.com/gpu"] = 1 + return client.V1Pod( metadata=client.V1ObjectMeta( name=f"{kubify_name(model_group.name)}", @@ -488,7 +499,17 @@ def create_model_group_service( port = 8000 - pod = create_pod(namespace, config, model_group, LLAMA_CPP_PYTHON_IMAGE, port) + pod = create_pod( + namespace, + config, + model_group, + ( + LLAMA_CPP_PYTHON_CUDA + if model_group.awsGpu and model_group.awsGpu.enabled + else LLAMA_CPP_PYTHON_IMAGE + ), + port, + ) deployment = create_deployment(namespace, model_group, pod) apply_resource(deployment) From 195ed1e276827f9af26ee11839cb83bb7f4bf728 Mon Sep 17 00:00:00 2001 From: Jijun Leng Date: Fri, 5 Apr 2024 22:28:58 -0700 Subject: [PATCH 3/7] feat(gpu): make nvidia device plugin tolerate model group taints --- paka/cluster/nvidia_device_plugin.py | 72 ++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 5 deletions(-) diff --git a/paka/cluster/nvidia_device_plugin.py b/paka/cluster/nvidia_device_plugin.py index 2b93f3f..ce0c919 100644 --- a/paka/cluster/nvidia_device_plugin.py +++ b/paka/cluster/nvidia_device_plugin.py @@ -3,7 +3,7 @@ def install_nvidia_device_plugin( - k8s_provider: k8s.Provider, version: str = "main" + k8s_provider: k8s.Provider, version: str = "v0.15.0-rc.2" ) -> None: """ Installs the NVIDIA device plugin for GPU support in the cluster. 
@@ -17,9 +17,71 @@ def install_nvidia_device_plugin( Returns: None """ - # This will install a DaemonSet in the kube-system namespace - k8s.yaml.ConfigFile( - "nvidia-device-plugin", - file=f"https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/{version}/nvidia-device-plugin.yml", + + k8s.apps.v1.DaemonSet( + "nvidia-device-plugin-daemonset", + metadata=k8s.meta.v1.ObjectMetaArgs( + namespace="kube-system", + ), + spec=k8s.apps.v1.DaemonSetSpecArgs( + selector=k8s.meta.v1.LabelSelectorArgs( + match_labels={ + "name": "nvidia-device-plugin-ds", + }, + ), + update_strategy=k8s.apps.v1.DaemonSetUpdateStrategyArgs( + type="RollingUpdate", + ), + template=k8s.core.v1.PodTemplateSpecArgs( + metadata=k8s.meta.v1.ObjectMetaArgs( + labels={ + "name": "nvidia-device-plugin-ds", + }, + ), + spec=k8s.core.v1.PodSpecArgs( + tolerations=[ + k8s.core.v1.TolerationArgs( + key="nvidia.com/gpu", + operator="Exists", + effect="NoSchedule", + ), + k8s.core.v1.TolerationArgs(operator="Exists"), + ], + priority_class_name="system-node-critical", + containers=[ + k8s.core.v1.ContainerArgs( + image=f"nvcr.io/nvidia/k8s-device-plugin:{version}", + name="nvidia-device-plugin-ctr", + env=[ + k8s.core.v1.EnvVarArgs( + name="FAIL_ON_INIT_ERROR", + value="false", + ) + ], + security_context=k8s.core.v1.SecurityContextArgs( + allow_privilege_escalation=False, + capabilities=k8s.core.v1.CapabilitiesArgs( + drop=["ALL"], + ), + ), + volume_mounts=[ + k8s.core.v1.VolumeMountArgs( + name="device-plugin", + mount_path="/var/lib/kubelet/device-plugins", + ) + ], + ) + ], + volumes=[ + k8s.core.v1.VolumeArgs( + name="device-plugin", + host_path=k8s.core.v1.HostPathVolumeSourceArgs( + path="/var/lib/kubelet/device-plugins", + ), + ) + ], + ), + ), + ), opts=pulumi.ResourceOptions(provider=k8s_provider), ) From 81c034ee3e30155f6d02aa09a7d01ff1da1fdcc9 Mon Sep 17 00:00:00 2001 From: Jijun Leng Date: Tue, 9 Apr 2024 15:59:49 -0700 Subject: [PATCH 4/7] feat(gpu): set n_gpu_layers to offload work to gpu for the llama.cpp runtime --- paka/kube_resources/model_group/service.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/paka/kube_resources/model_group/service.py b/paka/kube_resources/model_group/service.py index b64c0a9..6965116 100644 --- a/paka/kube_resources/model_group/service.py +++ b/paka/kube_resources/model_group/service.py @@ -118,8 +118,11 @@ def create_pod( ], "env": [ client.V1EnvVar( - name="USE_MLOCK", # Model weights are locked in RAM or not - value="1", + name="N_GPU_LAYERS", + # -1 means all layers are GPU layers, 0 means no GPU layers + value=( + "-1" if model_group.awsGpu and model_group.awsGpu.enabled else "0" + ), ), client.V1EnvVar( name="MODEL", @@ -164,11 +167,11 @@ def create_pod( if model_group.awsGpu and model_group.awsGpu.enabled: if "resources" not in container_args: - container_args["resources"] = client.V1ResourceRequirements( - requests={}, - ) + container_args["resources"] = client.V1ResourceRequirements() + if container_args["resources"].limits is None: + container_args["resources"].limits = {} # Ah, we only support nvidia GPUs for now - container_args["resources"].requests["nvidia.com/gpu"] = 1 + container_args["resources"].limits["nvidia.com/gpu"] = 1 return client.V1Pod( metadata=client.V1ObjectMeta( From b1b39f7e9a449e44ad4640d39e65963d43853d0f Mon Sep 17 00:00:00 2001 From: Jijun Leng Date: Tue, 9 Apr 2024 16:03:30 -0700 Subject: [PATCH 5/7] feat(gpu): larger disk for gpu nodes --- paka/cluster/aws/eks.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/paka/cluster/aws/eks.py b/paka/cluster/aws/eks.py
index 4bb87bf..16af57e 100644
--- a/paka/cluster/aws/eks.py
+++ b/paka/cluster/aws/eks.py
@@ -106,6 +106,7 @@ def create_node_group_for_model_group(
             if model_group.awsGpu and model_group.awsGpu.enabled
             else None
         ),
+        disk_size=40 if model_group.awsGpu and model_group.awsGpu.enabled else 20,
     )
 
 
From 864397c86357d9fe96310867f6f2b0727b8d5dfe Mon Sep 17 00:00:00 2001
From: Jijun Leng
Date: Tue, 9 Apr 2024 17:24:33 -0700
Subject: [PATCH 6/7] feat(gpu): make model group node disk size configurable

---
 paka/cluster/aws/eks.py                    | 10 +++----
 paka/config.py                             | 12 +++++----
 paka/kube_resources/model_group/service.py | 12 +++------
 .../test_config/test_aws_yaml/aws_yaml.txt  |  1 +
 tests/config/test_config.py                | 27 +++++++++++++++++++
 5 files changed, 43 insertions(+), 19 deletions(-)

diff --git a/paka/cluster/aws/eks.py b/paka/cluster/aws/eks.py
index 16af57e..c10eef8 100644
--- a/paka/cluster/aws/eks.py
+++ b/paka/cluster/aws/eks.py
@@ -101,12 +101,12 @@ def create_node_group_for_model_group(
             ),
         ],
         # Supported AMI types https://docs.aws.amazon.com/eks/latest/APIReference/API_Nodegroup.html#AmazonEKS-Type-Nodegroup-amiType
-        ami_type=(
-            "AL2_x86_64_GPU"
-            if model_group.awsGpu and model_group.awsGpu.enabled
-            else None
+        ami_type=("AL2_x86_64_GPU" if model_group.awsGpu else None),
+        disk_size=(
+            model_group.awsGpu.diskSize
+            if model_group.awsGpu
+            else model_group.diskSize
         ),
-        disk_size=40 if model_group.awsGpu and model_group.awsGpu.enabled else 20,
     )
 
 
diff --git a/paka/config.py b/paka/config.py
index c386cf2..e8b8d85 100644
--- a/paka/config.py
+++ b/paka/config.py
@@ -78,10 +78,10 @@ class AwsGpuNode(BaseModel):
     Represents a configuration for an AWS GPU node.
 
     Attributes:
-        enabled (bool): Indicates whether the GPU node is enabled or not.
+        diskSize (int): The size of the disk for the GPU node in GB.
     """
 
-    enabled: bool = False
+    diskSize: int
 
 
 class GcpGpuNode(BaseModel):
@@ -93,7 +93,7 @@ class GcpGpuNode(BaseModel):
         acceleratorType (str): The type of accelerator used for the GPU node.
         acceleratorCount (int): The number of accelerators attached to the GPU node.
         diskType (str): The type of disk used for the GPU node.
-        diskSize (int): The size of the disk attached to the GPU node.
+        diskSize (int): The size of the disk attached to the GPU node in GB.
     """
 
     imageType: str
@@ -109,11 +109,13 @@ class CloudNode(BaseModel):
 
     Attributes:
         nodeType (str): The type of the node.
-        awsGpu (Optional[AwsGpuNode]): The AWS GPU node configuration.
-        gcpGpu (Optional[GcpGpuNode]): The GCP GPU node configuration.
+        diskSize (int): The size of the disk attached to the node in GB.
+        awsGpu (Optional[AwsGpuNode]): The AWS GPU node configuration, if applicable.
+        gcpGpu (Optional[GcpGpuNode]): The GCP GPU node configuration, if applicable.
""" nodeType: str + diskSize: int = 20 awsGpu: Optional[AwsGpuNode] = None gcpGpu: Optional[GcpGpuNode] = None diff --git a/paka/kube_resources/model_group/service.py b/paka/kube_resources/model_group/service.py index 6965116..73a0246 100644 --- a/paka/kube_resources/model_group/service.py +++ b/paka/kube_resources/model_group/service.py @@ -120,9 +120,7 @@ def create_pod( client.V1EnvVar( name="N_GPU_LAYERS", # -1 means all layers are GPU layers, 0 means no GPU layers - value=( - "-1" if model_group.awsGpu and model_group.awsGpu.enabled else "0" - ), + value=("-1" if model_group.awsGpu else "0"), ), client.V1EnvVar( name="MODEL", @@ -165,7 +163,7 @@ def create_pod( }, ) - if model_group.awsGpu and model_group.awsGpu.enabled: + if model_group.awsGpu: if "resources" not in container_args: container_args["resources"] = client.V1ResourceRequirements() if container_args["resources"].limits is None: @@ -506,11 +504,7 @@ def create_model_group_service( namespace, config, model_group, - ( - LLAMA_CPP_PYTHON_CUDA - if model_group.awsGpu and model_group.awsGpu.enabled - else LLAMA_CPP_PYTHON_IMAGE - ), + (LLAMA_CPP_PYTHON_CUDA if model_group.awsGpu else LLAMA_CPP_PYTHON_IMAGE), port, ) diff --git a/tests/config/snapshots/test_config/test_aws_yaml/aws_yaml.txt b/tests/config/snapshots/test_config/test_aws_yaml/aws_yaml.txt index 917e48e..8a0bfa2 100644 --- a/tests/config/snapshots/test_config/test_aws_yaml/aws_yaml.txt +++ b/tests/config/snapshots/test_config/test_aws_yaml/aws_yaml.txt @@ -9,6 +9,7 @@ aws: logRetentionDays: 14 modelGroups: - nodeType: t2.micro + diskSize: 20 name: test-model-group minInstances: 1 maxInstances: 2 diff --git a/tests/config/test_config.py b/tests/config/test_config.py index 401675d..ddcfa9d 100644 --- a/tests/config/test_config.py +++ b/tests/config/test_config.py @@ -190,6 +190,7 @@ def test_parse_yaml() -> None: minInstances: 1 maxInstances: 1 name: llama2-7b + awsGpu: vectorStore: nodeType: t2.small replicas: 2 @@ -211,10 +212,36 @@ def test_parse_yaml() -> None: assert model_group.minInstances == 1 assert model_group.maxInstances == 1 assert model_group.name == "llama2-7b" + assert model_group.awsGpu is None assert config.aws.vectorStore is not None assert config.aws.vectorStore.nodeType == "t2.small" assert config.aws.vectorStore.replicas == 2 + yaml_str = """ + aws: + cluster: + name: test_cluster + region: us-west-2 + nodeType: t2.medium + minNodes: 2 + maxNodes: 4 + modelGroups: + - nodeType: c7a.xlarge + minInstances: 1 + maxInstances: 1 + name: llama2-7b + awsGpu: + diskSize: 100 + """ + config = parse_yaml(yaml_str) + assert isinstance(config, Config) + assert config.aws is not None + assert config.aws.modelGroups is not None + assert len(config.aws.modelGroups) == 1 + model_group = config.aws.modelGroups[0] + assert model_group.awsGpu is not None + assert model_group.awsGpu.diskSize == 100 + def test_round_trip() -> None: original_config = Config(aws=cloud_config) From 6a255ba2c637cd0f7ece08224a89599973c7c27a Mon Sep 17 00:00:00 2001 From: Jijun Leng Date: Tue, 9 Apr 2024 23:15:42 -0700 Subject: [PATCH 7/7] feat(gpu): be able to request a number of GPUs through config --- paka/config.py | 20 +++++ paka/kube_resources/model_group/service.py | 5 +- tests/config/test_config.py | 5 ++ tests/model_group/test_service.py | 98 ++++++++++++++++++++++ 4 files changed, 127 insertions(+), 1 deletion(-) create mode 100644 tests/model_group/test_service.py diff --git a/paka/config.py b/paka/config.py index e8b8d85..2808827 100644 --- a/paka/config.py +++ b/paka/config.py 
@@ -33,10 +33,12 @@ class ResourceRequest(BaseModel): Attributes: cpu (str): The amount of CPU to request. memory (str): The amount of memory to request. + gpu (Optional[int]): The number of GPUs to request. Defaults to None. """ cpu: str memory: str + gpu: Optional[int] = None @field_validator("cpu", mode="before") def validate_cpu(cls, v: str) -> str: @@ -72,6 +74,24 @@ def validate_memory(cls, v: str) -> str: """ return validate_size(v, "Invalid memory format") + @field_validator("gpu") + def validate_gpu(cls, v: Optional[int]) -> Optional[int]: + """ + Validates the value of the gpu field. + + Args: + v (Optional[int]): The value of the gpu field. + + Returns: + Optional[int]: The input value if validation is successful. + + Raises: + ValueError: If the value is less than 0. + """ + if v is not None and v < 0: + raise ValueError("GPU count cannot be less than 0") + return v + class AwsGpuNode(BaseModel): """ diff --git a/paka/kube_resources/model_group/service.py b/paka/kube_resources/model_group/service.py index 73a0246..2e2d92c 100644 --- a/paka/kube_resources/model_group/service.py +++ b/paka/kube_resources/model_group/service.py @@ -168,8 +168,11 @@ def create_pod( container_args["resources"] = client.V1ResourceRequirements() if container_args["resources"].limits is None: container_args["resources"].limits = {} + gpu_count = 1 + if model_group.resourceRequest and model_group.resourceRequest.gpu: + gpu_count = model_group.resourceRequest.gpu # Ah, we only support nvidia GPUs for now - container_args["resources"].limits["nvidia.com/gpu"] = 1 + container_args["resources"].limits["nvidia.com/gpu"] = gpu_count return client.V1Pod( metadata=client.V1ObjectMeta( diff --git a/tests/config/test_config.py b/tests/config/test_config.py index ddcfa9d..1275813 100644 --- a/tests/config/test_config.py +++ b/tests/config/test_config.py @@ -52,6 +52,11 @@ def test_invalid_memory_resource_request() -> None: ResourceRequest(cpu="500m", memory="2G") +def test_invalid_gpu_resource_request() -> None: + with pytest.raises(ValueError, match="GPU count cannot be less than 0"): + ResourceRequest(cpu="500m", memory="2Gi", gpu=-1) + + def test_model_group() -> None: # Test with valid minInstances and maxInstances model_group = ModelGroup(name="test", minInstances=1, maxInstances=2) diff --git a/tests/model_group/test_service.py b/tests/model_group/test_service.py new file mode 100644 index 0000000..bac7d3b --- /dev/null +++ b/tests/model_group/test_service.py @@ -0,0 +1,98 @@ +from kubernetes.client import V1Pod + +from paka.config import ( + AwsGpuNode, + CloudConfig, + CloudModelGroup, + ClusterConfig, + Config, + ResourceRequest, +) +from paka.kube_resources.model_group.service import create_pod + + +def test_create_pod() -> None: + model_group = CloudModelGroup( + nodeType="c7a.xlarge", + minInstances=1, + maxInstances=1, + name="llama2-7b", + resourceRequest=ResourceRequest(cpu="100m", memory="256Mi", gpu=2), + awsGpu=AwsGpuNode(diskSize=100), + ) + + config = Config( + aws=CloudConfig( + cluster=ClusterConfig( + name="test_cluster", + region="us-west-2", + nodeType="t2.medium", + minNodes=2, + maxNodes=4, + ), + modelGroups=[model_group], + ) + ) + + pod = create_pod("test_namespace", config, model_group, "runtime_image", 8080) + + assert isinstance(pod, V1Pod) + + assert pod.metadata.name == "llama2-7b" + assert pod.metadata.namespace == "test_namespace" + assert len(pod.spec.containers) == 1 + container = pod.spec.containers[0] + assert container.name == "llama2-7b" + assert container.image == 
"runtime_image" + assert container.resources.requests["cpu"] == "100m" + assert container.resources.requests["memory"] == "256Mi" + assert container.resources.limits["nvidia.com/gpu"] == 2 + assert len(container.volume_mounts) == 1 + assert container.volume_mounts[0].name == "model-data" + assert container.volume_mounts[0].mount_path == "/data" + assert len(container.env) == 3 + assert container.env[0].name == "N_GPU_LAYERS" + assert container.env[0].value == "-1" # Offload all layers to GPU + assert container.env[1].name == "MODEL" + assert container.env[1].value == "/data/my_model.gguf" + assert container.env[2].name == "PORT" + assert container.env[2].value == "8080" + + model_group = CloudModelGroup( + nodeType="c7a.xlarge", + minInstances=1, + maxInstances=1, + name="llama2-7b", + ) + + config = Config( + aws=CloudConfig( + cluster=ClusterConfig( + name="test_cluster", + region="us-west-2", + nodeType="t2.medium", + minNodes=2, + maxNodes=4, + ), + modelGroups=[model_group], + ) + ) + + pod = create_pod("test_namespace", config, model_group, "runtime_image", 8080) + + assert isinstance(pod, V1Pod) + + assert len(pod.spec.containers) == 1 + container = pod.spec.containers[0] + assert container.name == "llama2-7b" + assert container.image == "runtime_image" + assert len(container.volume_mounts) == 1 + assert container.volume_mounts[0].name == "model-data" + assert container.volume_mounts[0].mount_path == "/data" + assert len(container.env) == 3 + assert container.env[0].name == "N_GPU_LAYERS" + assert container.env[0].value == "0" # Offload no layers to GPU + assert container.env[1].name == "MODEL" + assert container.env[1].value == "/data/my_model.gguf" + assert container.env[2].name == "PORT" + assert container.env[2].value == "8080"