Add GPU (CUDA) support #15

Merged · 7 commits · Apr 10, 2024
17 changes: 11 additions & 6 deletions paka/cluster/aws/eks.py
@@ -14,6 +14,7 @@
from paka.cluster.keda import create_keda
from paka.cluster.knative import create_knative_and_istio
from paka.cluster.namespace import create_namespace
from paka.cluster.nvidia_device_plugin import install_nvidia_device_plugin
from paka.cluster.prometheus import create_prometheus
from paka.cluster.qdrant import create_qdrant
from paka.cluster.redis import create_redis
@@ -79,10 +80,6 @@ def create_node_group_for_model_group(
        node_group_name=f"{project}-{kubify_name(model_group.name)}-group",
        cluster=cluster,
        instance_types=[model_group.nodeType],
        # Set the desired size of the node group to the minimum number of instances
        # specified for the model group.
        # Note: Scaling down to 0 is not supported, since cold starting time is
        # too long for model group services.
        scaling_config=aws.eks.NodeGroupScalingConfigArgs(
            desired_size=model_group.minInstances,
            min_size=model_group.minInstances,
Expand All @@ -95,8 +92,6 @@ def create_node_group_for_model_group(
},
node_role_arn=worker_role.arn,
subnet_ids=vpc.private_subnet_ids,
# Apply taints to ensure that only pods belonging to the same model group
# can be scheduled on this node group.
taints=[
aws.eks.NodeGroupTaintArgs(
effect="NO_SCHEDULE", key="app", value="model-group"
@@ -105,6 +100,13 @@
                effect="NO_SCHEDULE", key="model", value=model_group.name
            ),
        ],
        # Supported AMI types: https://docs.aws.amazon.com/eks/latest/APIReference/API_Nodegroup.html#AmazonEKS-Type-Nodegroup-amiType
        ami_type=("AL2_x86_64_GPU" if model_group.awsGpu else None),
        disk_size=(
            model_group.awsGpu.diskSize
            if model_group.awsGpu
            else model_group.diskSize
        ),
    )


@@ -301,6 +303,9 @@ def create_eks_resources(kubeconfig_json: str) -> None:
    enable_cloudwatch(config, k8s_provider)
    create_prometheus(config, k8s_provider)
    create_zipkin(config, k8s_provider)
    # Install the NVIDIA device plugin for GPU support.
    # Even if the cluster doesn't have GPUs, this won't cause any issues.
    install_nvidia_device_plugin(k8s_provider)

    # TODO: Set timeout to be the one used by knative
    update_elb_idle_timeout(kubeconfig_json, 300)
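These taints repel ordinary pods from the model-group nodes; only pods that tolerate both keys can be scheduled there. As a minimal sketch (not part of this diff; the model name is illustrative), the matching tolerations with the Kubernetes Python client would look like:

    from kubernetes import client

    # Tolerations matching the NO_SCHEDULE taints applied to the node group above.
    tolerations = [
        client.V1Toleration(
            key="app", operator="Equal", value="model-group", effect="NoSchedule"
        ),
        client.V1Toleration(
            key="model", operator="Equal", value="llama2-7b", effect="NoSchedule"
        ),
    ]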
87 changes: 87 additions & 0 deletions paka/cluster/nvidia_device_plugin.py
@@ -0,0 +1,87 @@
import pulumi
import pulumi_kubernetes as k8s


def install_nvidia_device_plugin(
    k8s_provider: k8s.Provider, version: str = "v0.15.0-rc.2"
) -> None:
    """
    Installs the NVIDIA device plugin for GPU support in the cluster.

    This function deploys the NVIDIA device plugin to the cluster as a DaemonSet.
    The device plugin allows Kubernetes to discover and manage GPU resources on the nodes.

    Args:
        k8s_provider (k8s.Provider): The Kubernetes provider to use for deploying the device plugin.
        version (str): The device plugin image tag to deploy. Defaults to "v0.15.0-rc.2".

    Returns:
        None
    """

    k8s.apps.v1.DaemonSet(
        "nvidia-device-plugin-daemonset",
        metadata=k8s.meta.v1.ObjectMetaArgs(
            namespace="kube-system",
        ),
        spec=k8s.apps.v1.DaemonSetSpecArgs(
            selector=k8s.meta.v1.LabelSelectorArgs(
                match_labels={
                    "name": "nvidia-device-plugin-ds",
                },
            ),
            update_strategy=k8s.apps.v1.DaemonSetUpdateStrategyArgs(
                type="RollingUpdate",
            ),
            template=k8s.core.v1.PodTemplateSpecArgs(
                metadata=k8s.meta.v1.ObjectMetaArgs(
                    labels={
                        "name": "nvidia-device-plugin-ds",
                    },
                ),
                spec=k8s.core.v1.PodSpecArgs(
                    tolerations=[
                        k8s.core.v1.TolerationArgs(
                            key="nvidia.com/gpu",
                            operator="Exists",
                            effect="NoSchedule",
                        ),
                        # Tolerate all taints so the plugin can run on every node.
                        k8s.core.v1.TolerationArgs(operator="Exists"),
                    ],
                    priority_class_name="system-node-critical",
                    containers=[
                        k8s.core.v1.ContainerArgs(
                            image=f"nvcr.io/nvidia/k8s-device-plugin:{version}",
                            name="nvidia-device-plugin-ctr",
                            env=[
                                k8s.core.v1.EnvVarArgs(
                                    name="FAIL_ON_INIT_ERROR",
                                    value="false",
                                )
                            ],
                            security_context=k8s.core.v1.SecurityContextArgs(
                                allow_privilege_escalation=False,
                                capabilities=k8s.core.v1.CapabilitiesArgs(
                                    drop=["ALL"],
                                ),
                            ),
                            volume_mounts=[
                                k8s.core.v1.VolumeMountArgs(
                                    name="device-plugin",
                                    mount_path="/var/lib/kubelet/device-plugins",
                                )
                            ],
                        )
                    ],
                    volumes=[
                        k8s.core.v1.VolumeArgs(
                            name="device-plugin",
                            host_path=k8s.core.v1.HostPathVolumeSourceArgs(
                                path="/var/lib/kubelet/device-plugins",
                            ),
                        )
                    ],
                ),
            ),
        ),
        opts=pulumi.ResourceOptions(provider=k8s_provider),
    )
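For reference, a minimal usage sketch of the new helper (the kubeconfig value is a placeholder and this call site is not part of the diff):

    import pulumi_kubernetes as k8s

    from paka.cluster.nvidia_device_plugin import install_nvidia_device_plugin

    kubeconfig_json = "..."  # placeholder: the cluster's kubeconfig as JSON
    provider = k8s.Provider("k8s-provider", kubeconfig=kubeconfig_json)
    install_nvidia_device_plugin(provider)  # or pin a tag: version="v0.15.0-rc.2"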
67 changes: 65 additions & 2 deletions paka/config.py
@@ -1,5 +1,5 @@
import re
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Union

from pydantic import BaseModel, field_validator, model_validator
from ruamel.yaml import YAML
@@ -33,10 +33,12 @@ class ResourceRequest(BaseModel):
    Attributes:
        cpu (str): The amount of CPU to request.
        memory (str): The amount of memory to request.
        gpu (Optional[int]): The number of GPUs to request. Defaults to None.
    """

    cpu: str
    memory: str
    gpu: Optional[int] = None

    @field_validator("cpu", mode="before")
    def validate_cpu(cls, v: str) -> str:
@@ -72,17 +74,78 @@ def validate_memory(cls, v: str) -> str:
        """
        return validate_size(v, "Invalid memory format")

    @field_validator("gpu")
    def validate_gpu(cls, v: Optional[int]) -> Optional[int]:
        """
        Validates the value of the gpu field.

        Args:
            v (Optional[int]): The value of the gpu field.

        Returns:
            Optional[int]: The input value if validation is successful.

        Raises:
            ValueError: If the value is less than 0.
        """
        if v is not None and v < 0:
            raise ValueError("GPU count cannot be less than 0")
        return v


class AwsGpuNode(BaseModel):
    """
    Represents a configuration for an AWS GPU node.

    Attributes:
        diskSize (int): The size of the disk for the GPU node in GB.
    """

    diskSize: int


class GcpGpuNode(BaseModel):
    """
    Represents a Google Cloud Platform GPU node.

    Attributes:
        imageType (str): The type of image used for the GPU node.
        acceleratorType (str): The type of accelerator used for the GPU node.
        acceleratorCount (int): The number of accelerators attached to the GPU node.
        diskType (str): The type of disk used for the GPU node.
        diskSize (int): The size of the disk attached to the GPU node in GB.
    """

    imageType: str
    acceleratorType: str
    acceleratorCount: int
    diskType: str
    diskSize: int


class CloudNode(BaseModel):
    """
    Represents a node in the cloud cluster.

    Attributes:
        nodeType (str): The type of the node.
        diskSize (int): The size of the disk attached to the node in GB.
        awsGpu (Optional[AwsGpuNode]): The AWS GPU node configuration, if applicable.
        gcpGpu (Optional[GcpGpuNode]): The GCP GPU node configuration, if applicable.
    """

    nodeType: str
    diskSize: int = 20
    awsGpu: Optional[AwsGpuNode] = None
    gcpGpu: Optional[GcpGpuNode] = None

    @model_validator(mode="before")
    def validate_gpu(
        cls, values: Dict[str, Union[AwsGpuNode, GcpGpuNode]]
    ) -> Dict[str, Union[AwsGpuNode, GcpGpuNode]]:
        if values.get("awsGpu") and values.get("gcpGpu"):
            raise ValueError("At most one of awsGpu or gcpGpu can exist")
        return values


class ModelGroup(BaseModel):
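The new validator makes awsGpu and gcpGpu mutually exclusive on a node. A short sketch of the resulting behavior (instance type and disk size are illustrative, assuming these classes are imported from paka.config):

    from paka.config import AwsGpuNode, CloudNode

    # Valid: exactly one cloud-specific GPU block.
    node = CloudNode(nodeType="g4dn.xlarge", awsGpu=AwsGpuNode(diskSize=100))

    # Invalid: supplying both awsGpu and gcpGpu fails validation with
    # "At most one of awsGpu or gcpGpu can exist".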
29 changes: 25 additions & 4 deletions paka/kube_resources/model_group/service.py
@@ -8,9 +8,12 @@
from paka.kube_resources.model_group.model import MODEL_PATH_PREFIX, download_model
from paka.utils import kubify_name, read_cluster_data

# We hardcode the image here for now
# `latest` will be stale because of the `IfNotPresent` policy
# We hardcode the image here for now, we can make it configurable later
LLAMA_CPP_PYTHON_IMAGE = "ghcr.io/abetlen/llama-cpp-python:latest"

LLAMA_CPP_PYTHON_CUDA = "jijunleng/llama-cpp-python-cuda:latest"

try_load_kubeconfig()


@@ -115,8 +118,9 @@ def create_pod(
        ],
        "env": [
            client.V1EnvVar(
                name="USE_MLOCK",  # Model weights are locked in RAM or not
                value="0",
                name="N_GPU_LAYERS",
                # -1 means all layers are GPU layers, 0 means no GPU layers
                value=("-1" if model_group.awsGpu else "0"),
            ),
            client.V1EnvVar(
                name="MODEL",
@@ -159,6 +163,17 @@ def create_pod(
        },
    )

    if model_group.awsGpu:
        if "resources" not in container_args:
            container_args["resources"] = client.V1ResourceRequirements()
        if container_args["resources"].limits is None:
            container_args["resources"].limits = {}
        gpu_count = 1
        if model_group.resourceRequest and model_group.resourceRequest.gpu:
            gpu_count = model_group.resourceRequest.gpu
        # We only support NVIDIA GPUs for now.
        container_args["resources"].limits["nvidia.com/gpu"] = gpu_count

    return client.V1Pod(
        metadata=client.V1ObjectMeta(
            name=f"{kubify_name(model_group.name)}",
@@ -488,7 +503,13 @@ def create_model_group_service(

    port = 8000

    pod = create_pod(namespace, config, model_group, LLAMA_CPP_PYTHON_IMAGE, port)
    pod = create_pod(
        namespace,
        config,
        model_group,
        (LLAMA_CPP_PYTHON_CUDA if model_group.awsGpu else LLAMA_CPP_PYTHON_IMAGE),
        port,
    )

    deployment = create_deployment(namespace, model_group, pod)
    apply_resource(deployment)
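Condensed, the GPU wiring above comes down to three decisions: which image to run, how many layers llama.cpp offloads, and the GPU limit on the container. A self-contained sketch of that selection logic (a simplification for illustration, not code from this diff):

    from typing import Optional, Tuple

    LLAMA_CPP_PYTHON_IMAGE = "ghcr.io/abetlen/llama-cpp-python:latest"
    LLAMA_CPP_PYTHON_CUDA = "jijunleng/llama-cpp-python-cuda:latest"

    def select_runtime(
        has_aws_gpu: bool, requested_gpus: Optional[int]
    ) -> Tuple[str, str, int]:
        """Return (image, N_GPU_LAYERS value, nvidia.com/gpu limit)."""
        image = LLAMA_CPP_PYTHON_CUDA if has_aws_gpu else LLAMA_CPP_PYTHON_IMAGE
        n_gpu_layers = "-1" if has_aws_gpu else "0"  # -1 offloads all layers
        gpu_count = requested_gpus if requested_gpus else 1  # default to one GPU
        return image, n_gpu_layers, gpu_count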
@@ -9,6 +9,7 @@ aws:
  logRetentionDays: 14
  modelGroups:
    - nodeType: t2.micro
      diskSize: 20
      name: test-model-group
      minInstances: 1
      maxInstances: 2
32 changes: 32 additions & 0 deletions tests/config/test_config.py
@@ -52,6 +52,11 @@ def test_invalid_memory_resource_request() -> None:
        ResourceRequest(cpu="500m", memory="2G")


def test_invalid_gpu_resource_request() -> None:
    with pytest.raises(ValueError, match="GPU count cannot be less than 0"):
        ResourceRequest(cpu="500m", memory="2Gi", gpu=-1)


def test_model_group() -> None:
    # Test with valid minInstances and maxInstances
    model_group = ModelGroup(name="test", minInstances=1, maxInstances=2)
@@ -190,6 +195,7 @@ def test_parse_yaml() -> None:
      minInstances: 1
      maxInstances: 1
      name: llama2-7b
      awsGpu:
  vectorStore:
    nodeType: t2.small
    replicas: 2
@@ -211,10 +217,36 @@
    assert model_group.minInstances == 1
    assert model_group.maxInstances == 1
    assert model_group.name == "llama2-7b"
    assert model_group.awsGpu is None
    assert config.aws.vectorStore is not None
    assert config.aws.vectorStore.nodeType == "t2.small"
    assert config.aws.vectorStore.replicas == 2

    yaml_str = """
aws:
  cluster:
    name: test_cluster
    region: us-west-2
    nodeType: t2.medium
    minNodes: 2
    maxNodes: 4
  modelGroups:
    - nodeType: c7a.xlarge
      minInstances: 1
      maxInstances: 1
      name: llama2-7b
      awsGpu:
        diskSize: 100
"""
    config = parse_yaml(yaml_str)
    assert isinstance(config, Config)
    assert config.aws is not None
    assert config.aws.modelGroups is not None
    assert len(config.aws.modelGroups) == 1
    model_group = config.aws.modelGroups[0]
    assert model_group.awsGpu is not None
    assert model_group.awsGpu.diskSize == 100


def test_round_trip() -> None:
    original_config = Config(aws=cloud_config)