Skip to content

Commit

Permalink
fix: better backoff
Browse files Browse the repository at this point in the history
  • Loading branch information
AidanAbd committed Dec 14, 2024
1 parent e9e58d6 commit 5a7950d
Showing 1 changed file with 16 additions and 12 deletions.
28 changes: 16 additions & 12 deletions src/latch/resources/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -629,19 +629,23 @@ def nextflow_runtime_task(cpu: int, memory: int, storage_gib: int = 50):
return functools.partial(task, task_config=task_config)


def _get_l40s_pod(instance_type: str, cpu: int, memory: int, gpus: int) -> Pod:
def _get_l40s_pod(instance_type: str, cpu: int, memory_gib: int, gpus: int) -> Pod:
"""Helper function to create L40s GPU pod configurations."""
primary_container = V1Container(name="primary")

backoff_cpu = cpu - 2
backoff_memory = min(memory_gib - 4, int(memory_gib * 0.9))

resources = V1ResourceRequirements(
requests={
"cpu": str(cpu - 2), # Reserve 2 cores for system processes
"memory": f"{memory - 4}Gi", # Reserve 4GB for system processes
"cpu": str(backoff_cpu),
"memory": f"{backoff_memory}Gi",
"nvidia.com/gpu": str(gpus),
"ephemeral-storage": "4500Gi",
},
limits={
"cpu": str(cpu),
"memory": f"{memory}Gi",
"memory": f"{memory_gib}Gi",
"nvidia.com/gpu": str(gpus),
"ephemeral-storage": "5000Gi",
},
Expand All @@ -668,49 +672,49 @@ def _get_l40s_pod(instance_type: str, cpu: int, memory: int, gpus: int) -> Pod:

# Task decorator pinned to the g6e.xlarge L40s instance type.
g6e_xlarge_task = functools.partial(
    task,
    task_config=_get_l40s_pod("g6e-xlarge", cpu=4, memory_gib=32, gpus=1)
)
"""4 vCPUs, 32 GiB RAM, 1 L40s GPU"""

# Task decorator pinned to the g6e.2xlarge L40s instance type.
g6e_2xlarge_task = functools.partial(
    task,
    task_config=_get_l40s_pod("g6e-2xlarge", cpu=8, memory_gib=64, gpus=1)
)
"""8 vCPUs, 64 GiB RAM, 1 L40s GPU"""

# Task decorator pinned to the g6e.4xlarge L40s instance type.
g6e_4xlarge_task = functools.partial(
    task,
    task_config=_get_l40s_pod("g6e-4xlarge", cpu=16, memory_gib=128, gpus=1)
)
"""16 vCPUs, 128 GiB RAM, 1 L40s GPU"""

# Task decorator pinned to the g6e.8xlarge L40s instance type.
g6e_8xlarge_task = functools.partial(
    task,
    task_config=_get_l40s_pod("g6e-8xlarge", cpu=32, memory_gib=256, gpus=1)
)
"""32 vCPUs, 256 GiB RAM, 1 L40s GPU"""

# Task decorator pinned to the g6e.12xlarge L40s instance type.
g6e_12xlarge_task = functools.partial(
    task,
    task_config=_get_l40s_pod("g6e-12xlarge", cpu=48, memory_gib=384, gpus=1)
)
"""48 vCPUs, 384 GiB RAM, 1 L40s GPU"""

# Task decorator pinned to the g6e.16xlarge L40s instance type (multi-GPU).
g6e_16xlarge_task = functools.partial(
    task,
    task_config=_get_l40s_pod("g6e-16xlarge", cpu=64, memory_gib=512, gpus=4)
)
"""64 vCPUs, 512 GiB RAM, 4 L40s GPUs"""

# Task decorator pinned to the g6e.24xlarge L40s instance type (multi-GPU).
g6e_24xlarge_task = functools.partial(
    task,
    task_config=_get_l40s_pod("g6e-24xlarge", cpu=96, memory_gib=768, gpus=4)
)
"""96 vCPUs, 768 GiB RAM, 4 L40s GPUs"""

"""
g6e_48xlarge_task = functools.partial(
task,
task_config=_get_l40s_pod("g6e-48xlarge", cpu=192, memory=1536, gpus=8)
task_config=_get_l40s_pod("g6e-48xlarge", cpu=192, memory_gib=1536, gpus=8)
)
192 vCPUs, 1536 GiB RAM, 8 L40s GPUs"""

0 comments on commit 5a7950d

Please sign in to comment.