Skip to content

Commit

Permalink
Merge pull request #153 from Duke-GCB/gpu-extension
Browse files Browse the repository at this point in the history
gpu-extension
  • Loading branch information
emmanuelmathot authored Jun 19, 2023
2 parents 78bba0e + e4ec6f0 commit 0de0e1b
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 6 deletions.
26 changes: 22 additions & 4 deletions calrissian/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from cwltool.errors import WorkflowException, UnsupportedRequirement
from calrissian.k8s import KubernetesClient, CompletionResult
from calrissian.report import Reporter, TimedResourceReport
from cwltool.builder import Builder
import logging
import os
import yaml
Expand Down Expand Up @@ -188,7 +189,7 @@ def add_emptydir_volume_binding(self, name, target):

class KubernetesPodBuilder(object):

def __init__(self, name, container_image, environment, volume_mounts, volumes, command_line, stdout, stderr, stdin, resources, labels, nodeselectors, security_context, serviceaccount):
def __init__(self, name, container_image, environment, volume_mounts, volumes, command_line, stdout, stderr, stdin, resources, labels, nodeselectors, security_context, serviceaccount, requirements=None, hints=None):
self.name = name
self.container_image = container_image
self.environment = environment
Expand All @@ -203,6 +204,8 @@ def __init__(self, name, container_image, environment, volume_mounts, volumes, c
self.nodeselectors = nodeselectors
self.security_context = security_context
self.serviceaccount = serviceaccount
self.requirements = {} if requirements is None else requirements
self.hints = [] if hints is None else hints

def pod_name(self):
tag = random_tag()
Expand Down Expand Up @@ -297,7 +300,7 @@ def resource_value(kind, cwl_value):
return None

def container_resources(self):
log.debug('Building resources spec from {}'.format(self.resources))
log.debug(f'Building resources spec from {self.resources}')
container_resources = {}
for cwl_field, cwl_value in self.resources.items():
resource_bound = 'requests'
Expand All @@ -307,6 +310,19 @@ def container_resources(self):
if not container_resources.get(resource_bound):
container_resources[resource_bound] = {}
container_resources[resource_bound][resource_type] = resource_value

# Add CUDA requirements from CWL
for requirement in self.hints:
if requirement["class"] in ['cwltool:CUDARequirement', 'http://commonwl.org/cwltool#CUDARequirement']:
log.debug('Adding CUDARequirement resources spec')
resource_bound = 'requests'
container_resources[resource_bound]['nvidia.com/gpu'] = str(requirement["cudaDeviceCountMin"])
if "limits" in container_resources:
resource_bound = 'limits'
container_resources[resource_bound]['nvidia.com/gpu'] = str(requirement["cudaDeviceCountMax"])
else:
container_resources['limits'] = {'nvidia.com/gpu': str(requirement["cudaDeviceCountMax"])}

return container_resources

def pod_labels(self):
Expand Down Expand Up @@ -553,7 +569,9 @@ def create_kubernetes_runtime(self, runtimeContext):
self.get_pod_labels(runtimeContext),
self.get_pod_nodeselectors(runtimeContext),
self.get_security_context(runtimeContext),
self.get_pod_serviceaccount(runtimeContext)
self.get_pod_serviceaccount(runtimeContext),
self.builder.requirements,
self.builder.hints
)
built = k8s_builder.build()
log.debug('{}\n{}{}\n'.format('-' * 80, yaml.dump(built), '-' * 80))
Expand Down Expand Up @@ -692,4 +710,4 @@ def append_volume(self, runtime, source, target, writable=False):
This is called by the base class for file literals after they've been created.
We already have a similar function, so we just call that.
"""
self._add_volume_binding(source, target, writable)
self._add_volume_binding(source, target, writable)
37 changes: 35 additions & 2 deletions tests/test_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from calrissian.context import CalrissianRuntimeContext
from calrissian.k8s import CompletionResult
import threading

from collections import OrderedDict

class SafeNameTestCase(TestCase):

Expand Down Expand Up @@ -343,6 +343,37 @@ def test_container_resources(self):
}
}
self.assertEqual(expected, resources)

def test_gpu_hints(self):
self.pod_builder.resources = {'cores': 2, 'ram': 256}
self.pod_builder.hints = [OrderedDict([("class", "cwltool:CUDARequirement"), ("cudaVersionMin", '10.0'), ("cudaComputeCapability", '3.0'), ("cudaDeviceCountMin", 1), ("cudaDeviceCountMax", 1)])]

resources = self.pod_builder.container_resources()
expected = {
'requests': {
'cpu': '2',
'memory': '256Mi',
'nvidia.com/gpu': '1'
},
"limits": {
'nvidia.com/gpu': '1'
}
}
self.assertEqual(expected, resources)
self.pod_builder.hints = [OrderedDict([("class", "cwltool:CUDARequirement"), ("cudaVersionMin", '10.0'), ("cudaComputeCapability", '3.0'), ("cudaDeviceCountMin", 2), ("cudaDeviceCountMax", 4)])]

resources = self.pod_builder.container_resources()
expected = {
'requests': {
'cpu': '2',
'memory': '256Mi',
'nvidia.com/gpu': '2'
},
"limits": {
'nvidia.com/gpu': '4'
}
}
self.assertEqual(expected, resources)

def test_string_labels(self):
self.pod_builder.labels = {'key1': 123}
Expand Down Expand Up @@ -637,7 +668,9 @@ def realpath(path):
mock_read_yaml.return_value,
mock_read_yaml.return_value,
job.get_security_context(mock_runtime_context),
None
None,
job.builder.requirements,
job.builder.hints,
))
# calls builder.build
# returns that
Expand Down

0 comments on commit 0de0e1b

Please sign in to comment.