From 0ed1fe371909c55403714c6dd32c64d0941ecc9c Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 2 Nov 2020 14:57:04 -0800 Subject: [PATCH 1/5] Allow parsing CUDA_VISIBLE_DEVICES with UUID Add new parse_cuda_visible_device utility function to parse UUIDs --- dask_cuda/local_cuda_cluster.py | 13 ++++++++++--- dask_cuda/utils.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index 635daac7..39a03fa9 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -17,6 +17,7 @@ get_n_gpus, get_ucx_config, get_ucx_net_devices, + parse_cuda_visible_device, ) @@ -32,7 +33,9 @@ def cuda_visible_devices(i, visible=None): """ if visible is None: try: - visible = map(int, os.environ["CUDA_VISIBLE_DEVICES"].split(",")) + visible = map( + parse_cuda_visible_device, os.environ["CUDA_VISIBLE_DEVICES"].split(",") + ) except KeyError: visible = range(get_n_gpus()) visible = list(visible) @@ -159,7 +162,9 @@ def __init__( CUDA_VISIBLE_DEVICES = cuda_visible_devices(0) if isinstance(CUDA_VISIBLE_DEVICES, str): CUDA_VISIBLE_DEVICES = CUDA_VISIBLE_DEVICES.split(",") - CUDA_VISIBLE_DEVICES = list(map(int, CUDA_VISIBLE_DEVICES)) + CUDA_VISIBLE_DEVICES = list( + map(parse_cuda_visible_device, CUDA_VISIBLE_DEVICES) + ) if n_workers is None: n_workers = len(CUDA_VISIBLE_DEVICES) self.host_memory_limit = parse_memory_limit( @@ -283,7 +288,9 @@ def new_worker_spec(self): visible_devices = cuda_visible_devices(worker_count, self.cuda_visible_devices) spec["options"].update( { - "env": {"CUDA_VISIBLE_DEVICES": visible_devices,}, + "env": { + "CUDA_VISIBLE_DEVICES": visible_devices, + }, "plugins": { CPUAffinity(get_cpu_affinity(worker_count)), RMMSetup(self.rmm_pool_size, self.rmm_managed_memory), diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index ee717941..55aea8ed 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -388,3 +388,33 @@ def f(x): def all_to_all(client): return client.sync(_all_to_all, client=client, asynchronous=client.asynchronous) + + +def parse_cuda_visible_device(dev): + """Parses a single CUDA device identifier + + A device identifier must either be an intenger, a string containing an + integer or a string containing the device's UUID, beginning with prefix + 'GPU-' or 'MIG-GPU'. + + >>> parse_cuda_visible_device(2) + 2 + >>> parse_cuda_visible_device('2') + 2 + >>> parse_cuda_visible_device('GPU-9baca7f5-0f2f-01ac-6b05-8da14d6e9005') + 'GPU-9baca7f5-0f2f-01ac-6b05-8da14d6e9005' + >>> parse_cuda_visible_device('Foo') + Traceback (most recent call last): + ... + ValueError: Devices in CUDA_VISIBLE_DEVICES must be comma-separated integers or strings beginning with 'GPU-' or 'MIG-GPU-' prefixes. + """ + try: + return int(dev) + except ValueError: + if any(dev.startswith(prefix) for prefix in ["GPU-", "MIG-GPU-"]): + return dev + else: + raise ValueError( + "Devices in CUDA_VISIBLE_DEVICES must be comma-separated integers " + "or strings beginning with 'GPU-' or 'MIG-GPU-' prefixes." + ) From cb5e0348699b4cc08d627f4279239324df37488a Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 2 Nov 2020 14:58:12 -0800 Subject: [PATCH 2/5] Add test CUDA_VISIBLE_DEVICES parsing with UUID --- dask_cuda/tests/test_utils.py | 38 +++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/dask_cuda/tests/test_utils.py b/dask_cuda/tests/test_utils.py index 8dcb4323..4d669b67 100644 --- a/dask_cuda/tests/test_utils.py +++ b/dask_cuda/tests/test_utils.py @@ -6,12 +6,15 @@ from dask_cuda.utils import ( get_cpu_affinity, get_device_total_memory, + get_gpu_count, get_n_gpus, get_preload_options, get_ucx_config, get_ucx_net_devices, + parse_cuda_visible_device, unpack_bitmask, ) +from dask_cuda.local_cuda_cluster import cuda_visible_devices def test_get_n_gpus(): @@ -181,3 +184,38 @@ def test_get_ucx_config(enable_tcp_over_ucx, enable_infiniband, net_devices): pass elif net_devices == "": assert "net-device" not in ucx_config + + +def test_parse_visible_devices(): + pynvml = pytest.importorskip("pynvml") + pynvml.nvmlInit() + indices = [] + uuids = [] + for index in range(get_gpu_count()): + handle = pynvml.nvmlDeviceGetHandleByIndex(index) + uuid = pynvml.nvmlDeviceGetUUID(handle).decode("utf-8") + + assert parse_cuda_visible_device(index) == index + assert parse_cuda_visible_device(uuid) == uuid + + indices.append(str(index)) + uuids.append(pynvml.nvmlDeviceGetUUID(handle).decode("utf-8")) + + index_devices = ",".join(indices) + os.environ["CUDA_VISIBLE_DEVICES"] = index_devices + for index in range(get_gpu_count()): + visible = cuda_visible_devices(index) + assert visible.split(",")[0] == str(index) + + uuid_devices = ",".join(uuids) + os.environ["CUDA_VISIBLE_DEVICES"] = uuid_devices + for index in range(get_gpu_count()): + visible = cuda_visible_devices(index) + assert visible.split(",")[0] == str(uuids[index]) + + with pytest.raises(ValueError): + parse_cuda_visible_device("Foo") + + with pytest.raises(TypeError): + parse_cuda_visible_device(None) + parse_cuda_visible_device([]) From 7626154042518ad5b4a22dadebbb7a9d8ac7052c Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 2 Nov 2020 15:01:03 -0800 Subject: [PATCH 3/5] Move cuda_visible_devices to utils.py --- dask_cuda/cuda_worker.py | 2 +- dask_cuda/local_cuda_cluster.py | 25 +------------------------ dask_cuda/tests/test_utils.py | 2 +- dask_cuda/utils.py | 23 +++++++++++++++++++++++ 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index bc25bae8..90cf99c1 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -20,10 +20,10 @@ from .device_host_file import DeviceHostFile from .initialize import initialize -from .local_cuda_cluster import cuda_visible_devices from .utils import ( CPUAffinity, RMMSetup, + cuda_visible_devices, get_cpu_affinity, get_device_total_memory, get_n_gpus, diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index 39a03fa9..60fbfb05 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -12,38 +12,15 @@ from .utils import ( CPUAffinity, RMMSetup, + cuda_visible_devices, get_cpu_affinity, get_device_total_memory, - get_n_gpus, get_ucx_config, get_ucx_net_devices, parse_cuda_visible_device, ) -def cuda_visible_devices(i, visible=None): - """Cycling values for CUDA_VISIBLE_DEVICES environment variable - - Examples - -------- - >>> cuda_visible_devices(0, range(4)) - '0,1,2,3' - >>> cuda_visible_devices(3, range(8)) - '3,4,5,6,7,0,1,2' - """ - if visible is None: - try: - visible = map( - parse_cuda_visible_device, os.environ["CUDA_VISIBLE_DEVICES"].split(",") - ) - except KeyError: - visible = range(get_n_gpus()) - visible = list(visible) - - L = visible[i:] + visible[:i] - return ",".join(map(str, L)) - - class LocalCUDACluster(LocalCluster): """A variant of LocalCluster that uses one GPU per process diff --git a/dask_cuda/tests/test_utils.py b/dask_cuda/tests/test_utils.py index 4d669b67..3700bd12 100644 --- a/dask_cuda/tests/test_utils.py +++ b/dask_cuda/tests/test_utils.py @@ -4,6 +4,7 @@ from numba import cuda from dask_cuda.utils import ( + cuda_visible_devices, get_cpu_affinity, get_device_total_memory, get_gpu_count, @@ -14,7 +15,6 @@ parse_cuda_visible_device, unpack_bitmask, ) -from dask_cuda.local_cuda_cluster import cuda_visible_devices def test_get_n_gpus(): diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index 55aea8ed..64226e55 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -418,3 +418,26 @@ def parse_cuda_visible_device(dev): "Devices in CUDA_VISIBLE_DEVICES must be comma-separated integers " "or strings beginning with 'GPU-' or 'MIG-GPU-' prefixes." ) + + +def cuda_visible_devices(i, visible=None): + """Cycling values for CUDA_VISIBLE_DEVICES environment variable + + Examples + -------- + >>> cuda_visible_devices(0, range(4)) + '0,1,2,3' + >>> cuda_visible_devices(3, range(8)) + '3,4,5,6,7,0,1,2' + """ + if visible is None: + try: + visible = map( + parse_cuda_visible_device, os.environ["CUDA_VISIBLE_DEVICES"].split(",") + ) + except KeyError: + visible = range(get_n_gpus()) + visible = list(visible) + + L = visible[i:] + visible[:i] + return ",".join(map(str, L)) From 55ddfb480e9e78adc87366f1058430e41c7f64cb Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 2 Nov 2020 15:24:47 -0800 Subject: [PATCH 4/5] Fix formatting --- dask_cuda/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index 64226e55..fa3e72d5 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -406,7 +406,8 @@ def parse_cuda_visible_device(dev): >>> parse_cuda_visible_device('Foo') Traceback (most recent call last): ... - ValueError: Devices in CUDA_VISIBLE_DEVICES must be comma-separated integers or strings beginning with 'GPU-' or 'MIG-GPU-' prefixes. + ValueError: Devices in CUDA_VISIBLE_DEVICES must be comma-separated integers or + strings beginning with 'GPU-' or 'MIG-GPU-' prefixes. """ try: return int(dev) From 1293ad6b484cf305453c06ba13b38cb0a90d932d Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 3 Nov 2020 19:45:03 +0100 Subject: [PATCH 5/5] Fix parse_cuda_visible_device doc typo Co-authored-by: Mads R. B. Kristensen --- dask_cuda/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index fa3e72d5..b2c7ccf4 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -393,7 +393,7 @@ def all_to_all(client): def parse_cuda_visible_device(dev): """Parses a single CUDA device identifier - A device identifier must either be an intenger, a string containing an + A device identifier must either be an integer, a string containing an integer or a string containing the device's UUID, beginning with prefix 'GPU-' or 'MIG-GPU'.