From ebbafae3ed71bb4e900a2f310eae7907e2333cf8 Mon Sep 17 00:00:00 2001 From: "Michael J. Williams" Date: Fri, 17 Jan 2025 16:45:42 +0000 Subject: [PATCH] Add multi-gpu support for cupy scheme (#5007) * add multi-gpu support for cupy scheme * fix typo in exit method * re-add missing warning * using module logger instead of global logger --- pycbc/scheme.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/pycbc/scheme.py b/pycbc/scheme.py index 0a9e6740e1e..9d27d146792 100644 --- a/pycbc/scheme.py +++ b/pycbc/scheme.py @@ -31,6 +31,7 @@ from functools import wraps import logging from .libutils import get_ctypes_library +from .pool import use_mpi logger = logging.getLogger('pycbc.scheme') @@ -117,16 +118,40 @@ def __init__(self, device_num=0): class CUPYScheme(Scheme): - """Scheme for using CUPY""" + """Scheme for using CUPY. + + Supports using CUPY with MPI. If MPI is enabled, will use all available + devices. The environment variable `CUDA_VISIBLE_DEVICES` can be used to + restrict the devices used. + + Parameters + ---------- + device_num : int, optional + The device number to use. If not provided, will use the default, 0. + Should not be provided when using MPI to parallelize across devices. + """ def __init__(self, device_num=None): import cupy # Fail now if cupy is not there. import cupy.cuda + + do_mpi, _, rank = use_mpi(require_mpi=False, log=False) + + if device_num is not None and do_mpi: + logger.warning("MPI is enabled, but a device number was provided.") + + if device_num is None and do_mpi: + # Logical device numbers will always be 0, 1, 2, ... etc. irrespective + # of the physical device numbers. + device_num = rank % cupy.cuda.runtime.getDeviceCount() + logger.debug("MPI enabled, using CUDA device %s", device_num) + self.device_num = device_num self.cuda_device = cupy.cuda.Device(self.device_num) + def __enter__(self): super().__enter__() self.cuda_device.__enter__() - logging.warn( + logger.warning( "You are using the CUPY GPU backend for PyCBC. This backend is " "still only a prototype. It may be useful for your application " "but it may fail unexpectedly, run slowly, or not give correct "