diff --git a/config/opal_check_cuda.m4 b/config/opal_check_cuda.m4 index 67d0cbeede1..ebd70d59e32 100644 --- a/config/opal_check_cuda.m4 +++ b/config/opal_check_cuda.m4 @@ -1,5 +1,6 @@ dnl -*- autoconf -*- dnl +dnl Copyright (c) 2024 NVIDIA Corporation. All rights reserved. dnl Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana dnl University Research and Technology dnl Corporation. All rights reserved. @@ -118,6 +119,12 @@ AS_IF([test "$opal_check_cuda_happy" = "yes"], [#include <$opal_cuda_incdir/cuda.h>])], []) +# If we have CUDA support, check to see if we have support for cuMemCreate memory on host NUMA. +AS_IF([test "$opal_check_cuda_happy" = "yes"], + [AC_CHECK_DECL([CU_MEM_LOCATION_TYPE_HOST_NUMA], [CUDA_VMM_SUPPORT=1], [CUDA_VMM_SUPPORT=0], + [#include <$opal_cuda_incdir/cuda.h>])], + []) + # If we have CUDA support, check to see if we have support for SYNC_MEMOPS # which was first introduced in CUDA 6.0. AS_IF([test "$opal_check_cuda_happy" = "yes"], @@ -160,6 +167,10 @@ AM_CONDITIONAL([OPAL_cuda_support], [test "x$CUDA_SUPPORT" = "x1"]) AC_DEFINE_UNQUOTED([OPAL_CUDA_SUPPORT],$CUDA_SUPPORT, [Whether we want cuda device pointer support]) +AM_CONDITIONAL([OPAL_cuda_vmm_support], [test "x$CUDA_VMM_SUPPORT" = "x1"]) +AC_DEFINE_UNQUOTED([OPAL_CUDA_VMM_SUPPORT],$CUDA_VMM_SUPPORT, + [Whether we have CU_MEM_LOCATION_TYPE_HOST_NUMA support available]) + AM_CONDITIONAL([OPAL_cuda_sync_memops], [test "x$CUDA_SYNC_MEMOPS" = "x1"]) AC_DEFINE_UNQUOTED([OPAL_CUDA_SYNC_MEMOPS],$CUDA_SYNC_MEMOPS, [Whether we have CUDA CU_POINTER_ATTRIBUTE_SYNC_MEMOPS support available]) diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c index 116f0713cd4..1c7d76d1bbf 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. 
* Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. @@ -77,9 +78,93 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module = accelerator_cuda_get_buffer_id }; +static int accelerator_cuda_check_vmm(CUdeviceptr dbuf, CUmemorytype *mem_type, + int *dev_id) +{ +#if OPAL_CUDA_VMM_SUPPORT + static int device_count = -1; + CUmemAllocationProp prop; + CUmemLocation location; + CUresult result; + unsigned long long flags; + CUmemGenericAllocationHandle alloc_handle; + + if (device_count == -1) { + result = cuDeviceGetCount(&device_count); + if (result != CUDA_SUCCESS) { + return 0; + } + } + + result = cuMemRetainAllocationHandle(&alloc_handle, (void*)dbuf); + if (result != CUDA_SUCCESS) { + return 0; + } + + result = cuMemGetAllocationPropertiesFromHandle(&prop, alloc_handle); + if (result != CUDA_SUCCESS) { + cuMemRelease(alloc_handle); + return 0; + } + + if (prop.location.type == CU_MEM_LOCATION_TYPE_DEVICE) { + *mem_type = CU_MEMORYTYPE_DEVICE; + *dev_id = prop.location.id; + cuMemRelease(alloc_handle); + return 1; + } + + if (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA) { + /* check if device has access */ + for (int i = 0; i < device_count; i++) { + location.type = CU_MEM_LOCATION_TYPE_DEVICE; + location.id = i; + result = cuMemGetAccess(&flags, &location, dbuf); + if ((CUDA_SUCCESS == result) && + (CU_MEM_ACCESS_FLAGS_PROT_READWRITE == flags)) { + *mem_type = CU_MEMORYTYPE_DEVICE; + *dev_id = i; + cuMemRelease(alloc_handle); + return 1; + } + } + } + + /* host must have access as device access possibility is exhausted */ + *mem_type = CU_MEMORYTYPE_HOST; + *dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + cuMemRelease(alloc_handle); + return 1; + +#endif + + return 0; +} + +static int accelerator_cuda_get_device_id(CUcontext mem_ctx) { + /* query the device from the context */ + int dev_id = -1; + CUdevice ptr_dev; + cuCtxPushCurrent(mem_ctx); + cuCtxGetDevice(&ptr_dev); + for (int i = 0; i < 
opal_accelerator_cuda_num_devices; ++i) { + CUdevice dev; + cuDeviceGet(&dev, i); + if (dev == ptr_dev) { + dev_id = i; + break; + } + } + cuCtxPopCurrent(&mem_ctx); + return dev_id; +} + static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *flags) { CUresult result; + int is_vmm = 0; + int vmm_dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; + CUmemorytype vmm_mem_type = 0; CUmemorytype mem_type = 0; CUdeviceptr dbuf = (CUdeviceptr) addr; CUcontext ctx = NULL, mem_ctx = NULL; @@ -91,6 +176,8 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * *flags = 0; + is_vmm = accelerator_cuda_check_vmm(dbuf, &vmm_mem_type, &vmm_dev_id); + #if OPAL_CUDA_GET_ATTRIBUTES uint32_t is_managed = 0; /* With CUDA 7.0, we can get multiple attributes with a single call */ @@ -120,14 +207,24 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * return OPAL_ERROR; } } else if (CU_MEMORYTYPE_HOST == mem_type) { - /* Host memory, nothing to do here */ - return 0; + if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE)) { + mem_type = CU_MEMORYTYPE_DEVICE; + *dev_id = vmm_dev_id; + } else { + /* Host memory, nothing to do here */ + return 0; + } } else if (0 == mem_type) { /* This can happen when CUDA is initialized but dbuf is not valid CUDA pointer */ return 0; + } else { + if (is_vmm) { + *dev_id = vmm_dev_id; + } else { + /* query the device from the context */ + *dev_id = accelerator_cuda_get_device_id(mem_ctx); + } } - /* Must be a device pointer */ - assert(CU_MEMORYTYPE_DEVICE == mem_type); #else /* OPAL_CUDA_GET_ATTRIBUTES */ result = cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf); if (CUDA_SUCCESS != result) { @@ -138,12 +235,27 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * return OPAL_ERROR; } } else if (CU_MEMORYTYPE_HOST == mem_type) { - /* Host memory, nothing to do here */ - return 0; + if (is_vmm && (vmm_mem_type == 
CU_MEMORYTYPE_DEVICE)) { + mem_type = CU_MEMORYTYPE_DEVICE; + *dev_id = vmm_dev_id; + } else { + /* Host memory, nothing to do here */ + return 0; + } + } else { + if (is_vmm) { + *dev_id = vmm_dev_id; + } else { + result = cuPointerGetAttribute(&mem_ctx, + CU_POINTER_ATTRIBUTE_CONTEXT, dbuf); + /* query the device from the context */ + *dev_id = accelerator_cuda_get_device_id(mem_ctx); + } } +#endif /* OPAL_CUDA_GET_ATTRIBUTES */ + /* Must be a device pointer */ assert(CU_MEMORYTYPE_DEVICE == mem_type); -#endif /* OPAL_CUDA_GET_ATTRIBUTES */ /* This piece of code was added in to handle in a case involving * OMP threads. The user had initialized CUDA and then spawned @@ -166,6 +278,16 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * return OPAL_ERROR; } #endif /* OPAL_CUDA_GET_ATTRIBUTES */ + if (is_vmm) { + /* This function is expected to set context if pointer is device + * accessible but VMM allocations have NULL context associated + * which cannot be set against the calling thread */ + opal_output(0, + "CUDA: unable to set context with the given pointer " + "ptr=%p aborting...", addr); + return OPAL_ERROR; + } + result = cuCtxSetCurrent(mem_ctx); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_output(0,