From 924e9f965cdbc9de398704bf9aaf14bec9a3c98e Mon Sep 17 00:00:00 2001 From: Chang Liu Date: Fri, 22 Nov 2024 06:05:34 -0800 Subject: [PATCH] [Bugfix] Dynamic load NVML symbols for better compatibility (#234) [File PR here for the record] Dynamically load NVML symbols for querying GPU fabric info to address incompatibility issues with outdated display drivers. Authors: - Chang Liu (https://github.com/chang-l) Approvers: - https://github.com/linhu-nv - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/wholegraph/pull/234 --- cpp/src/nvml_wrap.cpp | 78 ++++++++++++++++++++++++++++ cpp/src/nvml_wrap.h | 28 ++++++++++ cpp/src/wholememory/communicator.cpp | 74 ++++++++++++++------------ cpp/src/wholememory/system_info.cpp | 26 +++++----- cpp/src/wholememory/system_info.hpp | 5 +- 5 files changed, 165 insertions(+), 46 deletions(-) create mode 100644 cpp/src/nvml_wrap.cpp create mode 100644 cpp/src/nvml_wrap.h diff --git a/cpp/src/nvml_wrap.cpp b/cpp/src/nvml_wrap.cpp new file mode 100644 index 000000000..fc2551858 --- /dev/null +++ b/cpp/src/nvml_wrap.cpp @@ -0,0 +1,78 @@ +// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "nvml_wrap.h"
+
+#if CUDA_VERSION >= 12030
+#include <dlfcn.h>
+#include <cstdio>
+#include <mutex>
+
+namespace {
+
+void* nvml_handle = nullptr;  // dlopen() handle; reset to nullptr whenever loading fails
+std::mutex nvml_mutex;        // serializes the load performed by NvmlFabricSymbolLoaded()
+bool nvml_loaded = false;     // set once both required symbols have been resolved
+
+// Opens the NVML shared library, trying the versioned soname before the unversioned one.
+// Stores the handle in nvml_handle; on failure prints dlerror() to stderr and returns false.
+bool LoadNvmlLibrary()
+{
+  nvml_handle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
+  if (!nvml_handle) { nvml_handle = dlopen("libnvidia-ml.so", RTLD_NOW); }
+  if (!nvml_handle) {
+    fprintf(stderr, "Failed to load NVML library: %s\n", dlerror());
+    return false;
+  }
+  return true;
+}
+
+// Looks up `name` in the opened NVML library; yields nullptr when dlsym() finds nothing.
+template <typename T>
+T LoadNvmlSymbol(const char* name)
+{
+  return reinterpret_cast<T>(dlsym(nvml_handle, name));
+}
+
+}  // namespace
+
+// Global function pointers; both remain nullptr unless NvmlFabricSymbolLoaded() returns true.
+nvmlDeviceGetHandleByIndexFunc nvmlDeviceGetHandleByIndexPtr = nullptr;
+nvmlDeviceGetGpuFabricInfoFunc nvmlDeviceGetGpuFabricInfoPtr = nullptr;
+
+// Ensure NVML is loaded and symbols are initialized. Holds nvml_mutex for the whole call and
+// returns true only when both symbols resolved; after a failure the next call tries again.
+bool NvmlFabricSymbolLoaded()
+{
+  std::lock_guard<std::mutex> lock(nvml_mutex);
+  if (nvml_loaded) { return true; }  // Already loaded
+  if (!LoadNvmlLibrary()) { return false; }
+
+  nvmlDeviceGetHandleByIndexPtr =
+    LoadNvmlSymbol<nvmlDeviceGetHandleByIndexFunc>("nvmlDeviceGetHandleByIndex");
+  nvmlDeviceGetGpuFabricInfoPtr =
+    LoadNvmlSymbol<nvmlDeviceGetGpuFabricInfoFunc>("nvmlDeviceGetGpuFabricInfo");
+
+  if (!nvmlDeviceGetHandleByIndexPtr || !nvmlDeviceGetGpuFabricInfoPtr) {
+    // A partial resolve must not leave a pointer into the library that is about to be unloaded.
+    nvmlDeviceGetHandleByIndexPtr = nullptr;
+    nvmlDeviceGetGpuFabricInfoPtr = nullptr;
+    dlclose(nvml_handle);
+    nvml_handle = nullptr;
+    return false;
+  }
+  nvml_loaded = true;
+  return true;
+}
+#endif
diff --git a/cpp/src/nvml_wrap.h b/cpp/src/nvml_wrap.h
new file mode 100644
index 000000000..f8b22fc7f
--- /dev/null
+++ b/cpp/src/nvml_wrap.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#if CUDA_VERSION >= 12030 +#include + +bool NvmlFabricSymbolLoaded(); + +typedef nvmlReturn_t (*nvmlDeviceGetHandleByIndexFunc)(unsigned int, nvmlDevice_t*); +typedef nvmlReturn_t (*nvmlDeviceGetGpuFabricInfoFunc)(nvmlDevice_t, nvmlGpuFabricInfo_t*); + +extern nvmlDeviceGetHandleByIndexFunc nvmlDeviceGetHandleByIndexPtr; +extern nvmlDeviceGetGpuFabricInfoFunc nvmlDeviceGetGpuFabricInfoPtr; +#endif diff --git a/cpp/src/wholememory/communicator.cpp b/cpp/src/wholememory/communicator.cpp index f76a4c7b1..34053ad7e 100644 --- a/cpp/src/wholememory/communicator.cpp +++ b/cpp/src/wholememory/communicator.cpp @@ -497,6 +497,7 @@ void get_host_info(host_info* phi) bool comm_support_mnnvl(wholememory_comm_t wm_comm, const std::unique_ptr& p_rank_info) { #if CUDA_VERSION >= 12030 + if (!nvmlFabricSymbolLoaded) return 0; int flag = 0; CUdevice currentDev; WM_CU_CHECK_NO_THROW(cuDeviceGet(¤tDev, wm_comm->dev_id)); @@ -534,16 +535,22 @@ void exchange_rank_info(wholememory_comm_t wm_comm) wm_comm->clique_info.is_in_clique = 0; #if CUDA_VERSION >= 12030 - memset(&ri.fabric_info, 0, sizeof(ri.fabric_info)); - WHOLEMEMORY_CHECK_NOTHROW(GetGpuFabricInfo(wm_comm->dev_id, &ri.fabric_info) == - WHOLEMEMORY_SUCCESS); + if (nvmlFabricSymbolLoaded) { + memset(&ri.fabric_info, 0, sizeof(ri.fabric_info)); + WHOLEMEMORY_CHECK_NOTHROW(GetGpuFabricInfo(wm_comm->dev_id, &ri.fabric_info) == + WHOLEMEMORY_SUCCESS); - // // A zero UUID means we don't have MNNVL fabric info - if (((((long*)ri.fabric_info.clusterUuid)[0] | 
((long*)ri.fabric_info.clusterUuid)[1]) == 0)) { - wm_comm->clique_info.is_in_clique = 0; + // // A zero UUID means we don't have MNNVL fabric info + if (((((long*)ri.fabric_info.clusterUuid)[0] | ((long*)ri.fabric_info.clusterUuid)[1]) == 0)) { + wm_comm->clique_info.is_in_clique = 0; + } else { + wm_comm->clique_info.is_in_clique = 1; + } } else { - wm_comm->clique_info.is_in_clique = 1; + WHOLEMEMORY_WARN( + "Some required NVML symbols are missing, likely due to an outdated GPU display driver. MNNVL " + "support will be disabled."); } #endif @@ -573,38 +580,41 @@ void exchange_rank_info(wholememory_comm_t wm_comm) } #if CUDA_VERSION >= 12030 - - if ((memcmp(ri.fabric_info.clusterUuid, - p_rank_info.get()[r].fabric_info.clusterUuid, - NVML_GPU_FABRIC_UUID_LEN) == 0) && - (ri.fabric_info.cliqueId == p_rank_info.get()[r].fabric_info.cliqueId)) { - if (r == wm_comm->world_rank) { - wm_comm->clique_info.clique_rank = wm_comm->clique_info.clique_rank_num; + if (nvmlFabricSymbolLoaded) { + if ((memcmp(ri.fabric_info.clusterUuid, + p_rank_info.get()[r].fabric_info.clusterUuid, + NVML_GPU_FABRIC_UUID_LEN) == 0) && + (ri.fabric_info.cliqueId == p_rank_info.get()[r].fabric_info.cliqueId)) { + if (r == wm_comm->world_rank) { + wm_comm->clique_info.clique_rank = wm_comm->clique_info.clique_rank_num; + } + if (wm_comm->clique_info.clique_rank_num == 0) { + wm_comm->clique_info.clique_first_rank = r; + } + wm_comm->clique_info.clique_rank_num++; } - if (wm_comm->clique_info.clique_rank_num == 0) { wm_comm->clique_info.clique_first_rank = r; } - wm_comm->clique_info.clique_rank_num++; + clique_uuids.insert( + std::string(reinterpret_cast(p_rank_info.get()[r].fabric_info.clusterUuid), + NVML_GPU_FABRIC_UUID_LEN)); } - clique_uuids.insert( - std::string(reinterpret_cast(p_rank_info.get()[r].fabric_info.clusterUuid), - NVML_GPU_FABRIC_UUID_LEN)); - #endif } #if CUDA_VERSION >= 12030 - wm_comm->clique_info.clique_num = clique_uuids.size(); - - std::string uuid = 
std::string(reinterpret_cast(ri.fabric_info.clusterUuid), - NVML_GPU_FABRIC_UUID_LEN); - int id = 0; - for (auto clique_uuid : clique_uuids) { - if (clique_uuid == uuid) { wm_comm->clique_info.clique_id = id; } - id++; - } - - wm_comm->support_mnnvl = (comm_support_mnnvl(wm_comm, p_rank_info)) && - (wm_comm->clique_info.clique_rank_num == wm_comm->world_size); + if (nvmlFabricSymbolLoaded) { + wm_comm->clique_info.clique_num = clique_uuids.size(); + + std::string uuid = std::string(reinterpret_cast(ri.fabric_info.clusterUuid), + NVML_GPU_FABRIC_UUID_LEN); + int id = 0; + for (auto clique_uuid : clique_uuids) { + if (clique_uuid == uuid) { wm_comm->clique_info.clique_id = id; } + id++; + } + wm_comm->support_mnnvl = (comm_support_mnnvl(wm_comm, p_rank_info)) && + (wm_comm->clique_info.clique_rank_num == wm_comm->world_size); + } #endif } diff --git a/cpp/src/wholememory/system_info.cpp b/cpp/src/wholememory/system_info.cpp index 01c124a6f..8cd0209f5 100644 --- a/cpp/src/wholememory/system_info.cpp +++ b/cpp/src/wholememory/system_info.cpp @@ -13,8 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "system_info.hpp" - #include #include "cuda_macros.hpp" @@ -140,17 +138,19 @@ wholememory_error_code_t NvmlEnsureInitialized() wholememory_error_code_t GetGpuFabricInfo(int dev, nvmlGpuFabricInfo_t* gpuFabricInfo) { WHOLEMEMORY_CHECK_NOTHROW(NvmlEnsureInitialized() == WHOLEMEMORY_SUCCESS); - std::lock_guard locked(lock); - // gpuFabricInfo->version = nvmlGpuFabricInfo_v2; - nvmlDevice_t nvml_device; - nvmlReturn_t ret = nvmlDeviceGetHandleByIndex(dev, &nvml_device); - WHOLEMEMORY_EXPECTS_NOTHROW( - ret == NVML_SUCCESS, "nvmlDeviceGetHandleByIndex error:%s", nvmlErrorString(ret)); - ret = nvmlDeviceGetGpuFabricInfo(nvml_device, gpuFabricInfo); - WHOLEMEMORY_EXPECTS_NOTHROW( - ret == NVML_SUCCESS, "nvmlDeviceGetGpuFabricInfo error:%s", nvmlErrorString(ret)); - - return WHOLEMEMORY_SUCCESS; + if (wholememory::nvmlFabricSymbolLoaded) { + std::lock_guard locked(lock); + // gpuFabricInfo->version = nvmlGpuFabricInfo_v2; + nvmlDevice_t nvml_device; + nvmlReturn_t ret = nvmlDeviceGetHandleByIndexPtr(dev, &nvml_device); + WHOLEMEMORY_EXPECTS_NOTHROW( + ret == NVML_SUCCESS, "nvmlDeviceGetHandleByIndex error:%s", nvmlErrorString(ret)); + ret = nvmlDeviceGetGpuFabricInfoPtr(nvml_device, gpuFabricInfo); + WHOLEMEMORY_EXPECTS_NOTHROW( + ret == NVML_SUCCESS, "nvmlDeviceGetGpuFabricInfo error:%s", nvmlErrorString(ret)); + return WHOLEMEMORY_SUCCESS; + } + return WHOLEMEMORY_SYSTEM_ERROR; } }; // namespace wholememory diff --git a/cpp/src/wholememory/system_info.hpp b/cpp/src/wholememory/system_info.hpp index a157924eb..4d6c52c27 100644 --- a/cpp/src/wholememory/system_info.hpp +++ b/cpp/src/wholememory/system_info.hpp @@ -18,6 +18,7 @@ #include "wholememory/wholememory.h" #if CUDA_VERSION >= 12030 +#include "nvml_wrap.h" #include #endif bool DevAttrPagebleMemoryAccess(); @@ -37,7 +38,9 @@ bool SupportEGM(); // bool SupportMNNVLForEGM(); #if CUDA_VERSION >= 12030 namespace wholememory { + +inline bool nvmlFabricSymbolLoaded = NvmlFabricSymbolLoaded(); 
wholememory_error_code_t GetGpuFabricInfo(int dev, nvmlGpuFabricInfo_t* gpuFabricInfo); -} +} // namespace wholememory #endif