Performance: Add CUDA Aware MPI (#5930)
* add CUDA-aware MPI
* update docs
Qianruipku authored Feb 26, 2025
1 parent b06a163 commit 9448ba9
Showing 6 changed files with 49 additions and 0 deletions.
5 changes: 5 additions & 0 deletions CMakeLists.txt
@@ -42,6 +42,7 @@ option(ENABLE_PEXSI "Enable support for PEXSI." OFF)
option(ENABLE_CUSOLVERMP "Enable cusolvermp." OFF)
option(USE_DSP "Enable DSP usage." OFF)
option(USE_CUDA_ON_DCU "Enable CUDA on DCU" OFF)
option(USE_CUDA_MPI "Enable CUDA-aware MPI" OFF)

# enable json support
if(ENABLE_RAPIDJSON)
@@ -132,6 +133,10 @@ if (USE_CUDA_ON_DCU)
add_compile_definitions(__CUDA_ON_DCU)
endif()

if (USE_CUDA_MPI)
add_compile_definitions(__CUDA_MPI)
endif()

list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)

if(ENABLE_COVERAGE)
2 changes: 2 additions & 0 deletions docs/advanced/install.md
@@ -115,6 +115,8 @@ To build NVIDIA GPU support for ABACUS, define `USE_CUDA` flag. You can also spe
cmake -B build -DUSE_CUDA=1 -DCMAKE_CUDA_COMPILER=${path to cuda toolkit}/bin/nvcc
```

If you are confident that your MPI implementation is CUDA-aware, you can add `-DUSE_CUDA_MPI=ON`. In this case, MPI communicates GPU data directly, rather than staging it through the CPU before communication. Note that if your MPI is not CUDA-aware, building with `-DUSE_CUDA_MPI=ON` will cause the program to throw an error.
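
For example, one possible configure line combining this flag with the GPU build command shown above (the CUDA toolkit path is a placeholder, as before):

```
cmake -B build -DUSE_CUDA=1 -DUSE_CUDA_MPI=ON -DCMAKE_CUDA_COMPILER=${path to cuda toolkit}/bin/nvcc
```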

## Build math library from source

> Note: We recommend using the latest available compiler sets, since they offer faster implementations of math functions.
13 changes: 13 additions & 0 deletions source/module_base/para_gemm.cpp
@@ -2,6 +2,7 @@

#include "kernels/math_kernel_op.h"
#include "parallel_device.h"
#include "module_base/timer.h"
namespace ModuleBase
{
template <typename T, typename Device>
@@ -109,6 +110,7 @@ void PGemmCN<T, Device>::set_dimension(
template <typename T, typename Device>
void PGemmCN<T, Device>::multiply(const T alpha, const T* A, const T* B, const T beta, T* C)
{
ModuleBase::timer::tick("PGemmCN", "multiply");
#ifdef __MPI
if (this->col_nproc > 1)
{
@@ -126,6 +128,7 @@ void PGemmCN<T, Device>::multiply(const T alpha, const T* A, const T* B, const T
{
multiply_single(alpha, A, B, beta, C);
}
ModuleBase::timer::tick("PGemmCN", "multiply");
}

template <typename T, typename Device>
@@ -154,10 +157,12 @@ void PGemmCN<T, Device>::multiply_col(const T alpha, const T* A, const T* B, con

std::vector<T> B_tmp(max_colA * LDA);
std::vector<T> isend_tmp;
#ifndef __CUDA_MPI
if (std::is_same<Device, base_device::DEVICE_GPU>::value)
{
isend_tmp.resize(max_colA * LDA);
}
#endif
for (int ip = 0; ip < col_nproc; ip++)
{
if (col_rank != ip)
@@ -244,6 +249,13 @@ void PGemmCN<T, Device>::multiply_col(const T alpha, const T* A, const T* B, con

if (this->gatherC)
{
#ifdef __CUDA_MPI
if (this->row_nproc > 1)
{
Parallel_Common::reduce_data(C_local, size_C_local, row_world);
}
Parallel_Common::gatherv_data(C_local, size_C_local, C, recv_counts.data(), displs.data(), col_world);
#else
T* Cglobal_cpu = nullptr;
T* Clocal_cpu = C_tmp.data();
std::vector<T> cpu_tmp;
@@ -277,6 +289,7 @@ void PGemmCN<T, Device>::multiply_col(const T alpha, const T* A, const T* B, con
{
syncmem_h2d_op()(C, Cglobal_cpu, size_C_global);
}
#endif
}
else
{
2 changes: 2 additions & 0 deletions source/module_base/parallel_device.cpp
@@ -99,6 +99,7 @@ void gatherv_data(const std::complex<float>* sendbuf, int sendcount, std::comple
MPI_Allgatherv(sendbuf, sendcount, MPI_COMPLEX, recvbuf, recvcounts, displs, MPI_COMPLEX, comm);
}

#ifndef __CUDA_MPI
template <typename T>
struct object_cpu_point<T, base_device::DEVICE_GPU>
{
@@ -171,6 +172,7 @@ template struct object_cpu_point<float, base_device::DEVICE_CPU>;
template struct object_cpu_point<float, base_device::DEVICE_GPU>;
template struct object_cpu_point<std::complex<float>, base_device::DEVICE_CPU>;
template struct object_cpu_point<std::complex<float>, base_device::DEVICE_GPU>;
#endif

} // namespace Parallel_Common
#endif
22 changes: 22 additions & 0 deletions source/module_base/parallel_device.h
@@ -32,6 +32,7 @@ void gatherv_data(const std::complex<double>* sendbuf, int sendcount, std::compl
void gatherv_data(const float* sendbuf, int sendcount, float* recvbuf, const int* recvcounts, const int* displs, MPI_Comm& comm);
void gatherv_data(const std::complex<float>* sendbuf, int sendcount, std::complex<float>* recvbuf, const int* recvcounts, const int* displs, MPI_Comm& comm);

#ifndef __CUDA_MPI
template<typename T, typename Device>
struct object_cpu_point
{
@@ -41,6 +42,7 @@ struct object_cpu_point
void sync_d2h(T* object_cpu, const T* object, const int& n);
void sync_h2d(T* object, const T* object_cpu, const int& n);
};
#endif

/**
* @brief send data in Device
@@ -49,11 +51,15 @@ struct object_cpu_point
template <typename T, typename Device>
void send_dev(const T* object, int count, int dest, int tag, MPI_Comm& comm, T* tmp_space = nullptr)
{
#ifdef __CUDA_MPI
send_data(object, count, dest, tag, comm);
#else
object_cpu_point<T,Device> o;
T* object_cpu = o.get(object, count, tmp_space);
o.sync_d2h(object_cpu, object, count);
send_data(object_cpu, count, dest, tag, comm);
o.del(object_cpu);
#endif
return;
}

@@ -65,11 +71,15 @@ void send_dev(const T* object, int count, int dest, int tag, MPI_Comm& comm, T*
template <typename T, typename Device>
void isend_dev(const T* object, int count, int dest, int tag, MPI_Comm& comm, MPI_Request* request, T* send_space)
{
#ifdef __CUDA_MPI
isend_data(object, count, dest, tag, comm, request);
#else
object_cpu_point<T,Device> o;
T* object_cpu = o.get(object, count, send_space);
o.sync_d2h(object_cpu, object, count);
isend_data(object_cpu, count, dest, tag, comm, request);
o.del(object_cpu);
#endif
return;
}

@@ -80,11 +90,15 @@ void isend_dev(const T* object, int count, int dest, int tag, MPI_Comm& comm, MP
template <typename T, typename Device>
void recv_dev(T* object, int count, int source, int tag, MPI_Comm& comm, MPI_Status* status, T* tmp_space = nullptr)
{
#ifdef __CUDA_MPI
recv_data(object, count, source, tag, comm, status);
#else
object_cpu_point<T,Device> o;
T* object_cpu = o.get(object, count, tmp_space);
recv_data(object_cpu, count, source, tag, comm, status);
o.sync_h2d(object, object_cpu, count);
o.del(object_cpu);
#endif
return;
}

@@ -102,24 +116,32 @@ void recv_dev(T* object, int count, int source, int tag, MPI_Comm& comm, MPI_Sta
template <typename T, typename Device>
void bcast_dev(T* object, const int& n, const MPI_Comm& comm, T* tmp_space = nullptr)
{
#ifdef __CUDA_MPI
bcast_data(object, n, comm);
#else
object_cpu_point<T,Device> o;
T* object_cpu = o.get(object, n, tmp_space);
o.sync_d2h(object_cpu, object, n);
bcast_data(object_cpu, n, comm);
o.sync_h2d(object, object_cpu, n);
o.del(object_cpu);
#endif
return;
}

template <typename T, typename Device>
void reduce_dev(T* object, const int& n, const MPI_Comm& comm, T* tmp_space = nullptr)
{
#ifdef __CUDA_MPI
reduce_data(object, n, comm);
#else
object_cpu_point<T,Device> o;
T* object_cpu = o.get(object, n, tmp_space);
o.sync_d2h(object_cpu, object, n);
reduce_data(object_cpu, n, comm);
o.sync_h2d(object, object_cpu, n);
o.del(object_cpu);
#endif
return;
}
}
5 changes: 5 additions & 0 deletions source/module_hsolver/para_linear_transform.cpp
@@ -1,4 +1,5 @@
#include "para_linear_transform.h"
#include "module_base/timer.h"

#include <algorithm>
#include <vector>
@@ -54,6 +55,7 @@ void PLinearTransform<T, Device>::set_dimension(const int nrowA,
template <typename T, typename Device>
void PLinearTransform<T, Device>::act(const T alpha, const T* A, const T* U, const T beta, T* B)
{
ModuleBase::timer::tick("PLinearTransform", "act");
const Device* ctx = {};
#ifdef __MPI
if (nproc_col > 1)
@@ -65,7 +67,9 @@ void PLinearTransform<T, Device>::act(const T alpha, const T* A, const T* U, con
if (std::is_same<Device, base_device::DEVICE_GPU>::value)
{
A_tmp_device = nullptr;
#ifndef __CUDA_MPI
isend_tmp.resize(max_colA * LDA);
#endif
resmem_dev_op()(A_tmp_device, max_colA * LDA);
}
T* B_tmp = nullptr;
@@ -168,6 +172,7 @@ void PLinearTransform<T, Device>::act(const T alpha, const T* A, const T* U, con
B,
LDA);
}
ModuleBase::timer::tick("PLinearTransform", "act");
};

template struct PLinearTransform<double, base_device::DEVICE_CPU>;
