From 15c73d2f8c05012e5f5fa6becca9b139c25a6cd7 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Tue, 8 Aug 2023 16:20:07 -0500 Subject: [PATCH 1/2] Capture all non-zero info in getrf batched. --- src/QMCWaveFunctions/detail/CUDA/cuBLAS_LU.cu | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/QMCWaveFunctions/detail/CUDA/cuBLAS_LU.cu b/src/QMCWaveFunctions/detail/CUDA/cuBLAS_LU.cu index 6ac999ca95..af7b3032c1 100644 --- a/src/QMCWaveFunctions/detail/CUDA/cuBLAS_LU.cu +++ b/src/QMCWaveFunctions/detail/CUDA/cuBLAS_LU.cu @@ -158,14 +158,14 @@ void computeGetrf_batched(cublasHandle_t& h_cublas, "cudaMemcpyAsync failed copying cuBLAS::getrf_batched infos from device"); cudaErrorCheck(cudaStreamSynchronize(hstream), "cudaStreamSynchronize failed!"); - for (int iw = 0; iw < batch_size; ++iw) + if (std::all_of(host_infos, host_infos + batch_size, [](int i) { return i != 0; })) { - if (*(host_infos + iw) != 0) - { - std::ostringstream err_msg; - err_msg << "cuBLAS::getrf_batched failed with return code " << *(host_infos + iw); - throw std::runtime_error(err_msg.str()); - } + std::ostringstream err_msg; + err_msg << "cuBLAS::getrf_batched failed! Non-zero infos:" << std::endl; + for (int iw = 0; iw < batch_size; ++iw) + if (*(host_infos + iw) != 0) + err_msg << "infos[" << iw << "] = " << *(host_infos + iw) << std::endl; + throw std::runtime_error(err_msg.str()); } } From 95c3701df73928a0c94587bbebff790ba604ff7e Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Tue, 8 Aug 2023 17:05:07 -0500 Subject: [PATCH 2/2] Connect computeGetri_batched --- src/QMCWaveFunctions/detail/CUDA/cuBLAS_LU.cu | 49 +++++++++++++++--- .../detail/CUDA/cuBLAS_LU.hpp | 51 ++++++++++--------- src/QMCWaveFunctions/tests/test_cuBLAS_LU.cpp | 2 +- 3 files changed, 70 insertions(+), 32 deletions(-) diff --git a/src/QMCWaveFunctions/detail/CUDA/cuBLAS_LU.cu b/src/QMCWaveFunctions/detail/CUDA/cuBLAS_LU.cu index af7b3032c1..2b31866af1 100644 --- a/src/QMCWaveFunctions/detail/CUDA/cuBLAS_LU.cu +++ b/src/QMCWaveFunctions/detail/CUDA/cuBLAS_LU.cu @@ -10,6 +10,7 @@ ////////////////////////////////////////////////////////////////////////////////////// #include "cuBLAS_LU.hpp" +#include #include "Platforms/CUDA/CUDAruntime.hpp" #include "Platforms/CUDA/cuBLAS.hpp" #include "Platforms/CUDA/CUDATypeMapping.hpp" @@ -158,7 +159,7 @@ void computeGetrf_batched(cublasHandle_t& h_cublas, "cudaMemcpyAsync failed copying cuBLAS::getrf_batched infos from device"); cudaErrorCheck(cudaStreamSynchronize(hstream), "cudaStreamSynchronize failed!"); - if (std::all_of(host_infos, host_infos + batch_size, [](int i) { return i != 0; })) + if (std::any_of(host_infos, host_infos + batch_size, [](int i) { return i != 0; })) { std::ostringstream err_msg; err_msg << "cuBLAS::getrf_batched failed! Non-zero infos:" << std::endl; @@ -186,24 +187,37 @@ void computeInverseAndDetLog_batched(cublasHandle_t& h_cublas, computeGetrf_batched(h_cublas, hstream, n, lda, Ms, pivots, host_infos, infos, batch_size); cudaErrorCheck(computeLogDet_batched_impl(hstream, n, lda, Ms, pivots, log_dets, batch_size), "failed to calculate log determinant values in computeLogDet_batched_impl"); - cublasErrorCheck(cuBLAS::getri_batched(h_cublas, n, Ms, lda, pivots, Cs, lda, infos, batch_size), - "cuBLAS::getri_batched failed in computeInverseAndDetLog_batched"); - //FIXME replace getri_batched with computeGetri_batched and computeGetri_batched should sync and check infos - cudaErrorCheck(cudaStreamSynchronize(hstream), "cudaStreamSynchronize failed!"); + computeGetri_batched(h_cublas, hstream, n, lda, Ms, Cs, pivots, host_infos, infos, batch_size); } +template void computeGetri_batched(cublasHandle_t& h_cublas, + cudaStream_t& hstream, const int n, const int lda, - double* Ms[], - double* Cs[], + T* Ms[], + T* Cs[], int* pivots, + int* host_infos, int* infos, const int batch_size) { cublasErrorCheck(cuBLAS::getri_batched(h_cublas, n, Ms, lda, pivots, Cs, lda, infos, batch_size), "cuBLAS::getri_batched failed in computeInverseAndDetLog_batched"); + cudaErrorCheck(cudaMemcpyAsync(host_infos, infos, sizeof(int) * batch_size, cudaMemcpyDeviceToHost, hstream), + "cudaMemcpyAsync failed copying cuBLAS::getri_batched infos from device"); + cudaErrorCheck(cudaStreamSynchronize(hstream), "cudaStreamSynchronize failed!"); + + if (std::any_of(host_infos, host_infos + batch_size, [](int i) { return i != 0; })) + { + std::ostringstream err_msg; + err_msg << "cuBLAS::getri_batched failed! Non-zero infos:" << std::endl; + for (int iw = 0; iw < batch_size; ++iw) + if (*(host_infos + iw) != 0) + err_msg << "infos[" << iw << "] = " << *(host_infos + iw) << std::endl; + throw std::runtime_error(err_msg.str()); + } } template void computeGetrf_batched(cublasHandle_t& h_cublas, @@ -226,6 +240,27 @@ template void computeGetrf_batched>(cublasHandle_t& h_cubla int* infos, const int batch_size); +template void computeGetri_batched(cublasHandle_t& h_cublas, + cudaStream_t& hstream, + const int n, + const int lda, + double* Ms[], + double* Cs[], + int* pivots, + int* host_infos, + int* infos, + const int batch_size); + +template void computeGetri_batched>(cublasHandle_t& h_cublas, + cudaStream_t& hstream, + const int n, + const int lda, + std::complex* Ms[], + std::complex* Cs[], + int* pivots, + int* host_infos, + int* infos, + const int batch_size); template void computeLogDet_batched>(cudaStream_t& hstream, const int n, diff --git a/src/QMCWaveFunctions/detail/CUDA/cuBLAS_LU.hpp b/src/QMCWaveFunctions/detail/CUDA/cuBLAS_LU.hpp index c2157bd2fb..f22e13196d 100644 --- a/src/QMCWaveFunctions/detail/CUDA/cuBLAS_LU.hpp +++ b/src/QMCWaveFunctions/detail/CUDA/cuBLAS_LU.hpp @@ -81,40 +81,43 @@ void computeLogDet_batched(cudaStream_t& hstream, std::complex* logdets, const int batch_size); +template void computeGetri_batched(cublasHandle_t& h_cublas, + cudaStream_t& hstream, const int n, const int lda, - double* Ms[], - double* Cs[], + T* Ms[], + T* Cs[], int* pivots, + int* host_infos, int* infos, const int batch_size); extern template void computeInverseAndDetLog_batched(cublasHandle_t& h_cublas, - cudaStream_t& hstream, - const int n, - const int lda, - double* Ms[], - double* Cs[], - double* LU_diags, - int* pivots, - int* host_infos, - int* infos, - std::complex* log_dets, - const int batch_size); + cudaStream_t& hstream, + const int n, + const int lda, + double* Ms[], + double* Cs[], + double* LU_diags, + int* pivots, + int* host_infos, + int* infos, + std::complex* log_dets, + const int batch_size); extern template void computeInverseAndDetLog_batched>(cublasHandle_t& h_cublas, - cudaStream_t& hstream, - const int n, - const int lda, - std::complex* Ms[], - std::complex* Cs[], - std::complex* LU_diags, - int* pivots, - int* host_infos, - int* infos, - std::complex* log_dets, - const int batch_size); + cudaStream_t& hstream, + const int n, + const int lda, + std::complex* Ms[], + std::complex* Cs[], + std::complex* LU_diags, + int* pivots, + int* host_infos, + int* infos, + std::complex* log_dets, + const int batch_size); } // namespace cuBLAS_LU } // namespace qmcplusplus diff --git a/src/QMCWaveFunctions/tests/test_cuBLAS_LU.cpp b/src/QMCWaveFunctions/tests/test_cuBLAS_LU.cpp index 3f038b7611..f70bc2579d 100644 --- a/src/QMCWaveFunctions/tests/test_cuBLAS_LU.cpp +++ b/src/QMCWaveFunctions/tests/test_cuBLAS_LU.cpp @@ -495,7 +495,7 @@ TEST_CASE("cuBLAS_LU::getri_batched", "[wavefunction][CUDA]") "cudaMemcpyAsync failed copying invMs to device"); cudaErrorCheck(cudaMemcpyAsync(dev_pivots.data(), pivots.data(), sizeof(int) * 4, cudaMemcpyHostToDevice, hstream), "cudaMemcpyAsync failed copying pivots to device"); - cuBLAS_LU::computeGetri_batched(cuda_handles->h_cublas, n, lda, devMs.data(), invMs.data(), dev_pivots.data(), dev_infos.data(), batch_size); + cuBLAS_LU::computeGetri_batched(cuda_handles->h_cublas, cuda_handles->hstream, n, lda, devMs.data(), invMs.data(), dev_pivots.data(), infos.data(), dev_infos.data(), batch_size); cudaErrorCheck(cudaMemcpyAsync(invM_vec.data(), dev_invM_vec.data(), sizeof(double) * 16, cudaMemcpyDeviceToHost, hstream), "cudaMemcpyAsync failed copying invM from device");