From c7c49b94fe9479ade5d988f7b2777083725da8f0 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 31 Jul 2020 12:58:31 +0200 Subject: [PATCH] add support for cusparse generic SpGEMM --- cuda/base/cusparse_bindings.hpp | 74 +++++++++++++++++- cuda/matrix/csr_kernels.cu | 129 +++++++++++++++++++++++++++----- 2 files changed, 183 insertions(+), 20 deletions(-) diff --git a/cuda/base/cusparse_bindings.hpp b/cuda/base/cusparse_bindings.hpp index 9ecf9030236..af1b7759aad 100644 --- a/cuda/base/cusparse_bindings.hpp +++ b/cuda/base/cusparse_bindings.hpp @@ -379,7 +379,10 @@ GKO_BIND_CUSPARSE32_SPMV(ValueType, detail::not_implemented); #undef GKO_BIND_CUSPARSE32_SPMV -#endif +#endif // CUDA_VERSION < 11000 + + +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) template @@ -505,6 +508,75 @@ GKO_BIND_CUSPARSE_SPGEMM(std::complex, cusparseZcsrgemm2); #undef GKO_BIND_CUSPARSE_SPGEMM +#else // CUDA_VERSION >= 11000 + + +template +void spgemm_work_estimation(cusparseHandle_t handle, const ValueType *alpha, + cusparseSpMatDescr_t a_descr, + cusparseSpMatDescr_t b_descr, const ValueType *beta, + cusparseSpMatDescr_t c_descr, + cusparseSpGEMMDescr_t spgemm_descr, + size_t &buffer1_size, void *buffer1) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpGEMM_workEstimation( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, a_descr, b_descr, beta, + c_descr, cuda_data_type(), CUSPARSE_SPGEMM_DEFAULT, + spgemm_descr, &buffer1_size, buffer1)); +} + + +template +void spgemm_compute(cusparseHandle_t handle, const ValueType *alpha, + cusparseSpMatDescr_t a_descr, cusparseSpMatDescr_t b_descr, + const ValueType *beta, cusparseSpMatDescr_t c_descr, + cusparseSpGEMMDescr_t spgemm_descr, void *buffer1, + size_t &buffer2_size, void *buffer2) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpGEMM_compute( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, a_descr, b_descr, beta, + c_descr, cuda_data_type(), CUSPARSE_SPGEMM_DEFAULT, + spgemm_descr, &buffer2_size, buffer2)); +} + + +template +void spgemm_copy(cusparseHandle_t handle, const ValueType *alpha, + cusparseSpMatDescr_t a_descr, cusparseSpMatDescr_t b_descr, + const ValueType *beta, cusparseSpMatDescr_t c_descr, + cusparseSpGEMMDescr_t spgemm_descr) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseSpGEMM_copy(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, a_descr, + b_descr, beta, c_descr, cuda_data_type(), + CUSPARSE_SPGEMM_DEFAULT, spgemm_descr)); +} + + +inline size_type sparse_matrix_nnz(cusparseSpMatDescr_t descr) +{ + int64_t dummy1{}; + int64_t dummy2{}; + int64_t nnz{}; + cusparseSpMatGetSize(descr, &dummy1, &dummy2, &nnz); + return static_cast(nnz); +} + + +template +void csr_set_pointers(cusparseSpMatDescr_t descr, IndexType *row_ptrs, + IndexType *col_idxs, ValueType *vals) +{ + cusparseCsrSetPointers(descr, row_ptrs, col_idxs, vals); +} + + +#endif // CUDA_VERSION >= 11000 + + #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index 3d193538f7b..2be25eea83c 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -496,32 +496,26 @@ void spgemm(std::shared_ptr exec, if (cusparse::is_supported::value) { auto handle = exec->get_cusparse_handle(); cusparse::pointer_mode_guard pm_guard(handle); - auto a_descr = cusparse::create_mat_descr(); - auto b_descr = cusparse::create_mat_descr(); - auto c_descr = cusparse::create_mat_descr(); - auto d_descr = cusparse::create_mat_descr(); - auto info = cusparse::create_spgemm_info(); auto alpha = one(); - auto a_nnz = IndexType(a->get_num_stored_elements()); - auto a_vals = a->get_const_values(); - auto a_row_ptrs = a->get_const_row_ptrs(); - auto a_col_idxs = a->get_const_col_idxs(); - auto b_nnz = IndexType(b->get_num_stored_elements()); - auto b_vals = b->get_const_values(); - auto b_row_ptrs = b->get_const_row_ptrs(); - auto b_col_idxs = b->get_const_col_idxs(); + auto a_nnz = static_cast(a->get_num_stored_elements()); + auto b_nnz = static_cast(b->get_num_stored_elements()); auto null_value = static_cast(nullptr); auto null_index = static_cast(nullptr); auto zero_nnz = IndexType{}; auto m = IndexType(a->get_size()[0]); auto n = IndexType(b->get_size()[1]); auto k = IndexType(a->get_size()[1]); - auto c_row_ptrs = c->get_row_ptrs(); matrix::CsrBuilder c_builder{c}; auto &c_col_idxs_array = c_builder.get_col_idx_array(); auto &c_vals_array = c_builder.get_value_array(); +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + auto a_descr = cusparse::create_mat_descr(); + auto b_descr = cusparse::create_mat_descr(); + auto c_descr = cusparse::create_mat_descr(); + auto d_descr = cusparse::create_mat_descr(); + auto info = cusparse::create_spgemm_info(); // allocate buffer size_type buffer_size{}; cusparse::spgemm_buffer_size( @@ -554,6 +548,54 @@ void spgemm(std::shared_ptr exec, cusparse::destroy(c_descr); cusparse::destroy(b_descr); cusparse::destroy(a_descr); + +#else // CUDA_VERSION >= 11000 + const auto beta = zero(); + auto spgemm_descr = cusparse::create_spgemm_descr(); + auto a_descr = cusparse::create_csr(m, k, a_nnz, + const_cast(a_row_ptrs), + const_cast(a_col_idxs), + const_cast(a_vals)); + auto b_descr = cusparse::create_csr(k, n, b_nnz, + const_cast(b_row_ptrs), + const_cast(b_col_idxs), + const_cast(b_vals)); + auto c_descr = cusparse::create_csr(m, n, zero_nnz, null_index, + null_index, null_value); + + size_type buffer1_size{}; + cusparse::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, + &beta, c_descr, spgemm_descr, + buffer1_size, nullptr); + Array buffer1{exec, buffer1_size}; + cusparse::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, + &beta, c_descr, spgemm_descr, + buffer1_size, buffer1.get_data()); + + size_type buffer2_size{}; + cusparse::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, + c_descr, spgemm_descr, buffer1.get_data(), + buffer2_size, nullptr); + Array buffer2{exec, buffer2_size}; + cusparse::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, + c_descr, spgemm_descr, buffer1.get_data(), + buffer2_size, buffer2.get_data()); + + auto c_nnz = cusparse::sparse_matrix_nnz(c_descr); + c_col_idxs_array.resize_and_reset(c_nnz); + c_vals_array.resize_and_reset(c_nnz); + cusparse::csr_set_pointers(c_descr, c_row_ptrs, + c_col_idxs_array.get_data(), + c_vals_array.get_data()); + + cusparse::spgemm_copy(handle, &alpha, a_descr, b_descr, &beta, c_descr, + spgemm_descr); + + cusparse::destroy(c_descr); + cusparse::destroy(b_descr); + cusparse::destroy(a_descr); + cusparse::destroy(spgemm_descr); +#endif // CUDA_VERSION >= 11000 } else { GKO_NOT_IMPLEMENTED; } @@ -574,11 +616,6 @@ void advanced_spgemm(std::shared_ptr exec, if (cusparse::is_supported::value) { auto handle = exec->get_cusparse_handle(); cusparse::pointer_mode_guard pm_guard(handle); - auto a_descr = cusparse::create_mat_descr(); - auto b_descr = cusparse::create_mat_descr(); - auto c_descr = cusparse::create_mat_descr(); - auto d_descr = cusparse::create_mat_descr(); - auto info = cusparse::create_spgemm_info(); auto valpha = exec->copy_val_to_host(alpha->get_const_values()); auto a_nnz = IndexType(a->get_num_stored_elements()); @@ -602,6 +639,12 @@ void advanced_spgemm(std::shared_ptr exec, auto &c_col_idxs_array = c_builder.get_col_idx_array(); auto &c_vals_array = c_builder.get_value_array(); +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + auto a_descr = cusparse::create_mat_descr(); + auto b_descr = cusparse::create_mat_descr(); + auto c_descr = cusparse::create_mat_descr(); + auto d_descr = cusparse::create_mat_descr(); + auto info = cusparse::create_spgemm_info(); // allocate buffer size_type buffer_size{}; cusparse::spgemm_buffer_size( @@ -634,6 +677,54 @@ void advanced_spgemm(std::shared_ptr exec, cusparse::destroy(c_descr); cusparse::destroy(b_descr); cusparse::destroy(a_descr); +#else // CUDA_VERSION >= 11000 + auto spgemm_descr = cusparse::create_spgemm_descr(); + auto a_descr = cusparse::create_csr(m, k, a_nnz, + const_cast(a_row_ptrs), + const_cast(a_col_idxs), + const_cast(a_vals)); + auto b_descr = cusparse::create_csr(k, n, b_nnz, + const_cast(b_row_ptrs), + const_cast(b_col_idxs), + const_cast(b_vals)); + auto c_descr = cusparse::create_csr(m, n, d_nnz, + const_cast(d_row_ptrs), + const_cast(d_col_idxs), + const_cast(d_vals)); + + size_type buffer1_size{}; + cusparse::spgemm_work_estimation(handle, &valpha, a_descr, b_descr, + &vbeta, c_descr, spgemm_descr, + buffer1_size, nullptr); + Array buffer1{exec, buffer1_size}; + cusparse::spgemm_work_estimation(handle, &valpha, a_descr, b_descr, + &vbeta, c_descr, spgemm_descr, + buffer1_size, buffer1.get_data()); + + size_type buffer2_size{}; + cusparse::spgemm_compute(handle, &valpha, a_descr, b_descr, &vbeta, + c_descr, spgemm_descr, buffer1.get_data(), + buffer2_size, nullptr); + Array buffer2{exec, buffer2_size}; + cusparse::spgemm_compute(handle, &valpha, a_descr, b_descr, &vbeta, + c_descr, spgemm_descr, buffer1.get_data(), + buffer2_size, buffer2.get_data()); + + auto c_nnz = cusparse::sparse_matrix_nnz(c_descr); + c_col_idxs_array.resize_and_reset(c_nnz); + c_vals_array.resize_and_reset(c_nnz); + cusparse::csr_set_pointers(c_descr, c_row_ptrs, + c_col_idxs_array.get_data(), + c_vals_array.get_data()); + + cusparse::spgemm_copy(handle, &valpha, a_descr, b_descr, &vbeta, + c_descr, spgemm_descr); + + cusparse::destroy(c_descr); + cusparse::destroy(b_descr); + cusparse::destroy(a_descr); + cusparse::destroy(spgemm_descr); +#endif // CUDA_VERSION >= 11000 } else { GKO_NOT_IMPLEMENTED; }