Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cuda10.1 generic api: CSR and COO SpMV #468

Merged
merged 7 commits into from
Mar 3, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
219 changes: 218 additions & 1 deletion benchmark/utils/cuda_linops.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <ginkgo/ginkgo.hpp>


#include <cuda.h>
#include <cuda_runtime.h>
#include <cusparse.h>
#include <memory>
Expand All @@ -45,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cuda/base/cusparse_bindings.hpp"
#include "cuda/base/device_guard.hpp"
#include "cuda/base/pointer_mode_guard.hpp"
#include "cuda/base/types.hpp"


namespace detail {
Expand All @@ -54,7 +56,12 @@ class CuspBase : public gko::LinOp {
public:
// Returns the cuSPARSE matrix descriptor handle obtained from `descr_.get()`.
cusparseMatDescr_t get_descr() const { return this->descr_.get(); }

const gko::CudaExecutor *get_gpu_exec() const { return gpu_exec_.get(); }
// Returns the CUDA executor as a shared pointer rather than a plain pointer,
// so that cusp_generic_spmv can pass it on to gko::Array when allocating its
// temporary buffer.
std::shared_ptr<const gko::CudaExecutor> get_gpu_exec() const
{
return gpu_exec_;
}

protected:
void apply_impl(const gko::LinOp *, const gko::LinOp *, const gko::LinOp *,
Expand Down Expand Up @@ -475,6 +482,204 @@ class CuspHybrid
};


#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)


/**
 * Runs one sparse matrix-vector product through the cuSPARSE generic API
 * (cusparseSpMV) on the device owned by `gpu_exec`.
 *
 * Both `b` and `x` must be gko::matrix::Dense<ValueType>; gko::as reports a
 * mismatch. Each of them is wrapped as a dense cuSPARSE vector whose length is
 * its number of stored elements.
 *
 * @tparam ValueType  value type of the matrix and of both vectors
 *
 * @param gpu_exec  executor providing the device id, the cuSPARSE handle and
 *                  the allocator for the temporary work buffer
 * @param mat  generic sparse matrix descriptor
 * @param scalars  array whose element 0 is passed as alpha and element 1 as
 *                 beta; the element addresses are handed to cuSPARSE
 *                 unchanged
 *                 (NOTE(review): the array presumably has to live where the
 *                 handle's pointer mode expects it - confirm)
 * @param b  input vector
 * @param x  output vector
 * @param trans  operation applied to `mat`
 * @param alg  SpMV algorithm selector
 */
template <typename ValueType>
void cusp_generic_spmv(std::shared_ptr<const gko::CudaExecutor> gpu_exec,
                       const cusparseSpMatDescr_t mat,
                       const gko::Array<ValueType> &scalars,
                       const gko::LinOp *b, gko::LinOp *x,
                       cusparseOperation_t trans, cusparseSpMVAlg_t alg)
{
    using gko::kernels::cuda::as_culibs_type;

    // Scope guard for one dense-vector descriptor. The checked cuSPARSE calls
    // below may throw; without the guard an exception would leak every
    // descriptor created up to that point.
    struct dn_vec_guard {
        cusparseDnVecDescr_t descr;

        dn_vec_guard() : descr(nullptr) {}

        dn_vec_guard(const dn_vec_guard &) = delete;

        dn_vec_guard &operator=(const dn_vec_guard &) = delete;

        ~dn_vec_guard()
        {
            if (descr != nullptr) {
                // Best effort only: a destructor must not throw.
                cusparseDestroyDnVec(descr);
            }
        }
    };

    const cudaDataType_t cu_value =
        gko::kernels::cuda::cuda_data_type<ValueType>();
    const auto dense_b = gko::as<gko::matrix::Dense<ValueType>>(b);
    const auto dense_x = gko::as<gko::matrix::Dense<ValueType>>(x);
    const auto db = dense_b->get_const_values();
    const auto dx = dense_x->get_values();
    const auto alpha = &scalars.get_const_data()[0];
    const auto beta = &scalars.get_const_data()[1];
    const auto handle = gpu_exec->get_cusparse_handle();

    // Declared before the descriptor guards so that the descriptors are
    // released while the device is still selected.
    gko::cuda::device_guard g{gpu_exec->get_device_id()};

    dn_vec_guard vecx;
    dn_vec_guard vecb;
    GKO_ASSERT_NO_CUSPARSE_ERRORS(
        cusparseCreateDnVec(&vecx.descr, dense_x->get_num_stored_elements(),
                            as_culibs_type(dx), cu_value));
    // cusparseCreateDnVec only allows non-const pointer
    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateDnVec(
        &vecb.descr, dense_b->get_num_stored_elements(),
        as_culibs_type(const_cast<ValueType *>(db)), cu_value));

    // Query the workspace size first, then allocate it through the executor.
    size_t buffer_size = 0;
    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpMV_bufferSize(
        handle, trans, alpha, mat, vecb.descr, beta, vecx.descr, cu_value, alg,
        &buffer_size));
    gko::Array<char> buffer_array(gpu_exec, buffer_size);
    GKO_ASSERT_NO_CUSPARSE_ERRORS(
        cusparseSpMV(handle, trans, alpha, mat, vecb.descr, beta, vecx.descr,
                     cu_value, alg, buffer_array.get_data()));

    // On the success path the descriptors are destroyed explicitly so that a
    // failing destroy is still reported; each guard is disarmed first to rule
    // out a second destroy of the same handle.
    const auto raw_x = vecx.descr;
    vecx.descr = nullptr;
    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(raw_x));
    const auto raw_b = vecb.descr;
    vecb.descr = nullptr;
    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(raw_b));
}


/**
 * Benchmark wrapper that performs SpMV on a CSR matrix through the cuSPARSE
 * generic API (cusparseCreateCsr + cusparseSpMV).
 *
 * The matrix data is kept in a gko::matrix::Csr object; `mat_` is a cuSPARSE
 * descriptor built from that object's arrays and is (re)built by read().
 *
 * @tparam ValueType  value type of the matrix entries
 * @tparam IndexType  index type of the matrix
 * @tparam Alg  cuSPARSE SpMV algorithm passed on every apply
 */
template <typename ValueType = gko::default_precision,
          typename IndexType = gko::int32,
          cusparseSpMVAlg_t Alg = CUSPARSE_MV_ALG_DEFAULT>
class CuspGenericCsr
    : public gko::EnableLinOp<CuspGenericCsr<ValueType, IndexType, Alg>,
                              CuspBase>,
      public gko::EnableCreateMethod<CuspGenericCsr<ValueType, IndexType, Alg>>,
      public gko::ReadableFromMatrixData<ValueType, IndexType> {
    friend class gko::EnableCreateMethod<CuspGenericCsr>;
    friend class gko::EnablePolymorphicObject<CuspGenericCsr, CuspBase>;

public:
    using csr = gko::matrix::Csr<ValueType, IndexType>;
    using mat_data = gko::matrix_data<ValueType, IndexType>;
    // cuSPARSE enum values matching IndexType and ValueType.
    cusparseIndexType_t cu_index =
        gko::kernels::cuda::cusparse_index_type<IndexType>();
    cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type<ValueType>();

    /**
     * Loads `data` into the internal CSR matrix and builds the cuSPARSE
     * descriptor referring to its arrays.
     *
     * A descriptor left over from an earlier read() is destroyed first;
     * otherwise it would leak and keep referring to the previous arrays.
     *
     * @param data  the matrix in matrix_data form
     */
    void read(const mat_data &data) override
    {
        using gko::kernels::cuda::as_culibs_type;
        this->destroy_descriptor();
        csr_->read(data);
        const auto size = csr_->get_size();
        this->set_size(gko::dim<2>{size});
        GKO_ASSERT_NO_CUSPARSE_ERRORS(
            cusparseCreateCsr(&mat_, size[0], size[1],
                              csr_->get_num_stored_elements(),
                              as_culibs_type(csr_->get_row_ptrs()),
                              as_culibs_type(csr_->get_col_idxs()),
                              as_culibs_type(csr_->get_values()), cu_index,
                              cu_index, CUSPARSE_INDEX_BASE_ZERO, cu_value));
    }

    /** @return the number of stored elements of the internal CSR matrix */
    gko::size_type get_num_stored_elements() const noexcept
    {
        return csr_->get_num_stored_elements();
    }

    /**
     * Destroys the descriptor if one was created. Errors are only logged
     * because a destructor must not throw.
     */
    ~CuspGenericCsr() override
    {
        try {
            this->destroy_descriptor();
        } catch (const std::exception &e) {
            std::cerr << "Error when unallocating CuspGenericCsr mat_ matrix: "
                      << e.what() << std::endl;
        }
    }

    CuspGenericCsr(const CuspGenericCsr &other) = delete;

    // NOTE(review): the defaulted assignment copies the raw `mat_` handle, so
    // assigning from an object that already called read() makes two objects
    // destroy the same descriptor. It is kept defaulted because the base-class
    // assignment semantics are not visible here; presumably the polymorphic
    // object helpers need it to exist - confirm before changing.
    CuspGenericCsr &operator=(const CuspGenericCsr &other) = default;

protected:
    /**
     * Applies the operator to `b`, writing the result to `x`, via
     * cusp_generic_spmv with the stored scalars and algorithm `Alg`.
     */
    void apply_impl(const gko::LinOp *b, gko::LinOp *x) const override
    {
        cusp_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, x, trans_,
                          Alg);
    }

    /**
     * Creates an operator without a descriptor; read() has to be called
     * before the operator can be applied.
     *
     * @param exec  executor on which the internal CSR matrix is created
     * @param size  size reported by the operator until read() replaces it
     */
    CuspGenericCsr(std::shared_ptr<const gko::Executor> exec,
                   const gko::dim<2> &size = gko::dim<2>{})
        : gko::EnableLinOp<CuspGenericCsr, CuspBase>(exec, size),
          csr_(csr::create(exec, std::make_shared<typename csr::classical>())),
          trans_(CUSPARSE_OPERATION_NON_TRANSPOSE)
    {}

private:
    // Destroys the current descriptor, if any, and resets the handle so that
    // it can never be destroyed twice by this object. Throws on cuSPARSE
    // errors.
    void destroy_descriptor()
    {
        if (mat_ == nullptr) {
            return;
        }
        gko::cuda::device_guard g{this->get_gpu_exec()->get_device_id()};
        const auto stale = mat_;
        mat_ = nullptr;
        GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySpMat(stale));
    }

    // Contains {alpha, beta}
    gko::Array<ValueType> scalars{
        this->get_executor(), {gko::one<ValueType>(), gko::zero<ValueType>()}};
    std::shared_ptr<csr> csr_;
    cusparseOperation_t trans_;
    // Null until read() succeeds, so that the destructor of an object that
    // never read a matrix does not hand an indeterminate handle to cuSPARSE.
    cusparseSpMatDescr_t mat_{nullptr};
};


/**
 * Benchmark wrapper that performs SpMV on a COO matrix through the cuSPARSE
 * generic API (cusparseCreateCoo + cusparseSpMV with
 * CUSPARSE_MV_ALG_DEFAULT).
 *
 * The matrix data is kept in a gko::matrix::Coo object; `mat_` is a cuSPARSE
 * descriptor built from that object's arrays and is (re)built by read().
 *
 * @tparam ValueType  value type of the matrix entries
 * @tparam IndexType  index type of the matrix
 */
template <typename ValueType = gko::default_precision,
          typename IndexType = gko::int32>
class CuspGenericCoo
    : public gko::EnableLinOp<CuspGenericCoo<ValueType, IndexType>, CuspBase>,
      public gko::EnableCreateMethod<CuspGenericCoo<ValueType, IndexType>>,
      public gko::ReadableFromMatrixData<ValueType, IndexType> {
    friend class gko::EnableCreateMethod<CuspGenericCoo>;
    friend class gko::EnablePolymorphicObject<CuspGenericCoo, CuspBase>;

public:
    using coo = gko::matrix::Coo<ValueType, IndexType>;
    using mat_data = gko::matrix_data<ValueType, IndexType>;
    // cuSPARSE enum values matching IndexType and ValueType.
    cusparseIndexType_t cu_index =
        gko::kernels::cuda::cusparse_index_type<IndexType>();
    cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type<ValueType>();

    /**
     * Loads `data` into the internal COO matrix and builds the cuSPARSE
     * descriptor referring to its arrays.
     *
     * A descriptor left over from an earlier read() is destroyed first;
     * otherwise it would leak and keep referring to the previous arrays.
     *
     * @param data  the matrix in matrix_data form
     */
    void read(const mat_data &data) override
    {
        using gko::kernels::cuda::as_culibs_type;
        this->destroy_descriptor();
        coo_->read(data);
        const auto size = coo_->get_size();
        this->set_size(gko::dim<2>{size});
        GKO_ASSERT_NO_CUSPARSE_ERRORS(
            cusparseCreateCoo(&mat_, size[0], size[1],
                              coo_->get_num_stored_elements(),
                              as_culibs_type(coo_->get_row_idxs()),
                              as_culibs_type(coo_->get_col_idxs()),
                              as_culibs_type(coo_->get_values()), cu_index,
                              CUSPARSE_INDEX_BASE_ZERO, cu_value));
    }

    /** @return the number of stored elements of the internal COO matrix */
    gko::size_type get_num_stored_elements() const noexcept
    {
        return coo_->get_num_stored_elements();
    }

    /**
     * Destroys the descriptor if one was created. Errors are only logged
     * because a destructor must not throw.
     */
    ~CuspGenericCoo() override
    {
        try {
            this->destroy_descriptor();
        } catch (const std::exception &e) {
            std::cerr << "Error when unallocating CuspGenericCoo mat_ matrix: "
                      << e.what() << std::endl;
        }
    }

    CuspGenericCoo(const CuspGenericCoo &other) = delete;

    // NOTE(review): the defaulted assignment copies the raw `mat_` handle, so
    // assigning from an object that already called read() makes two objects
    // destroy the same descriptor. It is kept defaulted because the base-class
    // assignment semantics are not visible here; presumably the polymorphic
    // object helpers need it to exist - confirm before changing.
    CuspGenericCoo &operator=(const CuspGenericCoo &other) = default;

protected:
    /**
     * Applies the operator to `b`, writing the result to `x`, via
     * cusp_generic_spmv with the stored scalars and the default algorithm.
     */
    void apply_impl(const gko::LinOp *b, gko::LinOp *x) const override
    {
        cusp_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, x, trans_,
                          CUSPARSE_MV_ALG_DEFAULT);
    }

    /**
     * Creates an operator without a descriptor; read() has to be called
     * before the operator can be applied.
     *
     * @param exec  executor on which the internal COO matrix is created
     * @param size  size reported by the operator until read() replaces it
     */
    CuspGenericCoo(std::shared_ptr<const gko::Executor> exec,
                   const gko::dim<2> &size = gko::dim<2>{})
        : gko::EnableLinOp<CuspGenericCoo, CuspBase>(exec, size),
          coo_(coo::create(exec)),
          trans_(CUSPARSE_OPERATION_NON_TRANSPOSE)
    {}

private:
    // Destroys the current descriptor, if any, and resets the handle so that
    // it can never be destroyed twice by this object. Throws on cuSPARSE
    // errors.
    void destroy_descriptor()
    {
        if (mat_ == nullptr) {
            return;
        }
        gko::cuda::device_guard g{this->get_gpu_exec()->get_device_id()};
        const auto stale = mat_;
        mat_ = nullptr;
        GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySpMat(stale));
    }

    // Contains {alpha, beta}
    gko::Array<ValueType> scalars{
        this->get_executor(), {gko::one<ValueType>(), gko::zero<ValueType>()}};
    std::shared_ptr<coo> coo_;
    cusparseOperation_t trans_;
    // Null until read() succeeds, so that the destructor of an object that
    // never read a matrix does not hand an indeterminate handle to cuSPARSE.
    cusparseSpMatDescr_t mat_{nullptr};
};


#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)


} // namespace detail


Expand All @@ -485,6 +690,18 @@ using cusp_csrmp = detail::CuspCsrmp<>;
using cusp_csrmm = detail::CuspCsrmm<>;


#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)


// Generic-API CSR SpMV with the default template arguments, i.e. the
// CUSPARSE_MV_ALG_DEFAULT algorithm.
using cusp_gcsr = detail::CuspGenericCsr<>;
// Generic-API CSR SpMV for double / int32 using CUSPARSE_CSRMV_ALG2.
using cusp_gcsr2 =
detail::CuspGenericCsr<double, gko::int32, CUSPARSE_CSRMV_ALG2>;
// Generic-API COO SpMV with the default template arguments.
using cusp_gcoo = detail::CuspGenericCoo<>;


#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)


using cusp_coo =
detail::CuspHybrid<double, gko::int32, CUSPARSE_HYB_PARTITION_USER, 0>;
using cusp_ell =
Expand Down
17 changes: 17 additions & 0 deletions benchmark/utils/formats.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,18 @@ std::string format_description =
"cusp_csrex: benchmark CuSPARSE with the cusparseXcsrmvEx function.\n"
"cusp_csrmp: benchmark CuSPARSE with the cusparseXcsrmv_mp function.\n"
"cusp_csrmm: benchmark CuSPARSE with the cusparseXcsrmv_mm function."
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)
"\n"
"cusp_gcsr: benchmark CuSPARSE with the generic csr with default "
"algorithm.\n"
"cusp_gcsr2: benchmark CuSPARSE with the generic csr with "
"CUSPARSE_CSRMV_ALG2.\n"
"cusp_gcoo: benchmark CuSPARSE with the generic coo with default "
"algorithm.\n"
#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)
#endif // HAS_CUDA
#ifdef HAS_HIP
"\n"
"hipsp_csr: benchmark HipSPARSE with the hipsparseXcsrmv function.\n"
"hipsp_csrmm: benchmark HipSPARSE with the hipsparseXcsrmv_mm function.\n"
"hipsp_hybrid: benchmark HipSPARSE spmv with hipsparseXhybmv and an "
Expand Down Expand Up @@ -163,6 +173,7 @@ std::unique_ptr<MatrixType> read_matrix_from_data(
}


// clang-format off
const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
std::shared_ptr<const gko::Executor>,
const gko::matrix_data<> &)>>
Expand All @@ -181,6 +192,11 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
{"cusp_hybrid", read_matrix_from_data<cusp_hybrid>},
{"cusp_coo", read_matrix_from_data<cusp_coo>},
{"cusp_ell", read_matrix_from_data<cusp_ell>},
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)
{"cusp_gcsr", read_matrix_from_data<cusp_gcsr>},
{"cusp_gcsr2", read_matrix_from_data<cusp_gcsr2>},
{"cusp_gcoo", read_matrix_from_data<cusp_gcoo>},
#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)
#endif // HAS_CUDA
#ifdef HAS_HIP
{"hipsp_csr", read_matrix_from_data<hipsp_csr>},
Expand Down Expand Up @@ -216,6 +232,7 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
READ_MATRIX(hybrid,
std::make_shared<hybrid::minimal_storage_limit>())},
{"sellp", read_matrix_from_data<gko::matrix::Sellp<>>}};
// clang-format on


} // namespace formats
Expand Down
2 changes: 1 addition & 1 deletion common/components/atomic.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int);
GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int);


#if !(defined(CUDA_VERSION) && (CUDA_VERSION < 10100))
#if !(defined(CUDA_VERSION) && (CUDA_VERSION < 10010))
// CUDA 10.1 starts supporting 16-bit unsigned short int atomicCAS
GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short int);
#endif
Expand Down
47 changes: 47 additions & 0 deletions cuda/base/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include <cublas_v2.h>
#include <cusparse.h>
#include <thrust/complex.h>


Expand Down Expand Up @@ -190,6 +191,31 @@ constexpr cudaDataType_t cuda_data_type_impl<uint8>()
}


#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)


// Maps an index type T to the matching cusparseIndexType_t enumerator.
// Fallback: every type without a dedicated specialization below yields
// CUSPARSE_INDEX_16U.
template <typename T>
constexpr cusparseIndexType_t cusparse_index_type_impl()
{
return CUSPARSE_INDEX_16U;
}

// int32 indices map to CUSPARSE_INDEX_32I.
template <>
constexpr cusparseIndexType_t cusparse_index_type_impl<int32>()
{
return CUSPARSE_INDEX_32I;
}

// int64 indices map to CUSPARSE_INDEX_64I.
template <>
constexpr cusparseIndexType_t cusparse_index_type_impl<int64>()
{
return CUSPARSE_INDEX_64I;
}


#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)


} // namespace detail


Expand All @@ -208,6 +234,27 @@ constexpr cudaDataType_t cuda_data_type()
}


#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)


/**
 * Returns the `cusparseIndexType_t` equivalent of the index type `T` by
 * forwarding to `detail::cusparse_index_type_impl<T>()`. By default,
 * CUSPARSE_INDEX_16U is returned.
 *
 * @tparam T  a type
 *
 * @returns the actual `cusparseIndexType_t`
 */
template <typename T>
constexpr cusparseIndexType_t cusparse_index_type()
{
return detail::cusparse_index_type_impl<T>();
}


#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)


/**
* This is an alias for CUDA's equivalent of `T`.
*
Expand Down