Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Release 3.7.01 #1617

Merged
merged 12 commits into from
Dec 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,22 @@
# Change Log

## [3.7.01](https://github.com/kokkos/kokkos-kernels/tree/3.7.01) (2022-12-01)
[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.7.00...3.7.01)

### Bug Fixes:

- Change template type for StaticCrsGraph in BsrMatrix [\#1531](https://github.com/kokkos/kokkos-kernels/pull/1531)
- Remove listing of undefined TPL deps [\#1568](https://github.com/kokkos/kokkos-kernels/pull/1568)
- Fix using SpGEMM with nonstandard scalar type, with MKL enabled [\#1591](https://github.com/kokkos/kokkos-kernels/pull/1591)
- Move destroying dense vector descriptors out of cuSparse sptrsv handle [\#1590](https://github.com/kokkos/kokkos-kernels/pull/1590)
- Fix `cuda_data_type_from` to return `CUDA_C_64F` for `Kokkos::complex<double>` [\#1604](https://github.com/kokkos/kokkos-kernels/pull/1604)
- Disable compile-time check in cuda_data_type_from on supported scalar types for cuSPARSE [\#1605](https://github.com/kokkos/kokkos-kernels/pull/1605)
- Reduce register pressure in batched dense algorithms [\#1588](https://github.com/kokkos/kokkos-kernels/pull/1588)

### Implemented enhancements:

- Use new cusparseSpSV TPL for SPTRSV when cuSPARSE is enabled with CUDA >= 11.3 [\#1574](https://github.com/kokkos/kokkos-kernels/pull/1574)

## [3.7.00](https://github.com/kokkos/kokkos-kernels/tree/3.7.00) (2022-08-18)
[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.6.01...3.7.00)

Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS)
ENDIF()
SET(KokkosKernels_VERSION_MAJOR 3)
SET(KokkosKernels_VERSION_MINOR 7)
SET(KokkosKernels_VERSION_PATCH 00)
SET(KokkosKernels_VERSION_PATCH 01)
SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}")
MATH(EXPR KOKKOSKERNELS_VERSION "${KokkosKernels_VERSION_MAJOR} * 10000 + ${KokkosKernels_VERSION_MINOR} * 100 + ${KokkosKernels_VERSION_PATCH}")
ENDIF()
Expand Down
2 changes: 1 addition & 1 deletion cmake/Dependencies.cmake
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Declares the package dependencies for the TriBITS build.
# LIB_OPTIONAL_TPLS must be given exactly once; the earlier, longer list
# (MAGMA, LAPACKE, CBLAS, ARMPL, ROCBLAS, ROCSPARSE) named TPLs that are not
# defined and has been dropped.
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
  LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers KokkosAlgorithms
  LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE METIS SuperLU Cholmod CUBLAS
  TEST_OPTIONAL_TPLS yaml-cpp
)
# NOTE: If you update names in LIB_OPTIONAL_TPLS above, make sure to map those names in
Expand Down
1 change: 1 addition & 0 deletions master_history.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ tag: 3.5.00 date: 11/19/2021 master: 00189c0b release: f171533d
tag: 3.6.00 date: 04/06/2022 master: 8381db04 release: a7e683c4
tag: 3.6.01 date: 05/23/2022 master: e09389ae release: e1d8de42
tag: 3.7.00 date: 08/25/2022 master: 42ab7a29 release: 9cc88ffa
tag: 3.7.01 date: 12/01/2022 master: 04821ac3 release: 6cb632b6
59 changes: 47 additions & 12 deletions src/batched/dense/KokkosBatched_Gemm_Decl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,42 @@ template <class ArgTransA, class ArgTransB, class ArgBatchSzDim,
int tile_m, int tile_n, int tile_k>
class BatchedDblBufGemm;

//////////////////////////////// tile_m //////////////////////////////////
/// \brief Compile-time tile_m value that BatchedGemm uses when it selects the
///        double-buffering algorithm.
/// \tparam ExecutionSpace execution space the value is queried for; this
///         primary template returns the same value for every space.
/// \return 32
template <class ExecutionSpace>
KOKKOS_INLINE_FUNCTION constexpr int kk_gemm_dlb_buf_tile_m() {
  return 32;
}
//////////////////////////////// tile_n //////////////////////////////////
/// \brief Compile-time tile_n value that BatchedGemm uses when it selects the
///        double-buffering algorithm.
/// \tparam ExecutionSpace execution space the value is queried for; this
///         primary template returns the same value for every space.
/// \return 32
template <class ExecutionSpace>
KOKKOS_INLINE_FUNCTION constexpr int kk_gemm_dlb_buf_tile_n() {
  return 32;
}
//////////////////////////////// tile_k //////////////////////////////////
/// \brief Compile-time tile_k value that BatchedGemm uses when it selects the
///        double-buffering algorithm.
/// \tparam ExecutionSpace execution space the value is queried for; explicit
///         specializations may override this default per space.
/// \return 8
template <class ExecutionSpace>
KOKKOS_INLINE_FUNCTION constexpr int kk_gemm_dlb_buf_tile_k() {
  return 8;
}

// HIP override of tile_k for MI100 (KOKKOS_ARCH_VEGA908) builds.
// Without it, batched_scalar_batched_gemm_nt_nt_dcomplex_dcomplex_right fails
// on MI100; see https://github.com/kokkos/kokkos-kernels/issues/1547.
// Using 16 here halves the register allocations (REG_M and REG_N) made by the
// double buffering algorithm.
#if defined(KOKKOS_ARCH_VEGA908) && defined(KOKKOS_ENABLE_HIP)
template <>
KOKKOS_INLINE_FUNCTION constexpr int
kk_gemm_dlb_buf_tile_k<Kokkos::Experimental::HIP>() {
  return 16;
}
#endif  // KOKKOS_ENABLE_HIP && KOKKOS_ARCH_VEGA908
////////////////////////// alpha_in_fma_thresh ////////////////////////////
/// \brief Threshold that BatchedGemm compares c_m against to decide whether
///        alpha is applied inside the fma of the double-buffering algorithm.
/// \return 24 when __CUDACC_RDC__ is defined (relocatable device code
///         builds), otherwise 64.
KOKKOS_INLINE_FUNCTION constexpr size_t kk_gemm_dbl_buf_alpha_in_fma_thresh() {
#ifndef __CUDACC_RDC__
  return 64;
#else
  return 24;
#endif  // __CUDACC_RDC__
}

// clang-format off
/// \brief Blocking solve of general matrix multiply on a batch of uniform matrices.
///
Expand Down Expand Up @@ -458,19 +494,19 @@ int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha,
// Begin checking conditions for optimal BatchedGemm invocation.
using view_scalar_type = typename CViewType::value_type;
using layout_type = typename CViewType::array_layout;
using exec_space = typename CViewType::execution_space;
constexpr bool is_vector = KokkosBatched::is_vector<view_scalar_type>::value;
constexpr bool on_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space<
typename CViewType::execution_space>();
constexpr bool on_gpu =
KokkosKernels::Impl::kk_is_gpu_exec_space<exec_space>();
constexpr bool on_x86_64 = KokkosKernels::Impl::kk_is_x86_64_mem_space<
typename CViewType::execution_space::memory_space>();
typename exec_space::memory_space>();
constexpr bool on_a64fx = KokkosKernels::Impl::kk_is_a64fx_mem_space<
typename CViewType::execution_space::memory_space>();
typename exec_space::memory_space>();

if (handle->enableDebug) {
std::cout << "view_scalar_type:" << typeid(view_scalar_type).name()
<< std::endl
<< "execution_space:"
<< typeid(typename CViewType::execution_space).name() << std::endl
<< "execution_space:" << typeid(exec_space).name() << std::endl
<< std::endl
<< "is_vector:" << is_vector << std::endl
<< "on_gpu:" << on_gpu << std::endl
Expand Down Expand Up @@ -521,12 +557,11 @@ int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha,
? (c_m >= 16)
: (c_m >= 24 && c_m <= 32) || c_m >= 40)) {
handle->teamSz = handle->vecLen = 8;
constexpr int tile_m = 32, tile_n = 32, tile_k = 8;
#ifdef __CUDACC_RDC__
constexpr size_t alpha_in_fma_thresh = 24;
#else
constexpr size_t alpha_in_fma_thresh = 64;
#endif // __CUDAACC_RDC__
constexpr int tile_m = Impl::kk_gemm_dlb_buf_tile_m<exec_space>();
constexpr int tile_n = Impl::kk_gemm_dlb_buf_tile_n<exec_space>();
constexpr int tile_k = Impl::kk_gemm_dlb_buf_tile_k<exec_space>();
constexpr size_t alpha_in_fma_thresh =
Impl::kk_gemm_dbl_buf_alpha_in_fma_thresh();

if (c_m % 32 == 0) { // No bounds checking
if (c_m >= alpha_in_fma_thresh) { // apply alpha in fma
Expand Down
8 changes: 4 additions & 4 deletions src/sparse/KokkosSparse_BsrMatrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -390,12 +390,12 @@ class BsrMatrix {
typedef BsrMatrix<ScalarType, OrdinalType, host_mirror_space, MemoryTraits>
HostMirror;
//! Type of the graph structure of the sparse matrix.
typedef Kokkos::StaticCrsGraph<ordinal_type, Kokkos::LayoutLeft,
execution_space, memory_traits, size_type>
typedef Kokkos::StaticCrsGraph<ordinal_type, Kokkos::LayoutLeft, device_type,
memory_traits, size_type>
StaticCrsGraphType;
//! Type of the graph structure of the sparse matrix - consistent with Kokkos.
typedef Kokkos::StaticCrsGraph<ordinal_type, Kokkos::LayoutLeft,
execution_space, memory_traits, size_type>
typedef Kokkos::StaticCrsGraph<ordinal_type, Kokkos::LayoutLeft, device_type,
memory_traits, size_type>
staticcrsgraph_type;
//! Type of column indices in the sparse matrix.
typedef typename staticcrsgraph_type::entries_type index_type;
Expand Down
9 changes: 6 additions & 3 deletions src/sparse/KokkosSparse_Utils_cusparse.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,12 @@ inline void cusparse_internal_safe_call(cusparseStatus_t cusparseStatus,

template <typename T>
cudaDataType cuda_data_type_from() {
// Note: compile-time failure is disabled to allow for packages such as
// Ifpack2 to more easily support scalar types that cuSPARSE may not.

// compile-time failure with a nice message if called on an unsupported type
static_assert(!std::is_same<T, T>::value,
"cuSparse TPL does not support scalar type");
// static_assert(!std::is_same<T, T>::value,
// "cuSparse TPL does not support scalar type");
// static_assert(false, ...) is allowed to error even if the code is not
// instantiated. obfuscate the predicate Despite this function being
// uncompilable, the compiler may decide that a return statement is missing,
Expand Down Expand Up @@ -151,7 +154,7 @@ inline cudaDataType cuda_data_type_from<Kokkos::complex<float>>() {
}
/// Specialization of cuda_data_type_from for Kokkos::complex<double>.
/// \return CUDA_C_64F, the cudaDataType for double-precision complex values.
template <>
inline cudaDataType cuda_data_type_from<Kokkos::complex<double>>() {
  // CUDA_C_32F describes single-precision complex and is what the
  // Kokkos::complex<float> specialization reports; returning it here would
  // make cuSPARSE misinterpret double-precision complex data.
  return CUDA_C_64F;
}

#if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION)
Expand Down
18 changes: 11 additions & 7 deletions src/sparse/KokkosSparse_Utils_mkl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,24 +123,28 @@ template <typename value_type>
class MKLSparseMatrix {
sparse_matrix_t mtx;

static_assert(mkl_is_supported_value_type<value_type>::value,
"Scalar type used in MKLSparseMatrix<value_type> is NOT "
"supported by MKL");

public:
inline MKLSparseMatrix(sparse_matrix_t mtx_) : mtx(mtx_) {}

// Constructs MKL sparse matrix from KK sparse views (m rows x n cols)
inline MKLSparseMatrix(const MKL_INT num_rows, const MKL_INT num_cols,
MKL_INT *xadj, MKL_INT *adj, value_type *values);
MKL_INT *xadj, MKL_INT *adj, value_type *values) {
throw std::runtime_error(
"Scalar type used in MKLSparseMatrix<value_type> is NOT "
"supported by MKL");
}

// Allows using MKLSparseMatrix directly in MKL calls
inline operator sparse_matrix_t() const { return mtx; }

// Exports MKL sparse matrix contents into KK views
inline void export_data(MKL_INT &num_rows, MKL_INT &num_cols,
MKL_INT *&rows_start, MKL_INT *&columns,
value_type *&values);
value_type *&values) {
throw std::runtime_error(
"Scalar type used in MKLSparseMatrix<value_type> is NOT "
"supported by MKL");
}

inline void destroy() {
KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(mtx));
Expand Down Expand Up @@ -256,4 +260,4 @@ inline void MKLSparseMatrix<Kokkos::complex<double>>::export_data(

#endif // KOKKOSKERNELS_ENABLE_TPL_MKL

#endif // _KOKKOSKERNELS_SPARSEUTILS_MKL_HPP
#endif // _KOKKOSKERNELS_SPARSEUTILS_MKL_HPP
57 changes: 55 additions & 2 deletions src/sparse/KokkosSparse_sptrsv_handle.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
#define KOKKOSSPARSE_SPTRSVHANDLE_HPP

#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
#include "cusparse.h"
#include "KokkosSparse_Utils_cusparse.hpp"
#endif

#if defined(KOKKOS_ENABLE_CUDA) && 10000 < CUDA_VERSION && \
Expand Down Expand Up @@ -108,6 +108,8 @@ class SPTRSVHandle {
typedef typename nnz_row_view_t::HostMirror host_nnz_row_view_t;
typedef typename Kokkos::View<int *, HandlePersistentMemorySpace>
int_row_view_t;
typedef typename Kokkos::View<int64_t *, HandlePersistentMemorySpace>
int64_row_view_t;
// typedef typename row_lno_persistent_work_view_t::HostMirror
// row_lno_persistent_work_host_view_t; //Host view type
typedef typename Kokkos::View<
Expand Down Expand Up @@ -154,6 +156,42 @@ class SPTRSVHandle {
mtx_scalar_view_t;

#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
#if (CUDA_VERSION >= 11030)
struct cuSparseHandleType {
cusparseHandle_t handle;
cusparseOperation_t transpose;
cusparseSpMatDescr_t matDescr;
cusparseDnVecDescr_t vecBDescr, vecBDescr_dummy;
cusparseDnVecDescr_t vecXDescr, vecXDescr_dummy;
cusparseSpSVDescr_t spsvDescr;
void *pBuffer{nullptr};

cuSparseHandleType(bool transpose_, bool is_lower) {
KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreate(&handle));

KOKKOS_CUSPARSE_SAFE_CALL(
cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST));

if (transpose_) {
transpose = CUSPARSE_OPERATION_TRANSPOSE;
} else {
transpose = CUSPARSE_OPERATION_NON_TRANSPOSE;
}

KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_createDescr(&spsvDescr));
}

~cuSparseHandleType() {
if (pBuffer != nullptr) {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(pBuffer));
pBuffer = nullptr;
}
KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroySpMat(matDescr));
KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_destroyDescr(spsvDescr));
KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroy(handle));
}
};
#else // CUDA_VERSION < 11030
struct cuSparseHandleType {
cusparseHandle_t handle;
cusparseOperation_t transpose;
Expand Down Expand Up @@ -202,6 +240,7 @@ class SPTRSVHandle {
cusparseDestroy(handle);
}
};
#endif

typedef cuSparseHandleType SPTRSVcuSparseHandleType;
#endif
Expand Down Expand Up @@ -337,6 +376,7 @@ class SPTRSVHandle {
#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
SPTRSVcuSparseHandleType *cuSPARSEHandle;
int_row_view_t tmp_int_rowmap;
int64_row_view_t tmp_int64_rowmap;
#endif

#ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV
Expand Down Expand Up @@ -443,7 +483,8 @@ class SPTRSVHandle {
#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
,
cuSPARSEHandle(nullptr),
tmp_int_rowmap()
tmp_int_rowmap(),
tmp_int64_rowmap()
#endif
#ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV
,
Expand Down Expand Up @@ -851,6 +892,18 @@ class SPTRSVHandle {
}
// Returns the handle's temporary int row-map view (tmp_int_rowmap) by value.
int_row_view_t get_int_rowmap_view() {
  return tmp_int_rowmap;
}
// Returns the raw data pointer of the temporary int row-map view.
int *get_int_rowmap_ptr() {
  return tmp_int_rowmap.data();
}

// Replaces tmp_int64_rowmap with a newly allocated view of N entries, labeled
// "tmp_int64_rowmap". The entries are left uninitialized
// (Kokkos::WithoutInitializing).
void allocate_tmp_int64_rowmap(size_type N) {
  const auto alloc_props =
      Kokkos::view_alloc(Kokkos::WithoutInitializing, "tmp_int64_rowmap");
  tmp_int64_rowmap = int64_row_view_t(alloc_props, N);
}
// Copies `rowmap` into the handle's temporary int64 row-map view and returns
// that view's raw data pointer.
// NOTE(review): tmp_int64_rowmap is only sized by allocate_tmp_int64_rowmap;
// presumably callers must allocate it with a matching extent before calling
// this — confirm.
template <typename RowViewType>
int64_t *get_int64_rowmap_ptr_copy(const RowViewType &rowmap) {
Kokkos::deep_copy(tmp_int64_rowmap, rowmap);
// Fence before handing out the raw pointer so the copy above has completed.
Kokkos::fence();
return tmp_int64_rowmap.data();
}
// Returns the raw data pointer of the temporary int64 row-map view; no copy
// or fence is performed.
int64_t *get_int64_rowmap_ptr() {
  return tmp_int64_rowmap.data();
}
#endif

bool algm_requires_symb_lvlsched() const {
Expand Down
Loading