Skip to content

Commit

Permalink
Improve performance of the native BsrMatrix SpMV, especially for sing…
Browse files Browse the repository at this point in the history
…le-vector cases.

* Adds a new `"v4.2"` BsrMatrix SpMV implementation for non-transpose mode.
  * It is the default (when TPLs are disabled or not supported) on the GPU for non-transpose mode
  * The old implementation is retained for all other modes
  * old implementation may be requested explicitly with `controls.setParameter("algorithm", "v4.1")`
* Adds explicit invocation of the old `"v4.1"` implementation to `KokkosKernels_sparse_spmv_bsr_benchmark`
* When TPLs are enabled, the new implementation may be requested anyway with `controls.setParameter("algorithm", "v4.2")`
* simplify `KokkosKernels::Impl::always_false_v`
* Add `template <typename View> class with_unmanaged` which provides a `type` alias reproducing `View` with `Kokkos::Unmanaged` added to its memory traits
* Add `KokkosKernels::Impl::with_unmanaged_t` as an alias for `typename with_unmanaged<View>::type`
* Add `template <typename View> auto KokkosKernels::Impl::make_unmanaged(const View &v)` which constructs a `with_unmanaged_t<View>` from v
* Add `<sstream>` to `KokkosKernels_Error.hpp`
* Add `DieOnError` and `SkipOnError` wrapped `bool`s to give names to boolean function arguments
* Link `KokkosKernels_sparse_spmv_bsr_benchmark` against `stdc++fs` for rocm 5.2
* More aggressive block size filtering in `KokkosSparse_crs_detect_block_size.hpp`
* Removes a useless warning from `Controls::getParameter` since what happens when a parameter is unset was made explicit in be87154
* `BsrMatrix` constructor throws when the combination of nnz, rows, and columns doesn't make sense
* Change `BsrMatrix::block_layout` to `BsrMatrix::block_layout_type` for consistency
* Adds `BsrMatrix::unmanaged_block` to return an unmanaged view to a 2D block of values
* Adds `BsrMatrix::unmanaged_block_const` to return a const unmanaged view to a 2D block of values
  • Loading branch information
cwpearson committed Jul 24, 2023
1 parent 1b96d94 commit 86d8371
Show file tree
Hide file tree
Showing 15 changed files with 477 additions and 108 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,12 @@
#ifndef KOKKOSKERNELS_ALWAYSFALSE_HPP
#define KOKKOSKERNELS_ALWAYSFALSE_HPP

#include <type_traits>
namespace KokkosKernels::Impl {

/*! \file KokkosKernels_AlwaysFalse.hpp
\brief A convenience type to be used in a static_assert that should always
fail
*/
// for use in static asserts
template <typename...>
inline constexpr bool always_false_v = false;

namespace KokkosKernels {
namespace Impl {

template <typename T>
using always_false = std::false_type;

template <typename T>
inline constexpr bool always_false_v = always_false<T>::value;

} // namespace Impl
} // namespace KokkosKernels
} // namespace KokkosKernels::Impl

#endif // KOKKOSKERNELS_ALWAYSFALSE_HPP
59 changes: 59 additions & 0 deletions common/impl/KokkosKernels_ViewUtils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 4.0
// Copyright (2022) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER

#ifndef KOKKOSKERNELS_VIEWUTILS_HPP
#define KOKKOSKERNELS_VIEWUTILS_HPP
#include "Kokkos_Core.hpp"

namespace KokkosKernels::Impl {

/*! \brief Type trait mapping a Kokkos::View type to its unmanaged counterpart

    The nested \c type is a Kokkos::View with the same data type, array layout
    and memory space as \c View. Its memory traits are the memory traits of
    \c View with the Kokkos::Unmanaged flag OR-ed in, so any traits already
    present on \c View are kept.

    Note that only \c View::memory_space is forwarded to the new view type
    (not \c View::device_type).

    \tparam View the view type to derive the unmanaged view type from
*/
template <typename View>
class with_unmanaged {
  // memory-trait bits of View, plus the Unmanaged bit
  static constexpr unsigned unmanaged_traits =
      View::memory_traits::impl_value | Kokkos::Unmanaged;

 public:
  using type =
      Kokkos::View<typename View::data_type, typename View::array_layout,
                   typename View::memory_space,
                   Kokkos::MemoryTraits<unmanaged_traits>>;
};

/*! \brief Convenience alias for \c with_unmanaged<View>::type, i.e. the type
    that is View with Kokkos::Unmanaged added to the memory traits
\tparam View the type to add Kokkos::Unmanaged to
*/
template <typename View>
using with_unmanaged_t = typename with_unmanaged<View>::type;

/*! \brief Builds an unmanaged view from an existing view

    Constructs an object of type \c with_unmanaged<View>::type from \c v and
    returns it by value.

    \tparam View the type of the input view v
    \param v the view to construct the unmanaged view from
    \return a \c with_unmanaged<View>::type constructed from \c v
*/
template <typename View>
auto make_unmanaged(const View &v) {
  using unmanaged_view_type = typename with_unmanaged<View>::type;
  return unmanaged_view_type(v);
}

} // namespace KokkosKernels::Impl

#endif
1 change: 1 addition & 0 deletions common/src/KokkosKernels_Error.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#define KOKKOSKERNELS_ERROR_HPP

#include <stdexcept>
#include <sstream>

namespace KokkosKernels {
namespace Impl {
Expand Down
45 changes: 45 additions & 0 deletions perf_test/Benchmark_Utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 4.0
// Copyright (2022) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER
*/

#ifndef KOKKOSKERNELS_PERFTEST_BENCHMARK_UTILS_HPP
#define KOKKOSKERNELS_PERFTEST_BENCHMARK_UTILS_HPP

namespace KokkosKernelsBenchmark {

/*! \brief Common base for "a bool by a different name" wrappers

    Derived classes give a distinct type (and therefore a readable name) to a
    boolean function argument, so call sites read e.g. `DieOnError(false)`
    instead of a bare `false`.

    Construction from `bool` and conversion back to `bool` are intentionally
    implicit so a wrapper can be used anywhere a `bool` was used before.
*/
class WrappedBool {
 public:
  /*! \brief Wrap a boolean value
      \param val the value to store
  */
  constexpr WrappedBool(bool val) noexcept : val_(val) {}

  /*! \brief Retrieve the wrapped value */
  constexpr operator bool() const noexcept { return val_; }

 protected:
  bool val_;  // the wrapped value
};

/*! \brief Named bool for a "die on error" function argument
    NOTE(review): presumably selects whether a failed check aborts the
    benchmark; confirm against the functions that accept it.
*/
class DieOnError : public WrappedBool {
 public:
  constexpr DieOnError(bool val) noexcept : WrappedBool(val) {}
};

/*! \brief Named bool for a "skip on error" function argument
    NOTE(review): presumably selects whether a failed check marks the
    benchmark as skipped; confirm against the functions that accept it.
*/
class SkipOnError : public WrappedBool {
 public:
  constexpr SkipOnError(bool val) noexcept : WrappedBool(val) {}
};

}  // namespace KokkosKernelsBenchmark

#endif // KOKKOSKERNELS_PERFTEST_BENCHMARK_UTILS_HPP
6 changes: 6 additions & 0 deletions perf_test/sparse/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -139,4 +139,10 @@ if (KokkosKernels_ENABLE_BENCHMARK)
KOKKOSKERNELS_ADD_BENCHMARK(
sparse_spmv_bsr_benchmark SOURCES KokkosSparse_spmv_bsr_benchmark.cpp
)

# hipcc 5.2 has an underlying clang that has the std::filesystem
# in an experimental namespace and a different library
if (Kokkos_CXX_COMPILER_ID STREQUAL HIPCC AND Kokkos_CXX_COMPILER_VERSION VERSION_LESS 5.3)
target_link_libraries(KokkosKernels_sparse_spmv_bsr_benchmark PRIVATE -lstdc++fs)
endif()
endif()
90 changes: 53 additions & 37 deletions perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ namespace fs = std::filesystem;

#include <Kokkos_ArithTraits.hpp>

#include "Benchmark_Utils.hpp"
#include "KokkosSparse_CrsMatrix.hpp"
#include "KokkosSparse_IOUtils.hpp"
#include "KokkosSparse_spmv.hpp"
Expand Down Expand Up @@ -121,27 +122,6 @@ size_t detect_block_size(const fs::path &path) {
return cache.at(path);
}

// a bool by a different name, to make its purpose clear
class DieOnError {
public:
DieOnError(const bool &val) : val_(val) {}

operator bool() const { return val_; }

private:
bool val_;
};

// a bool by a different name, to make its purpose clear
class SkipOnError {
public:
SkipOnError(const bool &val) : val_(val) {}
operator bool() const { return val_; }

private:
bool val_;
};

// Test that y_act is close to y_exp.
// This needs the matrix, alpha, and beta to compute the error tolerance
// properly
Expand Down Expand Up @@ -235,6 +215,20 @@ struct SpmvNative {
static std::string name() { return "native"; }
};

// Wrapper to create a common interface for all SpMVs to benchmark
struct SpmvV41 {
template <typename Alpha, typename Matrix, typename XView, typename Beta,
typename YView>
static void spmv(const char *mode, const Alpha &alpha, const Matrix &crs,
const XView &x, const Beta &beta, const YView &y) {
KokkosKernels::Experimental::Controls controls;
controls.setParameter("algorithm", "v4.1");
return KokkosSparse::spmv(controls, mode, alpha, crs, x, beta, y);
}

static std::string name() { return "v4.1"; }
};

template <typename Spmv, typename Bsr>
void run(benchmark::State &state, const Bsr &bsr, const size_t k) {
using execution_space = typename Bsr::execution_space;
Expand Down Expand Up @@ -269,10 +263,10 @@ void run(benchmark::State &state, const Bsr &bsr, const size_t k) {
const char *mode = KokkosSparse::NoTranspose;

// test the SpMV against whatever the default is
KokkosSparse::spmv(mode, alpha, bsr, x, beta, y_exp);
Kokkos::fence();
Spmv::spmv(mode, alpha, bsr, x, beta, y_act);
Kokkos::fence();
KokkosSparse::spmv(mode, alpha, bsr, x, beta, y_exp);
Kokkos::fence();

check_correctness(state, y_exp, y_act, bsr, alpha, beta, DieOnError(false),
SkipOnError(true));
Expand All @@ -299,7 +293,6 @@ void run(benchmark::State &state, const Bsr &bsr, const size_t k) {
template <typename Bsr, typename Spmv>
void read_expand_run(benchmark::State &state, const fs::path &path,
const size_t blockSize, const size_t k) {
using device_type = typename Bsr::device_type;
using scalar_type = typename Bsr::non_const_value_type;
using ordinal_type = typename Bsr::non_const_ordinal_type;

Expand All @@ -322,7 +315,6 @@ void read_expand_run(benchmark::State &state, const fs::path &path,
template <typename Bsr, typename Spmv>
void read_convert_run(benchmark::State &state, const fs::path &path,
const size_t blockSize, const size_t k) {
using device_type = typename Bsr::device_type;
using scalar_type = typename Bsr::non_const_value_type;
using ordinal_type = typename Bsr::non_const_ordinal_type;

Expand Down Expand Up @@ -386,27 +378,53 @@ template <typename Device>
void register_converts(const fs::path &path, const size_t bs) {
std::cerr << "benchmarks will use detected blocksize\n";
// clang-format off
register_convert_type<int, float, unsigned, Device, SpmvDefault>(path, bs);
register_convert_type<int, float, unsigned, Device, SpmvNative>(path, bs);
register_convert_type<int, float, int, Device, SpmvDefault>(path, bs);
register_convert_type<int, float, int, Device, SpmvNative>(path, bs);
register_convert_type<int64_t, double, size_t, Device, SpmvDefault>(path, bs);
register_convert_type<int64_t, double, size_t, Device, SpmvNative>(path, bs);
register_convert_type<int64_t, double, int64_t, Device, SpmvDefault>(path, bs);
register_convert_type<int64_t, double, int64_t, Device, SpmvNative>(path, bs);
register_convert_type<int, float, int, Device, SpmvDefault>(path, bs);
register_convert_type<int, float, int, Device, SpmvNative>(path, bs);
register_convert_type<int, float, int, Device, SpmvV41>(path, bs);

register_convert_type<int, double, int, Device, SpmvDefault>(path, bs);
register_convert_type<int, double, int, Device, SpmvNative>(path, bs);
register_convert_type<int, double, int, Device, SpmvV41>(path, bs);

register_convert_type<int, float, unsigned, Device, SpmvDefault>(path, bs);
register_convert_type<int, float, unsigned, Device, SpmvNative>(path, bs);
register_convert_type<int, float, unsigned, Device, SpmvV41>(path, bs);

register_convert_type<int64_t, double, size_t, Device, SpmvDefault>(path, bs);
register_convert_type<int64_t, double, size_t, Device, SpmvNative>(path, bs);
register_convert_type<int64_t, double, size_t, Device, SpmvV41>(path, bs);

register_convert_type<int64_t, double, int64_t, Device, SpmvDefault>(path, bs);
register_convert_type<int64_t, double, int64_t, Device, SpmvNative>(path, bs);
register_convert_type<int64_t, double, int64_t, Device, SpmvV41>(path, bs);

// clang-format on
}

template <typename Device>
void register_expands(const fs::path &path) {
register_expand_type<int, float, unsigned, Device, SpmvDefault>(path);
register_expand_type<int, float, unsigned, Device, SpmvNative>(path);
std::cerr << "benchmarks will expand each non-zero into a larger block\n";
// clang-format off
register_expand_type<int, float, int, Device, SpmvDefault>(path);
register_expand_type<int, float, int, Device, SpmvNative>(path);
register_expand_type<int, float, int, Device, SpmvV41>(path);

register_expand_type<int, double, int, Device, SpmvDefault>(path);
register_expand_type<int, double, int, Device, SpmvNative>(path);
register_expand_type<int, double, int, Device, SpmvV41>(path);

register_expand_type<int, float, unsigned, Device, SpmvDefault>(path);
register_expand_type<int, float, unsigned, Device, SpmvNative>(path);
register_expand_type<int, float, unsigned, Device, SpmvV41>(path);

register_expand_type<int64_t, double, uint64_t, Device, SpmvDefault>(path);
register_expand_type<int64_t, double, uint64_t, Device, SpmvNative>(path);
register_expand_type<int64_t, double, uint64_t, Device, SpmvV41>(path);

register_expand_type<int64_t, double, int64_t, Device, SpmvDefault>(path);
register_expand_type<int64_t, double, int64_t, Device, SpmvNative>(path);
register_expand_type<int64_t, double, int64_t, Device, SpmvV41>(path);
// clang-format on
}

template <typename Device>
Expand All @@ -425,10 +443,8 @@ void register_path(const fs::path &path) {
Otherwise, expand the matrix to some arbitrary block sizes to test BSR
*/
if (detectedSize != 1) {
std::cerr << "benchmarks will use detected size\n";
register_converts<Device>(path, detectedSize);
} else {
std::cerr << "benchmarks will expand each non-zero into a larger block\n";
register_expands<Device>(path);
}
}
Expand Down
26 changes: 13 additions & 13 deletions sparse/impl/KokkosSparse_crs_detect_block_size.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@
for performance-sensitive use.
*/

namespace KokkosSparse {
namespace Impl {
namespace KokkosSparse::Impl {

/**
* \class BlockPopulations
Expand Down Expand Up @@ -86,14 +85,14 @@ class BlockPopulations {
* @return The largest block size that results in completely dense blocks
The smallest valid block size is 1
Since blocks must be dense, sqrt(nnz), num rows, num cols, and min nnz/row
among non-empty rows are all easy upper bounds of the block size Block sizes
are tested from 1 to the minimum of the above The matrix dimensions must divide
evenly into a trial block size (otherwise a block would not be full)
Furthermore, if a block size of N is not dense, any multiple of N will also not
be dense, and can be skipped. This is because blocks of 2N contain blocks of N,
at least one of which is already known not to be dense. In practice, this ends
up testing only small composite factors and all prime factors up to the upper
bound
among non-empty rows are all easy upper bounds of the block size.
Block sizes are tested from 1 to the minimum of the above.
The matrix dimensions must divide evenly into a trial block size (otherwise a
block would not be full). Furthermore, if a block size of N is not dense, any
multiple of N will also not be dense, and can be skipped. This is because
blocks of 2N contain blocks of N, at least one of which is already known not to
be dense. In practice, this ends up testing only small composite factors and
all prime factors up to the upper bound.
*/
template <typename Crs>
size_t detect_block_size(const Crs &crs) {
Expand Down Expand Up @@ -124,12 +123,14 @@ size_t detect_block_size(const Crs &crs) {
for (size_t trialSize = 2; trialSize <= upperBound; ++trialSize) {
// trial size must be factor of rows / cols
if ((crs.numRows() % trialSize) || (crs.numCols() % trialSize)) {
rejectedSizes.push_back(trialSize);
continue;
}

// trial size must not be a multiple of previously-rejected size
if (std::any_of(rejectedSizes.begin(), rejectedSizes.end(),
[&](size_t f) { return trialSize % f == 0; })) {
rejectedSizes.push_back(trialSize);
continue;
}

Expand All @@ -152,7 +153,6 @@ size_t detect_block_size(const Crs &crs) {
return largestBlockSize;
}

} // namespace Impl
} // namespace KokkosSparse
} // namespace KokkosSparse::Impl

#endif // KOKKOSSPARSE_CRS_DETECT_BLOCK_SIZE_HPP
#endif // KOKKOSSPARSE_CRS_DETECT_BLOCK_SIZE_HPP
16 changes: 15 additions & 1 deletion sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,21 @@ Bsr expand_crs_to_bsr(const Crs &crs, size_t blockSize) {
using crs_row_map_type = typename Crs::row_map_type;
using bsr_row_map_type =
Kokkos::View<typename Bsr::row_map_type::non_const_data_type,
bsr_device_type>;
bsr_device_type>; // need non-const version

using bsr_size_type = typename Bsr::non_const_size_type;

{
size_t nnz = crs.nnz() * blockSize * blockSize;
if (nnz > size_t(Kokkos::ArithTraits<bsr_size_type>::max())) {
std::stringstream ss;
ss << "expanding " << crs.nnz()
<< " non-zeros of CrsMatrix into blocks of " << blockSize
<< " would overflow size_type of requested BsrMatrix "
<< Kokkos::ArithTraits<bsr_size_type>::name();
throw std::runtime_error(ss.str());
}
}

// construct the Bsr row map
bsr_row_map_type bsrRowMap("bsrRowMap", crs.graph.row_map.size());
Expand Down
Loading

0 comments on commit 86d8371

Please sign in to comment.