Add accelerator support for POTRF #7

Merged
CMakeLists.txt (11 additions, 0 deletions)
@@ -54,6 +54,7 @@ option(TTG_PARSEC_USE_BOOST_SERIALIZATION "Whether to select Boost serialization
option(TTG_ENABLE_CUDA "Whether TTG will look for CUDA" OFF)
option(TTG_ENABLE_HIP "Whether TTG will look for HIP" OFF)
option(TTG_EXAMPLES "Whether to build examples" OFF)
option(TTG_ENABLE_ASAN "Whether to enable address sanitizer" OFF)

option(TTG_FETCH_BOOST "Whether to fetch+build Boost, if missing" OFF)
option(TTG_IGNORE_BUNDLED_EXTERNALS "Whether to skip installation and use of bundled external dependencies (Boost.CallableTraits)" OFF)
@@ -68,6 +69,11 @@ if (FORCE_COLORED_OUTPUT)
endif ()
endif (FORCE_COLORED_OUTPUT)

if (TTG_ENABLE_ASAN)
add_compile_options(-fsanitize=address)
add_link_options(-fsanitize=address)
endif (TTG_ENABLE_ASAN)

set(TTG_HIP_PLATFORM "__HIP_PLATFORM_AMD__" CACHE STRING "Which platform to use when compiling HIP-related code (default: __HIP_PLATFORM_AMD__)")
##########################
#### prerequisites
@@ -120,6 +126,11 @@ if (TTG_ENABLE_HIP)
if (TARGET roc::hipblas)
set(TTG_HAVE_HIPBLAS True CACHE BOOL "TTG detected support for hipBLAS")
endif()

find_package(hipsolver)
if (TARGET roc::hipsolver)
set(TTG_HAVE_HIPSOLVER True CACHE BOOL "TTG detected support for hipSOLVER")
endif()
add_compile_definitions(${TTG_HIP_PLATFORM})
endif(TTG_ENABLE_HIP)

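The hipSOLVER detection above follows the existing hipBLAS pattern: find the package, then record a TTG_HAVE_HIPSOLVER cache variable when the imported target exists. A minimal sketch of the consuming side, mirroring the #ifdef TTG_HAVE_* guards that devblas_helper.h below uses; the exact macro plumbing through ttg/config.h is an assumption here:

// Hypothetical consumer of the CMake detection above; the TTG_HAVE_*
// macros mirror the cache variables, as in devblas_helper.h below.
#include "ttg/config.h"

#if defined(TTG_HAVE_CUDART)
#include <cusolverDn.h>           // CUDA path: POTRF via cuSOLVER
#elif defined(TTG_HAVE_HIPSOLVER)
#include <hipsolver/hipsolver.h>  // HIP path: POTRF via hipSOLVER
#endif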
examples/CMakeLists.txt (12 additions, 0 deletions)
@@ -16,11 +16,23 @@ if (TARGET tiledarray)
LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS Boost::boost CUDA::cublas
COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2
RUNTIMES "parsec")

if (TARGET CUDA::cusolver)
add_ttg_executable(testing_dpotrf_cuda potrf/testing_dpotrf.cc
LINK_LIBRARIES lapackpp tiledarray CUDA::cublas CUDA::cusolver
COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2;TTG_ENABLE_CUDA=1
RUNTIMES "parsec")
endif(TARGET CUDA::cusolver)
elseif (TARGET roc::hipblas)
add_ttg_executable(bspmm-hip spmm/spmm_cuda.cc
LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS Boost::boost roc::hipblas
COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2
RUNTIMES "parsec")
if (TARGET roc::hipsolver)
add_ttg_executable(testing_dpotrf_hip potrf/testing_dpotrf.cc
LINK_LIBRARIES lapackpp tiledarray roc::hipblas roc::hipsolver
RUNTIMES "parsec")
endif(TARGET roc::hipsolver)
endif()

add_ttg_executable(chain-ttg-dev task-benchmarks/chain-ttg-dev.cc LINK_LIBRARIES tiledarray RUNTIMES "parsec")
examples/devblas_helper.h (new file, 128 additions)
@@ -0,0 +1,128 @@
#include "ttg/config.h"

#include <iostream>  // std::cerr/std::cout used by the handle helpers below
#include <memory>
#include <stdexcept>
#include <optional>
#include <map>

#ifdef TTG_HAVE_CUDART

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>

/// \brief Returns the cuBLAS handle to be used for launching cuBLAS kernels from the current thread
/// \return the cuBLAS handle for the current thread
template<typename T = int>
inline const cublasHandle_t& cublas_handle(T _ = 0) {
using map_type = std::map<std::pair<int, cudaStream_t>, cublasHandle_t>;
static thread_local map_type handles;

int device = ttg::device::current_device();
cudaStream_t stream = ttg::device::current_stream();

map_type::iterator it;
if ((it = handles.find({device, stream})) == handles.end()){
cublasHandle_t handle;
auto status = cublasCreate_v2(&handle);
if (CUBLAS_STATUS_SUCCESS != status) {
std::cerr << "cublasCreate_v2 failed: " << status << std::endl;
throw std::runtime_error("cublasCreate_v2 failed");
}
status = cublasSetStream_v2(handle, ttg::device::current_stream());
if (CUBLAS_STATUS_SUCCESS != status) {
std::cerr << "cublasSetStream_v2 failed: " << status << std::endl;
throw std::runtime_error("cublasSetStream_v2 failed");
}
auto [iterator, success] = handles.insert({{device, stream}, handle});
it = iterator;
}
return it->second;
}

template<typename T = int>
inline const cusolverDnHandle_t& cusolver_handle(T _ = 0) {

using map_type = std::map<std::pair<int, cudaStream_t>, cusolverDnHandle_t>;
static thread_local map_type handles;

int device = ttg::device::current_device();
cudaStream_t stream = ttg::device::current_stream();

map_type::iterator it;
if ((it = handles.find({device, stream})) == handles.end()){
cusolverDnHandle_t handle;
auto status = cusolverDnCreate(&handle);
if (CUSOLVER_STATUS_SUCCESS != status) {
std::cerr << "cusolverDnCreate failed: " << status << std::endl;
throw std::runtime_error("cusolverDnCreate failed");
}
status = cusolverDnSetStream(handle, stream);
if (CUSOLVER_STATUS_SUCCESS != status) {
std::cerr << "cusolverDnSetStream failed: " << status << std::endl;
throw std::runtime_error("cusolverDnSetStream failed");
}

std::cout << "Creating cusolver handle " << handle << " for device " << device << " stream " << stream << std::endl;
auto [iterator, success] = handles.insert({{device, stream}, handle});
it = iterator;
} else {
std::cout << "Found cusolver handle " << it->second << " for device " << device << " stream " << stream << std::endl;
}

return it->second;
}
#endif // TTG_HAVE_CUDART

#ifdef TTG_HAVE_HIPBLAS

#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hipsolver/hipsolver.h>

/// \brief Returns the hipBLAS handle to be used for launching hipBLAS kernels from the current thread
/// \return the hipBLAS handle for the current thread
template<typename T = int>
const hipblasHandle_t& hipblas_handle(T _ = 0) {
static thread_local std::map<int, hipblasHandle_t> handles;
int device = ttg::device::current_device();
std::map<int, hipblasHandle_t>::iterator it;
if ((it = handles.find(device)) == handles.end()){
hipblasHandle_t handle;
auto status = hipblasCreate(&handle);
if (HIPBLAS_STATUS_SUCCESS != status) {
throw std::runtime_error("hipblasCreate failed");
}
auto [iterator, success] = handles.insert({device, handle});
it = iterator;
}
hipblasStatus_t status = hipblasSetStream(it->second, ttg::device::current_stream());
if (HIPBLAS_STATUS_SUCCESS != status) {
throw std::runtime_error("hipblasSetStream failed");
}
return it->second;
}

/// \brief Returns the hipSOLVER handle to be used for launching hipSOLVER kernels from the current thread
/// \return the hipSOLVER handle for the current thread
template<typename T = int>
const hipsolverDnHandle_t& hipsolver_handle(T _ = 0) {
static thread_local std::map<int, hipsolverDnHandle_t> handles;
int device = ttg::device::current_device();
std::map<int, hipsolverDnHandle_t>::iterator it;
if ((it = handles.find(device)) == handles.end()){
hipsolverDnHandle_t handle;
auto status = hipsolverDnCreate(&handle);
if (HIPSOLVER_STATUS_SUCCESS != status) {
throw std::runtime_error("hipsolverCreate failed");
}
auto [iterator, success] = handles.insert({device, handle});
it = iterator;
}
hipsolverStatus_t status = hipsolverDnSetStream(it->second, ttg::device::current_stream());
if (HIPSOLVER_STATUS_SUCCESS != status) {
throw std::runtime_error("hipsolverSetStream failed");
}
return it->second;
}
#endif // TTG_HAVE_HIPBLAS
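How these helpers are meant to be used: a task obtains the handle for its current device/stream and issues solver calls on it. Below is a minimal sketch assuming the tile data and workspace have already been staged on the device; cusolverDnDpotrf_bufferSize and cusolverDnDpotrf are the stock cuSOLVER calls, while the function names and the surrounding task/tile plumbing (the PR's testing_dpotrf.cc is the real consumer) are illustrative:

// Sketch only: POTRF on the current device stream via the cached handle.
#include <cusolverDn.h>

inline int potrf_workspace_size(double* A, int n, int lda) {
  int lwork = 0;
  cusolverDnDpotrf_bufferSize(cusolver_handle(), CUBLAS_FILL_MODE_LOWER,
                              n, A, lda, &lwork);
  return lwork;
}

inline void potrf_on_current_stream(double* A, int n, int lda,
                                    double* workspace, int lwork,
                                    int* dev_info) {
  // The handle was created once for the current (device, stream) pair and
  // is already bound to that stream, so the factorization is queued on it.
  cusolverDnDpotrf(cusolver_handle(), CUBLAS_FILL_MODE_LOWER, n, A, lda,
                   workspace, lwork, dev_info);
}

Caching one handle per (device, stream) pair in a thread_local map sidesteps two costs: handle creation is expensive, and cuBLAS/cuSOLVER handles must not be used concurrently from multiple threads. The HIP variants key their maps by device only and re-bind the stream on every call, which is equivalent as long as each call re-binds before use.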
examples/matrixtile.h (45 additions, 53 deletions)
@@ -6,111 +6,94 @@

#include <ttg/serialization/splitmd_data_descriptor.h>

template <typename T>
class MatrixTile {
#include <TiledArray/device/allocators.h>

template <typename T, class Allocator = TiledArray::device_pinned_allocator<double>>
class MatrixTile : public ttg::TTValue<MatrixTile<T, Allocator>> {
public:
using metadata_t = typename std::tuple<int, int, int>;

using pointer_t = typename std::shared_ptr<T>;
using buffer_t = typename ttg::buffer<T, Allocator>;
using ttvalue_type = ttg::TTValue<MatrixTile<T, Allocator>>;

private:
pointer_t _data;
buffer_t _buffer;
int _rows = 0, _cols = 0, _lda = 0;

// (Re)allocate the tile memory
void realloc() {
// std::cout << "Reallocating new tile" << std::endl;
_data = std::shared_ptr<T>(new T[_lda * _cols], [](T* p) { delete[] p; });
_buffer.reset(_lda * _cols);
}

public:
MatrixTile() {}

MatrixTile(int rows, int cols, int lda) : _rows(rows), _cols(cols), _lda(lda) { realloc(); }
MatrixTile(int rows, int cols, int lda)
: ttvalue_type()
, _buffer(lda*cols)
, _rows(rows)
, _cols(cols)
, _lda(lda)
{ }

MatrixTile(const metadata_t& metadata)
: MatrixTile(std::get<0>(metadata), std::get<1>(metadata), std::get<2>(metadata)) {}

MatrixTile(int rows, int cols, pointer_t data, int lda) : _data(data), _rows(rows), _cols(cols), _lda(lda) {}

MatrixTile(const metadata_t& metadata, pointer_t data)
MatrixTile(const metadata_t& metadata, T* data)
: MatrixTile(std::get<0>(metadata), std::get<1>(metadata), data, std::get<2>(metadata)) {}

/**
* Constructor with outside memory. The tile will *not* delete this memory
* upon destruction.
*/
MatrixTile(int rows, int cols, T* data, int lda) : _data(data, [](T*) {}), _rows(rows), _cols(cols), _lda(lda) {}

MatrixTile(const metadata_t& metadata, T* data)
: MatrixTile(std::get<0>(metadata), std::get<1>(metadata), data, std::get<2>(metadata)) {}

#if 0
/* Copy dtor and operator with a static_assert to catch unexpected copying */
MatrixTile(const MatrixTile& other) {
static_assert("Oops, copy ctor called?!");
}

MatrixTile& operator=(const MatrixTile& other) {
static_assert("Oops, copy ctor called?!");
}
#endif

MatrixTile(MatrixTile<T>&& other) = default;
MatrixTile(int rows, int cols, T* data, int lda)
: ttvalue_type()
, _buffer(data, lda*cols)
, _rows(rows)
, _cols(cols)
, _lda(lda)
{ }

MatrixTile& operator=(MatrixTile<T>&& other) = default;
MatrixTile(MatrixTile<T, Allocator>&& other) = default;

#if 0
/* Defaulted copy ctor and op for shallow copies, see comment below */
MatrixTile(const MatrixTile<T>& other) = default;
MatrixTile& operator=(MatrixTile<T, Allocator>&& other) = default;

MatrixTile& operator=(const MatrixTile<T>& other) = default;
#endif // 0
/* Deep copy ctor and op are not needed for PO since tiles will never be read
* and written concurrently. Hence shallow copies are enough, with all
* receiving tasks sharing tile data. Re-enable this once the PaRSEC backend
* can handle data sharing without excessive copying */
#if 1
MatrixTile(const MatrixTile<T>& other) : _rows(other._rows), _cols(other._cols), _lda(other._lda) {
this->realloc();
MatrixTile(const MatrixTile<T, Allocator>& other)
: ttvalue_type()
, _buffer(other._lda*other._cols)
, _rows(other._rows)
, _cols(other._cols)
, _lda(other._lda) {
std::copy_n(other.data(), _lda * _cols, this->data());
}

MatrixTile& operator=(const MatrixTile<T>& other) {
MatrixTile& operator=(const MatrixTile<T, Allocator>& other) {
this->_rows = other._rows;
this->_cols = other._cols;
this->_lda = other._lda;
this->realloc();
std::copy_n(other.data(), _lda * _cols, this->data());
return *this;
}
#endif // 1

void set_metadata(metadata_t meta) {
_rows = std::get<0>(meta);
_cols = std::get<1>(meta);
_lda = std::get<2>(meta);
this->realloc();
}

metadata_t get_metadata(void) const { return metadata_t{_rows, _cols, _lda}; }

// Accessing the raw data
T* data() { return _data.get(); }

const T* data() const { return _data.get(); }

/// @return shared_ptr to data
pointer_t data_shared() & { return _data; }

/// @return shared_ptr to data
pointer_t data_shared() const& { return _data; }
T* data() { return _buffer.host_ptr(); }

/// yields data and resets this object to a default-constructed state
pointer_t yield_data() && {
pointer_t result = _data;
*this = MatrixTile();
return std::move(result);
}
const T* data() const { return _buffer.host_ptr(); }

size_t size() const { return _cols * _lda; }

@@ -120,8 +103,17 @@ class MatrixTile {

int lda() const { return _lda; }

buffer_t& buffer() {
return _buffer;
}

const buffer_t& buffer() const {
return _buffer;
}

auto& fill(T value) {
std::fill(_data.get(), _data.get() + size(), value);
std::fill(this->data(), this->data() + size(), value);
_buffer.set_current_device(0);
return *this;
}
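The net effect of this refactor: tile storage moves from a shared_ptr into a ttg::buffer so the runtime can mirror it onto devices, while the host-facing accessors keep their shape. A small host-side sketch using only members shown in this diff; the double instantiation and sizes are illustrative, and the default allocator presumes a device-enabled TiledArray build:

// Illustrative only: exercises the refactored tile's host-side surface.
MatrixTile<double> tile(128, 128, /*lda=*/128);  // allocates via ttg::buffer
tile.fill(1.0);                  // fills through the host pointer
double sum = 0.0;
for (size_t i = 0; i < tile.size(); ++i)
  sum += tile.data()[i];         // data() now resolves to _buffer.host_ptr()
auto& buf = tile.buffer();       // what a device task hands to the runtime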
