From d1a6ef3ef935634b61011ad132a6a6f2924cf7a3 Mon Sep 17 00:00:00 2001 From: Thomas Herault Date: Wed, 6 Sep 2023 13:57:26 -0400 Subject: [PATCH 01/17] User needs to include any std:: serialization they use --- examples/potrf/testing_dlauum.cc | 1 + examples/potrf/testing_dpoinv.cc | 1 + examples/potrf/testing_dpotrf.cc | 1 + examples/potrf/testing_dtrtri.cc | 1 + 4 files changed, 4 insertions(+) diff --git a/examples/potrf/testing_dlauum.cc b/examples/potrf/testing_dlauum.cc index 51dc7f81a..717600094 100644 --- a/examples/potrf/testing_dlauum.cc +++ b/examples/potrf/testing_dlauum.cc @@ -6,6 +6,7 @@ #endif // TTG_USE_PARSEC #include +#include #include "lauum.h" #include "plgsy.h" diff --git a/examples/potrf/testing_dpoinv.cc b/examples/potrf/testing_dpoinv.cc index 550e6435d..a1e419cc8 100644 --- a/examples/potrf/testing_dpoinv.cc +++ b/examples/potrf/testing_dpoinv.cc @@ -6,6 +6,7 @@ #endif // TTG_USE_PARSEC #include +#include #include "plgsy.h" #include "pmw.h" diff --git a/examples/potrf/testing_dpotrf.cc b/examples/potrf/testing_dpotrf.cc index 5cef0bd20..d5686042f 100644 --- a/examples/potrf/testing_dpotrf.cc +++ b/examples/potrf/testing_dpotrf.cc @@ -1,4 +1,5 @@ #include +#include #include "plgsy.h" #include "pmw.h" diff --git a/examples/potrf/testing_dtrtri.cc b/examples/potrf/testing_dtrtri.cc index 4a24dec4a..107fbf8c2 100644 --- a/examples/potrf/testing_dtrtri.cc +++ b/examples/potrf/testing_dtrtri.cc @@ -6,6 +6,7 @@ #endif // TTG_USE_PARSEC #include +#include #include "plgsy.h" #include "pmw.h" From 7576321f426a18f60d4c46a1a7f197202fea8935 Mon Sep 17 00:00:00 2001 From: Thomas Herault Date: Fri, 29 Sep 2023 12:12:05 -0400 Subject: [PATCH 02/17] Work in progress: adding CUDA version of POTRF task in POTRF example -- not compiling --- examples/matrixtile.h | 46 ++++++++++------------ examples/potrf/potrf.h | 86 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 104 insertions(+), 28 deletions(-) diff --git a/examples/matrixtile.h b/examples/matrixtile.h index e23007803..b3e5a3b8e 100644 --- a/examples/matrixtile.h +++ b/examples/matrixtile.h @@ -6,12 +6,13 @@ #include -template -class MatrixTile { +template +class MatrixTile : public ttg::TTValue> { public: using metadata_t = typename std::tuple; - using pointer_t = typename std::shared_ptr; + using pointer_t = typename ttg::buffer; + using ttvalue_type = ttg::TTValue>; private: pointer_t _data; @@ -20,18 +21,18 @@ class MatrixTile { // (Re)allocate the tile memory void realloc() { // std::cout << "Reallocating new tile" << std::endl; - _data = std::shared_ptr(new T[_lda * _cols], [](T* p) { delete[] p; }); + _data = ttg::buffer(_data, _lda * _cols); } public: MatrixTile() {} - MatrixTile(int rows, int cols, int lda) : _rows(rows), _cols(cols), _lda(lda) { realloc(); } + MatrixTile(int rows, int cols, int lda) : ttvalue_type(), _rows(rows), _cols(cols), _lda(lda) { realloc(); } MatrixTile(const metadata_t& metadata) : MatrixTile(std::get<0>(metadata), std::get<1>(metadata), std::get<2>(metadata)) {} - MatrixTile(int rows, int cols, pointer_t data, int lda) : _data(data), _rows(rows), _cols(cols), _lda(lda) {} + MatrixTile(int rows, int cols, pointer_t data, int lda) : ttvalue_type(), _data(data), _rows(rows), _cols(cols), _lda(lda) {} MatrixTile(const metadata_t& metadata, pointer_t data) : MatrixTile(std::get<0>(metadata), std::get<1>(metadata), std::forward(data), std::get<2>(metadata)) {} @@ -40,38 +41,20 @@ class MatrixTile { * Constructor with outside memory. 
The tile will *not* delete this memory * upon destruction. */ - MatrixTile(int rows, int cols, T* data, int lda) : _data(data, [](T*) {}), _rows(rows), _cols(cols), _lda(lda) {} + MatrixTile(int rows, int cols, T* data, int lda) : ttvalue_type(), _data(data), _rows(rows), _cols(cols), _lda(lda) {} MatrixTile(const metadata_t& metadata, T* data) : MatrixTile(std::get<0>(metadata), std::get<1>(metadata), data, std::get<2>(metadata)) {} -#if 0 - /* Copy dtor and operator with a static_assert to catch unexpected copying */ - MatrixTile(const MatrixTile& other) { - static_assert("Oops, copy ctor called?!"); - } - - MatrixTile& operator=(const MatrixTile& other) { - static_assert("Oops, copy ctor called?!"); - } -#endif - MatrixTile(MatrixTile&& other) = default; MatrixTile& operator=(MatrixTile&& other) = default; -#if 0 - /* Defaulted copy ctor and op for shallow copies, see comment below */ - MatrixTile(const MatrixTile& other) = default; - - MatrixTile& operator=(const MatrixTile& other) = default; -#endif // 0 /* Deep copy ctor und op are not needed for PO since tiles will never be read * and written concurrently. Hence shallow copies are enough, will all * receiving tasks sharing tile data. Re-enable this once the PaRSEC backend * can handle data sharing without excessive copying */ -#if 1 - MatrixTile(const MatrixTile& other) : _rows(other._rows), _cols(other._cols), _lda(other._lda) { + MatrixTile(const MatrixTile& other) : ttvalue_type(), _rows(other._rows), _cols(other._cols), _lda(other._lda) { this->realloc(); std::copy_n(other.data(), _lda * _cols, this->data()); } @@ -84,7 +67,6 @@ class MatrixTile { std::copy_n(other.data(), _lda * _cols, this->data()); return *this; } -#endif // 1 void set_metadata(metadata_t meta) { _rows = std::get<0>(meta); @@ -144,6 +126,16 @@ class MatrixTile { } }; +#if defined(TTG_HAVE_CUDA) +using blk_t = DeviceTensor>, + btas::Handle::shared_ptr>>; +#else +// TODO: no hip pinned allocator in TA? 
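// A possible answer to the TODO above (sketch, not part of this patch): TiledArray
// only ships a CUDA pinned allocator, but a std::allocator-compatible HIP one could
// wrap hipHostMalloc/hipHostFree along these lines (`hip_pinned_allocator` is a
// hypothetical name):
//
//   template <typename T>
//   struct hip_pinned_allocator {
//     using value_type = T;
//     T* allocate(std::size_t n) {
//       void* p = nullptr;
//       if (hipSuccess != hipHostMalloc(&p, n * sizeof(T), hipHostMallocDefault))
//         throw std::bad_alloc();
//       return static_cast<T*>(p);   // page-locked host memory, fast H2D/D2H copies
//     }
//     void deallocate(T* p, std::size_t) { hipHostFree(p); }
//   };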
+using blk_t = DeviceTensor, btas::Handle::shared_ptr>>; +#endif + namespace ttg { template diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h index 6079bc6bc..e536c655a 100644 --- a/examples/potrf/potrf.h +++ b/examples/potrf/potrf.h @@ -19,7 +19,90 @@ namespace potrf { ttg::Edge>& input, ttg::Edge>& output_trsm, ttg::Edge>& output_result) { - using T = typename MatrixT::element_type; + using T = typename MatrixT::element_type; +#if defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) + static int device_potrf_workspace_size(blk_t &A) { + int Lwork; + #if defined(TTG_HAVE_CUDA) + cusolverDnDpotrf_bufferSize(ttg::detail::cublas_get_handle(), + CUBLAS_FILL_MODE_LOWER, A.extent(1), + nullptr, A.extent(0), + &Lwork); + return Lwork; + #elif defined(TTG_HAVE_HIPBLAS) + #error TBCoded + #else + return 0; + #endif + } + + static void device_potrf(blk_t &A, double *workspace, int Lwork, int *devInfo) { + int device = A.b.get_current_device(); + assert(device != 0); + #if defined(TTG_HAVE_CUDA) + cusolverDnDpotrf(ttg::detail::cublas_get_handle(), + CUBLAS_FILL_MODE_LOWER, A.extent(1), + A.b.device_ptr_on(device), A.extent(0), + workspace, Lwork, + devInfo); + #elif defined(TTG_HAVE_HIPBLAS) + hipsolverDpotrf(ttg::detail::hipblas_get_handle(), + HIPSOLVER_FILL_MODE_LOWER, A.extent(1), + A.b.device_ptr_on(device), A.extent(0), + workspace, Lwork, + devInfo); + #endif + } + + auto f_dev = [=](const Key1& key, MatrixTile&& A, + std::tuple>, ttg::Out>>& out) -> ttg::device_task { + const auto K = key[0]; + + /* pull the matrix onto the device, as computing the workspace size might in theory depend on the data */ + //TODO: extend MatrixTile to be heterogeneous-aware. Look at spmm-cuda.cc 50-253 + // Need to include a ttg::buffer _data instead of a shared_ptr; + // Check pmw.h: when we generate the MatrixTile + // Also check pinned allocator at the end of DeviceTensor (250-253) + + int Lwork = device_potrf_workspace_size(A); + + // Instead of using scratch here, we should have hostWS and hostInfo globals and use to_device + // this would reduce the number of I/O operations to devices + double hostWS[Lwork]; + ttg::devicescratch devWS = ttg::make_scratch(hostWS, ttg::scope::Allocate); + int hostInfo = -1; + ttg::devicescratch devInfo = ttg::make_scratch(&hostInfo, ttg::scope::Allocate); + + /* the workspace and the devInfo must be device-level pointers */ + co_await ttg::to_device(A._data, devWS, devInfo); + + /* everything is on the device, call the POTRF */ + device_potrf(A, devWS, Lwork, devInfo); + + /* compute successors while the kernel is running */ + std::vector keylist; + keylist.reserve(A.rows() - K); + /* TODO: reverse order of arrays */ + for (int m = K + 1; m < A.rows(); ++m) { + /* send tile to trsm */ + keylist.push_back(Key2(m, K)); + } + + /* wait for the kernel to complete */ + co_await ttg::wait_kernel(devInfo); + + if( hostInfo == 0 ) { + co_await ttg::device::forward(ttg::device::broadcast<0, 1>(std::make_tuple(Key2(K, K), keylist), std::move(A), out)); + // Anything after this co_await is never executed + // co_return would look better, but co_return would destroy keylist before the runtime can handle it + } else { + // Well... Here we should interrupt the DAG of tasks, there is an error. Raise? 
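          // LAPACK/cuSOLVER convention: hostInfo > 0 means the leading minor of that
          // order is not positive definite; hostInfo < 0 flags an invalid argument.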
+ std::cerr << "Factorization is SUSPICIOUS (the matrix might not be diagonal dominant)" << std::endl; + } + } + return ttg::make_tt(f_dev, ttg::edges(ttg::fuse(input, input_disp)), ttg::edges(output_result, output_trsm), "POTRF", + {"tile_kk/dispatcher"}, {"output_result", "output_trsm"}); +#else /* defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) */ auto f = [=](const Key1& key, MatrixTile&& tile_kk, std::tuple>, ttg::Out>>& out) { const int K = key[0]; @@ -49,6 +132,7 @@ namespace potrf { }; return ttg::make_tt(f, ttg::edges(ttg::fuse(input, input_disp)), ttg::edges(output_result, output_trsm), "POTRF", {"tile_kk/dispatcher"}, {"output_result", "output_trsm"}); +#endif } template From 4a23f42ba4465c0ffe7697f667f1e76a67a10174 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Sun, 1 Oct 2023 19:27:27 +0000 Subject: [PATCH 03/17] Further steps towards device-enabled POTRF Signed-off-by: Joseph Schuchart --- examples/matrixtile.h | 64 ++++++------ examples/potrf/potrf.h | 216 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 230 insertions(+), 50 deletions(-) diff --git a/examples/matrixtile.h b/examples/matrixtile.h index b3e5a3b8e..929b6a07e 100644 --- a/examples/matrixtile.h +++ b/examples/matrixtile.h @@ -6,34 +6,38 @@ #include -template +template > class MatrixTile : public ttg::TTValue> { public: using metadata_t = typename std::tuple; - using pointer_t = typename ttg::buffer; + using buffer_t = typename ttg::buffer; using ttvalue_type = ttg::TTValue>; private: - pointer_t _data; + buffer_t b; int _rows = 0, _cols = 0, _lda = 0; // (Re)allocate the tile memory void realloc() { // std::cout << "Reallocating new tile" << std::endl; - _data = ttg::buffer(_data, _lda * _cols); + b.reset(_lda * _cols); } public: MatrixTile() {} - MatrixTile(int rows, int cols, int lda) : ttvalue_type(), _rows(rows), _cols(cols), _lda(lda) { realloc(); } + MatrixTile(int rows, int cols, int lda) + : ttvalue_type() + , b(lda*cols) + , _rows(rows) + , _cols(cols) + , _lda(lda) + { } MatrixTile(const metadata_t& metadata) : MatrixTile(std::get<0>(metadata), std::get<1>(metadata), std::get<2>(metadata)) {} - MatrixTile(int rows, int cols, pointer_t data, int lda) : ttvalue_type(), _data(data), _rows(rows), _cols(cols), _lda(lda) {} - MatrixTile(const metadata_t& metadata, pointer_t data) : MatrixTile(std::get<0>(metadata), std::get<1>(metadata), std::forward(data), std::get<2>(metadata)) {} @@ -41,7 +45,13 @@ class MatrixTile : public ttg::TTValue> { * Constructor with outside memory. The tile will *not* delete this memory * upon destruction. */ - MatrixTile(int rows, int cols, T* data, int lda) : ttvalue_type(), _data(data), _rows(rows), _cols(cols), _lda(lda) {} + MatrixTile(int rows, int cols, T* data, int lda) + : ttvalue_type() + , _data(data, lda*cols) + , _rows(rows) + , _cols(cols) + , _lda(lda) + { } MatrixTile(const metadata_t& metadata, T* data) : MatrixTile(std::get<0>(metadata), std::get<1>(metadata), data, std::get<2>(metadata)) {} @@ -54,8 +64,12 @@ class MatrixTile : public ttg::TTValue> { * and written concurrently. Hence shallow copies are enough, will all * receiving tasks sharing tile data. 
Re-enable this once the PaRSEC backend * can handle data sharing without excessive copying */ - MatrixTile(const MatrixTile& other) : ttvalue_type(), _rows(other._rows), _cols(other._cols), _lda(other._lda) { - this->realloc(); + MatrixTile(const MatrixTile& other) + : ttvalue_type() + , b(other._lda*other._cols) + , _rows(other._rows) + , _cols(other._cols) + , _lda(other._lda) { std::copy_n(other.data(), _lda * _cols, this->data()); } @@ -77,22 +91,9 @@ class MatrixTile : public ttg::TTValue> { metadata_t get_metadata(void) const { return metadata_t{_rows, _cols, _lda}; } // Accessing the raw data - T* data() { return _data.get(); } - - const T* data() const { return _data.get(); } - - /// @return shared_ptr to data - pointer_t data_shared() & { return _data; } + T* data() { return b.host_ptr(); } - /// @return shared_ptr to data - pointer_t data_shared() const& { return _data; } - - /// yields data and resets this object to a default-constucted state - pointer_t yield_data() && { - pointer_t result = _data; - *this = MatrixTile(); - return std::move(result); - } + const T* data() const { return b.host_ptr(); } size_t size() const { return _cols * _lda; } @@ -103,7 +104,8 @@ class MatrixTile : public ttg::TTValue> { int lda() const { return _lda; } auto& fill(T value) { - std::fill(_data.get(), _data.get() + size(), value); + std::fill(data().get(), data().get() + size(), value); + b.set_current_device(0); return *this; } @@ -126,16 +128,6 @@ class MatrixTile : public ttg::TTValue> { } }; -#if defined(TTG_HAVE_CUDA) -using blk_t = DeviceTensor>, - btas::Handle::shared_ptr>>; -#else -// TODO: no hip pinned allocator in TA? -using blk_t = DeviceTensor, btas::Handle::shared_ptr>>; -#endif - namespace ttg { template diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h index e536c655a..e6966bb2b 100644 --- a/examples/potrf/potrf.h +++ b/examples/potrf/potrf.h @@ -6,6 +6,19 @@ #undef DEBUG_TILES_VALUES +#if defined(TTG_HAS_CUDART) +#define ES ttg::ExecutionSpace::CUDA +#define TASKRET -> ttg::device_task +#elif defined(TTG_HAS_HIP) +#define ES ttg::ExecutionSpace::HIP +#define TASKRET -> ttg::device_task +#else +#define ES ttg::ExecutionSpace::Host +#define TASKRET -> void +#endif + + + namespace potrf { /* FLOP macros taken from DPLASMA */ @@ -55,8 +68,19 @@ namespace potrf { } auto f_dev = [=](const Key1& key, MatrixTile&& A, - std::tuple>, ttg::Out>>& out) -> ttg::device_task { - const auto K = key[0]; + std::tuple>, ttg::Out>>& out) TASKRET { + const auto K = key[0]; + + /* compute successors before submitting the kernel running + * TODO: this is parsec specific since this code is still executing on the worker threads + */ + std::vector keylist; + keylist.reserve(A.rows() - K); + /* TODO: reverse order of arrays */ + for (int m = K + 1; m < A.rows(); ++m) { + /* send tile to trsm */ + keylist.push_back(Key2(m, K)); + } /* pull the matrix onto the device, as computing the workspace size might in theory depend on the data */ //TODO: extend MatrixTile to be heterogeneous-aware. 
Look at spmm-cuda.cc 50-253 @@ -79,28 +103,20 @@ namespace potrf { /* everything is on the device, call the POTRF */ device_potrf(A, devWS, Lwork, devInfo); - /* compute successors while the kernel is running */ - std::vector keylist; - keylist.reserve(A.rows() - K); - /* TODO: reverse order of arrays */ - for (int m = K + 1; m < A.rows(); ++m) { - /* send tile to trsm */ - keylist.push_back(Key2(m, K)); - } - /* wait for the kernel to complete */ co_await ttg::wait_kernel(devInfo); if( hostInfo == 0 ) { - co_await ttg::device::forward(ttg::device::broadcast<0, 1>(std::make_tuple(Key2(K, K), keylist), std::move(A), out)); + co_await ttg::device::forward(ttg::device::broadcast<0, 1>(std::make_tuple(Key2(K, K), std::move(keylist)), std::move(A), out)); // Anything after this co_await is never executed // co_return would look better, but co_return would destroy keylist before the runtime can handle it } else { // Well... Here we should interrupt the DAG of tasks, there is an error. Raise? - std::cerr << "Factorization is SUSPICIOUS (the matrix might not be diagonal dominant)" << std::endl; + std::cerr << "Factorization is SUSPICIOUS (the matrix might not be diagonally dominant)" << std::endl; + ttg::abort(); } } - return ttg::make_tt(f_dev, ttg::edges(ttg::fuse(input, input_disp)), ttg::edges(output_result, output_trsm), "POTRF", + return ttg::make_tt(f_dev, ttg::edges(ttg::fuse(input, input_disp)), ttg::edges(output_result, output_trsm), "POTRF", {"tile_kk/dispatcher"}, {"output_result", "output_trsm"}); #else /* defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) */ auto f = [=](const Key1& key, MatrixTile&& tile_kk, @@ -145,6 +161,69 @@ namespace potrf { ttg::Edge>& output_col, // to GEMM ttg::Edge>& output_result) { using T = typename MatrixT::element_type; +#if defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) + auto f = [=](const Key2& key, const MatrixTile& tile_kk, MatrixTile&& tile_mk, + std::tuple>, ttg::Out>, ttg::Out>, + ttg::Out>>& out) TASKRET { + const int M = key[0]; + const int K = key[1]; // the column equals the outer most look K (same as PO) + + auto mb = tile_mk.rows(); + auto nb = tile_mk.cols(); + + /* in trsm, tile_mk is mb x nb, and tile_kk needs to be lda x nb because side = Right */ + assert(nb == tile_kk.rows()); + + if (ttg::tracing()) ttg::print("TRSM(", key, ")"); + + /* populate successor keys while we're on the worker thread */ + std::vector keylist_row; + keylist_row.reserve(M - K); + std::vector keylist_col; + keylist_col.reserve(A.rows() - M - 1); + + /* send tile to syrk on diagonal */ + if (ttg::tracing()) ttg::print("TRSM(", key, "): sending output to syrk(", Key2{K, M}, ")"); + + /* send the tile to all gemms across in row i */ + for (int n = K + 1; n < M; ++n) { + if (ttg::tracing()) ttg::print("TRSM(", key, "): sending output to gemm( ", Key3{M, n, K}, ")"); + keylist_row.push_back(Key3(M, n, K)); + } + + /* send the tile to all gemms down in column i */ + for (int m = M + 1; m < A.rows(); ++m) { + if (ttg::tracing()) ttg::print("TRSM(", key, "): sending output to gemm( ", Key3{m, M, K}, ")"); + keylist_col.push_back(Key3(m, M, K)); + } + + + co_await ttg::to_device(tile_kk.b, tile_mk.b); + int device = tile_kk.b.get_current_device(); + double alpha = 1.0; +#if defined(TTG_HAVE_CUDA) + cublasDtrsm(ttg::detail::cublas_get_handle(), + CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, + CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, + mb, nb, &alpha, + tile_kk.b.device_ptr_on(device), tile_kk.lda(), + tile_mk.b.device_ptr_on(device), tile_mk.lda()); +#elif 
defined(TTG_HAVE_HIPBLAS) + hipblasDtrsm(ttg::detail:hipblas_get_handle(), + HIPBLAS_SIDE_RIGHT, HIPBLAS_FILL_MODE_LOWER, + HIPBLAS_OP_T, HIPBLAS_DIAG_NON_UNIT, + mb, nb, &alpha, + tile_kk.b.device_ptr_on(device), tile_kk.lda(), + tile_mk.b.device_ptr_on(device), tile_mk.lda()); +#endif + + co_await ttg::device::forward(ttg::device::broadcast<0, 1, 2, 3>(std::make_tuple(key, Key2(K, M), keylist_row, keylist_col), + std::move(tile_mk), out)); + }; + return ttg::make_tt(f, ttg::edges(input_kk, ttg::fuse(input_mk, input_disp)), + ttg::edges(output_result, output_diag, output_row, output_col), "TRSM", + {"tile_kk", "tile_mk/dispatcher"}, {"output_result", "tile_mk", "output_row", "output_col"}); +#else auto f = [=](const Key2& key, const MatrixTile& tile_kk, MatrixTile&& tile_mk, std::tuple>, ttg::Out>, ttg::Out>, ttg::Out>>& out) { @@ -195,6 +274,7 @@ namespace potrf { return ttg::make_tt(f, ttg::edges(input_kk, ttg::fuse(input_mk, input_disp)), ttg::edges(output_result, output_diag, output_row, output_col), "TRSM", {"tile_kk", "tile_mk/dispatcher"}, {"output_result", "tile_mk", "output_row", "output_col"}); +#endif } template @@ -205,6 +285,54 @@ namespace potrf { ttg::Edge>& output_potrf, // to POTRF ttg::Edge>& output_syrk) { using T = typename MatrixT::element_type; +#if defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) + auto f = [=](const Key2& key, const MatrixTile& tile_mk, MatrixTile&& tile_kk, + std::tuple>, ttg::Out>>& out) TASKRET { + const int K = key[0]; + const int M = key[1]; + + /* tile_kk is mb x mb and tile_mk is mb x nb */ + assert(tile_kk.rows() == tile_kk.cols()); + assert(tile_mk.rows() == tile_kk.rows()); + + auto mb = tile_mk.rows(); + auto nb = tile_mk.cols(); + + if (ttg::tracing()) ttg::print("SYRK(", key, ")"); + + co_await ttg::to_device(tile_kk.b, tile_mk.b); + + double alpha = -1.0; + double beta = 1.0; +#if defined(TTG_HAVE_CUDA) + cublasDsyrk(ttg::detail::cublas_get_handle(), + CUBLAS_FILL_MODE_LOWER, + CUBLAS_OP_N, + mb, nb, &alpha, + tile_nk.b.device_ptr_on(device), tile_mk.lda(), &beta, + tile_kk.b.device_ptr_on(device), tile_kk.lda()); +#elif defined(TTG_HAVE_HIPBLAS) + hipblasDsyrk(ttg::detail:hipblas_get_handle(), + HIPBLAS_FILL_MODE_LOWER, + HIPBLAS_OP_N, + mb, nb, &alpha, + tile_kk.b.device_ptr_on(device), tile_kk.lda(), &beta, + tile_mk.b.device_ptr_on(device), tile_mk.lda()); +#endif + + if (M == K + 1) { + /* send the tile to potrf */ + if (ttg::tracing()) ttg::print("SYRK(", key, "): sending output to POTRF(", Key1{K + 1}, ")"); + co_await ttg::device::send<0>(Key1(K + 1), std::move(tile_kk), out); + } else { + /* send output to next syrk */ + if (ttg::tracing()) ttg::print("SYRK(", key, "): sending output to SYRK(", Key2{K + 1, M}, ")"); + co_await ttg::device::send<1>(Key2(K + 1, M), std::move(tile_kk), out); + } + }; + return ttg::make_tt(f, ttg::edges(input_mk, ttg::fuse(input_kk, input_disp)), ttg::edges(output_potrf, output_syrk), + "SYRK", {"tile_mk", "tile_kk/dispatcher"}, {"output_potrf", "output_syrk"}); +#else auto f = [=](const Key2& key, const MatrixTile& tile_mk, MatrixTile&& tile_kk, std::tuple>, ttg::Out>>& out) { const int K = key[0]; @@ -242,6 +370,7 @@ namespace potrf { }; return ttg::make_tt(f, ttg::edges(input_mk, ttg::fuse(input_kk, input_disp)), ttg::edges(output_potrf, output_syrk), "SYRK", {"tile_mk", "tile_kk/dispatcher"}, {"output_potrf", "output_syrk"}); +#endif } template @@ -253,6 +382,64 @@ namespace potrf { ttg::Edge>& output_trsm, // to TRSM ttg::Edge>& output_gemm) { using T = typename MatrixT::element_type; +#if 
defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) + auto f = [=](const Key3& key, const MatrixTile& tile_mk, const MatrixTile& tile_nk, MatrixTile&& tile_mn, + std::tuple>, ttg::Out>>& out) TASKRET { + const int M = key[0]; + const int N = key[1]; + const int K = key[2]; + assert(M != N && M > K && N > K); + + assert(tile_mk.cols() == tile_nk.cols()); + assert(tile_mk.rows() == tile_mn.rows()); + assert(tile_nk.rows() == tile_mn.cols()); + + if (ttg::tracing()) ttg::print("GEMM(", key, ")"); +#if defined(DEBUG_TILES_VALUES) + std::cout << "Before GEMM(" << key << "), A(" << M << ", " << K << ") is " << tile_mk << " and A(" << K << ", " + << N << ") is " << tile_nk << " and A(" << M << ", " << N << ") is " << tile_mn; +#endif + + co_await ttg::to_device(tile_mk.b, tile_nk.b, tile_mn.b); + + double alpha = -1.0; + double beta = 1.0; +#if defined(TTG_HAVE_CUDA) + cublasDgemm(ttg::detail:cublas_get_handle(), + CUBLAS_OP_N, CUBLAS_OP_T, + tile_mk.rows(), tile_nk.rows(), + tile_nk.cols(), &alpha, + tile_mk.data(), tile_mk.lda(), + tile_nk.data(), tile_nk.lda(), &beta, + tile_mn.data(), tile_mn.lda()); +#elif defined(TTG_HAVE_HIPBLAS) + hipblasDgemm(ttg::detail:hipblas_get_handle(), + HIPBLAS_OP_N, HIPBLAS_OP_T, + tile_mk.rows(), tile_nk.rows(), + tile_nk.cols(), &alpha, + tile_mk.data(), tile_mk.lda(), + tile_nk.data(), tile_nk.lda(), &beta, + tile_mn.data(), tile_mn.lda()); +#endif + +#if defined(DEBUG_TILES_VALUES) + std::cout << "After GEMM(" << key << "), A(" << M << ", " << N << ") is " << tile_mn << std::endl; +#endif + + if (N == K + 1) { + /* send the tile to trsm */ + if (ttg::tracing()) ttg::print("GEMM(", key, "): sending output to TRSM(", Key2{M, N}, ")"); + co_await ttg::device::send<0>(Key2(M, N), std::move(tile_mn), out); + } else { + /* send the tile to the next gemm */ + if (ttg::tracing()) ttg::print("GEMM(", key, "): sending output to GEMM(", Key3{M, N, K + 1}, ")"); + co_await ttg::device::send<1>(Key3(M, N, K + 1), std::move(tile_mn), out); + } + }; + return ttg::make_tt(f, ttg::edges(input_mk, input_nk, ttg::fuse(input_disp, input_mn)), + ttg::edges(output_trsm, output_gemm), "GEMM", {"input_mk", "input_kn", "input_mn/dispatcher"}, + {"output_trsm", "outout_gemm"}); +#else auto f = [=](const Key3& key, const MatrixTile& tile_mk, const MatrixTile& tile_nk, MatrixTile&& tile_mn, std::tuple>, ttg::Out>>& out) { const int M = key[0]; @@ -291,6 +478,7 @@ namespace potrf { return ttg::make_tt(f, ttg::edges(input_mk, input_nk, ttg::fuse(input_disp, input_mn)), ttg::edges(output_trsm, output_gemm), "GEMM", {"input_mk", "input_kn", "input_mn/dispatcher"}, {"output_trsm", "outout_gemm"}); +#endif } template From 57faac424de2f510d828196d131ec03a82a5f04d Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 2 Oct 2023 02:16:45 +0000 Subject: [PATCH 04/17] Add allocator to ttg::buffer Signed-off-by: Joseph Schuchart --- examples/matrixtile.h | 25 +++++++------- ttg/ttg/buffer.h | 8 ++--- ttg/ttg/parsec/buffer.h | 72 ++++++++++++++++++++++------------------- ttg/ttg/parsec/fwd.h | 2 +- 4 files changed, 55 insertions(+), 52 deletions(-) diff --git a/examples/matrixtile.h b/examples/matrixtile.h index 929b6a07e..693ae3d1e 100644 --- a/examples/matrixtile.h +++ b/examples/matrixtile.h @@ -6,13 +6,15 @@ #include -template > -class MatrixTile : public ttg::TTValue> { +#include + +template > +class MatrixTile : public ttg::TTValue> { public: using metadata_t = typename std::tuple; - using buffer_t = typename ttg::buffer; - using ttvalue_type = ttg::TTValue>; + using buffer_t = typename 
ttg::buffer; + using ttvalue_type = ttg::TTValue>; private: buffer_t b; @@ -38,7 +40,7 @@ class MatrixTile : public ttg::TTValue> { MatrixTile(const metadata_t& metadata) : MatrixTile(std::get<0>(metadata), std::get<1>(metadata), std::get<2>(metadata)) {} - MatrixTile(const metadata_t& metadata, pointer_t data) + MatrixTile(const metadata_t& metadata, T* data) : MatrixTile(std::get<0>(metadata), std::get<1>(metadata), std::forward(data), std::get<2>(metadata)) {} /** @@ -47,24 +49,21 @@ class MatrixTile : public ttg::TTValue> { */ MatrixTile(int rows, int cols, T* data, int lda) : ttvalue_type() - , _data(data, lda*cols) + , b(data, lda*cols) , _rows(rows) , _cols(cols) , _lda(lda) { } - MatrixTile(const metadata_t& metadata, T* data) - : MatrixTile(std::get<0>(metadata), std::get<1>(metadata), data, std::get<2>(metadata)) {} - - MatrixTile(MatrixTile&& other) = default; + MatrixTile(MatrixTile&& other) = default; - MatrixTile& operator=(MatrixTile&& other) = default; + MatrixTile& operator=(MatrixTile&& other) = default; /* Deep copy ctor und op are not needed for PO since tiles will never be read * and written concurrently. Hence shallow copies are enough, will all * receiving tasks sharing tile data. Re-enable this once the PaRSEC backend * can handle data sharing without excessive copying */ - MatrixTile(const MatrixTile& other) + MatrixTile(const MatrixTile& other) : ttvalue_type() , b(other._lda*other._cols) , _rows(other._rows) @@ -73,7 +72,7 @@ class MatrixTile : public ttg::TTValue> { std::copy_n(other.data(), _lda * _cols, this->data()); } - MatrixTile& operator=(const MatrixTile& other) { + MatrixTile& operator=(const MatrixTile& other) { this->_rows = other._rows; this->_cols = other._cols; this->_lda = other._lda; diff --git a/ttg/ttg/buffer.h b/ttg/ttg/buffer.h index 1868b7e0c..6d252cb09 100644 --- a/ttg/ttg/buffer.h +++ b/ttg/ttg/buffer.h @@ -7,16 +7,16 @@ namespace ttg { -template -using buffer = TTG_IMPL_NS::buffer; +template> +using buffer = TTG_IMPL_NS::buffer; namespace detail { template struct is_buffer : std::false_type { }; - template - struct is_buffer> : std::true_type + template + struct is_buffer> : std::true_type { }; template diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h index 0e9c58cf9..00c7c996e 100644 --- a/ttg/ttg/parsec/buffer.h +++ b/ttg/ttg/parsec/buffer.h @@ -34,33 +34,43 @@ namespace detail { * in order for TTG to properly facilitate ownership * tracking of the containing object. 
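 *
 * Typical use from within a device task (sketch, relying on the accessors added
 * elsewhere in this series):
 *   ttg::buffer<double> buf(1024);      // host-side allocation, registered with PaRSEC
 *   co_await ttg::to_device(buf);       // ensure a device copy is available
 *   double *dev = buf.device_ptr_on(ttg::device::current_device());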
*/
-template <typename T>
-struct buffer : public detail::ttg_parsec_data_wrapper_t {
+template <typename T, typename Allocator>
+struct buffer : public detail::ttg_parsec_data_wrapper_t
+              , private Allocator {

   using element_type = std::decay_t<T>;

+  using allocator_traits = std::allocator_traits<Allocator>;
+  using allocator_type = typename allocator_traits::allocator_type;
+
   static_assert(std::is_trivially_copyable_v<element_type>,
                 "Only trivially copyable types are supported for devices.");
   static_assert(std::is_default_constructible_v<element_type>,
                 "Only default constructible types are supported for devices.");

 private:
-  using delete_fn_t = std::add_pointer_t<void(element_type*)>;
+  using delete_fn_t = std::function<void(element_type*)>;

   using host_data_ptr = std::unique_ptr<element_type[], delete_fn_t>;
   host_data_ptr m_host_data;
   std::size_t m_count = 0;

-  static void delete_owned(element_type *ptr) {
-    delete[] ptr;
-  }
-
   static void delete_non_owned(element_type *ptr) {
     // nothing to be done, we don't own the memory
   }

   friend parsec_data_t* detail::get_parsec_data(const ttg_parsec::buffer<T, Allocator>&);

+  allocator_type& get_allocator_reference() { return static_cast<allocator_type&>(*this); }
+
+  element_type* allocate(std::size_t n) {
+    return allocator_traits::allocate(get_allocator_reference(), n);
+  }
+
+  void deallocate(element_type *ptr, std::size_t n) {
+    allocator_traits::deallocate(get_allocator_reference(), ptr, n);
+  }
+
 public:

   /* The device ID of the CPU. */

   buffer() : buffer(nullptr, 0)
   { }

-  buffer(std::size_t count)
+  buffer(std::size_t n)
   : ttg_parsec_data_wrapper_t()
-  , m_host_data(new element_type[count](), &delete_owned)
-  , m_count(count)
+  , allocator_type()
+  , m_host_data(allocate(n), [this, n](element_type* ptr){ deallocate(ptr, n); })
+  , m_count(n)
   {
     //std::cout << "buffer " << this << " ctor count "
     //          << count << "(" << m_host_data.get() << ") ttg_copy "
     //          << m_ttg_copy
     //          << " parsec_data " << m_data.get() << std::endl;
-    this->reset_parsec_data(m_host_data.get(), count*sizeof(element_type));
+    this->reset_parsec_data(m_host_data.get(), n*sizeof(element_type));
   }

   /* Constructing a buffer using application-managed memory.
    * The memory pointed to by ptr must be accessible during
    * the life-time of the buffer.
*/ - buffer(element_type* ptr, std::size_t count = 1) + buffer(element_type* ptr, std::size_t n = 1) : ttg_parsec_data_wrapper_t() + , allocator_type() , m_host_data(ptr, &delete_non_owned) - , m_count(count) + , m_count(n) { - this->reset_parsec_data(m_host_data.get(), count*sizeof(element_type)); - //std::cout << "buffer " << this << " ctor ptr " << ptr << " count " << count - // << " ttg_copy " << m_ttg_copy - // << " parsec_data " << m_data.get() << std::endl; + this->reset_parsec_data(m_host_data.get(), n*sizeof(element_type)); } virtual ~buffer() { @@ -102,18 +111,11 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t { /* allow moving device buffers */ buffer(buffer&& db) : ttg_parsec_data_wrapper_t(std::move(db)) + , allocator_type(std::move(db)) , m_host_data(std::move(db.m_host_data)) , m_count(db.m_count) { - //std::cout << "buffer " << this << " other " << &db << " mv ctor ttg_copy " << m_ttg_copy << std::endl; db.m_count = 0; - - //std::cout << "buffer::move-ctor from " << &db << " ttg-copy " << db.m_ttg_copy - // << " to " << this << " ttg-copy " << m_ttg_copy - // << " parsec-data " << m_data.get() - // << std::endl; - -//#endif // 0 } /* explicitly disable copying of buffers @@ -124,6 +126,7 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t { /* allow moving device buffers */ buffer& operator=(buffer&& db) { ttg_parsec_data_wrapper_t::operator=(std::move(db)); + allocator_type::operator=(std::move(db)); m_host_data = std::move(db.m_host_data); m_count = db.m_count; db.m_count = 0; @@ -202,6 +205,7 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t { void allocate_on(int device_id) { /* TODO: need exposed PaRSEC memory allocator */ + throw std::runtime_error("not implemented yet"); } /* TODO: can we do this automatically? @@ -246,26 +250,26 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t { } /* Reallocate the buffer with count elements */ - void reset(std::size_t count) { + void reset(std::size_t n) { /* TODO: can we resize if count is smaller than m_count? */ /* drop the current data and reallocate */ reset(); - if (count == 0) { + if (n == 0) { m_host_data = host_data_ptr(nullptr, &delete_non_owned); } else { - m_host_data = host_data_ptr(new element_type[count], &delete_owned); + m_host_data = host_data_ptr(allocate(n), [this, n](element_type* ptr){ deallocate(ptr, n); }); } - reset_parsec_data(m_host_data.get(), count*sizeof(element_type)); + reset_parsec_data(m_host_data.get(), n*sizeof(element_type)); //std::cout << "buffer::reset(" << count << ") ptr " << m_host_data.get() // << " ttg_copy " << m_ttg_copy // << " parsec_data " << m_data.get() << std::endl; - m_count = count; + m_count = n; } /* Reset the buffer to use the ptr to count elements */ - void reset(T* ptr, std::size_t count = 1) { + void reset(T* ptr, std::size_t n = 1) { /* TODO: can we resize if count is smaller than m_count? 
*/ - if (count == m_count) { + if (n == m_count) { return; } @@ -274,9 +278,9 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t { m_count = 0; } else { m_host_data = host_data_ptr(ptr, &delete_non_owned); - m_count = count; + m_count = n; } - reset_parsec_data(m_host_data.get(), count*sizeof(element_type)); + reset_parsec_data(m_host_data.get(), n*sizeof(element_type)); //std::cout << "buffer::reset(" << ptr << ", " << count << ") ptr " << m_host_data.get() // << " ttg_copy " << m_ttg_copy // << " parsec_data " << m_data.get() << std::endl; diff --git a/ttg/ttg/parsec/fwd.h b/ttg/ttg/parsec/fwd.h index f184548d4..098f51220 100644 --- a/ttg/ttg/parsec/fwd.h +++ b/ttg/ttg/parsec/fwd.h @@ -16,7 +16,7 @@ namespace ttg_parsec { template struct ptr; - template + template> struct buffer; template struct devicescratch; From 194e93da4ab653476c133603a7602a2798cf6aef Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 2 Oct 2023 02:19:35 +0000 Subject: [PATCH 05/17] Add first full draft of device-enabled POTRF Signed-off-by: Joseph Schuchart --- examples/CMakeLists.txt | 10 +++++++++- examples/potrf/potrf.h | 6 ++---- examples/potrf/testing_dpotrf.cc | 8 +++++++- ttg/ttg.h | 1 + 4 files changed, 19 insertions(+), 6 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 24b6f5897..b599d97f6 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -14,13 +14,21 @@ if (TARGET tiledarray) if (TARGET CUDA::cublas) add_ttg_executable(bspmm-cuda spmm/spmm_cuda.cc LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS Boost::boost CUDA::cublas - COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2;TTG_HAVE_CUDA=1 + COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2 + RUNTIMES "parsec") + + add_ttg_executable(testing_dpotrf_cuda potrf/testing_dpotrf.cc + LINK_LIBRARIES lapackpp tiledarray CUDA::cublas + COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2 RUNTIMES "parsec") elseif (TARGET roc::hipblas) add_ttg_executable(bspmm-hip spmm/spmm_cuda.cc LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS Boost::boost roc::hipblas COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2 RUNTIMES "parsec") + add_ttg_executable(testing_dpotrf_hip potrf/testing_dpotrf.cc + LINK_LIBRARIES lapackpp tiledarray roc::hipblas + RUNTIMES "parsec") endif() add_ttg_executable(chain-ttg-dev task-benchmarks/chain-ttg-dev.cc LINK_LIBRARIES tiledarray RUNTIMES "parsec") diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h index e6966bb2b..039a0065b 100644 --- a/examples/potrf/potrf.h +++ b/examples/potrf/potrf.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include "lapack.hh" #include "pmw.h" @@ -34,6 +35,7 @@ namespace potrf { ttg::Edge>& output_result) { using T = typename MatrixT::element_type; #if defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) + std::cout << "Creating CUDA POTRF task " << std::endl; static int device_potrf_workspace_size(blk_t &A) { int Lwork; #if defined(TTG_HAVE_CUDA) @@ -422,10 +424,6 @@ namespace potrf { tile_mn.data(), tile_mn.lda()); #endif -#if defined(DEBUG_TILES_VALUES) - std::cout << "After GEMM(" << key << "), A(" << M << ", " << N << ") is " << tile_mn << std::endl; -#endif - if (N == K + 1) { /* send the tile to trsm */ if (ttg::tracing()) ttg::print("GEMM(", key, "): sending output to TRSM(", Key2{M, N}, ")"); diff --git a/examples/potrf/testing_dpotrf.cc b/examples/potrf/testing_dpotrf.cc index d5686042f..76055b4d4 100644 --- a/examples/potrf/testing_dpotrf.cc +++ 
b/examples/potrf/testing_dpotrf.cc @@ -56,7 +56,12 @@ int main(int argc, char **argv) bool check = !cmdOptionExists(argv+1, argv+argc, "-x"); bool cow_hint = !cmdOptionExists(argv+1, argv+argc, "-w"); - ttg::initialize(argc, argv, nthreads); + // TODO: need to filter out our arguments to make parsec happy + ttg::initialize(1, argv, nthreads); + + // initialize MADNESS so that TA allocators can be created + madness::ParsecRuntime::initialize_with_existing_context(ttg::default_execution_context().impl().context()); + madness::initialize(argc, argv, /* nthread = */ 1, /* quiet = */ true); auto world = ttg::default_execution_context(); if(nullptr != prof_filename) { @@ -207,6 +212,7 @@ int main(int argc, char **argv) world.profile_off(); + madness::finalize(); ttg::finalize(); return ret; } diff --git a/ttg/ttg.h b/ttg/ttg.h index 09ccfaf90..c84af7854 100644 --- a/ttg/ttg.h +++ b/ttg/ttg.h @@ -1,6 +1,7 @@ #ifndef TTG_H_INCLUDED #define TTG_H_INCLUDED +#include "ttg/config.h" #include "ttg/fwd.h" #include "ttg/runtimes.h" From e41a4b1354c92cd63ff544cdbdd0c2e6036ed742 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 2 Oct 2023 02:44:45 +0000 Subject: [PATCH 06/17] POTRF: Use the right precompiler guard for CUDA and HIP Signed-off-by: Joseph Schuchart --- examples/potrf/potrf.h | 82 +++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 40 deletions(-) diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h index 039a0065b..e6230371a 100644 --- a/examples/potrf/potrf.h +++ b/examples/potrf/potrf.h @@ -7,10 +7,10 @@ #undef DEBUG_TILES_VALUES -#if defined(TTG_HAS_CUDART) +#if defined(TTG_HAVE_CUDART) #define ES ttg::ExecutionSpace::CUDA #define TASKRET -> ttg::device_task -#elif defined(TTG_HAS_HIP) +#elif defined(TTG_HAVE_HIP) #define ES ttg::ExecutionSpace::HIP #define TASKRET -> ttg::device_task #else @@ -27,6 +27,41 @@ namespace potrf { inline double FADDS_POTRF(double __n) { return (__n * (((1. / 6.) * __n) * __n - (1. 
/ 6.))); } inline double FLOPS_DPOTRF(double __n) { return FMULS_POTRF(__n) + FADDS_POTRF(__n); } +#if defined(TTG_HAVE_CUDART) || defined(TTG_HAVE_HIP) + static int device_potrf_workspace_size(MatrixTile &A) { + int Lwork; + #if defined(TTG_HAVE_CUDA) + cusolverDnDpotrf_bufferSize(ttg::detail::cublas_get_handle(), + CUBLAS_FILL_MODE_LOWER, A.extent(1), + nullptr, A.extent(0), + &Lwork); + return Lwork; + #elif defined(TTG_HAVE_HIPBLAS) + #error TBCoded + #else + return 0; + #endif + } + + static void device_potrf(MatrixTile &A, double *workspace, int Lwork, int *devInfo) { + int device = A.b.get_current_device(); + assert(device != 0); + #if defined(TTG_HAVE_CUDA) + cusolverDnDpotrf(ttg::detail::cublas_get_handle(), + CUBLAS_FILL_MODE_LOWER, A.extent(1), + A.b.device_ptr_on(device), A.extent(0), + workspace, Lwork, + devInfo); + #elif defined(TTG_HAVE_HIPBLAS) + hipsolverDpotrf(ttg::detail::hipblas_get_handle(), + HIPSOLVER_FILL_MODE_LOWER, A.extent(1), + A.b.device_ptr_on(device), A.extent(0), + workspace, Lwork, + devInfo); + #endif + } +#endif // defined(TTG_HAVE_CUDART) || defined(TTG_HAVE_HIP) + template auto make_potrf(MatrixT& A, ttg::Edge>& input_disp, // from the dispatcher @@ -34,41 +69,8 @@ namespace potrf { ttg::Edge>& output_trsm, ttg::Edge>& output_result) { using T = typename MatrixT::element_type; -#if defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) +#if defined(TTG_HAVE_CUDART) || defined(TTG_HAVE_HIP) std::cout << "Creating CUDA POTRF task " << std::endl; - static int device_potrf_workspace_size(blk_t &A) { - int Lwork; - #if defined(TTG_HAVE_CUDA) - cusolverDnDpotrf_bufferSize(ttg::detail::cublas_get_handle(), - CUBLAS_FILL_MODE_LOWER, A.extent(1), - nullptr, A.extent(0), - &Lwork); - return Lwork; - #elif defined(TTG_HAVE_HIPBLAS) - #error TBCoded - #else - return 0; - #endif - } - - static void device_potrf(blk_t &A, double *workspace, int Lwork, int *devInfo) { - int device = A.b.get_current_device(); - assert(device != 0); - #if defined(TTG_HAVE_CUDA) - cusolverDnDpotrf(ttg::detail::cublas_get_handle(), - CUBLAS_FILL_MODE_LOWER, A.extent(1), - A.b.device_ptr_on(device), A.extent(0), - workspace, Lwork, - devInfo); - #elif defined(TTG_HAVE_HIPBLAS) - hipsolverDpotrf(ttg::detail::hipblas_get_handle(), - HIPSOLVER_FILL_MODE_LOWER, A.extent(1), - A.b.device_ptr_on(device), A.extent(0), - workspace, Lwork, - devInfo); - #endif - } - auto f_dev = [=](const Key1& key, MatrixTile&& A, std::tuple>, ttg::Out>>& out) TASKRET { const auto K = key[0]; @@ -120,7 +122,7 @@ namespace potrf { } return ttg::make_tt(f_dev, ttg::edges(ttg::fuse(input, input_disp)), ttg::edges(output_result, output_trsm), "POTRF", {"tile_kk/dispatcher"}, {"output_result", "output_trsm"}); -#else /* defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) */ +#else /* defined(TTG_HAVE_CUDART) || defined(TTG_HAVE_HIP) */ auto f = [=](const Key1& key, MatrixTile&& tile_kk, std::tuple>, ttg::Out>>& out) { const int K = key[0]; @@ -163,7 +165,7 @@ namespace potrf { ttg::Edge>& output_col, // to GEMM ttg::Edge>& output_result) { using T = typename MatrixT::element_type; -#if defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) +#if defined(TTG_HAVE_CUDART) || defined(TTG_HAVE_HIP) auto f = [=](const Key2& key, const MatrixTile& tile_kk, MatrixTile&& tile_mk, std::tuple>, ttg::Out>, ttg::Out>, ttg::Out>>& out) TASKRET { @@ -287,7 +289,7 @@ namespace potrf { ttg::Edge>& output_potrf, // to POTRF ttg::Edge>& output_syrk) { using T = typename MatrixT::element_type; -#if defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) +#if 
defined(TTG_HAVE_CUDART) || defined(TTG_HAVE_HIP) auto f = [=](const Key2& key, const MatrixTile& tile_mk, MatrixTile&& tile_kk, std::tuple>, ttg::Out>>& out) TASKRET { const int K = key[0]; @@ -384,7 +386,7 @@ namespace potrf { ttg::Edge>& output_trsm, // to TRSM ttg::Edge>& output_gemm) { using T = typename MatrixT::element_type; -#if defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) +#if defined(TTG_HAVE_CUDART) || defined(TTG_HAVE_HIP) auto f = [=](const Key3& key, const MatrixTile& tile_mk, const MatrixTile& tile_nk, MatrixTile&& tile_mn, std::tuple>, ttg::Out>>& out) TASKRET { const int M = key[0]; From 18577f96d8a75472fb8e5e23753577cf53392449 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 2 Oct 2023 22:59:48 +0000 Subject: [PATCH 07/17] POTRF: make it run, still fails factorization Notable changes: - add ttg::device::current_device and ttg::device::current_stream to query current streams - Consistently start counting devices at 0 - The host has ID -2 - Fixes in the broadcast Signed-off-by: Joseph Schuchart --- examples/CMakeLists.txt | 18 +++-- examples/devblas_helper.h | 158 ++++++++++++++++++++++++++++++++++++++ examples/matrixtile.h | 25 ++++-- examples/potrf/potrf.h | 121 +++++++++++++++++------------ ttg/CMakeLists.txt | 1 + ttg/ttg/device/device.h | 61 +++++++++++++++ ttg/ttg/device/task.h | 38 +++++---- ttg/ttg/devicescratch.h | 2 +- ttg/ttg/parsec/buffer.h | 41 +++++----- ttg/ttg/parsec/ttg.h | 30 +++++++- 10 files changed, 388 insertions(+), 107 deletions(-) create mode 100644 examples/devblas_helper.h create mode 100644 ttg/ttg/device/device.h diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index b599d97f6..4d8896444 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -17,18 +17,22 @@ if (TARGET tiledarray) COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2 RUNTIMES "parsec") - add_ttg_executable(testing_dpotrf_cuda potrf/testing_dpotrf.cc - LINK_LIBRARIES lapackpp tiledarray CUDA::cublas - COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2 - RUNTIMES "parsec") + if (TARGET CUDA::cusolver) + add_ttg_executable(testing_dpotrf_cuda potrf/testing_dpotrf.cc + LINK_LIBRARIES lapackpp tiledarray CUDA::cublas CUDA::cusolver + COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2;TTG_ENABLE_CUDA=1 + RUNTIMES "parsec") + endif(TARGET CUDA::cusolver) elseif (TARGET roc::hipblas) add_ttg_executable(bspmm-hip spmm/spmm_cuda.cc LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS Boost::boost roc::hipblas COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2 RUNTIMES "parsec") - add_ttg_executable(testing_dpotrf_hip potrf/testing_dpotrf.cc - LINK_LIBRARIES lapackpp tiledarray roc::hipblas - RUNTIMES "parsec") + if (TARGET roc::hipsolver) + add_ttg_executable(testing_dpotrf_hip potrf/testing_dpotrf.cc + LINK_LIBRARIES lapackpp tiledarray roc::hipblas roc::hipsolver + RUNTIMES "parsec") + endif(TARGET roc::hipsolver) endif() add_ttg_executable(chain-ttg-dev task-benchmarks/chain-ttg-dev.cc LINK_LIBRARIES tiledarray RUNTIMES "parsec") diff --git a/examples/devblas_helper.h b/examples/devblas_helper.h new file mode 100644 index 000000000..704a2bc85 --- /dev/null +++ b/examples/devblas_helper.h @@ -0,0 +1,158 @@ +#include "ttg/config.h" + +#include +#include +#include +#include + +#ifdef TTG_HAVE_CUDART + +#include +#include +#include + +namespace detail { + template + struct device_stream_t { + int device; + Stream stream; + device_stream_t(int device, Stream stream) + : device(device) + 
, stream(stream)
+      { }
+
+      bool operator<(const device_stream_t& ds) const {
+        // strict weak ordering: compare the device first, then the stream
+        bool result = (device < ds.device)
+                   || (device == ds.device
+                       && reinterpret_cast<uintptr_t>(stream) < reinterpret_cast<uintptr_t>(ds.stream));
+        std::cout << *this << " < " << ds << ": " << result << std::endl;
+        return result;
+      }
+
+      bool operator==(const device_stream_t& ds) const {
+        bool result = ((device == ds.device) && (stream == ds.stream));
+        std::cout << *this << " == " << ds << ": " << result << std::endl;
+        return result;
+      }
+  };
+} // namespace detail
+
+namespace std {
+  template <typename Stream>
+  std::ostream& operator<<(std::ostream& os, const ::detail::device_stream_t<Stream>& ds) {
+    os << "[" << ds.device << ", " << ds.stream << "]";
+    return os;
+  }
+
+} //namespace std
+
+/// \brief Returns the cuBLAS handle to be used for launching cuBLAS kernels from the current thread
+/// \return the cuBLAS handle for the current thread
+inline const cublasHandle_t& cublas_handle() {
+  using map_type = std::map<std::pair<int, cudaStream_t>, cublasHandle_t>;
+  static thread_local map_type handles;
+
+  int device = ttg::device::current_device();
+  cudaStream_t stream = ttg::device::current_stream();
+
+  map_type::iterator it;
+  if ((it = handles.find({device, stream})) == handles.end()){
+    cublasHandle_t handle;
+    auto status = cublasCreate_v2(&handle);
+    if (CUBLAS_STATUS_SUCCESS != status) {
+      std::cerr << "cublasCreate_v2 failed: " << status << std::endl;
+      throw std::runtime_error("cublasCreate_v2 failed");
+    }
+    status = cublasSetStream_v2(handle, ttg::device::current_stream());
+    if (CUBLAS_STATUS_SUCCESS != status) {
+      std::cerr << "cublasSetStream_v2 failed: " << status << std::endl;
+      throw std::runtime_error("cublasSetStream_v2 failed");
+    }
+    auto [iterator, success] = handles.insert({{device, stream}, handle});
+    it = iterator;
+  }
+
+  return it->second;
+}
+
+inline const cusolverDnHandle_t& cusolver_handle() {
+
+  //using map_type = std::map<detail::device_stream_t<cudaStream_t>, cusolverDnHandle_t>;
+  using map_type = std::map<std::pair<int, cudaStream_t>, cusolverDnHandle_t>;
+  static thread_local map_type handles;
+
+  int device = ttg::device::current_device();
+  cudaStream_t stream = ttg::device::current_stream();
+
+  map_type::iterator it;
+  if ((it = handles.find({device, stream})) == handles.end()){
+    cusolverDnHandle_t handle;
+    auto status = cusolverDnCreate(&handle);
+    if (CUSOLVER_STATUS_SUCCESS != status) {
+      std::cerr << "cusolverDnCreate failed: " << status << std::endl;
+      throw std::runtime_error("cusolverDnCreate failed");
+    }
+    status = cusolverDnSetStream(handle, stream);
+    if (CUSOLVER_STATUS_SUCCESS != status) {
+      std::cerr << "cusolverDnSetStream failed: " << status << std::endl;
+      throw std::runtime_error("cusolverDnSetStream failed");
+    }
+
+    std::cout << "Creating cusolver handle " << handle << " for device " << device << " stream " << stream << std::endl;
+    auto [iterator, success] = handles.insert({{device, stream}, handle});
+    it = iterator;
+  } else {
+    std::cout << "Found cusolver handle " << it->second << " for device " << device << " stream " << stream << std::endl;
+  }
+
+  return it->second;
+}
+#endif // TTG_HAVE_CUDART
+
+#ifdef TTG_HAVE_HIPBLAS
+
+#include
+#include
+#include
+
+/// \brief Returns the hipBLAS handle to be used for launching hipBLAS kernels from the current thread
+/// \return the hipBLAS handle for the current thread
+const hipblasHandle_t& hipblas_handle() {
+  static thread_local std::map<int, hipblasHandle_t> handles;
+  int device = ttg::device::current_device();
+  std::map<int, hipblasHandle_t>::iterator it;
+  if ((it = handles.find(device)) == handles.end()){
+    hipblasHandle_t handle;
+    auto status = hipblasCreate(&handle);
+    if
(HIPBLAS_STATUS_SUCCESS != status) { + throw std::runtime_error("hipblasCreate failed"); + } + auto [iterator, success] = handles.insert({device, handle}); + it = iterator; + } + hipblasStatus_t status = hipblasSetStream(it->second, ttg::device::current_stream()); + if (HIPBLAS_STATUS_SUCCESS != status) { + throw std::runtime_error("hipblasSetStream failed"); + } + return it->second; +} + +/// \brief Returns the hipsolver handle to be used for launching rocBLAS kernels from the current thread +/// \return the hipsolver handle for the current thread +const hipsolverDnHandle_t& hipsolver_handle() { + static thread_local std::map handles; + int device = ttg::device::current_device(); + std::map::iterator it; + if ((it = handles.find(device)) == handles.end()){ + hipsolverDnHandle_t handle; + auto status = hipsolverDnCreate(&handle); + if (HIPSOLVER_STATUS_SUCCESS != status) { + throw std::runtime_error("hipsolverCreate failed"); + } + auto [iterator, success] = handles.insert({device, handle}); + it = iterator; + } + hipsolverStatus_t status = hipsolverDnSetStream(it->second, ttg::device::current_stream()); + if (HIPSOLVER_STATUS_SUCCESS != status) { + throw std::runtime_error("hipsolverSetStream failed"); + } + return it->second; +} +#endif // TTG_HAVE_HIPBLAS diff --git a/examples/matrixtile.h b/examples/matrixtile.h index 693ae3d1e..ba2fcf9cd 100644 --- a/examples/matrixtile.h +++ b/examples/matrixtile.h @@ -17,13 +17,13 @@ class MatrixTile : public ttg::TTValue> { using ttvalue_type = ttg::TTValue>; private: - buffer_t b; + buffer_t _buffer; int _rows = 0, _cols = 0, _lda = 0; // (Re)allocate the tile memory void realloc() { // std::cout << "Reallocating new tile" << std::endl; - b.reset(_lda * _cols); + _buffer.reset(_lda * _cols); } public: @@ -31,7 +31,7 @@ class MatrixTile : public ttg::TTValue> { MatrixTile(int rows, int cols, int lda) : ttvalue_type() - , b(lda*cols) + , _buffer(lda*cols) , _rows(rows) , _cols(cols) , _lda(lda) @@ -49,7 +49,7 @@ class MatrixTile : public ttg::TTValue> { */ MatrixTile(int rows, int cols, T* data, int lda) : ttvalue_type() - , b(data, lda*cols) + , _buffer(data, lda*cols) , _rows(rows) , _cols(cols) , _lda(lda) @@ -65,7 +65,7 @@ class MatrixTile : public ttg::TTValue> { * can handle data sharing without excessive copying */ MatrixTile(const MatrixTile& other) : ttvalue_type() - , b(other._lda*other._cols) + , _buffer(other._lda*other._cols) , _rows(other._rows) , _cols(other._cols) , _lda(other._lda) { @@ -85,14 +85,15 @@ class MatrixTile : public ttg::TTValue> { _rows = std::get<0>(meta); _cols = std::get<1>(meta); _lda = std::get<2>(meta); + this->realloc(); } metadata_t get_metadata(void) const { return metadata_t{_rows, _cols, _lda}; } // Accessing the raw data - T* data() { return b.host_ptr(); } + T* data() { return _buffer.host_ptr(); } - const T* data() const { return b.host_ptr(); } + const T* data() const { return _buffer.host_ptr(); } size_t size() const { return _cols * _lda; } @@ -102,9 +103,17 @@ class MatrixTile : public ttg::TTValue> { int lda() const { return _lda; } + buffer_t& buffer() { + return _buffer; + } + + const buffer_t& buffer() const { + return _buffer; + } + auto& fill(T value) { std::fill(data().get(), data().get() + size(), value); - b.set_current_device(0); + _buffer.set_current_device(0); return *this; } diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h index e6230371a..50a57a766 100644 --- a/examples/potrf/potrf.h +++ b/examples/potrf/potrf.h @@ -4,22 +4,24 @@ #include #include "lapack.hh" #include "pmw.h" 
+#include "../devblas_helper.h" #undef DEBUG_TILES_VALUES #if defined(TTG_HAVE_CUDART) #define ES ttg::ExecutionSpace::CUDA #define TASKRET -> ttg::device_task +#include #elif defined(TTG_HAVE_HIP) #define ES ttg::ExecutionSpace::HIP #define TASKRET -> ttg::device_task +#include +#include #else #define ES ttg::ExecutionSpace::Host #define TASKRET -> void #endif - - namespace potrf { /* FLOP macros taken from DPLASMA */ @@ -31,9 +33,9 @@ namespace potrf { static int device_potrf_workspace_size(MatrixTile &A) { int Lwork; #if defined(TTG_HAVE_CUDA) - cusolverDnDpotrf_bufferSize(ttg::detail::cublas_get_handle(), - CUBLAS_FILL_MODE_LOWER, A.extent(1), - nullptr, A.extent(0), + cusolverDnDpotrf_bufferSize(cusolver_handle(), + CUBLAS_FILL_MODE_LOWER, A.cols(), + nullptr, A.lda(), &Lwork); return Lwork; #elif defined(TTG_HAVE_HIPBLAS) @@ -44,18 +46,21 @@ namespace potrf { } static void device_potrf(MatrixTile &A, double *workspace, int Lwork, int *devInfo) { - int device = A.b.get_current_device(); - assert(device != 0); + int device = ttg::device::current_device(); + assert(device >= 0); #if defined(TTG_HAVE_CUDA) - cusolverDnDpotrf(ttg::detail::cublas_get_handle(), - CUBLAS_FILL_MODE_LOWER, A.extent(1), - A.b.device_ptr_on(device), A.extent(0), + //std::cout << "POTRF A " << A.buffer().device_ptr_on(device) << " device " << device << " cols " << A.cols() << " lda " << A.lda() << " Lwork " << Lwork << " WS " << workspace << " devInfo " << devInfo << std::endl; + auto handle = cusolver_handle(); + std::cout << "POTRF handle " << handle << " device " << device << " stream " << ttg::device::current_stream() << std::endl; + cusolverDnDpotrf(handle, + CUBLAS_FILL_MODE_LOWER, A.cols(), + A.buffer().device_ptr_on(device), A.lda(), workspace, Lwork, devInfo); #elif defined(TTG_HAVE_HIPBLAS) - hipsolverDpotrf(ttg::detail::hipblas_get_handle(), - HIPSOLVER_FILL_MODE_LOWER, A.extent(1), - A.b.device_ptr_on(device), A.extent(0), + hipsolverDpotrf(hipsolver_handle(), + HIPSOLVER_FILL_MODE_LOWER, A.cols(), + A.buffer().device_ptr_on(device), A.lda(), workspace, Lwork, devInfo); #endif @@ -69,9 +74,11 @@ namespace potrf { ttg::Edge>& output_trsm, ttg::Edge>& output_result) { using T = typename MatrixT::element_type; + auto iallocator = std::make_shared>(); #if defined(TTG_HAVE_CUDART) || defined(TTG_HAVE_HIP) - std::cout << "Creating CUDA POTRF task " << std::endl; - auto f_dev = [=](const Key1& key, MatrixTile&& A, + //std::cout << "Creating CUDA POTRF task " << std::endl; + auto f_dev = [=, iallocator = std::move(iallocator)] + (const Key1& key, MatrixTile&& tile_kk, std::tuple>, ttg::Out>>& out) TASKRET { const auto K = key[0]; @@ -81,8 +88,10 @@ namespace potrf { std::vector keylist; keylist.reserve(A.rows() - K); /* TODO: reverse order of arrays */ + //std::cout << "POTRF K " << K << " A.rows " << A.rows() << std::endl; for (int m = K + 1; m < A.rows(); ++m) { /* send tile to trsm */ + //std::cout << "POTRF successor " << Key2(m, K) << std::endl; keylist.push_back(Key2(m, K)); } @@ -92,34 +101,43 @@ namespace potrf { // Check pmw.h: when we generate the MatrixTile // Also check pinned allocator at the end of DeviceTensor (250-253) - int Lwork = device_potrf_workspace_size(A); + int Lwork = device_potrf_workspace_size(tile_kk); // Instead of using scratch here, we should have hostWS and hostInfo globals and use to_device // this would reduce the number of I/O operations to devices - double hostWS[Lwork]; - ttg::devicescratch devWS = ttg::make_scratch(hostWS, ttg::scope::Allocate); - int hostInfo = -1; - 
ttg::devicescratch devInfo = ttg::make_scratch(&hostInfo, ttg::scope::Allocate); + double *hostWS = new double[Lwork]; + ttg::devicescratch devWS = ttg::make_scratch(hostWS, ttg::scope::Allocate, Lwork); + int *hostInfo = iallocator->allocate(1); + ttg::devicescratch devInfo = ttg::make_scratch(hostInfo, ttg::scope::Allocate); + + *hostInfo = -32; /* the workspace and the devInfo must be device-level pointers */ - co_await ttg::to_device(A._data, devWS, devInfo); + co_await ttg::to_device(tile_kk.buffer(), devWS, devInfo); + + //std::cout << "devWS host ptr " << hostWS << " device ptr " << devWS.device_ptr() << " size " << devWS.size() + // << " devInfo host ptr " << hostInfo << " device ptr " << devInfo.device_ptr() << "size " << devInfo.size() << std::endl; /* everything is on the device, call the POTRF */ - device_potrf(A, devWS, Lwork, devInfo); + device_potrf(tile_kk, devWS.device_ptr(), Lwork, devInfo.device_ptr()); /* wait for the kernel to complete */ co_await ttg::wait_kernel(devInfo); - if( hostInfo == 0 ) { - co_await ttg::device::forward(ttg::device::broadcast<0, 1>(std::make_tuple(Key2(K, K), std::move(keylist)), std::move(A), out)); + delete[] hostWS; + int info = *hostInfo; + assert(info == 0); + iallocator->deallocate(hostInfo, 1); + if( info == 0 ) { + co_await ttg::device::forward(ttg::device::broadcast<0, 1>(std::make_tuple(Key2(K, K), std::move(keylist)), std::move(tile_kk), out)); // Anything after this co_await is never executed // co_return would look better, but co_return would destroy keylist before the runtime can handle it } else { // Well... Here we should interrupt the DAG of tasks, there is an error. Raise? - std::cerr << "Factorization is SUSPICIOUS (the matrix might not be diagonally dominant)" << std::endl; + std::cerr << "Factorization of tile " << K << " failed: " << info << std::endl; ttg::abort(); } - } + }; return ttg::make_tt(f_dev, ttg::edges(ttg::fuse(input, input_disp)), ttg::edges(output_result, output_trsm), "POTRF", {"tile_kk/dispatcher"}, {"output_result", "output_trsm"}); #else /* defined(TTG_HAVE_CUDART) || defined(TTG_HAVE_HIP) */ @@ -202,23 +220,23 @@ namespace potrf { } - co_await ttg::to_device(tile_kk.b, tile_mk.b); - int device = tile_kk.b.get_current_device(); + co_await ttg::to_device(tile_kk.buffer(), tile_mk.buffer()); + int device = ttg::device::current_device(); double alpha = 1.0; #if defined(TTG_HAVE_CUDA) - cublasDtrsm(ttg::detail::cublas_get_handle(), + cublasDtrsm(cublas_handle(), CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, mb, nb, &alpha, - tile_kk.b.device_ptr_on(device), tile_kk.lda(), - tile_mk.b.device_ptr_on(device), tile_mk.lda()); + tile_kk.buffer().device_ptr_on(device), tile_kk.lda(), + tile_mk.buffer().device_ptr_on(device), tile_mk.lda()); #elif defined(TTG_HAVE_HIPBLAS) - hipblasDtrsm(ttg::detail:hipblas_get_handle(), + hipblasDtrsm(hipblas_handle(), HIPBLAS_SIDE_RIGHT, HIPBLAS_FILL_MODE_LOWER, HIPBLAS_OP_T, HIPBLAS_DIAG_NON_UNIT, mb, nb, &alpha, - tile_kk.b.device_ptr_on(device), tile_kk.lda(), - tile_mk.b.device_ptr_on(device), tile_mk.lda()); + tile_kk.buffer().device_ptr_on(device), tile_kk.lda(), + tile_mk.buffer().device_ptr_on(device), tile_mk.lda()); #endif co_await ttg::device::forward(ttg::device::broadcast<0, 1, 2, 3>(std::make_tuple(key, Key2(K, M), keylist_row, keylist_col), @@ -304,24 +322,26 @@ namespace potrf { if (ttg::tracing()) ttg::print("SYRK(", key, ")"); - co_await ttg::to_device(tile_kk.b, tile_mk.b); + co_await ttg::to_device(tile_kk.buffer(), 
tile_mk.buffer()); + + int device = ttg::device::current_device(); double alpha = -1.0; double beta = 1.0; #if defined(TTG_HAVE_CUDA) - cublasDsyrk(ttg::detail::cublas_get_handle(), + cublasDsyrk(cublas_handle(), CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, mb, nb, &alpha, - tile_nk.b.device_ptr_on(device), tile_mk.lda(), &beta, - tile_kk.b.device_ptr_on(device), tile_kk.lda()); + tile_mk.buffer().device_ptr_on(device), tile_mk.lda(), &beta, + tile_kk.buffer().device_ptr_on(device), tile_kk.lda()); #elif defined(TTG_HAVE_HIPBLAS) - hipblasDsyrk(ttg::detail:hipblas_get_handle(), + hipblasDsyrk(hipblas_handle(), HIPBLAS_FILL_MODE_LOWER, HIPBLAS_OP_N, mb, nb, &alpha, - tile_kk.b.device_ptr_on(device), tile_kk.lda(), &beta, - tile_mk.b.device_ptr_on(device), tile_mk.lda()); + tile_mk.buffer().device_ptr_on(device), tile_mk.lda(), &beta, + tile_kk.buffer().device_ptr_on(device), tile_kk.lda()); #endif if (M == K + 1) { @@ -404,26 +424,27 @@ namespace potrf { << N << ") is " << tile_nk << " and A(" << M << ", " << N << ") is " << tile_mn; #endif - co_await ttg::to_device(tile_mk.b, tile_nk.b, tile_mn.b); + co_await ttg::to_device(tile_mk.buffer(), tile_nk.buffer(), tile_mn.buffer()); + int device = ttg::device::current_device(); double alpha = -1.0; double beta = 1.0; #if defined(TTG_HAVE_CUDA) - cublasDgemm(ttg::detail:cublas_get_handle(), + cublasDgemm(cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_T, tile_mk.rows(), tile_nk.rows(), tile_nk.cols(), &alpha, - tile_mk.data(), tile_mk.lda(), - tile_nk.data(), tile_nk.lda(), &beta, - tile_mn.data(), tile_mn.lda()); + tile_mk.buffer().device_ptr_on(device), tile_mk.lda(), + tile_nk.buffer().device_ptr_on(device), tile_nk.lda(), &beta, + tile_mn.buffer().device_ptr_on(device), tile_mn.lda()); #elif defined(TTG_HAVE_HIPBLAS) - hipblasDgemm(ttg::detail:hipblas_get_handle(), + hipblasDgemm(hipblas_handle(), HIPBLAS_OP_N, HIPBLAS_OP_T, tile_mk.rows(), tile_nk.rows(), tile_nk.cols(), &alpha, - tile_mk.data(), tile_mk.lda(), - tile_nk.data(), tile_nk.lda(), &beta, - tile_mn.data(), tile_mn.lda()); + tile_mk.buffer().device_ptr_on(device), tile_mk.lda(), + tile_nk.buffer().device_ptr_on(device), tile_nk.lda(), &beta, + tile_mn.buffer().device_ptr_on(device), tile_mn.lda()); #endif if (N == K + 1) { diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt index a60f4f833..476118524 100644 --- a/ttg/CMakeLists.txt +++ b/ttg/CMakeLists.txt @@ -69,6 +69,7 @@ set(ttg-impl-headers ) set(ttg_device_headers ${CMAKE_CURRENT_SOURCE_DIR}/ttg/device/cublas_helper.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/device/device.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/device/task.h ) set(ttg-headers diff --git a/ttg/ttg/device/device.h b/ttg/ttg/device/device.h new file mode 100644 index 000000000..17f8364f1 --- /dev/null +++ b/ttg/ttg/device/device.h @@ -0,0 +1,61 @@ + +#include + +#if defined(TTG_HAVE_CUDA) +#include + +namespace ttg::device { + namespace detail { + inline thread_local int current_device_ts = 0; + inline thread_local cudaStream_t current_stream_ts = 0; // default stream + + void reset_current() { + current_device_ts = 0; + current_stream_ts = 0; + } + + void set_current(int device, cudaStream_t stream) { + current_device_ts = device; + current_stream_ts = stream; + } + } // namespace detail + + int current_device() { + return detail::current_device_ts; + } + + cudaStream_t current_stream() { + return detail::current_stream_ts; + } +} // namespace ttg + +#elif defined(TTG_HAVE_HIP) + +#include + +namespace ttg::device { + namespace detail { + inline thread_local int current_device_ts = 0; + 
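+    // mirrors the CUDA branch above: the PaRSEC backend calls
+    // detail::set_current() right before resuming a device task, so that
+    // current_device() and current_stream() report where the task is running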
inline thread_local hipStream_t current_stream_ts = 0; // default stream + + void reset_current() { + current_device_ts = 0; + current_stream_ts = 0; + } + + void set_current(int device, hipStream_t stream) { + current_device_ts = device; + current_stream_ts = stream; + } + } // namespace detail + + int current_device() { + return detail::current_device_ts; + } + + hipStream_t current_stream() { + return detail::current_stream_ts; + } +} // namespace ttg + +#endif // defined(TTG_HAVE_HIP) diff --git a/ttg/ttg/device/task.h b/ttg/ttg/device/task.h index b275da195..c51990d99 100644 --- a/ttg/ttg/device/task.h +++ b/ttg/ttg/device/task.h @@ -134,7 +134,8 @@ namespace ttg { } void unhandled_exception() { - + std::cerr << "Send coroutine caught an unhandled exception!" << std::endl; + throw; // fwd } }; @@ -284,9 +285,9 @@ namespace ttg { typename... out_keysT, typename... out_valuesT> inline void prepare_broadcast(const std::tuple &keylists, valueT &&value, std::tuple...> &t) { - std::get(t)->prepare_send(std::get(keylists), std::forward(value)); + std::get(t).prepare_send(std::get(keylists), std::forward(value)); if constexpr (sizeof...(Is) > 0) { - detail::prepare_broadcast(keylists, std::forward(value), t); + prepare_broadcast(keylists, std::forward(value), t); } } @@ -299,7 +300,7 @@ namespace ttg { auto *terminal_ptr = ttg::detail::get_out_terminal(I, "ttg::device::broadcast(keylists, value)"); terminal_ptr->prepare_send(std::get(keylists), value); if constexpr (sizeof...(Is) > 0) { - detail::prepare_broadcast(keylists, std::forward(value)); + prepare_broadcast(keylists, std::forward(value)); } } @@ -307,7 +308,7 @@ namespace ttg { typename... out_keysT, typename... out_valuesT> inline void broadcast(const std::tuple &keylists, valueT &&value, std::tuple...> &t) { - std::get(t)->broadcast(std::get(keylists), std::forward(value)); + std::get(t).broadcast(std::get(keylists), std::forward(value)); if constexpr (sizeof...(Is) > 0) { detail::broadcast(keylists, std::forward(value), t); } @@ -322,7 +323,7 @@ namespace ttg { auto *terminal_ptr = ttg::detail::get_out_terminal(I, "ttg::device::broadcast(keylists, value)"); terminal_ptr->broadcast(std::get(keylists), value); if constexpr (sizeof...(Is) > 0) { - detail::broadcast(keylists, std::forward(value)); + ttg::device::detail::broadcast(keylists, std::forward(value)); } } @@ -340,12 +341,12 @@ namespace ttg { // treat as tuple prepare_broadcast<0, I, Is...>(kl, std::forward>(value), t); co_await ttg::Void{}; // we'll come back once the task is done - broadcast<0, I, Is...>(kl, std::forward>(value), t); + ttg::device::detail::broadcast<0, I, Is...>(kl, std::forward>(value), t); } else if constexpr (!ttg::meta::is_tuple_v) { // create a tie to the captured keylist prepare_broadcast<0, I, Is...>(std::tie(kl), std::forward>(value), t); co_await ttg::Void{}; // we'll come back once the task is done - broadcast<0, I, Is...>(std::tie(kl), std::forward>(value), t); + ttg::device::detail::broadcast<0, I, Is...>(std::tie(kl), std::forward>(value), t); } } @@ -363,12 +364,12 @@ namespace ttg { "Size of keylist tuple must match the number of output terminals"); prepare_broadcast<0, I, Is...>(kl, std::forward>(value)); co_await ttg::Void{}; // we'll come back once the task is done - broadcast<0, I, Is...>(kl, std::forward>(value)); + ttg::device::detail::broadcast<0, I, Is...>(kl, std::forward>(value)); } else if constexpr (!ttg::meta::is_tuple_v) { // create a tie to the captured keylist prepare_broadcast<0, I, Is...>(std::tie(kl), std::forward>(value)); 
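        // Note the two-phase protocol: prepare_broadcast() above registers the
        // value with the output terminals before the coroutine suspends, and the
        // broadcast() below performs the actual sends once the runtime resumes
        // the coroutine after the device kernel has completed.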
co_await ttg::Void{}; // we'll come back once the task is done - broadcast<0, I, Is...>(std::tie(kl), std::forward>(value)); + ttg::device::detail::broadcast<0, I, Is...>(std::tie(kl), std::forward>(value)); } } } // namespace detail @@ -379,12 +380,12 @@ namespace ttg { [[nodiscard]] inline detail::send_t broadcast(rangeT &&keylist, valueT &&value, - const std::tuple...> &t) { + std::tuple...> &t) { ttg::detail::value_copy_handler copy_handler; return detail::send_t{ - broadcast_coro<0, I, Is...>(std::forward(keylist), - copy_handler(std::forward(value)), - t, std::move(copy_handler))}; + detail::broadcast_coro(std::forward(keylist), + copy_handler(std::forward(value)), + t, std::move(copy_handler))}; } /* overload with implicit terminals and keylist passed by const reference */ @@ -392,8 +393,8 @@ namespace ttg { ttg::Runtime Runtime = ttg::ttg_runtime> inline detail::send_t broadcast(rangeT &&keylist, valueT &&value) { ttg::detail::value_copy_handler copy_handler; - return detail::send_t{broadcast_coro<0, i>(std::tie(keylist), copy_handler(std::forward(value)), - std::move(copy_handler))}; + return detail::send_t{broadcast_coro(std::tie(keylist), copy_handler(std::forward(value)), + std::move(copy_handler))}; } template @@ -577,7 +578,10 @@ namespace ttg { device_task get_return_object() { return device_task{device_task_handle_type::from_promise(*this)}; } - void unhandled_exception() { } + void unhandled_exception() { + std::cerr << "Task coroutine caught an unhandled exception!" << std::endl; + throw; // fwd + } //using iterator = std::vector::iterator; diff --git a/ttg/ttg/devicescratch.h b/ttg/ttg/devicescratch.h index 37510436f..729361fa2 100644 --- a/ttg/ttg/devicescratch.h +++ b/ttg/ttg/devicescratch.h @@ -11,7 +11,7 @@ using devicescratch = TTG_IMPL_NS::devicescratch; template auto make_scratch(T* val, ttg::scope scope, std::size_t count = 1) { - return devicescratch(val, scope, 1); + return devicescratch(val, scope, count); } namespace detail { diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h index 00c7c996e..aac5de428 100644 --- a/ttg/ttg/parsec/buffer.h +++ b/ttg/ttg/parsec/buffer.h @@ -19,8 +19,8 @@ namespace ttg_parsec { namespace detail { // fwd decl - template - parsec_data_t* get_parsec_data(const ttg_parsec::buffer& db); + template + parsec_data_t* get_parsec_data(const ttg_parsec::buffer& db); } // namespace detail /** @@ -59,7 +59,7 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t // nothing to be done, we don't own the memory } - friend parsec_data_t* detail::get_parsec_data(const ttg_parsec::buffer&); + friend parsec_data_t* detail::get_parsec_data(const ttg_parsec::buffer&); allocator_type& get_allocator_reference() { return static_cast(*this); } @@ -74,7 +74,7 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t public: /* The device ID of the CPU. */ - static constexpr int cpu_device = 0; + static constexpr int cpu_device = -2; buffer() : buffer(nullptr, 0) { } @@ -151,15 +151,16 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t /* make sure it's a valid device */ assert(parsec_nb_devices > device_id); /* make sure it's a valid copy */ - assert(m_data->device_copies[device_id] != nullptr); - m_data->owner_device = device_id; + assert(m_data->device_copies[device_id+2] != nullptr); + m_data->owner_device = device_id+2; } /* get the current device ID, i.e., the last updated - * device buffer. */ + * device buffer. A value of -2 designates the host + * as the current device. 
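+   * (PaRSEC numbers devices internally as 0: host, 1: recursive device,
+   * 2: first accelerator; the id returned here is shifted by 2, so that
+   * id 0 is the first accelerator.)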
*/ int get_current_device() const { assert(is_valid()); - return m_data->owner_device; + return m_data->owner_device - 2; // 0: host, 1: recursive, 2: first device } /* get the current device pointer */ @@ -179,7 +180,7 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t */ element_type* device_ptr_on(int device_id) { assert(is_valid()); - return static_cast(parsec_data_get_ptr(m_data.get(), device_id)); + return static_cast(parsec_data_get_ptr(m_data.get(), device_id + 2)); } /* get the device pointer at the given device @@ -187,20 +188,20 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t */ const element_type* device_ptr_on(int device_id) const { assert(is_valid()); - return static_cast(parsec_data_get_ptr(m_data.get(), device_id)); + return static_cast(parsec_data_get_ptr(m_data.get(), device_id + 2)); // GPUs start at 2 } element_type* host_ptr() { - return device_ptr_on(cpu_device); + return static_cast(parsec_data_get_ptr(m_data.get(), 0)); } const element_type* host_ptr() const { - return device_ptr_on(cpu_device); + return static_cast(parsec_data_get_ptr(m_data.get(), 0)); } bool is_valid_on(int device_id) const { assert(is_valid()); - return (parsec_data_get_ptr(m_data.get(), device_id) != nullptr); + return (parsec_data_get_ptr(m_data.get(), device_id+2) != nullptr); } void allocate_on(int device_id) { @@ -222,7 +223,7 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t /* Unpin the memory on all devices we currently track. */ void unpin() { if (!is_valid()) return; - for (int i = 1; i < parsec_nb_devices; ++i) { + for (int i = 0; i < parsec_nb_devices-2; ++i) { unpin_on(i); } } @@ -362,20 +363,20 @@ template struct is_buffer : std::false_type { }; -template -struct is_buffer> : std::true_type +template +struct is_buffer> : std::true_type { }; -template -struct is_buffer> : std::true_type +template +struct is_buffer> : std::true_type { }; template constexpr static const bool is_buffer_v = is_buffer::value; namespace detail { - template - parsec_data_t* get_parsec_data(const ttg_parsec::buffer& db) { + template + parsec_data_t* get_parsec_data(const ttg_parsec::buffer& db) { return const_cast(db.m_data.get()); } } // namespace detail diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 288545dbb..2bca31533 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -113,6 +113,8 @@ #include "ttg/device/cublas_helper.h" #include "ttg/parsec/parsec-ext.h" +#include "ttg/device/device.h" + #undef TTG_PARSEC_DEBUG_TRACK_DATA_COPIES /* PaRSEC function declarations */ @@ -1323,15 +1325,35 @@ namespace ttg_parsec { dev_data.state() == ttg::TTG_DEVICE_CORO_WAIT_KERNEL); #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDART) - parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)gpu_stream; - ttg::detail::cublas_set_kernel_stream(cuda_stream->cuda_stream); + { + parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)gpu_stream; + ttg::detail::cublas_set_kernel_stream(cuda_stream->cuda_stream); + } #endif // PARSEC_HAVE_DEV_CUDA_SUPPORT +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA) + { + parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)gpu_stream; + int device = gpu_device->super.device_index - 2; // 0: host, 1: recursive, 2: first GPU + ttg::device::detail::set_current(device, cuda_stream->cuda_stream); + } +#endif // defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA) + #if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) && defined(TTG_HAVE_HIPBLAS) - 
parsec_hip_exec_stream_t *hip_stream = (parsec_hip_exec_stream_t *)gpu_stream; - ttg::detail::hipblas_set_kernel_stream(hip_stream->hip_stream); + { + parsec_hip_exec_stream_t *hip_stream = (parsec_hip_exec_stream_t *)gpu_stream; + ttg::detail::hipblas_set_kernel_stream(hip_stream->hip_stream); + } #endif // PARSEC_HAVE_DEV_HIP_SUPPORT +#if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) && defined(TTG_HAVE_HIP) + { + parsec_cuda_exec_stream_t *hip_stream = (parsec_hip_exec_stream_t *)gpu_stream; + int device = gpu_device->super.device_index - 2; // 0: host, 1: recursive, 2: first GPU + ttg::device::detail::set_current(device, hip_stream->hip_stream); + } +#endif // defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA) + /* Here we call back into the coroutine again after the transfers have completed */ static_op(&task->parsec_task); From b60577c62430b2b41e44b1898f4f3ab733430d57 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 3 Oct 2023 17:51:01 +0000 Subject: [PATCH 08/17] Fix release of data copies in buffer This fixes a segfault in multi-gpu operations. We should not destroy the full data_t but only remove the host-side copy and release our reference on the data. The data will be destroyed once all gpu copies have been removed. Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/devicefunc.h | 4 ++-- ttg/ttg/parsec/ttg_data_copy.h | 11 +++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/ttg/ttg/parsec/devicefunc.h b/ttg/ttg/parsec/devicefunc.h index d250e060e..703b42fb4 100644 --- a/ttg/ttg/parsec/devicefunc.h +++ b/ttg/ttg/parsec/devicefunc.h @@ -117,7 +117,7 @@ namespace ttg_parsec { /* enqueue the transfer into the compute stream to come back once the compute and transfer are complete */ #if defined(TTG_HAVE_CUDART) && defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) - std::cout << "cudaMemcpyAsync of " << data->nb_elts << "B" << std::endl; + //std::cout << "cudaMemcpyAsync of " << data->nb_elts << "B from " << data->device_copies[data->owner_device]->device_private << " device " << (int)data->owner_device << " to " << data->device_copies[0]->device_private << std::endl; parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)stream; cudaMemcpyAsync(data->device_copies[0]->device_private, data->device_copies[data->owner_device]->device_private, @@ -176,4 +176,4 @@ namespace ttg_parsec { } // namespace ttg_parsec -#endif // TTG_PARSEC_DEVICEFUNC_H \ No newline at end of file +#endif // TTG_PARSEC_DEVICEFUNC_H diff --git a/ttg/ttg/parsec/ttg_data_copy.h b/ttg/ttg/parsec/ttg_data_copy.h index 27814ecf5..9d21cc2d1 100644 --- a/ttg/ttg/parsec/ttg_data_copy.h +++ b/ttg/ttg/parsec/ttg_data_copy.h @@ -62,9 +62,16 @@ namespace ttg_parsec { status = cudaHostUnregister(data->device_copies[0]->device_private); assert(cudaSuccess == status); data->device_copies[0]->flags ^= TTG_PARSEC_DATA_FLAG_REGISTERED; - } + } #endif // PARSEC_HAVE_DEV_CUDA_SUPPORT - parsec_data_destroy(data); + //std::fprintf(stderr, "parsec_data_destroy %p device_copy[0] %p\n", data, data->device_copies[0]); + //parsec_data_destroy(data); + assert(data->device_copies[0] != nullptr); + auto copy = data->device_copies[0]; + parsec_data_copy_detach(data, data->device_copies[0], 0); + PARSEC_OBJ_RELEASE(copy); + PARSEC_OBJ_RELEASE(data); + } static void delete_null_parsec_data(parsec_data_t *) { From a2f4bbeed5f17cc433354ed88e7385e8fad3dd3b Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 4 Oct 2023 01:08:13 +0000 Subject: [PATCH 09/17] Enable POTRF on hip Signed-off-by: Joseph 
Schuchart --- CMakeLists.txt | 5 +++++ examples/devblas_helper.h | 6 +++--- examples/potrf/potrf.h | 12 ++++++++---- examples/spmm/spmm_cuda.cc | 6 ++++-- ttg/ttg/device/cublas_helper.cpp | 2 +- ttg/ttg/parsec/devicefunc.h | 5 +++++ ttg/ttg/parsec/ttg.h | 4 ++++ 7 files changed, 30 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 06169f4e8..b6b9c39fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -120,6 +120,11 @@ if (TTG_ENABLE_HIP) if (TARGET roc::hipblas) set(TTG_HAVE_HIPBLAS True CACHE BOOL "TTG detected support for hipBLAS") endif() + + find_package(hipsolver) + if (TARGET roc::hipsolver) + set(TTG_HAVE_HIPSOLVER True CACHE BOOL "TTG detected support for hipSolver") + endif() add_compile_definitions(${TTG_HIP_PLATFORM}) endif(TTG_ENABLE_HIP) diff --git a/examples/devblas_helper.h b/examples/devblas_helper.h index 704a2bc85..10c420111 100644 --- a/examples/devblas_helper.h +++ b/examples/devblas_helper.h @@ -108,9 +108,9 @@ inline const cusolverDnHandle_t& cusolver_handle() { #ifdef TTG_HAVE_HIPBLAS -#include -#include -#include +#include +#include +#include /// \brief Returns the rocBLAS handle to be used for launching rocBLAS kernels from the current thread /// \return the rocBLAS handle for the current thread diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h index 50a57a766..24bdd44d2 100644 --- a/examples/potrf/potrf.h +++ b/examples/potrf/potrf.h @@ -15,8 +15,8 @@ #elif defined(TTG_HAVE_HIP) #define ES ttg::ExecutionSpace::HIP #define TASKRET -> ttg::device_task -#include -#include +#include +#include #else #define ES ttg::ExecutionSpace::Host #define TASKRET -> void @@ -39,7 +39,11 @@ namespace potrf { &Lwork); return Lwork; #elif defined(TTG_HAVE_HIPBLAS) - #error TBCoded + hipsolverDnDpotrf_bufferSize(hipsolver_handle(), + HIPSOLVER_FILL_MODE_LOWER, A.cols(), + nullptr, A.lda(), + &Lwork); + return Lwork; #else return 0; #endif @@ -48,7 +52,7 @@ namespace potrf { static void device_potrf(MatrixTile &A, double *workspace, int Lwork, int *devInfo) { int device = ttg::device::current_device(); assert(device >= 0); - #if defined(TTG_HAVE_CUDA) +#if defined(TTG_HAVE_CUDA) //std::cout << "POTRF A " << A.buffer().device_ptr_on(device) << " device " << device << " cols " << A.cols() << " lda " << A.lda() << " Lwork " << Lwork << " WS " << workspace << " devInfo " << devInfo << std::endl; auto handle = cusolver_handle(); std::cout << "POTRF handle " << handle << " device " << device << " stream " << ttg::device::current_stream() << std::endl; diff --git a/examples/spmm/spmm_cuda.cc b/examples/spmm/spmm_cuda.cc index 05a6599b9..3f6ef0dec 100644 --- a/examples/spmm/spmm_cuda.cc +++ b/examples/spmm/spmm_cuda.cc @@ -37,6 +37,8 @@ #include "ttg.h" +#include "../devblas_helper.h" + using namespace ttg; #include "ttg/util/future.h" @@ -273,13 +275,13 @@ static void device_gemm(blk_t &C, const blk_t &A, const blk_t &B) { int device = C.b.get_current_device(); assert(device != 0); #if defined(TTG_HAVE_CUDA) - cublasDgemm(ttg::detail::cublas_get_handle(), + cublasDgemm(cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, C.extent(0), C.extent(1), A.extent(1), &alpha, A.b.device_ptr_on(device), A.extent(0), B.b.device_ptr_on(device), B.extent(0), &beta, C.b.current_device_ptr(), C.extent(0)); #elif defined(TTG_HAVE_HIPBLAS) - hipblasDgemm(ttg::detail::hipblas_get_handle(), + hipblasDgemm(hipblas_handle(), HIPBLAS_OP_N, HIPBLAS_OP_N, C.extent(0), C.extent(1), A.extent(1), &alpha, A.b.device_ptr_on(device), A.extent(0), diff --git 
a/ttg/ttg/device/cublas_helper.cpp b/ttg/ttg/device/cublas_helper.cpp index 24e34f52c..768411282 100644 --- a/ttg/ttg/device/cublas_helper.cpp +++ b/ttg/ttg/device/cublas_helper.cpp @@ -10,7 +10,7 @@ #endif // TTG_HAVE_CUDART #ifdef TTG_HAVE_HIPBLAS -#include +#include #endif // TTG_HAVE_HIPBLAS namespace ttg::detail { diff --git a/ttg/ttg/parsec/devicefunc.h b/ttg/ttg/parsec/devicefunc.h index 703b42fb4..8686588f4 100644 --- a/ttg/ttg/parsec/devicefunc.h +++ b/ttg/ttg/parsec/devicefunc.h @@ -122,6 +122,11 @@ namespace ttg_parsec { cudaMemcpyAsync(data->device_copies[0]->device_private, data->device_copies[data->owner_device]->device_private, data->nb_elts, cudaMemcpyDeviceToHost, cuda_stream->cuda_stream); +#elif defined(TTG_HAVE_HIP) && defined(PARSEC_HAVE_DEV_HIP_SUPPORT) + parsec_hip_exec_stream_t *hip_stream = (parsec_hip_exec_stream_t *)stream; + hipMemcpyAsync(data->device_copies[0]->device_private, + data->device_copies[data->owner_device]->device_private, + data->nb_elts, hipMemcpyDeviceToHost, hip_stream->hip_stream); #else static_assert(DeviceAvail, "No device implementation detected!"); #endif // defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 2bca31533..422e9f1ef 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -1324,12 +1324,14 @@ namespace ttg_parsec { assert(dev_data.state() == ttg::TTG_DEVICE_CORO_WAIT_TRANSFER || dev_data.state() == ttg::TTG_DEVICE_CORO_WAIT_KERNEL); +#if 0 #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDART) { parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)gpu_stream; ttg::detail::cublas_set_kernel_stream(cuda_stream->cuda_stream); } #endif // PARSEC_HAVE_DEV_CUDA_SUPPORT +#endif //0 #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA) { @@ -1339,12 +1341,14 @@ namespace ttg_parsec { } #endif // defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA) +#if 0 #if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) && defined(TTG_HAVE_HIPBLAS) { parsec_hip_exec_stream_t *hip_stream = (parsec_hip_exec_stream_t *)gpu_stream; ttg::detail::hipblas_set_kernel_stream(hip_stream->hip_stream); } #endif // PARSEC_HAVE_DEV_HIP_SUPPORT +#endif // 0 #if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) && defined(TTG_HAVE_HIP) { From 965433580334b87d46f7e197fe3f9c47a5929181 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 4 Oct 2023 02:41:55 +0000 Subject: [PATCH 10/17] POTRF: add -n parameters to control number of iterations Signed-off-by: Joseph Schuchart --- examples/potrf/testing_dpotrf.cc | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/examples/potrf/testing_dpotrf.cc b/examples/potrf/testing_dpotrf.cc index 76055b4d4..402f50dd9 100644 --- a/examples/potrf/testing_dpotrf.cc +++ b/examples/potrf/testing_dpotrf.cc @@ -36,6 +36,7 @@ int main(int argc, char **argv) const char* prof_filename = nullptr; char *opt = nullptr; int ret = EXIT_SUCCESS; + int niter = 3; if( (opt = getCmdOption(argv+1, argv+argc, "-N")) != nullptr ) { N = M = atoi(opt); @@ -53,6 +54,10 @@ int main(int argc, char **argv) prof_filename = opt; } + if( (opt = getCmdOption(argv+1, argv+argc, "-n")) != nullptr) { + niter = atoi(opt); + } + bool check = !cmdOptionExists(argv+1, argv+argc, "-x"); bool cow_hint = !cmdOptionExists(argv+1, argv+argc, "-w"); @@ -128,16 +133,23 @@ int main(int argc, char **argv) std::cout << "==== end dot ====\n"; beg = std::chrono::high_resolution_clock::now(); } - init_tt->invoke(); - ttg::execute(world); - 
ttg::fence(world); - if (world.rank() == 0) { - end = std::chrono::high_resolution_clock::now(); - auto elapsed = (std::chrono::duration_cast(end - beg).count()); - end = std::chrono::high_resolution_clock::now(); - std::cout << "TTG Execution Time (milliseconds) : " - << elapsed / 1E3 << " : Flops " << (potrf::FLOPS_DPOTRF(N)) << " " << (potrf::FLOPS_DPOTRF(N)/1e9)/(elapsed/1e6) << " GF/s" << std::endl; + for (int i = 0; i < niter; ++i) { + if (world.rank() == 0) { + beg = std::chrono::high_resolution_clock::now(); + } + + init_tt->invoke(); + ttg::execute(world); + ttg::fence(world); + + if (world.rank() == 0) { + end = std::chrono::high_resolution_clock::now(); + auto elapsed = (std::chrono::duration_cast(end - beg).count()); + end = std::chrono::high_resolution_clock::now(); + std::cout << "TTG Execution Time (milliseconds) : " + << elapsed / 1E3 << " : Flops " << (potrf::FLOPS_DPOTRF(N)) << " " << (potrf::FLOPS_DPOTRF(N)/1e9)/(elapsed/1e6) << " GF/s" << std::endl; + } } world.dag_off(); From 410dc23120fdf7dbdb9147a0957f39e04464c78c Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 4 Oct 2023 02:43:11 +0000 Subject: [PATCH 11/17] Minor fixes to compile on Frontier --- examples/devblas_helper.h | 46 +++++++------------------------------ ttg/ttg/parsec/devicefunc.h | 2 ++ ttg/ttg/parsec/ttg.h | 2 +- 3 files changed, 11 insertions(+), 39 deletions(-) diff --git a/examples/devblas_helper.h b/examples/devblas_helper.h index 10c420111..349025209 100644 --- a/examples/devblas_helper.h +++ b/examples/devblas_helper.h @@ -11,42 +11,10 @@ #include #include -namespace detail { - template - struct device_stream_t { - int device; - Stream stream; - device_stream_t(int device, Stream stream) - : device(device) - , stream(stream) - { } - - bool operator<(const device_stream_t& ds) const { - bool result = ((device < ds.device) && (reinterpret_cast(stream) < reinterpret_cast(ds.stream))); - std::cout << *this << " < " << ds << ": " << result << std::endl; - return result; - } - - bool operator==(const device_stream_t& ds) const { - bool result = ((device == ds.device) && (stream == ds.stream)); - std::cout << *this << " == " << ds << ": " << result << std::endl; - return result; - } - }; -} // namespace detail - -namespace std { -template - std::ostream& operator<<(std::ostream& os, const ::detail::device_stream_t& ds) { - os << "[" << ds.device << ", " << ds.stream << "]"; - return os; - } - -} //namespace std - /// \brief Returns the cuBLAS handle to be used for launching cuBLAS kernels from the current thread /// \return the cuBLAS handle for the current thread -inline const cublasHandle_t& cublas_handle() { +template +inline const cublasHandle_t& cublas_handle(T _ = 0) { using map_type = std::map, cublasHandle_t>; static thread_local map_type handles; @@ -72,9 +40,9 @@ inline const cublasHandle_t& cublas_handle() { return it->second; } -inline const cusolverDnHandle_t& cusolver_handle() { +template +inline const cusolverDnHandle_t& cusolver_handle(T _ = 0) { - //using map_type = std::map, cusolverDnHandle_t>; using map_type = std::map, cusolverDnHandle_t>; static thread_local map_type handles; @@ -114,7 +82,8 @@ inline const cusolverDnHandle_t& cusolver_handle() { /// \brief Returns the rocBLAS handle to be used for launching rocBLAS kernels from the current thread /// \return the rocBLAS handle for the current thread -const hipblasHandle_t& hipblas_handle() { +template +const hipblasHandle_t& hipblas_handle(T _ = 0) { static thread_local std::map handles; int device = 
ttg::device::current_device(); std::map::iterator it; @@ -136,7 +105,8 @@ const hipblasHandle_t& hipblas_handle() { /// \brief Returns the hipsolver handle to be used for launching rocBLAS kernels from the current thread /// \return the hipsolver handle for the current thread -const hipsolverDnHandle_t& hipsolver_handle() { +template +const hipsolverDnHandle_t& hipsolver_handle(T _ = 0) { static thread_local std::map handles; int device = ttg::device::current_device(); std::map::iterator it; diff --git a/ttg/ttg/parsec/devicefunc.h b/ttg/ttg/parsec/devicefunc.h index 8686588f4..325750e04 100644 --- a/ttg/ttg/parsec/devicefunc.h +++ b/ttg/ttg/parsec/devicefunc.h @@ -11,6 +11,8 @@ #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) #include +#elif defined(PARSEC_HAVE_DEV_HIP_SUPPORT) +#include #endif // PARSEC_HAVE_DEV_CUDA_SUPPORT namespace ttg_parsec { diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 422e9f1ef..ea2063d56 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -1352,7 +1352,7 @@ namespace ttg_parsec { #if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) && defined(TTG_HAVE_HIP) { - parsec_cuda_exec_stream_t *hip_stream = (parsec_hip_exec_stream_t *)gpu_stream; + parsec_hip_exec_stream_t *hip_stream = (parsec_hip_exec_stream_t *)gpu_stream; int device = gpu_device->super.device_index - 2; // 0: host, 1: recursive, 2: first GPU ttg::device::detail::set_current(device, hip_stream->hip_stream); } From 5b81ac4dc380af7473c33fecbbd6f2ef64f614f5 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 5 Oct 2023 15:12:41 +0000 Subject: [PATCH 12/17] buffer: cannot capture this pointer for deleter and move, use raw pointer instead Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/buffer.h | 60 ++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h index aac5de428..0309e7883 100644 --- a/ttg/ttg/parsec/buffer.h +++ b/ttg/ttg/parsec/buffer.h @@ -51,9 +51,10 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t private: using delete_fn_t = std::function; - using host_data_ptr = std::unique_ptr; - host_data_ptr m_host_data; + using host_data_ptr = std::add_pointer_t; + host_data_ptr m_host_data = nullptr; std::size_t m_count = 0; + bool m_owned= false; static void delete_non_owned(element_type *ptr) { // nothing to be done, we don't own the memory @@ -67,8 +68,8 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t return allocator_traits::allocate(get_allocator_reference(), n); } - void deallocate(element_type *ptr, std::size_t n) { - allocator_traits::deallocate(get_allocator_reference(), ptr, n); + void deallocate() { + allocator_traits::deallocate(get_allocator_reference(), m_host_data, m_count); } public: @@ -82,14 +83,15 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t buffer(std::size_t n) : ttg_parsec_data_wrapper_t() , allocator_type() - , m_host_data(allocate(n), [this, n](element_type* ptr){ deallocate(ptr, n); }) + , m_host_data(allocate(n)) , m_count(n) + , m_owned(true) { //std::cout << "buffer " << this << " ctor count " // << count << "(" << m_host_data.get() << ") ttg_copy " // << m_ttg_copy // << " parsec_data " << m_data.get() << std::endl; - this->reset_parsec_data(m_host_data.get(), n*sizeof(element_type)); + this->reset_parsec_data(m_host_data, n*sizeof(element_type)); } /* Constructing a buffer using application-managed memory. 
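The rationale in the commit message above, as a minimal standalone sketch (the `holder` type and its members are illustrative, not part of the TTG sources): a smart-pointer deleter that captures `this` still refers to the moved-from object after a move, so the moved-to object carries a dangling `this`.

#include <cstddef>
#include <functional>
#include <memory>

struct holder {
  std::size_t n;
  // BAD: the deleter captures `this`. After `holder b = std::move(a);`
  // the deleter moved into b still points at a, which may be destroyed
  // (or reused) before the data is freed.
  std::unique_ptr<double[], std::function<void(double*)>> data;
  explicit holder(std::size_t count)
      : n(count),
        data(new double[count],
             [this](double* p) { delete[] p; /* stands in for this->deallocate(p, n) */ }) {}
};

The rewrite above therefore stores a raw pointer plus an m_owned flag, and calls deallocate() explicitly from the destructor, the move operations, and reset().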
@@ -98,13 +100,18 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t buffer(element_type* ptr, std::size_t n = 1) : ttg_parsec_data_wrapper_t() , allocator_type() - , m_host_data(ptr, &delete_non_owned) + , m_host_data(ptr) , m_count(n) + , m_owned(false) { - this->reset_parsec_data(m_host_data.get(), n*sizeof(element_type)); + this->reset_parsec_data(m_host_data, n*sizeof(element_type)); } virtual ~buffer() { + if (m_owned) { + deallocate(); + m_owned = false; + } unpin(); // make sure the copies are not pinned } @@ -112,10 +119,13 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t buffer(buffer&& db) : ttg_parsec_data_wrapper_t(std::move(db)) , allocator_type(std::move(db)) - , m_host_data(std::move(db.m_host_data)) + , m_host_data(db.m_host_data) , m_count(db.m_count) + , m_owned(db.m_owned) { + db.m_host_data = nullptr; db.m_count = 0; + db.m_owned = false; } /* explicitly disable copying of buffers @@ -127,9 +137,9 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t buffer& operator=(buffer&& db) { ttg_parsec_data_wrapper_t::operator=(std::move(db)); allocator_type::operator=(std::move(db)); - m_host_data = std::move(db.m_host_data); - m_count = db.m_count; - db.m_count = 0; + std::swap(m_host_data, db.m_host_data); + std::swap(m_count, db.m_count); + std::swap(m_owned, db.m_owned); //std::cout << "buffer " << this << " other " << &db << " mv op ttg_copy " << m_ttg_copy << std::endl; //std::cout << "buffer::move-assign from " << &db << " ttg-copy " << db.m_ttg_copy // << " to " << this << " ttg-copy " << m_ttg_copy @@ -255,12 +265,20 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t /* TODO: can we resize if count is smaller than m_count? */ /* drop the current data and reallocate */ reset(); + + if (m_owned) { + deallocate(); + m_owned = false; + } + if (n == 0) { - m_host_data = host_data_ptr(nullptr, &delete_non_owned); + m_host_data = nullptr; + m_owned = false; } else { - m_host_data = host_data_ptr(allocate(n), [this, n](element_type* ptr){ deallocate(ptr, n); }); + m_host_data = allocate(n); + m_owned = true; } - reset_parsec_data(m_host_data.get(), n*sizeof(element_type)); + reset_parsec_data(m_host_data, n*sizeof(element_type)); //std::cout << "buffer::reset(" << count << ") ptr " << m_host_data.get() // << " ttg_copy " << m_ttg_copy // << " parsec_data " << m_data.get() << std::endl; @@ -274,14 +292,20 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t return; } + if (m_owned) { + deallocate(); + } + if (nullptr == ptr) { - m_host_data = host_data_ptr(nullptr, &delete_non_owned); + m_host_data = nullptr; m_count = 0; + m_owned = false; } else { - m_host_data = host_data_ptr(ptr, &delete_non_owned); + m_host_data = ptr; m_count = n; + m_owned = false; } - reset_parsec_data(m_host_data.get(), n*sizeof(element_type)); + reset_parsec_data(m_host_data, n*sizeof(element_type)); //std::cout << "buffer::reset(" << ptr << ", " << count << ") ptr " << m_host_data.get() // << " ttg_copy " << m_ttg_copy // << " parsec_data " << m_data.get() << std::endl; From 403b06b575cbddb6de08c1880cf5acb823d8afca Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 5 Oct 2023 15:15:04 +0000 Subject: [PATCH 13/17] POTRF: construct whole TTG each iteration Signed-off-by: Joseph Schuchart --- examples/potrf/testing_dpotrf.cc | 68 ++++++++++++++++---------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/examples/potrf/testing_dpotrf.cc b/examples/potrf/testing_dpotrf.cc index 402f50dd9..4ea3b9167 100644 --- 
a/examples/potrf/testing_dpotrf.cc +++ b/examples/potrf/testing_dpotrf.cc @@ -96,45 +96,46 @@ int main(int argc, char **argv) parsec_data_collection_set_key((parsec_data_collection_t*)&dcA, (char*)"Matrix A"); if(!check) { - ttg::Edge startup("startup"); - ttg::Edge> topotrf("To POTRF"); - ttg::Edge> result("To result"); - - //Matrix* A = new Matrix(n_rows, n_cols, NB, NB); - MatrixT A{&dcA}; - /* TODO: initialize the matrix */ - /* This works only with the parsec backend! */ - int random_seed = 3872; - - auto init_tt = ttg::make_tt([&](std::tuple>& out) { - for(int i = 0; i < A.rows(); i++) { - for(int j = 0; j <= i && j < A.cols(); j++) { - if(A.is_local(i, j)) { - if(ttg::tracing()) ttg::print("init(", Key2{i, j}, ")"); - ttg::sendk<0>(Key2{i, j}, out); + for (int i = 0; i < niter; ++i) { + parsec_devices_release_memory(); + ttg::Edge startup("startup"); + ttg::Edge> topotrf("To POTRF"); + ttg::Edge> result("To result"); + + //Matrix* A = new Matrix(n_rows, n_cols, NB, NB); + MatrixT A{&dcA}; + /* TODO: initialize the matrix */ + /* This works only with the parsec backend! */ + int random_seed = 3872; + + auto init_tt = ttg::make_tt([&](std::tuple>& out) { + for(int i = 0; i < A.rows(); i++) { + for(int j = 0; j <= i && j < A.cols(); j++) { + if(A.is_local(i, j)) { + if(ttg::tracing()) ttg::print("init(", Key2{i, j}, ")"); + ttg::sendk<0>(Key2{i, j}, out); + } } } - } - }, ttg::edges(), ttg::edges(startup), "Startup Trigger", {}, {"startup"}); - init_tt->set_keymap([&]() {return world.rank();}); + }, ttg::edges(), ttg::edges(startup), "Startup Trigger", {}, {"startup"}); + init_tt->set_keymap([&]() {return world.rank();}); - auto plgsy_ttg = make_plgsy_ttg(A, N, random_seed, startup, topotrf, cow_hint); - auto potrf_ttg = potrf::make_potrf_ttg(A, topotrf, result, cow_hint); - auto result_ttg = make_result_ttg(A, result, cow_hint); + auto plgsy_ttg = make_plgsy_ttg(A, N, random_seed, startup, topotrf, cow_hint); + auto potrf_ttg = potrf::make_potrf_ttg(A, topotrf, result, cow_hint); + auto result_ttg = make_result_ttg(A, result, cow_hint); - auto connected = make_graph_executable(init_tt.get()); - assert(connected); - TTGUNUSED(connected); - std::cout << "Graph is connected: " << connected << std::endl; + auto connected = make_graph_executable(init_tt.get()); + assert(connected); + TTGUNUSED(connected); + std::cout << "Graph is connected: " << connected << std::endl; - if (world.rank() == 0) { - std::cout << "==== begin dot ====\n"; - std::cout << ttg::Dot()(init_tt.get()) << std::endl; - std::cout << "==== end dot ====\n"; - beg = std::chrono::high_resolution_clock::now(); - } + if (world.rank() == 0) { + std::cout << "==== begin dot ====\n"; + std::cout << ttg::Dot()(init_tt.get()) << std::endl; + std::cout << "==== end dot ====\n"; + beg = std::chrono::high_resolution_clock::now(); + } - for (int i = 0; i < niter; ++i) { if (world.rank() == 0) { beg = std::chrono::high_resolution_clock::now(); } @@ -150,6 +151,7 @@ int main(int argc, char **argv) std::cout << "TTG Execution Time (milliseconds) : " << elapsed / 1E3 << " : Flops " << (potrf::FLOPS_DPOTRF(N)) << " " << (potrf::FLOPS_DPOTRF(N)/1e9)/(elapsed/1e6) << " GF/s" << std::endl; } + parsec_devices_reset_load(ttg::default_execution_context().impl().context()); } world.dag_off(); From 1998655dfaafb564ec089f26fa2e31b38266d646 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 5 Oct 2023 15:19:22 +0000 Subject: [PATCH 14/17] Add TTG_ENABLE_ASAN CMake option Signed-off-by: Joseph Schuchart --- CMakeLists.txt | 6 ++++++ 1 file 
changed, 6 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index b6b9c39fe..3a65a6335 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,6 +54,7 @@ option(TTG_PARSEC_USE_BOOST_SERIALIZATION "Whether to select Boost serialization option(TTG_ENABLE_CUDA "Whether to TTG will look for CUDA" OFF) option(TTG_ENABLE_HIP "Whether to TTG will look for HIP" OFF) option(TTG_EXAMPLES "Whether to build examples" OFF) +option(TTG_ENABLE_ASAN "Whether to enable address sanitizer" OFF) option(TTG_FETCH_BOOST "Whether to fetch+build Boost, if missing" OFF) option(TTG_IGNORE_BUNDLED_EXTERNALS "Whether to skip installation and use of bundled external depenedencies (Boost.CallableTraits)" OFF) @@ -68,6 +69,11 @@ if (FORCE_COLORED_OUTPUT) endif () endif (FORCE_COLORED_OUTPUT) +if (TTG_ENABLE_ASAN) + add_compile_options(-fsanitize=address) + add_link_options(-fsanitize=address) +endif (TTG_ENABLE_ASAN) + set(TTG_HIP_PLATFORM "__HIP_PLATFORM_AMD__" CACHE STRING "Which platform to use when compiling HIP-related code (default: __HIP_PLATFORM_AMD__)") ########################## #### prerequisites From 548b65bd4664a0ff7faecc951eb9ed0c512be150 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 6 Oct 2023 14:33:17 +0000 Subject: [PATCH 15/17] Chain benchmark: Disable wait_kernel co_yield Signed-off-by: Joseph Schuchart --- examples/task-benchmarks/chain-ttg-dev.cc | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/examples/task-benchmarks/chain-ttg-dev.cc b/examples/task-benchmarks/chain-ttg-dev.cc index 5ccf1297d..ac96e16b3 100644 --- a/examples/task-benchmarks/chain-ttg-dev.cc +++ b/examples/task-benchmarks/chain-ttg-dev.cc @@ -11,7 +11,7 @@ #error "Either CUDA OR HIP is required to build this test!" #endif // 0 -#define NUM_TASKS 10000 +#define NUM_TASKS 100000 using namespace ttg; @@ -56,7 +56,7 @@ auto make_ttg<1>(bool do_move) { auto next = make_tt([=](const int &key, auto&& value) -> ttg::device_task { //++task_counter; co_await ttg::to_device(value.b); - co_await ttg::wait_kernel(); // empty kernel + //co_await ttg::wait_kernel(); // empty kernel if (key < NUM_TASKS) { if (do_move) { co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(value))); @@ -83,7 +83,7 @@ auto make_ttg<2>(bool do_move) { auto next = make_tt([=](const int &key, A&& v1, A&& v2) -> ttg::device_task { co_await ttg::to_device(v1.b, v2.b); - co_await ttg::wait_kernel(); // empty kernel + //co_await ttg::wait_kernel(); // empty kernel if (key < NUM_TASKS) { if (do_move) { co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(v1)), @@ -114,7 +114,7 @@ auto make_ttg<4>(bool do_move) { auto next = make_tt([=](const int &key, A&& v1, A&& v2, A&& v3, A&& v4) -> ttg::device_task { co_await ttg::to_device(v1.b, v2.b, v3.b, v4.b); - co_await ttg::wait_kernel(); // empty kernel + //co_await ttg::wait_kernel(); // empty kernel if (key < NUM_TASKS) { if (do_move) { co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(v1)), @@ -155,7 +155,7 @@ auto make_ttg<8>(bool do_move) { auto next = make_tt([=](const int &key, auto&& v1, auto&& v2, auto&& v3, auto&& v4, auto&& v5, auto&& v6, auto&& v7, auto&& v8) -> ttg::device_task { co_await ttg::to_device(v1.b, v2.b, v3.b, v4.b, v5.b, v6.b, v7.b, v8.b); - co_await ttg::wait_kernel(); // empty kernel + //co_await ttg::wait_kernel(); // empty kernel if (key < NUM_TASKS) { if (do_move) { co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(v1)), @@ -193,7 +193,7 @@ auto make_ttg<0>(bool do_move) 
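 // zero-flow variant: tasks carry only the key, no buffers move on or off the device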
 {
   auto next = make_tt([](const int& key) -> ttg::device_task {
     co_await ttg::to_device();
-    co_await ttg::wait_kernel();
+    //co_await ttg::wait_kernel();
     if (key < NUM_TASKS) {
       co_await ttg::device::forward(ttg::device::sendk<0>(key+1));
     }
@@ -211,6 +211,11 @@ void run_bench(bool do_move)
   assert(connected);
   std::cout << "Graph " << num_flows << " is connected.\n";
 
+  if (ttg::default_execution_context().rank() == 0) init->invoke();
+
+  ttg_execute(ttg_default_execution_context());
+  ttg_fence(ttg_default_execution_context());
+
   auto t0 = now();
   if (ttg::default_execution_context().rank() == 0) init->invoke();
@@ -218,8 +223,8 @@ void run_bench(bool do_move)
   ttg_fence(ttg_default_execution_context());
   auto t1 = now();
 
-  std::cout << "# of tasks = " << task_counter.load() << std::endl;
-  std::cout << "time elapsed (microseconds) = " << duration_in_mus(t0, t1) << std::endl;
+  std::cout << "# of tasks = " << NUM_TASKS << std::endl;
+  std::cout << "time elapsed (microseconds) = " << duration_in_mus(t0, t1) << ", avg " << duration_in_mus(t0, t1) / (double)NUM_TASKS << std::endl;
 }
 
 int main(int argc, char* argv[]) {
@@ -242,7 +247,7 @@ int main(int argc, char* argv[]) {
     case 2: run_bench<2>(do_move); break;
     case 4: run_bench<4>(do_move); break;
     case 8: run_bench<8>(do_move); break;
-    default: std::cout << "Unsupported number of flows: " << num_flows << std::endl;
+    default: std::cout << "Unsupported number of flows: " << NUM_TASKS << std::endl;
   }
 
   ttg_finalize();

From fdf6e2e1218ea81661606da390df38d6b36e459d Mon Sep 17 00:00:00 2001
From: Joseph Schuchart
Date: Fri, 6 Oct 2023 14:33:53 +0000
Subject: [PATCH 16/17] PaRSEC: Increase max AM size to 1MB

---
 ttg/ttg/parsec/ttg.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index ea2063d56..c942940b1 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -240,7 +240,7 @@ namespace ttg_parsec {
     int parsec_ttg_profile_backend_allocate_datacopy, parsec_ttg_profile_backend_free_datacopy;
 #endif
 
-    static constexpr const int PARSEC_TTG_MAX_AM_SIZE = 4 * 1024;
+    static constexpr const int PARSEC_TTG_MAX_AM_SIZE = 1 * 1024*1024;
 
     WorldImpl(int *argc, char **argv[], int ncores, parsec_context_t *c = nullptr)
         : WorldImplBase(query_comm_size(), query_comm_rank())
         , ctx(c)

From 9d21225a774c461470cecac424f3006000bcac34 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart
Date: Fri, 6 Oct 2023 14:39:58 +0000
Subject: [PATCH 17/17] Remove TTG cublas_helper, superseded by devblas helper
 in examples

Signed-off-by: Joseph Schuchart
---
 ttg/CMakeLists.txt               | 15 +-----
 ttg/ttg/device/cublas_helper.cpp | 81 --------------------------------
 ttg/ttg/device/cublas_helper.h   | 35 --------------
 ttg/ttg/parsec/ttg.h             | 19 --------
 4 files changed, 1 insertion(+), 149 deletions(-)
 delete mode 100644 ttg/ttg/device/cublas_helper.cpp
 delete mode 100644 ttg/ttg/device/cublas_helper.h

diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt
index 476118524..427a3ffee 100644
--- a/ttg/CMakeLists.txt
+++ b/ttg/CMakeLists.txt
@@ -68,7 +68,6 @@ set(ttg-impl-headers
     ${CMAKE_CURRENT_BINARY_DIR}/ttg/config.h
     )
 set(ttg_device_headers
-    ${CMAKE_CURRENT_SOURCE_DIR}/ttg/device/cublas_helper.h
     ${CMAKE_CURRENT_SOURCE_DIR}/ttg/device/device.h
     ${CMAKE_CURRENT_SOURCE_DIR}/ttg/device/task.h
   )
@@ -117,23 +116,11 @@ if (TARGET std::coroutine)
     )
 endif ()
 if (TTG_HAVE_CUDA)
-  list(APPEND ttg-sources
-      ${CMAKE_CURRENT_SOURCE_DIR}/ttg/device/cublas_helper.cpp
-    )
-  list(APPEND ttg-deps CUDA::cudart CUDA::cublas)
+  list(APPEND ttg-deps
CUDA::cudart) endif (TTG_HAVE_CUDA) if (TTG_HAVE_HIPBLAS) - # add the helper file if not added already - if (NOT TTG_HAVE_CUDA) - list(APPEND ttg-sources - ${CMAKE_CURRENT_SOURCE_DIR}/ttg/device/cublas_helper.cpp - ) - endif (NOT TTG_HAVE_CUDA) - list(APPEND ttg-deps roc::hipblas) - find_package(hip REQUIRED) list(APPEND ttg-deps hip::host) - endif (TTG_HAVE_HIPBLAS) set(ttg-public-headers ${ttg-headers};${ttg-impl-headers};${ttg-base-headers};${ttg-util-headers};${ttg_device_headers}) diff --git a/ttg/ttg/device/cublas_helper.cpp b/ttg/ttg/device/cublas_helper.cpp deleted file mode 100644 index 768411282..000000000 --- a/ttg/ttg/device/cublas_helper.cpp +++ /dev/null @@ -1,81 +0,0 @@ -#include "ttg/config.h" -#include "ttg/device/cublas_helper.h" - -#include -#include -#include -#include -#ifdef TTG_HAVE_CUDART -#include -#endif // TTG_HAVE_CUDART - -#ifdef TTG_HAVE_HIPBLAS -#include -#endif // TTG_HAVE_HIPBLAS - -namespace ttg::detail { - -#ifdef TTG_HAVE_CUDART -/// \brief Returns the cuBLAS handle to be used for launching cuBLAS kernels from the current thread -/// \return the cuBLAS handle for the current thread -const cublasHandle_t& cublas_get_handle() { - static thread_local std::map handles; - int device; - if (cudaSuccess != cudaGetDevice(&device)){ - throw std::runtime_error("cudaGetDevice failed"); - } - std::map::iterator it; - if ((it = handles.find(device)) == handles.end()){ - cublasHandle_t handle; - auto status = cublasCreate_v2(&handle); - if (CUBLAS_STATUS_SUCCESS != status) { - throw std::runtime_error("cublasCreate_v2 failed"); - } - auto [iterator, success] = handles.insert({device, handle}); - it = iterator; - } - - return it->second; -} - -void cublas_set_kernel_stream(cudaStream_t stream) { - cublasStatus_t status = cublasSetStream_v2(cublas_get_handle(), stream); - if (CUBLAS_STATUS_SUCCESS != status) { - throw std::runtime_error("cublasSetStream_v2 failed"); - } -} -#endif // TTG_HAVE_CUDART - -#ifdef TTG_HAVE_HIPBLAS -/// \brief Returns the rocBLAS handle to be used for launching rocBLAS kernels from the current thread -/// \return the rocBLAS handle for the current thread -const hipblasHandle_t& hipblas_get_handle() { - static thread_local std::map handles; - int device; - if (hipSuccess != hipGetDevice(&device)){ - throw std::runtime_error("hipGetDevice failed"); - } - std::map::iterator it; - if ((it = handles.find(device)) == handles.end()){ - hipblasHandle_t handle; - auto status = hipblasCreate(&handle); - if (HIPBLAS_STATUS_SUCCESS != status) { - throw std::runtime_error("hipblasCreate failed"); - } - auto [iterator, success] = handles.insert({device, handle}); - it = iterator; - } - return it->second; -} - -void hipblas_set_kernel_stream(hipStream_t stream) { - hipblasStatus_t status = hipblasSetStream(hipblas_get_handle(), stream); - if (HIPBLAS_STATUS_SUCCESS != status) { - throw std::runtime_error("hipblasSetStream failed"); - } -} - -#endif // TTG_HAVE_HIPBLAS - - -} // namespace \ No newline at end of file diff --git a/ttg/ttg/device/cublas_helper.h b/ttg/ttg/device/cublas_helper.h deleted file mode 100644 index 6ab106bfb..000000000 --- a/ttg/ttg/device/cublas_helper.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef TTG_DEVICE_CUBLAS_HELPER_H -#define TTG_DEVICE_CUBLAS_HELPER_H - -#include "ttg/config.h" - -#ifdef TTG_HAVE_CUDART -#include - -namespace ttg::detail { - -/// \brief Returns the cuBLAS context used by this thread -const cublasHandle_t& cublas_get_handle(); - -/// associates the given CUDA stream with the cuBLAS context used by this thread 
-void cublas_set_kernel_stream(cudaStream_t stream); - -} // namespace ttg::detail -#endif // TTG_HAVE_CUDART - - -#ifdef TTG_HAVE_HIPBLAS -#include - -namespace ttg::detail { - -const hipblasHandle_t& hipblas_get_handle(); - -/// associates the given HIP stream with the hipblas context used by this thread -void hipblas_set_kernel_stream(hipStream_t stream); - -} // namespace ttg::detail -#endif // TTG_HAVE_HIPBLAS - - -#endif // TTG_DEVICE_CUBLAS_HELPER_H \ No newline at end of file diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index c942940b1..367d6c250 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -110,7 +110,6 @@ #include "ttg/parsec/thread_local.h" #include "ttg/parsec/ptr.h" #include "ttg/parsec/task.h" -#include "ttg/device/cublas_helper.h" #include "ttg/parsec/parsec-ext.h" #include "ttg/device/device.h" @@ -1324,15 +1323,6 @@ namespace ttg_parsec { assert(dev_data.state() == ttg::TTG_DEVICE_CORO_WAIT_TRANSFER || dev_data.state() == ttg::TTG_DEVICE_CORO_WAIT_KERNEL); -#if 0 -#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDART) - { - parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)gpu_stream; - ttg::detail::cublas_set_kernel_stream(cuda_stream->cuda_stream); - } -#endif // PARSEC_HAVE_DEV_CUDA_SUPPORT -#endif //0 - #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA) { parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)gpu_stream; @@ -1341,15 +1331,6 @@ namespace ttg_parsec { } #endif // defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA) -#if 0 -#if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) && defined(TTG_HAVE_HIPBLAS) - { - parsec_hip_exec_stream_t *hip_stream = (parsec_hip_exec_stream_t *)gpu_stream; - ttg::detail::hipblas_set_kernel_stream(hip_stream->hip_stream); - } -#endif // PARSEC_HAVE_DEV_HIP_SUPPORT -#endif // 0 - #if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) && defined(TTG_HAVE_HIP) { parsec_hip_exec_stream_t *hip_stream = (parsec_hip_exec_stream_t *)gpu_stream;
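With the TTG-internal helpers gone, stream handling lives entirely in examples/devblas_helper.h: handles are cached thread-locally per device and looked up through ttg::device::current_device() and ttg::device::current_stream(). A condensed sketch of such a cache follows (a sketch only: the function name and explicit parameters are illustrative, whether the real helpers key on the stream or rebind it at lookup is not visible in the diffs, and error handling is abbreviated):

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <map>
#include <stdexcept>
#include <utility>

// One cuBLAS handle per (device, stream) pair, cached per thread.
// Assumption in this sketch: the stream is attached once when the handle
// is created, which is what makes a per-kernel set_kernel_stream() hook,
// like the one deleted above, unnecessary. The caller is expected to have
// made `device` the current CUDA device.
inline const cublasHandle_t& example_cublas_handle(int device, cudaStream_t stream) {
  using map_type = std::map<std::pair<int, cudaStream_t>, cublasHandle_t>;
  static thread_local map_type handles;
  auto it = handles.find(std::make_pair(device, stream));
  if (it == handles.end()) {
    cublasHandle_t handle;
    if (CUBLAS_STATUS_SUCCESS != cublasCreate(&handle))
      throw std::runtime_error("cublasCreate failed");
    cublasSetStream(handle, stream);
    it = handles.emplace(std::make_pair(device, stream), handle).first;
  }
  return it->second;
}

Keying the cache on the stream as well as the device is what removes the need for the per-call cublas_set_kernel_stream()/hipblas_set_kernel_stream() hooks deleted in this patch.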