From 52c293774d5358b3eb2faf2f3b6899e946decacc Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 28 Feb 2024 11:18:25 -0500 Subject: [PATCH 01/40] [serialization] introduced `byte_{i,o}streambuf`, to replace boost::iostreams::basic_array_sink --- ttg/ttg/serialization/stream.h | 41 ++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/ttg/ttg/serialization/stream.h b/ttg/ttg/serialization/stream.h index c26781a53..9c50e0f83 100644 --- a/ttg/ttg/serialization/stream.h +++ b/ttg/ttg/serialization/stream.h @@ -6,6 +6,7 @@ #define TTG_SERIALIZATION_STREAM_H #include +#include namespace ttg::detail { @@ -66,6 +67,46 @@ namespace ttg::detail { const std::vector>& iovec_; }; + /// streambuf that writes bytes to a buffer in memory + class byte_ostreambuf : public std::streambuf { + public: + using std::streambuf::streambuf; + + byte_ostreambuf(char_type* buffer, std::streamsize buffer_size = std::numeric_limits::max()) : buffer_(buffer), cursor_(buffer_), buffer_size_(buffer_size) {} + + std::streamsize xsputn(const char_type* s, std::streamsize n) override { + assert((cursor_ - buffer_) + n <= buffer_size_); + std::memcpy(cursor_, s, n * sizeof(char_type)); + cursor_ += n; + return n; + } + + private: + char_type* buffer_; + char_type* cursor_; // current location in buffer_ + std::streamsize buffer_size_; + }; + + /// streambuf that writes bytes to a buffer in memory + class byte_istreambuf : public std::streambuf { + public: + using std::streambuf::streambuf; + + byte_istreambuf(char_type* buffer, std::size_t buffer_size = std::numeric_limits::max()) : buffer_(buffer), cursor_(buffer_), buffer_size_(buffer_size) {} + + std::streamsize xsgetn(char_type* s, std::streamsize max_n) override { + const auto n_to_read = std::min(buffer_size_ - (cursor_ - buffer_), max_n); + std::memcpy(s, cursor_, n_to_read * sizeof(char_type)); + cursor_ += n_to_read; + return n_to_read; + } + + private: + char_type* buffer_; + char_type* cursor_; // current location in buffer_ + std::streamsize buffer_size_; + }; + } // namespace ttg::detail #endif // TTG_SERIALIZATION_STREAM_H From 50251003dd7f4075ed3107170c4148581d42d0b7 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 28 Feb 2024 11:21:44 -0500 Subject: [PATCH 02/40] [serialization] `boost_optimized_oarchive` moar optimized can call S::xsputn directly if S is derived from std::streambuf (i.e. is a streambuf) ... this permits inlining it --- .../serialization/backends/boost/archive.h | 76 +++++++++++++++++-- 1 file changed, 71 insertions(+), 5 deletions(-) diff --git a/ttg/ttg/serialization/backends/boost/archive.h b/ttg/ttg/serialization/backends/boost/archive.h index f9cebdace..ce7626e34 100644 --- a/ttg/ttg/serialization/backends/boost/archive.h +++ b/ttg/ttg/serialization/backends/boost/archive.h @@ -74,7 +74,19 @@ namespace ttg::detail { /// optimized data-only serializer - /// skips metadata (class version, etc.) + /// skips metadata (class version, etc.) by providing optimized save_override function that will be called by + /// `boost::archive::binary_oarchive_impl::save_override` + /// + /// \internal Using `boost::archive::binary_oarchive_impl` provides stock implementation for this custom archive. Unfortunately + /// `boost::archive::binary_oarchive_impl` uses the streambuf object via its std::streambuf base which means that + /// calls to `xsputn` are not inlined. To work around this problem, this class replaces several functions in + /// `boost::archive::binary_oarchive_impl` that were provided by `boost::archive::basic_binary_oprimitive` + /// such as `save`, `save_array` and `save_binary`; the latter calls the streambuf's sputn directly, + /// not via std::streambuf::sputn . To make sure these "replacements" are called this class must be used directly + /// rather than cast to `boost::archive::binary_oarchive_impl`, as an argument to + /// `oarchive_save_override_optimized_dispatch` + /// if \p StreamOrStreambuf is a streambuf (i.e. derived from std::streambuf). + /// template class boost_optimized_oarchive : private StreamOrStreambuf, @@ -84,6 +96,8 @@ namespace ttg::detail { using pbase_type = StreamOrStreambuf; using base_type = boost::archive::binary_oarchive_impl, std::ostream::char_type, std::ostream::traits_type>; + // if pbase_type is derived from std::streambuf can use this information to avoid virtual function calls and inline + static constexpr bool pbase_derived_from_stdstreambuf = std::is_base_of_v; private: friend class boost::archive::save_access; @@ -108,9 +122,12 @@ namespace ttg::detail { : pbase_type(std::forward(arg)) , base_type(this->pbase(), boost::archive::no_header | boost::archive::no_codecvt){}; + /// these provide optimized implementation that's called by base_type::save_override + /// @{ + template void save_override(const T& t) { - oarchive_save_override_optimized_dispatch(this->base(), t); + oarchive_save_override_optimized_dispatch(*this, t); } void save_override(const boost::archive::class_id_optional_type& /* t */) {} @@ -121,11 +138,53 @@ namespace ttg::detail { void save_override(const boost::archive::class_id_type& t) {} void save_override(const boost::archive::class_id_reference_type& t) {} + /// @} + void save_object(const void* x, const boost::archive::detail::basic_oserializer& bos) { abort(); } + /// override default implementations in base_type provided by basic_binary_oprimitive + /// @{ + + // default saving of primitives. + template + void save(const T & t) + { + save_binary(& t, sizeof(T)); + } + + // trap usage of invalid uninitialized boolean which would + // otherwise crash on load. + void save(const bool t){ + BOOST_ASSERT(0 == static_cast(t) || 1 == static_cast(t)); + save_binary(& t, sizeof(t)); + } + public: - BOOST_ARCHIVE_DECL - void save_binary(const void* address, std::size_t count); + + // the optimized save_array dispatches to save_binary + template + void save_array(boost::serialization::array_wrapper const& a, unsigned int) + { + save_binary(a.address(),a.count()*sizeof(ValueType)); + } + + void save_binary(const void *address, std::size_t count) { + if constexpr (pbase_derived_from_stdstreambuf) { // if we were given a streambuf use it directly ... + using Elem = std::ostream::char_type; + count = (count + sizeof(Elem) - 1) / sizeof(Elem); + std::streamsize scount = static_cast(this->pbase()) + .sputn(static_cast(address), static_cast(count)); + if (count != static_cast(scount)) + boost::serialization::throw_exception( + boost::archive::archive_exception(boost::archive::archive_exception::output_stream_error)); + } + else { // ... else let boost::archive::basic_binary_oprimitive handle via std::stringbuf + // (and associated virtual function calls ... no inlining for you) + this->base().save_binary(address, count); + } + } + + /// @} template auto& operator<<(const T& t) { @@ -139,7 +198,14 @@ namespace ttg::detail { return *this << t; } - const auto& streambuf() const { return this->pbase(); } + const auto& streambuf() const { + if constexpr (pbase_derived_from_stdstreambuf) { + return static_cast(this->pbase()); + } + else { + return this->pbase(); + } + } const auto& stream() const { return this->pbase(); } }; From 01c961c112eba8de093884419e6417b7c1a6e8eb Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 28 Feb 2024 11:22:43 -0500 Subject: [PATCH 03/40] [serialization] introduced `boost_byte_{i,o}archive` that use `byte_{i,o}streambuf` --- ttg/ttg/serialization/backends/boost/archive.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ttg/ttg/serialization/backends/boost/archive.h b/ttg/ttg/serialization/backends/boost/archive.h index ce7626e34..37a55f3f3 100644 --- a/ttg/ttg/serialization/backends/boost/archive.h +++ b/ttg/ttg/serialization/backends/boost/archive.h @@ -219,6 +219,9 @@ namespace ttg::detail { using boost_buffer_oarchive = boost_optimized_oarchive>>; + /// an archive that constructs serialized representation of an object in a memory buffer, in an optimized manner + using boost_byte_oarchive = boost_optimized_oarchive; + /// constructs a boost_buffer_oarchive object /// @param[in] buf pointer to a memory buffer to which serialized representation will be written @@ -316,6 +319,9 @@ namespace ttg::detail { using boost_buffer_iarchive = boost_optimized_iarchive>>; + /// the deserializer for boost_byte_oarchive + using boost_byte_iarchive = boost_optimized_iarchive; + /// constructs a boost_buffer_iarchive object /// @param[in] buf pointer to a memory buffer from which serialized representation will be read @@ -359,6 +365,10 @@ BOOST_SERIALIZATION_REGISTER_ARCHIVE(ttg::detail::boost_iovec_iarchive); BOOST_SERIALIZATION_USE_ARRAY_OPTIMIZATION_FOR_THIS_AND_BASE(ttg::detail::boost_iovec_iarchive); BOOST_SERIALIZATION_REGISTER_ARCHIVE(ttg::detail::boost_buffer_iarchive); BOOST_SERIALIZATION_USE_ARRAY_OPTIMIZATION_FOR_THIS_AND_BASE(ttg::detail::boost_buffer_iarchive); +BOOST_SERIALIZATION_REGISTER_ARCHIVE(ttg::detail::boost_byte_oarchive); +BOOST_SERIALIZATION_USE_ARRAY_OPTIMIZATION_FOR_THIS_AND_BASE(ttg::detail::boost_byte_oarchive); +BOOST_SERIALIZATION_REGISTER_ARCHIVE(ttg::detail::boost_byte_iarchive); +BOOST_SERIALIZATION_USE_ARRAY_OPTIMIZATION_FOR_THIS_AND_BASE(ttg::detail::boost_byte_iarchive); #undef BOOST_SERIALIZATION_USE_ARRAY_OPTIMIZATION_FOR_THIS_AND_BASE From a8a00396b4e3ed540ad4b4b7f585642057257fe6 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 28 Feb 2024 11:23:27 -0500 Subject: [PATCH 04/40] [serialization] Boost-based TTG serialization uses boost_byte_oarchive for optimized operation ... hopefully can inline most of the calls --- ttg/ttg/serialization/backends/boost/archive.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ttg/ttg/serialization/backends/boost/archive.h b/ttg/ttg/serialization/backends/boost/archive.h index 37a55f3f3..37fbbb8f0 100644 --- a/ttg/ttg/serialization/backends/boost/archive.h +++ b/ttg/ttg/serialization/backends/boost/archive.h @@ -230,8 +230,7 @@ namespace ttg::detail { /// @return a boost_buffer_oarchive object referring to @p buf inline auto make_boost_buffer_oarchive(void* const buf, std::size_t size, std::size_t buf_offset = 0) { assert(buf_offset <= size); - using arrsink_t = boost::iostreams::basic_array_sink; - return boost_buffer_oarchive(arrsink_t(static_cast(buf) + buf_offset, size - buf_offset)); + return ttg::detail::boost_byte_oarchive(ttg::detail::byte_ostreambuf(static_cast(buf) + buf_offset, size - buf_offset)); } /// constructs a boost_buffer_oarchive object From 3a75af34fb7a158b5b72aecd6195d83f28bec900 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 28 Feb 2024 13:33:42 -0500 Subject: [PATCH 05/40] [serialization] added `s{put,get}n` functions to `byte_{i,o}streambuf` to avoid the use of `basic_streambuf::s{get,put}n` --- ttg/ttg/serialization/stream.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/ttg/ttg/serialization/stream.h b/ttg/ttg/serialization/stream.h index 9c50e0f83..09304be74 100644 --- a/ttg/ttg/serialization/stream.h +++ b/ttg/ttg/serialization/stream.h @@ -72,9 +72,14 @@ namespace ttg::detail { public: using std::streambuf::streambuf; - byte_ostreambuf(char_type* buffer, std::streamsize buffer_size = std::numeric_limits::max()) : buffer_(buffer), cursor_(buffer_), buffer_size_(buffer_size) {} + inline byte_ostreambuf(char_type* buffer, std::streamsize buffer_size = std::numeric_limits::max()) : buffer_(buffer), cursor_(buffer_), buffer_size_(buffer_size) {} - std::streamsize xsputn(const char_type* s, std::streamsize n) override { + // hides basic_streambuf::sputn so can avoid the virtual function dispatch if the compiler is not aggressive enough + inline std::streamsize sputn(const char_type* s, std::streamsize n) noexcept { + return this->xsputn(s, n); + } + + inline std::streamsize xsputn(const char_type* s, std::streamsize n) noexcept override { assert((cursor_ - buffer_) + n <= buffer_size_); std::memcpy(cursor_, s, n * sizeof(char_type)); cursor_ += n; @@ -92,9 +97,14 @@ namespace ttg::detail { public: using std::streambuf::streambuf; - byte_istreambuf(char_type* buffer, std::size_t buffer_size = std::numeric_limits::max()) : buffer_(buffer), cursor_(buffer_), buffer_size_(buffer_size) {} + inline byte_istreambuf(char_type* buffer, std::size_t buffer_size = std::numeric_limits::max()) : buffer_(buffer), cursor_(buffer_), buffer_size_(buffer_size) {} - std::streamsize xsgetn(char_type* s, std::streamsize max_n) override { + // hides basic_streambuf::sgetn so can avoid the virtual function dispatch if the compiler is not aggressive enough + inline std::streamsize sgetn(char_type* s, std::streamsize n) noexcept { + return this->xsgetn(s, n); + } + + inline std::streamsize xsgetn(char_type* s, std::streamsize max_n) noexcept override { const auto n_to_read = std::min(buffer_size_ - (cursor_ - buffer_), max_n); std::memcpy(s, cursor_, n_to_read * sizeof(char_type)); cursor_ += n_to_read; From 4c61860172d82438e0d89adcdccac65a78af088c Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 28 Feb 2024 13:34:18 -0500 Subject: [PATCH 06/40] [serialization] throw -> assert to maybe improve codegen --- ttg/ttg/serialization/backends/boost/archive.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ttg/ttg/serialization/backends/boost/archive.h b/ttg/ttg/serialization/backends/boost/archive.h index 37fbbb8f0..b6d0bb434 100644 --- a/ttg/ttg/serialization/backends/boost/archive.h +++ b/ttg/ttg/serialization/backends/boost/archive.h @@ -174,9 +174,7 @@ namespace ttg::detail { count = (count + sizeof(Elem) - 1) / sizeof(Elem); std::streamsize scount = static_cast(this->pbase()) .sputn(static_cast(address), static_cast(count)); - if (count != static_cast(scount)) - boost::serialization::throw_exception( - boost::archive::archive_exception(boost::archive::archive_exception::output_stream_error)); + assert(count == static_cast(scount)); } else { // ... else let boost::archive::basic_binary_oprimitive handle via std::stringbuf // (and associated virtual function calls ... no inlining for you) From d216befcfa9d36d4b391dfb0e8ca198f30563f79 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 28 Feb 2024 16:27:53 -0500 Subject: [PATCH 07/40] [serialization] `byte_{i,o}streambuf::xs{get,put}n` must be `final` to inline --- ttg/ttg/serialization/stream.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ttg/ttg/serialization/stream.h b/ttg/ttg/serialization/stream.h index 09304be74..fe2301f8c 100644 --- a/ttg/ttg/serialization/stream.h +++ b/ttg/ttg/serialization/stream.h @@ -72,14 +72,14 @@ namespace ttg::detail { public: using std::streambuf::streambuf; - inline byte_ostreambuf(char_type* buffer, std::streamsize buffer_size = std::numeric_limits::max()) : buffer_(buffer), cursor_(buffer_), buffer_size_(buffer_size) {} + byte_ostreambuf(char_type* buffer, std::streamsize buffer_size = std::numeric_limits::max()) : buffer_(buffer), cursor_(buffer_), buffer_size_(buffer_size) {} // hides basic_streambuf::sputn so can avoid the virtual function dispatch if the compiler is not aggressive enough - inline std::streamsize sputn(const char_type* s, std::streamsize n) noexcept { + std::streamsize sputn(const char_type* s, std::streamsize n) noexcept { return this->xsputn(s, n); } - inline std::streamsize xsputn(const char_type* s, std::streamsize n) noexcept override { + std::streamsize xsputn(const char_type* s, std::streamsize n) noexcept override final { assert((cursor_ - buffer_) + n <= buffer_size_); std::memcpy(cursor_, s, n * sizeof(char_type)); cursor_ += n; @@ -97,14 +97,14 @@ namespace ttg::detail { public: using std::streambuf::streambuf; - inline byte_istreambuf(char_type* buffer, std::size_t buffer_size = std::numeric_limits::max()) : buffer_(buffer), cursor_(buffer_), buffer_size_(buffer_size) {} + byte_istreambuf(char_type* buffer, std::size_t buffer_size = std::numeric_limits::max()) : buffer_(buffer), cursor_(buffer_), buffer_size_(buffer_size) {} // hides basic_streambuf::sgetn so can avoid the virtual function dispatch if the compiler is not aggressive enough - inline std::streamsize sgetn(char_type* s, std::streamsize n) noexcept { + std::streamsize sgetn(char_type* s, std::streamsize n) noexcept { return this->xsgetn(s, n); } - inline std::streamsize xsgetn(char_type* s, std::streamsize max_n) noexcept override { + std::streamsize xsgetn(char_type* s, std::streamsize max_n) noexcept override final { const auto n_to_read = std::min(buffer_size_ - (cursor_ - buffer_), max_n); std::memcpy(s, cursor_, n_to_read * sizeof(char_type)); cursor_ += n_to_read; From 957886cd200749a73b0344d4db3b4bfa30d877a1 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 29 Feb 2024 11:32:28 -0500 Subject: [PATCH 08/40] [serialization] `boost_byte_iarchive` is also inlinable --- .../serialization/backends/boost/archive.h | 76 +++++++++++++++++-- ttg/ttg/serialization/stream.h | 6 +- 2 files changed, 71 insertions(+), 11 deletions(-) diff --git a/ttg/ttg/serialization/backends/boost/archive.h b/ttg/ttg/serialization/backends/boost/archive.h index b6d0bb434..e688dbc66 100644 --- a/ttg/ttg/serialization/backends/boost/archive.h +++ b/ttg/ttg/serialization/backends/boost/archive.h @@ -171,6 +171,7 @@ namespace ttg::detail { void save_binary(const void *address, std::size_t count) { if constexpr (pbase_derived_from_stdstreambuf) { // if we were given a streambuf use it directly ... using Elem = std::ostream::char_type; + static_assert(sizeof(Elem) == 1); count = (count + sizeof(Elem) - 1) / sizeof(Elem); std::streamsize scount = static_cast(this->pbase()) .sputn(static_cast(address), static_cast(count)); @@ -240,8 +241,7 @@ namespace ttg::detail { template inline auto make_boost_buffer_oarchive(char (&buf)[N], std::size_t buf_offset = 0) { assert(buf_offset <= N); - using arrsink_t = boost::iostreams::basic_array_sink; - return boost_buffer_oarchive(arrsink_t(&(buf[buf_offset]), N - buf_offset)); + return ttg::detail::boost_byte_oarchive(ttg::detail::byte_ostreambuf(&(buf[buf_offset], N - buf_offset))); } /// optimized data-only deserializer for boost_optimized_oarchive @@ -254,6 +254,8 @@ namespace ttg::detail { using pbase_type = StreamOrStreambuf; using base_type = boost::archive::binary_iarchive_impl; + // if pbase_type is derived from std::streambuf can use this information to avoid virtual function calls and inline + static constexpr bool pbase_derived_from_stdstreambuf = std::is_base_of_v; private: friend class boost::archive::save_access; @@ -278,9 +280,12 @@ namespace ttg::detail { : pbase_type(std::forward(arg)) , base_type(this->pbase(), boost::archive::no_header | boost::archive::no_codecvt){}; + /// these provide optimized implementation that's called by base_type::load_override + /// @{ + template void load_override(T& t) { - iarchive_load_override_optimized_dispatch(this->base(), t); + iarchive_load_override_optimized_dispatch(*this, t); } void load_override(boost::archive::class_id_optional_type& /* t */) {} @@ -291,8 +296,58 @@ namespace ttg::detail { void load_override(boost::archive::class_id_type& t) {} void load_override(boost::archive::class_id_reference_type& t) {} + /// @} + void load_object(void* x, const boost::archive::detail::basic_oserializer& bos) { abort(); } + /// override default implementations in base_type provided by basic_binary_iprimitive + /// @{ + + // main template for serialization of primitive types + template + void load(T & t){ + load_binary(& t, sizeof(T)); + } + + ///////////////////////////////////////////////////////// + // fundamental types that need special treatment + + // trap usage of invalid uninitialized boolean + void load(bool & t){ + load_binary(& t, sizeof(t)); + int i = t; + BOOST_ASSERT(0 == i || 1 == i); + (void)i; // warning suppression for release builds. + } + + public: + + // the optimized load_array dispatches to load_binary + template + void load_array(boost::serialization::array_wrapper& a, unsigned int) + { + load_binary(a.address(),a.count()*sizeof(ValueType)); + } + + void load_binary( + void *address, + std::size_t count + ) { + if constexpr (pbase_derived_from_stdstreambuf) { // if we were given a streambuf use it directly ... + using Elem = std::ostream::char_type; + static_assert(sizeof(Elem) == 1); + std::streamsize s = static_cast(count); + std::streamsize scount = static_cast(this->pbase()).sgetn(static_cast(address), s); + assert(scount == count); + } + else { // ... else let boost::archive::basic_binary_iprimitive handle via std::stringbuf + // (and associated virtual function calls ... no inlining for you) + this->base().load_binary(address, count); + } + } + + /// @} + template auto& operator>>(T& t) { this->load_override(t); @@ -305,7 +360,14 @@ namespace ttg::detail { return *this >> t; } - const auto& streambuf() const { return this->pbase(); } + const auto& streambuf() const { + if constexpr (pbase_derived_from_stdstreambuf) { + return static_cast(this->pbase()); + } + else { + return this->pbase(); + } + } const auto& stream() const { return this->pbase(); } }; @@ -327,8 +389,7 @@ namespace ttg::detail { /// @return a boost_buffer_iarchive object referring to @p buf inline auto make_boost_buffer_iarchive(const void* const buf, std::size_t size, std::size_t buf_offset = 0) { assert(buf_offset <= size); - using arrsrc_t = boost::iostreams::basic_array_source; - return boost_buffer_iarchive(arrsrc_t(static_cast(buf) + buf_offset, size - buf_offset)); + return ttg::detail::boost_byte_iarchive(ttg::detail::byte_istreambuf(static_cast(buf) + buf_offset, size - buf_offset)); } /// constructs a boost_buffer_iarchive object @@ -340,8 +401,7 @@ namespace ttg::detail { template inline auto make_boost_buffer_iarchive(const char (&buf)[N], std::size_t buf_offset = 0) { assert(buf_offset <= N); - using arrsrc_t = boost::iostreams::basic_array_source; - return boost_buffer_iarchive(arrsrc_t(&(buf[buf_offset]), N - buf_offset)); + return ttg::detail::boost_byte_iarchive(ttg::detail::byte_istreambuf((&(buf[buf_offset]), N - buf_offset))); } } // namespace ttg::detail diff --git a/ttg/ttg/serialization/stream.h b/ttg/ttg/serialization/stream.h index fe2301f8c..f5991b330 100644 --- a/ttg/ttg/serialization/stream.h +++ b/ttg/ttg/serialization/stream.h @@ -97,7 +97,7 @@ namespace ttg::detail { public: using std::streambuf::streambuf; - byte_istreambuf(char_type* buffer, std::size_t buffer_size = std::numeric_limits::max()) : buffer_(buffer), cursor_(buffer_), buffer_size_(buffer_size) {} + byte_istreambuf(const char_type* buffer, std::size_t buffer_size = std::numeric_limits::max()) : buffer_(buffer), cursor_(buffer_), buffer_size_(buffer_size) {} // hides basic_streambuf::sgetn so can avoid the virtual function dispatch if the compiler is not aggressive enough std::streamsize sgetn(char_type* s, std::streamsize n) noexcept { @@ -112,8 +112,8 @@ namespace ttg::detail { } private: - char_type* buffer_; - char_type* cursor_; // current location in buffer_ + const char_type* buffer_; + const char_type* cursor_; // current location in buffer_ std::streamsize buffer_size_; }; From 76c053e31daf936689dc6c011bf9ed5f9b44af5f Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 29 Feb 2024 11:33:35 -0500 Subject: [PATCH 09/40] [serialization] introduced `byte_{i,o}streambuf::size()` --- ttg/ttg/serialization/stream.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ttg/ttg/serialization/stream.h b/ttg/ttg/serialization/stream.h index f5991b330..848252630 100644 --- a/ttg/ttg/serialization/stream.h +++ b/ttg/ttg/serialization/stream.h @@ -86,6 +86,11 @@ namespace ttg::detail { return n; } + /// number of characters written to the buffer + std::streamsize size() const noexcept { + return cursor_ - buffer_; + } + private: char_type* buffer_; char_type* cursor_; // current location in buffer_ @@ -111,6 +116,11 @@ namespace ttg::detail { return n_to_read; } + /// number of characters read from the buffer + std::streamsize size() const noexcept { + return cursor_ - buffer_; + } + private: const char_type* buffer_; const char_type* cursor_; // current location in buffer_ From 89ced1ad623e328b4c8d4644f45ffa6a54d1f03c Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 29 Feb 2024 11:45:51 -0500 Subject: [PATCH 10/40] [serialization] default_data_descriptor::pack_payload returns the actual number of bytes written + dox --- ttg/ttg/serialization/data_descriptor.h | 195 ++++++++++++++---------- 1 file changed, 116 insertions(+), 79 deletions(-) diff --git a/ttg/ttg/serialization/data_descriptor.h b/ttg/ttg/serialization/data_descriptor.h index f14d00e72..709810b76 100644 --- a/ttg/ttg/serialization/data_descriptor.h +++ b/ttg/ttg/serialization/data_descriptor.h @@ -15,22 +15,41 @@ #include "ttg/serialization/splitmd_data_descriptor.h" -// This provides an efficent API for serializing/deserializing a data type. -// An object of this type will need to be provided for each serializable type. -// The default implementation, in serialization.h, works only for primitive/POD data types; -// backend-specific implementations may be available in backend/serialization.h . +/// This provides an efficient C API for serializing/deserializing a data type to a nonportable contiguous bytestring. +/// An object of this type will need to be provided for each serializable type. +/// The default implementation, in serialization.h, works only for primitive/POD data types; +/// backend-specific implementations may be available in backend/serialization.h . extern "C" struct ttg_data_descriptor { const char *name; + + /// @brief measures the size of the binary representation of @p object + /// @param[in] object pointer to the object to be serialized + /// @return the number of bytes needed for binary representation of @p object uint64_t (*payload_size)(const void *object); - uint64_t (*pack_payload)(const void *object, uint64_t chunk_size, uint64_t pos, void *buf); - void (*unpack_payload)(void *object, uint64_t chunk_size, uint64_t pos, const void *buf); + + /// @brief serializes object to a buffer + /// @param[in] object pointer to the object to be serialized + /// @param[in] max_nbytes_to_write the maximum number of bytes to write + /// @param[in] offset the position in \p buf where the first byte of serialized data will be written + /// @param[in,out] buf the data buffer that will contain the serialized representation of the object + /// @return position in \p buf after the last byte written + uint64_t (*pack_payload)(const void *object, uint64_t max_nbytes_to_write, uint64_t offset, void *buf); + + /// @brief deserializes object from a buffer + /// @param[in] object pointer to the object to be deserialized + /// @param[in] max_nbytes_to_read the maximum number of bytes to read + /// @param[in] offset the position in \p buf where the first byte of serialized data will be read + /// @param[in] buf the data buffer that contains the serialized representation of the object + /// @return position in \p buf after the last byte written + void (*unpack_payload)(void *object, uint64_t max_nbytes_to_read, uint64_t offset, const void *buf); + void (*print)(const void *object); }; namespace ttg { - /** - * \brief Provides (de)serialization of C++ data invocable from C primarily to interface with PaRSEC +/** + * \brief Provides (de)serialization of C++ data that can be invoked from C via ttg_data_descriptor * The default implementation is only provided for POD data types but is efficient in the sense that * it does enable zero-copy remote data transfers. For other data types, optimized implementations @@ -40,7 +59,7 @@ namespace ttg { template struct default_data_descriptor; - /// default_data_descriptor for trivially-copyable types + /// @brief default_data_descriptor for trivially-copyable types /// @tparam T a trivially-copyable type template struct default_data_descriptor< @@ -48,43 +67,46 @@ namespace ttg { !ttg::has_split_metadata::value>> { static constexpr const bool serialize_size_is_const = true; + /// @brief measures the size of the binary representation of @p object /// @param[in] object pointer to the object to be serialized - /// @return size of serialized @p object + /// @return the number of bytes needed for binary representation of @p object static uint64_t payload_size(const void *object) { return static_cast(sizeof(T)); } /// @brief serializes object to a buffer - /// @param[in] object pointer to the object to be serialized - /// @param[in] size the size of @p object in bytes - /// @param[in] begin location in @p buf where the first byte of serialized data will be written - /// @param[in,out] buf the data buffer that will contain serialized data - /// @return location in @p buf after the last byte written - static uint64_t pack_payload(const void *object, uint64_t size, uint64_t begin, void *buf) { + /// @param[in] max_nbytes_to_write the maximum number of bytes to write + /// @param[in] offset the position in \p buf where the first byte of serialized data will be written + /// @param[in,out] buf the data buffer that will contain the serialized representation of the object + /// @return position in \p buf after the last byte written + static uint64_t pack_payload(const void *object, uint64_t max_nbytes_to_write, uint64_t begin, void *buf) { unsigned char *char_buf = reinterpret_cast(buf); - std::memcpy(&char_buf[begin], object, size); - return begin + size; + assert(sizeof(T)<=max_nbytes_to_write); + std::memcpy(&char_buf[begin], object, sizeof(T)); + return begin + sizeof(T); } /// @brief deserializes object from a buffer - - /// @param[in,out] object pointer to the object to be deserialized - /// @param[in] size the size of @p object in bytes - /// @param[in] begin location in @p buf where the first byte of serialized data will be read - /// @param[in] buf the data buffer that contains serialized data - static void unpack_payload(void *object, uint64_t size, uint64_t begin, const void *buf) { + /// @param[in] object pointer to the object to be deserialized + /// @param[in] max_nbytes_to_read the maximum number of bytes to read + /// @param[in] offset the position in \p buf where the first byte of serialized data will be read + /// @param[in] buf the data buffer that contains the serialized representation of the object + /// @return position in \p buf after the last byte written + static void unpack_payload(void *object, uint64_t max_nbytes_to_read, uint64_t begin, const void *buf) { const unsigned char *char_buf = reinterpret_cast(buf); - std::memcpy(object, &char_buf[begin], size); + assert(sizeof(T)<=max_nbytes_to_read); + std::memcpy(object, &char_buf[begin], sizeof(T)); } }; - /// default_data_descriptor for trivially-copyable types - /// @tparam T a trivially-copyable type + /// @brief default_data_descriptor for types that support 2-stage serialization (metadata first, then the rest) for implementing zero-copy transfers + /// @tparam T a type for which `ttg::has_split_metadata::value` is true template struct default_data_descriptor::value>> { static constexpr const bool serialize_size_is_const = false; + /// @brief measures the size of the binary representation of @p object /// @param[in] object pointer to the object to be serialized - /// @return size of serialized @p object + /// @return the number of bytes needed for binary representation of @p object static uint64_t payload_size(const void *object) { SplitMetadataDescriptor smd; const T *t = reinterpret_cast(object); @@ -98,39 +120,40 @@ namespace ttg { } /// @brief serializes object to a buffer - /// @param[in] object pointer to the object to be serialized - /// @param[in] size the size of @p object in bytes - /// @param[in] begin location in @p buf where the first byte of serialized data will be written - /// @param[in,out] buf the data buffer that will contain serialized data - /// @return location in @p buf after the last byte written - static uint64_t pack_payload(const void *object, uint64_t size, uint64_t begin, void *buf) { + /// @param[in] max_nbytes_to_write the maximum number of bytes to write + /// @param[in] offset the position in \p buf where the first byte of serialized data will be written + /// @param[in,out] buf the data buffer that will contain the serialized representation of the object + /// @return position in \p buf after the last byte written + static uint64_t pack_payload(const void *object, uint64_t max_nbytes_to_write, uint64_t begin, void *buf) { SplitMetadataDescriptor smd; const T *t = reinterpret_cast(object); unsigned char *char_buf = reinterpret_cast(buf); auto metadata = smd.get_metadata(t); + assert(sizeof(metadata) <= max_nbytes_to_write); std::memcpy(&char_buf[begin], metadata, sizeof(metadata)); size_t pos = sizeof(metadata); for (auto &&iovec : smd.get_data(t)) { std::memcpy(&char_buf[begin + pos], iovec.data, iovec.num_bytes); pos += iovec.num_bytes; - assert(pos < size); + assert(pos <= max_nbytes_to_write); } - return begin + size; + return begin + pos; } /// @brief deserializes object from a buffer - - /// @param[in,out] object pointer to the object to be deserialized - /// @param[in] size the size of @p object in bytes - /// @param[in] begin location in @p buf where the first byte of serialized data will be read - /// @param[in] buf the data buffer that contains serialized data - static void unpack_payload(void *object, uint64_t size, uint64_t begin, const void *buf) { + /// @param[in] object pointer to the object to be deserialized + /// @param[in] max_nbytes_to_read the maximum number of bytes to read + /// @param[in] offset the position in \p buf where the first byte of serialized data will be read + /// @param[in] buf the data buffer that contains the serialized representation of the object + /// @return position in \p buf after the last byte written + static void unpack_payload(void *object, uint64_t max_nbytes_to_read, uint64_t begin, const void *buf) { SplitMetadataDescriptor smd; T *t = reinterpret_cast(object); using metadata_t = decltype(smd.get_metadata(t)); + assert(sizeof(metadata_t) <= max_nbytes_to_read); const unsigned char *char_buf = reinterpret_cast(buf); const metadata_t *metadata = reinterpret_cast(char_buf + begin); T t_created = smd.create_from_metadata(); @@ -139,7 +162,7 @@ namespace ttg { for (auto &&iovec : smd.get_data(t)) { std::memcpy(iovec.data, &char_buf[begin + pos], iovec.num_bytes); pos += iovec.num_bytes; - assert(pos < size); + assert(pos <= max_nbytes_to_read); } } }; @@ -150,39 +173,45 @@ namespace ttg { namespace ttg { - /// The default implementation for non-POD data types that are not directly copyable - /// and support MADNESS serialization + /// @brief default_data_descriptor for non-POD data types that are not directly copyable or 2-stage serializable and support MADNESS serialization template struct default_data_descriptor< T, std::enable_if_t<((!detail::is_memcpyable_v && detail::is_madness_buffer_serializable_v) || detail::is_madness_user_buffer_serializable_v)&&!ttg::has_split_metadata::value>> { static constexpr const bool serialize_size_is_const = false; + /// @brief measures the size of the binary representation of @p object + /// @param[in] object pointer to the object to be serialized + /// @return the number of bytes needed for binary representation of @p object static uint64_t payload_size(const void *object) { madness::archive::BufferOutputArchive ar; - ar &(*(T *)object); + ar & (*static_cast>>(object)); return static_cast(ar.size()); } - /// object --- obj to be serialized - /// chunk_size --- inputs max amount of data to output, and on output returns amount actually output - /// pos --- position in the input buffer to resume serialization - /// buf[pos] --- place for output - static uint64_t pack_payload(const void *object, uint64_t chunk_size, uint64_t pos, void *_buf) { + /// @brief serializes object to a buffer + /// @param[in] object pointer to the object to be serialized + /// @param[in] max_nbytes_to_write the maximum number of bytes to write + /// @param[in] offset the position in \p buf where the first byte of serialized data will be written + /// @param[in,out] buf the data buffer that will contain the serialized representation of the object + /// @return position in \p buf after the last byte written + static uint64_t pack_payload(const void *object, uint64_t max_nbytes_to_write, uint64_t pos, void *_buf) { unsigned char *buf = reinterpret_cast(_buf); - madness::archive::BufferOutputArchive ar(&buf[pos], chunk_size); - ar &(*(T *)object); - return pos + chunk_size; + madness::archive::BufferOutputArchive ar(&buf[pos], max_nbytes_to_write); + ar & (*static_cast>>(object)); + return pos + ar.size(); } - /// object --- obj to be deserialized - /// chunk_size --- amount of data for input - /// pos --- position in the input buffer to resume deserialization - /// object -- pointer to the object to fill up - static void unpack_payload(void *object, uint64_t chunk_size, uint64_t pos, const void *_buf) { + /// @brief deserializes object from a buffer + /// @param[in] object pointer to the object to be deserialized + /// @param[in] max_nbytes_to_read the maximum number of bytes to read + /// @param[in] offset the position in \p buf where the first byte of serialized data will be read + /// @param[in] buf the data buffer that contains the serialized representation of the object + /// @return position in \p buf after the last byte written + static void unpack_payload(void *object, uint64_t max_nbytes_to_read, uint64_t pos, const void *_buf) { const unsigned char *buf = reinterpret_cast(_buf); - madness::archive::BufferInputArchive ar(&buf[pos], chunk_size); - ar &(*(T *)object); + madness::archive::BufferInputArchive ar(&buf[pos], max_nbytes_to_read); + ar & (*static_cast>(object)); } }; @@ -196,8 +225,7 @@ namespace ttg { namespace ttg { - /// The default implementation for non-POD data types that are not directly copyable, - /// do not support MADNESS serialization, and support Boost serialization + /// @brief default_data_descriptor for non-POD data types that are not directly copyable, not 2-stage serializable, do not support MADNESS serialization, and support Boost serialization template struct default_data_descriptor< T, std::enable_if_t<(!detail::is_memcpyable_v && !detail::is_madness_buffer_serializable_v && @@ -206,29 +234,38 @@ namespace ttg { detail::is_boost_user_buffer_serializable_v)>> { static constexpr const bool serialize_size_is_const = false; + /// @brief measures the size of the binary representation of @p object + /// @param[in] object pointer to the object to be serialized + /// @return the number of bytes needed for binary representation of @p object static uint64_t payload_size(const void *object) { ttg::detail::boost_counting_oarchive oa; - oa << (*(T *)object); + oa << (*static_cast>>(object)); return oa.streambuf().size(); } - /// object --- obj to be serialized - /// chunk_size --- inputs max amount of data to output, and on output returns amount actually output - /// pos --- position in the input buffer to resume serialization - /// buf[pos] --- place for output - static uint64_t pack_payload(const void *object, uint64_t chunk_size, uint64_t pos, void *_buf) { - auto oa = ttg::detail::make_boost_buffer_oarchive(_buf, pos + chunk_size, pos); - oa << (*(T *)object); - return pos + chunk_size; + /// @brief serializes object to a buffer + /// @param[in] object pointer to the object to be serialized + /// @param[in] max_nbytes_to_write the maximum number of bytes to write + /// @param[in] offset the position in \p buf where the first byte of serialized data will be written + /// @param[in,out] buf the data buffer that will contain the serialized representation of the object + /// @return position in \p buf after the last byte written + static uint64_t pack_payload(const void *object, uint64_t max_nbytes_to_write, uint64_t pos, void *buf) { + auto oa = ttg::detail::make_boost_buffer_oarchive(buf, pos + max_nbytes_to_write, pos); + oa << (*static_cast>>(object)); + assert(oa.streambuf().size() <= max_nbytes_to_write); + return pos + oa.streambuf().size(); } - /// object --- obj to be deserialized - /// chunk_size --- amount of data for input - /// pos --- position in the input buffer to resume deserialization - /// object -- pointer to the object to fill up - static void unpack_payload(void *object, uint64_t chunk_size, uint64_t pos, const void *_buf) { - auto ia = ttg::detail::make_boost_buffer_iarchive(_buf, pos + chunk_size, pos); - ia >> (*(T *)object); + /// @brief deserializes object from a buffer + /// @param[in] object pointer to the object to be deserialized + /// @param[in] max_nbytes_to_read the maximum number of bytes to read + /// @param[in] offset the position in \p buf where the first byte of serialized data will be read + /// @param[in] buf the data buffer that contains the serialized representation of the object + /// @return position in \p buf after the last byte written + static void unpack_payload(void *object, uint64_t max_nbytes_to_read, uint64_t pos, const void *buf) { + auto ia = ttg::detail::make_boost_buffer_iarchive(buf, pos + max_nbytes_to_read, pos); + ia >> (*static_cast>(object)); + assert(ia.streambuf().size() <= max_nbytes_to_read); } }; From 84ea6f41382244137bc8b8baf41a99bd9be1bb93 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 29 Feb 2024 11:54:19 -0500 Subject: [PATCH 11/40] [serialization] default_data_descriptor::unpack_payload returns the actual number of bytes read to be symmetric wrt pack_payload --- tests/unit/serialization.cc | 4 ++-- ttg/ttg/parsec/ttg.h | 14 ++++++-------- ttg/ttg/serialization/data_descriptor.h | 24 ++++++++++++++---------- 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/tests/unit/serialization.cc b/tests/unit/serialization.cc index e12607de3..772feaea6 100644 --- a/tests/unit/serialization.cc +++ b/tests/unit/serialization.cc @@ -504,7 +504,7 @@ TEST_CASE("MADNESS Serialization", "[serialization]") { T g_obj; void* g = (void*)&g_obj; - CHECK_NOTHROW(d->unpack_payload(g, obj_size, 0, buf.get())); + CHECK(d->unpack_payload(g, obj_size, 0, buf.get()) == pos); }; test(99); @@ -755,7 +755,7 @@ TEST_CASE("TTG Serialization", "[serialization]") { T g_obj; void* g = (void*)&g_obj; - CHECK_NOTHROW(d->unpack_payload(g, obj_size, 0, buf.get())); + CHECK(d->unpack_payload(g, obj_size, 0, buf.get()) == pos); }; test(99); diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 65e39991a..1c31e69ec 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -1835,13 +1835,12 @@ ttg::abort(); // should not happen uint64_t payload_size; if constexpr (!ttg::default_data_descriptor>::serialize_size_is_const) { const ttg_data_descriptor *dSiz = ttg::get_data_descriptor(); - dSiz->unpack_payload(&payload_size, sizeof(uint64_t), pos, _bytes); - pos += sizeof(uint64_t); + pos = dSiz->unpack_payload(&payload_size, sizeof(uint64_t), pos, _bytes); } else { payload_size = dObj->payload_size(&obj); } - dObj->unpack_payload(&obj, payload_size, pos, _bytes); - return pos + payload_size; + pos = dObj->unpack_payload(&obj, payload_size, pos, _bytes); + return pos; } template @@ -1855,11 +1854,10 @@ ttg::abort(); // should not happen if constexpr (!ttg::default_data_descriptor>::serialize_size_is_const) { const ttg_data_descriptor *dSiz = ttg::get_data_descriptor(); - dSiz->pack_payload(&payload_size, sizeof(uint64_t), pos, bytes); - pos += sizeof(uint64_t); + pos = dSiz->pack_payload(&payload_size, sizeof(uint64_t), pos, bytes); } - dObj->pack_payload(&obj, payload_size, pos, bytes); - return pos + payload_size; + pos = dObj->pack_payload(&obj, payload_size, pos, bytes); + return pos; } static void static_set_arg(void *data, std::size_t size, ttg::TTBase *bop) { diff --git a/ttg/ttg/serialization/data_descriptor.h b/ttg/ttg/serialization/data_descriptor.h index 709810b76..da9bb1088 100644 --- a/ttg/ttg/serialization/data_descriptor.h +++ b/ttg/ttg/serialization/data_descriptor.h @@ -40,8 +40,8 @@ extern "C" struct ttg_data_descriptor { /// @param[in] max_nbytes_to_read the maximum number of bytes to read /// @param[in] offset the position in \p buf where the first byte of serialized data will be read /// @param[in] buf the data buffer that contains the serialized representation of the object - /// @return position in \p buf after the last byte written - void (*unpack_payload)(void *object, uint64_t max_nbytes_to_read, uint64_t offset, const void *buf); + /// @return position in \p buf after the last byte read + uint64_t (*unpack_payload)(void *object, uint64_t max_nbytes_to_read, uint64_t offset, const void *buf); void (*print)(const void *object); }; @@ -90,11 +90,12 @@ namespace ttg { /// @param[in] max_nbytes_to_read the maximum number of bytes to read /// @param[in] offset the position in \p buf where the first byte of serialized data will be read /// @param[in] buf the data buffer that contains the serialized representation of the object - /// @return position in \p buf after the last byte written - static void unpack_payload(void *object, uint64_t max_nbytes_to_read, uint64_t begin, const void *buf) { + /// @return position in \p buf after the last byte read + static uint64_t unpack_payload(void *object, uint64_t max_nbytes_to_read, uint64_t begin, const void *buf) { const unsigned char *char_buf = reinterpret_cast(buf); assert(sizeof(T)<=max_nbytes_to_read); std::memcpy(object, &char_buf[begin], sizeof(T)); + return begin + sizeof(T); } }; @@ -147,8 +148,8 @@ namespace ttg { /// @param[in] max_nbytes_to_read the maximum number of bytes to read /// @param[in] offset the position in \p buf where the first byte of serialized data will be read /// @param[in] buf the data buffer that contains the serialized representation of the object - /// @return position in \p buf after the last byte written - static void unpack_payload(void *object, uint64_t max_nbytes_to_read, uint64_t begin, const void *buf) { + /// @return position in \p buf after the last byte read + static uint64_t unpack_payload(void *object, uint64_t max_nbytes_to_read, uint64_t begin, const void *buf) { SplitMetadataDescriptor smd; T *t = reinterpret_cast(object); @@ -164,6 +165,7 @@ namespace ttg { pos += iovec.num_bytes; assert(pos <= max_nbytes_to_read); } + return begin + pos; } }; @@ -207,11 +209,12 @@ namespace ttg { /// @param[in] max_nbytes_to_read the maximum number of bytes to read /// @param[in] offset the position in \p buf where the first byte of serialized data will be read /// @param[in] buf the data buffer that contains the serialized representation of the object - /// @return position in \p buf after the last byte written - static void unpack_payload(void *object, uint64_t max_nbytes_to_read, uint64_t pos, const void *_buf) { + /// @return position in \p buf after the last byte read + static uint64_t unpack_payload(void *object, uint64_t max_nbytes_to_read, uint64_t pos, const void *_buf) { const unsigned char *buf = reinterpret_cast(_buf); madness::archive::BufferInputArchive ar(&buf[pos], max_nbytes_to_read); ar & (*static_cast>(object)); + return pos + (max_nbytes_to_read - ar.nbyte_avail()); } }; @@ -261,11 +264,12 @@ namespace ttg { /// @param[in] max_nbytes_to_read the maximum number of bytes to read /// @param[in] offset the position in \p buf where the first byte of serialized data will be read /// @param[in] buf the data buffer that contains the serialized representation of the object - /// @return position in \p buf after the last byte written - static void unpack_payload(void *object, uint64_t max_nbytes_to_read, uint64_t pos, const void *buf) { + /// @return position in \p buf after the last byte read + static uint64_t unpack_payload(void *object, uint64_t max_nbytes_to_read, uint64_t pos, const void *buf) { auto ia = ttg::detail::make_boost_buffer_iarchive(buf, pos + max_nbytes_to_read, pos); ia >> (*static_cast>(object)); assert(ia.streambuf().size() <= max_nbytes_to_read); + return pos + ia.streambuf().size(); } }; From c013f36bd432c3ed2f4f3b0126fe15cbec72d493 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 29 Feb 2024 15:21:27 -0500 Subject: [PATCH 12/40] [serialization] remove the uses of get_data_descriptor in PaRSEC backend, use default_data_descriptor instead directly --- ttg/ttg/parsec/ttg.h | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 1c31e69ec..f4653f962 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -1831,32 +1831,30 @@ ttg::abort(); // should not happen protected: template uint64_t unpack(T &obj, void *_bytes, uint64_t pos) { - const ttg_data_descriptor *dObj = ttg::get_data_descriptor>(); + using dd_t = ttg::default_data_descriptor>; uint64_t payload_size; - if constexpr (!ttg::default_data_descriptor>::serialize_size_is_const) { - const ttg_data_descriptor *dSiz = ttg::get_data_descriptor(); - pos = dSiz->unpack_payload(&payload_size, sizeof(uint64_t), pos, _bytes); + if constexpr (!dd_t::serialize_size_is_const) { + pos = ttg::default_data_descriptor::unpack_payload(&payload_size, sizeof(uint64_t), pos, _bytes); } else { - payload_size = dObj->payload_size(&obj); + payload_size = dd_t::payload_size(&obj); } - pos = dObj->unpack_payload(&obj, payload_size, pos, _bytes); + pos = dd_t::unpack_payload(&obj, payload_size, pos, _bytes); return pos; } template uint64_t pack(T &obj, void *bytes, uint64_t pos, detail::ttg_data_copy_t *copy = nullptr) { - const ttg_data_descriptor *dObj = ttg::get_data_descriptor>(); - uint64_t payload_size = dObj->payload_size(&obj); + using dd_t = ttg::default_data_descriptor>; + uint64_t payload_size = dd_t::payload_size(&obj); if (copy) { /* reset any tracked data, we don't care about the packing from the payload size */ copy->iovec_reset(); } - if constexpr (!ttg::default_data_descriptor>::serialize_size_is_const) { - const ttg_data_descriptor *dSiz = ttg::get_data_descriptor(); - pos = dSiz->pack_payload(&payload_size, sizeof(uint64_t), pos, bytes); + if constexpr (!dd_t::serialize_size_is_const) { + pos = ttg::default_data_descriptor::pack_payload(&payload_size, sizeof(uint64_t), pos, bytes); } - pos = dObj->pack_payload(&obj, payload_size, pos, bytes); + pos = dd_t::pack_payload(&obj, payload_size, pos, bytes); return pos; } From 8c7fc0cb22fb7feec54feb2dcd05092deb6b1ee7 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 3 Apr 2024 12:58:27 -0400 Subject: [PATCH 13/40] SPMM: add Boost::graph as dependency Signed-off-by: Joseph Schuchart --- cmake/modules/FindOrFetchBoost.cmake | 1 + examples/CMakeLists.txt | 8 +++++--- examples/spmm/spmm.cc | 6 +++++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/cmake/modules/FindOrFetchBoost.cmake b/cmake/modules/FindOrFetchBoost.cmake index 1e133eee0..34550ecac 100644 --- a/cmake/modules/FindOrFetchBoost.cmake +++ b/cmake/modules/FindOrFetchBoost.cmake @@ -23,6 +23,7 @@ if (TTG_PARSEC_USE_BOOST_SERIALIZATION) list(APPEND optional_components serialization iostreams + graph ) endif() diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 10b808540..bbcc0e9e0 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -7,10 +7,12 @@ add_ttg_executable(t9-streaming t9/t9_streaming.cc) # sparse matmul need Eigen ... it's always provided by TA if (TARGET tiledarray) # MADworld used for MADNESS serialization - add_ttg_executable(spmm spmm/spmm.cc LINK_LIBRARIES TiledArray_Eigen) + add_ttg_executable(spmm spmm/spmm.cc LINK_LIBRARIES TiledArray_Eigen $ + COMPILE_DEFINITIONS $<$:HAVE_BOOST_GRAPH=1>) # block-sparse needs BTAS ... it's always provided by TA # since only need to use matrices, limit BTAS_TARGET_MAX_INDEX_RANK to 2 - add_ttg_executable(bspmm spmm/spmm.cc LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2) + add_ttg_executable(bspmm spmm/spmm.cc LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS + COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2) add_ttg_executable(testing_dpotrf potrf/testing_dpotrf.cc LINK_LIBRARIES tiledarray lapackpp) add_ttg_executable(testing_dtrtri potrf/testing_dtrtri.cc LINK_LIBRARIES tiledarray lapackpp) @@ -37,7 +39,7 @@ if (TARGET tiledarray) if (TARGET roc::hipsolver) add_ttg_executable(testing_dpotrf_hip potrf/testing_dpotrf.cc LINK_LIBRARIES lapackpp tiledarray roc::hipblas roc::hipsolver - COMPILE_DEFINITIONS TTG_ENABLE_HIP=1;DEBUG_TILES_VALUES=1 + COMPILE_DEFINITIONS TTG_ENABLE_HIP=1 #;DEBUG_TILES_VALUES=1 RUNTIMES "parsec") endif(TARGET roc::hipsolver) elseif (TARGET MKL::MKL_DPCPP) diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc index 0c85eb3d4..e61bbffa3 100644 --- a/examples/spmm/spmm.cc +++ b/examples/spmm/spmm.cc @@ -21,8 +21,8 @@ #endif #include -#include #if !defined(BLOCK_SPARSE_GEMM) +#include #include #include #include @@ -1104,6 +1104,7 @@ static void initSpMatrixMarket(const std::function &)> &keymap, K = (int)A.cols(); } +#ifdef HAVE_BOOST_GRAPH static void initSpRmat(const std::function &)> &keymap, const char *opt, SpMatrix<> &A, SpMatrix<> &B, SpMatrix<> &C, int &M, int &N, int &K, unsigned long seed) { int E; @@ -1160,6 +1161,7 @@ static void initSpRmat(const std::function &)> &keymap, const c std::cout << "#R-MAT: " << E << " nonzero elements, density: " << (double)nnz / (double)N / (double)N << std::endl; } } +#endif // HAVE_BOOST_GRAPH static void initSpHardCoded(const std::function &)> &keymap, SpMatrix<> &A, SpMatrix<> &B, SpMatrix<> &C, int &m, int &n, int &k) { @@ -1802,10 +1804,12 @@ int main(int argc, char **argv) { char *filename = getCmdOption(argv, argv + argc, "-mm"); tiling_type = filename; initSpMatrixMarket(ij_keymap, filename, A, B, C, M, N, K); +#ifdef HAVE_BOOST_GRAPH } else if (cmdOptionExists(argv, argv + argc, "-rmat")) { char *opt = getCmdOption(argv, argv + argc, "-rmat"); tiling_type = "RandomSparseMatrix"; initSpRmat(ij_keymap, opt, A, B, C, M, N, K, seed); +#endif // HAVE_BOOST_GRAPH } else { tiling_type = "HardCodedSparseMatrix"; initSpHardCoded(ij_keymap, A, B, C, M, N, K); From 7a56f2e5e11aa3941d365988355fcb3366626b05 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 4 Apr 2024 10:07:58 -0400 Subject: [PATCH 14/40] Pass argc/argv to allocator_init They are needed to initialize madness and thus the TA allocators. Signed-off-by: Joseph Schuchart --- examples/matrixtile.h | 4 ++-- examples/potrf/testing_dlauum.cc | 2 +- examples/potrf/testing_dpoinv.cc | 2 +- examples/potrf/testing_dpotrf.cc | 2 +- examples/potrf/testing_dtrtri.cc | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/matrixtile.h b/examples/matrixtile.h index a2080f3ed..a759b1c3c 100644 --- a/examples/matrixtile.h +++ b/examples/matrixtile.h @@ -14,7 +14,7 @@ #if defined(TILEDARRAY_HAS_DEVICE) #define ALLOCATOR TiledArray::device_pinned_allocator -inline void allocator_init() { +inline void allocator_init(int argc, char **argv) { // initialize MADNESS so that TA allocators can be created #if defined(TTG_PARSEC_IMPORTED) madness::ParsecRuntime::initialize_with_existing_context(ttg::default_execution_context().impl().context()); @@ -28,7 +28,7 @@ inline void allocator_fini() { #else // TILEDARRAY_HAS_DEVICE #define ALLOCATOR std::allocator -inline void allocator_init() { } +inline void allocator_init(int argc, char **argv) { } inline void allocator_fini() { } diff --git a/examples/potrf/testing_dlauum.cc b/examples/potrf/testing_dlauum.cc index df87c63a6..bab6d676c 100644 --- a/examples/potrf/testing_dlauum.cc +++ b/examples/potrf/testing_dlauum.cc @@ -59,7 +59,7 @@ int main(int argc, char **argv) ttg::initialize(argc, argv, nthreads); /* set up TA to get the allocator */ - allocator_init(); + allocator_init(argc, argv); auto world = ttg::default_execution_context(); diff --git a/examples/potrf/testing_dpoinv.cc b/examples/potrf/testing_dpoinv.cc index bd10b24b7..646f2477e 100644 --- a/examples/potrf/testing_dpoinv.cc +++ b/examples/potrf/testing_dpoinv.cc @@ -95,7 +95,7 @@ int main(int argc, char **argv) delete[] ttg_argv; /* set up TA to get the allocator */ - allocator_init(); + allocator_init(argc, argv); ttg::trace_on(); diff --git a/examples/potrf/testing_dpotrf.cc b/examples/potrf/testing_dpotrf.cc index 1658dbed3..6b80bb68c 100644 --- a/examples/potrf/testing_dpotrf.cc +++ b/examples/potrf/testing_dpotrf.cc @@ -65,7 +65,7 @@ int main(int argc, char **argv) ttg::initialize(1, argv, nthreads); /* set up TA to get the allocator */ - allocator_init(); + allocator_init(argc, argv); auto world = ttg::default_execution_context(); if(nullptr != prof_filename) { diff --git a/examples/potrf/testing_dtrtri.cc b/examples/potrf/testing_dtrtri.cc index bebe8cccf..3124ab2be 100644 --- a/examples/potrf/testing_dtrtri.cc +++ b/examples/potrf/testing_dtrtri.cc @@ -75,7 +75,7 @@ int main(int argc, char **argv) ttg::initialize(argc, argv, nthreads); /* set up TA to get the allocator */ - allocator_init(); + allocator_init(argc, argv); auto world = ttg::default_execution_context(); From 5ea99884830384faac08d0e30e7b07d36ca17a40 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 23 May 2024 05:16:22 -0400 Subject: [PATCH 15/40] [ci] remove gcc refs on macos runner --- .github/workflows/cmake.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 68933c61c..5e311df1f 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -45,7 +45,7 @@ jobs: - name: Install prerequisite MacOS packages if: ${{ matrix.os == 'macos-latest' }} - run: brew install ninja gcc@10 boost eigen open-mpi bison ccache + run: brew install ninja boost eigen open-mpi bison ccache - name: Install prerequisites Ubuntu packages if: ${{ matrix.os == 'ubuntu-22.04' }} From 6c69a794c0786e8833fe41d23e01635fc1f13ee1 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 4 Apr 2024 15:21:26 -0400 Subject: [PATCH 16/40] Move copy version increment to CPU hook The new copy versioning in PaRSEC requires this only for CPU tasks to make sure the results are handled correctly. Signed-off-by: Joseph Schuchart --- cmake/modules/ExternalDependenciesVersions.cmake | 2 +- cmake/modules/FindOrFetchPARSEC.cmake | 2 +- ttg/ttg/parsec/ttg.h | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake index fd96e5816..2e79bdb88 100644 --- a/cmake/modules/ExternalDependenciesVersions.cmake +++ b/cmake/modules/ExternalDependenciesVersions.cmake @@ -4,7 +4,7 @@ set(TTG_TRACKED_VG_CMAKE_KIT_TAG 7ea2d4d3f8854b9e417f297fd74d6fc49aa13fd5) # used to provide "real" FindOrFetchBoost set(TTG_TRACKED_CATCH2_VERSION 3.5.0) set(TTG_TRACKED_MADNESS_TAG 2eb3bcf0138127ee2dbc651f1aabd3e9b0def4e3) -set(TTG_TRACKED_PARSEC_TAG 0b3140f58ad9dc78a3d64da9fd73ecc7f443ece7) +set(TTG_TRACKED_PARSEC_TAG fc18d7881059a520e73cb7ccb6b56e6da705635e) set(TTG_TRACKED_BTAS_TAG 4e8f5233aa7881dccdfcc37ce07128833926d3c2) set(TTG_TRACKED_TILEDARRAY_TAG 493c109379a1b64ddd5ef59f7e33b95633b68d73) diff --git a/cmake/modules/FindOrFetchPARSEC.cmake b/cmake/modules/FindOrFetchPARSEC.cmake index 7b164019f..2a663f914 100644 --- a/cmake/modules/FindOrFetchPARSEC.cmake +++ b/cmake/modules/FindOrFetchPARSEC.cmake @@ -17,7 +17,7 @@ if (NOT TARGET PaRSEC::parsec) FetchContent_Declare( PARSEC - GIT_REPOSITORY https://github.com/devreal/parsec-1.git + GIT_REPOSITORY https://github.com/abouteiller/parsec.git GIT_TAG ${TTG_TRACKED_PARSEC_TAG} ) FetchContent_MakeAvailable(PARSEC) diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index f4653f962..048b15a0d 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -740,6 +740,7 @@ namespace ttg_parsec { inline void transfer_ownership_impl(ttg_data_copy_t *copy, int device) { if constexpr(!std::is_const_v>) { copy->transfer_ownership(PARSEC_FLOW_ACCESS_RW, device); + copy->inc_current_version(); } } @@ -4266,8 +4267,6 @@ struct ttg::detail::value_copy_handler { /* this copy won't be modified anymore so mark it as read-only */ copy->reset_readers(); } - /* the value was potentially changed, so increment version */ - copy->inc_current_version(); } /* We're coming from a writer so mark the data as modified. * That way we can force a pushout in prepare_send if we move to read-only tasks (needed by PaRSEC). */ From de6d8c1084a2bcffc3a1059ad857ff21cec9226b Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 4 Apr 2024 18:49:44 -0400 Subject: [PATCH 17/40] Initialize parsec_task data[].data_in to NULL Not sure if this really needed. Would prefer to remove this again. Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/task.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h index 5b23d53af..fa415375f 100644 --- a/ttg/ttg/parsec/task.h +++ b/ttg/ttg/parsec/task.h @@ -150,6 +150,9 @@ namespace ttg_parsec { parsec_task.mempool_owner = mempool; parsec_task.task_class = task_class; parsec_task.priority = 0; + + // TODO: can we avoid this? + for (int i = 0; i < MAX_PARAM_COUNT; ++i) { this->parsec_task.data[i].data_in = nullptr; } } parsec_ttg_task_base_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class, @@ -168,6 +171,9 @@ namespace ttg_parsec { parsec_task.taskpool = taskpool; parsec_task.priority = priority; parsec_task.chore_mask = 1<<0; + + // TODO: can we avoid this? + for (int i = 0; i < MAX_PARAM_COUNT; ++i) { this->parsec_task.data[i].data_in = nullptr; } } public: From 6492573d36ce85d7bb12c253cd81d4432d50dba7 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 4 Apr 2024 18:50:17 -0400 Subject: [PATCH 18/40] Remove use of parsec_get_best_device This is now handled by parsec directly. Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/ttg.h | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 048b15a0d..5d0992053 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -1454,7 +1454,6 @@ namespace ttg_parsec { static parsec_hook_return_t device_static_op(parsec_task_t* parsec_task) { static_assert(derived_has_device_op()); - int dev_index; double ratio = 1.0; task_t *task = (task_t*)parsec_task; @@ -1469,7 +1468,7 @@ namespace ttg_parsec { PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t); gpu_task->ec = parsec_task; gpu_task->task_type = 0; // user task - gpu_task->load = 1; // TODO: can we do better? + //gpu_task->load = 1; // TODO: can we do better? gpu_task->last_data_check_epoch = -1; // used internally gpu_task->pushout = 0; gpu_task->submit = &TT::device_static_submit; @@ -1513,20 +1512,11 @@ namespace ttg_parsec { /* TODO: is this the right place to set the mask? */ task->parsec_task.chore_mask = PARSEC_DEV_ALL; - /* get a device and come back if we need another one */ - int64_t task_load = 1; - dev_index = parsec_get_best_device(parsec_task, &task_load); /* swap back the original task class */ task->parsec_task.task_class = tmp; - gpu_task->load = task_load; - assert(dev_index >= 0); - if (!parsec_mca_device_is_gpu(dev_index)) { - return PARSEC_HOOK_RETURN_NEXT; /* Fall back */ - } - - parsec_device_gpu_module_t *device = (parsec_device_gpu_module_t*)parsec_mca_device_get(dev_index); + parsec_device_gpu_module_t *device = (parsec_device_gpu_module_t*)task->parsec_task.selected_device; assert(NULL != device); task->dev_ptr->device = device; @@ -3745,7 +3735,7 @@ ttg::abort(); // should not happen ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE; ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL; ((__parsec_chore_t *)self.incarnations)[1].hook = NULL; - } else if (derived_has_hip_op()) { + } else if constexpr (derived_has_hip_op()) { self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t)); ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_HIP; ((__parsec_chore_t *)self.incarnations)[0].evaluate = NULL; @@ -3754,7 +3744,8 @@ ttg::abort(); // should not happen ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE; ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL; ((__parsec_chore_t *)self.incarnations)[1].hook = NULL; - } else if (derived_has_level_zero_op()) { +#if defined(PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT) + } else if constexpr (derived_has_level_zero_op()) { self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t)); ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_LEVEL_ZERO; ((__parsec_chore_t *)self.incarnations)[0].evaluate = NULL; @@ -3763,6 +3754,7 @@ ttg::abort(); // should not happen ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE; ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL; ((__parsec_chore_t *)self.incarnations)[1].hook = NULL; +#endif // PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT } else { self.incarnations = (__parsec_chore_t *)malloc(2 * sizeof(__parsec_chore_t)); ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_CPU; From 3be65cf2f973abf333d0f5dce75b88a51a6dffbd Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 5 Apr 2024 18:57:13 -0400 Subject: [PATCH 19/40] Use the evaluate hook to query input data before calling the compute hook PaRSEC needs to know what input data we have and needs to be available in the taskclass, so put a copy of the task-class into the task object. Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/task.h | 10 +++ ttg/ttg/parsec/ttg.h | 144 ++++++++++++++++++++++++++++-------------- 2 files changed, 105 insertions(+), 49 deletions(-) diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h index fa415375f..d60c87f3c 100644 --- a/ttg/ttg/parsec/task.h +++ b/ttg/ttg/parsec/task.h @@ -15,6 +15,7 @@ namespace ttg_parsec { parsec_flow_t* flows = nullptr; parsec_gpu_exec_stream_t* stream = nullptr; parsec_device_gpu_module_t* device = nullptr; + parsec_task_class_t task_class; // copy of the taskclass }; template @@ -240,6 +241,15 @@ namespace ttg_parsec { } } + template + parsec_hook_return_t invoke_evaluate() { + if constexpr (Space == ttg::ExecutionSpace::Host) { + return PARSEC_HOOK_RETURN_DONE; + } else { + return TT::template device_static_evaluate(&this->parsec_task); + } + } + parsec_key_t pkey() { return reinterpret_cast(&key); } }; diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 5d0992053..6f46e1c75 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -766,7 +766,7 @@ namespace ttg_parsec { parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; return me->template invoke_op(); } else { - throw std::runtime_error("PaRSEC CUDA hook invoked on a TT that does not support CUDA operations!"); + return PARSEC_HOOK_RETURN_NEXT; } } @@ -776,7 +776,7 @@ namespace ttg_parsec { parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; return me->template invoke_op(); } else { - throw std::runtime_error("PaRSEC HIP hook invoked on a TT that does not support HIP operations!"); + return PARSEC_HOOK_RETURN_NEXT; } } @@ -786,10 +786,42 @@ namespace ttg_parsec { parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; return me->template invoke_op(); } else { - throw std::runtime_error("PaRSEC HIP hook invoked on a TT that does not support HIP operations!"); + return PARSEC_HOOK_RETURN_NEXT; } } + + template + inline parsec_hook_return_t evaluate_cuda(struct parsec_execution_stream_s *es, parsec_task_t *parsec_task) { + if constexpr(TT::derived_has_cuda_op()) { + parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; + return me->template invoke_evaluate(); + } else { + return PARSEC_HOOK_RETURN_NEXT; + } + } + + template + inline parsec_hook_return_t evaluate_hip(struct parsec_execution_stream_s *es, parsec_task_t *parsec_task) { + if constexpr(TT::derived_has_hip_op()) { + parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; + return me->template invoke_evaluate(); + } else { + return PARSEC_HOOK_RETURN_NEXT; + } + } + + template + inline parsec_hook_return_t evaluate_level_zero(struct parsec_execution_stream_s *es, parsec_task_t *parsec_task) { + if constexpr(TT::derived_has_level_zero_op()) { + parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; + return me->template invoke_evaluate(); + } else { + return PARSEC_HOOK_RETURN_NEXT; + } + } + + template class rma_delayed_activate { std::vector _keylist; @@ -1451,35 +1483,69 @@ namespace ttg_parsec { } template - static parsec_hook_return_t device_static_op(parsec_task_t* parsec_task) { - static_assert(derived_has_device_op()); - - double ratio = 1.0; + static parsec_hook_return_t device_static_evaluate(parsec_task_t* parsec_task) { task_t *task = (task_t*)parsec_task; - parsec_execution_stream_s *es = task->tt->world.impl().execution_stream(); + if (task->dev_ptr->gpu_task == nullptr) { + + //std::cout << "device_static_op: task " << parsec_task << std::endl; + + /* set up a device task */ + parsec_gpu_task_t *gpu_task; + /* PaRSEC wants to free the gpu_task, because F***K ownerships */ + gpu_task = static_cast(std::calloc(1, sizeof(*gpu_task))); + PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t); + gpu_task->ec = parsec_task; + gpu_task->task_type = 0; // user task + //gpu_task->load = 1; // TODO: can we do better? + gpu_task->last_data_check_epoch = -1; // used internally + gpu_task->pushout = 0; + gpu_task->submit = &TT::device_static_submit; - //std::cout << "device_static_op: task " << parsec_task << std::endl; + /* set the gpu_task so it's available in register_device_memory */ + task->dev_ptr->gpu_task = gpu_task; - /* set up a device task */ - parsec_gpu_task_t *gpu_task; - /* PaRSEC wants to free the gpu_task, because F***K ownerships */ - gpu_task = static_cast(std::calloc(1, sizeof(*gpu_task))); - PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t); - gpu_task->ec = parsec_task; - gpu_task->task_type = 0; // user task - //gpu_task->load = 1; // TODO: can we do better? - gpu_task->last_data_check_epoch = -1; // used internally - gpu_task->pushout = 0; - gpu_task->submit = &TT::device_static_submit; + /* TODO: is this the right place to set the mask? */ + task->parsec_task.chore_mask = PARSEC_DEV_ALL; - /* set the gpu_task so it's available in register_device_memory */ - task->dev_ptr->gpu_task = gpu_task; + /* copy over the task class, because that's what we need */ + task->dev_ptr->task_class = *task->parsec_task.task_class; - // first invocation of the coroutine to get the coroutine handle - static_op(parsec_task); + // first invocation of the coroutine to get the coroutine handle + static_op(parsec_task); - /* when we come back here, the flows in gpu_task are set (see register_device_memory) */ + /* when we come back here, the flows in gpu_task are set (see register_device_memory) */ + + parsec_task_class_t& tc = *task->dev_ptr->task_class; + + // input flows are set up during register_device_memory as part of the first invocation above + for (int i = 0; i < MAX_PARAM_COUNT; ++i) { + tc.in[i] = gpu_task->flow[i]; + tc.out[i] = gpu_task->flow[i]; + } + tc.nb_flows = MAX_PARAM_COUNT; + + /* set the new task class that contains the flows */ + task->parsec_task.task_class = task->dev_ptr->task_class; + + /* select this one */ + return PARSEC_HOOK_RETURN_DONE; + } + + std::cerr << "EVALUATE called on task with assigned GPU task!" << std::endl; + + /* not sure if this might happen*/ + return PARSEC_HOOK_RETURN_ERROR; + + } + + template + static parsec_hook_return_t device_static_op(parsec_task_t* parsec_task) { + static_assert(derived_has_device_op()); + + /* when we come in here we have a device assigned and are ready to go */ + + task_t *task = (task_t*)parsec_task; if (nullptr == task->suspended_task_address) { /* short-cut in case the task returned immediately */ @@ -1495,31 +1561,11 @@ namespace ttg_parsec { /* for now make sure we're waiting for transfers and the coro hasn't skipped this step */ assert(dev_data.state() == ttg::device::detail::TTG_DEVICE_CORO_WAIT_TRANSFER); - /* set up a temporary task-class to correctly specify the flows */ - parsec_task_class_t tc = *task->parsec_task.task_class; - - tc.name = task->parsec_task.task_class->name; - // input flows are set up during register_device_memory as part of the first invocation above - for (int i = 0; i < MAX_PARAM_COUNT; ++i) { - tc.in[i] = gpu_task->flow[i]; - tc.out[i] = gpu_task->flow[i]; - } - tc.nb_flows = MAX_PARAM_COUNT; - - /* swap in the new task class */ - const parsec_task_class_t* tmp = task->parsec_task.task_class; - *const_cast(&task->parsec_task.task_class) = &tc; - - /* TODO: is this the right place to set the mask? */ - task->parsec_task.chore_mask = PARSEC_DEV_ALL; - - /* swap back the original task class */ - task->parsec_task.task_class = tmp; - parsec_device_gpu_module_t *device = (parsec_device_gpu_module_t*)task->parsec_task.selected_device; assert(NULL != device); task->dev_ptr->device = device; + parsec_execution_stream_s *es = task->tt->world.impl().execution_stream(); switch(device->super.type) { @@ -3730,7 +3776,7 @@ ttg::abort(); // should not happen if constexpr (derived_has_cuda_op()) { self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t)); ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_CUDA; - ((__parsec_chore_t *)self.incarnations)[0].evaluate = NULL; + ((__parsec_chore_t *)self.incarnations)[0].evaluate = &detail::evaluate_cuda; ((__parsec_chore_t *)self.incarnations)[0].hook = &detail::hook_cuda; ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE; ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL; @@ -3738,7 +3784,7 @@ ttg::abort(); // should not happen } else if constexpr (derived_has_hip_op()) { self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t)); ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_HIP; - ((__parsec_chore_t *)self.incarnations)[0].evaluate = NULL; + ((__parsec_chore_t *)self.incarnations)[0].evaluate = &detail::evaluate_hip; ((__parsec_chore_t *)self.incarnations)[0].hook = &detail::hook_hip; ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE; @@ -3748,7 +3794,7 @@ ttg::abort(); // should not happen } else if constexpr (derived_has_level_zero_op()) { self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t)); ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_LEVEL_ZERO; - ((__parsec_chore_t *)self.incarnations)[0].evaluate = NULL; + ((__parsec_chore_t *)self.incarnations)[0].evaluate = &detail::evaluate_level_zero; ((__parsec_chore_t *)self.incarnations)[0].hook = &detail::hook_level_zero; ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE; From e42e3bdb4a9f062162ebc003807807c9587689ed Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 5 Apr 2024 19:59:46 -0400 Subject: [PATCH 20/40] Properly construct the parsec task Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/task.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h index d60c87f3c..7b63c1d4b 100644 --- a/ttg/ttg/parsec/task.h +++ b/ttg/ttg/parsec/task.h @@ -147,13 +147,14 @@ namespace ttg_parsec { : data_count(data_count) , copies(copies) , defer_writer(defer_writer) { + PARSEC_OBJ_CONSTRUCT(&parsec_task, parsec_task_t); PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super); parsec_task.mempool_owner = mempool; parsec_task.task_class = task_class; parsec_task.priority = 0; - // TODO: can we avoid this? - for (int i = 0; i < MAX_PARAM_COUNT; ++i) { this->parsec_task.data[i].data_in = nullptr; } + // TODO: can we avoid this? + for (int i = 0; i < MAX_PARAM_COUNT; ++i) { this->parsec_task.data[i].data_in = nullptr; } } parsec_ttg_task_base_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class, @@ -173,8 +174,8 @@ namespace ttg_parsec { parsec_task.priority = priority; parsec_task.chore_mask = 1<<0; - // TODO: can we avoid this? - for (int i = 0; i < MAX_PARAM_COUNT; ++i) { this->parsec_task.data[i].data_in = nullptr; } + // TODO: can we avoid this? + for (int i = 0; i < MAX_PARAM_COUNT; ++i) { this->parsec_task.data[i].data_in = nullptr; } } public: From 0216894366c8df64998f6dd59bc7960dbaab9cf4 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 5 Apr 2024 20:00:19 -0400 Subject: [PATCH 21/40] Use correct evaluate function signatures Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/ttg.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 6f46e1c75..991e29664 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -792,7 +792,7 @@ namespace ttg_parsec { template - inline parsec_hook_return_t evaluate_cuda(struct parsec_execution_stream_s *es, parsec_task_t *parsec_task) { + inline parsec_hook_return_t evaluate_cuda(const parsec_task_t *parsec_task) { if constexpr(TT::derived_has_cuda_op()) { parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; return me->template invoke_evaluate(); @@ -802,7 +802,7 @@ namespace ttg_parsec { } template - inline parsec_hook_return_t evaluate_hip(struct parsec_execution_stream_s *es, parsec_task_t *parsec_task) { + inline parsec_hook_return_t evaluate_hip(const parsec_task_t *parsec_task) { if constexpr(TT::derived_has_hip_op()) { parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; return me->template invoke_evaluate(); @@ -812,7 +812,7 @@ namespace ttg_parsec { } template - inline parsec_hook_return_t evaluate_level_zero(struct parsec_execution_stream_s *es, parsec_task_t *parsec_task) { + inline parsec_hook_return_t evaluate_level_zero(const parsec_task_t *parsec_task) { if constexpr(TT::derived_has_level_zero_op()) { parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; return me->template invoke_evaluate(); @@ -1516,7 +1516,7 @@ namespace ttg_parsec { /* when we come back here, the flows in gpu_task are set (see register_device_memory) */ - parsec_task_class_t& tc = *task->dev_ptr->task_class; + parsec_task_class_t& tc = task->dev_ptr->task_class; // input flows are set up during register_device_memory as part of the first invocation above for (int i = 0; i < MAX_PARAM_COUNT; ++i) { @@ -1526,7 +1526,7 @@ namespace ttg_parsec { tc.nb_flows = MAX_PARAM_COUNT; /* set the new task class that contains the flows */ - task->parsec_task.task_class = task->dev_ptr->task_class; + task->parsec_task.task_class = &task->dev_ptr->task_class; /* select this one */ return PARSEC_HOOK_RETURN_DONE; @@ -1565,6 +1565,7 @@ namespace ttg_parsec { assert(NULL != device); task->dev_ptr->device = device; + parsec_gpu_task_t *gpu_task = task->dev_ptr->gpu_task; parsec_execution_stream_s *es = task->tt->world.impl().execution_stream(); switch(device->super.type) { From 2e5741966d3464f42eb3d593cf30d562babcbe6c Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 11 Apr 2024 23:28:52 -0400 Subject: [PATCH 22/40] Properly construct parsec task object Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/task.h | 1 + 1 file changed, 1 insertion(+) diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h index 7b63c1d4b..1d17a9ad7 100644 --- a/ttg/ttg/parsec/task.h +++ b/ttg/ttg/parsec/task.h @@ -166,6 +166,7 @@ namespace ttg_parsec { , copies(copies) , release_task_cb(release_fn) , defer_writer(defer_writer) { + PARSEC_OBJ_CONSTRUCT(&parsec_task, parsec_task_t); PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super); parsec_task.mempool_owner = mempool; parsec_task.task_class = task_class; From edf48fc1c12a9f944273f9e0dd57c2a62c084251 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 12 Apr 2024 01:13:04 -0400 Subject: [PATCH 23/40] Fix dataflags binary operators Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/task.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h index 1d17a9ad7..5109dbcaa 100644 --- a/ttg/ttg/parsec/task.h +++ b/ttg/ttg/parsec/task.h @@ -58,9 +58,10 @@ namespace ttg_parsec { } inline - ttg_parsec_data_flags operator|=(ttg_parsec_data_flags lhs, ttg_parsec_data_flags rhs) { + ttg_parsec_data_flags operator|=(ttg_parsec_data_flags& lhs, ttg_parsec_data_flags rhs) { using flags_type = std::underlying_type::type; - return ttg_parsec_data_flags(static_cast(lhs) | static_cast(rhs)); + lhs = ttg_parsec_data_flags(static_cast(lhs) | static_cast(rhs)); + return lhs; } inline @@ -69,6 +70,13 @@ namespace ttg_parsec { return static_cast(lhs) & static_cast(rhs); } + inline + ttg_parsec_data_flags operator&=(ttg_parsec_data_flags& lhs, ttg_parsec_data_flags rhs) { + using flags_type = std::underlying_type::type; + lhs = ttg_parsec_data_flags(static_cast(lhs) & static_cast(rhs)); + return lhs; + } + inline bool operator!(ttg_parsec_data_flags lhs) { using flags_type = std::underlying_type::type; From 0e6235276c9eb358e96f342bd3536e9f02a914a6 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 12 Apr 2024 01:13:20 -0400 Subject: [PATCH 24/40] Fix logic for detecting device-aware MPI Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/ttg.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 991e29664..3069922b5 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -3390,11 +3390,11 @@ ttg::abort(); // should not happen if (!need_pushout) { bool device_supported = false; if constexpr (derived_has_cuda_op()) { - device_supported = !world.impl().mpi_support(ttg::ExecutionSpace::CUDA); + device_supported = world.impl().mpi_support(ttg::ExecutionSpace::CUDA); } else if constexpr (derived_has_hip_op()) { - device_supported = !world.impl().mpi_support(ttg::ExecutionSpace::HIP); + device_supported = world.impl().mpi_support(ttg::ExecutionSpace::HIP); } else if constexpr (derived_has_level_zero_op()) { - device_supported = !world.impl().mpi_support(ttg::ExecutionSpace::L0); + device_supported = world.impl().mpi_support(ttg::ExecutionSpace::L0); } /* if MPI supports the device we don't care whether we have remote peers * because we can send from the device directly */ From 38e7efb07363b0d57169de22f1b02cff89fc3d2a Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 12 Apr 2024 01:14:04 -0400 Subject: [PATCH 25/40] Add override keyword to fence and make_executable The hip compiler complains about the override keyword missing. Signed-off-by: Joseph Schuchart --- ttg/ttg/tt.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ttg/ttg/tt.h b/ttg/ttg/tt.h index 7024776aa..ec7eff6d8 100644 --- a/ttg/ttg/tt.h +++ b/ttg/ttg/tt.h @@ -96,9 +96,9 @@ namespace ttg { ttg::World get_world() const override final { return tts[0]->get_world(); } - void fence() { tts[0]->fence(); } + void fence() override { tts[0]->fence(); } - void make_executable() { + void make_executable() override { for (auto &op : tts) op->make_executable(); } From 5dc442d773213dff6c7d5dab5d13f785f9ecdf74 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 12 Apr 2024 01:15:47 -0400 Subject: [PATCH 26/40] POTRF: use correct keymap for SYRK The second element of the key contains the tile ID, the first one contains the iteration. Signed-off-by: Joseph Schuchart --- examples/potrf/potrf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h index f6ba6e147..1ecb2c23f 100644 --- a/examples/potrf/potrf.h +++ b/examples/potrf/potrf.h @@ -674,7 +674,7 @@ namespace potrf { auto keymap1 = [&](const Key1& key) { return A.rank_of(key[0], key[0]); }; auto keymap2a = [&](const Key2& key) { return A.rank_of(key[0], key[1]); }; - auto keymap2b = [&](const Key2& key) { return A.rank_of(key[0], key[0]); }; + auto keymap2b = [&](const Key2& key) { return A.rank_of(key[1], key[1]); }; auto keymap3 = [&](const Key3& key) { return A.rank_of(key[0], key[1]); }; From ec2b796e3e86d9c35c7cfa8ea5b4a6d6dd4aaab1 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 12 Apr 2024 01:18:07 -0400 Subject: [PATCH 27/40] Remove unused variable Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/ttg.h | 1 - 1 file changed, 1 deletion(-) diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 3069922b5..740c8fd07 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -3417,7 +3417,6 @@ ttg::abort(); // should not happen auto remote_check = [&](){ auto world = ttg_default_execution_context(); int rank = world.rank(); - uint64_t pos = 0; bool remote = keylist.end() != std::find_if(keylist.begin(), keylist.end(), [&](const Key &key) { return keymap(key) != rank; }); return remote; From b7c5088f9e251eb9ede67a3e072658798834b8b3 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 15 May 2024 17:57:37 -0400 Subject: [PATCH 28/40] Add device hint to TT and buffer For POTRF, we want to provide a hint that tasks on the same column should be executed on the same device, to reduce data movement and provide a hint on load balancing up front. Signed-off-by: Joseph Schuchart --- examples/potrf/pmw.h | 10 ++++++- examples/potrf/potrf.h | 24 ++++++++++++++++ ttg/ttg/device/device.h | 7 +++++ ttg/ttg/madness/device.h | 9 ++++++ ttg/ttg/madness/ttg.h | 1 + ttg/ttg/parsec/buffer.h | 8 ++++++ ttg/ttg/parsec/device.h | 7 +++++ ttg/ttg/parsec/fwd.h | 2 ++ ttg/ttg/parsec/ttg.h | 61 ++++++++++++++++++++++++++++++++++++++++ ttg/ttg/util/meta.h | 14 ++++----- 10 files changed, 135 insertions(+), 8 deletions(-) create mode 100644 ttg/ttg/madness/device.h diff --git a/examples/potrf/pmw.h b/examples/potrf/pmw.h index 0f8d75d7d..25f01933c 100644 --- a/examples/potrf/pmw.h +++ b/examples/potrf/pmw.h @@ -102,6 +102,14 @@ class PaRSECMatrixWrapper { (pm->uplo == PARSEC_MATRIX_UPPER && col >= row); } + int P() const { + return pm->grid.rows; + } + + int Q() const { + return pm->grid.cols; + } + PaRSECMatrixT* parsec() { return pm; } @@ -132,7 +140,7 @@ class PaRSECMatrixWrapper { }; template -using MatrixT = PaRSECMatrixWrapper; +using MatrixT = PaRSECMatrixWrapper; static auto make_load_tt(MatrixT &A, ttg::Edge> &toop, bool defer_write) { diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h index 1ecb2c23f..4adbff106 100644 --- a/examples/potrf/potrf.h +++ b/examples/potrf/potrf.h @@ -678,6 +678,18 @@ namespace potrf { auto keymap3 = [&](const Key3& key) { return A.rank_of(key[0], key[1]); }; + /** + * Device map hints: we try to keep tiles on one row on the same device to minimize + * data movement between devices. This provides hints for load-balancing up front + * and avoids movement of the TRSM result to GEMM tasks. + */ + auto devmap1 = [&](const key1& key) { return (key[0] / A.P()) % ttg::device::num_devices(); } + + auto devmap2a = [&](const key2& key) { return (key[0] / A.P()) % ttg::device::num_devices(); } + auto devmap2b = [&](const key2& key) { return (key[1] / A.P()) % ttg::device::num_devices(); } + + auto devmap3 = [&](const key3& key) { return (key[0] / A.P()) % ttg::device::num_devices(); } + ttg::Edge> syrk_potrf("syrk_potrf"), disp_potrf("disp_potrf"); ttg::Edge> potrf_trsm("potrf_trsm"), trsm_syrk("trsm_syrk"), gemm_trsm("gemm_trsm"), @@ -692,18 +704,30 @@ namespace potrf { auto tt_potrf = make_potrf(A, disp_potrf, syrk_potrf, potrf_trsm, output); tt_potrf->set_keymap(keymap1); tt_potrf->set_defer_writer(defer_write); +#ifdef ENABLE_DEVICE_KERNEL + tt_potrf->set_devmap(devmap1); +#endif // 0 auto tt_trsm = make_trsm(A, disp_trsm, potrf_trsm, gemm_trsm, trsm_syrk, trsm_gemm_row, trsm_gemm_col, output); tt_trsm->set_keymap(keymap2a); tt_trsm->set_defer_writer(defer_write); +#ifdef ENABLE_DEVICE_KERNEL + tt_trsm->set_devmap(devmap2a); +#endif // 0 auto tt_syrk = make_syrk(A, disp_syrk, trsm_syrk, syrk_syrk, syrk_potrf, syrk_syrk); tt_syrk->set_keymap(keymap2b); tt_syrk->set_defer_writer(defer_write); +#ifdef ENABLE_DEVICE_KERNEL + tt_syrk->set_devmap(devmap2b); +#endif // 0 auto tt_gemm = make_gemm(A, disp_gemm, trsm_gemm_row, trsm_gemm_col, gemm_gemm, gemm_trsm, gemm_gemm); tt_gemm->set_keymap(keymap3); tt_gemm->set_defer_writer(defer_write); +#ifdef ENABLE_DEVICE_KERNEL + tt_gemm->set_devmap(devmap3); +#endif // 0 /* Priorities taken from DPLASMA */ auto nt = A.cols(); diff --git a/ttg/ttg/device/device.h b/ttg/ttg/device/device.h index 6690982f6..e815aaf87 100644 --- a/ttg/ttg/device/device.h +++ b/ttg/ttg/device/device.h @@ -2,6 +2,7 @@ #include "ttg/config.h" #include "ttg/execution.h" +#include "ttg/impl_selector.h" @@ -180,3 +181,9 @@ namespace ttg::device { } } // namespace ttg #endif // defined(TTG_HAVE_HIP) + +namespace ttg::device { + inline int num_devices() { + return TTG_IMPL_NS::num_devices(); + } +} diff --git a/ttg/ttg/madness/device.h b/ttg/ttg/madness/device.h new file mode 100644 index 000000000..e13321194 --- /dev/null +++ b/ttg/ttg/madness/device.h @@ -0,0 +1,9 @@ +#ifndef TTG_MADNESS_DEVICE_H +#define TTG_MADNESS_DEVICE_H + +namespace ttg_madness { + /* no device support in MADNESS */ + inline int num_devices() { return 0; } +} + +#endif // TTG_MADNESS_DEVICE_H \ No newline at end of file diff --git a/ttg/ttg/madness/ttg.h b/ttg/ttg/madness/ttg.h index 5fab6f6dd..34c576f56 100644 --- a/ttg/ttg/madness/ttg.h +++ b/ttg/ttg/madness/ttg.h @@ -13,6 +13,7 @@ #include "ttg/base/keymap.h" #include "ttg/base/tt.h" #include "ttg/func.h" +#include "ttg/madness/device.h" #include "ttg/runtimes.h" #include "ttg/tt.h" #include "ttg/util/bug.h" diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h index 98b14eb12..6e609a158 100644 --- a/ttg/ttg/parsec/buffer.h +++ b/ttg/ttg/parsec/buffer.h @@ -329,6 +329,14 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t // << " parsec_data " << m_data.get() << std::endl; } + void prefer_device(ttg::device::Device dev) { + /* only set device if the host has the latest copy as otherwise we might end up with a stale copy */ + if (dev.is_device() && this->parsec_data()->owner_device == 0) { + parsec_advise_data_on_device(this->parsec_data(), detail::ttg_device_to_parsec_device(dev), + PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE); + } + } + /* serialization support */ #ifdef TTG_SERIALIZATION_SUPPORTS_BOOST diff --git a/ttg/ttg/parsec/device.h b/ttg/ttg/parsec/device.h index 77722b1c1..9f8ada05c 100644 --- a/ttg/ttg/parsec/device.h +++ b/ttg/ttg/parsec/device.h @@ -2,6 +2,7 @@ #define TTG_PARSEC_DEVICE_H #include "ttg/device/device.h" +#include namespace ttg_parsec { @@ -35,6 +36,12 @@ namespace ttg_parsec { } } // namespace detail + + inline + int num_devices() { + return parsec_nb_devices - detail::first_device_id; + } + } // namespace ttg_parsec #endif // TTG_PARSEC_DEVICE_H \ No newline at end of file diff --git a/ttg/ttg/parsec/fwd.h b/ttg/ttg/parsec/fwd.h index d5bc8931e..0cd798e87 100644 --- a/ttg/ttg/parsec/fwd.h +++ b/ttg/ttg/parsec/fwd.h @@ -82,6 +82,8 @@ namespace ttg_parsec { template inline void mark_device_out(std::tuple &b); + inline int num_devices(); + #if 0 template inline std::pair>...>> get_ptr(Args&&... args); diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 740c8fd07..1c795677c 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -1296,6 +1296,7 @@ namespace ttg_parsec { ttg::World world; ttg::meta::detail::keymap_t keymap; ttg::meta::detail::keymap_t priomap; + ttg::meta::detail::keymap_t devicemap; // For now use same type for unary/streaming input terminals, and stream reducers assigned at runtime ttg::meta::detail::input_reducers_t input_reducers; //!< Reducers for the input terminals (empty = expect single value) @@ -1502,6 +1503,12 @@ namespace ttg_parsec { gpu_task->pushout = 0; gpu_task->submit = &TT::device_static_submit; + // one way to force the task device + // currently this will probably break all of PaRSEC if this hint + // does not match where the data is located, not really useful for us + // instead we set a hint on the data if there is no hint set yet + //parsec_task->selected_device = ...; + /* set the gpu_task so it's available in register_device_memory */ task->dev_ptr->gpu_task = gpu_task; @@ -1525,6 +1532,29 @@ namespace ttg_parsec { } tc.nb_flows = MAX_PARAM_COUNT; + /* set the device hint on the data */ + TT *tt = task->tt; + if (tt->devicemap) { + int parsec_dev; + if constexpr (std::is_void_v) { + parsec_dev = ttg::device::ttg_device_to_parsec_device(tt->devicemap()); + } else { + parsec_dev = ttg::device::ttg_device_to_parsec_device(tt->devicemap(tt->key)); + } + for (int i = 0; i < MAX_PARAM_COUNT; ++i) { + /* only set on mutable data since we have exclusive access */ + if (tc.in[i].flow_flags & PARSEC_FLOW_ACCESS_WRITE) { + parsec_data_t *data = parsec_task->data[i].data_in->original; + /* only set the preferred device if the host has the latest copy + * as otherwise we may end up with the wrong data if there is a newer + * version on a different device. Also, keep fingers crossed. */ + if (data->owner_device == 0) { + parsec_advise_data_on_device(data, parsec_dev, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE); + } + } + } + } + /* set the new task class that contains the flows */ task->parsec_task.task_class = &task->dev_ptr->task_class; @@ -4195,6 +4225,37 @@ ttg::abort(); // should not happen priomap = std::forward(pm); } + /// device map setter + /// The device map provides a hint on which device a task should execute. + /// TTG may not be able to honor the request and the corresponding task + /// may execute on a different device. + /// @arg pm a function that provides a hint on which device the task should execute. + template + void set_devicemap(Devicemap&& dm) { + static_assert(derived_has_device_op(), "Device map only allowed on device-enabled TT!"); + if constexpr (std::is_same_v()))>) { + // dm returns a Device + devicemap = std::forward(dm); + } else { + // convert dm return into a Device + devicemap = [=](const keyT& key) { + if constexpr (derived_has_cuda_op()) { + return ttg::device::Device(dm(key), ttg::ExecutionSpace::CUDA); + } else if constexpr (derived_has_hip_op()) { + return ttg::device::Device(dm(key), ttg::ExecutionSpace::HIP); + } else if constexpr (derived_has_level_zero_op()) { + return ttg::device::Device(dm(key), ttg::ExecutionSpace::L0); + } else { + throw std::runtime_error("Unknown device type!"); + } + }; + } + } + + /// device map accessor + /// @return the device map + auto get_devicemap() { return devicemap; } + // Register the static_op function to associate it to instance_id void register_static_op_function(void) { int rank; diff --git a/ttg/ttg/util/meta.h b/ttg/ttg/util/meta.h index b7bb31690..5322fa577 100644 --- a/ttg/ttg/util/meta.h +++ b/ttg/ttg/util/meta.h @@ -848,18 +848,18 @@ namespace ttg { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // keymap_t = std::function, protected against void key //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - template + template struct keymap; - template + template struct keymap>> { - using type = std::function; + using type = std::function; }; - template + template struct keymap>> { - using type = std::function; + using type = std::function; }; - template - using keymap_t = typename keymap::type; + template + using keymap_t = typename keymap::type; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // input_reducers_t = std::tuple< From 27f3fdebc7d97fe43422d34ec89756792e2a729d Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 15 May 2024 19:19:58 -0400 Subject: [PATCH 29/40] Fixes to device hint implementation Signed-off-by: Joseph Schuchart --- examples/potrf/potrf.h | 16 ++++++++-------- ttg/ttg/parsec/ttg.h | 6 +++--- ttg/ttg/util/meta.h | 4 ++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h index 4adbff106..60daef72f 100644 --- a/examples/potrf/potrf.h +++ b/examples/potrf/potrf.h @@ -683,12 +683,12 @@ namespace potrf { * data movement between devices. This provides hints for load-balancing up front * and avoids movement of the TRSM result to GEMM tasks. */ - auto devmap1 = [&](const key1& key) { return (key[0] / A.P()) % ttg::device::num_devices(); } + auto devmap1 = [&](const Key1& key) { return (key[0] / A.P()) % ttg::device::num_devices(); }; - auto devmap2a = [&](const key2& key) { return (key[0] / A.P()) % ttg::device::num_devices(); } - auto devmap2b = [&](const key2& key) { return (key[1] / A.P()) % ttg::device::num_devices(); } + auto devmap2a = [&](const Key2& key) { return (key[0] / A.P()) % ttg::device::num_devices(); }; + auto devmap2b = [&](const Key2& key) { return (key[1] / A.P()) % ttg::device::num_devices(); }; - auto devmap3 = [&](const key3& key) { return (key[0] / A.P()) % ttg::device::num_devices(); } + auto devmap3 = [&](const Key3& key) { return (key[0] / A.P()) % ttg::device::num_devices(); }; ttg::Edge> syrk_potrf("syrk_potrf"), disp_potrf("disp_potrf"); @@ -705,28 +705,28 @@ namespace potrf { tt_potrf->set_keymap(keymap1); tt_potrf->set_defer_writer(defer_write); #ifdef ENABLE_DEVICE_KERNEL - tt_potrf->set_devmap(devmap1); + tt_potrf->set_devicemap(devmap1); #endif // 0 auto tt_trsm = make_trsm(A, disp_trsm, potrf_trsm, gemm_trsm, trsm_syrk, trsm_gemm_row, trsm_gemm_col, output); tt_trsm->set_keymap(keymap2a); tt_trsm->set_defer_writer(defer_write); #ifdef ENABLE_DEVICE_KERNEL - tt_trsm->set_devmap(devmap2a); + tt_trsm->set_devicemap(devmap2a); #endif // 0 auto tt_syrk = make_syrk(A, disp_syrk, trsm_syrk, syrk_syrk, syrk_potrf, syrk_syrk); tt_syrk->set_keymap(keymap2b); tt_syrk->set_defer_writer(defer_write); #ifdef ENABLE_DEVICE_KERNEL - tt_syrk->set_devmap(devmap2b); + tt_syrk->set_devicemap(devmap2b); #endif // 0 auto tt_gemm = make_gemm(A, disp_gemm, trsm_gemm_row, trsm_gemm_col, gemm_gemm, gemm_trsm, gemm_gemm); tt_gemm->set_keymap(keymap3); tt_gemm->set_defer_writer(defer_write); #ifdef ENABLE_DEVICE_KERNEL - tt_gemm->set_devmap(devmap3); + tt_gemm->set_devicemap(devmap3); #endif // 0 /* Priorities taken from DPLASMA */ diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 1c795677c..b2a89f11b 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -1537,13 +1537,13 @@ namespace ttg_parsec { if (tt->devicemap) { int parsec_dev; if constexpr (std::is_void_v) { - parsec_dev = ttg::device::ttg_device_to_parsec_device(tt->devicemap()); + parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap()); } else { - parsec_dev = ttg::device::ttg_device_to_parsec_device(tt->devicemap(tt->key)); + parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap(task->key)); } for (int i = 0; i < MAX_PARAM_COUNT; ++i) { /* only set on mutable data since we have exclusive access */ - if (tc.in[i].flow_flags & PARSEC_FLOW_ACCESS_WRITE) { + if (tc.in[i]->flow_flags & PARSEC_FLOW_ACCESS_WRITE) { parsec_data_t *data = parsec_task->data[i].data_in->original; /* only set the preferred device if the host has the latest copy * as otherwise we may end up with the wrong data if there is a newer diff --git a/ttg/ttg/util/meta.h b/ttg/ttg/util/meta.h index 5322fa577..f3af03152 100644 --- a/ttg/ttg/util/meta.h +++ b/ttg/ttg/util/meta.h @@ -851,11 +851,11 @@ namespace ttg { template struct keymap; template - struct keymap>> { + struct keymap>> { using type = std::function; }; template - struct keymap>> { + struct keymap>> { using type = std::function; }; template From ee6fb2a09259251bce9e3bfc55f85b36fd0e8ef8 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 22 May 2024 15:14:15 -0400 Subject: [PATCH 30/40] Point back to PaRSEC main repository Signed-off-by: Joseph Schuchart --- cmake/modules/ExternalDependenciesVersions.cmake | 2 +- cmake/modules/FindOrFetchPARSEC.cmake | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake index 2e79bdb88..43042cacd 100644 --- a/cmake/modules/ExternalDependenciesVersions.cmake +++ b/cmake/modules/ExternalDependenciesVersions.cmake @@ -4,7 +4,7 @@ set(TTG_TRACKED_VG_CMAKE_KIT_TAG 7ea2d4d3f8854b9e417f297fd74d6fc49aa13fd5) # used to provide "real" FindOrFetchBoost set(TTG_TRACKED_CATCH2_VERSION 3.5.0) set(TTG_TRACKED_MADNESS_TAG 2eb3bcf0138127ee2dbc651f1aabd3e9b0def4e3) -set(TTG_TRACKED_PARSEC_TAG fc18d7881059a520e73cb7ccb6b56e6da705635e) +set(TTG_TRACKED_PARSEC_TAG 58f8f3089ecad2e8ee50e80a9586e05ce8873b1c) set(TTG_TRACKED_BTAS_TAG 4e8f5233aa7881dccdfcc37ce07128833926d3c2) set(TTG_TRACKED_TILEDARRAY_TAG 493c109379a1b64ddd5ef59f7e33b95633b68d73) diff --git a/cmake/modules/FindOrFetchPARSEC.cmake b/cmake/modules/FindOrFetchPARSEC.cmake index 2a663f914..b3fd5faa3 100644 --- a/cmake/modules/FindOrFetchPARSEC.cmake +++ b/cmake/modules/FindOrFetchPARSEC.cmake @@ -17,7 +17,7 @@ if (NOT TARGET PaRSEC::parsec) FetchContent_Declare( PARSEC - GIT_REPOSITORY https://github.com/abouteiller/parsec.git + GIT_REPOSITORY https://github.com/ICLDisco/parsec.git GIT_TAG ${TTG_TRACKED_PARSEC_TAG} ) FetchContent_MakeAvailable(PARSEC) From f3e8b14147610ec5c43b7954164b3f5c54b4c8d9 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 22 May 2024 15:43:05 -0400 Subject: [PATCH 31/40] Add missing fwd decl in madness backend Signed-off-by: Joseph Schuchart --- ttg/ttg/madness/fwd.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ttg/ttg/madness/fwd.h b/ttg/ttg/madness/fwd.h index 0d340db0b..6bee1a832 100644 --- a/ttg/ttg/madness/fwd.h +++ b/ttg/ttg/madness/fwd.h @@ -77,6 +77,8 @@ namespace ttg_madness { template inline void mark_device_out(std::tuple &b); + inline int num_devices(); + } // namespace ttg_madness #endif // TTG_MADNESS_FWD_H From 2384e4d3dab2d636ccdcfd15c59ce2784ff662a2 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 23 May 2024 10:29:16 -0400 Subject: [PATCH 32/40] Cleanup return values in parsec backend We should not return PARSEC_HOOK_RETURN_NEXT in hooks, only in the evaluate callback. Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/ttg.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index b2a89f11b..71a943406 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -766,7 +766,8 @@ namespace ttg_parsec { parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; return me->template invoke_op(); } else { - return PARSEC_HOOK_RETURN_NEXT; + std::cerr << "CUDA hook called without having a CUDA op!" << std::endl; + return PARSEC_HOOK_RETURN_ERROR; } } @@ -776,7 +777,8 @@ namespace ttg_parsec { parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; return me->template invoke_op(); } else { - return PARSEC_HOOK_RETURN_NEXT; + std::cerr << "HIP hook called without having a HIP op!" << std::endl; + return PARSEC_HOOK_RETURN_ERROR; } } @@ -786,7 +788,8 @@ namespace ttg_parsec { parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; return me->template invoke_op(); } else { - return PARSEC_HOOK_RETURN_NEXT; + std::cerr << "L0 hook called without having a L0 op!" << std::endl; + return PARSEC_HOOK_RETURN_ERROR; } } @@ -1489,8 +1492,6 @@ namespace ttg_parsec { task_t *task = (task_t*)parsec_task; if (task->dev_ptr->gpu_task == nullptr) { - //std::cout << "device_static_op: task " << parsec_task << std::endl; - /* set up a device task */ parsec_gpu_task_t *gpu_task; /* PaRSEC wants to free the gpu_task, because F***K ownerships */ @@ -1498,7 +1499,6 @@ namespace ttg_parsec { PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t); gpu_task->ec = parsec_task; gpu_task->task_type = 0; // user task - //gpu_task->load = 1; // TODO: can we do better? gpu_task->last_data_check_epoch = -1; // used internally gpu_task->pushout = 0; gpu_task->submit = &TT::device_static_submit; From 4b50c68dd4484127be8c9ec5e57505051b1907c3 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 24 May 2024 14:00:40 -0400 Subject: [PATCH 33/40] Install madness backend device.h header Signed-off-by: Joseph Schuchart --- ttg/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt index 900df6519..1918f40fe 100644 --- a/ttg/CMakeLists.txt +++ b/ttg/CMakeLists.txt @@ -208,6 +208,7 @@ endif(TARGET Boost::serialization) if (TARGET MADworld) set(ttg-mad-headers ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/buffer.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/device.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/fwd.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/import.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/ttg.h From 72aa980bf795b15bbdc3fb168a83d3d56e1376a6 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 28 May 2024 11:03:57 -0400 Subject: [PATCH 34/40] Add missing fwd.h header to device.h Signed-off-by: Joseph Schuchart --- ttg/ttg/device/device.h | 1 + 1 file changed, 1 insertion(+) diff --git a/ttg/ttg/device/device.h b/ttg/ttg/device/device.h index e815aaf87..244e9c944 100644 --- a/ttg/ttg/device/device.h +++ b/ttg/ttg/device/device.h @@ -3,6 +3,7 @@ #include "ttg/config.h" #include "ttg/execution.h" #include "ttg/impl_selector.h" +#include "ttg/fwd.h" From 755ab26f6512defa427ead70a890fa34db76e2bc Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 28 May 2024 11:39:25 -0400 Subject: [PATCH 35/40] Check for broken GCC versions and disable Coroutine support if found Signed-off-by: Joseph Schuchart --- CMakeLists.txt | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 28490557f..26433deb5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -94,8 +94,23 @@ endif (BUILD_TESTING) ########################### # Boost include("${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchBoost.cmake") -# C++ coroutines -find_package(CXXStdCoroutine MODULE REQUIRED COMPONENTS Final Experimental) + +# C++ coroutines, check for broken GCC releases and skip if one is found +set(SKIP_COROUTINE_DETECTION FALSE) +if (${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") + if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 11.4.0) + set(SKIP_COROUTINE_DETECTION TRUE) + elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.1.0 AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 12.3.0) + set(SKIP_COROUTINE_DETECTION TRUE) + endif() + if (SKIP_COROUTINE_DETECTION) + message(WARNING "GCC with broken Coroutine support detected, disabling Coroutine support. At least GCC 11.4, 12.3, or 13.1 required.") + endif(SKIP_COROUTINE_DETECTION) +endif(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") + +if (SKIP_COROUTINE_DETECTION) + find_package(CXXStdCoroutine MODULE REQUIRED COMPONENTS Final Experimental) +endif(SKIP_COROUTINE_DETECTION) ########################## From 8947d84200282a62a36df6acfe7bbfa432491e3a Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 28 May 2024 12:09:29 -0400 Subject: [PATCH 36/40] Protect unit tests for cases where coroutines are not found Signed-off-by: Joseph Schuchart --- tests/unit/CMakeLists.txt | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 275f3fdd8..7f25dd6fe 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -12,12 +12,14 @@ set(ut_libs Catch2::Catch2) # coroutine tests # we definitely have TARGET std::coroutine -list(APPEND ut_src fibonacci-coro.cc) -list(APPEND ut_src device_coro.cc) -if (TTG_HAVE_CUDA) - list(APPEND ut_src cuda_kernel.cu) -endif(TTG_HAVE_CUDA) -list(APPEND ut_libs std::coroutine) +if (CXXStdCoroutine_FOUND) + list(APPEND ut_src fibonacci-coro.cc) + list(APPEND ut_src device_coro.cc) + if (TTG_HAVE_CUDA) + list(APPEND ut_src cuda_kernel.cu) + endif(TTG_HAVE_CUDA) + list(APPEND ut_libs std::coroutine) +endif(CXXStdCoroutine_FOUND) add_ttg_executable(core-unittests-ttg "${ut_src}" LINK_LIBRARIES "${ut_libs}" COMPILE_DEFINITIONS "CATCH_CONFIG_NO_POSIX_SIGNALS=1" ) From 40595c1314cacc6f4c179adbb8f8eb464fc1b95e Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 28 May 2024 12:32:22 -0400 Subject: [PATCH 37/40] Fix macro TTG_PROCESS_TT_OP_RETURN if coros are not available Signed-off-by: Joseph Schuchart --- ttg/ttg/tt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ttg/ttg/tt.h b/ttg/ttg/tt.h index ec7eff6d8..637ae42b5 100644 --- a/ttg/ttg/tt.h +++ b/ttg/ttg/tt.h @@ -196,7 +196,7 @@ namespace ttg { } \ } #else -#define TTG_PROCESS_TT_OP_RETURN(result, invoke) invoke +#define TTG_PROCESS_TT_OP_RETURN(result, id, invoke) invoke #endif #else #error "TTG_PROCESS_TT_OP_RETURN already defined in ttg/tt.h, check your header guards" From 1421c4f35e65169d775a685e8e2f27e45f35639e Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 30 May 2024 09:28:43 -0400 Subject: [PATCH 38/40] Only use suspended_task_address if we have coroutines Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/ttg.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 71a943406..1aaa21bdf 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -3695,6 +3695,7 @@ ttg::abort(); // should not happen task_t *task = (task_t*)parsec_task; +#ifdef TTG_HAS_COROUTINE /* if we still have a coroutine handle we invoke it one more time to get the sends/broadcasts */ if (task->suspended_task_address) { assert(task->coroutine_id != ttg::TaskCoroutineID::Invalid); @@ -3725,6 +3726,7 @@ ttg::abort(); // should not happen /* the coroutine should have completed and we cannot access the promise anymore */ task->suspended_task_address = nullptr; } +#endif // TTG_HAS_COROUTINE /* release our data copies */ for (int i = 0; i < task->data_count; i++) { From df32b1ed0e93c8a0762e832183c3abd3d91e5955 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 30 May 2024 10:24:23 -0400 Subject: [PATCH 39/40] Replace TTG_HAS_COROUTINE with TTG_HAVE_COROUTINE and add to config.h Signed-off-by: Joseph Schuchart --- CMakeLists.txt | 3 ++- ttg/CMakeLists.txt | 1 - ttg/ttg/config.in.h | 3 +++ ttg/ttg/coroutine.h | 4 ++++ ttg/ttg/device/task.h | 6 +++++- ttg/ttg/madness/ttg.h | 22 ++++++++++------------ ttg/ttg/make_tt.h | 8 ++++---- ttg/ttg/parsec/task.h | 4 ++-- ttg/ttg/parsec/ttg.h | 32 +++++++++++++++++--------------- ttg/ttg/tt.h | 5 +++-- 10 files changed, 50 insertions(+), 38 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 26433deb5..519b13938 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -108,8 +108,9 @@ if (${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") endif(SKIP_COROUTINE_DETECTION) endif(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") -if (SKIP_COROUTINE_DETECTION) +if (NOT SKIP_COROUTINE_DETECTION) find_package(CXXStdCoroutine MODULE REQUIRED COMPONENTS Final Experimental) + set(TTG_HAVE_COROUTINE CXXStdCoroutine_FOUND CACHE BOOL "True if the compiler has coroutine support") endif(SKIP_COROUTINE_DETECTION) diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt index 1918f40fe..b1fa72947 100644 --- a/ttg/CMakeLists.txt +++ b/ttg/CMakeLists.txt @@ -114,7 +114,6 @@ if (TTG_ENABLE_TRACE) endif (TTG_ENABLE_TRACE) if (TARGET std::coroutine) list(APPEND ttg-deps std::coroutine) - list(APPEND ttg-defs "TTG_HAS_COROUTINE=1") list(APPEND ttg-util-headers ${CMAKE_CURRENT_SOURCE_DIR}/ttg/coroutine.h ) diff --git a/ttg/ttg/config.in.h b/ttg/ttg/config.in.h index 51e58b4a2..106f0c58a 100644 --- a/ttg/ttg/config.in.h +++ b/ttg/ttg/config.in.h @@ -11,6 +11,9 @@ /** the C++ namespace containing the coroutine API */ #define TTG_CXX_COROUTINE_NAMESPACE @CXX_COROUTINE_NAMESPACE@ +/** whether the compiler supports C++ coroutines */ +#cmakedefine TTG_HAVE_COROUTINE + /** whether TTG has CUDA language support */ #cmakedefine TTG_HAVE_CUDA diff --git a/ttg/ttg/coroutine.h b/ttg/ttg/coroutine.h index 81d5b1657..890a98b4f 100644 --- a/ttg/ttg/coroutine.h +++ b/ttg/ttg/coroutine.h @@ -11,6 +11,8 @@ #include #include +#ifdef TTG_HAVE_COROUTINE + namespace ttg { // import std coroutine API into ttg namespace @@ -227,4 +229,6 @@ namespace ttg { } // namespace ttg +#endif // TTG_HAVE_COROUTINE + #endif // TTG_COROUTINE_H diff --git a/ttg/ttg/device/task.h b/ttg/ttg/device/task.h index d95e0d1eb..8e2d14cfc 100644 --- a/ttg/ttg/device/task.h +++ b/ttg/ttg/device/task.h @@ -9,6 +9,8 @@ #include "ttg/impl_selector.h" #include "ttg/ptr.h" +#ifdef TTG_HAVE_COROUTINE + namespace ttg::device { namespace detail { @@ -632,6 +634,8 @@ namespace ttg::device { bool device_reducer::completed() { return base_type::promise().state() == ttg::device::detail::TTG_DEVICE_CORO_COMPLETE; } #endif // 0 -} // namespace ttg::devie +} // namespace ttg::device + +#endif // TTG_HAVE_COROUTINE #endif // TTG_DEVICE_TASK_H diff --git a/ttg/ttg/madness/ttg.h b/ttg/ttg/madness/ttg.h index 34c576f56..5d2360cfb 100644 --- a/ttg/ttg/madness/ttg.h +++ b/ttg/ttg/madness/ttg.h @@ -24,9 +24,7 @@ #include "ttg/util/meta/callable.h" #include "ttg/util/void.h" #include "ttg/world.h" -#ifdef TTG_HAS_COROUTINE #include "ttg/coroutine.h" -#endif #include #include @@ -303,10 +301,10 @@ namespace ttg_madness { derivedT *derived; // Pointer to derived class instance bool pull_terminals_invoked = false; std::conditional_t, ttg::Void, keyT> key; // Task key -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE void *suspended_task_address = nullptr; // if not null the function is suspended ttg::TaskCoroutineID coroutine_id = ttg::TaskCoroutineID::Invalid; -#endif +#endif // TTG_HAVE_COROUTINE /// makes a tuple of references out of tuple of template @@ -336,11 +334,11 @@ namespace ttg_madness { ttT::threaddata.call_depth++; void *suspended_task_address = -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE this->suspended_task_address; // non-null = need to resume the task -#else +#else // TTG_HAVE_COROUTINE nullptr; -#endif +#endif // TTG_HAVE_COROUTINE if (suspended_task_address == nullptr) { // task is a coroutine that has not started or an ordinary function // ttg::print("starting task"); if constexpr (!ttg::meta::is_void_v && !ttg::meta::is_empty_tuple_v) { @@ -362,7 +360,7 @@ namespace ttg_madness { } else // unreachable ttg::abort(); } else { // resume suspended coroutine -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE auto ret = static_cast(ttg::coroutine_handle::from_address(suspended_task_address)); assert(ret.ready()); ret.resume(); @@ -373,9 +371,9 @@ namespace ttg_madness { // leave suspended_task_address as is } this->suspended_task_address = suspended_task_address; -#else +#else // TTG_HAVE_COROUTINE ttg::abort(); // should not happen -#endif +#endif // TTG_HAVE_COROUTINE } ttT::threaddata.call_depth--; @@ -384,7 +382,7 @@ namespace ttg_madness { // ttg::print("finishing task",ttT::threaddata.call_depth); // } -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE if (suspended_task_address) { // TODO implement handling of suspended coroutines properly @@ -412,7 +410,7 @@ namespace ttg_madness { ttg::abort(); } } -#endif // TTG_HAS_COROUTINE +#endif // TTG_HAVE_COROUTINE } virtual ~TTArgs() {} // Will be deleted via TaskInterface* diff --git a/ttg/ttg/make_tt.h b/ttg/ttg/make_tt.h index 1711d8556..81897b816 100644 --- a/ttg/ttg/make_tt.h +++ b/ttg/ttg/make_tt.h @@ -149,7 +149,7 @@ class CallableWrapTTArgs std::conditional_t, std::add_pointer_t, noref_funcT> func; using op_return_type = -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE std::conditional_t, ttg::coroutine_handle, #ifdef TTG_HAVE_DEVICE @@ -160,9 +160,9 @@ class CallableWrapTTArgs void #endif // TTG_HAVE_DEVICE >; -#else // TTG_HAS_COROUTINE +#else // TTG_HAVE_COROUTINE void; -#endif // TTG_HAS_COROUTINE +#endif // TTG_HAVE_COROUTINE public: static constexpr bool have_cuda_op = (space == ttg::ExecutionSpace::CUDA); @@ -176,7 +176,7 @@ class CallableWrapTTArgs static_assert(std::is_same_v, returnT>, "CallableWrapTTArgs: returnT does not match the actual return type of funcT"); if constexpr (!std::is_void_v) { // protect from compiling for void returnT -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE if constexpr (std::is_same_v) { ttg::coroutine_handle coro_handle; // if task completed destroy it diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h index 5109dbcaa..656117b94 100644 --- a/ttg/ttg/parsec/task.h +++ b/ttg/ttg/parsec/task.h @@ -202,7 +202,7 @@ namespace ttg_parsec { TT* tt = nullptr; key_type key; std::array streams; -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE void* suspended_task_address = nullptr; // if not null the function is suspended ttg::TaskCoroutineID coroutine_id = ttg::TaskCoroutineID::Invalid; #endif @@ -268,7 +268,7 @@ namespace ttg_parsec { static constexpr size_t num_streams = TT::numins; TT* tt = nullptr; std::array streams; -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE void* suspended_task_address = nullptr; // if not null the function is suspended ttg::TaskCoroutineID coroutine_id = ttg::TaskCoroutineID::Invalid; #endif diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 1aaa21bdf..a241d2b3b 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -12,6 +12,8 @@ * This may cause deadlocks, so use with caution. */ #define TTG_PARSEC_DEFER_WRITER false +#include "ttg/config.h" + #include "ttg/impl_selector.h" /* include ttg header to make symbols available in case this header is included directly */ @@ -1643,11 +1645,11 @@ namespace ttg_parsec { task_t *task = (task_t*)parsec_task; void* suspended_task_address = -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE task->suspended_task_address; // non-null = need to resume the task -#else +#else // TTG_HAVE_COROUTINE nullptr; -#endif +#endif // TTG_HAVE_COROUTINE //std::cout << "static_op: suspended_task_address " << suspended_task_address << std::endl; if (suspended_task_address == nullptr) { // task is a coroutine that has not started or an ordinary function @@ -1679,9 +1681,9 @@ namespace ttg_parsec { } else { // resume the suspended coroutine +#ifdef TTG_HAVE_COROUTINE assert(task->coroutine_id != ttg::TaskCoroutineID::Invalid); -#ifdef TTG_HAS_COROUTINE #ifdef TTG_HAVE_DEVICE if (task->coroutine_id == ttg::TaskCoroutineID::DeviceTask) { ttg::device::Task coro = ttg::device::detail::device_task_handle_type::from_address(suspended_task_address); @@ -1725,9 +1727,9 @@ namespace ttg_parsec { } else ttg::abort(); // unrecognized task id -#else // TTG_HAS_COROUTINE -ttg::abort(); // should not happen -#endif // TTG_HAS_COROUTINE +#else // TTG_HAVE_COROUTINE + ttg::abort(); // should not happen +#endif // TTG_HAVE_COROUTINE } task->suspended_task_address = suspended_task_address; @@ -1750,11 +1752,11 @@ ttg::abort(); // should not happen task_t *task = static_cast(parsec_task); void* suspended_task_address = -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE task->suspended_task_address; // non-null = need to resume the task -#else +#else // TTG_HAVE_COROUTINE nullptr; -#endif +#endif // TTG_HAVE_COROUTINE if (suspended_task_address == nullptr) { // task is a coroutine that has not started or an ordinary function ttT *baseobj = (ttT *)task->object_ptr; derivedT *obj = (derivedT *)task->object_ptr; @@ -1769,7 +1771,7 @@ ttg::abort(); // should not happen detail::parsec_ttg_caller = NULL; } else { -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE auto ret = static_cast(ttg::coroutine_handle::from_address(suspended_task_address)); assert(ret.ready()); ret.resume(); @@ -1780,9 +1782,9 @@ ttg::abort(); // should not happen else { // not yet completed // leave suspended_task_address as is } -#else +#else // TTG_HAVE_COROUTINE ttg::abort(); // should not happen -#endif +#endif // TTG_HAVE_COROUTINE } task->suspended_task_address = suspended_task_address; @@ -3695,7 +3697,7 @@ ttg::abort(); // should not happen task_t *task = (task_t*)parsec_task; -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE /* if we still have a coroutine handle we invoke it one more time to get the sends/broadcasts */ if (task->suspended_task_address) { assert(task->coroutine_id != ttg::TaskCoroutineID::Invalid); @@ -3726,7 +3728,7 @@ ttg::abort(); // should not happen /* the coroutine should have completed and we cannot access the promise anymore */ task->suspended_task_address = nullptr; } -#endif // TTG_HAS_COROUTINE +#endif // TTG_HAVE_COROUTINE /* release our data copies */ for (int i = 0; i < task->data_count; i++) { diff --git a/ttg/ttg/tt.h b/ttg/ttg/tt.h index 637ae42b5..6b3972984 100644 --- a/ttg/ttg/tt.h +++ b/ttg/ttg/tt.h @@ -1,12 +1,13 @@ #ifndef TTG_TT_H #define TTG_TT_H +#include "ttg/config.h" #include "ttg/fwd.h" #include "ttg/base/tt.h" #include "ttg/edge.h" -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE #include "ttg/coroutine.h" #endif @@ -176,7 +177,7 @@ namespace ttg { } // namespace ttg #ifndef TTG_PROCESS_TT_OP_RETURN -#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_COROUTINE #define TTG_PROCESS_TT_OP_RETURN(result, id, invoke) \ { \ using return_type = decltype(invoke); \ From a9c33d4b3652ddca7c912d18f007767fe97f0690 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 30 May 2024 11:31:31 -0400 Subject: [PATCH 40/40] Add TTG_ENABLE_COROUTINES CMake option and fix non-coro builds Signed-off-by: Joseph Schuchart --- CMakeLists.txt | 37 ++++++++++++++++++++----------------- ttg/ttg/coroutine.h | 3 ++- ttg/ttg/parsec/ttg.h | 2 +- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 519b13938..5406877c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,6 +59,7 @@ option(TTG_ENABLE_LEVEL_ZERO "Whether to TTG will look for Intel oneAPI Level Ze option(TTG_EXAMPLES "Whether to build examples" OFF) option(TTG_ENABLE_ASAN "Whether to enable address sanitizer" OFF) +option(TTG_ENABLE_COROUTINES "Whether to enable C++ coroutines, needed for accelerator device support" ON) option(TTG_FETCH_BOOST "Whether to fetch+build Boost, if missing" OFF) option(TTG_IGNORE_BUNDLED_EXTERNALS "Whether to skip installation and use of bundled external dependencies (Boost.CallableTraits)" OFF) option(TTG_ENABLE_TRACE "Whether to enable ttg::trace() output" OFF) @@ -95,23 +96,25 @@ endif (BUILD_TESTING) # Boost include("${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchBoost.cmake") -# C++ coroutines, check for broken GCC releases and skip if one is found -set(SKIP_COROUTINE_DETECTION FALSE) -if (${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") - if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 11.4.0) - set(SKIP_COROUTINE_DETECTION TRUE) - elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.1.0 AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 12.3.0) - set(SKIP_COROUTINE_DETECTION TRUE) - endif() - if (SKIP_COROUTINE_DETECTION) - message(WARNING "GCC with broken Coroutine support detected, disabling Coroutine support. At least GCC 11.4, 12.3, or 13.1 required.") - endif(SKIP_COROUTINE_DETECTION) -endif(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") - -if (NOT SKIP_COROUTINE_DETECTION) - find_package(CXXStdCoroutine MODULE REQUIRED COMPONENTS Final Experimental) - set(TTG_HAVE_COROUTINE CXXStdCoroutine_FOUND CACHE BOOL "True if the compiler has coroutine support") -endif(SKIP_COROUTINE_DETECTION) +if (TTG_ENABLE_COROUTINES) + set(SKIP_COROUTINE_DETECTION FALSE) + # C++ coroutines, check for broken GCC releases and skip if one is found + if (${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") + if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 11.4.0) + set(SKIP_COROUTINE_DETECTION TRUE) + elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.1.0 AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 12.3.0) + set(SKIP_COROUTINE_DETECTION TRUE) + endif() + if (SKIP_COROUTINE_DETECTION) + message(WARNING "GCC with broken Coroutine support detected, disabling Coroutine support. At least GCC 11.4, 12.3, or 13.1 required.") + endif(SKIP_COROUTINE_DETECTION) + endif(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") + + if (NOT SKIP_COROUTINE_DETECTION) + find_package(CXXStdCoroutine MODULE REQUIRED COMPONENTS Final Experimental) + set(TTG_HAVE_COROUTINE CXXStdCoroutine_FOUND CACHE BOOL "True if the compiler has coroutine support") + endif(NOT SKIP_COROUTINE_DETECTION) +endif(TTG_ENABLE_COROUTINES) ########################## diff --git a/ttg/ttg/coroutine.h b/ttg/ttg/coroutine.h index 890a98b4f..dc1ed8e5b 100644 --- a/ttg/ttg/coroutine.h +++ b/ttg/ttg/coroutine.h @@ -6,12 +6,13 @@ #define TTG_COROUTINE_H #include "ttg/config.h" + +#ifdef TTG_HAVE_COROUTINE #include TTG_CXX_COROUTINE_HEADER #include #include -#ifdef TTG_HAVE_COROUTINE namespace ttg { diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index a241d2b3b..73672d285 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -1724,6 +1724,7 @@ namespace ttg_parsec { } task->tt->set_outputs_tls_ptr(old_output_tls_ptr); detail::parsec_ttg_caller = nullptr; + task->suspended_task_address = suspended_task_address; } else ttg::abort(); // unrecognized task id @@ -1731,7 +1732,6 @@ namespace ttg_parsec { ttg::abort(); // should not happen #endif // TTG_HAVE_COROUTINE } - task->suspended_task_address = suspended_task_address; if (suspended_task_address == nullptr) { ttT *baseobj = task->tt;