From bb5b44b00594749c2f8efeac9f51b58f6c75ff9a Mon Sep 17 00:00:00 2001 From: adstraw Date: Fri, 12 Aug 2022 12:43:57 -0700 Subject: [PATCH 01/11] [Hexagon] Asynchronous DMA support --- src/runtime/hexagon/hexagon_user_dma.cc | 108 +++++++---- src/runtime/hexagon/hexagon_user_dma.h | 63 +++++++ .../hexagon/hexagon_user_dma_descriptors.h | 2 - .../hexagon/hexagon_user_dma_instructions.h | 8 +- .../hexagon/hexagon_user_dma_tests.cc | 175 ++++++++++++++++++ 5 files changed, 312 insertions(+), 44 deletions(-) create mode 100644 src/runtime/hexagon/hexagon_user_dma.h create mode 100644 tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc diff --git a/src/runtime/hexagon/hexagon_user_dma.cc b/src/runtime/hexagon/hexagon_user_dma.cc index 0e3fbd2048f6..b486e8e55b50 100644 --- a/src/runtime/hexagon/hexagon_user_dma.cc +++ b/src/runtime/hexagon/hexagon_user_dma.cc @@ -17,6 +17,8 @@ * under the License. */ +#include "hexagon_user_dma.h" + #include #include "hexagon_common.h" @@ -28,55 +30,41 @@ namespace tvm { namespace runtime { namespace hexagon { -int init_hexagon_user_dma() { -#if __HEXAGON_ARCH__ >= 68 +unsigned int HexagonUserDMA::Init() { // reset DMA engine unsigned int status = dmpause() & DM0_STATUS_MASK; - if (status != DM0_STATUS_IDLE) { - return DMA_FAILURE; - } -#endif - return DMA_SUCCESS; + return status; } -int hexagon_user_dma_1d_sync_helper(void* dst, void* src, uint32_t length) { -#if __HEXAGON_ARCH__ >= 68 - static int config_dma = init_hexagon_user_dma(); - if (config_dma != DMA_SUCCESS) { +int HexagonUserDMA::Copy(void* dst, void* src, uint32_t length) { + // length limited to 24 bits + if (length > DESC_LENGTH_MASK) { return DMA_FAILURE; } - uint64_t src64 = reinterpret_cast(src); // source address limited to 32 bits - if (src64 > DESC_SRC_MASK) { + uint64_t src64 = reinterpret_cast(src); + if (!src64 || src64 > DESC_SRC_MASK) { return DMA_FAILURE; } - uint64_t dst64 = reinterpret_cast(dst); // destination address limited to 32 bits - if (dst64 > DESC_DST_MASK) { - return DMA_FAILURE; - } - - // length limited to 24 bits - if (length > DESC_LENGTH_MASK) { + uint64_t dst64 = reinterpret_cast(dst); + if (!dst64 || dst64 > DESC_DST_MASK) { return DMA_FAILURE; } - uint32_t src32 = src64 & DESC_SRC_MASK; - uint32_t dst32 = dst64 & DESC_DST_MASK; + uint32_t src32 = static_cast(src64); + uint32_t dst32 = static_cast(dst64); + // allocate new descriptor void* dma_desc = nullptr; - int ret = posix_memalign(&dma_desc, DMA_DESC_2D_SIZE, DMA_DESC_2D_SIZE); - if (ret) { - return DMA_FAILURE; - } - - if (!dma_desc) { + if (ret || !dma_desc) { return DMA_FAILURE; } + // populate descriptor fields dma_desc_set_state(dma_desc, DESC_STATE_READY); dma_desc_set_next(dma_desc, DMA_NULL_PTR); dma_desc_set_length(dma_desc, length); @@ -90,23 +78,66 @@ int hexagon_user_dma_1d_sync_helper(void* dst, void* src, uint32_t length) { dma_desc_set_src(dma_desc, src32); dma_desc_set_dst(dma_desc, dst32); - dmstart(dma_desc); - unsigned int status = dmwait() & DM0_STATUS_MASK; - unsigned int done = dma_desc_get_done(dma_desc); + if (first_dma_) { + // reset DMA engine + auto status = Init(); + if (status != DM0_STATUS_IDLE) { + return DMA_FAILURE; + } + + // `dmstart` first descriptor + dmstart(dma_desc); + first_dma_ = false; + } else { + // `dmlink` descriptor to tail + dmlink(dma_descriptors_.back(), dma_desc); + } - free(dma_desc); + // set descriptor as new tail + dma_descriptors_.push_back(dma_desc); - if (status == DM0_STATUS_IDLE && done == DESC_DONE_COMPLETE) { - return DMA_SUCCESS; + return DMA_SUCCESS; +} + +void HexagonUserDMA::Wait(uint32_t max_dmas_in_flight) { + while (DMAsInFlight() > max_dmas_in_flight) { + } +} + +uint32_t HexagonUserDMA::Poll() { return DMAsInFlight(); } + +uint32_t HexagonUserDMA::DMAsInFlight() { + // poll DMA engine to make sure DMA status is current + dmpoll(); + + // find the oldest DMA in flight + for (; oldest_dma_in_flight_ < dma_descriptors_.size(); ++oldest_dma_in_flight_) { + // read the `done` bit from the DMA descriptor and stop if incomplete + unsigned int done = dma_desc_get_done(dma_descriptors_[oldest_dma_in_flight_]); + if (done == DESC_DONE_INCOMPLETE) { + break; + } + } + // total DMAs in flight = total DMAs - oldest DMA in flight + return dma_descriptors_.size() - oldest_dma_in_flight_; +} + +HexagonUserDMA::~HexagonUserDMA() { + Init(); // reset DMA engine + for (auto dma_desc : dma_descriptors_) { + free(dma_desc); } -#endif - return DMA_FAILURE; } int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length) { // One DMA transfer can copy at most DESC_LENGTH_MASK bytes. // Make the common case quick. - if (length <= DESC_LENGTH_MASK) return hexagon_user_dma_1d_sync_helper(dst, src, length); + if (length <= DESC_LENGTH_MASK) { + int ret_val = HexagonUserDMA::Get().Copy(dst, src, length); + if (ret_val != DMA_SUCCESS) return ret_val; + HexagonUserDMA::Get().Wait(0); + return DMA_SUCCESS; + } // Split big transfers into smaller transfers. char* cast_src = static_cast(src); @@ -114,8 +145,9 @@ int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length) { for (uint32_t i = 0; i < length;) { // Ensure there is no overflow while updating i uint32_t cur_len = std::min(length - i, DESC_LENGTH_MASK); - int ret_val = hexagon_user_dma_1d_sync_helper(&cast_dst[i], &cast_src[i], cur_len); + int ret_val = HexagonUserDMA::Get().Copy(&cast_dst[i], &cast_src[i], cur_len); if (ret_val != DMA_SUCCESS) return ret_val; + HexagonUserDMA::Get().Wait(0); // 2 cases for new val for i: // 1. length - i <= DESC_LENGTH_MASK (<= MAX_UINT) // new_i = i + (length - i) = length, no more iter diff --git a/src/runtime/hexagon/hexagon_user_dma.h b/src/runtime/hexagon/hexagon_user_dma.h new file mode 100644 index 000000000000..b9559a7e1827 --- /dev/null +++ b/src/runtime/hexagon/hexagon_user_dma.h @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include + +#ifndef TVM_RUNTIME_HEXAGON_HEXAGON_USER_DMA_H_ +#define TVM_RUNTIME_HEXAGON_HEXAGON_USER_DMA_H_ + +namespace tvm { +namespace runtime { +namespace hexagon { + +#define DMA_SUCCESS 0 +#define DMA_FAILURE -1 + +class HexagonUserDMA { + public: + int Copy(void* dst, void* src, uint32_t length); + void Wait(uint32_t max_dmas_in_flight); + uint32_t Poll(); + + static HexagonUserDMA& Get() { + static HexagonUserDMA* hud = new HexagonUserDMA(); + return *hud; + } + + private: + HexagonUserDMA() = default; + ~HexagonUserDMA(); + HexagonUserDMA(const HexagonUserDMA&) = delete; + HexagonUserDMA& operator=(const HexagonUserDMA&) = delete; + HexagonUserDMA(HexagonUserDMA&&) = delete; + HexagonUserDMA& operator=(HexagonUserDMA&&) = delete; + + unsigned int Init(); + uint32_t DMAsInFlight(); + + bool first_dma_{true}; + uint32_t oldest_dma_in_flight_{0}; + std::vector dma_descriptors_; +}; + +} // namespace hexagon +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_HEXAGON_HEXAGON_USER_DMA_H_ diff --git a/src/runtime/hexagon/hexagon_user_dma_descriptors.h b/src/runtime/hexagon/hexagon_user_dma_descriptors.h index 643dbc5e8bf5..913b025df138 100644 --- a/src/runtime/hexagon/hexagon_user_dma_descriptors.h +++ b/src/runtime/hexagon/hexagon_user_dma_descriptors.h @@ -126,8 +126,6 @@ namespace hexagon { #define DESC_DSTWIDTHOFFSET_MASK 0xFFFF0000 #define DESC_DSTWIDTHOFFSET_SHIFT 16 -#define DMA_SUCCESS 0 -#define DMA_FAILURE -1 #define DMA_NULL_PTR 0 /**************************/ diff --git a/src/runtime/hexagon/hexagon_user_dma_instructions.h b/src/runtime/hexagon/hexagon_user_dma_instructions.h index e160b7395658..2345d4daaf21 100644 --- a/src/runtime/hexagon/hexagon_user_dma_instructions.h +++ b/src/runtime/hexagon/hexagon_user_dma_instructions.h @@ -24,8 +24,6 @@ namespace tvm { namespace runtime { namespace hexagon { -#if __HEXAGON_ARCH__ >= 68 - inline unsigned int dmpause() { unsigned int dm0 = 0; asm volatile(" %0 = dmpause" : "=r"(dm0)); @@ -34,6 +32,10 @@ inline unsigned int dmpause() { inline void dmstart(void* next) { asm volatile(" dmstart(%0)" : : "r"(next)); } +inline void dmlink(void* tail, void* next) { + asm volatile(" dmlink(%0, %1)" : : "r"(tail), "r"(next)); +} + inline unsigned int dmpoll() { unsigned int dm0 = 0; asm volatile(" %0 = dmpoll" : "=r"(dm0)); @@ -70,8 +72,6 @@ inline void dmcfgwr(unsigned int dmindex, unsigned int data) { asm volatile(" dmcfgwr(%0, %1)" : : "r"(dmindex), "r"(data)); } -#endif - } // namespace hexagon } // namespace runtime } // namespace tvm diff --git a/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc new file mode 100644 index 000000000000..3339ed56de6f --- /dev/null +++ b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc @@ -0,0 +1,175 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +//#include +#include +#include + +#include "../src/runtime/hexagon/hexagon_user_dma.h" + +using namespace tvm::runtime; +using namespace tvm::runtime::hexagon; + +class HexagonUserDMATest : public ::testing::Test { + void SetUp() override { + src = malloc(length); + dst = malloc(length); + ASSERT_NE(src, nullptr); + ASSERT_NE(dst, nullptr); + + src_char = static_cast(src); + dst_char = static_cast(dst); + for (uint32_t i = 0; i < length; ++i) { + src_char[i] = 1; + dst_char[i] = 0; + } + } + void TearDown() override { + free(src); + free(dst); + } + + public: + int ret{-1}; + void* src{nullptr}; + void* dst{nullptr}; + char* src_char{nullptr}; + char* dst_char{nullptr}; + uint32_t length{0x400000}; // 4MB +}; + +TEST_F(HexagonUserDMATest, wait) { + HexagonUserDMA::Get().Wait(0); + HexagonUserDMA::Get().Wait(10); +} + +TEST_F(HexagonUserDMATest, poll) { ASSERT_EQ(HexagonUserDMA::Get().Poll(), 0); } + +TEST_F(HexagonUserDMATest, bad_copy) { + uint64_t bigaddr = 0x100000000; + void* src64 = reinterpret_cast(bigaddr); + void* dst64 = reinterpret_cast(bigaddr); + uint32_t biglength = 0x1000000; + ASSERT_NE(HexagonUserDMA::Get().Copy(dst64, src, length), DMA_SUCCESS); + ASSERT_NE(HexagonUserDMA::Get().Copy(dst, src64, length), DMA_SUCCESS); + ASSERT_NE(HexagonUserDMA::Get().Copy(dst, src, biglength), DMA_SUCCESS); +} + +TEST_F(HexagonUserDMATest, sync_dma) { + // kick off 1 DMA + ret = HexagonUserDMA::Get().Copy(dst, src, length); + ASSERT_EQ(ret, DMA_SUCCESS); + + // wait for DMA to complete + HexagonUserDMA::Get().Wait(0); + + // verify + for (uint32_t i = 0; i < length; ++i) { + ASSERT_EQ(src_char[i], dst_char[i]); + } +} + +TEST_F(HexagonUserDMATest, async_dma) { + // kick off 10x duplicate DMAs + for (uint32_t i = 0; i < 10; ++i) { + ret = HexagonUserDMA::Get().Copy(dst, src, length); + ASSERT_EQ(ret, DMA_SUCCESS); + } + + // verify at least 1 DMA in flight + ASSERT_GT(HexagonUserDMA::Get().Poll(), 0); + + // wait for at least 1 DMA to complete + HexagonUserDMA::Get().Wait(9); + + // verify + for (uint32_t i = 0; i < length; ++i) { + ASSERT_EQ(src_char[i], dst_char[i]); + } +} + +TEST_F(HexagonUserDMATest, pipeline) { + // auto start = std::chrono::high_resolution_clock::now(); + time_t start = time(nullptr); + + // sync DMA + ret = HexagonUserDMA::Get().Copy(dst, src, length); + HexagonUserDMA::Get().Wait(0); + + // compute + for (uint32_t i = 0; i < length; ++i) { + dst_char[i]++; + } + + // auto end = std::chrono::high_resolution_clock::now(); + // auto sync_ns = std::chrono::duration_cast(end-start).count(); + time_t end = time(nullptr); + double sync_ns = difftime(start, end); + + // verify + ASSERT_EQ(ret, 0); + for (uint32_t i = 0; i < length; ++i) { + ASSERT_EQ(dst_char[i], 2); + } + + // start = std::chrono::high_resolution_clock::now(); + start = time(nullptr); + + uint32_t pipeline_depth = 4; + uint32_t pipeline_length = length / pipeline_depth; + + for (uint32_t i = 0; i < pipeline_depth; ++i) { + ret |= HexagonUserDMA::Get().Copy(dst_char + i * pipeline_length, + src_char + i * pipeline_length, pipeline_length); + } + + HexagonUserDMA::Get().Wait(3); + for (uint32_t i = 0; i < pipeline_length; ++i) { + dst_char[i]++; + } + + HexagonUserDMA::Get().Wait(2); + for (uint32_t i = pipeline_length; i < 2 * pipeline_length; ++i) { + dst_char[i]++; + } + + HexagonUserDMA::Get().Wait(1); + for (uint32_t i = 2 * pipeline_length; i < 3 * pipeline_length; ++i) { + dst_char[i]++; + } + + HexagonUserDMA::Get().Wait(0); + for (uint32_t i = 3 * pipeline_length; i < 4 * pipeline_length; ++i) { + dst_char[i]++; + } + + // end = std::chrono::high_resolution_clock::now(); + // auto async_ns = std::chrono::duration_cast(end-start).count(); + end = time(nullptr); + double async_ns = difftime(start, end); + + // verify + ASSERT_EQ(ret, 0); + for (uint32_t i = 0; i < length; ++i) { + ASSERT_EQ(dst_char[i], 2); + } + + // TODO: time/chrono not working, returns 0 + ASSERT_LE(async_ns, sync_ns); +} \ No newline at end of file From 27ad1b62913ba47f8dbd618e7aa44ec70b86aeed Mon Sep 17 00:00:00 2001 From: adstraw Date: Mon, 15 Aug 2022 13:15:04 -0700 Subject: [PATCH 02/11] turn off tests with TODO that fail in CI on simulator add comments --- src/runtime/hexagon/hexagon_user_dma.cc | 4 ++ src/runtime/hexagon/hexagon_user_dma.h | 31 +++++++++++++- .../hexagon/hexagon_user_dma_tests.cc | 42 ++----------------- 3 files changed, 37 insertions(+), 40 deletions(-) diff --git a/src/runtime/hexagon/hexagon_user_dma.cc b/src/runtime/hexagon/hexagon_user_dma.cc index b486e8e55b50..66c69b134136 100644 --- a/src/runtime/hexagon/hexagon_user_dma.cc +++ b/src/runtime/hexagon/hexagon_user_dma.cc @@ -78,6 +78,7 @@ int HexagonUserDMA::Copy(void* dst, void* src, uint32_t length) { dma_desc_set_src(dma_desc, src32); dma_desc_set_dst(dma_desc, dst32); + // only for first DMA if (first_dma_) { // reset DMA engine auto status = Init(); @@ -100,6 +101,7 @@ int HexagonUserDMA::Copy(void* dst, void* src, uint32_t length) { } void HexagonUserDMA::Wait(uint32_t max_dmas_in_flight) { + // wait (forever) until max DMAs in flight <= actual DMAs in flight while (DMAsInFlight() > max_dmas_in_flight) { } } @@ -133,6 +135,7 @@ int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length) { // One DMA transfer can copy at most DESC_LENGTH_MASK bytes. // Make the common case quick. if (length <= DESC_LENGTH_MASK) { + // sync DMA -> `Copy` and then `Wait(0)` int ret_val = HexagonUserDMA::Get().Copy(dst, src, length); if (ret_val != DMA_SUCCESS) return ret_val; HexagonUserDMA::Get().Wait(0); @@ -145,6 +148,7 @@ int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length) { for (uint32_t i = 0; i < length;) { // Ensure there is no overflow while updating i uint32_t cur_len = std::min(length - i, DESC_LENGTH_MASK); + // sync DMA -> `Copy` and then `Wait(0)` int ret_val = HexagonUserDMA::Get().Copy(&cast_dst[i], &cast_src[i], cur_len); if (ret_val != DMA_SUCCESS) return ret_val; HexagonUserDMA::Get().Wait(0); diff --git a/src/runtime/hexagon/hexagon_user_dma.h b/src/runtime/hexagon/hexagon_user_dma.h index b9559a7e1827..f3be34d5d9dc 100644 --- a/src/runtime/hexagon/hexagon_user_dma.h +++ b/src/runtime/hexagon/hexagon_user_dma.h @@ -31,10 +31,29 @@ namespace hexagon { class HexagonUserDMA { public: + /*! + * \brief Initiate DMA to copy memory from source to destination address + * \param dst Destination address + * \param src Source address + * \param length Length in bytes to copy + * \returns Status, either DMA_SUCCESS or DMA_FAILURE + */ int Copy(void* dst, void* src, uint32_t length); + + /*! + * \brief Wait until the number of DMAs in flight is less than or equal to some maximum + * \param max_dmas_in_flight Maximum number of DMAs allowed to be in flight + * to satisfy the `Wait` e.g. use `Wait(0)` to wait on "all" outstanding DMAs to complete + */ void Wait(uint32_t max_dmas_in_flight); + + /*! + * \brief Poll the number of DMAs in flight + * \returns Number of DMAs in flight + */ uint32_t Poll(); + //! HexagonUserDMA uses the singleton pattern static HexagonUserDMA& Get() { static HexagonUserDMA* hud = new HexagonUserDMA(); return *hud; @@ -48,12 +67,20 @@ class HexagonUserDMA { HexagonUserDMA(HexagonUserDMA&&) = delete; HexagonUserDMA& operator=(HexagonUserDMA&&) = delete; + //! \brief Initializes / resets the Hexagon User DMA engine unsigned int Init(); + + //! \brief Calculates the number of DMAs in flight uint32_t DMAsInFlight(); - bool first_dma_{true}; - uint32_t oldest_dma_in_flight_{0}; + //! \brief Stores descriptors for all DMAs std::vector dma_descriptors_; + + //! \brief Index to the descriptor for the oldest DMA in flight + uint32_t oldest_dma_in_flight_{0}; + + //! \brief Tracks whether (or not) we are executing the very first DMA + bool first_dma_{true}; }; } // namespace hexagon diff --git a/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc index 3339ed56de6f..7cc87113f37b 100644 --- a/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc +++ b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc @@ -17,9 +17,7 @@ * under the License. */ -//#include #include -#include #include "../src/runtime/hexagon/hexagon_user_dma.h" @@ -46,7 +44,7 @@ class HexagonUserDMATest : public ::testing::Test { } public: - int ret{-1}; + int ret{0}; void* src{nullptr}; void* dst{nullptr}; char* src_char{nullptr}; @@ -93,7 +91,8 @@ TEST_F(HexagonUserDMATest, async_dma) { } // verify at least 1 DMA in flight - ASSERT_GT(HexagonUserDMA::Get().Poll(), 0); + // TODO: re-enable when CI runs on hardware - fails on simulator + // ASSERT_GT(HexagonUserDMA::Get().Poll(), 0); // wait for at least 1 DMA to complete HexagonUserDMA::Get().Wait(9); @@ -104,33 +103,8 @@ TEST_F(HexagonUserDMATest, async_dma) { } } +// TODO: Run non-pipelined case with sync DMA and execution time vs. pipelined case TEST_F(HexagonUserDMATest, pipeline) { - // auto start = std::chrono::high_resolution_clock::now(); - time_t start = time(nullptr); - - // sync DMA - ret = HexagonUserDMA::Get().Copy(dst, src, length); - HexagonUserDMA::Get().Wait(0); - - // compute - for (uint32_t i = 0; i < length; ++i) { - dst_char[i]++; - } - - // auto end = std::chrono::high_resolution_clock::now(); - // auto sync_ns = std::chrono::duration_cast(end-start).count(); - time_t end = time(nullptr); - double sync_ns = difftime(start, end); - - // verify - ASSERT_EQ(ret, 0); - for (uint32_t i = 0; i < length; ++i) { - ASSERT_EQ(dst_char[i], 2); - } - - // start = std::chrono::high_resolution_clock::now(); - start = time(nullptr); - uint32_t pipeline_depth = 4; uint32_t pipeline_length = length / pipeline_depth; @@ -159,17 +133,9 @@ TEST_F(HexagonUserDMATest, pipeline) { dst_char[i]++; } - // end = std::chrono::high_resolution_clock::now(); - // auto async_ns = std::chrono::duration_cast(end-start).count(); - end = time(nullptr); - double async_ns = difftime(start, end); - // verify ASSERT_EQ(ret, 0); for (uint32_t i = 0; i < length; ++i) { ASSERT_EQ(dst_char[i], 2); } - - // TODO: time/chrono not working, returns 0 - ASSERT_LE(async_ns, sync_ns); } \ No newline at end of file From 83a7c34dcaba177b4b7fc4150c7574cd6ed67447 Mon Sep 17 00:00:00 2001 From: adstraw Date: Wed, 17 Aug 2022 14:31:43 -0700 Subject: [PATCH 03/11] add #if __HEXAGON_ARCH__ >= 68 guards around 1D sync DMA calls --- src/runtime/hexagon/hexagon_buffer.cc | 6 +++++- src/runtime/hexagon/hexagon_device_api.cc | 7 ++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/runtime/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon_buffer.cc index 0fc71d8ac29c..25dfa89d364d 100644 --- a/src/runtime/hexagon/hexagon_buffer.cc +++ b/src/runtime/hexagon/hexagon_buffer.cc @@ -240,8 +240,12 @@ void hexagon_buffer_copy_across_regions(const BufferSet& dest, const BufferSet& // Finally, do the memory copies. for (const auto& copy : macro_copies) { - int error_code = hexagon_user_dma_1d_sync(copy.dest, copy.src, copy.num_bytes); +#if __HEXAGON_ARCH__ >= 68 + int error_code = hexagon_user_dma_1d_sync(dst, src, size); CHECK_EQ(error_code, 0); +#else + memcpy(dst, src, size); +#endif } } diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc index 92a7b22784fb..4ff4fea92092 100644 --- a/src/runtime/hexagon/hexagon_device_api.cc +++ b/src/runtime/hexagon/hexagon_device_api.cc @@ -170,7 +170,12 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVM void* src = args[1]; int size = args[2]; - hexagon_user_dma_1d_sync(dst, src, size); +#if __HEXAGON_ARCH__ >= 68 + int error_code = hexagon_user_dma_1d_sync(dst, src, size); + CHECK_EQ(error_code, 0); +#else + memcpy(dst, src, size); +#endif *rv = static_cast(0); }); From edf3b9ab257fc88d99c1e3155c80f97960355a55 Mon Sep 17 00:00:00 2001 From: adstraw Date: Wed, 17 Aug 2022 14:44:21 -0700 Subject: [PATCH 04/11] fix Hexagon build error --- src/runtime/hexagon/hexagon_buffer.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon_buffer.cc index 25dfa89d364d..cd3fd8f971e8 100644 --- a/src/runtime/hexagon/hexagon_buffer.cc +++ b/src/runtime/hexagon/hexagon_buffer.cc @@ -241,10 +241,10 @@ void hexagon_buffer_copy_across_regions(const BufferSet& dest, const BufferSet& // Finally, do the memory copies. for (const auto& copy : macro_copies) { #if __HEXAGON_ARCH__ >= 68 - int error_code = hexagon_user_dma_1d_sync(dst, src, size); + int error_code = hexagon_user_dma_1d_sync(copy.dest, copy.src, copy.num_bytes); CHECK_EQ(error_code, 0); #else - memcpy(dst, src, size); + memcpy(copy.dest, copy.src, copy.num_bytes); #endif } } From f7f07d29fe1b38500d36d8f20b8f5cd9a1b02875 Mon Sep 17 00:00:00 2001 From: adstraw Date: Thu, 18 Aug 2022 17:45:51 -0700 Subject: [PATCH 05/11] address feedback; Init() to ctor; add async_dma_poll test --- src/runtime/hexagon/hexagon_user_dma.cc | 15 ++++----- src/runtime/hexagon/hexagon_user_dma.h | 8 ++--- .../hexagon/hexagon_user_dma_tests.cc | 32 +++++++++++++++---- 3 files changed, 35 insertions(+), 20 deletions(-) diff --git a/src/runtime/hexagon/hexagon_user_dma.cc b/src/runtime/hexagon/hexagon_user_dma.cc index 66c69b134136..39f58178a1a0 100644 --- a/src/runtime/hexagon/hexagon_user_dma.cc +++ b/src/runtime/hexagon/hexagon_user_dma.cc @@ -31,7 +31,6 @@ namespace runtime { namespace hexagon { unsigned int HexagonUserDMA::Init() { - // reset DMA engine unsigned int status = dmpause() & DM0_STATUS_MASK; return status; } @@ -78,14 +77,7 @@ int HexagonUserDMA::Copy(void* dst, void* src, uint32_t length) { dma_desc_set_src(dma_desc, src32); dma_desc_set_dst(dma_desc, dst32); - // only for first DMA if (first_dma_) { - // reset DMA engine - auto status = Init(); - if (status != DM0_STATUS_IDLE) { - return DMA_FAILURE; - } - // `dmstart` first descriptor dmstart(dma_desc); first_dma_ = false; @@ -124,8 +116,13 @@ uint32_t HexagonUserDMA::DMAsInFlight() { return dma_descriptors_.size() - oldest_dma_in_flight_; } +HexagonUserDMA::HexagonUserDMA() { + // reset DMA engine + unsigned int status = Init(); + CHECK_EQ(status, DM0_STATUS_IDLE); +} + HexagonUserDMA::~HexagonUserDMA() { - Init(); // reset DMA engine for (auto dma_desc : dma_descriptors_) { free(dma_desc); } diff --git a/src/runtime/hexagon/hexagon_user_dma.h b/src/runtime/hexagon/hexagon_user_dma.h index f3be34d5d9dc..aff38aa867b9 100644 --- a/src/runtime/hexagon/hexagon_user_dma.h +++ b/src/runtime/hexagon/hexagon_user_dma.h @@ -17,11 +17,11 @@ * under the License. */ -#include - #ifndef TVM_RUNTIME_HEXAGON_HEXAGON_USER_DMA_H_ #define TVM_RUNTIME_HEXAGON_HEXAGON_USER_DMA_H_ +#include + namespace tvm { namespace runtime { namespace hexagon { @@ -60,14 +60,14 @@ class HexagonUserDMA { } private: - HexagonUserDMA() = default; + HexagonUserDMA(); ~HexagonUserDMA(); HexagonUserDMA(const HexagonUserDMA&) = delete; HexagonUserDMA& operator=(const HexagonUserDMA&) = delete; HexagonUserDMA(HexagonUserDMA&&) = delete; HexagonUserDMA& operator=(HexagonUserDMA&&) = delete; - //! \brief Initializes / resets the Hexagon User DMA engine + //! \brief Initializes the Hexagon User DMA engine unsigned int Init(); //! \brief Calculates the number of DMAs in flight diff --git a/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc index 7cc87113f37b..2fc1017f379e 100644 --- a/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc +++ b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc @@ -57,9 +57,9 @@ TEST_F(HexagonUserDMATest, wait) { HexagonUserDMA::Get().Wait(10); } -TEST_F(HexagonUserDMATest, poll) { ASSERT_EQ(HexagonUserDMA::Get().Poll(), 0); } +TEST_F(HexagonUserDMATest, just_poll) { ASSERT_EQ(HexagonUserDMA::Get().Poll(), 0); } -TEST_F(HexagonUserDMATest, bad_copy) { +TEST_F(HexagonUserDMATest, copy) { uint64_t bigaddr = 0x100000000; void* src64 = reinterpret_cast(bigaddr); void* dst64 = reinterpret_cast(bigaddr); @@ -83,17 +83,13 @@ TEST_F(HexagonUserDMATest, sync_dma) { } } -TEST_F(HexagonUserDMATest, async_dma) { +TEST_F(HexagonUserDMATest, async_dma_wait) { // kick off 10x duplicate DMAs for (uint32_t i = 0; i < 10; ++i) { ret = HexagonUserDMA::Get().Copy(dst, src, length); ASSERT_EQ(ret, DMA_SUCCESS); } - // verify at least 1 DMA in flight - // TODO: re-enable when CI runs on hardware - fails on simulator - // ASSERT_GT(HexagonUserDMA::Get().Poll(), 0); - // wait for at least 1 DMA to complete HexagonUserDMA::Get().Wait(9); @@ -101,6 +97,28 @@ TEST_F(HexagonUserDMATest, async_dma) { for (uint32_t i = 0; i < length; ++i) { ASSERT_EQ(src_char[i], dst_char[i]); } + + // empty the DMA queue + HexagonUserDMA::Get().Wait(0); +} + +TEST_F(HexagonUserDMATest, async_dma_poll) { + // kick off 10x duplicate DMAs + for (uint32_t i = 0; i < 10; ++i) { + ret = HexagonUserDMA::Get().Copy(dst, src, length); + ASSERT_EQ(ret, DMA_SUCCESS); + } + + // poll until at least 1 DMA is complete + while (HexagonUserDMA::Get().Poll() == 10) {}; + + // verify + for (uint32_t i = 0; i < length; ++i) { + ASSERT_EQ(src_char[i], dst_char[i]); + } + + // empty the DMA queue + HexagonUserDMA::Get().Wait(0); } // TODO: Run non-pipelined case with sync DMA and execution time vs. pipelined case From 5e06778a0770a9abb7a5a68288d34d2ae6c4eef8 Mon Sep 17 00:00:00 2001 From: adstraw Date: Thu, 18 Aug 2022 17:51:38 -0700 Subject: [PATCH 06/11] remove #if __HEXAGON_ARCH__ >= 68 guards per feedback --- src/runtime/hexagon/hexagon_buffer.cc | 4 ---- src/runtime/hexagon/hexagon_device_api.cc | 4 ---- tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc | 3 ++- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/src/runtime/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon_buffer.cc index cd3fd8f971e8..0fc71d8ac29c 100644 --- a/src/runtime/hexagon/hexagon_buffer.cc +++ b/src/runtime/hexagon/hexagon_buffer.cc @@ -240,12 +240,8 @@ void hexagon_buffer_copy_across_regions(const BufferSet& dest, const BufferSet& // Finally, do the memory copies. for (const auto& copy : macro_copies) { -#if __HEXAGON_ARCH__ >= 68 int error_code = hexagon_user_dma_1d_sync(copy.dest, copy.src, copy.num_bytes); CHECK_EQ(error_code, 0); -#else - memcpy(copy.dest, copy.src, copy.num_bytes); -#endif } } diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc index 4ff4fea92092..f22afca10bfa 100644 --- a/src/runtime/hexagon/hexagon_device_api.cc +++ b/src/runtime/hexagon/hexagon_device_api.cc @@ -170,12 +170,8 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVM void* src = args[1]; int size = args[2]; -#if __HEXAGON_ARCH__ >= 68 int error_code = hexagon_user_dma_1d_sync(dst, src, size); CHECK_EQ(error_code, 0); -#else - memcpy(dst, src, size); -#endif *rv = static_cast(0); }); diff --git a/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc index 2fc1017f379e..359845f15954 100644 --- a/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc +++ b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc @@ -110,7 +110,8 @@ TEST_F(HexagonUserDMATest, async_dma_poll) { } // poll until at least 1 DMA is complete - while (HexagonUserDMA::Get().Poll() == 10) {}; + while (HexagonUserDMA::Get().Poll() == 10) { + }; // verify for (uint32_t i = 0; i < length; ++i) { From c5ac10879699b5de6b5aaa40cdc8b8c9074ddb3f Mon Sep 17 00:00:00 2001 From: adstraw Date: Mon, 22 Aug 2022 15:32:02 -0700 Subject: [PATCH 07/11] use ring buffer to store DMA descriptors --- src/runtime/hexagon/hexagon_user_dma.cc | 62 ++++++++++++++----- src/runtime/hexagon/hexagon_user_dma.h | 27 ++++++-- .../hexagon/hexagon_user_dma_tests.cc | 4 +- 3 files changed, 68 insertions(+), 25 deletions(-) diff --git a/src/runtime/hexagon/hexagon_user_dma.cc b/src/runtime/hexagon/hexagon_user_dma.cc index 39f58178a1a0..92ea2d44c043 100644 --- a/src/runtime/hexagon/hexagon_user_dma.cc +++ b/src/runtime/hexagon/hexagon_user_dma.cc @@ -56,13 +56,26 @@ int HexagonUserDMA::Copy(void* dst, void* src, uint32_t length) { uint32_t src32 = static_cast(src64); uint32_t dst32 = static_cast(dst64); - // allocate new descriptor - void* dma_desc = nullptr; - int ret = posix_memalign(&dma_desc, DMA_DESC_2D_SIZE, DMA_DESC_2D_SIZE); - if (ret || !dma_desc) { - return DMA_FAILURE; + // check if the next DMA descriptor will overwrite an in flight DMA descriptor + // if this is the first DMA there is nothting to check + if (!first_dma_) { + // update the ID of the oldest DMA descriptor in flight + DMAsInFlight(); + // calcultate whether there are DMA descriptors in flight + bool dma_desc_in_flight = id_next_dma_desc_ != id_oldest_dma_desc_in_flight_; + // calculate whether the next DMA descriptor will overwrite the oldest DMA descriptor in flight + bool same_ring_buff_index = (id_next_dma_desc_ % dma_desc_ring_buff_size_) == + (id_oldest_dma_desc_in_flight_ % dma_desc_ring_buff_size_); + // fail if there are DMA descriptors in flight + // and the next DMA descriptor overwrites the oldest DMA descriptor in flight + if (dma_desc_in_flight && same_ring_buff_index) { + return DMA_FAILURE; + } } + // get pointer to next DMA descriptor + void* dma_desc = GetDescriptorAddr(id_next_dma_desc_); + // populate descriptor fields dma_desc_set_state(dma_desc, DESC_STATE_READY); dma_desc_set_next(dma_desc, DMA_NULL_PTR); @@ -82,12 +95,13 @@ int HexagonUserDMA::Copy(void* dst, void* src, uint32_t length) { dmstart(dma_desc); first_dma_ = false; } else { - // `dmlink` descriptor to tail - dmlink(dma_descriptors_.back(), dma_desc); + // `dmlink` descriptor to tail descriptor + void* tail = GetDescriptorAddr(id_next_dma_desc_ - 1); + dmlink(tail, dma_desc); } - // set descriptor as new tail - dma_descriptors_.push_back(dma_desc); + // update the ID of the next DMA descriptor + id_next_dma_desc_++; return DMA_SUCCESS; } @@ -104,28 +118,42 @@ uint32_t HexagonUserDMA::DMAsInFlight() { // poll DMA engine to make sure DMA status is current dmpoll(); - // find the oldest DMA in flight - for (; oldest_dma_in_flight_ < dma_descriptors_.size(); ++oldest_dma_in_flight_) { + // find the oldest DMA descriptor in flight + // total number of DMA descriptors in flight == ID of the next DMA descriptor + for (; id_oldest_dma_desc_in_flight_ < id_next_dma_desc_; ++id_oldest_dma_desc_in_flight_) { // read the `done` bit from the DMA descriptor and stop if incomplete - unsigned int done = dma_desc_get_done(dma_descriptors_[oldest_dma_in_flight_]); + unsigned int done = dma_desc_get_done(GetDescriptorAddr(id_oldest_dma_desc_in_flight_)); if (done == DESC_DONE_INCOMPLETE) { break; } } - // total DMAs in flight = total DMAs - oldest DMA in flight - return dma_descriptors_.size() - oldest_dma_in_flight_; + + // total DMA descriptors in flight = total number DMA desc - ID of the oldest DMA desc in flight + // note that these two IDs are equivalent when no DMA descriptors are in flight + return id_next_dma_desc_ - id_oldest_dma_desc_in_flight_; +} + +void* HexagonUserDMA::GetDescriptorAddr(uint32_t dma_desc_id) { + return static_cast(dma_desc_ring_buff_) + + DMA_DESC_2D_SIZE * (dma_desc_id % dma_desc_ring_buff_size_); } HexagonUserDMA::HexagonUserDMA() { // reset DMA engine unsigned int status = Init(); CHECK_EQ(status, DM0_STATUS_IDLE); + + // allocate memory for ring buffer storage for all DMA descriptors + int ret = posix_memalign(&dma_desc_ring_buff_, DMA_DESC_2D_SIZE, + DMA_DESC_2D_SIZE * dma_desc_ring_buff_size_); + CHECK_EQ(ret, 0); + CHECK_NE(dma_desc_ring_buff_, nullptr); } HexagonUserDMA::~HexagonUserDMA() { - for (auto dma_desc : dma_descriptors_) { - free(dma_desc); - } + // stop the DMA engine + Init(); + free(dma_desc_ring_buff_); } int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length) { diff --git a/src/runtime/hexagon/hexagon_user_dma.h b/src/runtime/hexagon/hexagon_user_dma.h index aff38aa867b9..e5f488d1a151 100644 --- a/src/runtime/hexagon/hexagon_user_dma.h +++ b/src/runtime/hexagon/hexagon_user_dma.h @@ -20,7 +20,7 @@ #ifndef TVM_RUNTIME_HEXAGON_HEXAGON_USER_DMA_H_ #define TVM_RUNTIME_HEXAGON_HEXAGON_USER_DMA_H_ -#include +#include namespace tvm { namespace runtime { @@ -70,14 +70,29 @@ class HexagonUserDMA { //! \brief Initializes the Hexagon User DMA engine unsigned int Init(); - //! \brief Calculates the number of DMAs in flight + //! \brief Calculates and returns the number of DMAs in flight; updates the ID of the oldest + //! descriptor in flight uint32_t DMAsInFlight(); - //! \brief Stores descriptors for all DMAs - std::vector dma_descriptors_; + //! \brief Calculates and returns the address of a DMA descriptor in the ring buffer given a + //! descriptor ID + void* GetDescriptorAddr(uint32_t dma_desc_id); - //! \brief Index to the descriptor for the oldest DMA in flight - uint32_t oldest_dma_in_flight_{0}; + //! \brief Pointer to ring buffer storage for all DMA descriptors + void* dma_desc_ring_buff_{nullptr}; + + //! \brief Size of ring buffer storage for all DMA descriptors + const uint32_t dma_desc_ring_buff_size_{100}; + + //! \brief Tracks both the total number of DMA descriptors and the ID of the next DMA descriptor + //! to be added to the ring buffer - modulo ring buffer size to find the ring buffer index for the + //! next DMA descriptor + uint32_t id_next_dma_desc_{0}; + + //! \brief Tracks the ID of the oldest DMA descriptor in flight OR the ID of the next DMA + //! descriptor if no DMA descriptors are in flight - modulo ring buffer size to find the ring + //! buffer index for the oldest DMA descriptor in flight + uint32_t id_oldest_dma_desc_in_flight_{0}; //! \brief Tracks whether (or not) we are executing the very first DMA bool first_dma_{true}; diff --git a/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc index 359845f15954..9f2e56800bec 100644 --- a/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc +++ b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc @@ -57,9 +57,9 @@ TEST_F(HexagonUserDMATest, wait) { HexagonUserDMA::Get().Wait(10); } -TEST_F(HexagonUserDMATest, just_poll) { ASSERT_EQ(HexagonUserDMA::Get().Poll(), 0); } +TEST_F(HexagonUserDMATest, poll) { ASSERT_EQ(HexagonUserDMA::Get().Poll(), 0); } -TEST_F(HexagonUserDMATest, copy) { +TEST_F(HexagonUserDMATest, bad_copy) { uint64_t bigaddr = 0x100000000; void* src64 = reinterpret_cast(bigaddr); void* dst64 = reinterpret_cast(bigaddr); From bd1f7d5d932eb4419186f2e0eb4442dd2a5b425c Mon Sep 17 00:00:00 2001 From: adstraw Date: Wed, 24 Aug 2022 17:44:18 -0700 Subject: [PATCH 08/11] add RingBuffer class; used by HexUserDMA to store descriptors --- src/runtime/hexagon/hexagon_user_dma.cc | 73 +++--------- src/runtime/hexagon/hexagon_user_dma.h | 40 +++---- src/runtime/hexagon/ring_buffer.h | 94 +++++++++++++++ .../cpp-runtime/hexagon/ring_buffer_tests.cc | 109 ++++++++++++++++++ 4 files changed, 234 insertions(+), 82 deletions(-) create mode 100644 src/runtime/hexagon/ring_buffer.h create mode 100644 tests/cpp-runtime/hexagon/ring_buffer_tests.cc diff --git a/src/runtime/hexagon/hexagon_user_dma.cc b/src/runtime/hexagon/hexagon_user_dma.cc index 92ea2d44c043..85960f7545d0 100644 --- a/src/runtime/hexagon/hexagon_user_dma.cc +++ b/src/runtime/hexagon/hexagon_user_dma.cc @@ -21,11 +21,6 @@ #include -#include "hexagon_common.h" -#include "hexagon_user_dma_descriptors.h" -#include "hexagon_user_dma_instructions.h" -#include "hexagon_user_dma_registers.h" - namespace tvm { namespace runtime { namespace hexagon { @@ -56,26 +51,12 @@ int HexagonUserDMA::Copy(void* dst, void* src, uint32_t length) { uint32_t src32 = static_cast(src64); uint32_t dst32 = static_cast(dst64); - // check if the next DMA descriptor will overwrite an in flight DMA descriptor - // if this is the first DMA there is nothting to check - if (!first_dma_) { - // update the ID of the oldest DMA descriptor in flight - DMAsInFlight(); - // calcultate whether there are DMA descriptors in flight - bool dma_desc_in_flight = id_next_dma_desc_ != id_oldest_dma_desc_in_flight_; - // calculate whether the next DMA descriptor will overwrite the oldest DMA descriptor in flight - bool same_ring_buff_index = (id_next_dma_desc_ % dma_desc_ring_buff_size_) == - (id_oldest_dma_desc_in_flight_ % dma_desc_ring_buff_size_); - // fail if there are DMA descriptors in flight - // and the next DMA descriptor overwrites the oldest DMA descriptor in flight - if (dma_desc_in_flight && same_ring_buff_index) { - return DMA_FAILURE; - } + // get pointer to next descriptor + dma_desc_2d_t* dma_desc = descriptors_->Next(); + if (!dma_desc) { + return DMA_FAILURE; } - // get pointer to next DMA descriptor - void* dma_desc = GetDescriptorAddr(id_next_dma_desc_); - // populate descriptor fields dma_desc_set_state(dma_desc, DESC_STATE_READY); dma_desc_set_next(dma_desc, DMA_NULL_PTR); @@ -96,13 +77,11 @@ int HexagonUserDMA::Copy(void* dst, void* src, uint32_t length) { first_dma_ = false; } else { // `dmlink` descriptor to tail descriptor - void* tail = GetDescriptorAddr(id_next_dma_desc_ - 1); - dmlink(tail, dma_desc); + dmlink(tail_dma_desc_, dma_desc); } - // update the ID of the next DMA descriptor - id_next_dma_desc_++; - + // update tail + tail_dma_desc_ = dma_desc; return DMA_SUCCESS; } @@ -115,27 +94,8 @@ void HexagonUserDMA::Wait(uint32_t max_dmas_in_flight) { uint32_t HexagonUserDMA::Poll() { return DMAsInFlight(); } uint32_t HexagonUserDMA::DMAsInFlight() { - // poll DMA engine to make sure DMA status is current - dmpoll(); - - // find the oldest DMA descriptor in flight - // total number of DMA descriptors in flight == ID of the next DMA descriptor - for (; id_oldest_dma_desc_in_flight_ < id_next_dma_desc_; ++id_oldest_dma_desc_in_flight_) { - // read the `done` bit from the DMA descriptor and stop if incomplete - unsigned int done = dma_desc_get_done(GetDescriptorAddr(id_oldest_dma_desc_in_flight_)); - if (done == DESC_DONE_INCOMPLETE) { - break; - } - } - - // total DMA descriptors in flight = total number DMA desc - ID of the oldest DMA desc in flight - // note that these two IDs are equivalent when no DMA descriptors are in flight - return id_next_dma_desc_ - id_oldest_dma_desc_in_flight_; -} - -void* HexagonUserDMA::GetDescriptorAddr(uint32_t dma_desc_id) { - return static_cast(dma_desc_ring_buff_) + - DMA_DESC_2D_SIZE * (dma_desc_id % dma_desc_ring_buff_size_); + dmpoll(); // update DMA engine status + return descriptors_->InFlight(); } HexagonUserDMA::HexagonUserDMA() { @@ -143,17 +103,16 @@ HexagonUserDMA::HexagonUserDMA() { unsigned int status = Init(); CHECK_EQ(status, DM0_STATUS_IDLE); - // allocate memory for ring buffer storage for all DMA descriptors - int ret = posix_memalign(&dma_desc_ring_buff_, DMA_DESC_2D_SIZE, - DMA_DESC_2D_SIZE * dma_desc_ring_buff_size_); - CHECK_EQ(ret, 0); - CHECK_NE(dma_desc_ring_buff_, nullptr); + auto desc_in_flight = [](dma_desc_2d_t* dma_desc) { + unsigned int done = dma_desc_get_done(dma_desc); + return (done != DESC_DONE_COMPLETE); + }; + descriptors_ = new RingBuffer(100, desc_in_flight); } HexagonUserDMA::~HexagonUserDMA() { - // stop the DMA engine - Init(); - free(dma_desc_ring_buff_); + Init(); // stop DMA engine + delete descriptors_; } int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length) { diff --git a/src/runtime/hexagon/hexagon_user_dma.h b/src/runtime/hexagon/hexagon_user_dma.h index e5f488d1a151..d091067cfcfe 100644 --- a/src/runtime/hexagon/hexagon_user_dma.h +++ b/src/runtime/hexagon/hexagon_user_dma.h @@ -20,7 +20,11 @@ #ifndef TVM_RUNTIME_HEXAGON_HEXAGON_USER_DMA_H_ #define TVM_RUNTIME_HEXAGON_HEXAGON_USER_DMA_H_ -#include +#include "hexagon_common.h" +#include "hexagon_user_dma_descriptors.h" +#include "hexagon_user_dma_instructions.h" +#include "hexagon_user_dma_registers.h" +#include "ring_buffer.h" namespace tvm { namespace runtime { @@ -36,7 +40,7 @@ class HexagonUserDMA { * \param dst Destination address * \param src Source address * \param length Length in bytes to copy - * \returns Status, either DMA_SUCCESS or DMA_FAILURE + * \returns Status: DMA_SUCCESS or DMA_FAILURE */ int Copy(void* dst, void* src, uint32_t length); @@ -53,13 +57,14 @@ class HexagonUserDMA { */ uint32_t Poll(); - //! HexagonUserDMA uses the singleton pattern + //! \brief HexagonUserDMA uses the singleton pattern static HexagonUserDMA& Get() { static HexagonUserDMA* hud = new HexagonUserDMA(); return *hud; } private: + // HexagonUserDMA uses the singleton pattern HexagonUserDMA(); ~HexagonUserDMA(); HexagonUserDMA(const HexagonUserDMA&) = delete; @@ -70,32 +75,17 @@ class HexagonUserDMA { //! \brief Initializes the Hexagon User DMA engine unsigned int Init(); - //! \brief Calculates and returns the number of DMAs in flight; updates the ID of the oldest - //! descriptor in flight + //! \brief Calculates and returns the number of DMAs in flight uint32_t DMAsInFlight(); - //! \brief Calculates and returns the address of a DMA descriptor in the ring buffer given a - //! descriptor ID - void* GetDescriptorAddr(uint32_t dma_desc_id); - - //! \brief Pointer to ring buffer storage for all DMA descriptors - void* dma_desc_ring_buff_{nullptr}; - - //! \brief Size of ring buffer storage for all DMA descriptors - const uint32_t dma_desc_ring_buff_size_{100}; - - //! \brief Tracks both the total number of DMA descriptors and the ID of the next DMA descriptor - //! to be added to the ring buffer - modulo ring buffer size to find the ring buffer index for the - //! next DMA descriptor - uint32_t id_next_dma_desc_{0}; + //! \brief Tracks whether the very first DMA has been executed + bool first_dma_{true}; - //! \brief Tracks the ID of the oldest DMA descriptor in flight OR the ID of the next DMA - //! descriptor if no DMA descriptors are in flight - modulo ring buffer size to find the ring - //! buffer index for the oldest DMA descriptor in flight - uint32_t id_oldest_dma_desc_in_flight_{0}; + //! \brief Tracks the tail DMA descriptor + void* tail_dma_desc_{nullptr}; - //! \brief Tracks whether (or not) we are executing the very first DMA - bool first_dma_{true}; + //! \brief Storage for all DMA descriptors + RingBuffer* descriptors_{nullptr}; }; } // namespace hexagon diff --git a/src/runtime/hexagon/ring_buffer.h b/src/runtime/hexagon/ring_buffer.h new file mode 100644 index 000000000000..97e100cf8ac8 --- /dev/null +++ b/src/runtime/hexagon/ring_buffer.h @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef TVM_RUNTIME_HEXAGON_RING_BUFFER_H_ +#define TVM_RUNTIME_HEXAGON_RING_BUFFER_H_ + +#include + +#include "hexagon_common.h" + +namespace tvm { +namespace runtime { +namespace hexagon { + +template +class RingBuffer { + public: + //! \brief Returns the number of Ts in flight + uint32_t InFlight() { + while (!in_flight_(GetAddr(id_oldest_))) { + id_oldest_++; + } + return id_next_ - id_oldest_; + } + + //! \brief Returns pointer to next T; null if ring buffer is full + T* Next() { + if (InFlight() == ring_buff_size_) { + return nullptr; + } + T* next = GetAddr(id_next_); + id_next_++; + return next; + } + + /*! \brief Creates a ring buffer for storage items of type T + * \param ring_buff_size Size of the ring buffer in number of Ts + * \param in_flight Function that determines whether a T is in flight + */ + RingBuffer(uint32_t ring_buff_size, std::function in_flight) + : ring_buff_size_(ring_buff_size), in_flight_(in_flight) { + CHECK_NE(ring_buff_size, 0); + int ret = posix_memalign(reinterpret_cast(&ring_buff_ptr_), sizeof(T), + sizeof(T) * ring_buff_size_); + CHECK_EQ(ret, 0); + CHECK_NE(ring_buff_ptr_, nullptr); + } + + ~RingBuffer() { free(ring_buff_ptr_); } + + private: + //! \brief Returns the address of a T given its index + T* GetAddr(uint32_t id) const { + uint32_t ring_buff_index = id % ring_buff_size_; + return ring_buff_ptr_ + ring_buff_index; + } + + //! \brief Pointer to the ring buffer + T* ring_buff_ptr_{nullptr}; + + //! \brief Size of the ring buffer in number of Ts + const uint32_t ring_buff_size_; + + //! \brief Function that determines whether a T is in flight + const std::function in_flight_; + + //! \brief Tracks the ID of the next T to be added to the ring buffer + uint32_t id_next_{0}; + + //! \brief Tracks the ID of the oldest T in flight + uint32_t id_oldest_{0}; +}; + +} // namespace hexagon +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_HEXAGON_RING_BUFFER_H_ diff --git a/tests/cpp-runtime/hexagon/ring_buffer_tests.cc b/tests/cpp-runtime/hexagon/ring_buffer_tests.cc new file mode 100644 index 000000000000..a61ce82cf749 --- /dev/null +++ b/tests/cpp-runtime/hexagon/ring_buffer_tests.cc @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include + +#include "../src/runtime/hexagon/ring_buffer.h" + +using namespace tvm::runtime; +using namespace tvm::runtime::hexagon; + +class RingBufferTest : public ::testing::Test { + void SetUp() override { + always_in_flight_rb = new RingBuffer(10, always_in_flight); + ring_buff = new RingBuffer(10, check_answer); + } + void TearDown() override { delete always_in_flight_rb; } + + public: + std::function always_in_flight = [](int* ptr) { return true; }; + RingBuffer* always_in_flight_rb; + + std::function check_answer = [](int* ptr) { + if (*ptr == 42) { + // complete, retired, done + return false; + } + // in flight + return true; + }; + RingBuffer* ring_buff; +}; + +TEST_F(RingBufferTest, zero_size_ring_buffer) { + ASSERT_THROW(RingBuffer(0, always_in_flight), InternalError); +} + +TEST_F(RingBufferTest, in_flight) { ASSERT_EQ(always_in_flight_rb->InFlight(), 0); } + +TEST_F(RingBufferTest, next) { + int* ptr = always_in_flight_rb->Next(); + ASSERT_NE(ptr, nullptr); + ASSERT_EQ(always_in_flight_rb->InFlight(), 1); +} + +TEST_F(RingBufferTest, full) { + for (int i = 0; i < 10; ++i) { + int* ptr = always_in_flight_rb->Next(); + ASSERT_NE(ptr, nullptr); + } + ASSERT_EQ(always_in_flight_rb->InFlight(), 10); + ASSERT_EQ(always_in_flight_rb->Next(), nullptr); + ASSERT_EQ(always_in_flight_rb->InFlight(), 10); +} + +TEST_F(RingBufferTest, half_full) { + // these will complete + for (int i = 0; i < 5; ++i) { + int* ptr = ring_buff->Next(); + ASSERT_NE(ptr, nullptr); + *ptr = 42; + } + + // these will not complete + for (int i = 0; i < 5; ++i) { + int* ptr = ring_buff->Next(); + ASSERT_NE(ptr, nullptr); + *ptr = 43; + } + + ASSERT_EQ(ring_buff->InFlight(), 5); + ASSERT_NE(ring_buff->Next(), nullptr); + ASSERT_EQ(ring_buff->InFlight(), 6); +} + +TEST_F(RingBufferTest, still_full) { + // these will not complete + for (int i = 0; i < 5; ++i) { + int* ptr = ring_buff->Next(); + ASSERT_NE(ptr, nullptr); + *ptr = 43; + } + + // these would complete, but they are blocked + for (int i = 0; i < 5; ++i) { + int* ptr = ring_buff->Next(); + ASSERT_NE(ptr, nullptr); + *ptr = 42; + } + + ASSERT_EQ(ring_buff->InFlight(), 10); + ASSERT_EQ(ring_buff->Next(), nullptr); + ASSERT_EQ(ring_buff->InFlight(), 10); +} From 3e2cae6b8df3104e826173f2edb1393095a43707 Mon Sep 17 00:00:00 2001 From: adstraw Date: Wed, 24 Aug 2022 22:07:36 -0700 Subject: [PATCH 09/11] trigger ci From 2c5970249f3bafbbc633f9efa99292caf54b1841 Mon Sep 17 00:00:00 2001 From: adstraw Date: Thu, 25 Aug 2022 16:02:19 -0700 Subject: [PATCH 10/11] fix bug with RingBuffer::InFlight and improve testing --- src/runtime/hexagon/ring_buffer.h | 2 +- .../cpp-runtime/hexagon/ring_buffer_tests.cc | 149 ++++++++++++++---- 2 files changed, 116 insertions(+), 35 deletions(-) diff --git a/src/runtime/hexagon/ring_buffer.h b/src/runtime/hexagon/ring_buffer.h index 97e100cf8ac8..d21b2b9953c2 100644 --- a/src/runtime/hexagon/ring_buffer.h +++ b/src/runtime/hexagon/ring_buffer.h @@ -33,7 +33,7 @@ class RingBuffer { public: //! \brief Returns the number of Ts in flight uint32_t InFlight() { - while (!in_flight_(GetAddr(id_oldest_))) { + while (id_oldest_ < id_next_ && !in_flight_(GetAddr(id_oldest_))) { id_oldest_++; } return id_next_ - id_oldest_; diff --git a/tests/cpp-runtime/hexagon/ring_buffer_tests.cc b/tests/cpp-runtime/hexagon/ring_buffer_tests.cc index a61ce82cf749..cd40dca87b02 100644 --- a/tests/cpp-runtime/hexagon/ring_buffer_tests.cc +++ b/tests/cpp-runtime/hexagon/ring_buffer_tests.cc @@ -25,85 +25,166 @@ using namespace tvm::runtime; using namespace tvm::runtime::hexagon; class RingBufferTest : public ::testing::Test { - void SetUp() override { - always_in_flight_rb = new RingBuffer(10, always_in_flight); - ring_buff = new RingBuffer(10, check_answer); - } - void TearDown() override { delete always_in_flight_rb; } + void SetUp() override { ring_buff = new RingBuffer(size, in_flight); } + void TearDown() override { delete ring_buff; } public: - std::function always_in_flight = [](int* ptr) { return true; }; - RingBuffer* always_in_flight_rb; - - std::function check_answer = [](int* ptr) { + std::function in_flight = [](int* ptr) { if (*ptr == 42) { - // complete, retired, done + // finished return false; } // in flight return true; }; + + int finished = 42; + int inflight = 43; + uint32_t size = 4; + uint32_t half = size / 2; RingBuffer* ring_buff; }; TEST_F(RingBufferTest, zero_size_ring_buffer) { - ASSERT_THROW(RingBuffer(0, always_in_flight), InternalError); + ASSERT_THROW(RingBuffer(0, in_flight), InternalError); } -TEST_F(RingBufferTest, in_flight) { ASSERT_EQ(always_in_flight_rb->InFlight(), 0); } +TEST_F(RingBufferTest, in_flight) { ASSERT_EQ(ring_buff->InFlight(), 0); } TEST_F(RingBufferTest, next) { - int* ptr = always_in_flight_rb->Next(); + // get pointer to first item + int* ptr = ring_buff->Next(); ASSERT_NE(ptr, nullptr); - ASSERT_EQ(always_in_flight_rb->InFlight(), 1); + + // mark it in flight and check + *ptr = inflight; + ASSERT_EQ(ring_buff->InFlight(), 1); + + // mark it finished and check + *ptr = finished; + ASSERT_EQ(ring_buff->InFlight(), 0); } TEST_F(RingBufferTest, full) { - for (int i = 0; i < 10; ++i) { - int* ptr = always_in_flight_rb->Next(); + // fill the ring buffer + for (int i = 0; i < size; ++i) { + int* ptr = ring_buff->Next(); + ASSERT_NE(ptr, nullptr); + + // mark in flight and check + *ptr = inflight; + ASSERT_EQ(ring_buff->InFlight(), i + 1); + } + + // check that the ring buffer is full + ASSERT_EQ(ring_buff->Next(), nullptr); + ASSERT_EQ(ring_buff->InFlight(), size); +} + +TEST_F(RingBufferTest, wrap) { + // fill the ring buffer, but mark each finished + bool first = true; + int* firstptr = nullptr; + for (int i = 0; i < size; ++i) { + int* ptr = ring_buff->Next(); ASSERT_NE(ptr, nullptr); + + // save first ptr for later comparison + if (first) { + firstptr = ptr; + first = false; + } + + // mark finished and check + *ptr = finished; + ASSERT_EQ(ring_buff->InFlight(), 0); + } + + // reuse the first ring buffer entry + int* ptr = ring_buff->Next(); + ASSERT_EQ(ptr, firstptr); + + // mark it in flight and check + *ptr = inflight; + ASSERT_EQ(ring_buff->InFlight(), 1); + + // mark it finished and check + *ptr = finished; + ASSERT_EQ(ring_buff->InFlight(), 0); +} + +TEST_F(RingBufferTest, wrap_corner) { + for (int i = 0; i < size; ++i) { + int* ptr = ring_buff->Next(); + *ptr = finished; } - ASSERT_EQ(always_in_flight_rb->InFlight(), 10); - ASSERT_EQ(always_in_flight_rb->Next(), nullptr); - ASSERT_EQ(always_in_flight_rb->InFlight(), 10); + + // reuse the first ring buffer entry + int* ptr = ring_buff->Next(); + ASSERT_NE(ptr, nullptr); + + // user must mark the item "inflight" before checking in flight count + // here the "finished" status is inherited from the reused ring buffer entry + // thus the in flight count is zero instead one; which the user might expect + ASSERT_EQ(ring_buff->InFlight(), 0); + + // marking the item "inflight" after checking the in flight count + // will not change the outcome; the ring buffer considers the item "finished" + *ptr = inflight; + ASSERT_EQ(ring_buff->InFlight(), 0); } -TEST_F(RingBufferTest, half_full) { +TEST_F(RingBufferTest, half_in_flight) { // these will complete - for (int i = 0; i < 5; ++i) { + for (int i = 0; i < half; ++i) { int* ptr = ring_buff->Next(); ASSERT_NE(ptr, nullptr); - *ptr = 42; + *ptr = finished; + ASSERT_EQ(ring_buff->InFlight(), 0); } // these will not complete - for (int i = 0; i < 5; ++i) { + for (int i = 0; i < half; ++i) { int* ptr = ring_buff->Next(); ASSERT_NE(ptr, nullptr); - *ptr = 43; + *ptr = inflight; + ASSERT_EQ(ring_buff->InFlight(), i + 1); } - ASSERT_EQ(ring_buff->InFlight(), 5); - ASSERT_NE(ring_buff->Next(), nullptr); - ASSERT_EQ(ring_buff->InFlight(), 6); + // check half in flight + ASSERT_EQ(ring_buff->InFlight(), half); + + // get pointer to next item + int* ptr = ring_buff->Next(); + ASSERT_NE(ptr, nullptr); + + // mark it inflight and check + *ptr = inflight; + ASSERT_EQ(ring_buff->InFlight(), 3); + + // mark it finished and check also blocked + *ptr = finished; + ASSERT_EQ(ring_buff->InFlight(), 3); } -TEST_F(RingBufferTest, still_full) { +TEST_F(RingBufferTest, half_in_flight_blocked) { // these will not complete - for (int i = 0; i < 5; ++i) { + for (int i = 0; i < half; ++i) { int* ptr = ring_buff->Next(); ASSERT_NE(ptr, nullptr); - *ptr = 43; + *ptr = inflight; + ASSERT_EQ(ring_buff->InFlight(), i + 1); } // these would complete, but they are blocked - for (int i = 0; i < 5; ++i) { + for (int i = half; i < size; ++i) { int* ptr = ring_buff->Next(); ASSERT_NE(ptr, nullptr); - *ptr = 42; + *ptr = finished; + ASSERT_EQ(ring_buff->InFlight(), i + 1); } - ASSERT_EQ(ring_buff->InFlight(), 10); + // check that the ring buffer is full ASSERT_EQ(ring_buff->Next(), nullptr); - ASSERT_EQ(ring_buff->InFlight(), 10); + ASSERT_EQ(ring_buff->InFlight(), size); } From af53b91bc3adbb94714b61f0023f6f36739682ad Mon Sep 17 00:00:00 2001 From: adstraw Date: Thu, 25 Aug 2022 16:38:51 -0700 Subject: [PATCH 11/11] add test to overflow the HexagonUserDMA ring buffer --- src/runtime/hexagon/hexagon_user_dma.cc | 4 ++-- src/runtime/hexagon/hexagon_user_dma.h | 2 ++ .../hexagon/hexagon_user_dma_tests.cc | 24 ++++++++++++++++--- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/src/runtime/hexagon/hexagon_user_dma.cc b/src/runtime/hexagon/hexagon_user_dma.cc index 85960f7545d0..8d45b7590bc4 100644 --- a/src/runtime/hexagon/hexagon_user_dma.cc +++ b/src/runtime/hexagon/hexagon_user_dma.cc @@ -54,7 +54,7 @@ int HexagonUserDMA::Copy(void* dst, void* src, uint32_t length) { // get pointer to next descriptor dma_desc_2d_t* dma_desc = descriptors_->Next(); if (!dma_desc) { - return DMA_FAILURE; + return DMA_RETRY; } // populate descriptor fields @@ -107,7 +107,7 @@ HexagonUserDMA::HexagonUserDMA() { unsigned int done = dma_desc_get_done(dma_desc); return (done != DESC_DONE_COMPLETE); }; - descriptors_ = new RingBuffer(100, desc_in_flight); + descriptors_ = new RingBuffer(MAX_DMA_DESCRIPTORS, desc_in_flight); } HexagonUserDMA::~HexagonUserDMA() { diff --git a/src/runtime/hexagon/hexagon_user_dma.h b/src/runtime/hexagon/hexagon_user_dma.h index d091067cfcfe..aa00df79c4d0 100644 --- a/src/runtime/hexagon/hexagon_user_dma.h +++ b/src/runtime/hexagon/hexagon_user_dma.h @@ -32,6 +32,8 @@ namespace hexagon { #define DMA_SUCCESS 0 #define DMA_FAILURE -1 +#define DMA_RETRY 1 +#define MAX_DMA_DESCRIPTORS 100 class HexagonUserDMA { public: diff --git a/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc index 9f2e56800bec..bf7a23712d7d 100644 --- a/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc +++ b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc @@ -49,7 +49,7 @@ class HexagonUserDMATest : public ::testing::Test { void* dst{nullptr}; char* src_char{nullptr}; char* dst_char{nullptr}; - uint32_t length{0x400000}; // 4MB + uint32_t length{0x4000}; // 16KB }; TEST_F(HexagonUserDMATest, wait) { @@ -153,8 +153,26 @@ TEST_F(HexagonUserDMATest, pipeline) { } // verify - ASSERT_EQ(ret, 0); + ASSERT_EQ(ret, DMA_SUCCESS); for (uint32_t i = 0; i < length; ++i) { - ASSERT_EQ(dst_char[i], 2); + ASSERT_EQ(2, dst_char[i]); + } +} + +TEST_F(HexagonUserDMATest, overflow_ring_buffer) { + uint32_t number_of_dmas = 0x400; // 1k + uint32_t length_of_each_dma = length / number_of_dmas; + + for (uint32_t i = 0; i < number_of_dmas; ++i) { + do { + ret = HexagonUserDMA::Get().Copy(dst_char + i * length_of_each_dma, + src_char + i * length_of_each_dma, length_of_each_dma); + } while (ret == DMA_RETRY); + ASSERT_EQ(ret, DMA_SUCCESS); + } + + // verify + for (uint32_t i = 0; i < length; ++i) { + ASSERT_EQ(src_char[i], dst_char[i]); } } \ No newline at end of file