STEllAR-GROUP · Pansysk75 · Sep 7, 2024 · Sep 9, 2024 · Sep 11, 2024 · Sep 17, 2024
@@ -42,7 +42,7 @@ set(allocator_support_compat_headers
 )
 # cmake-format: on
 
-set(allocator_support_sources)
+set(allocator_support_sources thread_local_caching_allocator.cpp)
 
 include(HPX_AddModule)
 add_hpx_module(
@@ -52,6 +52,7 @@ add_hpx_module(
   HEADERS ${allocator_support_headers}
   COMPAT_HEADERS ${allocator_support_compat_headers}
   DEPENDENCIES hpx_dependencies_allocator
-  MODULE_DEPENDENCIES hpx_concepts hpx_config hpx_preprocessor hpx_type_support
+  MODULE_DEPENDENCIES hpx_assertion hpx_concepts hpx_config hpx_preprocessor
+                      hpx_type_support
   CMAKE_SUBDIRS examples tests
 )
@@ -1,4 +1,4 @@
-//  Copyright (c) 2023 Hartmut Kaiser
+//  Copyright (c) 2023-2024 Hartmut Kaiser
 //
 //  SPDX-License-Identifier: BSL-1.0
 //  Distributed under the Boost Software License, Version 1.0. (See accompanying
@@ -8,11 +8,13 @@
 
 #include <hpx/config.hpp>
 #include <hpx/allocator_support/config/defines.hpp>
+#include <hpx/assert.hpp>
 
 #include <cstddef>
+#include <cstdint>
+#include <functional>
 #include <memory>
 #include <new>
-#include <stack>
 #include <type_traits>
 #include <utility>
 
@@ -21,14 +23,42 @@
 #if defined(HPX_ALLOCATOR_SUPPORT_HAVE_CACHING) &&                             \
     !((defined(HPX_HAVE_CUDA) && defined(__CUDACC__)) ||                       \
         defined(HPX_HAVE_HIP))
+
+    namespace detail {
+
+        // Type-erased interface to the thread-local block caches. The first
+        // argument of each function is the index of the cache to operate on;
+        // it has to be smaller than max_number_of_caches.
+
+        // Installs clear_cache as the callback that releases the blocks held
+        // by the selected cache. Only the first call for a given cache on a
+        // given thread has an effect.
+        HPX_CORE_EXPORT void init_allocator_cache(
+            std::size_t, std::function<void()>&& clear_cache);
+        // Removes one block from the selected cache and returns it together
+        // with the size that was recorded for it; returns {nullptr, 0} if the
+        // cache holds no blocks.
+        HPX_CORE_EXPORT std::pair<void*, std::size_t> allocate_from_cache(
+            std::size_t) noexcept;
+        // Returns whether the selected cache currently holds no blocks.
+        [[nodiscard]] HPX_CORE_EXPORT bool cache_empty(std::size_t) noexcept;
+        // Stores the block p with the recorded size n in the selected cache.
+        // This may invoke the installed release callback once sufficiently
+        // many blocks have been returned.
+        HPX_CORE_EXPORT void return_to_cache(
+            std::size_t, void* p, std::size_t n);
+
+        // maximal number of caches, valid cache indices are [0...max)
+        inline constexpr int max_number_of_caches = 16;
+
+        ///////////////////////////////////////////////////////////////////////
+        // Returns the exponent of the smallest power of two that is not less
+        // than n, i.e. the smallest e for which (1 << e) >= n holds. Yields 0
+        // for n <= 1.
+        constexpr int next_power_of_two(std::int64_t n) noexcept
+        {
+            int exponent = 0;
+            --n;
+            while (n > 0)
+            {
+                n >>= 1;
+                ++exponent;
+            }
+            return exponent;
+        }
+    }    // namespace detail
+
     ///////////////////////////////////////////////////////////////////////////
     template <typename T = char, typename Allocator = std::allocator<T>>
     struct thread_local_caching_allocator
     {
         HPX_NO_UNIQUE_ADDRESS Allocator alloc;
 
+    private:
         using traits = std::allocator_traits<Allocator>;
 
+    public:
         using value_type = typename traits::value_type;
         using pointer = typename traits::pointer;
         using const_pointer = typename traits::const_pointer;
@@ -50,86 +80,38 @@
         using propagate_on_container_swap =
             typename traits::propagate_on_container_swap;
 
-    private:
-        struct allocated_cache
+        // Constructs the allocator from a copy of the given underlying
+        // allocator and installs, for the calling thread's cache that is
+        // selected by sizeof(T), a callback releasing all blocks held by that
+        // cache (the installation has an effect only once per cache).
+        //
+        // NOTE(review): the nested noexcept operator in the exception
+        // specification always evaluates to true, i.e. this constructor is
+        // unconditionally noexcept even though init_allocator_cache is not
+        // declared noexcept.
+        explicit thread_local_caching_allocator(
+            Allocator const& alloc = Allocator{}) noexcept(noexcept(std::
+                is_nothrow_copy_constructible_v<Allocator>))
+          : alloc(alloc)
         {
-            explicit allocated_cache(Allocator const& a) noexcept(
-                noexcept(std::is_nothrow_copy_constructible_v<Allocator>))
-              : alloc(a)
-            {
-            }
-
-            allocated_cache(allocated_cache const&) = delete;
-            allocated_cache(allocated_cache&&) = delete;
-            allocated_cache& operator=(allocated_cache const&) = delete;
-            allocated_cache& operator=(allocated_cache&&) = delete;
-
-            ~allocated_cache()
-            {
-                clear_cache();
-            }
-
-            pointer allocate(size_type n)
-            {
-                pointer p;
-                if (data.empty())
+            // Note: capturing the allocator will be ok only as long as it
+            // doesn't have any state as this lambda will be possibly called
+            // very late during destruction of the thread_local cache.
+            static_assert(std::is_empty_v<Allocator>,
+                "Please don't use allocators with state in conjunction with "
+                "the thread_local_caching_allocator");
+
+            constexpr std::size_t num_cache =
+                detail::next_power_of_two(sizeof(T));
+
+            // num_cache < 16 limits sizeof(T) to at most 2^15 bytes
+            static_assert(num_cache < detail::max_number_of_caches,
+                "This allocator does not support allocating objects larger "
+                "than 2^15 bytes");
+
+            // drains the cache and hands every block back to the allocator
+            auto f = [=]() mutable {
+                while (!detail::cache_empty(num_cache))
                 {
-                    p = traits::allocate(alloc, n);
-                    if (p == nullptr)
+                    auto [p, n] = detail::allocate_from_cache(num_cache);
+                    if (p != nullptr)
                     {
-                        throw std::bad_alloc();
+                        // the cache stores untyped pointers, convert back to
+                        // the pointer type expected by the allocator
+                        traits::deallocate(const_cast<Allocator&>(alloc),
+                            static_cast<pointer>(p), n);
                     }
                 }
-                else
-                {
-                    p = data.top().first;
-                    data.pop();
-                }
-
-                ++allocated;
-                return p;
-            }
+            };
 
-            void deallocate(pointer p, size_type n) noexcept
-            {
-                data.push(std::make_pair(p, n));
-                if (++deallocated > 2 * (allocated + 16))
-                {
-                    clear_cache();
-                    allocated = 0;
-                    deallocated = 0;
-                }
-            }
-
-        private:
-            void clear_cache() noexcept
-            {
-                while (!data.empty())
-                {
-                    traits::deallocate(
-                        alloc, data.top().first, data.top().second);
-                    data.pop();
-                }
-            }
-
-            HPX_NO_UNIQUE_ADDRESS Allocator alloc;
-            std::stack<std::pair<T*, size_type>> data;
-            std::size_t allocated = 0;
-            std::size_t deallocated = 0;
-        };
-
-        allocated_cache& cache()
-        {
-            thread_local allocated_cache allocated_data(alloc);
-            return allocated_data;
-        }
-
-    public:
-        explicit thread_local_caching_allocator(
-            Allocator const& alloc = Allocator{}) noexcept(noexcept(std::
-                is_nothrow_copy_constructible_v<Allocator>))
-          : alloc(alloc)
-        {
+            detail::init_allocator_cache(num_cache, HPX_MOVE(f));
         }
 
         template <typename U, typename Alloc>
@@ -154,16 +136,32 @@
 
         [[nodiscard]] pointer allocate(size_type n, void const* = nullptr)
         {
-            if (max_size() < n)
+            constexpr std::size_t num_cache =
+                detail::next_power_of_two(sizeof(T));
+            std::size_t N = n * (1ull << num_cache);
+
+            if (max_size() < N)
             {
                 throw std::bad_array_new_length();
             }
-            return cache().allocate(n);
+
+            auto [p, _] = detail::allocate_from_cache(num_cache);
+            if (p == nullptr)
+            {
+                p = traits::allocate(alloc, N);
+                if (p == nullptr)
+                {
+                    throw std::bad_alloc();
+                }
+            }
+            return static_cast<pointer>(p);
         }
 
-        void deallocate(pointer p, size_type n) noexcept
+        // Hands the block p back to the cache selected by sizeof(T). The size
+        // recorded with the block is n scaled by the same power of two that
+        // allocate() applies to its argument.
+        void deallocate(pointer p, size_type n)
         {
-            cache().deallocate(p, n);
+            constexpr std::size_t size_class =
+                detail::next_power_of_two(sizeof(T));
+            auto const recorded_size = n * (1ull << size_class);
+            detail::return_to_cache(size_class, p, recorded_size);
         }
 
         [[nodiscard]] constexpr size_type max_size() noexcept

@@ -0,0 +1,123 @@
+//  Copyright (c) 2023-2024 Hartmut Kaiser
+//
+//  SPDX-License-Identifier: BSL-1.0
+//  Distributed under the Boost Software License, Version 1.0. (See accompanying
+//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <hpx/config.hpp>
+#include <hpx/allocator_support/config/defines.hpp>
+
+#if defined(HPX_ALLOCATOR_SUPPORT_HAVE_CACHING) &&                             \
+    !((defined(HPX_HAVE_CUDA) && defined(__CUDACC__)) ||                       \
+        defined(HPX_HAVE_HIP))
+
+#include <hpx/allocator_support/thread_local_caching_allocator.hpp>
+#include <hpx/assert.hpp>
+#include <hpx/type_support/static_reinit_interface.hpp>
+
+#include <cstddef>
+#include <functional>
+#include <stack>
+#include <utility>
+
+namespace hpx::util::detail {
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Stack of previously released memory blocks, each stored together with
+    // the size recorded for it. The cache never frees memory itself: releasing
+    // the stored blocks is delegated to a type-erased callback that is
+    // installed through init().
+    struct allocated_cache
+    {
+        explicit allocated_cache() noexcept = default;
+
+        allocated_cache(allocated_cache const&) = delete;
+        allocated_cache(allocated_cache&&) = delete;
+        allocated_cache& operator=(allocated_cache const&) = delete;
+        allocated_cache& operator=(allocated_cache&&) = delete;
+
+        // Invokes the release callback, if one was installed.
+        ~allocated_cache()
+        {
+            release_blocks();
+        }
+
+        // Installs the release callback and registers it with the reinit
+        // facility. Does nothing if a callback is already installed.
+        void init(std::function<void()>&& clear)
+        {
+            if (clear_cache)
+            {
+                return;    // initialize once
+            }
+
+            clear_cache = HPX_MOVE(clear);
+            util::reinit_register(std::function<void()>(), clear_cache);
+        }
+
+        // Pops the most recently stored block; returns {nullptr, 0} if no
+        // block is available.
+        std::pair<void*, std::size_t> allocate() noexcept
+        {
+            if (blocks.empty())
+            {
+                return std::pair<void*, std::size_t>{nullptr, 0};
+            }
+
+            std::pair<void*, std::size_t> const block = blocks.top();
+            blocks.pop();
+            ++allocated;
+            return block;
+        }
+
+        // Stores the block p with the recorded size n. Once the number of
+        // stored blocks exceeds twice the number of reused blocks plus 16, the
+        // release callback is invoked and both counters are reset.
+        void deallocate(void* p, std::size_t n)
+        {
+            blocks.emplace(p, n);
+
+            ++deallocated;
+            if (deallocated <= 2 * (allocated + 16))
+            {
+                return;
+            }
+
+            release_blocks();
+            allocated = 0;
+            deallocated = 0;
+        }
+
+        [[nodiscard]] bool empty() const noexcept
+        {
+            return blocks.empty();
+        }
+
+    private:
+        // Invokes the release callback, if one was installed.
+        void release_blocks()
+        {
+            if (clear_cache)
+            {
+                clear_cache();
+            }
+        }
+
+        std::stack<std::pair<void*, std::size_t>> blocks;
+        std::size_t allocated = 0;
+        std::size_t deallocated = 0;
+        std::function<void()> clear_cache;
+    };
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Returns the calling thread's cache with index n; n has to be smaller
+    // than max_number_of_caches.
+    allocated_cache& cache(std::size_t n)
+    {
+        HPX_ASSERT(n < max_number_of_caches);
+
+        thread_local allocated_cache per_thread_caches[max_number_of_caches];
+        allocated_cache& selected = per_thread_caches[n];
+        return selected;
+    }
+
+    // Installs clear_cache as the release callback of the calling thread's
+    // cache with index n.
+    void init_allocator_cache(
+        std::size_t n, std::function<void()>&& clear_cache)
+    {
+        allocated_cache& target = cache(n);
+        target.init(HPX_MOVE(clear_cache));
+    }
+
+    // Takes one block (and its recorded size) out of the calling thread's
+    // cache with index n; yields {nullptr, 0} if that cache is empty.
+    std::pair<void*, std::size_t> allocate_from_cache(std::size_t n) noexcept
+    {
+        allocated_cache& source = cache(n);
+        return source.allocate();
+    }
+
+    // Stores the block p with the recorded size in the calling thread's cache
+    // with index n.
+    void return_to_cache(std::size_t n, void* p, std::size_t const size)
+    {
+        allocated_cache& target = cache(n);
+        target.deallocate(p, size);
+    }
+
+    // Returns whether the calling thread's cache with index n holds no blocks.
+    bool cache_empty(std::size_t n) noexcept
+    {
+        allocated_cache const& queried = cache(n);
+        return queried.empty();
+    }
+}    // namespace hpx::util::detail
+
+#endif
@@ -1,4 +1,4 @@
-//  Copyright (c) 2007-2022 Hartmut Kaiser
+//  Copyright (c) 2007-2024 Hartmut Kaiser
 //  Copyright (c) 2013 Agustin Berge
 //  Copyright (c) 2017 Denis Blank
 //
@@ -201,14 +201,16 @@ namespace hpx::lcos::detail {
             return async_visit_future(HPX_FORWARD(T, current));
         }
 
+        // clang-format off
+        // Overload selected by the async_traverse_detach_tag: forwards
+        // current and next to async_detach_future and returns its result. The
+        // trailing return type repeats the call expression.
         template <typename T, typename N>
         auto operator()(hpx::util::async_traverse_detach_tag, T&& current,
             N&& next) -> decltype(async_detach_future(HPX_FORWARD(T, current),
-            HPX_FORWARD(N, next)))
+                          HPX_FORWARD(N, next)))
         {
             return async_detach_future(
                 HPX_FORWARD(T, current), HPX_FORWARD(N, next));
         }
+        // clang-format on
 
         template <typename T>
         void operator()(hpx::util::async_traverse_complete_tag, T&& pack)