[libc] Cache the most recently used slot for a chunk size #149751

Open
jhuber6 wants to merge 1 commit into main from allocator_cache

Conversation

jhuber6
Contributor

@jhuber6 jhuber6 commented Jul 21, 2025

Summary:
This patch changes the find_slab logic to simply cache the most recently
successful slot for each chunk size. The happy path is now a single atomic
load of this cached index. I removed the SIMT shuffling logic that did slab
lookups wave-parallel; I consider the actual traversal comparatively
unlikely, so it is not overly bad that it takes longer. Ideally one thread
finds a slot and shares it with the rest so we only pay that cost once.

@llvmbot
Member

llvmbot commented Jul 21, 2025

@llvm/pr-subscribers-libc

Author: Joseph Huber (jhuber6)

Changes

Summary:
This patch changes the find_slab logic to simply cache the most recently
successful slot for each chunk size. The happy path is now a single atomic
load of this cached index. I removed the SIMT shuffling logic that did slab
lookups wave-parallel; I consider the actual traversal comparatively
unlikely, so it is not overly bad that it takes longer. Ideally one thread
finds a slot and shares it with the rest so we only pay that cost once.


Full diff: https://github.com/llvm/llvm-project/pull/149751.diff

1 file affected:

  • (modified) libc/src/__support/GPU/allocator.cpp (+58-55)
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 7923fbb2c1c24..791d2381671d9 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -34,13 +34,12 @@ constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
 constexpr static uint32_t MIN_SIZE = 16;
 constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;
 
+// The number of times to attempt claiming an in-progress slab allocation.
+constexpr static uint32_t MAX_TRIES = 128;
+
 // A sentinel used to indicate an invalid but non-null pointer value.
 constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();
 
-// The number of times we will try starting on a single index before skipping
-// past it.
-constexpr static uint32_t MAX_TRIES = 512;
-
 static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
 
 namespace impl {
@@ -92,20 +91,10 @@ static inline uint32_t xorshift32(uint32_t &state) {
   return state * 0x9e3779bb;
 }
 
-// Final stage of murmurhash used to get a unique index for the global array
-static inline uint32_t hash(uint32_t x) {
-  x ^= x >> 16;
-  x *= 0x85ebca6b;
-  x ^= x >> 13;
-  x *= 0xc2b2ae35;
-  x ^= x >> 16;
-  return x;
-}
-
 // Rounds the input value to the closest permitted chunk size. Here we accept
 // the sum of the closest three powers of two. For a 2MiB slab size this is 48
 // different chunk sizes. This gives us average internal fragmentation of 87.5%.
-static inline uint32_t get_chunk_size(uint32_t x) {
+static inline constexpr uint32_t get_chunk_size(uint32_t x) {
   uint32_t y = x < MIN_SIZE ? MIN_SIZE : x;
   uint32_t pow2 = BITS_IN_WORD - cpp::countl_zero(y - 1);
 
@@ -123,6 +112,16 @@ static inline uint32_t get_chunk_size(uint32_t x) {
   return (s3 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
 }
 
+// Converts a chunk size into an index suitable for a statically sized array.
+static inline constexpr uint32_t get_chunk_id(uint32_t x) {
+  uint32_t y = x >> 4;
+  if (x <= MIN_SIZE)
+    return 0;
+  if (x < MIN_SIZE << 2)
+    return __builtin_popcountg(y);
+  return __builtin_popcountg(y) + 3 * (BITS_IN_WORD - __builtin_clzg(y)) - 7;
+}
+
 // Rounds to the nearest power of two.
 template <uint32_t N, typename T>
 static inline constexpr T round_up(const T x) {
@@ -451,67 +450,71 @@ struct GuardPtr {
 // The global array used to search for a valid slab to allocate from.
 static GuardPtr slots[ARRAY_SIZE] = {};
 
+// Keep a cache of the last successful slot for each chunk size. Initialize it
+// to an even spread of the total size. Must be updated if the chunking scheme
+// changes.
+#define START(X) ((ARRAY_SIZE * impl::get_chunk_id(X)) / ARRAY_SIZE)
+static cpp::Atomic<uint32_t> indicies[] = {
+    START(16),     START(32),     START(48),     START(64),     START(96),
+    START(112),    START(128),    START(192),    START(224),    START(256),
+    START(384),    START(448),    START(512),    START(768),    START(896),
+    START(1024),   START(1536),   START(1792),   START(2048),   START(3072),
+    START(3584),   START(4096),   START(6144),   START(7168),   START(8192),
+    START(12288),  START(14336),  START(16384),  START(24576),  START(28672),
+    START(32768),  START(49152),  START(57344),  START(65536),  START(98304),
+    START(114688), START(131072), START(196608), START(229376), START(262144),
+    START(393216), START(458752), START(524288), START(786432), START(917504),
+    START(1048576)};
+
 // Tries to find a slab in the table that can support the given chunk size.
 static Slab *find_slab(uint32_t chunk_size) {
   // We start at a hashed value to spread out different chunk sizes.
-  uint32_t start = impl::hash(chunk_size);
+  uint32_t start =
+      indicies[impl::get_chunk_id(chunk_size)].load(cpp::MemoryOrder::RELAXED);
   uint64_t lane_mask = gpu::get_lane_mask();
   uint64_t uniform = gpu::match_any(lane_mask, chunk_size);
 
-  Slab *result = nullptr;
-  uint32_t nudge = 0;
-  for (uint64_t mask = lane_mask; mask;
-       mask = gpu::ballot(lane_mask, !result), ++nudge) {
-    uint32_t index = cpp::numeric_limits<uint32_t>::max();
-    for (uint32_t offset = nudge / MAX_TRIES;
-         gpu::ballot(lane_mask, index == cpp::numeric_limits<uint32_t>::max());
-         offset += cpp::popcount(uniform & lane_mask)) {
-      uint32_t candidate =
-          (start + offset + impl::lane_count(uniform & lane_mask)) % ARRAY_SIZE;
-      uint64_t available =
-          gpu::ballot(lane_mask, slots[candidate].use_count() <
-                                     Slab::available_chunks(chunk_size));
-      uint32_t new_index = gpu::shuffle(
-          lane_mask, cpp::countr_zero(available & uniform), candidate);
-
-      // Each uniform group will use the first empty slot they find.
-      if ((index == cpp::numeric_limits<uint32_t>::max() &&
-           (available & uniform)))
-        index = new_index;
-
-      // Guaruntees that this loop will eventuall exit if there is no space.
-      if (offset >= ARRAY_SIZE) {
-        result = reinterpret_cast<Slab *>(SENTINEL);
-        index = 0;
-      }
-    }
+  for (uint32_t offset = 0; offset < ARRAY_SIZE; ++offset) {
+    uint32_t index =
+        offset == 0 ? start : (START(chunk_size) + offset) % ARRAY_SIZE;
 
-    // Try to claim a slot for the found slot.
-    if (!result) {
+    if (slots[index].use_count() < Slab::available_chunks(chunk_size)) {
+      uint64_t lane_mask = gpu::get_lane_mask();
       uint64_t reserved = 0;
-      Slab *slab = slots[index].try_lock(lane_mask & mask, uniform & mask,
+
+      Slab *slab = slots[index].try_lock(lane_mask, uniform & lane_mask,
                                          reserved, chunk_size, index);
+
+      // If there is a slab allocation in progress we retry a few times.
+      for (uint32_t retries = 0;
+           retries < MAX_TRIES && !slab && reserved != SENTINEL; retries++) {
+        slab = slots[index].try_lock(lane_mask, uniform & lane_mask, reserved,
+                                     chunk_size, index);
+        sleep_briefly();
+      }
+
       // If we find a slab with a matching chunk size then we store the result.
       // Otherwise, we need to free the claimed lock and continue. In the case
-      // of out-of-memory we return a sentinel value.
+      // of out-of-memory we return a sentinel value as slightly obtuse control
+      // flow to keep this loop more convergent.
       if (slab && reserved <= Slab::available_chunks(chunk_size) &&
           slab->get_chunk_size() == chunk_size) {
-        result = slab;
+        if (index != start)
+          indicies[impl::get_chunk_id(chunk_size)].store(
+              index, cpp::MemoryOrder::RELAXED);
+        return slab;
       } else if (slab && (reserved > Slab::available_chunks(chunk_size) ||
                           slab->get_chunk_size() != chunk_size)) {
-        if (slab->get_chunk_size() != chunk_size)
-          start = index + 1;
         slots[index].unlock(gpu::get_lane_mask(),
                             gpu::get_lane_mask() & uniform);
-      } else if (!slab && reserved == cpp::numeric_limits<uint64_t>::max()) {
-        result = reinterpret_cast<Slab *>(SENTINEL);
-      } else {
-        sleep_briefly();
+      } else if (!slab && reserved == SENTINEL) {
+        return nullptr;
       }
     }
   }
-  return result;
+  return nullptr;
 }
+#undef START
 
 // Release the lock associated with a given slab.
 static void release_slab(Slab *slab) {
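
As a quick worked example of the new get_chunk_id mapping (my own standalone check, not code from the patch; __builtin_popcount and __builtin_clz stand in for the generic builtins used in the diff), the chunk sizes listed in the indicies initializer map to dense indices 0, 1, 2, ..., so every permitted chunk size gets its own entry in the cache table:

#include <cstdint>
#include <cstdio>

// Standalone copies of the relevant constants and the new helper so the
// mapping can be checked outside the libc build.
constexpr uint32_t MIN_SIZE = 16;
constexpr uint32_t BITS_IN_WORD = 32;

constexpr uint32_t get_chunk_id(uint32_t x) {
  uint32_t y = x >> 4;
  if (x <= MIN_SIZE)
    return 0;
  if (x < MIN_SIZE << 2)
    return __builtin_popcount(y);
  return __builtin_popcount(y) + 3 * (BITS_IN_WORD - __builtin_clz(y)) - 7;
}

int main() {
  // The first ten chunk sizes from the indicies initializer, in order.
  const uint32_t sizes[] = {16, 32, 48, 64, 96, 112, 128, 192, 224, 256};
  for (uint32_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); ++i)
    std::printf("chunk size %4u -> id %u\n", sizes[i], get_chunk_id(sizes[i]));
  return 0;
}

With Clang or GCC this prints ids 0 through 9 for those sizes; by my count the full table continues the sequence up to 45, matching the 46-entry indicies array one-to-one.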

@jhuber6 jhuber6 force-pushed the allocator_cache branch 3 times, most recently from 44888bd to 41bb7c8 on July 21, 2025 03:47
@jhuber6 jhuber6 force-pushed the allocator_cache branch 2 times, most recently from d307922 to 70ea6e4 on July 21, 2025 15:40
Summary:
This patch changes the `find_slab` logic to simply cache the most recently
successful slot for each chunk size. The happy path is now a single atomic
load of this cached index. I removed the SIMT shuffling logic that did slab
lookups wave-parallel; I consider the actual traversal comparatively
unlikely, so it is not overly bad that it takes longer. Ideally one thread
finds a slot and shares it with the rest so we only pay that cost once.