[libc] Cache the most recently used slot for a chunk size #149751
@llvm/pr-subscribers-libc

Author: Joseph Huber (jhuber6)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/149751.diff

1 Files Affected:
- libc/src/__support/GPU/allocator.cpp
diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 7923fbb2c1c24..791d2381671d9 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -34,13 +34,12 @@ constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
constexpr static uint32_t MIN_SIZE = 16;
constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;
+// The number of times to attempt claiming an in-progress slab allocation.
+constexpr static uint32_t MAX_TRIES = 128;
+
// A sentinel used to indicate an invalid but non-null pointer value.
constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();
-// The number of times we will try starting on a single index before skipping
-// past it.
-constexpr static uint32_t MAX_TRIES = 512;
-
static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
namespace impl {
@@ -92,20 +91,10 @@ static inline uint32_t xorshift32(uint32_t &state) {
return state * 0x9e3779bb;
}
-// Final stage of murmurhash used to get a unique index for the global array
-static inline uint32_t hash(uint32_t x) {
- x ^= x >> 16;
- x *= 0x85ebca6b;
- x ^= x >> 13;
- x *= 0xc2b2ae35;
- x ^= x >> 16;
- return x;
-}
-
// Rounds the input value to the closest permitted chunk size. Here we accept
// the sum of the closest three powers of two. For a 2MiB slab size this is 48
// different chunk sizes. This gives us average internal fragmentation of 87.5%.
-static inline uint32_t get_chunk_size(uint32_t x) {
+static inline constexpr uint32_t get_chunk_size(uint32_t x) {
uint32_t y = x < MIN_SIZE ? MIN_SIZE : x;
uint32_t pow2 = BITS_IN_WORD - cpp::countl_zero(y - 1);
@@ -123,6 +112,16 @@ static inline uint32_t get_chunk_size(uint32_t x) {
return (s3 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
}
+// Converts a chunk size into an index suitable for a statically sized array.
+static inline constexpr uint32_t get_chunk_id(uint32_t x) {
+ uint32_t y = x >> 4;
+ if (x <= MIN_SIZE)
+ return 0;
+ if (x < MIN_SIZE << 2)
+ return __builtin_popcountg(y);
+ return __builtin_popcountg(y) + 3 * (BITS_IN_WORD - __builtin_clzg(y)) - 7;
+}
+
// Rounds to the nearest power of two.
template <uint32_t N, typename T>
static inline constexpr T round_up(const T x) {
@@ -451,67 +450,71 @@ struct GuardPtr {
// The global array used to search for a valid slab to allocate from.
static GuardPtr slots[ARRAY_SIZE] = {};
+// Keep a cache of the last successful slot for each chunk size. Initialize it
+// to an even spread of the total size. Must be updated if the chunking scheme
+// changes.
+#define START(X) ((ARRAY_SIZE * impl::get_chunk_id(X)) / ARRAY_SIZE)
+static cpp::Atomic<uint32_t> indicies[] = {
+ START(16), START(32), START(48), START(64), START(96),
+ START(112), START(128), START(192), START(224), START(256),
+ START(384), START(448), START(512), START(768), START(896),
+ START(1024), START(1536), START(1792), START(2048), START(3072),
+ START(3584), START(4096), START(6144), START(7168), START(8192),
+ START(12288), START(14336), START(16384), START(24576), START(28672),
+ START(32768), START(49152), START(57344), START(65536), START(98304),
+ START(114688), START(131072), START(196608), START(229376), START(262144),
+ START(393216), START(458752), START(524288), START(786432), START(917504),
+ START(1048576)};
+
// Tries to find a slab in the table that can support the given chunk size.
static Slab *find_slab(uint32_t chunk_size) {
// We start at a hashed value to spread out different chunk sizes.
- uint32_t start = impl::hash(chunk_size);
+ uint32_t start =
+ indicies[impl::get_chunk_id(chunk_size)].load(cpp::MemoryOrder::RELAXED);
uint64_t lane_mask = gpu::get_lane_mask();
uint64_t uniform = gpu::match_any(lane_mask, chunk_size);
- Slab *result = nullptr;
- uint32_t nudge = 0;
- for (uint64_t mask = lane_mask; mask;
- mask = gpu::ballot(lane_mask, !result), ++nudge) {
- uint32_t index = cpp::numeric_limits<uint32_t>::max();
- for (uint32_t offset = nudge / MAX_TRIES;
- gpu::ballot(lane_mask, index == cpp::numeric_limits<uint32_t>::max());
- offset += cpp::popcount(uniform & lane_mask)) {
- uint32_t candidate =
- (start + offset + impl::lane_count(uniform & lane_mask)) % ARRAY_SIZE;
- uint64_t available =
- gpu::ballot(lane_mask, slots[candidate].use_count() <
- Slab::available_chunks(chunk_size));
- uint32_t new_index = gpu::shuffle(
- lane_mask, cpp::countr_zero(available & uniform), candidate);
-
- // Each uniform group will use the first empty slot they find.
- if ((index == cpp::numeric_limits<uint32_t>::max() &&
- (available & uniform)))
- index = new_index;
-
- // Guaruntees that this loop will eventuall exit if there is no space.
- if (offset >= ARRAY_SIZE) {
- result = reinterpret_cast<Slab *>(SENTINEL);
- index = 0;
- }
- }
+ for (uint32_t offset = 0; offset < ARRAY_SIZE; ++offset) {
+ uint32_t index =
+ offset == 0 ? start : (START(chunk_size) + offset) % ARRAY_SIZE;
- // Try to claim a slot for the found slot.
- if (!result) {
+ if (slots[index].use_count() < Slab::available_chunks(chunk_size)) {
+ uint64_t lane_mask = gpu::get_lane_mask();
uint64_t reserved = 0;
- Slab *slab = slots[index].try_lock(lane_mask & mask, uniform & mask,
+
+ Slab *slab = slots[index].try_lock(lane_mask, uniform & lane_mask,
reserved, chunk_size, index);
+
+ // If there is a slab allocation in progress we retry a few times.
+ for (uint32_t retries = 0;
+ retries < MAX_TRIES && !slab && reserved != SENTINEL; retries++) {
+ slab = slots[index].try_lock(lane_mask, uniform & lane_mask, reserved,
+ chunk_size, index);
+ sleep_briefly();
+ }
+
// If we find a slab with a matching chunk size then we store the result.
// Otherwise, we need to free the claimed lock and continue. In the case
- // of out-of-memory we return a sentinel value.
+ // of out-of-memory we return a sentinel value as slightly obtuse control
+ // flow to keep this loop more convergent.
if (slab && reserved <= Slab::available_chunks(chunk_size) &&
slab->get_chunk_size() == chunk_size) {
- result = slab;
+ if (index != start)
+ indicies[impl::get_chunk_id(chunk_size)].store(
+ index, cpp::MemoryOrder::RELAXED);
+ return slab;
} else if (slab && (reserved > Slab::available_chunks(chunk_size) ||
slab->get_chunk_size() != chunk_size)) {
- if (slab->get_chunk_size() != chunk_size)
- start = index + 1;
slots[index].unlock(gpu::get_lane_mask(),
gpu::get_lane_mask() & uniform);
- } else if (!slab && reserved == cpp::numeric_limits<uint64_t>::max()) {
- result = reinterpret_cast<Slab *>(SENTINEL);
- } else {
- sleep_briefly();
+ } else if (!slab && reserved == SENTINEL) {
+ return nullptr;
}
}
}
- return result;
+ return nullptr;
}
+#undef START
// Release the lock associated with a given slab.
static void release_slab(Slab *slab) {
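For reference, the new `get_chunk_id` helper in the patch maps each permitted chunk size (the sums of up to three adjacent powers of two produced by `get_chunk_size`) onto a dense index into the `indicies` cache. A minimal standalone sketch of that mapping, substituting the portable `__builtin_popcount`/`__builtin_clz` builtins for the generic `_g` variants used in the patch, shows the first few permitted sizes landing on consecutive ids:

```cpp
#include <cstdint>
#include <cstdio>

// Standalone copy of the chunk-id mapping from the patch, using the portable
// 32-bit builtins instead of __builtin_popcountg/__builtin_clzg.
static constexpr uint32_t MIN_SIZE = 16;
static constexpr uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;

static constexpr uint32_t get_chunk_id(uint32_t x) {
  uint32_t y = x >> 4;
  if (x <= MIN_SIZE)
    return 0;
  if (x < MIN_SIZE << 2)
    return __builtin_popcount(y);
  return __builtin_popcount(y) + 3 * (BITS_IN_WORD - __builtin_clz(y)) - 7;
}

int main() {
  // The first entries of the permitted chunk sizes listed in 'indicies'.
  uint32_t sizes[] = {16, 32, 48, 64, 96, 112, 128, 192, 224, 256};
  for (uint32_t s : sizes)
    std::printf("chunk %4u -> id %u\n", s, get_chunk_id(s)); // ids 0 through 9
  return 0;
}
```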
Summary:
This patch changes the `find_slab` logic to simply cache the most recently successful slot. This means the happy fast path is now a single atomic load on this index. I removed the SIMT shuffling logic that did slab lookups wave-parallel. Here I am considering the actual traversal to be comparatively unlikely, so it is not overly bad that it takes longer. Ideally one thread finds a slot and shares it with the rest so we only pay that cost once.
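To make the caching scheme in the summary concrete, here is a small CPU-side sketch (not the libc code): the `ToySlot` table, the `try_claim` helper, and the `capacity` argument are stand-ins invented for illustration, with `capacity` playing the role of `Slab::available_chunks`. The point is only that the happy path is a single relaxed atomic load of the per-chunk-size index, the fallback is a linear walk of the table, and the cache is refreshed with a relaxed store whenever a different slot wins.

```cpp
#include <atomic>
#include <cstdint>
#include <cstdio>

constexpr uint32_t ARRAY_SIZE = 128;   // assumed table size, not the real one
constexpr uint32_t NUM_CHUNK_IDS = 46; // one cache entry per permitted chunk size

// A toy stand-in for the slab table: each slot tracks only a chunk size and a
// use count. None of this is the real libc Slab machinery.
struct ToySlot {
  std::atomic<uint32_t> chunk_size{0}; // 0 means the slot is unused
  std::atomic<uint32_t> use_count{0};
};

static ToySlot slots[ARRAY_SIZE];

// Per-chunk-size cache of the last slot that worked (the 'indicies' array in
// the patch), zero-initialized here for simplicity rather than spread out.
static std::atomic<uint32_t> cached_index[NUM_CHUNK_IDS];

// Claim space in a slot: take an empty slot for this chunk size, or reuse a
// slot that already serves it, as long as it has capacity left.
static void *try_claim(uint32_t index, uint32_t chunk_size, uint32_t capacity) {
  ToySlot &s = slots[index];
  uint32_t expected = 0;
  if (!s.chunk_size.compare_exchange_strong(expected, chunk_size) &&
      expected != chunk_size)
    return nullptr; // slot belongs to a different chunk size
  if (s.use_count.fetch_add(1) >= capacity) {
    s.use_count.fetch_sub(1); // slot is full, back out
    return nullptr;
  }
  return &s;
}

static void *find_slab_sketch(uint32_t chunk_id, uint32_t chunk_size,
                              uint32_t capacity) {
  // Fast path: a single relaxed load of the cached slot for this chunk size.
  uint32_t start = cached_index[chunk_id].load(std::memory_order_relaxed);
  for (uint32_t offset = 0; offset < ARRAY_SIZE; ++offset) {
    // Probe the cached slot first, then walk the rest of the table.
    uint32_t index = offset == 0 ? start : (start + offset) % ARRAY_SIZE;
    if (void *slab = try_claim(index, chunk_size, capacity)) {
      if (index != start) // remember the winner for the next request
        cached_index[chunk_id].store(index, std::memory_order_relaxed);
      return slab;
    }
  }
  return nullptr; // every slot is full: out of memory
}

int main() {
  void *a = find_slab_sketch(/*chunk_id=*/1, /*chunk_size=*/32, /*capacity=*/4);
  void *b = find_slab_sketch(1, 32, 4);
  std::printf("second request reused the cached slot: %s\n",
              a == b ? "yes" : "no");
}
```

Relaxed ordering is enough for the cache itself because the stored index is only a search hint: a stale value just means a longer probe, never an incorrect result.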