diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 7923fbb2c1c24..a499c2d9b9e59 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -251,12 +251,18 @@ struct Slab {
     // The uniform mask represents which lanes contain a uniform target pointer.
     // We attempt to place these next to each other.
     void *result = nullptr;
+    uint32_t after = ~0u;
+    uint32_t old_index = 0;
     for (uint64_t mask = lane_mask; mask;
          mask = gpu::ballot(lane_mask, !result)) {
       if (result)
         continue;
 
-      uint32_t start = gpu::broadcast_value(lane_mask, impl::xorshift32(state));
+      // We try using any known empty bits from the previous attempt first.
+      uint32_t start = gpu::shuffle(mask, cpp::countr_zero(uniform & mask),
+                                    ~after ? (old_index & ~(BITS_IN_WORD - 1)) +
+                                                 cpp::countr_zero(~after)
+                                           : impl::xorshift32(state));
 
       uint32_t id = impl::lane_count(uniform & mask);
       uint32_t index = (start + id) % usable_bits(chunk_size);
@@ -266,8 +272,9 @@ struct Slab {
       // Get the mask of bits destined for the same slot and coalesce it.
       uint64_t match = uniform & gpu::match_any(mask, slot);
       uint32_t length = cpp::popcount(match);
-      uint32_t bitmask = static_cast<uint32_t>((uint64_t(1) << length) - 1)
-                         << bit;
+      uint32_t bitmask = gpu::shuffle(
+          mask, cpp::countr_zero(match),
+          static_cast<uint32_t>((uint64_t(1) << length) - 1) << bit);
 
       uint32_t before = 0;
       if (gpu::get_lane_id() == static_cast<uint32_t>(cpp::countr_zero(match)))
@@ -278,6 +285,9 @@ struct Slab {
         result = ptr_from_index(index, chunk_size);
       else
         sleep_briefly();
+
+      after = before | bitmask;
+      old_index = index;
     }
 
     cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
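
Note: the sketch below is a simplified, single-threaded illustration of the retry policy in this patch, not the GPU code itself. It shows how recording the word's occupancy after a failed claim ("after" = before | bitmask, plus "old_index") lets the next attempt jump to a known-clear bit via countr_zero(~after) instead of drawing a fresh random index. BITS_IN_WORD and xorshift32 are stand-ins for the identifiers in allocator.cpp; a single 32-bit word and a plain OR stand in for the slab bitfield and the atomic fetch_or.

// Single-threaded sketch of the "reuse known empty bits" retry policy.
#include <bit>
#include <cstdint>
#include <cstdio>

constexpr uint32_t BITS_IN_WORD = 32;

// Stand-in for impl::xorshift32: the classic xorshift32 PRNG step.
static uint32_t xorshift32(uint32_t &state) {
  state ^= state << 13;
  state ^= state >> 17;
  state ^= state << 5;
  return state;
}

int main() {
  uint32_t bitfield = 0xFFFFFF0F; // One word of occupancy; bits 4..7 are free.
  uint32_t state = 0xDEADBEEF;    // PRNG state for the random fallback.

  uint32_t after = ~0u; // All ones: no knowledge from a previous attempt.
  uint32_t old_index = 0;

  for (int attempt = 1; attempt <= 8; ++attempt) {
    // Use a known-clear bit from the previous attempt if we have one,
    // otherwise fall back to a random starting position.
    uint32_t start = ~after ? (old_index & ~(BITS_IN_WORD - 1)) +
                                  std::countr_zero(~after)
                            : xorshift32(state);
    uint32_t index = start % BITS_IN_WORD;
    uint32_t bitmask = 1u << index;

    uint32_t before = bitfield; // What an atomic fetch_or would return.
    bitfield |= bitmask;
    if (~before & bitmask) {
      std::printf("claimed bit %u on attempt %d\n", index, attempt);
      return 0;
    }

    // The bit was taken: remember the word's occupancy so the next iteration
    // targets a bit that was still clear instead of retrying blindly.
    after = before | bitmask;
    old_index = index;
  }
  return 1;
}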