
[libc] Cache the most recently used slot for a chunk size #149751

Open · wants to merge 1 commit into base: main

122 changes: 63 additions & 59 deletions libc/src/__support/GPU/allocator.cpp
@@ -34,13 +34,12 @@ constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
constexpr static uint32_t MIN_SIZE = 16;
constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;

// The number of times to attempt claiming an in-progress slab allocation.
constexpr static uint32_t MAX_TRIES = 128;

// A sentinel used to indicate an invalid but non-null pointer value.
constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();

// The number of times we will retry an in-progress slab allocation on a
// single index before skipping past it.
constexpr static uint32_t MAX_TRIES = 512;

static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");

namespace impl {
@@ -92,20 +91,10 @@ static inline uint32_t xorshift32(uint32_t &state) {
return state * 0x9e3779bb;
}

// Final stage of murmurhash used to get a unique index for the global array
static inline uint32_t hash(uint32_t x) {
x ^= x >> 16;
x *= 0x85ebca6b;
x ^= x >> 13;
x *= 0xc2b2ae35;
x ^= x >> 16;
return x;
}

// Rounds the input value to the closest permitted chunk size. Here we accept
// the sum of the closest three powers of two. For a 2MiB slab size this is 48
// different chunk sizes. This gives us average internal fragmentation of 87.5%.
static inline uint32_t get_chunk_size(uint32_t x) {
static inline constexpr uint32_t get_chunk_size(uint32_t x) {
uint32_t y = x < MIN_SIZE ? MIN_SIZE : x;
uint32_t pow2 = BITS_IN_WORD - cpp::countl_zero(y - 1);

@@ -123,6 +112,16 @@ static inline uint32_t get_chunk_size(uint32_t x) {
return (s3 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
}

// Converts a chunk size into an index suitable for a statically sized array.
static inline constexpr uint32_t get_chunk_id(uint32_t x) {
uint32_t y = x >> 4;
if (x <= MIN_SIZE)
return 0;
if (x < MIN_SIZE << 2)
return cpp::popcount(y);
return cpp::popcount(y) + 3 * (BITS_IN_WORD - cpp::countl_zero(y)) - 7;
}
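
The chunk sizes listed in the `indices` table further down (16, 32, 48, 64, 96, 112, 128, ...) map to consecutive ids under this function. A minimal standalone sketch of that mapping, with the internal cpp:: helpers swapped for <bit> and the 16-byte minimum hard-coded (both assumptions made purely so the snippet compiles on its own), can be used to sanity-check it:

#include <bit>
#include <cstdint>

// Standalone restatement of get_chunk_id() for illustration only; the real
// code uses cpp::popcount and cpp::countl_zero.
constexpr uint32_t chunk_id(uint32_t x) {
  constexpr uint32_t MIN_SIZE = 16;
  constexpr uint32_t BITS_IN_WORD = 32;
  uint32_t y = x >> 4;
  if (x <= MIN_SIZE)
    return 0;
  if (x < (MIN_SIZE << 2))
    return std::popcount(y);
  return std::popcount(y) + 3 * (BITS_IN_WORD - std::countl_zero(y)) - 7;
}

// The chunk sizes used by the cache table below get consecutive ids, so they
// can index a densely packed array.
static_assert(chunk_id(16) == 0 && chunk_id(32) == 1 && chunk_id(48) == 2);
static_assert(chunk_id(64) == 3 && chunk_id(96) == 4 && chunk_id(112) == 5);
static_assert(chunk_id(128) == 6 && chunk_id(192) == 7 && chunk_id(224) == 8);
// The largest chunk, half of a 2 MiB slab, lands on id 45, the last of the
// 46 cache entries below.
static_assert(chunk_id(1024 * 1024) == 45);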

// Rounds to the nearest power of two.
template <uint32_t N, typename T>
static inline constexpr T round_up(const T x) {
@@ -143,6 +142,12 @@ static inline constexpr bool is_pow2(uint64_t x) {
return x && (x & (x - 1)) == 0;
}

// Where this chunk size should start looking in the global array.
static inline constexpr uint32_t start_index(uint32_t chunk_index) {
return (ARRAY_SIZE * impl::get_chunk_id(chunk_index)) /
impl::get_chunk_id(SLAB_SIZE / 2);
}
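
To see the spread concretely, here is a standalone sketch that evaluates the same formula for every chunk id. It uses a made-up power-of-two ARRAY_SIZE of 1024 (the real constant is defined earlier in this file) and takes the id directly rather than deriving it from a chunk size:

#include <cstdint>
#include <cstdio>

// Stand-in value; the real ARRAY_SIZE only needs to be a power of two.
constexpr uint32_t ARRAY_SIZE = 1024;
// get_chunk_id(SLAB_SIZE / 2) evaluates to 45 for the 2 MiB slabs described
// above, so chunk ids run from 0 to 45.
constexpr uint32_t MAX_CHUNK_ID = 45;

// Same formula as impl::start_index(), but applied to a chunk id.
constexpr uint32_t start_for_id(uint32_t chunk_id) {
  return (ARRAY_SIZE * chunk_id) / MAX_CHUNK_ID;
}

int main() {
  // Neighbouring chunk ids begin their searches roughly ARRAY_SIZE / 45 slots
  // apart, so different sizes mostly probe different parts of the table.
  for (uint32_t id = 0; id <= MAX_CHUNK_ID; ++id)
    std::printf("chunk id %2u -> start slot %4u\n", id, start_for_id(id));
  return 0;
}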

} // namespace impl

/// A slab allocator used to hand out identically sized slabs of memory.
@@ -451,66 +456,65 @@ struct GuardPtr {
// The global array used to search for a valid slab to allocate from.
static GuardPtr slots[ARRAY_SIZE] = {};

// Keep a cache of the last successful slot for each chunk size. Initialize it
// to an even spread across the whole array. Must be updated if the chunking
// scheme changes.
#define S(X) (impl::start_index(X))
static cpp::Atomic<uint32_t> indices[] = {
S(16), S(32), S(48), S(64), S(96), S(112), S(128),
S(192), S(224), S(256), S(384), S(448), S(512), S(768),
S(896), S(1024), S(1536), S(1792), S(2048), S(3072), S(3584),
S(4096), S(6144), S(7168), S(8192), S(12288), S(14336), S(16384),
S(24576), S(28672), S(32768), S(49152), S(57344), S(65536), S(98304),
S(114688), S(131072), S(196608), S(229376), S(262144), S(393216), S(458752),
S(524288), S(786432), S(917504), S(1048576)};
#undef S
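
Since the table above has to be kept in sync with the chunking scheme by hand, a hypothetical compile-time guard (not part of this patch) placed after it could catch a mismatch; it assumes the 2 MiB SLAB_SIZE discussed earlier and uses only names already defined in this file:

// Hypothetical guard: one cached index per chunk id, with the largest chunk
// (half of a slab) occupying the final entry.
static_assert(sizeof(indices) / sizeof(indices[0]) ==
                  impl::get_chunk_id(SLAB_SIZE / 2) + 1,
              "cached index table must cover every chunk id");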

// Tries to find a slab in the table that can support the given chunk size.
static Slab *find_slab(uint32_t chunk_size) {
// We start at a hashed value to spread out different chunk sizes.
uint32_t start = impl::hash(chunk_size);
uint64_t lane_mask = gpu::get_lane_mask();
uint64_t uniform = gpu::match_any(lane_mask, chunk_size);

Slab *result = nullptr;
uint32_t nudge = 0;
for (uint64_t mask = lane_mask; mask;
mask = gpu::ballot(lane_mask, !result), ++nudge) {
uint32_t index = cpp::numeric_limits<uint32_t>::max();
for (uint32_t offset = nudge / MAX_TRIES;
gpu::ballot(lane_mask, index == cpp::numeric_limits<uint32_t>::max());
offset += cpp::popcount(uniform & lane_mask)) {
uint32_t candidate =
(start + offset + impl::lane_count(uniform & lane_mask)) % ARRAY_SIZE;
uint64_t available =
gpu::ballot(lane_mask, slots[candidate].use_count() <
Slab::available_chunks(chunk_size));
uint32_t new_index = gpu::shuffle(
lane_mask, cpp::countr_zero(available & uniform), candidate);

// Each uniform group will use the first empty slot they find.
if ((index == cpp::numeric_limits<uint32_t>::max() &&
(available & uniform)))
index = new_index;

// Guarantees that this loop will eventually exit if there is no space.
if (offset >= ARRAY_SIZE) {
result = reinterpret_cast<Slab *>(SENTINEL);
index = 0;
}
}
// We start at the index of the last successful allocation for this kind.
uint32_t chunk_id = impl::get_chunk_id(chunk_size);
uint32_t start = indices[chunk_id].load(cpp::MemoryOrder::RELAXED);
uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);

for (uint32_t offset = 0; offset < ARRAY_SIZE; ++offset) {
uint32_t index =
!offset ? start : (impl::start_index(chunk_size) + offset) % ARRAY_SIZE;

// Try to claim a slot at the found index.
if (!result) {
if (slots[index].use_count() < Slab::available_chunks(chunk_size)) {
uint64_t lane_mask = gpu::get_lane_mask();
uint64_t reserved = 0;
Slab *slab = slots[index].try_lock(lane_mask & mask, uniform & mask,

Slab *slab = slots[index].try_lock(lane_mask, uniform & lane_mask,
reserved, chunk_size, index);

// If there is a slab allocation in progress we retry a few times.
for (uint32_t retries = 0;
retries < MAX_TRIES && !slab && reserved != SENTINEL; retries++) {
uint64_t lane_mask = gpu::get_lane_mask();
slab = slots[index].try_lock(lane_mask, uniform & lane_mask, reserved,
chunk_size, index);
sleep_briefly();
}

// If we find a slab with a matching chunk size then we store the result.
// Otherwise, we need to free the claimed lock and continue. In the case
// of out-of-memory we return a sentinel value.
// of out-of-memory we receive a sentinel value and return a failure.
if (slab && reserved <= Slab::available_chunks(chunk_size) &&
slab->get_chunk_size() == chunk_size) {
result = slab;
if (index != start)
indices[chunk_id].store(index, cpp::MemoryOrder::RELAXED);
return slab;
} else if (slab && (reserved > Slab::available_chunks(chunk_size) ||
slab->get_chunk_size() != chunk_size)) {
if (slab->get_chunk_size() != chunk_size)
start = index + 1;
slots[index].unlock(gpu::get_lane_mask(),
gpu::get_lane_mask() & uniform);
} else if (!slab && reserved == cpp::numeric_limits<uint64_t>::max()) {
result = reinterpret_cast<Slab *>(SENTINEL);
} else {
sleep_briefly();
} else if (!slab && reserved == SENTINEL) {
return nullptr;
}
}
}
return result;
return nullptr;
}
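
Stripped of the wave-wide ballots and locking, the new search order amounts to the single-threaded sketch below: probe the cached slot for this chunk size first, otherwise walk forward from the size's static start position, and re-cache whichever slot eventually works. Every name here is an illustrative stand-in rather than the real GuardPtr/Slab API:

#include <atomic>
#include <cstdint>
#include <cstdio>

// Toy sizes so the sketch is self-contained.
constexpr uint32_t ARRAY_SIZE = 16;
constexpr uint32_t NUM_CHUNK_IDS = 46;

static std::atomic<uint32_t> cached[NUM_CHUNK_IDS]; // plays the role of `indices`
static uint32_t used[ARRAY_SIZE];                   // fake per-slot use counts

static uint32_t chunk_id(uint32_t) { return 0; }     // stand-in for get_chunk_id
static uint32_t static_start(uint32_t) { return 0; } // stand-in for start_index
static bool slot_fits(uint32_t slot, uint32_t /*chunk_size*/) {
  return used[slot] < 4; // stand-in for the use_count()/try_lock() checks
}

// Probe the cached slot first, then scan forward from the static start
// position, remembering whichever slot eventually succeeds.
static uint32_t find_slot(uint32_t chunk_size) {
  uint32_t id = chunk_id(chunk_size);
  uint32_t start = cached[id].load(std::memory_order_relaxed);
  for (uint32_t offset = 0; offset < ARRAY_SIZE; ++offset) {
    uint32_t index =
        !offset ? start : (static_start(chunk_size) + offset) % ARRAY_SIZE;
    if (slot_fits(index, chunk_size)) {
      if (index != start)
        cached[id].store(index, std::memory_order_relaxed);
      return index;
    }
  }
  return UINT32_MAX; // table exhausted
}

int main() {
  used[0] = 4; // pretend slot 0 is full so the search advances and re-caches
  std::printf("allocated from slot %u\n", find_slot(64));
  std::printf("allocated from slot %u\n", find_slot(64)); // now hits the cache
}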

// Release the lock associated with a given slab.