Introduce OnceVec<T> primitive and use it for AllocId caches #136105
Closed
@@ -0,0 +1,229 @@
use std::alloc::Layout;
use std::marker::PhantomData;
use std::mem::MaybeUninit;
use std::ptr::NonNull;
use std::sync::Mutex;
use std::sync::atomic::{AtomicPtr, AtomicU8, Ordering};

/// Provides a singly-settable Vec.
///
/// This provides amortized, concurrent O(1) access to &T, expecting a densely numbered key space
/// (all value slots are allocated up to the highest key inserted).
pub struct OnceVec<T> {
    // Provide storage for up to 2^35 elements, which we expect to be enough in practice -- but can
    // be increased if needed. We may want to make the `slabs` list dynamic itself, likely by
    // indirecting through one more pointer to reduce space consumption of OnceVec if this grows
    // much larger.
    //
    // None of the code makes assumptions based on this size so bumping it up is easy.
    slabs: [Slab<T>; 36],
}

impl<T> Default for OnceVec<T> {
    fn default() -> Self {
        OnceVec { slabs: [const { Slab::new() }; 36] }
    }
}

unsafe impl<#[may_dangle] T> Drop for OnceVec<T> {
    fn drop(&mut self) {
        for (idx, slab) in self.slabs.iter_mut().enumerate() {
            unsafe { slab.deallocate(1 << idx) }
        }
    }
}

impl<T> OnceVec<T> {
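    /// Maps a key to `(slab index, slab capacity, index within the slab)`. Slab `i` has capacity
    /// 2^i and covers keys `2^i - 1 ..= 2^(i+1) - 2`, so e.g. key 0 lands in slab 0, keys 1..=2
    /// in slab 1, and keys 3..=6 in slab 2.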
    #[inline]
    fn to_slab_args(idx: usize) -> (usize, usize, usize) {
        let slab_idx = (idx + 1).ilog2() as usize;
        let cap = 1 << slab_idx;
        let idx_in_slab = idx - (cap - 1);
        (slab_idx, cap, idx_in_slab)
    }

    pub fn insert(&self, idx: usize, value: T) -> Result<(), T> {
        let (slab_idx, cap, idx_in_slab) = Self::to_slab_args(idx);
        self.slabs[slab_idx].insert(cap, idx_in_slab, value)
    }

    pub fn get(&self, idx: usize) -> Option<&T> {
        let (slab_idx, cap, idx_in_slab) = Self::to_slab_args(idx);
        self.slabs[slab_idx].get(cap, idx_in_slab)
    }
}

struct Slab<T> {
    // If non-zero, points to a contiguously allocated block which starts with a bitset
    // (two bits per value, one for whether a value is present and the other for whether a value is
    // currently being written) and then `[V]` (some of which may be missing).
    //
    // The capacity is implicit and passed with all accessors.
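    //
    // For example, with cap = 8 the block is 2 bitset bytes (4 slots per byte: "present" bits in
    // the low nibble, "being written" bits in the high nibble), then -- after any alignment
    // padding -- storage for 8 possibly-uninitialized `T` values.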
    v: AtomicPtr<u8>,
    _phantom: PhantomData<[T; 1]>,
}

impl<T> Slab<T> {
    const fn new() -> Slab<T> {
        Slab { v: AtomicPtr::new(std::ptr::null_mut()), _phantom: PhantomData }
    }

    fn initialize(&self, cap: usize) -> NonNull<u8> {
        static LOCK: Mutex<()> = Mutex::new(());

        if let Some(ptr) = NonNull::new(self.v.load(Ordering::Acquire)) {
            return ptr;
        }

        // If we are initializing the bucket, then acquire a global lock.
        //
        // This path is quite cold, so it's cheap to use a global lock. This ensures that we never
        // have multiple allocations for the same bucket.
        let _allocator_guard = LOCK.lock().unwrap_or_else(|e| e.into_inner());

        // Check the pointer again, since we might have been initialized while waiting on the lock.
        if let Some(ptr) = NonNull::new(self.v.load(Ordering::Acquire)) {
            return ptr;
        }

        let layout = Self::layout(cap).0;
        assert!(layout.size() > 0);

        // SAFETY: Checked above that layout is non-zero sized.
        let Some(allocation) = NonNull::new(unsafe { std::alloc::alloc_zeroed(layout) }) else {
            std::alloc::handle_alloc_error(layout);
        };

        self.v.store(allocation.as_ptr(), Ordering::Release);

        allocation
    }

    fn bitset(ptr: NonNull<u8>, cap: usize) -> NonNull<[AtomicU8]> {
        NonNull::slice_from_raw_parts(ptr.cast(), cap.div_ceil(4))
    }

    // SAFETY: Must be called on an `initialize`d `ptr` for this capacity.
    unsafe fn slice(ptr: NonNull<u8>, cap: usize) -> NonNull<[MaybeUninit<T>]> {
        let offset = Self::layout(cap).1;
        // SAFETY: Passed up to caller.
        NonNull::slice_from_raw_parts(unsafe { ptr.add(offset).cast() }, cap)
    }

    // idx is already compacted to within this slab
    fn get(&self, cap: usize, idx: usize) -> Option<&T> {
        // avoid initializing for get queries
        let Some(ptr) = NonNull::new(self.v.load(Ordering::Acquire)) else {
            return None;
        };

        let bitset = unsafe { Self::bitset(ptr, cap).as_ref() };

        // Check if the entry is initialized.
        //
        // Bottom 4 bits are the "is initialized" bits, top 4 bits are used for the "is
        // initializing" lock.
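        //
        // E.g. for idx = 5 within this slab, the relevant byte is bitset[1], its "is initialized"
        // bit is 1 << 1, and its "is initializing" bit is 1 << 5.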
        let word = bitset[idx / 4].load(Ordering::Acquire);
        if word & (1 << (idx % 4)) == 0 {
            return None;
        }

        // Avoid as_ref() since we don't want to assert shared refs to all slots (some are being
        // concurrently updated).
        //
        // SAFETY: `ptr` is only written by `initialize`, so this is safe.
        let slice = unsafe { Self::slice(ptr, cap) };
        assert!(idx < slice.len());
        // SAFETY: assertion above checks that we're in-bounds.
        let slot = unsafe { slice.cast::<T>().add(idx) };

        // SAFETY: We checked `bitset` and this value was initialized. Our Acquire load
        // establishes the memory ordering with the release store which set the bit, so we're safe
        // to read it.
        Some(unsafe { slot.as_ref() })
    }

    // idx is already compacted to within this slab
    fn insert(&self, cap: usize, idx: usize, value: T) -> Result<(), T> {
        // Make sure the backing storage for this slab is allocated before writing.
        let ptr = self.initialize(cap);
        let bitset = unsafe { Self::bitset(ptr, cap).as_ref() };

        // Check if the entry is initialized, and lock it for writing.
        let word = bitset[idx / 4].fetch_or(1 << (4 + idx % 4), Ordering::AcqRel);
        if word & (1 << (idx % 4)) != 0 {
            // Already fully initialized prior to us setting the "is writing" bit.
            return Err(value);
        }
        if word & (1 << (4 + idx % 4)) != 0 {
            // Someone else already acquired the lock for writing.
            return Err(value);
        }

        let slice = unsafe { Self::slice(ptr, cap) };
        assert!(idx < slice.len());
        // SAFETY: assertion above checks that we're in-bounds.
        let slot = unsafe { slice.cast::<T>().add(idx) };

        // SAFETY: We locked this slot for writing with the fetch_or above, and were the first to
        // do so (checked in 2nd `if` above).
        unsafe {
            slot.write(value);
        }

        // Set the is-present bit, indicating that we have finished writing this value.
        // Acquire ensures we don't break synchronizes-with relationships in other bits (unclear if
        // strictly necessary but definitely doesn't hurt).
        bitset[idx / 4].fetch_or(1 << (idx % 4), Ordering::AcqRel);

        Ok(())
    }

    /// Returns the layout for a Slab with capacity for `cap` elements, and the offset into the
    /// allocation at which the T slice starts.
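    ///
    /// E.g. for `T = u64` and `cap = 4`: a 1-byte bitset padded to the 8-byte alignment of `u64`
    /// gives an offset of 8 and a 40-byte total layout.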
    fn layout(cap: usize) -> (Layout, usize) {
        Layout::array::<AtomicU8>(cap.div_ceil(4))
            .unwrap()
            .extend(Layout::array::<T>(cap).unwrap())
            .unwrap()
    }

    // Drop, except passing the capacity
    unsafe fn deallocate(&mut self, cap: usize) {
        // avoid initializing just to Drop
        let Some(ptr) = NonNull::new(self.v.load(Ordering::Acquire)) else {
            return;
        };

        if std::mem::needs_drop::<T>() {
            // SAFETY: `ptr` is only written by `initialize`, and zero-init'd so AtomicU8 is
            // present in the bitset range.
            let bitset = unsafe { Self::bitset(ptr, cap).as_ref() };
            // SAFETY: `ptr` is only written by `initialize`, so satisfies slice precondition.
            let slice = unsafe { Self::slice(ptr, cap).cast::<T>() };

            for (word_idx, word) in bitset.iter().enumerate() {
                let word = word.load(Ordering::Acquire);
                for word_offset in 0..4 {
                    if word & (1 << word_offset) != 0 {
                        // Was initialized, need to drop the value.
                        let idx = word_idx * 4 + word_offset;
                        unsafe {
                            std::ptr::drop_in_place(slice.add(idx).as_ptr());
                        }
                    }
                }
            }
        }

        let layout = Self::layout(cap).0;

        // SAFETY: Allocated with `alloc` and the same layout.
        unsafe {
            std::alloc::dealloc(ptr.as_ptr(), layout);
        }
    }
}

#[cfg(test)]
mod test;
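For orientation, here is a minimal usage sketch of the API above (an illustration only: the `example` function is made up, and the actual AllocId cache integration in the compiler is not shown here):

fn example() {
    let cache: OnceVec<String> = OnceVec::default();

    // The first insert for a key wins; a later insert for the same key hands the value back.
    cache.insert(0, String::from("first")).unwrap();
    assert!(cache.insert(0, String::from("second")).is_err());

    // Reads never allocate, and keys that were never inserted simply return None.
    assert_eq!(cache.get(0).map(|s| s.as_str()), Some("first"));
    assert!(cache.get(7).is_none());
}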
@@ -0,0 +1,83 @@
use super::*;

#[test]
#[cfg(not(miri))]
fn empty() {
    let cache: OnceVec<u32> = OnceVec::default();
    for key in 0..u32::MAX {
        assert!(cache.get(key as usize).is_none());
    }
}

#[test]
fn insert_and_check() {
    let cache: OnceVec<usize> = OnceVec::default();
    for idx in 0..100 {
        cache.insert(idx, idx).unwrap();
    }
    for idx in 0..100 {
        assert_eq!(cache.get(idx), Some(&idx));
    }
}

#[test]
fn sparse_inserts() {
    let cache: OnceVec<u32> = OnceVec::default();
    let end = if cfg!(target_pointer_width = "64") && cfg!(target_os = "linux") {
        // On 64-bit systems with paged virtual memory we should be able to sparsely allocate all
        // of the pages needed for these inserts cheaply (without needing to actually have
        // gigabytes of resident memory).
        31
    } else {
        // Otherwise, still run the test but scaled back:
        //
        // Each slot is <5 bytes, so 2^25 entries (on non-virtual memory systems, e.g. Windows)
        // will mean 160 megabytes of allocated memory. Going beyond that is probably not
        // reasonable for tests.
        25
    };
    for shift in 0..end {
        let key = 1u32 << shift;
        cache.insert(key as usize, shift).unwrap();
        assert_eq!(cache.get(key as usize), Some(&shift));
    }
}

#[test]
fn concurrent_stress_check() {
    let cache: OnceVec<usize> = OnceVec::default();
    std::thread::scope(|s| {
        for idx in 0..100 {
            let cache = &cache;
            s.spawn(move || {
                cache.insert(idx, idx).unwrap();
            });
        }
    });

    for idx in 0..100 {
        assert_eq!(cache.get(idx), Some(&idx));
    }
}

#[test]
#[cfg(not(miri))]
fn slot_index_exhaustive() {
    let mut prev = None::<(usize, usize, usize)>;
    for idx in 0..=u32::MAX as usize {
        let slot_idx = OnceVec::<()>::to_slab_args(idx);
        if let Some(p) = prev {
            if p.0 == slot_idx.0 {
                assert_eq!(p.2 + 1, slot_idx.2);
            } else {
                assert_eq!(slot_idx.2, 0);
            }
        } else {
            assert_eq!(idx, 0);
            assert_eq!(slot_idx.2, 0);
            assert_eq!(slot_idx.0, 0);
        }

        prev = Some(slot_idx);
    }
}
There can't be more than like 2^29 elements without proc macros as you only get 2^32 bytes for all source code combined and you need on average multiple characters for each identifier. In other words support for 2^35 elements is plenty.
I think that's probably not true for e.g. AllocIds, right? At least with miri, you can presumably run a program that runs indefinitely and so allocates filling up memory here. You'd probably run out of RAM, etc., so I'm not actually particularly worried though :)
Right
In Miri (and CTFE), you can keep allocating and freeing memory and that way use up `AllocId` without using up more and more RAM.
... but if you do that, those allocations don't end up in `alloc_map`; only the final value of a const/static is put there.