
FixedBufPool::next #199

Merged on Feb 16, 2023 (13 commits)
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -17,7 +17,7 @@ keywords = ["async", "fs", "io-uring"]
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
-tokio = { version = "1.2", features = ["net", "rt"] }
+tokio = { version = "1.2", features = ["net", "rt", "sync"] }
slab = "0.4.2"
libc = "0.2.80"
io-uring = { version = "0.5.12", features = ["unstable"] }
2 changes: 1 addition & 1 deletion src/buf/fixed/handle.rs
@@ -8,7 +8,7 @@ use std::ops::{Deref, DerefMut};
use std::rc::Rc;

// Data to construct a `FixedBuf` handle from.
-pub(super) struct CheckedOutBuf {
+pub(crate) struct CheckedOutBuf {
// Pointer and size of the buffer.
pub iovec: iovec,
// Length of the initialized part.
4 changes: 3 additions & 1 deletion src/buf/fixed/mod.rs
@@ -21,7 +21,9 @@ pub use handle::FixedBuf;
mod buffers;
pub(crate) use buffers::FixedBuffers;

-mod pool;
+mod plumbing;
+
+pub mod pool;
pub use pool::FixedBufPool;

mod registry;
8 changes: 8 additions & 0 deletions src/buf/fixed/plumbing/mod.rs
@@ -0,0 +1,8 @@
// Internal data structures shared between thread-local and thread-safe
// fixed buffer collections.

mod pool;
pub(super) use pool::Pool;

mod registry;
pub(super) use registry::Registry;
193 changes: 193 additions & 0 deletions src/buf/fixed/plumbing/pool.rs
@@ -0,0 +1,193 @@
use crate::buf::fixed::{handle::CheckedOutBuf, FixedBuffers};
use crate::buf::IoBufMut;

use libc::{iovec, UIO_MAXIOV};
use tokio::sync::Notify;

use std::cmp;
use std::collections::HashMap;
use std::mem;
use std::ptr;
use std::slice;
use std::sync::Arc;

// Internal state shared by FixedBufPool and FixedBuf handles.
pub(crate) struct Pool<T: IoBufMut> {
// Pointer to an allocated array of iovec records referencing
// the allocated buffers. The number of initialized records is the
// same as the length of the states array.
raw_bufs: ptr::NonNull<iovec>,
// Original capacity of raw_bufs as a Vec.
orig_cap: usize,
// State information on the buffers. Indices in this array correspond to
// the indices in the array at raw_bufs.
states: Vec<BufState>,
// Table of head indices of the free buffer lists in each size bucket.
free_buf_head_by_cap: HashMap<usize, u16>,
// Original buffers, kept until drop
buffers: Vec<T>,
// Used to notify tasks pending on `next`
notify_next_by_cap: HashMap<usize, Arc<Notify>>,
}

// State information of a buffer in the pool.
enum BufState {
// The buffer is not in use.
Free {
// This field records the length of the initialized part.
init_len: usize,
// Index of the next buffer of the same capacity in a free buffer list, if any.
next: Option<u16>,
},
// The buffer is checked out.
// Its data are logically owned by the FixedBuf handle,
// which also keeps track of the length of the initialized part.
CheckedOut,
}

impl<T: IoBufMut> Pool<T> {
pub(crate) fn new(bufs: impl Iterator<Item = T>) -> Self {
// Limit the number of buffers to the maximum allowable number.
let bufs = bufs.take(cmp::min(UIO_MAXIOV as usize, u16::MAX as usize));
// Collect into `buffers`, which holds the backing buffers for
// the lifetime of the pool. Using collect may allow
// the compiler to apply in-place collect specialization
// and avoid an extra allocation.
let mut buffers = bufs.collect::<Vec<T>>();
let mut iovecs = Vec::with_capacity(buffers.len());
let mut states = Vec::with_capacity(buffers.len());
let mut free_buf_head_by_cap = HashMap::new();
for (index, buf) in buffers.iter_mut().enumerate() {
let cap = buf.bytes_total();

// Link the buffer as the head of the free list for its capacity.
// This builds each free list so it is initially consumed
// back to front, which should make no difference to the user.
let next = free_buf_head_by_cap.insert(cap, index as u16);

iovecs.push(iovec {
iov_base: buf.stable_mut_ptr() as *mut _,
iov_len: cap,
});
states.push(BufState::Free {
init_len: buf.bytes_init(),
next,
});
}
debug_assert_eq!(iovecs.len(), states.len());
debug_assert_eq!(iovecs.len(), buffers.len());

// Safety: Vec::as_mut_ptr never returns null
let raw_bufs = unsafe { ptr::NonNull::new_unchecked(iovecs.as_mut_ptr()) };
let orig_cap = iovecs.capacity();
mem::forget(iovecs);
Pool {
raw_bufs,
orig_cap,
states,
free_buf_head_by_cap,
buffers,
notify_next_by_cap: HashMap::new(),
}
}

// If the free buffer list for this capacity is not empty, checks out the first buffer
// from the list and returns its data. Otherwise, returns None.
pub(crate) fn try_next(&mut self, cap: usize) -> Option<CheckedOutBuf> {
let free_head = self.free_buf_head_by_cap.get_mut(&cap)?;
let index = *free_head as usize;
let state = &mut self.states[index];

let (init_len, next) = match *state {
BufState::Free { init_len, next } => {
*state = BufState::CheckedOut;
(init_len, next)
}
BufState::CheckedOut => panic!("buffer is checked out"),
};

// Update the head of the free list for this capacity.
match next {
Some(i) => {
*free_head = i;
}
None => {
self.free_buf_head_by_cap.remove(&cap);
}
}

// Safety: the allocated array under the pointer is valid
// for the lifetime of self, a free buffer index is inside the array,
// as also asserted by the indexing operation on the states array
// that has the same length.
let iovec = unsafe { self.raw_bufs.as_ptr().add(index).read() };
debug_assert_eq!(iovec.iov_len, cap);
Some(CheckedOutBuf {
iovec,
init_len,
index: index as u16,
})
}

// Returns a `Notify` to use for waking up tasks awaiting a buffer of
// the specified capacity.
pub(crate) fn notify_on_next(&mut self, cap: usize) -> Arc<Notify> {
@mzabaluev (Contributor, Author) commented on Dec 15, 2022:

The Arc is not necessary for the single-threaded pool, but I'm going to reuse this for the thread-safe implementation. Notify contains a mutex and uses atomic pointers for tracking waiters, so it's going to be slow regardless when we hit the waiting branch in next. Performance of next in the depleted-pool case is not high on my current priority list for micro-optimizations.

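To illustrate the waiting branch under discussion, here is a minimal self-contained sketch of the try-then-wait pattern around tokio::sync::Notify. ToyPool and its methods are hypothetical stand-ins, not this PR's FixedBufPool::next:

use std::cell::RefCell;
use std::collections::VecDeque;
use std::sync::Arc;
use tokio::sync::Notify;

struct ToyPool {
    free: RefCell<VecDeque<Vec<u8>>>,
    notify: Arc<Notify>,
}

impl ToyPool {
    // Non-blocking path, analogous to Pool::try_next.
    fn try_next(&self) -> Option<Vec<u8>> {
        self.free.borrow_mut().pop_front()
    }

    // Waiting path: retry after every wakeup, since another task may
    // have claimed the buffer between the notification and the retry.
    async fn next(&self) -> Vec<u8> {
        loop {
            if let Some(buf) = self.try_next() {
                return buf;
            }
            self.notify.notified().await;
        }
    }

    // Check-in path, analogous to check_in_internal: return the buffer
    // and wake one waiter. Notify stores a permit when no task is
    // waiting, so a check-in that lands between try_next and
    // notified().await is not lost.
    fn check_in(&self, buf: Vec<u8>) {
        self.free.borrow_mut().push_back(buf);
        self.notify.notify_one();
    }
}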
Collaborator:

But we are left without a simple, fast pool that doesn't allocate for the current-thread case, which is the case I care about. But all is not lost, once the API allows us to provide our own pool.

Contributor:

I'd just like to clarify my understanding here: once a Notify has been registered for a capacity, then every time a buffer of that capacity is committed, notify_one() is called (because the notify_next_by_cap entry is never removed)?

Contributor:

I think this could be improved by keeping a (usize, Notify) instead, to explicitly keep a count of the number of waiting tasks, and removing the entry from the hash map once the count reaches zero. That way, you only pay the SeqCst cost of calling notify_one() when you actually need to.

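A minimal sketch of that counted entry, using hypothetical names (WaiterTable, add_waiter, maybe_notify, remove_waiter); not code from this PR:

use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::Notify;

struct WaiterTable {
    // Per-capacity count of waiting tasks, plus the Notify they share.
    waiters_by_cap: HashMap<usize, (usize, Arc<Notify>)>,
}

impl WaiterTable {
    // Register a waiter for this capacity, creating the entry on demand.
    fn add_waiter(&mut self, cap: usize) -> Arc<Notify> {
        let entry = self
            .waiters_by_cap
            .entry(cap)
            .or_insert_with(|| (0, Arc::new(Notify::new())));
        entry.0 += 1;
        Arc::clone(&entry.1)
    }

    // On check-in: pay for notify_one() only when someone is waiting.
    fn maybe_notify(&self, cap: usize) {
        if let Some((count, notify)) = self.waiters_by_cap.get(&cap) {
            if *count > 0 {
                notify.notify_one();
            }
        }
    }

    // On wakeup or cancellation: drop the entry once nobody waits,
    // so idle capacities return to the cheap path.
    fn remove_waiter(&mut self, cap: usize) {
        let now_empty = match self.waiters_by_cap.get_mut(&cap) {
            Some(entry) => {
                entry.0 -= 1;
                entry.0 == 0
            }
            None => false,
        };
        if now_empty {
            self.waiters_by_cap.remove(&cap);
        }
    }
}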
Collaborator:

Not saying this is needed; I will let @mzabaluev set the path on this one. But I would prefer something that had a little hysteresis to it, like removing the Notify once the pool is back to something like 10 buffers. (I have no idea why I would pick 10; just something larger than 0.) But I haven't tried to design anything around this idea, so feel free to disregard entirely.

Contributor:

My concern is that with the current design, once try_next() has failed in the next() method, every check-in from then on for that buffer size is going to issue a SeqCst memory barrier with the call to notify_one(). That's quite a penalty. It's unavoidable with the current Notify, which is designed for Send operations, but we could at least limit it to when there is actually a consumer waiting.

@FrankReh (Collaborator) commented on Jan 27, 2023:

@ollie-etl With your idea, I don't see how the usize field gets decremented properly. In the case of a future being cancelled, there is no future left to do the work of updating the count (from my understanding).

And along my idea of not keeping a count but removing the entry once some number of buffers are back in the pool, I don't see an elegant way of knowing whether the waiters linked list is empty or not. There is no public API for that.

@ollie-etl (Contributor) commented on Jan 30, 2023:

You'd need to return a struct Next(usize, Notify) wrapper, implement Future, and decrement on drop.

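A minimal sketch of that wrapper for a single-threaded pool, keeping the waiter count in a Cell (the Next type shown here is hypothetical; not code from this PR):

use std::cell::Cell;
use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll};
use tokio::sync::futures::Notified;
use tokio::sync::Notify;

// Holds the waiter count while alive; Drop runs on completion and
// on cancellation alike, so the count cannot leak.
struct Next<'a> {
    count: &'a Cell<usize>,
    notified: Pin<Box<Notified<'a>>>,
}

impl<'a> Next<'a> {
    fn new(count: &'a Cell<usize>, notify: &'a Notify) -> Self {
        count.set(count.get() + 1);
        Next {
            count,
            // Notified is !Unpin, hence the Box::pin.
            notified: Box::pin(notify.notified()),
        }
    }
}

impl Future for Next<'_> {
    type Output = ();

    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<()> {
        self.notified.as_mut().poll(cx)
    }
}

impl Drop for Next<'_> {
    fn drop(&mut self) {
        self.count.set(self.count.get() - 1);
    }
}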
Collaborator:

Oh, that's much more like @mzabaluev's original idea, I think: returning a future that does work when it is polled and does work when it is dropped, but using Notify.

let notify = self.notify_next_by_cap.entry(cap).or_default();
Arc::clone(notify)
}

fn check_in_internal(&mut self, index: u16, init_len: usize) {
let cap = self.iovecs()[index as usize].iov_len;
let state = &mut self.states[index as usize];
debug_assert!(
matches!(state, BufState::CheckedOut),
"the buffer must be checked out"
);

// Link the buffer as the new head of the free list for its capacity.
// Recently checked in buffers will be first to be reused,
// improving cache locality.
let next = self.free_buf_head_by_cap.insert(cap, index);

*state = BufState::Free { init_len, next };

if let Some(notify) = self.notify_next_by_cap.get(&cap) {
// Wake up a single task pending on `next`
notify.notify_one();
}
}
}

impl<T: IoBufMut> FixedBuffers for Pool<T> {
fn iovecs(&self) -> &[iovec] {
// Safety: the raw_bufs pointer is valid for the lifetime of self,
// the length of the states array is also the length of buffers array
// by construction.
unsafe { slice::from_raw_parts(self.raw_bufs.as_ptr(), self.states.len()) }
}

unsafe fn check_in(&mut self, index: u16, init_len: usize) {
self.check_in_internal(index, init_len)
}
}

impl<T: IoBufMut> Drop for Pool<T> {
fn drop(&mut self) {
for (i, state) in self.states.iter().enumerate() {
match state {
BufState::Free { init_len, .. } => {
// Update buffer initialization.
// The buffer is about to be dropped, but this may release it
// from the pool's ownership, rather than deallocate.
unsafe { self.buffers[i].set_init(*init_len) };
}
BufState::CheckedOut => unreachable!("all buffers must be checked in"),
}
}

// Rebuild Vec<iovec>, so it's dropped
let _ = unsafe {
Vec::from_raw_parts(self.raw_bufs.as_ptr(), self.states.len(), self.orig_cap)
};
}
}
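As a worked illustration of the per-capacity free lists built in Pool::new, here is a hypothetical unit test. It assumes crate-internal access to plumbing::Pool and the FixedBuffers trait, and that Vec::with_capacity allocates exactly the requested capacity; it is not part of this diff:

use crate::buf::fixed::plumbing::Pool;
use crate::buf::fixed::FixedBuffers;

#[test]
fn free_lists_are_lifo_per_capacity() {
    // Two 4096-byte buffers (indices 0, 1) and one 1024-byte buffer (index 2).
    let bufs = (0..3).map(|i| Vec::<u8>::with_capacity(if i < 2 { 4096 } else { 1024 }));
    let mut pool = Pool::new(bufs);

    // Buffers are linked head-first, so index 1 heads the 4096 list.
    let a = pool.try_next(4096).unwrap();
    assert_eq!(a.index, 1);
    let b = pool.try_next(4096).unwrap();
    assert_eq!(b.index, 0);
    assert!(pool.try_next(4096).is_none());

    // The 1024-byte bucket is an independent list.
    let c = pool.try_next(1024).unwrap();
    assert_eq!(c.index, 2);

    // Check everything back in so Pool::drop sees no outstanding buffers.
    unsafe {
        pool.check_in(a.index, 0);
        pool.check_in(b.index, 0);
        pool.check_in(c.index, 0);
    }
}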
140 changes: 140 additions & 0 deletions src/buf/fixed/plumbing/registry.rs
@@ -0,0 +1,140 @@
use crate::buf::fixed::{handle::CheckedOutBuf, FixedBuffers};
use crate::buf::IoBufMut;

use libc::{iovec, UIO_MAXIOV};
use std::cmp;
use std::mem;
use std::ptr;
use std::slice;

// Internal state shared by FixedBufRegistry and FixedBuf handles.
pub(crate) struct Registry<T: IoBufMut> {
// Pointer to an allocated array of iovec records referencing
// the allocated buffers. The number of initialized records is the
// same as the length of the states array.
raw_bufs: ptr::NonNull<iovec>,
// Original capacity of raw_bufs as a Vec.
orig_cap: usize,
// State information on the buffers. Indices in this array correspond to
// the indices in the array at raw_bufs.
states: Vec<BufState>,
// The owned buffers are kept until Drop
buffers: Vec<T>,
}

// State information of a buffer in the registry.
enum BufState {
// The buffer is not in use.
// The field records the length of the initialized part.
Free { init_len: usize },
// The buffer is checked out.
// Its data are logically owned by the FixedBuf handle,
// which also keeps track of the length of the initialized part.
CheckedOut,
}

impl<T: IoBufMut> Registry<T> {
pub(crate) fn new(bufs: impl Iterator<Item = T>) -> Self {
// Limit the number of buffers to the maximum allowable number.
let bufs = bufs.take(cmp::min(UIO_MAXIOV as usize, u16::MAX as usize));
// Collect into `buffers`, which holds the backing buffers for
// the lifetime of the registry. Using collect may allow
// the compiler to apply in-place collect specialization
// and avoid an extra allocation.
let mut buffers = bufs.collect::<Vec<T>>();
let mut iovecs = Vec::with_capacity(buffers.len());
let mut states = Vec::with_capacity(buffers.len());
for buf in buffers.iter_mut() {
iovecs.push(iovec {
iov_base: buf.stable_mut_ptr() as *mut _,
iov_len: buf.bytes_total(),
});
states.push(BufState::Free {
init_len: buf.bytes_init(),
});
}
debug_assert_eq!(iovecs.len(), states.len());
debug_assert_eq!(iovecs.len(), buffers.len());

// Safety: Vec::as_mut_ptr never returns null
let raw_bufs = unsafe { ptr::NonNull::new_unchecked(iovecs.as_mut_ptr()) };
let orig_cap = iovecs.capacity();
mem::forget(iovecs);
Registry {
raw_bufs,
orig_cap,
states,
buffers,
}
}

// If the indexed buffer is free, changes its state to checked out
// and returns its data.
// If the buffer is already checked out, returns None.
pub(crate) fn check_out(&mut self, index: usize) -> Option<CheckedOutBuf> {
let state = self.states.get_mut(index)?;
let BufState::Free { init_len } = *state else {
return None
};

*state = BufState::CheckedOut;

// Safety: the allocated array under the pointer is valid
// for the lifetime of self, the index is inside the array
// as checked by Vec::get_mut above, called on the array of
// states that has the same length.
let iovec = unsafe { self.raw_bufs.as_ptr().add(index).read() };
debug_assert!(index <= u16::MAX as usize);
Some(CheckedOutBuf {
iovec,
init_len,
index: index as u16,
})
}

fn check_in_internal(&mut self, index: u16, init_len: usize) {
let state = self
.states
.get_mut(index as usize)
.expect("invalid buffer index");
debug_assert!(
matches!(state, BufState::CheckedOut),
"the buffer must be checked out"
);
*state = BufState::Free { init_len };
}
}

impl<T: IoBufMut> FixedBuffers for Registry<T> {
fn iovecs(&self) -> &[iovec] {
// Safety: the raw_bufs pointer is valid for the lifetime of self,
// the length of the states array is also the length of buffers array
// by construction.
unsafe { slice::from_raw_parts(self.raw_bufs.as_ptr(), self.states.len()) }
}

unsafe fn check_in(&mut self, index: u16, init_len: usize) {
self.check_in_internal(index, init_len)
}
}

impl<T: IoBufMut> Drop for Registry<T> {
fn drop(&mut self) {
for (i, state) in self.states.iter().enumerate() {
match state {
BufState::Free { init_len, .. } => {
// Update buffer initialization.
// The buffer is about to be dropped, but this may release it
// from Registry ownership, rather than deallocate.
unsafe { self.buffers[i].set_init(*init_len) };
}
BufState::CheckedOut => unreachable!("all buffers must be checked in"),
}
}

// Rebuild Vec<iovec>, so it's dropped
let _ = unsafe {
Vec::from_raw_parts(self.raw_bufs.as_ptr(), self.states.len(), self.orig_cap)
};
}
}
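A matching hypothetical test for Registry::check_out exclusivity (same crate-internal assumptions; not part of this diff):

use crate::buf::fixed::plumbing::Registry;
use crate::buf::fixed::FixedBuffers;

#[test]
fn check_out_is_exclusive_per_index() {
    let mut registry = Registry::new((0..2).map(|_| Vec::<u8>::with_capacity(64)));

    let buf = registry.check_out(0).expect("buffer 0 starts free");
    // The same index cannot be checked out twice.
    assert!(registry.check_out(0).is_none());
    // Other indices are unaffected; out-of-range indices return None.
    assert!(registry.check_out(1).is_some());
    assert!(registry.check_out(2).is_none());

    // Return the buffers so Registry::drop sees them all checked in.
    unsafe {
        registry.check_in(buf.index, 0);
        registry.check_in(1, 0);
    }
}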