-
Notifications
You must be signed in to change notification settings - Fork 13k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
std: Add retain method for HashMap and HashSet #39560
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -416,22 +416,26 @@ fn search_hashed<K, V, M, F>(table: M, hash: SafeHash, mut is_match: F) -> Inter | |
} | ||
} | ||
|
||
fn pop_internal<K, V>(starting_bucket: FullBucketMut<K, V>) -> (K, V) { | ||
fn pop_internal<K, V>(starting_bucket: FullBucketMut<K, V>) | ||
-> (K, V, &mut RawTable<K, V>) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Any reason not to return There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Accept. |
||
{ | ||
let (empty, retkey, retval) = starting_bucket.take(); | ||
let mut gap = match empty.gap_peek() { | ||
Some(b) => b, | ||
None => return (retkey, retval), | ||
Ok(b) => b, | ||
Err(b) => return (retkey, retval, b.into_table()), | ||
}; | ||
|
||
while gap.full().displacement() != 0 { | ||
gap = match gap.shift() { | ||
Some(b) => b, | ||
None => break, | ||
Ok(b) => b, | ||
Err(b) => { | ||
return (retkey, retval, b.into_table()); | ||
}, | ||
}; | ||
} | ||
|
||
// Now we've done all our shifting. Return the value we grabbed earlier. | ||
(retkey, retval) | ||
(retkey, retval, gap.into_bucket().into_table()) | ||
} | ||
|
||
/// Perform robin hood bucket stealing at the given `bucket`. You must | ||
|
@@ -721,38 +725,7 @@ impl<K, V, S> HashMap<K, V, S> | |
return; | ||
} | ||
|
||
// Grow the table. | ||
// Specialization of the other branch. | ||
let mut bucket = Bucket::first(&mut old_table); | ||
|
||
// "So a few of the first shall be last: for many be called, | ||
// but few chosen." | ||
// | ||
// We'll most likely encounter a few buckets at the beginning that | ||
// have their initial buckets near the end of the table. They were | ||
// placed at the beginning as the probe wrapped around the table | ||
// during insertion. We must skip forward to a bucket that won't | ||
// get reinserted too early and won't unfairly steal others spot. | ||
// This eliminates the need for robin hood. | ||
loop { | ||
bucket = match bucket.peek() { | ||
Full(full) => { | ||
if full.displacement() == 0 { | ||
// This bucket occupies its ideal spot. | ||
// It indicates the start of another "cluster". | ||
bucket = full.into_bucket(); | ||
break; | ||
} | ||
// Leaving this bucket in the last cluster for later. | ||
full.into_bucket() | ||
} | ||
Empty(b) => { | ||
// Encountered a hole between clusters. | ||
b.into_bucket() | ||
} | ||
}; | ||
bucket.next(); | ||
} | ||
let mut bucket = Bucket::head_bucket(&mut old_table); | ||
|
||
// This is how the buckets might be laid out in memory: | ||
// ($ marks an initialized bucket) | ||
|
@@ -1208,6 +1181,57 @@ impl<K, V, S> HashMap<K, V, S> | |
|
||
self.search_mut(k).into_occupied_bucket().map(|bucket| pop_internal(bucket).1) | ||
} | ||
|
||
/// Retains only the elements specified by the predicate. | ||
/// | ||
/// In other words, remove all pairs `(k, v)` such that `f(&k,&mut v)` returns `false`. | ||
/// | ||
/// # Examples | ||
/// | ||
/// ``` | ||
/// #![feature(retain_hash_collection)] | ||
/// use std::collections::HashMap; | ||
/// | ||
/// let mut map: HashMap<isize, isize> = (0..8).map(|x|(x, x*10)).collect(); | ||
/// map.retain(|&k, _| k % 2 == 0); | ||
/// assert_eq!(map.len(), 4); | ||
/// ``` | ||
#[unstable(feature = "retain_hash_collection", issue = "36648")] | ||
pub fn retain<F>(&mut self, mut f: F) | ||
where F: FnMut(&K, &mut V) -> bool | ||
{ | ||
if self.table.capacity() == 0 || self.table.size() == 0 { | ||
return; | ||
} | ||
let mut bucket = Bucket::head_bucket(&mut self.table); | ||
bucket.prev(); | ||
let tail = bucket.index(); | ||
loop { | ||
bucket = match bucket.peek() { | ||
Full(mut full) => { | ||
let should_remove = { | ||
let (k, v) = full.read_mut(); | ||
!f(k, v) | ||
}; | ||
if should_remove { | ||
let prev_idx = full.index(); | ||
let prev_raw = full.raw(); | ||
let (_, _, t) = pop_internal(full); | ||
Bucket::new_from(prev_raw, prev_idx, t) | ||
} else { | ||
full.into_bucket() | ||
} | ||
}, | ||
Empty(b) => { | ||
b.into_bucket() | ||
} | ||
}; | ||
bucket.prev(); // reverse iteration | ||
if bucket.index() == tail { | ||
break; | ||
} | ||
} | ||
} | ||
} | ||
|
||
#[stable(feature = "rust1", since = "1.0.0")] | ||
|
@@ -1862,7 +1886,8 @@ impl<'a, K, V> OccupiedEntry<'a, K, V> { | |
/// ``` | ||
#[stable(feature = "map_entry_recover_keys2", since = "1.12.0")] | ||
pub fn remove_entry(self) -> (K, V) { | ||
pop_internal(self.elem) | ||
let (k, v, _) = pop_internal(self.elem); | ||
(k, v) | ||
} | ||
|
||
/// Gets a reference to the value in the entry. | ||
|
@@ -3156,4 +3181,15 @@ mod test_map { | |
assert_eq!(a.len(), 1); | ||
assert_eq!(a[key], value); | ||
} | ||
|
||
#[test] | ||
fn test_retain() { | ||
let mut map: HashMap<isize, isize> = (0..100).map(|x|(x, x*10)).collect(); | ||
|
||
map.retain(|&k, _| k % 2 == 0); | ||
assert_eq!(map.len(), 50); | ||
assert_eq!(map[&2], 20); | ||
assert_eq!(map[&4], 40); | ||
assert_eq!(map[&6], 60); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -85,7 +85,7 @@ pub struct RawTable<K, V> { | |
unsafe impl<K: Send, V: Send> Send for RawTable<K, V> {} | ||
unsafe impl<K: Sync, V: Sync> Sync for RawTable<K, V> {} | ||
|
||
struct RawBucket<K, V> { | ||
pub struct RawBucket<K, V> { | ||
hash: *mut HashUint, | ||
// We use *const to ensure covariance with respect to K and V | ||
pair: *const (K, V), | ||
|
@@ -216,6 +216,10 @@ impl<K, V, M> FullBucket<K, V, M> { | |
pub fn index(&self) -> usize { | ||
self.idx | ||
} | ||
/// Get the raw bucket. | ||
pub fn raw(&self) -> RawBucket<K, V> { | ||
self.raw | ||
} | ||
} | ||
|
||
impl<K, V, M> EmptyBucket<K, V, M> { | ||
|
@@ -230,6 +234,10 @@ impl<K, V, M> Bucket<K, V, M> { | |
pub fn index(&self) -> usize { | ||
self.idx | ||
} | ||
/// get the table. | ||
pub fn into_table(self) -> M { | ||
self.table | ||
} | ||
} | ||
|
||
impl<K, V, M> Deref for FullBucket<K, V, M> | ||
|
@@ -275,6 +283,16 @@ impl<K, V, M: Deref<Target = RawTable<K, V>>> Bucket<K, V, M> { | |
Bucket::at_index(table, hash.inspect() as usize) | ||
} | ||
|
||
pub fn new_from(r: RawBucket<K, V>, i: usize, t: M) | ||
-> Bucket<K, V, M> | ||
{ | ||
Bucket { | ||
raw: r, | ||
idx: i, | ||
table: t, | ||
} | ||
} | ||
|
||
pub fn at_index(table: M, ib_index: usize) -> Bucket<K, V, M> { | ||
// if capacity is 0, then the RawBucket will be populated with bogus pointers. | ||
// This is an uncommon case though, so avoid it in release builds. | ||
|
@@ -296,6 +314,40 @@ impl<K, V, M: Deref<Target = RawTable<K, V>>> Bucket<K, V, M> { | |
} | ||
} | ||
|
||
// "So a few of the first shall be last: for many be called, | ||
// but few chosen." | ||
// | ||
// We'll most likely encounter a few buckets at the beginning that | ||
// have their initial buckets near the end of the table. They were | ||
// placed at the beginning as the probe wrapped around the table | ||
// during insertion. We must skip forward to a bucket that won't | ||
// get reinserted too early and won't unfairly steal others spot. | ||
// This eliminates the need for robin hood. | ||
pub fn head_bucket(table: M) -> Bucket<K, V, M> { | ||
let mut bucket = Bucket::first(table); | ||
|
||
loop { | ||
bucket = match bucket.peek() { | ||
Full(full) => { | ||
if full.displacement() == 0 { | ||
// This bucket occupies its ideal spot. | ||
// It indicates the start of another "cluster". | ||
bucket = full.into_bucket(); | ||
break; | ||
} | ||
// Leaving this bucket in the last cluster for later. | ||
full.into_bucket() | ||
} | ||
Empty(b) => { | ||
// Encountered a hole between clusters. | ||
b.into_bucket() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now that we use it for retain we should early return on an empty bucket as well, avoiding walking the buckets more than needed. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't understand this. Empty bucket and occupied bucket are not in separate groups. I have to walk all the buckets without missing any one, just like the Could you please explain more about this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Due to the way RobinHood hashing works as soon as you encounter an empty bucket the next non-empty bucket is guaranteed to have displacement = 0. Right now the code goes all the way to the first full bucket with displacement = 0, which is fine for both use cases but may mean extra work for retain. Changing it to return on empty bucket as well changes nothing for resize and possibly saves this bit of work on retain. Micro, but still... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nevermind this, it's probably not worth it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm still struggling to fully understand the algorithm... Thank you all the same. Is there any other issue there? |
||
} | ||
}; | ||
bucket.next(); | ||
} | ||
bucket | ||
} | ||
|
||
/// Reads a bucket at a given index, returning an enum indicating whether | ||
/// it's initialized or not. You need to match on this enum to get | ||
/// the appropriate types to call most of the other functions in | ||
|
@@ -333,6 +385,17 @@ impl<K, V, M: Deref<Target = RawTable<K, V>>> Bucket<K, V, M> { | |
self.raw = self.raw.offset(dist); | ||
} | ||
} | ||
|
||
/// Modifies the bucket pointer in place to make it point to the previous slot. | ||
pub fn prev(&mut self) { | ||
let range = self.table.capacity(); | ||
let new_idx = self.idx.wrapping_sub(1) & (range - 1); | ||
let dist = (new_idx as isize).wrapping_sub(self.idx as isize); | ||
self.idx = new_idx; | ||
unsafe { | ||
self.raw = self.raw.offset(dist); | ||
} | ||
} | ||
} | ||
|
||
impl<K, V, M: Deref<Target = RawTable<K, V>>> EmptyBucket<K, V, M> { | ||
|
@@ -352,7 +415,7 @@ impl<K, V, M: Deref<Target = RawTable<K, V>>> EmptyBucket<K, V, M> { | |
} | ||
} | ||
|
||
pub fn gap_peek(self) -> Option<GapThenFull<K, V, M>> { | ||
pub fn gap_peek(self) -> Result<GapThenFull<K, V, M>, Bucket<K, V, M>> { | ||
let gap = EmptyBucket { | ||
raw: self.raw, | ||
idx: self.idx, | ||
|
@@ -361,12 +424,12 @@ impl<K, V, M: Deref<Target = RawTable<K, V>>> EmptyBucket<K, V, M> { | |
|
||
match self.next().peek() { | ||
Full(bucket) => { | ||
Some(GapThenFull { | ||
Ok(GapThenFull { | ||
gap: gap, | ||
full: bucket, | ||
}) | ||
} | ||
Empty(..) => None, | ||
Empty(e) => Err(e.into_bucket()), | ||
} | ||
} | ||
} | ||
|
@@ -529,7 +592,11 @@ impl<K, V, M> GapThenFull<K, V, M> | |
&self.full | ||
} | ||
|
||
pub fn shift(mut self) -> Option<GapThenFull<K, V, M>> { | ||
pub fn into_bucket(self) -> Bucket<K, V, M> { | ||
self.full.into_bucket() | ||
} | ||
|
||
pub fn shift(mut self) -> Result<GapThenFull<K, V, M>, Bucket<K, V, M>> { | ||
unsafe { | ||
*self.gap.raw.hash = mem::replace(&mut *self.full.raw.hash, EMPTY_BUCKET); | ||
ptr::copy_nonoverlapping(self.full.raw.pair, self.gap.raw.pair as *mut (K, V), 1); | ||
|
@@ -544,9 +611,9 @@ impl<K, V, M> GapThenFull<K, V, M> | |
|
||
self.full = bucket; | ||
|
||
Some(self) | ||
Ok(self) | ||
} | ||
Empty(..) => None, | ||
Empty(b) => Err(b.into_bucket()), | ||
} | ||
} | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you revert this to return the &mut Table (reverting my previous suggestion)? Sorry about that.
Right now it isn't consistent regarding what bucket it returns. Fixing that would increase the diff even more for no good reason.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok. Should I squash all the commits together?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please do.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.