diff --git a/accounts-db/benches/read_only_accounts_cache.rs b/accounts-db/benches/read_only_accounts_cache.rs index 4f1fa4febd3820..4aba9bd4111ac5 100644 --- a/accounts-db/benches/read_only_accounts_cache.rs +++ b/accounts-db/benches/read_only_accounts_cache.rs @@ -61,6 +61,7 @@ fn bench_read_only_accounts_cache(c: &mut Criterion) { let cache = Arc::new(ReadOnlyAccountsCache::new( AccountsDb::DEFAULT_MAX_READ_ONLY_CACHE_DATA_SIZE_LO, AccountsDb::DEFAULT_MAX_READ_ONLY_CACHE_DATA_SIZE_HI, + AccountsDb::DEFAULT_READ_ONLY_CACHE_EVICT_SAMPLE_SIZE, AccountsDb::READ_ONLY_CACHE_MS_TO_SKIP_LRU_UPDATE, )); @@ -181,6 +182,7 @@ fn bench_read_only_accounts_cache_eviction( let cache = Arc::new(ReadOnlyAccountsCache::new( max_data_size_lo, max_data_size_hi, + AccountsDb::DEFAULT_READ_ONLY_CACHE_EVICT_SAMPLE_SIZE, AccountsDb::READ_ONLY_CACHE_MS_TO_SKIP_LRU_UPDATE, )); diff --git a/accounts-db/src/accounts_db.rs b/accounts-db/src/accounts_db.rs index 878642ed396eb8..1e7a6a57e472e6 100644 --- a/accounts-db/src/accounts_db.rs +++ b/accounts-db/src/accounts_db.rs @@ -499,6 +499,7 @@ pub const ACCOUNTS_DB_CONFIG_FOR_TESTING: AccountsDbConfig = AccountsDbConfig { shrink_paths: None, shrink_ratio: DEFAULT_ACCOUNTS_SHRINK_THRESHOLD_OPTION, read_cache_limit_bytes: None, + read_cache_evict_sample_size: None, write_cache_limit_bytes: None, ancient_append_vec_offset: None, ancient_storage_ideal_size: None, @@ -525,6 +526,7 @@ pub const ACCOUNTS_DB_CONFIG_FOR_BENCHMARKS: AccountsDbConfig = AccountsDbConfig shrink_paths: None, shrink_ratio: DEFAULT_ACCOUNTS_SHRINK_THRESHOLD_OPTION, read_cache_limit_bytes: None, + read_cache_evict_sample_size: None, write_cache_limit_bytes: None, ancient_append_vec_offset: None, ancient_storage_ideal_size: None, @@ -649,6 +651,7 @@ pub struct AccountsDbConfig { /// The low and high watermark sizes for the read cache, in bytes. /// If None, defaults will be used. pub read_cache_limit_bytes: Option<(usize, usize)>, + pub read_cache_evict_sample_size: Option, pub write_cache_limit_bytes: Option, /// if None, ancient append vecs are set to ANCIENT_APPEND_VEC_DEFAULT_OFFSET /// Some(offset) means include slots up to (max_slot - (slots_per_epoch - 'offset')) @@ -1891,6 +1894,9 @@ impl AccountsDb { #[cfg_attr(feature = "dev-context-only-utils", qualifiers(pub))] const DEFAULT_MAX_READ_ONLY_CACHE_DATA_SIZE_HI: usize = 410 * 1024 * 1024; + #[cfg_attr(feature = "dev-context-only-utils", qualifiers(pub))] + const DEFAULT_READ_ONLY_CACHE_EVICT_SAMPLE_SIZE: usize = 8; + pub fn default_for_tests() -> Self { Self::new_single_for_tests() } @@ -1972,6 +1978,9 @@ impl AccountsDb { Self::DEFAULT_MAX_READ_ONLY_CACHE_DATA_SIZE_LO, Self::DEFAULT_MAX_READ_ONLY_CACHE_DATA_SIZE_HI, )); + let read_cache_evict_sample_size = accounts_db_config + .read_cache_evict_sample_size + .unwrap_or(Self::DEFAULT_READ_ONLY_CACHE_EVICT_SAMPLE_SIZE); // Increase the stack for foreground threads // rayon needs a lot of stack @@ -2027,6 +2036,7 @@ impl AccountsDb { read_only_accounts_cache: ReadOnlyAccountsCache::new( read_cache_size.0, read_cache_size.1, + read_cache_evict_sample_size, Self::READ_ONLY_CACHE_MS_TO_SKIP_LRU_UPDATE, ), write_cache_limit_bytes: accounts_db_config.write_cache_limit_bytes, diff --git a/accounts-db/src/read_only_accounts_cache.rs b/accounts-db/src/read_only_accounts_cache.rs index a616a863535073..0e53719a622e97 100644 --- a/accounts-db/src/read_only_accounts_cache.rs +++ b/accounts-db/src/read_only_accounts_cache.rs @@ -4,8 +4,8 @@ use qualifier_attr::qualifiers; use { dashmap::{mapref::entry::Entry, DashMap}, - index_list::{Index, IndexList}, log::*, + rand::{seq::SliceRandom, thread_rng}, solana_measure::{measure::Measure, measure_us}, solana_sdk::{ account::{AccountSharedData, ReadableAccount}, @@ -17,7 +17,7 @@ use { mem::ManuallyDrop, sync::{ atomic::{AtomicBool, AtomicU32, AtomicU64, AtomicUsize, Ordering}, - Arc, Mutex, + Arc, }, thread, time::Duration, @@ -38,8 +38,6 @@ struct ReadOnlyAccountCacheEntry { /// make sure that both pubkey and slot matches in the cache. Otherwise, we /// may return the wrong account. slot: Slot, - /// Index of the entry in the eviction queue. - index: AtomicU32, /// lower bits of last timestamp when eviction queue was updated, in ms last_update_time: AtomicU32, } @@ -72,12 +70,6 @@ struct AtomicReadOnlyCacheStats { #[derive(Debug)] pub(crate) struct ReadOnlyAccountsCache { cache: Arc>, - /// When an item is first entered into the cache, it is added to the end of - /// the queue. Also each time an entry is looked up from the cache it is - /// moved to the end of the queue. As a result, items in the queue are - /// always sorted in the order that they have last been accessed. When doing - /// LRU eviction, cache entries are evicted from the front of the queue. - queue: Arc>>, _max_data_size_lo: usize, _max_data_size_hi: usize, data_size: Arc, @@ -101,11 +93,11 @@ impl ReadOnlyAccountsCache { pub(crate) fn new( max_data_size_lo: usize, max_data_size_hi: usize, + evict_sample_size: usize, ms_to_skip_lru_update: u32, ) -> Self { assert!(max_data_size_lo <= max_data_size_hi); let cache = Arc::new(DashMap::default()); - let queue = Arc::new(Mutex::>::default()); let data_size = Arc::new(AtomicUsize::default()); let stats = Arc::new(AtomicReadOnlyCacheStats::default()); let evictor_exit_flag = Arc::new(AtomicBool::new(false)); @@ -114,8 +106,8 @@ impl ReadOnlyAccountsCache { max_data_size_lo, max_data_size_hi, data_size.clone(), + evict_sample_size, cache.clone(), - queue.clone(), stats.clone(), ); @@ -124,7 +116,6 @@ impl ReadOnlyAccountsCache { _max_data_size_lo: max_data_size_lo, _max_data_size_hi: max_data_size_hi, cache, - queue, data_size, ms_to_skip_lru_update, stats, @@ -148,15 +139,8 @@ impl ReadOnlyAccountsCache { let mut found = None; if let Some(entry) = self.cache.get(&pubkey) { if entry.slot == slot { - // Move the entry to the end of the queue. - // self.queue is modified while holding a reference to the cache entry; - // so that another thread cannot write to the same key. - // If we updated the eviction queue within this much time, then leave it where it is. We're likely to hit it again. let update_lru = entry.ms_since_last_update() >= self.ms_to_skip_lru_update; if update_lru { - let mut queue = self.queue.lock().unwrap(); - queue.remove(entry.index()); - entry.set_index(queue.insert_last(pubkey)); entry .last_update_time .store(ReadOnlyAccountCacheEntry::timestamp(), Ordering::Release); @@ -187,14 +171,9 @@ impl ReadOnlyAccountsCache { self.highest_slot_stored.fetch_max(slot, Ordering::Release); let account_size = Self::account_size(&account); self.data_size.fetch_add(account_size, Ordering::Relaxed); - // self.queue is modified while holding a reference to the cache entry; - // so that another thread cannot write to the same key. match self.cache.entry(pubkey) { Entry::Vacant(entry) => { - // Insert the entry at the end of the queue. - let mut queue = self.queue.lock().unwrap(); - let index = queue.insert_last(pubkey); - entry.insert(ReadOnlyAccountCacheEntry::new(account, slot, index)); + entry.insert(ReadOnlyAccountCacheEntry::new(account, slot)); } Entry::Occupied(mut entry) => { let entry = entry.get_mut(); @@ -202,10 +181,6 @@ impl ReadOnlyAccountsCache { self.data_size.fetch_sub(account_size, Ordering::Relaxed); entry.account = account; entry.slot = slot; - // Move the entry to the end of the queue. - let mut queue = self.queue.lock().unwrap(); - queue.remove(entry.index()); - entry.set_index(queue.insert_last(pubkey)); } }; let store_us = measure_store.end_as_us(); @@ -227,21 +202,16 @@ impl ReadOnlyAccountsCache { #[cfg_attr(feature = "dev-context-only-utils", qualifiers(pub))] pub(crate) fn remove(&self, pubkey: Pubkey) -> Option { - Self::do_remove(&pubkey, &self.cache, &self.queue, &self.data_size) + Self::do_remove(&pubkey, &self.cache, &self.data_size) } /// Removes `key` from the cache, if present, and returns the removed account fn do_remove( key: &ReadOnlyCacheKey, cache: &DashMap, - queue: &Mutex>, data_size: &AtomicUsize, ) -> Option { let (_, entry) = cache.remove(key)?; - // self.queue should be modified only after removing the entry from the - // cache, so that this is still safe if another thread writes to the - // same key. - queue.lock().unwrap().remove(entry.index()); let account_size = Self::account_size(&entry.account); data_size.fetch_sub(account_size, Ordering::Relaxed); Some(entry.account) @@ -289,8 +259,8 @@ impl ReadOnlyAccountsCache { max_data_size_lo: usize, max_data_size_hi: usize, data_size: Arc, + evict_sample_size: usize, cache: Arc>, - queue: Arc>>, stats: Arc, ) -> thread::JoinHandle<()> { thread::Builder::new() @@ -316,8 +286,12 @@ impl ReadOnlyAccountsCache { .evictor_wakeup_count_productive .fetch_add(1, Ordering::Relaxed); - let (num_evicts, evict_us) = - measure_us!(Self::evict(max_data_size_lo, &data_size, &cache, &queue)); + let (num_evicts, evict_us) = measure_us!(Self::evict( + max_data_size_lo, + &data_size, + evict_sample_size, + &cache, + )); stats.evicts.fetch_add(num_evicts, Ordering::Relaxed); stats.evict_us.fetch_add(evict_us, Ordering::Relaxed); } @@ -333,16 +307,39 @@ impl ReadOnlyAccountsCache { fn evict( target_data_size: usize, data_size: &AtomicUsize, + evict_sample_size: usize, cache: &DashMap, - queue: &Mutex>, ) -> u64 { + let mut rng = thread_rng(); + let mut num_evicts = 0; while data_size.load(Ordering::Relaxed) > target_data_size { - let Some(&key) = queue.lock().unwrap().get_first() else { - // if there are no more entries, we're done - break; + let shard = cache + .shards() + .choose(&mut rng) + // PANICS: It's practically impossible to end up with 0 + // shards in the dashmap. The default number of shards is + // determined here: + // https://github.com/xacrimon/dashmap/blob/v.5.5.3/src/lib.rs#L66-L71 + // Which would return 0 only if there are 0 CPUs. + .unwrap(); + let (key, account_size) = { + let shard = shard.read(); + let Some((key, entry)) = shard + .iter() + .take(evict_sample_size) + .min_by_key(|(_, entry)| entry.get().last_update_time.load(Ordering::Relaxed)) + else { + // This can only happen if the shard is empty. In that case, + // continue looping. + continue; + }; + (key.to_owned(), entry.get().account.data().len()) }; - Self::do_remove(&key, cache, queue, data_size); + + shard.write().remove_entry(&key); + let account_size = CACHE_ENTRY_SIZE + account_size; + data_size.fetch_sub(account_size, Ordering::Relaxed); num_evicts += 1; } num_evicts @@ -361,29 +358,14 @@ impl Drop for ReadOnlyAccountsCache { } impl ReadOnlyAccountCacheEntry { - fn new(account: AccountSharedData, slot: Slot, index: Index) -> Self { - let index = unsafe { std::mem::transmute::(index) }; - let index = AtomicU32::new(index); + fn new(account: AccountSharedData, slot: Slot) -> Self { Self { account, slot, - index, last_update_time: AtomicU32::new(Self::timestamp()), } } - #[inline] - fn index(&self) -> Index { - let index = self.index.load(Ordering::Relaxed); - unsafe { std::mem::transmute::(index) } - } - - #[inline] - fn set_index(&self, index: Index) { - let index = unsafe { std::mem::transmute::(index) }; - self.index.store(index, Ordering::Relaxed); - } - /// lower bits of current timestamp. We don't need higher bits and u32 packs with Index u32 in `ReadOnlyAccountCacheEntry` fn timestamp() -> u32 { timestamp() as u32 @@ -399,10 +381,7 @@ impl ReadOnlyAccountCacheEntry { mod tests { use { super::*, - rand::{ - seq::{IteratorRandom, SliceRandom}, - Rng, SeedableRng, - }, + rand::{seq::IteratorRandom, Rng, SeedableRng}, rand_chacha::ChaChaRng, solana_sdk::account::{accounts_equal, Account, WritableAccount}, std::{ @@ -411,6 +390,7 @@ mod tests { sync::Arc, time::{Duration, Instant}, }, + test_case::test_matrix, }; impl ReadOnlyAccountsCache { @@ -418,17 +398,21 @@ mod tests { // // Evicting in the background is non-deterministic w.r.t. when the evictor runs, // which can make asserting invariants difficult in tests. - fn evict_in_foreground(&self) { + fn evict_in_foreground(&self, evict_sample_size: usize) { #[allow(clippy::used_underscore_binding)] let target_data_size = self._max_data_size_lo; - Self::evict(target_data_size, &self.data_size, &self.cache, &self.queue); + Self::evict( + target_data_size, + &self.data_size, + evict_sample_size, + &self.cache, + ); } /// reset the read only accounts cache #[cfg(feature = "dev-context-only-utils")] pub fn reset_for_tests(&self) { self.cache.clear(); - self.queue.lock().unwrap().clear(); self.data_size.store(0, Ordering::Relaxed); } } @@ -446,9 +430,13 @@ mod tests { let per_account_size = CACHE_ENTRY_SIZE; let data_size = 100; let max = data_size + per_account_size; + // Larger sample size doesn't make sense for the small amount of + // accounts we're testing here. + let evict_sample_size = 1; let cache = ReadOnlyAccountsCache::new( max, usize::MAX, // <-- do not evict in the background + evict_sample_size, READ_ONLY_CACHE_MS_TO_SKIP_LRU_UPDATE_FOR_TESTS, ); let slot = 0; @@ -467,67 +455,121 @@ mod tests { account2.checked_add_lamports(1).unwrap(); // so they compare differently let mut account3 = account1.clone(); account3.checked_add_lamports(4).unwrap(); // so they compare differently + + // Store `account1`. cache.store(key1, slot, account1.clone()); - cache.evict_in_foreground(); + cache.evict_in_foreground(evict_sample_size); assert_eq!(100 + per_account_size, cache.data_size()); assert!(accounts_equal(&cache.load(key1, slot).unwrap(), &account1)); // pass a wrong slot and check that load fails assert!(cache.load(key1, slot + 1).is_none()); // insert another entry for slot+1, and assert only one entry for key1 is in the cache + + // Insert `account1` again for slot+1, and assert only one entry for + // `key1` is in the cache. cache.store(key1, slot + 1, account1.clone()); assert_eq!(1, cache.cache_len()); + + // Store `acocunt2`. cache.store(key2, slot, account2.clone()); - cache.evict_in_foreground(); - assert_eq!(100 + per_account_size, cache.data_size()); assert!(accounts_equal(&cache.load(key2, slot).unwrap(), &account2)); + cache.evict_in_foreground(evict_sample_size); + assert_eq!(100 + per_account_size, cache.data_size()); + // Due to sampled LRU eviction, we are not 100% sure whether it's the + // `account1` which got evicted. What we know for sure is that the + // cache should contain just one element. assert_eq!(1, cache.cache_len()); - cache.store(key2, slot, account1.clone()); // overwrite key2 with account1 - cache.evict_in_foreground(); + let entries: Vec<_> = cache + .cache + .iter() + .map(|entry| (entry.key().to_owned(), entry.value().account.to_owned())) + .collect(); + assert_eq!(entries.len(), 1); + let (loaded_key, loaded_account) = entries.first().unwrap(); + if *loaded_key == key1 { + assert!(accounts_equal(loaded_account, &account1)); + } else if *loaded_key == key2 { + assert!(accounts_equal(loaded_account, &account2)); + } else { + panic!("Unexpected key: {loaded_key:?}"); + } + + // Overwrite `key2` with `account1`. + cache.store(key2, slot, account1.clone()); + cache.evict_in_foreground(evict_sample_size); assert_eq!(100 + per_account_size, cache.data_size()); - assert!(accounts_equal(&cache.load(key2, slot).unwrap(), &account1)); + // Regardless of which key was evicted, both `key1` and `key2` should + // hold `account1`. assert_eq!(1, cache.cache_len()); + let entries: Vec<_> = cache + .cache + .iter() + .map(|entry| (entry.key().to_owned(), entry.value().account.to_owned())) + .collect(); + assert_eq!(entries.len(), 1); + let (_, loaded_account) = entries.first().unwrap(); + assert!(accounts_equal(loaded_account, &account1)); + + cache.remove(key1); cache.remove(key2); assert_eq!(0, cache.data_size()); assert_eq!(0, cache.cache_len()); - // can store 2 items, 3rd item kicks oldest item out + // Can store 2 items, 3rd item kicks the oldest item out. let max = (data_size + per_account_size) * 2; let cache = ReadOnlyAccountsCache::new( max, usize::MAX, // <-- do not evict in the background + evict_sample_size, READ_ONLY_CACHE_MS_TO_SKIP_LRU_UPDATE_FOR_TESTS, ); cache.store(key1, slot, account1.clone()); - cache.evict_in_foreground(); + cache.evict_in_foreground(evict_sample_size); assert_eq!(100 + per_account_size, cache.data_size()); assert!(accounts_equal(&cache.load(key1, slot).unwrap(), &account1)); assert_eq!(1, cache.cache_len()); cache.store(key2, slot, account2.clone()); - cache.evict_in_foreground(); + cache.evict_in_foreground(evict_sample_size); assert_eq!(max, cache.data_size()); assert!(accounts_equal(&cache.load(key1, slot).unwrap(), &account1)); assert!(accounts_equal(&cache.load(key2, slot).unwrap(), &account2)); assert_eq!(2, cache.cache_len()); cache.store(key2, slot, account1.clone()); // overwrite key2 with account1 - cache.evict_in_foreground(); + cache.evict_in_foreground(evict_sample_size); assert_eq!(max, cache.data_size()); assert!(accounts_equal(&cache.load(key1, slot).unwrap(), &account1)); assert!(accounts_equal(&cache.load(key2, slot).unwrap(), &account1)); assert_eq!(2, cache.cache_len()); cache.store(key3, slot, account3.clone()); - cache.evict_in_foreground(); + cache.evict_in_foreground(evict_sample_size); assert_eq!(max, cache.data_size()); - assert!(cache.load(key1, slot).is_none()); // was lru purged - assert!(accounts_equal(&cache.load(key2, slot).unwrap(), &account1)); - assert!(accounts_equal(&cache.load(key3, slot).unwrap(), &account3)); + assert_eq!(2, cache.cache_len()); + let entries: Vec<_> = cache + .cache + .iter() + .map(|entry| (entry.key().to_owned(), entry.value().account.to_owned())) + .collect(); + assert_eq!(entries.len(), 2); + for (loaded_key, loaded_account) in entries { + if loaded_key == key1 { + assert!(accounts_equal(&loaded_account, &account1)); + } else if loaded_key == key2 { + // `key2` was overwritten with `account1`. + assert!(accounts_equal(&loaded_account, &account1)); + } else if loaded_key == key3 { + assert!(accounts_equal(&loaded_account, &account3)); + } else { + panic!("Unexpected key: {loaded_key:?}"); + } + } } /// tests like to deterministically update lru always const READ_ONLY_CACHE_MS_TO_SKIP_LRU_UPDATE_FOR_TESTS: u32 = 0; - #[test] - fn test_read_only_accounts_cache_random() { + #[test_matrix([1, 2, 4, 8, 10, 16, 32])] + fn test_read_only_accounts_cache_random(evict_sample_size: usize) { const SEED: [u8; 32] = [0xdb; 32]; const DATA_SIZE: usize = 19; const MAX_CACHE_SIZE: usize = 17 * (CACHE_ENTRY_SIZE + DATA_SIZE); @@ -535,6 +577,7 @@ mod tests { let cache = ReadOnlyAccountsCache::new( MAX_CACHE_SIZE, usize::MAX, // <-- do not evict in the background + evict_sample_size, READ_ONLY_CACHE_MS_TO_SKIP_LRU_UPDATE_FOR_TESTS, ); let slots: Vec = repeat_with(|| rng.gen_range(0..1000)).take(5).collect(); @@ -570,33 +613,37 @@ mod tests { let pubkey = *pubkeys.choose(&mut rng).unwrap(); hash_map.insert(pubkey, (account.clone(), slot, ix)); cache.store(pubkey, slot, account); - cache.evict_in_foreground(); + cache.evict_in_foreground(evict_sample_size); } } assert_eq!(cache.cache_len(), 17); assert_eq!(hash_map.len(), 35); - let index = hash_map - .iter() - .filter(|(k, _)| cache.cache.contains_key(k)) - .map(|(_, (_, _, ix))| *ix) - .min() - .unwrap(); - for (pubkey, (account, slot, ix)) in hash_map { - assert_eq!( - cache.load(pubkey, slot), - if ix < index { None } else { Some(account) } - ); + // Ensure that all the cache entries hold information consistent with + // what we accumulated in the local hash map. + // Note that the opposite assertion (checking that all entries from the + // local hash map exist in the cache) wouldn't work, because of sampled + // LRU eviction. + for entry in cache.cache.iter() { + let pubkey = entry.key(); + let ReadOnlyAccountCacheEntry { account, slot, .. } = entry.value(); + + let (local_account, local_slot, _) = hash_map + .get(pubkey) + .expect("account to be present in the map"); + assert_eq!(account, local_account); + assert_eq!(slot, local_slot); } } - #[test] - fn test_evict_in_background() { + #[test_matrix([1, 2, 4, 8, 10, 16, 32])] + fn test_evict_in_background(evict_sample_size: usize) { const ACCOUNT_DATA_SIZE: usize = 200; const MAX_ENTRIES: usize = 7; const MAX_CACHE_SIZE: usize = MAX_ENTRIES * (CACHE_ENTRY_SIZE + ACCOUNT_DATA_SIZE); let cache = ReadOnlyAccountsCache::new( MAX_CACHE_SIZE, MAX_CACHE_SIZE, + evict_sample_size, READ_ONLY_CACHE_MS_TO_SKIP_LRU_UPDATE_FOR_TESTS, ); @@ -629,8 +676,5 @@ mod tests { // ...now ensure the cache size is right assert_eq!(cache.cache_len(), MAX_ENTRIES); assert_eq!(cache.data_size(), MAX_CACHE_SIZE); - - // and the most recent account we stored should still be in the cache - assert_eq!(cache.load(pubkey, slot).unwrap(), account); } }