From 81cbc5ba0baef263c24b694ca41560d552742f1f Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 20 Jan 2023 04:46:49 -0800 Subject: [PATCH] Introduce AccountsDataStorage Making new account file created in cold mode Introduce AccountsDataStorage Making new account file created in cold mode Introduce AccountsDataStorage Making new account file created in cold mode Test Test 2 --- Cargo.lock | 1 + ledger-tool/src/main.rs | 74 +++ runtime/Cargo.toml | 1 + runtime/src/account_storage.rs | 6 +- runtime/src/account_storage/meta.rs | 42 +- runtime/src/accounts_db.rs | 21 + runtime/src/accounts_file.rs | 29 +- runtime/src/ancient_append_vecs.rs | 3 + runtime/src/append_vec.rs | 130 ++-- runtime/src/append_vec/test_utils.rs | 14 + runtime/src/lib.rs | 1 + runtime/src/tiered_storage.rs | 703 +++++++++++++++++++++ runtime/src/tiered_storage/cold.rs | 530 ++++++++++++++++ runtime/src/tiered_storage/data_block.rs | 132 ++++ runtime/src/tiered_storage/file.rs | 71 +++ runtime/src/tiered_storage/footer.rs | 364 +++++++++++ runtime/src/tiered_storage/hot.rs | 511 +++++++++++++++ runtime/src/tiered_storage/meta_entries.rs | 257 ++++++++ runtime/src/tiered_storage/reader.rs | 184 ++++++ runtime/src/tiered_storage/writer.rs | 560 ++++++++++++++++ sdk/src/account.rs | 10 +- 21 files changed, 3579 insertions(+), 65 deletions(-) create mode 100644 runtime/src/tiered_storage.rs create mode 100644 runtime/src/tiered_storage/cold.rs create mode 100644 runtime/src/tiered_storage/data_block.rs create mode 100644 runtime/src/tiered_storage/file.rs create mode 100644 runtime/src/tiered_storage/footer.rs create mode 100644 runtime/src/tiered_storage/hot.rs create mode 100644 runtime/src/tiered_storage/meta_entries.rs create mode 100644 runtime/src/tiered_storage/reader.rs create mode 100644 runtime/src/tiered_storage/writer.rs diff --git a/Cargo.lock b/Cargo.lock index a2d5ac5190ec91..a95ac7ab4dd383 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6515,6 +6515,7 @@ dependencies = [ "num-derive", "num-traits", "num_cpus", + "num_enum", "once_cell", "ouroboros", "rand 0.7.3", diff --git a/ledger-tool/src/main.rs b/ledger-tool/src/main.rs index ed3c7b38f41ab3..c156101233ab55 100644 --- a/ledger-tool/src/main.rs +++ b/ledger-tool/src/main.rs @@ -53,6 +53,7 @@ use { transaction_status_service::TransactionStatusService, }, solana_runtime::{ + account_storage::meta::StoredAccountMeta, accounts::Accounts, accounts_background_service::{ AbsRequestHandlers, AbsRequestSender, AccountsBackgroundService, @@ -63,6 +64,7 @@ use { }, accounts_index::{AccountsIndexConfig, IndexLimitMb, ScanConfig}, accounts_update_notifier_interface::AccountsUpdateNotifier, + append_vec::AppendVec, bank::{Bank, RewardCalculationEvent, TotalAccountsStats}, bank_forks::BankForks, cost_model::CostModel, @@ -78,6 +80,7 @@ use { move_and_async_delete_path, ArchiveFormat, SnapshotVersion, DEFAULT_ARCHIVE_COMPRESSION, SUPPORTED_ARCHIVE_COMPRESSION, }, + tiered_storage::TieredStorage, }, solana_sdk::{ account::{AccountSharedData, ReadableAccount, WritableAccount}, @@ -2468,6 +2471,24 @@ fn main() { If no file name is specified, it will print the metadata of all ledger files.") ) ) + .subcommand( + SubCommand::with_name("new_ads_file") + .about("Create a new accounts-data-storage file from an existing append_vec file.") + .arg( + Arg::with_name("append_vec") + .long("append-vec") + .takes_value(true) + .value_name("APPEND_VEC_FILE_NAME") + .help("The name of the append vec file.") + ) + .arg( + Arg::with_name("ads_file_name") + .long("ads-file-name") + .takes_value(true) + .value_name("ADS_FILE_NAME") + .help("The name of the output ads file.") + ) + ) .get_matches(); info!("{} {}", crate_name!(), solana_version::version!()); @@ -4498,6 +4519,59 @@ fn main() { eprintln!("{err}"); } } + ("new_ads_file", Some(arg_matches)) => { + let append_vec_path = value_t_or_exit!(arg_matches, "append_vec", String); + let ads_file_path = + PathBuf::from(value_t_or_exit!(arg_matches, "ads_file_name", String)); + let append_vec_len = std::fs::metadata(&append_vec_path).unwrap().len() as usize; + let mut append_vec = + AppendVec::new_from_file_unchecked(append_vec_path, append_vec_len) + .expect("should succeed"); + append_vec.set_no_remove_on_drop(); + + let mut ads = TieredStorage::new(&ads_file_path, true /* create */); + ads.set_no_remove_on_drop(); + ads.write_from_append_vec(&append_vec).unwrap(); + + // read append-vec + let mut num_accounts = 0; + let mut offset = 0; + let mut account_map: HashMap = HashMap::new(); + while let Some((account, next_offset)) = append_vec.get_account(offset) { + offset = next_offset; + num_accounts += 1; + account_map.insert(*account.pubkey(), account); + } + + // iterate through all accounts in the tiered storage and + // verify their correctness. + offset = 0; + let mut tiered_num_accounts = 0; + while let Some((account, next_offset)) = ads.get_account(offset) { + tiered_num_accounts += 1; + offset = next_offset; + if *account.pubkey() == Pubkey::default() { + continue; + } + if *account.owner() == Pubkey::default() { + continue; + } + let av_account = &account_map[account.pubkey()]; + assert_eq!(*av_account.pubkey(), *account.pubkey()); + assert_eq!(*av_account.hash(), *account.hash()); + assert_eq!(av_account.data_len(), account.data_len()); + assert_eq!(av_account.write_version(), account.write_version()); + + assert_eq!(av_account.lamports(), account.lamports()); + assert_eq!(av_account.data(), account.data()); + assert_eq!(*av_account.owner(), *account.owner()); + assert_eq!(av_account.rent_epoch(), account.rent_epoch()); + assert_eq!(av_account.executable(), account.executable()); + } + assert_eq!(num_accounts, tiered_num_accounts); + + info!("# accounts from append_vec = {:?}", num_accounts); + } ("", _) => { eprintln!("{}", matches.usage()); exit(1); diff --git a/runtime/Cargo.toml b/runtime/Cargo.toml index 0b87b057521944..9a245399bfe51c 100644 --- a/runtime/Cargo.toml +++ b/runtime/Cargo.toml @@ -31,6 +31,7 @@ lru = { workspace = true } lz4 = { workspace = true } memmap2 = { workspace = true } modular-bitfield = { workspace = true } +num_enum = { workspace = true } num-derive = { workspace = true } num-traits = { workspace = true } num_cpus = { workspace = true } diff --git a/runtime/src/account_storage.rs b/runtime/src/account_storage.rs index 4479b8d338509e..a8b2f89670efcc 100644 --- a/runtime/src/account_storage.rs +++ b/runtime/src/account_storage.rs @@ -270,6 +270,8 @@ pub(crate) mod tests { use {super::*, std::path::Path}; #[test] + /// The new cold storage will fail this test as it does not have + /// the concept of capacity yet. fn test_shrink_in_progress() { // test that we check in order map then shrink_in_progress_map let storage = AccountStorage::default(); @@ -283,13 +285,13 @@ pub(crate) mod tests { let store_file_size = 4000; let store_file_size2 = store_file_size * 2; // 2 append vecs with same id, but different sizes - let entry = Arc::new(AccountStorageEntry::new( + let entry = Arc::new(AccountStorageEntry::new_av( common_store_path, slot, id, store_file_size, )); - let entry2 = Arc::new(AccountStorageEntry::new( + let entry2 = Arc::new(AccountStorageEntry::new_av( common_store_path, slot, id, diff --git a/runtime/src/account_storage/meta.rs b/runtime/src/account_storage/meta.rs index 8e244d18e665c3..1079c30a49cfe1 100644 --- a/runtime/src/account_storage/meta.rs +++ b/runtime/src/account_storage/meta.rs @@ -1,5 +1,11 @@ use { - crate::{append_vec::AppendVecStoredAccountMeta, storable_accounts::StorableAccounts}, + crate::{ + append_vec::AppendVecStoredAccountMeta, + storable_accounts::StorableAccounts, + tiered_storage::{ + cold::ColdAccountMeta, hot::HotAccountMeta, reader::TieredStoredAccountMeta, + }, + }, solana_sdk::{ account::{AccountSharedData, ReadableAccount}, hash::Hash, @@ -105,6 +111,8 @@ impl<'a: 'b, 'b, T: ReadableAccount + Sync + 'b, U: StorableAccounts<'a, T>, V: #[derive(PartialEq, Eq, Debug)] pub enum StoredAccountMeta<'a> { AppendVec(AppendVecStoredAccountMeta<'a>), + Cold(TieredStoredAccountMeta<'a, ColdAccountMeta>), + Hot(TieredStoredAccountMeta<'a, HotAccountMeta>), } impl<'a> StoredAccountMeta<'a> { @@ -112,66 +120,88 @@ impl<'a> StoredAccountMeta<'a> { pub fn clone_account(&self) -> AccountSharedData { match self { Self::AppendVec(av) => av.clone_account(), + Self::Cold(cs) => cs.clone_account(), + Self::Hot(hs) => hs.clone_account(), } } pub fn pubkey(&self) -> &'a Pubkey { match self { Self::AppendVec(av) => av.pubkey(), + Self::Cold(cs) => cs.pubkey(), + Self::Hot(hs) => hs.pubkey(), } } pub fn hash(&self) -> &'a Hash { match self { Self::AppendVec(av) => av.hash(), + Self::Cold(cs) => cs.hash(), + Self::Hot(hs) => hs.hash(), } } pub fn stored_size(&self) -> usize { match self { Self::AppendVec(av) => av.stored_size(), + Self::Cold(cs) => cs.stored_size(), + Self::Hot(hs) => hs.stored_size(), } } pub fn offset(&self) -> usize { match self { Self::AppendVec(av) => av.offset(), + Self::Cold(cs) => cs.offset(), + Self::Hot(hs) => hs.offset(), } } pub fn data(&self) -> &'a [u8] { match self { Self::AppendVec(av) => av.data(), + Self::Cold(cs) => cs.data(), + Self::Hot(hs) => hs.data(), } } pub fn data_len(&self) -> u64 { match self { Self::AppendVec(av) => av.data_len(), + Self::Cold(cs) => cs.data_len(), + Self::Hot(hs) => hs.data_len(), } } pub fn write_version(&self) -> StoredMetaWriteVersion { match self { Self::AppendVec(av) => av.write_version(), + Self::Cold(cs) => cs.write_version(), + Self::Hot(hs) => hs.write_version(), } } pub fn meta(&self) -> &StoredMeta { match self { Self::AppendVec(av) => av.meta(), + Self::Cold(_) => unreachable!(), + Self::Hot(_) => unreachable!(), } } pub fn set_meta(&mut self, meta: &'a StoredMeta) { match self { Self::AppendVec(av) => av.set_meta(meta), + Self::Cold(_) => unreachable!(), + Self::Hot(_) => unreachable!(), } } pub(crate) fn sanitize(&self) -> bool { match self { Self::AppendVec(av) => av.sanitize(), + Self::Cold(_) => unimplemented!(), + Self::Hot(_) => unreachable!(), } } } @@ -180,26 +210,36 @@ impl<'a> ReadableAccount for StoredAccountMeta<'a> { fn lamports(&self) -> u64 { match self { Self::AppendVec(av) => av.lamports(), + Self::Cold(cs) => cs.lamports(), + Self::Hot(hs) => hs.lamports(), } } fn data(&self) -> &[u8] { match self { Self::AppendVec(av) => av.data(), + Self::Cold(cs) => cs.data(), + Self::Hot(hs) => hs.data(), } } fn owner(&self) -> &Pubkey { match self { Self::AppendVec(av) => av.owner(), + Self::Cold(cs) => cs.owner(), + Self::Hot(hs) => hs.owner(), } } fn executable(&self) -> bool { match self { Self::AppendVec(av) => av.executable(), + Self::Cold(cs) => cs.executable(), + Self::Hot(hs) => hs.executable(), } } fn rent_epoch(&self) -> Epoch { match self { Self::AppendVec(av) => av.rent_epoch(), + Self::Cold(cs) => cs.rent_epoch(), + Self::Hot(hs) => hs.rent_epoch(), } } } diff --git a/runtime/src/accounts_db.rs b/runtime/src/accounts_db.rs index 7d46cddecb56cb..5a97071138bf11 100644 --- a/runtime/src/accounts_db.rs +++ b/runtime/src/accounts_db.rs @@ -66,6 +66,7 @@ use { snapshot_utils::create_accounts_run_and_snapshot_dirs, sorted_storages::SortedStorages, storable_accounts::StorableAccounts, + tiered_storage::{cold::COLD_FORMAT, TieredStorage}, verify_accounts_hash_in_background::VerifyAccountsHashInBackground, }, blake3::traits::digest::Digest, @@ -1046,6 +1047,10 @@ pub struct AccountStorageEntry { impl AccountStorageEntry { pub fn new(path: &Path, slot: Slot, id: AppendVecId, file_size: u64) -> Self { + Self::new_cold(path, slot, id, file_size) + } + + pub fn new_av(path: &Path, slot: Slot, id: AppendVecId, file_size: u64) -> Self { let tail = AccountsFile::file_name(slot, id); let path = Path::new(path).join(tail); let accounts = AccountsFile::AppendVec(AppendVec::new(&path, true, file_size as usize)); @@ -1060,6 +1065,22 @@ impl AccountStorageEntry { } } + pub fn new_cold(path: &Path, slot: Slot, id: AppendVecId, _file_size: u64) -> Self { + let tail = AccountsFile::file_name(slot, id); + let path = Path::new(path).join(tail); + let accounts = AccountsFile::Cold(TieredStorage::new(&path, Some(&COLD_FORMAT))); + info!("YH: Create new cold storage at path {:?}", path); + + Self { + id: AtomicAppendVecId::new(id), + slot: AtomicU64::new(slot), + accounts, + count_and_status: RwLock::new((0, AccountStorageStatus::Available)), + approx_store_count: AtomicUsize::new(0), + alive_bytes: AtomicUsize::new(0), + } + } + pub(crate) fn new_existing( slot: Slot, id: AppendVecId, diff --git a/runtime/src/accounts_file.rs b/runtime/src/accounts_file.rs index c6b52bec1a5221..ca93bf6425bf47 100644 --- a/runtime/src/accounts_file.rs +++ b/runtime/src/accounts_file.rs @@ -5,6 +5,7 @@ use { }, append_vec::{AppendVec, MatchAccountOwnerError}, storable_accounts::StorableAccounts, + tiered_storage::{cold::COLD_FORMAT, TieredStorage}, }, solana_sdk::{account::ReadableAccount, clock::Slot, hash::Hash, pubkey::Pubkey}, std::{ @@ -29,6 +30,7 @@ macro_rules! u64_align { /// under different formats. pub enum AccountsFile { AppendVec(AppendVec), + Cold(TieredStorage), } impl AccountsFile { @@ -37,52 +39,71 @@ impl AccountsFile { /// The second element of the returned tuple is the number of accounts in the /// accounts file. pub fn new_from_file(path: impl AsRef, current_len: usize) -> io::Result<(Self, usize)> { - let (av, num_accounts) = AppendVec::new_from_file(path, current_len)?; + if let Ok((ts, num_accounts)) = TieredStorage::new_from_file(path.as_ref()) { + log::info!("YH: Open {:?} as cold storage", path.as_ref()); + return Ok((Self::Cold(ts), num_accounts)); + } + + let (av, num_accounts) = AppendVec::new_from_file(path.as_ref(), current_len)?; Ok((Self::AppendVec(av), num_accounts)) } + pub fn new_cold_entry(file_path: &Path, create: bool) -> Self { + Self::Cold(TieredStorage::new( + file_path, + create.then_some(&COLD_FORMAT), + )) + } + /// By default, all AccountsFile will remove its underlying file on /// drop. Calling this function to disable such behavior for this /// instance. pub fn set_no_remove_on_drop(&mut self) { match self { Self::AppendVec(av) => av.set_no_remove_on_drop(), + Self::Cold(ts) => ts.set_no_remove_on_drop(), } } pub fn flush(&self) -> io::Result<()> { match self { Self::AppendVec(av) => av.flush(), + Self::Cold(..) => Ok(()), } } pub fn reset(&self) { match self { Self::AppendVec(av) => av.reset(), + Self::Cold(..) => {} } } pub fn remaining_bytes(&self) -> u64 { match self { Self::AppendVec(av) => av.remaining_bytes(), + Self::Cold(ts) => ts.remaining_bytes(), } } pub fn len(&self) -> usize { match self { Self::AppendVec(av) => av.len(), + Self::Cold(ts) => ts.len(), } } pub fn is_empty(&self) -> bool { match self { Self::AppendVec(av) => av.is_empty(), + Self::Cold(ts) => ts.is_empty(), } } pub fn capacity(&self) -> u64 { match self { Self::AppendVec(av) => av.capacity(), + Self::Cold(ts) => ts.capacity(), } } @@ -96,6 +117,7 @@ impl AccountsFile { pub fn get_account(&self, index: usize) -> Option<(StoredAccountMeta<'_>, usize)> { match self { Self::AppendVec(av) => av.get_account(index), + Self::Cold(ts) => ts.get_account(index), } } @@ -106,6 +128,7 @@ impl AccountsFile { ) -> Result { match self { Self::AppendVec(av) => av.account_matches_owners(offset, owners), + Self::Cold(ts) => ts.account_matches_owners(offset, owners), } } @@ -113,6 +136,7 @@ impl AccountsFile { pub fn get_path(&self) -> PathBuf { match self { Self::AppendVec(av) => av.get_path(), + Self::Cold(ts) => ts.get_path(), } } @@ -125,6 +149,7 @@ impl AccountsFile { pub fn accounts(&self, offset: usize) -> Vec { match self { Self::AppendVec(av) => av.accounts(offset), + Self::Cold(ts) => ts.accounts(offset), } } @@ -148,6 +173,7 @@ impl AccountsFile { ) -> Option> { match self { Self::AppendVec(av) => av.append_accounts(accounts, skip), + Self::Cold(ts) => ts.append_accounts(accounts, skip), } } } @@ -186,6 +212,7 @@ pub mod tests { pub(crate) fn set_current_len_for_tests(&self, len: usize) { match self { Self::AppendVec(av) => av.set_current_len_for_tests(len), + Self::Cold(..) => todo!(), } } } diff --git a/runtime/src/ancient_append_vecs.rs b/runtime/src/ancient_append_vecs.rs index 6480eb9ab8b6b5..ac6bdd2362ed28 100644 --- a/runtime/src/ancient_append_vecs.rs +++ b/runtime/src/ancient_append_vecs.rs @@ -752,6 +752,9 @@ pub fn get_ancient_append_vec_capacity() -> u64 { pub fn is_ancient(storage: &AccountsFile) -> bool { match storage { AccountsFile::AppendVec(storage) => storage.capacity() >= get_ancient_append_vec_capacity(), + AccountsFile::Cold(..) => { + return false; + } } } diff --git a/runtime/src/append_vec.rs b/runtime/src/append_vec.rs index 8c4653115a813e..ffcd204cb8bf51 100644 --- a/runtime/src/append_vec.rs +++ b/runtime/src/append_vec.rs @@ -176,10 +176,10 @@ impl<'a> ReadableAccount for AppendVecStoredAccountMeta<'a> { fn lamports(&self) -> u64 { self.account_meta.lamports } - fn data(&self) -> &[u8] { + fn data(&self) -> &'a [u8] { self.data() } - fn owner(&self) -> &Pubkey { + fn owner(&self) -> &'a Pubkey { &self.account_meta.owner } fn executable(&self) -> bool { @@ -599,6 +599,7 @@ impl AppendVec { let mut offsets = Vec::with_capacity(len); for i in skip..len { let (account, pubkey, hash, write_version_obsolete) = accounts.get(i); + println!("av hash = {}", hash); let account_meta = account .map(|account| AccountMeta { lamports: account.lamports(), @@ -675,7 +676,10 @@ pub mod tests { self.current_len.store(len, Ordering::Release); } - fn append_account_test(&self, data: &(StoredMeta, AccountSharedData)) -> Option { + pub(crate) fn append_account_test( + &self, + data: &(StoredMeta, AccountSharedData), + ) -> Option { let slot_ignored = Slot::MAX; let accounts = [(&data.0.pubkey, &data.1)]; let slice = &accounts[..]; @@ -697,6 +701,8 @@ pub mod tests { pub(crate) fn ref_executable_byte(&self) -> &u8 { match self { Self::AppendVec(av) => av.ref_executable_byte(), + Self::Cold(_) => unimplemented!(), + Self::Hot(_) => unimplemented!(), } } } @@ -1180,20 +1186,23 @@ pub mod tests { av.append_account_test(&create_test_account(10)).unwrap(); let accounts = av.accounts(0); - let StoredAccountMeta::AppendVec(account) = accounts.first().unwrap(); - account.set_data_len_unsafe(crafted_data_len); - assert_eq!(account.data_len(), crafted_data_len); + if let StoredAccountMeta::AppendVec(account) = accounts.first().unwrap() { + account.set_data_len_unsafe(crafted_data_len); + assert_eq!(account.data_len(), crafted_data_len); - // Reload accounts and observe crafted_data_len - let accounts = av.accounts(0); - let account = accounts.first().unwrap(); - assert_eq!(account.data_len(), crafted_data_len); + // Reload accounts and observe crafted_data_len + let accounts = av.accounts(0); + let account = accounts.first().unwrap(); + assert_eq!(account.data_len(), crafted_data_len); - av.flush().unwrap(); - let accounts_len = av.len(); - drop(av); - let result = AppendVec::new_from_file(path, accounts_len); - assert_matches!(result, Err(ref message) if message.to_string().starts_with("incorrect layout/length/data")); + av.flush().unwrap(); + let accounts_len = av.len(); + drop(av); + let result = AppendVec::new_from_file(path, accounts_len); + assert_matches!(result, Err(ref message) if message.to_string().starts_with("incorrect layout/length/data")); + } else { + unreachable!(); + } } #[test] @@ -1207,19 +1216,22 @@ pub mod tests { av.append_account_test(&create_test_account(10)).unwrap(); let accounts = av.accounts(0); - let StoredAccountMeta::AppendVec(account) = accounts.first().unwrap(); - account.set_data_len_unsafe(too_large_data_len); - assert_eq!(account.data_len(), too_large_data_len); + if let StoredAccountMeta::AppendVec(account) = accounts.first().unwrap() { + account.set_data_len_unsafe(too_large_data_len); + assert_eq!(account.data_len(), too_large_data_len); - // Reload accounts and observe no account with bad offset - let accounts = av.accounts(0); - assert_matches!(accounts.first(), None); + // Reload accounts and observe no account with bad offset + let accounts = av.accounts(0); + assert_matches!(accounts.first(), None); - av.flush().unwrap(); - let accounts_len = av.len(); - drop(av); - let result = AppendVec::new_from_file(path, accounts_len); - assert_matches!(result, Err(ref message) if message.to_string().starts_with("incorrect layout/length/data")); + av.flush().unwrap(); + let accounts_len = av.len(); + drop(av); + let result = AppendVec::new_from_file(path, accounts_len); + assert_matches!(result, Err(ref message) if message.to_string().starts_with("incorrect layout/length/data")); + } else { + unreachable!(); + } } #[test] @@ -1242,46 +1254,52 @@ pub mod tests { assert_eq!(*accounts[0].ref_executable_byte(), 0); assert_eq!(*accounts[1].ref_executable_byte(), 1); - let StoredAccountMeta::AppendVec(account) = &accounts[0]; - let crafted_executable = u8::max_value() - 1; + let crafted_executable: u8; + if let StoredAccountMeta::AppendVec(account) = &accounts[0] { + crafted_executable = u8::max_value() - 1; - account.set_executable_as_byte(crafted_executable); + account.set_executable_as_byte(crafted_executable); + } else { + unreachable!(); + } // reload crafted accounts let accounts = av.accounts(0); - let StoredAccountMeta::AppendVec(account) = accounts.first().unwrap(); + if let StoredAccountMeta::AppendVec(account) = accounts.first().unwrap() { + // upper 7-bits are not 0, so sanitization should fail + assert!(!account.sanitize_executable()); - // upper 7-bits are not 0, so sanitization should fail - assert!(!account.sanitize_executable()); - - // we can observe crafted value by ref - { - let executable_bool: &bool = &account.account_meta.executable; - // Depending on use, *executable_bool can be truthy or falsy due to direct memory manipulation - // assert_eq! thinks *executable_bool is equal to false but the if condition thinks it's not, contradictorily. - assert!(!*executable_bool); - #[cfg(not(target_arch = "aarch64"))] + // we can observe crafted value by ref { - const FALSE: bool = false; // keep clippy happy - if *executable_bool == FALSE { - panic!("This didn't occur if this test passed."); + let executable_bool: &bool = &account.account_meta.executable; + // Depending on use, *executable_bool can be truthy or falsy due to direct memory manipulation + // assert_eq! thinks *executable_bool is equal to false but the if condition thinks it's not, contradictorily. + assert!(!*executable_bool); + #[cfg(not(target_arch = "aarch64"))] + { + const FALSE: bool = false; // keep clippy happy + if *executable_bool == FALSE { + panic!("This didn't occur if this test passed."); + } } + assert_eq!(*account.ref_executable_byte(), crafted_executable); } - assert_eq!(*account.ref_executable_byte(), crafted_executable); - } - // we can NOT observe crafted value by value - { - let executable_bool: bool = account.executable(); - assert!(!executable_bool); - assert_eq!(account.get_executable_byte(), 0); // Wow, not crafted_executable! - } + // we can NOT observe crafted value by value + { + let executable_bool: bool = account.executable(); + assert!(!executable_bool); + assert_eq!(account.get_executable_byte(), 0); // Wow, not crafted_executable! + } - av.flush().unwrap(); - let accounts_len = av.len(); - drop(av); - let result = AppendVec::new_from_file(path, accounts_len); - assert_matches!(result, Err(ref message) if message.to_string().starts_with("incorrect layout/length/data")); + av.flush().unwrap(); + let accounts_len = av.len(); + drop(av); + let result = AppendVec::new_from_file(path, accounts_len); + assert_matches!(result, Err(ref message) if message.to_string().starts_with("incorrect layout/length/data")); + } else { + unreachable!(); + } } #[test] diff --git a/runtime/src/append_vec/test_utils.rs b/runtime/src/append_vec/test_utils.rs index 252044e755d06c..04e72703bc81e9 100644 --- a/runtime/src/append_vec/test_utils.rs +++ b/runtime/src/append_vec/test_utils.rs @@ -45,3 +45,17 @@ pub fn create_test_account(sample: usize) -> (StoredMeta, AccountSharedData) { }; (stored_meta, account) } + +pub fn create_test_account_from_len(data_len: usize) -> (StoredMeta, AccountSharedData) { + let mut account = AccountSharedData::new(data_len as u64, 0, &Pubkey::new_unique()); + let data_byte: u8 = (data_len % 256) as u8; + account.set_data((0..data_len).map(|_| data_byte).collect()); + account.executable = data_len % 2 > 0; + account.rent_epoch = if data_len % 3 > 0 { data_len as u64 } else { 0 }; + let stored_meta = StoredMeta { + write_version_obsolete: data_len as u64, + pubkey: Pubkey::new_unique(), + data_len: data_len as u64, + }; + (stored_meta, account) +} diff --git a/runtime/src/lib.rs b/runtime/src/lib.rs index 8a5bf2b0511af6..698018494c977c 100644 --- a/runtime/src/lib.rs +++ b/runtime/src/lib.rs @@ -76,6 +76,7 @@ pub mod static_ids; pub mod status_cache; mod storable_accounts; mod system_instruction_processor; +pub mod tiered_storage; pub mod transaction_batch; pub mod transaction_error_metrics; pub mod transaction_priority_details; diff --git a/runtime/src/tiered_storage.rs b/runtime/src/tiered_storage.rs new file mode 100644 index 00000000000000..0a0a322e88958c --- /dev/null +++ b/runtime/src/tiered_storage.rs @@ -0,0 +1,703 @@ +//! docs/src/proposals/append-vec-storage.md + +pub mod cold; +pub mod data_block; +pub mod file; +pub mod footer; +pub mod hot; +pub mod meta_entries; +pub mod mmap_utils; +pub mod reader; +pub mod writer; + +use { + crate::{ + account_storage::meta::{ + StorableAccountsWithHashesAndWriteVersions, StoredAccountInfo, StoredAccountMeta, + }, + append_vec::{AppendVec, MatchAccountOwnerError}, + storable_accounts::StorableAccounts, + }, + data_block::AccountDataBlockWriter, + footer::TieredFileFormat, + log::log_enabled, + once_cell::sync::OnceCell, + reader::TieredStorageReader, + solana_sdk::{account::ReadableAccount, hash::Hash, pubkey::Pubkey}, + std::{ + borrow::Borrow, + fs::{remove_file, OpenOptions}, + path::{Path, PathBuf}, + }, + writer::TieredStorageWriter, +}; + +pub const ACCOUNT_DATA_BLOCK_SIZE: usize = 4096; +pub const ACCOUNTS_DATA_STORAGE_FORMAT_VERSION: u64 = 1; + +lazy_static! { + pub static ref HASH_DEFAULT: Hash = Hash::default(); +} + +#[derive(Debug)] +pub struct TieredStorage { + reader: OnceCell, + // This format will be used when creating new data + format: Option<&'static TieredFileFormat>, + path: PathBuf, + remove_on_drop: bool, +} + +impl Drop for TieredStorage { + fn drop(&mut self) { + if self.remove_on_drop { + if let Err(_e) = remove_file(&self.path) { + // promote this to panic soon. + // disabled due to many false positive warnings while running tests. + // blocked by rpc's upgrade to jsonrpc v17 + //error!("AppendVec failed to remove {:?}: {:?}", &self.path, e); + inc_new_counter_info!("append_vec_drop_fail", 1); + } + } + } +} + +impl TieredStorage { + pub fn new(file_path: &Path, format: Option<&'static TieredFileFormat>) -> Self { + if format.is_some() { + let _ignored = remove_file(file_path); + Self { + reader: OnceCell::::new(), + format: format, + path: file_path.to_path_buf(), + remove_on_drop: true, + } + } else { + let (accounts_file, _) = Self::new_from_file(file_path).unwrap(); + return accounts_file; + } + } + + pub fn new_from_file>(path: P) -> std::io::Result<(Self, usize)> { + let reader = TieredStorageReader::new_from_path(path.as_ref())?; + let count = reader.num_accounts(); + let reader_cell = OnceCell::::new(); + reader_cell.set(reader).unwrap(); + Ok(( + Self { + reader: reader_cell, + format: None, + path: path.as_ref().to_path_buf(), + remove_on_drop: true, + }, + count, + )) + } + + pub fn account_matches_owners( + &self, + multiplied_index: usize, + owners: &[&Pubkey], + ) -> Result { + if let Some(reader) = self.reader.get() { + return reader.account_matches_owners(multiplied_index, owners); + } + + Err(MatchAccountOwnerError::UnableToLoad) + } + + pub fn get_account<'a>( + &'a self, + multiplied_index: usize, + ) -> Option<(StoredAccountMeta<'a>, usize)> { + if multiplied_index % 1024 == 0 { + log::info!( + "TieredStorage::get_account(): fetch {} account at file {:?}", + multiplied_index, + self.path + ); + } + if let Some(reader) = self.reader.get() { + return reader.get_account(multiplied_index); + } + None + } + + pub fn get_path(&self) -> PathBuf { + self.path.clone() + } + + pub fn accounts(&self, mut multiplied_index: usize) -> Vec { + log::info!( + "TieredStorage::accounts(): fetch all accounts after {} at file {:?}", + multiplied_index, + self.path + ); + let mut accounts = vec![]; + while let Some((account, next)) = self.get_account(multiplied_index) { + accounts.push(account); + multiplied_index = next; + } + accounts + } + + // Returns the Vec of offsets corresponding to the input accounts to later + // construct AccountInfo + pub fn append_accounts< + 'a, + 'b, + T: ReadableAccount + Sync, + U: StorableAccounts<'a, T>, + V: Borrow, + >( + &self, + accounts: &StorableAccountsWithHashesAndWriteVersions<'a, 'b, T, U, V>, + skip: usize, + ) -> Option> { + log::info!("TieredStorage::append_accounts(): file {:?}", self.path); + if self.is_read_only() { + log::error!("TieredStorage::append_accounts(): attempt to append accounts to read only file {:?}", self.path); + return None; + } + + let result: Option>; + { + let writer = TieredStorageWriter::new(&self.path, self.format.unwrap()); + result = writer.append_accounts(accounts, skip); + } + + if self + .reader + .set(TieredStorageReader::new_from_path(&self.path).unwrap()) + .is_err() + { + panic!( + "TieredStorage::append_accounts(): unable to create reader for file {:?}", + self.path + ); + } + log::info!( + "TieredStorage::append_accounts(): successfully appended {} accounts to file {:?}", + accounts.len() - skip, + self.path + ); + result + } + + pub fn file_size(&self) -> std::io::Result { + let file = OpenOptions::new() + .read(true) + .create(false) + .open(self.path.to_path_buf())?; + Ok(file.metadata()?.len()) + } + + pub fn is_read_only(&self) -> bool { + self.reader.get().is_some() + } + + pub fn write_from_append_vec(&self, append_vec: &AppendVec) -> std::io::Result<()> { + let writer = TieredStorageWriter::new(&self.path, self.format.unwrap()); + let result = writer.write_from_append_vec(&append_vec); + if result.is_ok() { + if self + .reader + .set(TieredStorageReader::new_from_path(&self.path).unwrap()) + .is_ok() + { + return result; + } else { + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + "TieredStorageError::Reader failure", + )); + } + } + + result + } + + /////////////////////////////////////////////////////////////////////////////// + + pub fn set_no_remove_on_drop(&mut self) { + self.remove_on_drop = false; + } + + pub fn remaining_bytes(&self) -> u64 { + if self.is_read_only() { + return 0; + } + std::u64::MAX + } + + pub fn len(&self) -> usize { + self.file_size().unwrap_or(0).try_into().unwrap() + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /////////////////////////////////////////////////////////////////////////////// + // unimplemented + + pub fn flush(&self) -> std::io::Result<()> { + Ok(()) + } + + pub fn reset(&self) {} + + pub fn capacity(&self) -> u64 { + if self.is_read_only() { + return self.len().try_into().unwrap(); + } + self.len().try_into().unwrap() + } + + pub fn is_ancient(&self) -> bool { + false + } +} + +#[cfg(test)] +pub mod tests { + use { + crate::{ + account_storage::meta::{StorableAccountsWithHashesAndWriteVersions, StoredMeta}, + append_vec::{ + test_utils::{create_test_account_from_len, get_append_vec_path, TempFile}, + AppendVec, + }, + tiered_storage::{ + cold::COLD_FORMAT, + footer::{TieredFileFormat, TieredStorageFooter, FOOTER_SIZE}, + hot::HOT_FORMAT, + reader::TieredStorageReader, + TieredStorage, ACCOUNTS_DATA_STORAGE_FORMAT_VERSION, ACCOUNT_DATA_BLOCK_SIZE, + }, + }, + once_cell::sync::OnceCell, + solana_sdk::{account::AccountSharedData, clock::Slot, hash::Hash, pubkey::Pubkey}, + std::{collections::HashMap, mem, path::Path}, + }; + + impl TieredStorage { + fn new_for_test(file_path: &Path, format: &'static TieredFileFormat) -> Self { + Self { + reader: OnceCell::::new(), + format: Some(format), + path: file_path.to_path_buf(), + remove_on_drop: false, + } + } + + fn footer(&self) -> Option<&TieredStorageFooter> { + if let Some(reader) = self.reader.get() { + return Some(reader.footer()); + } + None + } + } + + impl TieredStorageReader { + fn footer(&self) -> &TieredStorageFooter { + match self { + Self::Cold(cs) => &cs.footer, + Self::Hot(hs) => hs.footer(), + } + } + } + + fn create_test_append_vec( + path: &str, + data_sizes: &[usize], + ) -> (HashMap, AppendVec) { + let av_path = get_append_vec_path(path); + let av = AppendVec::new(&av_path.path, true, 100 * 1024 * 1024); + let mut test_accounts: HashMap = HashMap::new(); + + for size in data_sizes { + let account = create_test_account_from_len(*size); + let index = av.append_account_test(&account).unwrap(); + assert_eq!(av.get_account_test(index).unwrap(), account); + test_accounts.insert(account.0.pubkey, account); + } + + (test_accounts, av) + } + + fn ads_writer_test_help( + path_prefix: &str, + account_data_sizes: &[usize], + format: &'static TieredFileFormat, + ) { + /* + write_from_append_vec_test_helper( + &(path_prefix.to_owned() + "_from_append_vec"), + account_data_sizes, + );*/ + append_accounts_test_helper( + &(path_prefix.to_owned() + "_append_accounts"), + account_data_sizes, + format, + ); + } + + fn append_accounts_test_helper( + path_prefix: &str, + account_data_sizes: &[usize], + format: &'static TieredFileFormat, + ) { + let account_count = account_data_sizes.len(); + let (test_accounts, _av) = + create_test_append_vec(&(path_prefix.to_owned() + "_av"), account_data_sizes); + + let slot_ignored = Slot::MAX; + let accounts: Vec<(Pubkey, AccountSharedData)> = test_accounts + .clone() + .into_iter() + .map(|(pubkey, acc)| (pubkey, acc.1)) + .collect(); + let mut accounts_ref: Vec<(&Pubkey, &AccountSharedData)> = Vec::new(); + + for (x, y) in &accounts { + accounts_ref.push((&x, &y)); + } + + let slice = &accounts_ref[..]; + let account_data = (slot_ignored, slice); + let mut write_versions = Vec::new(); + + for (_pubkey, acc) in &test_accounts { + write_versions.push(acc.0.write_version_obsolete); + } + + let mut hashes = Vec::new(); + let mut hashes_ref = Vec::new(); + let mut hashes_map = HashMap::new(); + + for _ in 0..write_versions.len() { + hashes.push(Hash::new_unique()); + } + for i in 0..write_versions.len() { + hashes_ref.push(&hashes[i]); + } + for i in 0..write_versions.len() { + hashes_map.insert(accounts[i].0, &hashes[i]); + } + + let storable_accounts = + StorableAccountsWithHashesAndWriteVersions::new_with_hashes_and_write_versions( + &account_data, + hashes_ref, + write_versions, + ); + + let ads_path = get_append_vec_path(&(path_prefix.to_owned() + "_ads")); + { + let ads = TieredStorage::new_for_test(&ads_path.path, format); + ads.append_accounts(&storable_accounts, 0); + } + + verify_account_data_storage( + account_count, + &test_accounts, + &ads_path, + &hashes_map, + format, + ); + } + + fn write_from_append_vec_test_helper( + path_prefix: &str, + account_data_sizes: &[usize], + format: &'static TieredFileFormat, + ) { + let account_count = account_data_sizes.len(); + let (test_accounts, av) = + create_test_append_vec(&(path_prefix.to_owned() + "_av"), account_data_sizes); + + let ads_path = get_append_vec_path(&(path_prefix.to_owned() + "_ads")); + { + let ads = TieredStorage::new_for_test(&ads_path.path, format); + ads.write_from_append_vec(&av).unwrap(); + } + + verify_account_data_storage( + account_count, + &test_accounts, + &ads_path, + &HashMap::new(), + format, + ); + } + + fn verify_account_data_storage( + account_count: usize, + test_accounts: &HashMap, + ads_path: &TempFile, + hashes_map: &HashMap, + format: &'static TieredFileFormat, + ) { + let ads = TieredStorage::new(&ads_path.path, None); + let footer = ads.footer().unwrap(); + + let expected_footer = TieredStorageFooter { + account_meta_format: format.account_meta_format.clone(), + owners_block_format: format.owners_block_format.clone(), + account_index_format: format.account_index_format.clone(), + data_block_format: format.data_block_format.clone(), + account_meta_count: account_count as u32, + account_meta_entry_size: format.meta_entry_size as u32, + account_data_block_size: ACCOUNT_DATA_BLOCK_SIZE as u64, + owner_count: account_count as u32, + owner_entry_size: mem::size_of::() as u32, + // This number should be the total compressed account data size. + account_metas_offset: footer.account_metas_offset, + account_pubkeys_offset: footer.account_pubkeys_offset, + owners_offset: footer.account_pubkeys_offset + + (account_count * mem::size_of::()) as u64, + // TODO(yhchiang): reach out Brooks on how to obtain the new hash + hash: footer.hash, + // TODO(yhchiang): fix this + min_account_address: Hash::default(), + max_account_address: Hash::default(), + format_version: ACCOUNTS_DATA_STORAGE_FORMAT_VERSION, + footer_size: FOOTER_SIZE as u64, + }; + assert_eq!(*footer, expected_footer); + + let mut index = 0; + let mut count_from_ads = 0; + while let Some((account, next)) = ads.get_account(index) { + println!("account = {:?}", account); + index = next; + count_from_ads += 1; + let expected_account = &test_accounts[account.pubkey()]; + assert_eq!(account.clone_account(), expected_account.1); + + if hashes_map.len() > 0 { + let expected_hash = &hashes_map[account.pubkey()]; + assert_eq!(account.hash(), *expected_hash); + } + + let stored_meta_from_storage = StoredMeta { + write_version_obsolete: account.write_version(), + pubkey: *account.pubkey(), + data_len: account.data_len(), + }; + assert_eq!(stored_meta_from_storage, expected_account.0); + } + assert_eq!(&count_from_ads, &account_count); + } + + #[test] + fn test_write_from_append_vec_one_small() { + ads_writer_test_help( + "test_write_from_append_vec_one_small_hot", + &[255], + &HOT_FORMAT, + ); + ads_writer_test_help( + "test_write_from_append_vec_one_small_cold", + &[255], + &COLD_FORMAT, + ); + } + + #[test] + fn test_write_from_append_vec_one_big() { + ads_writer_test_help( + "test_write_from_append_vec_one_big_hot", + &[25500], + &HOT_FORMAT, + ); + ads_writer_test_help( + "test_write_from_append_vec_one_big_cold", + &[25500], + &COLD_FORMAT, + ); + } + + #[test] + fn test_write_from_append_vec_one_10_mb() { + ads_writer_test_help( + "test_write_from_append_vec_one_10_mb_hot", + &[10 * 1024 * 1024], + &HOT_FORMAT, + ); + ads_writer_test_help( + "test_write_from_append_vec_one_10_mb_cold", + &[10 * 1024 * 1024], + &COLD_FORMAT, + ); + } + + #[test] + fn test_write_from_append_vec_multiple_blobs() { + ads_writer_test_help( + "test_write_from_append_vec_multiple_blobs_hot", + &[5000, 6000, 7000, 8000, 5500, 10241023, 9999], + &HOT_FORMAT, + ); + ads_writer_test_help( + "test_write_from_append_vec_multiple_blobs_cold", + &[5000, 6000, 7000, 8000, 5500, 10241023, 9999], + &COLD_FORMAT, + ); + } + + #[test] + fn test_write_from_append_vec_one_data_block() { + ads_writer_test_help( + "test_write_from_append_vec_one_data_block_hot", + &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + &HOT_FORMAT, + ); + ads_writer_test_help( + "test_write_from_append_vec_one_data_block_cold", + &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + &COLD_FORMAT, + ); + } + + #[test] + fn test_write_from_append_vec_mixed_block() { + ads_writer_test_help( + "test_write_from_append_vec_mixed_block_hot", + &[ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1000, 2000, 3000, 4000, 9, 8, 7, 6, 5, 4, 3, 2, 1, + ], + &HOT_FORMAT, + ); + ads_writer_test_help( + "test_write_from_append_vec_mixed_block_cold", + &[ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1000, 2000, 3000, 4000, 9, 8, 7, 6, 5, 4, 3, 2, 1, + ], + &COLD_FORMAT, + ); + } +} + +/* +#[test] +fn test_account_metas_block() { + let path = get_append_vec_path("test_account_metas_block"); + + const ENTRY_COUNT: u64 = 128; + const TEST_LAMPORT_BASE: u64 = 48372; + const BLOCK_OFFSET_BASE: u64 = 3423; + const DATA_LENGTH: u16 = 976; + const TEST_RENT_EPOCH: Epoch = 327; + const TEST_WRITE_VERSION: StoredMetaWriteVersion = 543432; + let mut expected_metas: Vec = vec![]; + + { + let ads = TieredStorageWriter::new(&path.path); + let mut footer = TieredStorageFooter::new(); + let mut cursor = 0; + let meta_per_block = (ACCOUNT_DATA_BLOCK_SIZE as u16) / DATA_LENGTH; + for i in 0..ENTRY_COUNT { + expected_metas.push( + ColdAccountMeta::new() + .with_lamports(i * TEST_LAMPORT_BASE) + .with_block_offset(i * BLOCK_OFFSET_BASE) + .with_owner_local_id(i as u32) + .with_uncompressed_data_size(DATA_LENGTH) + .with_intra_block_offset(((i as u16) % meta_per_block) * DATA_LENGTH) + .with_flags( + AccountMetaFlags::new() + .with_bit(AccountMetaFlags::EXECUTABLE, i % 2 == 0) + .to_value(), + ) + .with_optional_fields(&AccountMetaOptionalFields { + rent_epoch: if i % 2 == 1 { + Some(TEST_RENT_EPOCH) + } else { + None + }, + account_hash: if i % 2 == 0 { + Some(Hash::new_unique()) + } else { + None + }, + write_version_obsolete: if i % 2 == 1 { + Some(TEST_WRITE_VERSION) + } else { + None + }, + }), + ); + } + ads.write_account_metas_block(&mut cursor, &mut footer, &expected_metas) + .unwrap(); + } + + let ads = TieredStorage::new_for_test(&path.path, false); + let metas: Vec = + ads.read_account_metas_block(0, ENTRY_COUNT as u32).unwrap(); + assert_eq!(expected_metas, metas); + for i in 0..ENTRY_COUNT as usize { + assert_eq!( + metas[i].flags_get(AccountMetaFlags::HAS_RENT_EPOCH), + i % 2 == 1 + ); + assert_eq!( + metas[i].flags_get(AccountMetaFlags::HAS_ACCOUNT_HASH), + i % 2 == 0 + ); + assert_eq!( + metas[i].flags_get(AccountMetaFlags::HAS_WRITE_VERSION), + i % 2 == 1 + ); + } +}*/ + +/* +fn verify_account_data_storage2( + account_count: usize, + test_accounts: &HashMap, + ads_path: &TempFile, + hashes_map: &HashMap, +) { + let ads = TieredStorage::new(&ads_path.path, false); + let footer = ads.footer().unwrap(); + + let expected_footer = TieredStorageFooter { + account_meta_count: account_count as u32, + account_meta_entry_size: , + account_data_block_size: ACCOUNT_DATA_BLOCK_SIZE as u64, + owner_count: account_count as u32, + owner_entry_size: mem::size_of::() as u32, + // This number should be the total compressed account data size. + account_metas_offset: footer.account_metas_offset, + account_pubkeys_offset: footer.account_pubkeys_offset, + owners_offset: footer.account_pubkeys_offset + + (account_count * mem::size_of::()) as u64, + // TODO(yhchiang): not yet implemented + data_block_format: AccountDataBlockFormat::Lz4, + // TODO(yhchiang): not yet implemented + hash: footer.hash, + // TODO(yhchiang): fix this + min_account_address: Hash::default(), + max_account_address: Hash::default(), + format_version: ACCOUNTS_DATA_STORAGE_FORMAT_VERSION, + footer_size: FOOTER_SIZE as u64, + }; + assert_eq!(*footer, expected_footer); + + let mut index = 0; + let mut count_from_ads = 0; + + while let Some((account, next)) = ads.get_account(index) { + index = next; + count_from_ads += 1; + let expected_account = &test_accounts[account.pubkey()]; + let expected_hash = &hashes_map[account.pubkey()]; + verify_account(&account, expected_account); + assert_eq!(account.hash(), *expected_hash); + } + assert_eq!(&count_from_ads, &account_count); +} +*/ diff --git a/runtime/src/tiered_storage/cold.rs b/runtime/src/tiered_storage/cold.rs new file mode 100644 index 00000000000000..1525fb4f1e9995 --- /dev/null +++ b/runtime/src/tiered_storage/cold.rs @@ -0,0 +1,530 @@ +use { + crate::{ + account_storage::meta::{StoredAccountMeta, StoredMetaWriteVersion}, + accounts_file::ALIGN_BOUNDARY_OFFSET, + append_vec::MatchAccountOwnerError, + tiered_storage::{ + data_block::AccountDataBlock, + file::TieredStorageFile, + footer::{ + AccountDataBlockFormat, AccountIndexFormat, AccountMetaFormat, OwnersBlockFormat, + TieredFileFormat, TieredStorageFooter, + }, + meta_entries::{ + get_compressed_block_size, AccountMetaFlags, AccountMetaOptionalFields, + TieredAccountMeta, ACCOUNT_DATA_ENTIRE_BLOCK, DEFAULT_ACCOUNT_HASH, + }, + reader::TieredStoredAccountMeta, + ACCOUNT_DATA_BLOCK_SIZE, + }, + }, + solana_sdk::{hash::Hash, pubkey::Pubkey, stake_history::Epoch}, + std::{collections::HashMap, path::Path}, +}; + +pub static COLD_FORMAT: TieredFileFormat = TieredFileFormat { + meta_entry_size: std::mem::size_of::(), + account_meta_format: AccountMetaFormat::Cold, + owners_block_format: OwnersBlockFormat::LocalIndex, + account_index_format: AccountIndexFormat::Linear, + data_block_format: AccountDataBlockFormat::Lz4, +}; + +#[derive(Debug)] +pub struct ColdStorageReader { + pub(crate) footer: TieredStorageFooter, + pub(crate) metas: Vec, + accounts: Vec, + owners: Vec, + data_blocks: HashMap>, +} + +impl ColdStorageReader { + pub fn new_from_file(file_path: impl AsRef) -> std::io::Result { + let storage = TieredStorageFile::new(file_path, false /* create */); + let footer = ColdReaderBuilder::read_footer_block(&storage)?; + + let metas = ColdReaderBuilder::read_account_metas_block(&storage, &footer)?; + let accounts = ColdReaderBuilder::read_account_addresses_block(&storage, &footer)?; + let owners = ColdReaderBuilder::read_owners_block(&storage, &footer)?; + let data_blocks = ColdReaderBuilder::read_data_blocks(&storage, &footer, &metas)?; + + Ok(ColdStorageReader { + footer, + metas, + accounts, + owners, + data_blocks, + }) + } + + pub fn num_accounts(&self) -> usize { + self.footer.account_meta_count.try_into().unwrap() + } + + fn multiplied_index_to_index(multiplied_index: usize) -> usize { + // This is a temporary workaround to work with existing AccountInfo + // implementation that ties to AppendVec with the assumption that the offset + // is a multiple of ALIGN_BOUNDARY_OFFSET, while tiered storage actually talks + // about index instead of offset. + multiplied_index / ALIGN_BOUNDARY_OFFSET + } + + pub fn account_matches_owners( + &self, + multiplied_index: usize, + owners: &[&Pubkey], + ) -> Result { + let index = Self::multiplied_index_to_index(multiplied_index); + if index >= self.metas.len() { + return Err(MatchAccountOwnerError::UnableToLoad); + } + + owners + .iter() + .position(|entry| &&self.owners[self.metas[index].owner_local_id() as usize] == entry) + .ok_or(MatchAccountOwnerError::NoMatch) + } + + pub fn get_account<'a>( + &'a self, + multiplied_index: usize, + ) -> Option<(StoredAccountMeta<'a>, usize)> { + let index = Self::multiplied_index_to_index(multiplied_index); + if index >= self.metas.len() { + return None; + } + if let Some(data_block) = self.data_blocks.get(&self.metas[index].block_offset()) { + return Some(( + StoredAccountMeta::Cold(TieredStoredAccountMeta { + meta: &self.metas[index], + pubkey: &self.accounts[index], + owner: &self.owners[self.metas[index].owner_local_id() as usize], + index: multiplied_index, + data_block: data_block, + }), + multiplied_index + ALIGN_BOUNDARY_OFFSET, + )); + } + None + } +} + +pub(crate) struct ColdReaderBuilder {} + +impl ColdReaderBuilder { + fn read_footer_block(storage: &TieredStorageFile) -> std::io::Result { + TieredStorageFooter::new_from_footer_block(&storage) + } + + fn read_account_metas_block( + storage: &TieredStorageFile, + footer: &TieredStorageFooter, + ) -> std::io::Result> { + let mut metas: Vec = + Vec::with_capacity(footer.account_meta_count as usize); + + (&storage).seek(footer.account_metas_offset)?; + + for _ in 0..footer.account_meta_count { + metas.push(ColdAccountMeta::new_from_file(&storage)?); + } + + Ok(metas) + } + + fn read_account_addresses_block( + storage: &TieredStorageFile, + footer: &TieredStorageFooter, + ) -> std::io::Result> { + Self::read_pubkeys_block( + storage, + footer.account_pubkeys_offset, + footer.account_meta_count, + ) + } + + fn read_owners_block( + storage: &TieredStorageFile, + footer: &TieredStorageFooter, + ) -> std::io::Result> { + Self::read_pubkeys_block(storage, footer.owners_offset, footer.owner_count) + } + + fn read_pubkeys_block( + storage: &TieredStorageFile, + offset: u64, + count: u32, + ) -> std::io::Result> { + let mut addresses: Vec = Vec::with_capacity(count as usize); + (&storage).seek(offset)?; + for _ in 0..count { + let mut pubkey = Pubkey::default(); + (&storage).read_type(&mut pubkey)?; + addresses.push(pubkey); + } + + Ok(addresses) + } + + pub fn read_data_blocks( + storage: &TieredStorageFile, + footer: &TieredStorageFooter, + metas: &Vec, + ) -> std::io::Result>> { + let count = footer.account_meta_count as usize; + let mut data_blocks = HashMap::>::new(); + for i in 0..count { + Self::update_data_block_map(&mut data_blocks, storage, footer, metas, i)?; + } + Ok(data_blocks) + } + + fn update_data_block_map( + data_blocks: &mut HashMap>, + storage: &TieredStorageFile, + footer: &TieredStorageFooter, + metas: &Vec, + index: usize, + ) -> std::io::Result<()> { + let block_offset = &metas[index].block_offset(); + if !data_blocks.contains_key(&block_offset) { + let data_block = Self::read_data_block(storage, footer, metas, index).unwrap(); + + data_blocks.insert(metas[index].block_offset(), data_block); + } + Ok(()) + } + + pub fn read_data_block( + storage: &TieredStorageFile, + footer: &TieredStorageFooter, + metas: &Vec, + index: usize, + ) -> std::io::Result> { + let compressed_block_size = get_compressed_block_size(footer, metas, index) as usize; + + (&storage).seek(metas[index].block_offset())?; + + let mut buffer: Vec = vec![0; compressed_block_size]; + (&storage).read_bytes(&mut buffer)?; + + // TODO(yhchiang): encoding from footer + Ok(AccountDataBlock::decode( + AccountDataBlockFormat::Lz4, + &buffer[..], + )?) + } +} + +#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)] +#[repr(C)] +pub struct ColdAccountMeta { + lamports: u64, + block_offset: u64, + uncompressed_data_size: u16, + intra_block_offset: u16, + owner_local_id: u32, + flags: u32, + // It will still be 32 bytes even without this field. + _unused: u32, +} + +impl TieredAccountMeta for ColdAccountMeta { + fn new() -> Self { + Self { + ..ColdAccountMeta::default() + } + } + + fn is_blob_account_data(data_len: u64) -> bool { + data_len > ACCOUNT_DATA_BLOCK_SIZE as u64 + } + + fn with_lamports(&mut self, lamports: u64) -> &mut Self { + self.lamports = lamports; + self + } + + fn with_block_offset(&mut self, offset: u64) -> &mut Self { + self.block_offset = offset; + self + } + + fn with_data_tailing_paddings(&mut self, _paddings: u8) -> &mut Self { + // cold storage never has paddings. + self + } + + fn with_owner_local_id(&mut self, local_id: u32) -> &mut Self { + self.owner_local_id = local_id; + self + } + + fn with_uncompressed_data_size(&mut self, data_size: u64) -> &mut Self { + assert!(ACCOUNT_DATA_ENTIRE_BLOCK <= std::u16::MAX); + if data_size >= ACCOUNT_DATA_ENTIRE_BLOCK as u64 { + self.uncompressed_data_size = ACCOUNT_DATA_ENTIRE_BLOCK; + } else { + self.uncompressed_data_size = data_size as u16; + } + self + } + + fn with_intra_block_offset(&mut self, offset: u16) -> &mut Self { + self.intra_block_offset = offset; + self + } + + fn with_flags(&mut self, flags: u32) -> &mut Self { + self.flags = flags; + self + } + + fn with_optional_fields(&mut self, fields: &AccountMetaOptionalFields) -> &mut Self { + fields.update_flags(&mut self.flags); + + self + } + + fn lamports(&self) -> u64 { + self.lamports + } + + fn block_offset(&self) -> u64 { + self.block_offset + } + + fn set_block_offset(&mut self, offset: u64) { + self.block_offset = offset; + } + + fn padding_bytes(&self) -> u8 { + 0u8 + } + + fn intra_block_offset(&self) -> u16 { + self.intra_block_offset + } + + fn owner_local_id(&self) -> u32 { + self.owner_local_id + } + + fn flags_get(&self, bit_field: u32) -> bool { + AccountMetaFlags::get(&self.flags, bit_field) + } + + fn uncompressed_data_size(&self) -> usize { + self.uncompressed_data_size as usize + } + + fn rent_epoch(&self, data_block: &[u8]) -> Option { + let offset = self.optional_fields_offset(data_block); + if self.flags_get(AccountMetaFlags::HAS_RENT_EPOCH) { + unsafe { + let unaligned = + std::ptr::addr_of!(data_block[offset..offset + std::mem::size_of::()]) + as *const Epoch; + return Some(std::ptr::read_unaligned(unaligned)); + } + } + None + } + + fn account_hash<'a>(&self, data_block: &'a [u8]) -> &'a Hash { + let mut offset = self.optional_fields_offset(data_block); + if self.flags_get(AccountMetaFlags::HAS_RENT_EPOCH) { + offset += std::mem::size_of::(); + } + if self.flags_get(AccountMetaFlags::HAS_ACCOUNT_HASH) { + unsafe { + let raw_ptr = std::slice::from_raw_parts( + data_block[offset..offset + std::mem::size_of::()].as_ptr() as *const u8, + std::mem::size_of::(), + ); + let ptr: *const Hash = raw_ptr.as_ptr() as *const Hash; + return &*ptr; + } + } + return &DEFAULT_ACCOUNT_HASH; + } + + fn write_version(&self, data_block: &[u8]) -> Option { + let mut offset = self.optional_fields_offset(data_block); + if self.flags_get(AccountMetaFlags::HAS_RENT_EPOCH) { + offset += std::mem::size_of::(); + } + if self.flags_get(AccountMetaFlags::HAS_ACCOUNT_HASH) { + offset += std::mem::size_of::(); + } + if self.flags_get(AccountMetaFlags::HAS_WRITE_VERSION) { + unsafe { + let unaligned = std::ptr::addr_of!( + data_block[offset..offset + std::mem::size_of::()] + ) as *const StoredMetaWriteVersion; + return Some(std::ptr::read_unaligned(unaligned)); + } + } + None + } + + fn data_len(&self, data_block: &[u8]) -> usize { + self.optional_fields_offset(data_block) + .saturating_sub(self.intra_block_offset as usize) + } + + fn optional_fields_offset<'a>(&self, data_block: &'a [u8]) -> usize { + if self.is_blob_account() { + return data_block.len().saturating_sub(self.optional_fields_size()); + } + (self.intra_block_offset + self.uncompressed_data_size) as usize + } + + fn account_data<'a>(&self, data_block: &'a [u8]) -> &'a [u8] { + &data_block[(self.intra_block_offset as usize)..self.optional_fields_offset(data_block)] + } + + fn is_blob_account(&self) -> bool { + self.uncompressed_data_size == ACCOUNT_DATA_ENTIRE_BLOCK && self.intra_block_offset == 0 + } + + fn write_account_meta_entry(&self, ads_file: &TieredStorageFile) -> std::io::Result { + ads_file.write_type(self)?; + + Ok(std::mem::size_of::()) + } + + fn stored_size( + footer: &TieredStorageFooter, + metas: &Vec, + i: usize, + ) -> usize { + let compressed_block_size = get_compressed_block_size(footer, metas, i); + + let data_size = if metas[i].is_blob_account() { + compressed_block_size + } else { + let compression_rate: f64 = + compressed_block_size as f64 / Self::get_raw_block_size(metas, i) as f64; + + ((metas[i].uncompressed_data_size() as usize + metas[i].optional_fields_size()) as f64 + / compression_rate) as usize + }; + + return std::mem::size_of::() + data_size; + } +} + +impl ColdAccountMeta { + fn new_from_file(ads_file: &TieredStorageFile) -> std::io::Result { + let mut entry = ColdAccountMeta::new(); + ads_file.read_type(&mut entry)?; + + Ok(entry) + } + + pub fn get_raw_block_size(metas: &Vec, index: usize) -> usize { + let mut block_size = 0; + + for i in index..metas.len() { + if metas[i].block_offset() == metas[index].block_offset() { + block_size += metas[i].uncompressed_data_size(); + } else { + break; + } + } + + block_size.try_into().unwrap() + } +} + +impl Default for ColdAccountMeta { + fn default() -> Self { + Self { + lamports: 0, + block_offset: 0, + owner_local_id: 0, + uncompressed_data_size: 0, + intra_block_offset: 0, + flags: AccountMetaFlags::new().to_value(), + _unused: 0, + } + } +} + +#[cfg(test)] +pub mod tests { + use { + crate::{ + account_storage::meta::StoredMetaWriteVersion, + append_vec::test_utils::get_append_vec_path, + tiered_storage::{ + cold::ColdAccountMeta, + file::TieredStorageFile, + meta_entries::{AccountMetaFlags, AccountMetaOptionalFields, TieredAccountMeta}, + }, + }, + ::solana_sdk::{hash::Hash, stake_history::Epoch}, + memoffset::offset_of, + }; + + #[test] + fn test_account_meta_entry() { + let path = get_append_vec_path("test_account_meta_entry"); + + const TEST_LAMPORT: u64 = 7; + const BLOCK_OFFSET: u64 = 56987; + const OWNER_LOCAL_ID: u32 = 54; + const UNCOMPRESSED_LENGTH: u64 = 255; + const LOCAL_OFFSET: u16 = 82; + const TEST_RENT_EPOCH: Epoch = 7; + const TEST_WRITE_VERSION: StoredMetaWriteVersion = 0; + + let optional_fields = AccountMetaOptionalFields { + rent_epoch: Some(TEST_RENT_EPOCH), + account_hash: Some(Hash::new_unique()), + write_version_obsolete: Some(TEST_WRITE_VERSION), + }; + + let mut expected_entry = ColdAccountMeta::new(); + expected_entry + .with_lamports(TEST_LAMPORT) + .with_block_offset(BLOCK_OFFSET) + .with_owner_local_id(OWNER_LOCAL_ID) + .with_uncompressed_data_size(UNCOMPRESSED_LENGTH) + .with_intra_block_offset(LOCAL_OFFSET) + .with_flags( + AccountMetaFlags::new() + .with_bit(AccountMetaFlags::EXECUTABLE, true) + .to_value(), + ) + .with_optional_fields(&optional_fields); + + { + let mut ads_file = TieredStorageFile::new(&path.path, true); + expected_entry + .write_account_meta_entry(&mut ads_file) + .unwrap(); + } + + let mut ads_file = TieredStorageFile::new(&path.path, true); + let entry = ColdAccountMeta::new_from_file(&mut ads_file).unwrap(); + + assert_eq!(expected_entry, entry); + assert_eq!(entry.flags_get(AccountMetaFlags::EXECUTABLE), true); + assert_eq!(entry.flags_get(AccountMetaFlags::HAS_RENT_EPOCH), true); + } + + #[test] + fn test_cold_account_meta_layout() { + const COLD_META_SIZE_BYTES: usize = 32; + assert_eq!(offset_of!(ColdAccountMeta, lamports), 0x00); + assert_eq!(offset_of!(ColdAccountMeta, block_offset), 0x08); + assert_eq!(offset_of!(ColdAccountMeta, uncompressed_data_size), 0x10); + assert_eq!(offset_of!(ColdAccountMeta, intra_block_offset), 0x12); + assert_eq!(offset_of!(ColdAccountMeta, owner_local_id), 0x14); + assert_eq!(offset_of!(ColdAccountMeta, flags), 0x18); + assert_eq!(std::mem::size_of::(), COLD_META_SIZE_BYTES); + } +} diff --git a/runtime/src/tiered_storage/data_block.rs b/runtime/src/tiered_storage/data_block.rs new file mode 100644 index 00000000000000..3fb5bb6d6d7ebd --- /dev/null +++ b/runtime/src/tiered_storage/data_block.rs @@ -0,0 +1,132 @@ +use { + crate::tiered_storage::footer::AccountDataBlockFormat, + std::{ + io::{Cursor, Error, Read, Write}, + mem, + }, +}; + +enum AccountDataBlockEncoder { + Raw(Cursor>), + Lz4(lz4::Encoder>), +} + +pub struct AccountDataBlockWriter { + len: usize, + encoder: AccountDataBlockEncoder, +} + +impl AccountDataBlockWriter { + pub fn new(encoding: AccountDataBlockFormat) -> Self { + Self { + len: 0, + encoder: match encoding { + AccountDataBlockFormat::AlignedRaw => { + AccountDataBlockEncoder::Raw(Cursor::new(Vec::new())) + } + AccountDataBlockFormat::Lz4 => AccountDataBlockEncoder::Lz4( + lz4::EncoderBuilder::new() + .level(1) + .build(Vec::new()) + .unwrap(), + ), + }, + } + } + + pub fn len(&self) -> usize { + self.len + } + + pub fn write_type(&mut self, value: &T) -> Result { + let len = mem::size_of::(); + unsafe { + let ptr = std::slice::from_raw_parts((value as *const T) as *const u8, len); + // self.len has been updated here + self.write(ptr, len)?; + } + Ok(len) + } + + pub fn write(&mut self, buf: &[u8], len: usize) -> Result<(), Error> { + let result = match &mut self.encoder { + AccountDataBlockEncoder::Raw(cursor) => cursor.write_all(&buf[0..len]), + AccountDataBlockEncoder::Lz4(lz4_encoder) => lz4_encoder.write_all(&buf[0..len]), + }; + if result.is_ok() { + self.len += len; + } + result + } + + pub fn finish(self) -> Result<(Vec, usize), Error> { + match self.encoder { + AccountDataBlockEncoder::Raw(cursor) => Ok((cursor.into_inner(), self.len)), + AccountDataBlockEncoder::Lz4(lz4_encoder) => { + let (compressed_block, result) = lz4_encoder.finish(); + result?; + Ok((compressed_block, self.len)) + } + } + } +} + +pub struct AccountDataBlock {} + +impl AccountDataBlock { + pub fn decode(encoding: AccountDataBlockFormat, input: &[u8]) -> Result, Error> { + match encoding { + AccountDataBlockFormat::Lz4 => { + let mut decoder = lz4::Decoder::new(input).unwrap(); + let mut output: Vec = vec![]; + decoder.read_to_end(&mut output)?; + Ok(output) + } + AccountDataBlockFormat::AlignedRaw => panic!("Not implemented"), + } + } +} + +/* + #[test] + fn test_compress_and_decompress() { + let path = get_append_vec_path("test_compress_and_decompress"); + + const DATA_SIZE: usize = 10 * 1024 * 1024; + let mut expected_data: Vec = Vec::with_capacity(DATA_SIZE); + let compressed_len_full; + let compressed_len_partial; + let mut byte = rand::random::(); + for i in 0..DATA_SIZE { + if i % 64 == 0 { + byte = rand::random::(); + } + expected_data.push(byte); + } + + { + let ads = TieredStorage::new_for_test(&path.path, true); + compressed_len_full = ads + .compress_and_write(&expected_data, expected_data.len()) + .unwrap(); + compressed_len_partial = ads + .compress_and_write(&expected_data, expected_data.len() / 2) + .unwrap(); + } + // Make sure we do compress data + assert!(compressed_len_full > 0); + assert!(DATA_SIZE / 2 > compressed_len_full); + + let ads = TieredStorage::new_for_test(&path.path, false); + + // full data + let mut data = ads.decompress_and_read(compressed_len_full).unwrap(); + assert_eq!(expected_data, data); + + // partial data + data = ads.decompress_and_read(compressed_len_partial).unwrap(); + assert_eq!(data.len(), expected_data.len() / 2); + assert_eq!(&expected_data[0..expected_data.len() / 2], data); + } + +*/ diff --git a/runtime/src/tiered_storage/file.rs b/runtime/src/tiered_storage/file.rs new file mode 100644 index 00000000000000..655e6670f64a68 --- /dev/null +++ b/runtime/src/tiered_storage/file.rs @@ -0,0 +1,71 @@ +use std::{ + fs::{File, OpenOptions}, + io::{Read, Seek, SeekFrom, Write}, + mem, + path::Path, +}; + +#[derive(Debug)] +pub struct TieredStorageFile { + pub file: File, +} + +impl TieredStorageFile { + pub fn new>(file_path: P, create: bool) -> Self { + let file = OpenOptions::new() + .read(true) + .write(true) + .create(create) + .open(file_path.as_ref()) + .map_err(|e| { + panic!( + "Unable to {} data file {} in current dir({:?}): {:?}", + if create { "create" } else { "open" }, + file_path.as_ref().display(), + std::env::current_dir(), + e + ); + }) + .unwrap(); + Self { file } + } + + pub fn write_type(&self, value: &T) -> Result { + unsafe { + let ptr = + std::slice::from_raw_parts((value as *const T) as *const u8, mem::size_of::()); + (&self.file).write_all(ptr)?; + } + Ok(std::mem::size_of::()) + } + + pub fn read_type(&self, value: &mut T) -> Result<(), std::io::Error> { + unsafe { + // TODO(yhchiang): this requires alignment + let ptr = + std::slice::from_raw_parts_mut((value as *mut T) as *mut u8, mem::size_of::()); + (&self.file).read_exact(ptr)?; + } + Ok(()) + } + + pub fn seek(&self, offset: u64) -> Result { + (&self.file).seek(SeekFrom::Start(offset)) + } + + pub fn seek_from_end(&self, offset: i64) -> Result { + (&self.file).seek(SeekFrom::End(offset)) + } + + pub fn write_bytes(&self, bytes: &[u8]) -> Result { + (&self.file).write_all(bytes)?; + + Ok(bytes.len()) + } + + pub fn read_bytes(&self, buffer: &mut [u8]) -> Result<(), std::io::Error> { + (&self.file).read_exact(buffer)?; + + Ok(()) + } +} diff --git a/runtime/src/tiered_storage/footer.rs b/runtime/src/tiered_storage/footer.rs new file mode 100644 index 00000000000000..9deb906d84772d --- /dev/null +++ b/runtime/src/tiered_storage/footer.rs @@ -0,0 +1,364 @@ +use { + crate::tiered_storage::{file::TieredStorageFile, mmap_utils::get_type}, + memmap2::Mmap, + serde::{Deserialize, Serialize}, + solana_sdk::hash::Hash, + std::{mem, path::Path}, +}; + +// The size of the footer struct + the u64 magic number at the end. +pub(crate) const FOOTER_SIZE: i64 = + (mem::size_of::() + mem::size_of::()) as i64; +pub(crate) const FOOTER_TAIL_SIZE: i64 = 24; + +pub(crate) const FOOTER_MAGIC_NUMBER: u64 = 0x501A2AB5; // SOLALABS -> SOLANA LABS + +#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)] +#[repr(C)] +pub struct TieredStorageMagicNumber { + pub magic: u64, +} + +impl TieredStorageMagicNumber { + pub fn new() -> Self { + Self { magic: 0 } + } + pub fn default() -> Self { + Self { + magic: FOOTER_MAGIC_NUMBER, + } + } +} + +#[repr(u64)] +#[derive( + Clone, + Copy, + Debug, + Eq, + Hash, + PartialEq, + Deserialize, + num_enum::IntoPrimitive, + Serialize, + num_enum::TryFromPrimitive, +)] +#[serde(into = "u64", try_from = "u64")] +pub enum AccountMetaFormat { + Cold = 0u64, + Hot = 1u64, +} + +impl Default for AccountMetaFormat { + fn default() -> Self { + AccountMetaFormat::Cold + } +} + +#[repr(u64)] +#[derive( + Clone, + Copy, + Debug, + Eq, + Hash, + PartialEq, + Deserialize, + num_enum::IntoPrimitive, + Serialize, + num_enum::TryFromPrimitive, +)] +#[serde(into = "u64", try_from = "u64")] +pub enum AccountDataBlockFormat { + AlignedRaw = 0u64, + Lz4 = 1u64, +} + +impl Default for AccountDataBlockFormat { + fn default() -> Self { + AccountDataBlockFormat::Lz4 + } +} + +#[repr(u64)] +#[derive( + Clone, + Copy, + Debug, + Eq, + Hash, + PartialEq, + Deserialize, + num_enum::IntoPrimitive, + Serialize, + num_enum::TryFromPrimitive, +)] +#[serde(into = "u64", try_from = "u64")] +pub enum OwnersBlockFormat { + LocalIndex = 0u64, +} + +impl Default for OwnersBlockFormat { + fn default() -> Self { + OwnersBlockFormat::LocalIndex + } +} + +#[repr(u64)] +#[derive( + Clone, + Copy, + Debug, + Eq, + Hash, + PartialEq, + Deserialize, + num_enum::IntoPrimitive, + Serialize, + num_enum::TryFromPrimitive, +)] +#[serde(into = "u64", try_from = "u64")] +pub enum AccountIndexFormat { + // This format does not support any fast lookup. + // Any query from account hash to account meta requires linear search. + Linear = 0u64, +} + +impl Default for AccountIndexFormat { + fn default() -> Self { + AccountIndexFormat::Linear + } +} + +#[derive(Debug)] +pub struct TieredFileFormat { + pub meta_entry_size: usize, + pub account_meta_format: AccountMetaFormat, + pub owners_block_format: OwnersBlockFormat, + pub account_index_format: AccountIndexFormat, + pub data_block_format: AccountDataBlockFormat, +} + +#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Clone)] +#[repr(C)] +pub struct TieredStorageFooter { + // formats + pub account_meta_format: AccountMetaFormat, + pub owners_block_format: OwnersBlockFormat, + pub account_index_format: AccountIndexFormat, + pub data_block_format: AccountDataBlockFormat, + + // regular accounts' stats + pub account_meta_count: u32, + pub account_meta_entry_size: u32, + pub account_data_block_size: u64, + + // owner's stats + pub owner_count: u32, + pub owner_entry_size: u32, + + // offsets + pub account_metas_offset: u64, + pub account_pubkeys_offset: u64, + pub owners_offset: u64, + + // misc + pub hash: Hash, + + // account range + pub min_account_address: Hash, + pub max_account_address: Hash, + + // tailing information + pub footer_size: u64, + pub format_version: u64, + // This field is persisted in the storage but not in this struct. + // pub magic_number: u64, // FOOTER_MAGIC_NUMBER +} + +impl TieredStorageFooter { + pub fn new() -> Self { + Self { ..Self::default() } + } +} + +impl Default for TieredStorageFooter { + fn default() -> Self { + Self { + account_meta_format: AccountMetaFormat::default(), + owners_block_format: OwnersBlockFormat::default(), + account_index_format: AccountIndexFormat::default(), + data_block_format: AccountDataBlockFormat::default(), + account_meta_count: 0, + account_meta_entry_size: 0, + account_data_block_size: 0, + owner_count: 0, + owner_entry_size: 0, + account_metas_offset: 0, + account_pubkeys_offset: 0, + owners_offset: 0, + hash: Hash::new_unique(), + min_account_address: Hash::default(), + max_account_address: Hash::default(), + footer_size: FOOTER_SIZE as u64, + format_version: 1, + } + } +} + +impl TieredStorageFooter { + pub fn new_from_path>(path: P) -> std::io::Result { + let storage = TieredStorageFile::new(path, false /* create */); + Self::new_from_footer_block(&storage) + } + + pub fn write_footer_block(&self, ads_file: &TieredStorageFile) -> std::io::Result<()> { + println!("write_footer_block({:?}", self); + ads_file.write_type(self)?; + ads_file.write_type(&TieredStorageMagicNumber::default())?; + + Ok(()) + } + + pub fn new_from_footer_block(ads_file: &TieredStorageFile) -> std::io::Result { + let mut footer_size: u64 = 0; + let mut footer_version: u64 = 0; + let mut magic_number = TieredStorageMagicNumber::new(); + + ads_file.seek_from_end(-FOOTER_TAIL_SIZE)?; + ads_file.read_type(&mut footer_size)?; + ads_file.read_type(&mut footer_version)?; + ads_file.read_type(&mut magic_number)?; + + if magic_number != TieredStorageMagicNumber::default() { + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + "TieredStorageError: Magic mumber mismatch", + )); + } + + let mut footer = Self::new(); + ads_file.seek_from_end(-(footer_size as i64))?; + ads_file.read_type(&mut footer)?; + + Ok(footer) + } + + pub fn new_from_mmap(map: &Mmap) -> std::io::Result<&TieredStorageFooter> { + let offset = map.len().saturating_sub(FOOTER_TAIL_SIZE as usize); + println!("offset = {}", offset); + let (footer_size, offset): (&u64, _) = get_type(map, offset)?; + println!("footer_size = {}, {}", footer_size, offset); + let (_footer_version, offset): (&u64, _) = get_type(map, offset)?; + println!("footer_version = {}, {}", _footer_version, offset); + let (magic_number, _offset): (&TieredStorageMagicNumber, _) = get_type(&map, offset)?; + println!("magic_number = {:?}, {}", magic_number, offset); + + if *magic_number != TieredStorageMagicNumber::default() { + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + "TieredStorageError: Magic mumber mismatch", + )); + } + + let (footer, _offset): (&TieredStorageFooter, _) = + get_type(map, map.len().saturating_sub(*footer_size as usize))?; + + Ok(footer) + } +} + +#[cfg(test)] +pub mod tests { + use { + crate::{ + append_vec::test_utils::get_append_vec_path, + tiered_storage::{ + file::TieredStorageFile, + footer::{ + AccountDataBlockFormat, AccountIndexFormat, AccountMetaFormat, + OwnersBlockFormat, TieredStorageFooter, FOOTER_SIZE, + }, + }, + }, + memoffset::offset_of, + solana_sdk::hash::Hash, + std::mem, + }; + + #[test] + /// Make sure the in-memory size is what we expected. + fn test_footer_size() { + assert_eq!( + mem::size_of::() + mem::size_of::(), + FOOTER_SIZE as usize + ); + } + + #[test] + fn test_footer() { + let path = get_append_vec_path("test_file_footer"); + let expected_footer = TieredStorageFooter { + account_meta_format: AccountMetaFormat::Hot, + owners_block_format: OwnersBlockFormat::LocalIndex, + account_index_format: AccountIndexFormat::Linear, + data_block_format: AccountDataBlockFormat::AlignedRaw, + account_meta_count: 300, + account_meta_entry_size: 24, + account_data_block_size: 4096, + owner_count: 250, + owner_entry_size: 32, + account_metas_offset: 1062400, + account_pubkeys_offset: 1069600, + owners_offset: 1081200, + hash: Hash::new_unique(), + min_account_address: Hash::default(), + max_account_address: Hash::default(), + footer_size: FOOTER_SIZE as u64, + format_version: 1, + }; + + { + let ads_file = TieredStorageFile::new(&path.path, true); + expected_footer.write_footer_block(&ads_file).unwrap(); + } + + // Reopen the same storage, and expect the persisted footer is + // the same as what we have written. + { + let ads_file = TieredStorageFile::new(&path.path, true); + let footer = TieredStorageFooter::new_from_footer_block(&ads_file).unwrap(); + assert_eq!(expected_footer, footer); + } + } + + #[test] + fn test_footer_layout() { + assert_eq!(offset_of!(TieredStorageFooter, account_meta_format), 0x00); + assert_eq!(offset_of!(TieredStorageFooter, owners_block_format), 0x08); + assert_eq!(offset_of!(TieredStorageFooter, account_index_format), 0x10); + assert_eq!(offset_of!(TieredStorageFooter, data_block_format), 0x18); + assert_eq!(offset_of!(TieredStorageFooter, account_meta_count), 0x20); + assert_eq!( + offset_of!(TieredStorageFooter, account_meta_entry_size), + 0x24 + ); + assert_eq!( + offset_of!(TieredStorageFooter, account_data_block_size), + 0x28 + ); + assert_eq!(offset_of!(TieredStorageFooter, owner_count), 0x30); + assert_eq!(offset_of!(TieredStorageFooter, owner_entry_size), 0x34); + assert_eq!(offset_of!(TieredStorageFooter, account_metas_offset), 0x38); + assert_eq!( + offset_of!(TieredStorageFooter, account_pubkeys_offset), + 0x40 + ); + assert_eq!(offset_of!(TieredStorageFooter, owners_offset), 0x48); + assert_eq!(offset_of!(TieredStorageFooter, hash), 0x50); + assert_eq!(offset_of!(TieredStorageFooter, min_account_address), 0x70); + assert_eq!(offset_of!(TieredStorageFooter, max_account_address), 0x90); + assert_eq!(offset_of!(TieredStorageFooter, footer_size), 0xB0); + assert_eq!(offset_of!(TieredStorageFooter, format_version), 0xB8); + } +} diff --git a/runtime/src/tiered_storage/hot.rs b/runtime/src/tiered_storage/hot.rs new file mode 100644 index 00000000000000..27b2c605c70fa2 --- /dev/null +++ b/runtime/src/tiered_storage/hot.rs @@ -0,0 +1,511 @@ +#![allow(unused_imports)] +use { + crate::{ + account_storage::meta::{StoredAccountMeta, StoredMetaWriteVersion}, + accounts_file::ALIGN_BOUNDARY_OFFSET, + append_vec::MatchAccountOwnerError, + tiered_storage::{ + data_block::AccountDataBlock, + file::TieredStorageFile, + footer::{ + AccountDataBlockFormat, AccountIndexFormat, AccountMetaFormat, OwnersBlockFormat, + TieredFileFormat, TieredStorageFooter, TieredStorageMagicNumber, + FOOTER_MAGIC_NUMBER, FOOTER_TAIL_SIZE, + }, + meta_entries::{ + get_compressed_block_size, AccountMetaFlags, AccountMetaOptionalFields, + TieredAccountMeta, ACCOUNT_DATA_ENTIRE_BLOCK, DEFAULT_ACCOUNT_HASH, + }, + mmap_utils::{get_slice, get_type}, + reader::{TieredStorageReader, TieredStoredAccountMeta}, + }, + u64_align, + }, + memmap2::{Mmap, MmapOptions}, + solana_sdk::{hash::Hash, pubkey::Pubkey, stake_history::Epoch}, + std::{collections::HashMap, fs::OpenOptions, mem::size_of, option::Option, path::Path}, +}; + +const BLOCK_OFFSET_MASK: u64 = 0x00ff_ffff_ffff_ffff; +const CLEAR_BLOCK_OFFSET_MASK: u64 = 0xff00_0000_0000_0000; +const PADDINGS_MASK: u64 = 0x0700_0000_0000_0000; +const CLEAR_PADDINGS_MASK: u64 = 0xf8ff_ffff_ffff_ffff; +const PADDINGS_SHIFT: u64 = 56; + +pub static HOT_FORMAT: TieredFileFormat = TieredFileFormat { + meta_entry_size: std::mem::size_of::(), + account_meta_format: AccountMetaFormat::Hot, + owners_block_format: OwnersBlockFormat::LocalIndex, + account_index_format: AccountIndexFormat::Linear, + data_block_format: AccountDataBlockFormat::AlignedRaw, +}; + +#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)] +#[repr(C)] +pub struct HotAccountMeta { + lamports: u64, + // the high 8-bits are used to store padding and data block + // format information. + // Use block_offset() to obtain the actual block offset. + block_offset_info: u64, + owner_index: u32, + flags: u32, +} + +impl HotAccountMeta { + #[allow(dead_code)] + fn new_from_file(ads_file: &TieredStorageFile) -> std::io::Result { + let mut entry = HotAccountMeta::new(); + ads_file.read_type(&mut entry)?; + + Ok(entry) + } + + fn set_padding_bytes(&mut self, paddings: u8) { + assert!(paddings <= 7); + self.block_offset_info &= CLEAR_PADDINGS_MASK; + self.block_offset_info |= (paddings as u64) << PADDINGS_SHIFT; + } + + fn get_type<'a, T>(data_block: &'a [u8], offset: usize) -> &'a T { + unsafe { + let raw_ptr = std::slice::from_raw_parts( + data_block[offset..offset + std::mem::size_of::()].as_ptr() as *const u8, + std::mem::size_of::(), + ); + let ptr: *const T = raw_ptr.as_ptr() as *const T; + return &*ptr; + } + } +} + +impl TieredAccountMeta for HotAccountMeta { + fn new() -> Self { + HotAccountMeta { + lamports: 0, + block_offset_info: 0, + owner_index: 0, + flags: 0, + } + } + + fn is_blob_account_data(_data_len: u64) -> bool { + true + } + + fn lamports(&self) -> u64 { + self.lamports + } + + fn with_lamports(&mut self, lamports: u64) -> &mut Self { + self.lamports = lamports; + self + } + + fn with_block_offset(&mut self, offset: u64) -> &mut Self { + self.set_block_offset(offset); + self + } + + fn with_data_tailing_paddings(&mut self, paddings: u8) -> &mut Self { + self.set_padding_bytes(paddings); + self + } + + fn with_owner_local_id(&mut self, owner_index: u32) -> &mut Self { + self.owner_index = owner_index; + self + } + + fn with_uncompressed_data_size(&mut self, data_size: u64) -> &mut Self { + // Hot meta derives its data length by comparing two consecutive offsets. + // TODO(yhchiang): invoke with_paddings() here. + println!("data_size = {}", data_size); + println!("paddings = {}", ((8 - (data_size % 8)) % 8) as u8); + self.set_padding_bytes(((8 - (data_size % 8)) % 8) as u8); + self + } + + fn with_intra_block_offset(&mut self, _offset: u16) -> &mut Self { + // hot meta always have intra block offset equals to 0 except + // its block_offset_info indocates it is inside a shared block. + self + } + + fn with_optional_fields(&mut self, fields: &AccountMetaOptionalFields) -> &mut Self { + fields.update_flags(&mut self.flags); + self + } + + fn with_flags(&mut self, flags: u32) -> &mut Self { + self.flags = flags; + self + } + + fn block_offset(&self) -> u64 { + (self.block_offset_info & BLOCK_OFFSET_MASK).saturating_mul(8) + } + + fn padding_bytes(&self) -> u8 { + ((self.block_offset_info & PADDINGS_MASK) >> PADDINGS_SHIFT) + .try_into() + .unwrap() + } + + fn set_block_offset(&mut self, offset: u64) { + assert!((offset >> 3) <= BLOCK_OFFSET_MASK); + self.block_offset_info &= CLEAR_BLOCK_OFFSET_MASK; + self.block_offset_info |= offset >> 3; + } + + fn intra_block_offset(&self) -> u16 { + // hot meta always have intra block offset equals to 0 except + // its block_offset_info indocates it is inside a shared block. + 0 + } + + fn owner_local_id(&self) -> u32 { + self.owner_index + } + + fn flags_get(&self, bit_field: u32) -> bool { + AccountMetaFlags::get(&self.flags, bit_field) + } + + fn rent_epoch(&self, data_block: &[u8]) -> Option { + let offset = self.optional_fields_offset(data_block); + println!("rent_epoch_offset = {}", offset); + if self.flags_get(AccountMetaFlags::HAS_RENT_EPOCH) { + let epoch: Epoch = *Self::get_type(data_block, offset); + println!("epoch = {}", epoch); + return Some(epoch); + } + None + } + + fn account_hash<'a>(&self, data_block: &'a [u8]) -> &'a Hash { + let mut offset = self.optional_fields_offset(data_block); + if self.flags_get(AccountMetaFlags::HAS_RENT_EPOCH) { + offset += std::mem::size_of::(); + } + if self.flags_get(AccountMetaFlags::HAS_ACCOUNT_HASH) { + return Self::get_type(data_block, offset); + } + return &DEFAULT_ACCOUNT_HASH; + } + + fn write_version(&self, data_block: &[u8]) -> Option { + let mut offset = self.optional_fields_offset(data_block); + if self.flags_get(AccountMetaFlags::HAS_RENT_EPOCH) { + offset += std::mem::size_of::(); + } + if self.flags_get(AccountMetaFlags::HAS_ACCOUNT_HASH) { + offset += std::mem::size_of::(); + } + if self.flags_get(AccountMetaFlags::HAS_WRITE_VERSION) { + let write_version: StoredMetaWriteVersion = *Self::get_type(data_block, offset); + return Some(write_version); + } + None + } + + fn data_len(&self, data_block: &[u8]) -> usize { + self.optional_fields_offset(data_block) + .saturating_sub(self.padding_bytes() as usize) + } + + fn optional_fields_offset<'a>(&self, data_block: &'a [u8]) -> usize { + data_block.len().saturating_sub(self.optional_fields_size()) + } + + fn account_data<'a>(&self, data_block: &'a [u8]) -> &'a [u8] { + &data_block[0..self.data_len(data_block)] + } + + fn is_blob_account(&self) -> bool { + todo!(); + } + + fn write_account_meta_entry(&self, ads_file: &TieredStorageFile) -> std::io::Result { + ads_file.write_type(self)?; + + Ok(std::mem::size_of::()) + } + + fn stored_size( + footer: &TieredStorageFooter, + metas: &Vec, + i: usize, + ) -> usize { + // hot storage does not compress so the returned size is the data size. + let data_size = get_compressed_block_size(footer, metas, i); + + return std::mem::size_of::() + data_size; + } +} + +#[derive(Debug)] +pub struct HotStorageReader { + map: Mmap, + footer: TieredStorageFooter, +} + +impl HotStorageReader { + pub fn new_from_path>(path: P) -> std::io::Result { + let file = OpenOptions::new().read(true).create(false).open(path)?; + let map = unsafe { MmapOptions::new().map(&file)? }; + println!("map len: {}", map.len()); + let footer = TieredStorageFooter::new_from_mmap(&map)?.clone(); + assert!(map.len() > 0); + + Ok(Self { map, footer }) + } + + pub fn footer(&self) -> &TieredStorageFooter { + &self.footer + } + + pub fn num_accounts(&self) -> usize { + self.footer.account_meta_count as usize + } + + pub fn account_matches_owners( + &self, + multiplied_index: usize, + owners: &[&Pubkey], + ) -> Result { + let index = Self::multiplied_index_to_index(multiplied_index); + if index >= self.num_accounts() { + return Err(MatchAccountOwnerError::UnableToLoad); + } + + let owner = self.get_owner_address(index).unwrap(); // ok_or(MatchAccountOwnerError::NoMatch); + owners + .iter() + .position(|entry| &owner == entry) + .ok_or(MatchAccountOwnerError::NoMatch) + } + + fn multiplied_index_to_index(multiplied_index: usize) -> usize { + // This is a temporary workaround to work with existing AccountInfo + // implementation that ties to AppendVec with the assumption that the offset + // is a multiple of ALIGN_BOUNDARY_OFFSET, while tiered storage actually talks + // about index instead of offset. + multiplied_index / ALIGN_BOUNDARY_OFFSET + } + + fn get_account_meta<'a>(&'a self, index: usize) -> std::io::Result<&'a HotAccountMeta> { + let offset = self.footer.account_metas_offset + + (self.footer.account_meta_entry_size as u64 * index as u64); + let (meta, _): (&'a HotAccountMeta, _) = get_type(&self.map, offset as usize)?; + Ok(meta) + } + + fn get_account_address<'a>(&'a self, index: usize) -> std::io::Result<&'a Pubkey> { + let offset = + self.footer.account_pubkeys_offset as usize + (std::mem::size_of::() * index); + let (pubkey, _): (&'a Pubkey, _) = get_type(&self.map, offset)?; + Ok(pubkey) + } + + fn get_owner_address<'a>(&'a self, index: usize) -> std::io::Result<&'a Pubkey> { + let offset = self.footer.owners_offset as usize + (std::mem::size_of::() * index); + let (pubkey, _): (&'a Pubkey, _) = get_type(&self.map, offset)?; + Ok(pubkey) + } + + fn get_data_block_size(&self, meta: &HotAccountMeta, index: usize) -> usize { + if (index + 1) as u32 == self.footer.account_meta_count { + return (self.footer.account_metas_offset - meta.block_offset()) as usize; + } + + let next_meta = self.get_account_meta(index + 1).unwrap(); + assert!(next_meta.block_offset() >= meta.block_offset()); + + next_meta.block_offset().saturating_sub(meta.block_offset()) as usize + } + + fn get_data_block<'a>( + &'a self, + meta: &HotAccountMeta, + index: usize, + ) -> std::io::Result<&'a [u8]> { + let (data, _): (&'a [u8], _) = get_slice( + &self.map, + meta.block_offset() as usize, + self.get_data_block_size(meta, index), + )?; + Ok(data) + } + + pub fn get_account<'a>( + &'a self, + multiplied_index: usize, + ) -> Option<(StoredAccountMeta<'a>, usize)> { + println!("get_account({})", multiplied_index); + let index = Self::multiplied_index_to_index(multiplied_index); + // TODO(yhchiang): remove this TODO + // TODO2 + println!("wtf?! index = {}", index); + if index >= self.footer.account_meta_count as usize { + return None; + } + + let meta: &'a HotAccountMeta = self.get_account_meta(index).ok()?; + println!("meta = {:?}", *meta); + // TODO(yhchiang): I think they can be lazy loaded. + let address: &'a Pubkey = self.get_account_address(index).ok()?; + println!("pubkey = {:?}", *address); + let owner: &'a Pubkey = self.get_owner_address(index).ok()?; + println!("owner = {:?}", *owner); + let data_block: &'a [u8] = self.get_data_block(meta, index).ok()?; + println!("data= {:?}", data_block); + + return Some(( + StoredAccountMeta::Hot(TieredStoredAccountMeta { + meta: meta, + pubkey: address, + owner: owner, + index: multiplied_index, + data_block: data_block, + }), + multiplied_index + ALIGN_BOUNDARY_OFFSET, + )); + } +} + +#[cfg(test)] +pub mod tests { + use { + crate::{ + account_storage::meta::StoredMetaWriteVersion, + append_vec::test_utils::get_append_vec_path, + tiered_storage::{ + file::TieredStorageFile, + footer::{ + AccountDataBlockFormat, AccountIndexFormat, AccountMetaFormat, + OwnersBlockFormat, TieredStorageFooter, FOOTER_SIZE, + }, + hot::{HotAccountMeta, HotStorageReader}, + meta_entries::{AccountMetaFlags, AccountMetaOptionalFields, TieredAccountMeta}, + }, + }, + ::solana_sdk::{hash::Hash, stake_history::Epoch}, + memoffset::offset_of, + std::mem::size_of, + }; + + #[test] + fn test_hot_account_meta_layout() { + assert_eq!(offset_of!(HotAccountMeta, lamports), 0x00); + assert_eq!(offset_of!(HotAccountMeta, block_offset_info), 0x08); + assert_eq!(offset_of!(HotAccountMeta, owner_index), 0x10); + assert_eq!(offset_of!(HotAccountMeta, flags), 0x14); + assert_eq!(std::mem::size_of::(), 24); + } + + #[test] + fn test_hot_offset_and_padding() { + let offset: u64 = 0x07ff_ef98_7654_3218; + let length: u64 = 153233; + let mut hot_meta = HotAccountMeta::new(); + hot_meta + .with_block_offset(offset) + .with_uncompressed_data_size(length); + assert_eq!(hot_meta.block_offset(), offset); + assert_eq!(hot_meta.padding_bytes(), ((8 - (length % 8)) % 8) as u8); + } + + #[test] + fn test_hot_account_meta() { + let path = get_append_vec_path("test_hot_account_meta"); + + const TEST_LAMPORT: u64 = 2314232137; + const BLOCK_OFFSET: u64 = 56987; + const PADDINGS: u8 = 5; + const OWNER_LOCAL_ID: u32 = 54; + const TEST_RENT_EPOCH: Epoch = 7; + const TEST_WRITE_VERSION: StoredMetaWriteVersion = 0; + + let optional_fields = AccountMetaOptionalFields { + rent_epoch: Some(TEST_RENT_EPOCH), + account_hash: Some(Hash::new_unique()), + write_version_obsolete: Some(TEST_WRITE_VERSION), + }; + + let mut expected_entry = HotAccountMeta::new(); + expected_entry + .with_lamports(TEST_LAMPORT) + .with_block_offset(BLOCK_OFFSET) + .with_data_tailing_paddings(PADDINGS) + .with_owner_local_id(OWNER_LOCAL_ID) + .with_flags( + AccountMetaFlags::new() + .with_bit(AccountMetaFlags::EXECUTABLE, true) + .to_value(), + ) + .with_optional_fields(&optional_fields); + + { + let mut ads_file = TieredStorageFile::new(&path.path, true); + expected_entry + .write_account_meta_entry(&mut ads_file) + .unwrap(); + } + + let mut ads_file = TieredStorageFile::new(&path.path, true); + let entry = HotAccountMeta::new_from_file(&mut ads_file).unwrap(); + + assert_eq!(expected_entry, entry); + assert_eq!(entry.flags_get(AccountMetaFlags::EXECUTABLE), true); + assert_eq!(entry.flags_get(AccountMetaFlags::HAS_RENT_EPOCH), true); + } + + #[test] + fn test_max_hot_offset_and_padding() { + let mut hot_meta = HotAccountMeta::new(); + // hot offset must be a multiple of 8. + let offset: u64 = 0x07ff_ffff_ffff_fff8; + let paddings: u8 = 7; + hot_meta.set_block_offset(offset); + hot_meta.set_padding_bytes(paddings); + assert_eq!(hot_meta.block_offset(), offset); + assert_eq!(hot_meta.padding_bytes(), paddings); + } + + #[test] + fn test_hot_storage_footer() { + let path = get_append_vec_path("test_hot_storage_footer"); + let expected_footer = TieredStorageFooter { + account_meta_format: AccountMetaFormat::Hot, + owners_block_format: OwnersBlockFormat::LocalIndex, + account_index_format: AccountIndexFormat::Linear, + data_block_format: AccountDataBlockFormat::AlignedRaw, + account_meta_count: 300, + account_meta_entry_size: 24, + account_data_block_size: 4096, + owner_count: 250, + owner_entry_size: 32, + account_metas_offset: 1062400, + account_pubkeys_offset: 1069600, + owners_offset: 1081200, + hash: Hash::new_unique(), + min_account_address: Hash::default(), + max_account_address: Hash::default(), + footer_size: FOOTER_SIZE as u64, + format_version: 1, + }; + + { + let ads_file = TieredStorageFile::new(&path.path, true); + expected_footer.write_footer_block(&ads_file).unwrap(); + } + + // Reopen the same storage, and expect the persisted footer is + // the same as what we have written. + { + let hot_storage = HotStorageReader::new_from_path(&path.path).unwrap(); + assert_eq!(expected_footer, *hot_storage.footer()); + } + } +} diff --git a/runtime/src/tiered_storage/meta_entries.rs b/runtime/src/tiered_storage/meta_entries.rs new file mode 100644 index 00000000000000..a9a9a5fbb22d34 --- /dev/null +++ b/runtime/src/tiered_storage/meta_entries.rs @@ -0,0 +1,257 @@ +use { + crate::{ + account_storage::meta::StoredMetaWriteVersion, + tiered_storage::{ + file::TieredStorageFile, footer::TieredStorageFooter, AccountDataBlockWriter, + }, + }, + ::solana_sdk::{hash::Hash, stake_history::Epoch}, + serde::{Deserialize, Serialize}, + std::mem::size_of, +}; + +lazy_static! { + pub static ref DEFAULT_ACCOUNT_HASH: Hash = Hash::default(); +} + +pub const ACCOUNT_DATA_ENTIRE_BLOCK: u16 = std::u16::MAX; + +pub(crate) fn get_compressed_block_size( + footer: &TieredStorageFooter, + metas: &Vec, + index: usize, +) -> usize { + let mut block_size = footer.account_metas_offset - metas[index].block_offset(); + + for i in index..metas.len() { + if metas[i].block_offset() == metas[index].block_offset() { + continue; + } + block_size = metas[i].block_offset() - metas[index].block_offset(); + break; + } + + block_size.try_into().unwrap() +} + +#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)] +#[repr(C)] +pub struct AccountMetaFlags { + flags: u32, +} + +impl AccountMetaFlags { + pub const EXECUTABLE: u32 = 1u32; + pub const HAS_RENT_EPOCH: u32 = 1u32 << 1; + pub const HAS_ACCOUNT_HASH: u32 = 1u32 << 2; + pub const HAS_WRITE_VERSION: u32 = 1u32 << 3; + // TODO(yhchiang): might not be needed. + pub const HAS_DATA_LENGTH: u32 = 1u32 << 4; + + pub fn new() -> Self { + Self { flags: 0 } + } + + pub fn new_from(value: u32) -> Self { + Self { flags: value } + } + + pub fn with_bit(mut self, bit_field: u32, value: bool) -> Self { + self.set(bit_field, value); + + self + } + + pub fn to_value(self) -> u32 { + self.flags + } + + pub fn set(&mut self, bit_field: u32, value: bool) { + if value == true { + self.flags |= bit_field; + } else { + self.flags &= !bit_field; + } + } + + pub fn get(flags: &u32, bit_field: u32) -> bool { + (flags & bit_field) > 0 + } + + pub fn get_value(&self) -> u32 { + self.flags + } + + pub fn get_value_mut(&mut self) -> &mut u32 { + &mut self.flags + } +} + +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct AccountMetaOptionalFields { + pub rent_epoch: Option, + pub account_hash: Option, + pub write_version_obsolete: Option, +} + +impl AccountMetaOptionalFields { + /// Returns the 16-bit value where each bit represesnts whether one + /// optional field has a Some value. + pub fn update_flags(&self, flags_value: &mut u32) { + let mut flags = AccountMetaFlags::new_from(*flags_value); + flags.set(AccountMetaFlags::HAS_RENT_EPOCH, self.rent_epoch.is_some()); + flags.set( + AccountMetaFlags::HAS_ACCOUNT_HASH, + self.account_hash.is_some(), + ); + flags.set( + AccountMetaFlags::HAS_WRITE_VERSION, + self.write_version_obsolete.is_some(), + ); + *flags_value = flags.to_value(); + } + + pub fn size(&self) -> usize { + let mut size_in_bytes = 0; + if self.rent_epoch.is_some() { + size_in_bytes += size_of::(); + } + if self.account_hash.is_some() { + size_in_bytes += size_of::(); + } + if self.write_version_obsolete.is_some() { + size_in_bytes += size_of::(); + } + + size_in_bytes + } + + pub fn write(&self, writer: &mut AccountDataBlockWriter) -> std::io::Result { + let mut length = 0; + if let Some(rent_epoch) = self.rent_epoch { + length += writer.write_type(&rent_epoch)?; + } + if let Some(hash) = self.account_hash { + length += writer.write_type(&hash)?; + } + if let Some(write_version) = self.write_version_obsolete { + length += writer.write_type(&write_version)?; + } + + Ok(length) + } +} + +pub trait TieredAccountMeta { + fn new() -> Self; + + fn is_blob_account_data(data_len: u64) -> bool; + + fn with_lamports(&mut self, _lamports: u64) -> &mut Self { + unimplemented!(); + } + + fn with_block_offset(&mut self, _offset: u64) -> &mut Self { + unimplemented!(); + } + + fn with_data_tailing_paddings(&mut self, _paddings: u8) -> &mut Self { + unimplemented!(); + } + + fn with_owner_local_id(&mut self, _local_id: u32) -> &mut Self { + unimplemented!(); + } + + fn with_uncompressed_data_size(&mut self, _data_size: u64) -> &mut Self { + unimplemented!(); + } + + fn with_intra_block_offset(&mut self, _offset: u16) -> &mut Self { + unimplemented!(); + } + + fn with_flags(&mut self, _flags: u32) -> &mut Self { + unimplemented!(); + } + + fn with_optional_fields(&mut self, _fields: &AccountMetaOptionalFields) -> &mut Self { + unimplemented!(); + } + + fn lamports(&self) -> u64; + fn block_offset(&self) -> u64; + fn set_block_offset(&mut self, offset: u64); + fn padding_bytes(&self) -> u8; + fn uncompressed_data_size(&self) -> usize { + unimplemented!(); + } + fn intra_block_offset(&self) -> u16; + fn owner_local_id(&self) -> u32; + fn flags_get(&self, bit_field: u32) -> bool; + fn rent_epoch(&self, data_block: &[u8]) -> Option; + fn account_hash<'a>(&self, data_block: &'a [u8]) -> &'a Hash; + fn write_version(&self, data_block: &[u8]) -> Option; + fn optional_fields_size(&self) -> usize { + let mut size_in_bytes = 0; + if self.flags_get(AccountMetaFlags::HAS_RENT_EPOCH) { + size_in_bytes += size_of::(); + } + if self.flags_get(AccountMetaFlags::HAS_ACCOUNT_HASH) { + size_in_bytes += size_of::(); + } + if self.flags_get(AccountMetaFlags::HAS_WRITE_VERSION) { + size_in_bytes += size_of::(); + } + if self.flags_get(AccountMetaFlags::HAS_DATA_LENGTH) { + size_in_bytes += size_of::(); + } + + size_in_bytes + } + + fn optional_fields_offset<'a>(&self, data_block: &'a [u8]) -> usize; + fn data_len(&self, data_block: &[u8]) -> usize; + fn account_data<'a>(&self, data_block: &'a [u8]) -> &'a [u8]; + fn is_blob_account(&self) -> bool; + fn write_account_meta_entry(&self, ads_file: &TieredStorageFile) -> std::io::Result; + fn stored_size( + footer: &TieredStorageFooter, + metas: &Vec, + i: usize, + ) -> usize; +} + +#[cfg(test)] +pub mod tests { + use crate::tiered_storage::meta_entries::AccountMetaFlags; + + impl AccountMetaFlags { + pub(crate) fn get_test(&self, bit_field: u32) -> bool { + (self.flags & bit_field) > 0 + } + } + + #[test] + fn test_flags() { + let mut flags = AccountMetaFlags::new(); + assert_eq!(flags.get_test(AccountMetaFlags::EXECUTABLE), false); + assert_eq!(flags.get_test(AccountMetaFlags::HAS_RENT_EPOCH), false); + + flags.set(AccountMetaFlags::EXECUTABLE, true); + assert_eq!(flags.get_test(AccountMetaFlags::EXECUTABLE), true); + assert_eq!(flags.get_test(AccountMetaFlags::HAS_RENT_EPOCH), false); + + flags.set(AccountMetaFlags::HAS_RENT_EPOCH, true); + assert_eq!(flags.get_test(AccountMetaFlags::EXECUTABLE), true); + assert_eq!(flags.get_test(AccountMetaFlags::HAS_RENT_EPOCH), true); + + flags.set(AccountMetaFlags::EXECUTABLE, false); + assert_eq!(flags.get_test(AccountMetaFlags::EXECUTABLE), false); + assert_eq!(flags.get_test(AccountMetaFlags::HAS_RENT_EPOCH), true); + + flags.set(AccountMetaFlags::HAS_RENT_EPOCH, false); + assert_eq!(flags.get_test(AccountMetaFlags::EXECUTABLE), false); + assert_eq!(flags.get_test(AccountMetaFlags::HAS_RENT_EPOCH), false); + } +} diff --git a/runtime/src/tiered_storage/reader.rs b/runtime/src/tiered_storage/reader.rs new file mode 100644 index 00000000000000..069eed692f0a85 --- /dev/null +++ b/runtime/src/tiered_storage/reader.rs @@ -0,0 +1,184 @@ +use { + crate::{ + account_storage::meta::{StoredAccountMeta, StoredMeta, StoredMetaWriteVersion}, + append_vec::MatchAccountOwnerError, + tiered_storage::{ + cold::ColdStorageReader, + footer::{AccountMetaFormat, TieredStorageFooter}, + hot::HotStorageReader, + meta_entries::{AccountMetaFlags, TieredAccountMeta}, + }, + }, + solana_sdk::{ + account::{Account, AccountSharedData, ReadableAccount}, + hash::Hash, + pubkey::Pubkey, + stake_history::Epoch, + }, + std::path::Path, +}; + +#[derive(Debug)] +pub enum TieredStorageReader { + Cold(ColdStorageReader), + Hot(HotStorageReader), +} + +impl TieredStorageReader { + pub fn new_from_path>(path: P) -> std::io::Result { + let footer = TieredStorageFooter::new_from_path(&path)?; + println!("footer = {:?}", footer); + + match footer.account_meta_format { + AccountMetaFormat::Cold => Ok(Self::Cold(ColdStorageReader::new_from_file(path)?)), + AccountMetaFormat::Hot => Ok(Self::Hot(HotStorageReader::new_from_path(path)?)), + } + } + + pub fn num_accounts(&self) -> usize { + match self { + Self::Cold(cs) => cs.num_accounts(), + Self::Hot(hs) => hs.num_accounts(), + } + } + + pub fn account_matches_owners( + &self, + multiplied_index: usize, + owners: &[&Pubkey], + ) -> Result { + match self { + Self::Cold(cs) => cs.account_matches_owners(multiplied_index, owners), + Self::Hot(hs) => hs.account_matches_owners(multiplied_index, owners), + } + } + + pub fn get_account<'a>( + &'a self, + multiplied_index: usize, + ) -> Option<(StoredAccountMeta<'a>, usize)> { + match self { + Self::Cold(cs) => cs.get_account(multiplied_index), + Self::Hot(hs) => hs.get_account(multiplied_index), + } + } +} + +#[derive(PartialEq, Eq, Debug)] +#[allow(dead_code)] +pub struct TieredStoredAccountMeta<'a, T: TieredAccountMeta> { + pub(crate) meta: &'a T, + pub(crate) pubkey: &'a Pubkey, + pub(crate) owner: &'a Pubkey, + pub(crate) index: usize, + // this data block may be shared with other accounts + pub(crate) data_block: &'a [u8], +} + +#[allow(dead_code)] +impl<'a, T: TieredAccountMeta> TieredStoredAccountMeta<'a, T> { + pub fn pubkey(&self) -> &'a Pubkey { + &self.pubkey + } + + pub fn hash(&self) -> &'a Hash { + self.meta.account_hash(self.data_block) + } + + pub fn offset(&self) -> usize { + self.index + } + + pub fn data(&self) -> &'a [u8] { + self.meta.account_data(self.data_block) + } + + pub fn data_len(&self) -> u64 { + self.meta.account_data(self.data_block).len() as u64 + } + + pub fn stored_size(&self) -> usize { + // TODO(yhchiang): make it accurate + self.data_len() as usize / 2 + + std::mem::size_of::() + + std::mem::size_of::() // account's pubkey + + std::mem::size_of::() // owner's pubkey + } + + pub fn clone_account(&self) -> AccountSharedData { + AccountSharedData::from(Account { + lamports: self.lamports(), + owner: *self.owner(), + executable: self.executable(), + rent_epoch: self.rent_epoch(), + data: self.data().to_vec(), + }) + } + + pub fn write_version(&self) -> StoredMetaWriteVersion { + if let Some(write_version) = self.meta.write_version(self.data_block) { + return write_version; + } + 0 + } + + /////////////////////////////////////////////////////////////////////////// + // Unimlpemented + + pub fn meta(&self) -> &StoredMeta { + unimplemented!(); + } + + pub fn set_meta(&mut self, _meta: &'a StoredMeta) { + unimplemented!(); + } + + pub(crate) fn sanitize(&self) -> bool { + unimplemented!(); + } +} + +impl<'a, T: TieredAccountMeta> ReadableAccount for TieredStoredAccountMeta<'a, T> { + fn lamports(&self) -> u64 { + self.meta.lamports() + } + fn owner(&self) -> &'a Pubkey { + self.owner + } + fn executable(&self) -> bool { + self.meta.flags_get(AccountMetaFlags::EXECUTABLE) + } + fn rent_epoch(&self) -> Epoch { + if let Some(rent_epoch) = self.meta.rent_epoch(self.data_block) { + return rent_epoch; + } + std::u64::MAX + } + fn data(&self) -> &'a [u8] { + self.meta.account_data(self.data_block) + } +} + +/* + #[test] + fn test_account_pubkeys_block() { + let path = get_append_vec_path("test_account_pubkeys_block"); + let mut expected_pubkeys: Vec = vec![]; + const ENTRY_COUNT: u32 = 1024; + + { + let ads = TieredStorageWriter::new(&path.path); + let mut footer = TieredStorageFooter::new(); + let mut cursor = 0; + for _ in 0..ENTRY_COUNT { + expected_pubkeys.push(Pubkey::new_unique()); + } + ads.write_account_pubkeys_block(&mut cursor, &mut footer, &expected_pubkeys) + .unwrap(); + } + + let ads = TieredStorage::new_for_test(&path.path, false); + let pubkeys: Vec = ads.read_pubkeys_block(0, ENTRY_COUNT).unwrap(); + assert_eq!(expected_pubkeys, pubkeys); + } +*/ diff --git a/runtime/src/tiered_storage/writer.rs b/runtime/src/tiered_storage/writer.rs new file mode 100644 index 00000000000000..083b295c5b1630 --- /dev/null +++ b/runtime/src/tiered_storage/writer.rs @@ -0,0 +1,560 @@ +//! docs/src/proposals/append-vec-storage.md + +use { + crate::{ + account_storage::meta::{ + AccountMeta, StorableAccountsWithHashesAndWriteVersions, StoredAccountInfo, + StoredAccountMeta, StoredMeta, + }, + accounts_file::ALIGN_BOUNDARY_OFFSET, + append_vec::{AppendVec, AppendVecStoredAccountMeta}, + storable_accounts::StorableAccounts, + tiered_storage::{ + cold::ColdAccountMeta, + data_block::AccountDataBlockWriter, + file::TieredStorageFile, + footer::{AccountMetaFormat, TieredFileFormat, TieredStorageFooter}, + hot::HotAccountMeta, + meta_entries::{AccountMetaFlags, AccountMetaOptionalFields, TieredAccountMeta}, + }, + }, + solana_sdk::{account::ReadableAccount, hash::Hash, pubkey::Pubkey}, + std::{borrow::Borrow, collections::HashMap, fs::remove_file, mem, path::Path}, +}; + +pub const ACCOUNT_DATA_BLOCK_SIZE: usize = 4096; +pub const ACCOUNTS_DATA_STORAGE_FORMAT_VERSION: u64 = 1; + +lazy_static! { + pub static ref HASH_DEFAULT: Hash = Hash::default(); +} + +pub(crate) struct AccountOwnerTable { + pub owners_vec: Vec, + pub owners_map: HashMap, +} + +impl AccountOwnerTable { + pub fn new() -> Self { + Self { + owners_vec: vec![], + owners_map: HashMap::new(), + } + } + pub fn check_and_add(&mut self, pubkey: &Pubkey) -> u32 { + if let Some(index) = self.owners_map.get(pubkey) { + return index.clone(); + } + let index: u32 = self.owners_vec.len().try_into().unwrap(); + self.owners_vec.push(*pubkey); + self.owners_map.insert(*pubkey, index); + + index + } +} + +#[derive(Debug)] +pub struct TieredStorageWriter { + storage: TieredStorageFile, + format: &'static TieredFileFormat, +} + +impl TieredStorageWriter { + /// Create a new accounts-state-file + #[allow(dead_code)] + pub fn new(file_path: &Path, format: &'static TieredFileFormat) -> Self { + let _ignored = remove_file(file_path); + Self { + storage: TieredStorageFile::new(file_path, true), + format: format, + } + } + + fn append_accounts_impl< + 'a, + 'b, + T: ReadableAccount + Sync, + U: StorableAccounts<'a, T>, + V: Borrow, + W: TieredAccountMeta, + >( + &self, + accounts: &StorableAccountsWithHashesAndWriteVersions<'a, 'b, T, U, V>, + mut footer: TieredStorageFooter, + mut account_metas: Vec, + skip: usize, + ) -> Option> { + let mut cursor = 0; + let mut account_pubkeys: Vec = vec![]; + let mut owners_table = AccountOwnerTable::new(); + let mut dummy_hash: Hash = Hash::new_unique(); + + let mut data_block_writer = self.new_data_block_writer(&footer); + footer.account_data_block_size = ACCOUNT_DATA_BLOCK_SIZE as u64; + + let mut buffered_account_metas = Vec::::new(); + let mut buffered_account_pubkeys: Vec = vec![]; + + let len = accounts.accounts.len(); + let mut input_pubkey_map: HashMap = HashMap::with_capacity(len); + + for i in skip..len { + // TODO(yhchiang): here we don't need to convert it to + // StoredAccountMeta::AppendVec + let (account, pubkey, hash, write_version_obsolete) = accounts.get(i); + input_pubkey_map.insert(*pubkey, i); + let account_meta = account + .map(|account| AccountMeta { + lamports: account.lamports(), + owner: *account.owner(), + rent_epoch: account.rent_epoch(), + executable: account.executable(), + }) + .unwrap_or_default(); + println!("i = {}, meta = {:?}", i, account_meta); + + let stored_meta = StoredMeta { + pubkey: *pubkey, + data_len: account + .map(|account| account.data().len()) + .unwrap_or_default() as u64, + write_version_obsolete, + }; + + let stored_account_meta = StoredAccountMeta::AppendVec(AppendVecStoredAccountMeta { + meta: &stored_meta, + account_meta: &account_meta, + data: account.map(|account| account.data()).unwrap_or_default(), + offset: 0, + stored_size: 0, + hash: hash, + }); + + data_block_writer = self + .write_stored_account_meta( + &stored_account_meta, + &mut cursor, + &mut footer, + &mut account_metas, + &mut account_pubkeys, + &mut owners_table, + data_block_writer, + &mut buffered_account_metas, + &mut buffered_account_pubkeys, + &mut dummy_hash, + ) + .unwrap(); + } + + // Persist the last block if any + if buffered_account_metas.len() > 0 { + self.flush_account_data_block( + &mut cursor, + &mut footer, + &mut account_metas, + &mut account_pubkeys, + &mut buffered_account_metas, + &mut buffered_account_pubkeys, + data_block_writer, + ) + .ok()?; + } + + assert_eq!(buffered_account_metas.len(), 0); + assert_eq!(buffered_account_pubkeys.len(), 0); + assert_eq!(footer.account_meta_count, account_metas.len() as u32); + + self.write_account_metas_block(&mut cursor, &mut footer, &account_metas) + .ok()?; + self.write_account_pubkeys_block(&mut cursor, &mut footer, &account_pubkeys) + .ok()?; + + self.write_owners_block(&mut cursor, &mut footer, &owners_table.owners_vec) + .ok()?; + + println!("before writing footer: {:?}", footer); + footer.write_footer_block(&self.storage).ok()?; + + assert_eq!(account_metas.len(), account_pubkeys.len()); + assert_eq!(account_metas.len(), len - skip); + + let mut stored_accounts_info: Vec = Vec::with_capacity(len); + for _ in skip..len { + stored_accounts_info.push(StoredAccountInfo { offset: 0, size: 0 }); + } + for i in 0..account_metas.len() { + let index = input_pubkey_map.get(&account_pubkeys[i]).unwrap(); + + // of ALIGN_BOUNDARY_OFFSET, while cold storage actually talks about index + // instead of offset. + stored_accounts_info[*index].offset = i * ALIGN_BOUNDARY_OFFSET; + stored_accounts_info[*index].size = W::stored_size(&footer, &account_metas, i); + } + + Some(stored_accounts_info) + } + + pub fn append_accounts< + 'a, + 'b, + T: ReadableAccount + Sync, + U: StorableAccounts<'a, T>, + V: Borrow, + >( + &self, + accounts: &StorableAccountsWithHashesAndWriteVersions<'a, 'b, T, U, V>, + skip: usize, + ) -> Option> { + let mut footer = TieredStorageFooter::new(); + // TODO(yhchiang): make it configerable + footer.account_meta_format = self.format.account_meta_format.clone(); + footer.data_block_format = self.format.data_block_format.clone(); + footer.format_version = ACCOUNTS_DATA_STORAGE_FORMAT_VERSION; + match footer.account_meta_format { + AccountMetaFormat::Hot => { + self.append_accounts_impl(accounts, footer, Vec::::new(), skip) + } + AccountMetaFormat::Cold => { + self.append_accounts_impl(accounts, footer, Vec::::new(), skip) + } + } + } + + fn new_data_block_writer(&self, footer: &TieredStorageFooter) -> AccountDataBlockWriter { + return AccountDataBlockWriter::new(footer.data_block_format); + } + + pub(crate) fn write_account_metas_block( + &self, + cursor: &mut u64, + footer: &mut TieredStorageFooter, + account_metas: &Vec, + ) -> std::io::Result<()> { + let entry_size: u32 = std::mem::size_of::() as u32; + footer.account_metas_offset = *cursor; + footer.account_meta_entry_size = entry_size; + for account_meta in account_metas { + *cursor += account_meta.write_account_meta_entry(&self.storage)? as u64; + } + println!( + "entry_size = {:?}, account_metas.len() = {:?}", + entry_size, + account_metas.len() + ); + // make sure cursor advanced as what we expected + assert_eq!( + footer.account_metas_offset + (entry_size * account_metas.len() as u32) as u64, + *cursor + ); + + Ok(()) + } + + pub(crate) fn write_account_pubkeys_block( + &self, + cursor: &mut u64, + footer: &mut TieredStorageFooter, + pubkeys: &Vec, + ) -> std::io::Result<()> { + footer.account_pubkeys_offset = *cursor; + + self.write_pubkeys_block(cursor, pubkeys) + } + + fn write_owners_block( + &self, + cursor: &mut u64, + footer: &mut TieredStorageFooter, + pubkeys: &Vec, + ) -> std::io::Result<()> { + footer.owners_offset = *cursor; + footer.owner_count = pubkeys.len() as u32; + footer.owner_entry_size = mem::size_of::() as u32; + + self.write_pubkeys_block(cursor, pubkeys) + } + + fn write_pubkeys_block(&self, cursor: &mut u64, pubkeys: &Vec) -> std::io::Result<()> { + for pubkey in pubkeys { + *cursor += self.storage.write_type(pubkey)? as u64; + } + + Ok(()) + } + + fn flush_account_data_block( + &self, + cursor: &mut u64, + footer: &mut TieredStorageFooter, + account_metas: &mut Vec, + account_pubkeys: &mut Vec, + input_metas: &mut Vec, + input_pubkeys: &mut Vec, + data_block_writer: AccountDataBlockWriter, + ) -> std::io::Result<()> { + // Persist the current block + let (encoded_data, _raw_data_size) = data_block_writer.finish()?; + self.storage.write_bytes(&encoded_data)?; + + assert_eq!(input_metas.len(), input_pubkeys.len()); + + for input_meta in &mut input_metas.into_iter() { + input_meta.set_block_offset(*cursor); + } + for input_meta in &mut input_metas.into_iter() { + assert_eq!(input_meta.block_offset(), *cursor); + } + footer.account_meta_count += input_metas.len() as u32; + account_metas.append(input_metas); + account_pubkeys.append(input_pubkeys); + + *cursor += encoded_data.len() as u64; + assert_eq!(input_metas.len(), 0); + assert_eq!(input_pubkeys.len(), 0); + + Ok(()) + } + + fn write_stored_account_meta( + &self, + account: &StoredAccountMeta, + cursor: &mut u64, + footer: &mut TieredStorageFooter, + account_metas: &mut Vec, + account_pubkeys: &mut Vec, + owners_table: &mut AccountOwnerTable, + mut data_block: AccountDataBlockWriter, + buffered_account_metas: &mut Vec, + buffered_account_pubkeys: &mut Vec, + _hash: &mut Hash, + ) -> std::io::Result { + if !account.sanitize() { + // Not Ok + } + + let optional_fields = AccountMetaOptionalFields { + rent_epoch: Some(account.rent_epoch()), + account_hash: Some(*account.hash()), + write_version_obsolete: Some(account.write_version()), + }; + + if T::is_blob_account_data(account.data_len()) { + self.write_blob_account_data_block( + cursor, + footer, + account_metas, + account_pubkeys, + owners_table, + account, + )?; + return Ok(data_block); + } + println!("YH: Hey!!"); + + // If the current data cannot fit in the current block, then + // persist the current block. + if data_block.len() + account.data_len() as usize + optional_fields.size() + > ACCOUNT_DATA_BLOCK_SIZE + { + self.flush_account_data_block( + cursor, + footer, + account_metas, + account_pubkeys, + buffered_account_metas, + buffered_account_pubkeys, + data_block, + )?; + data_block = self.new_data_block_writer(footer); + } + + let owner_local_id = owners_table.check_and_add(account.owner()); + let local_offset = data_block.len(); + + data_block.write(account.data(), account.data_len() as usize)?; + optional_fields.write(&mut data_block)?; + + let mut meta = T::new(); + meta.with_lamports(account.lamports()) + .with_block_offset(*cursor) + .with_owner_local_id(owner_local_id) + .with_uncompressed_data_size(account.data_len()) + .with_intra_block_offset(local_offset as u16) + .with_flags( + AccountMetaFlags::new() + .with_bit(AccountMetaFlags::EXECUTABLE, account.executable()) + .to_value(), + ) + .with_optional_fields(&optional_fields); + + buffered_account_metas.push(meta); + buffered_account_pubkeys.push(*account.pubkey()); + + Ok(data_block) + } + + fn write_blob_account_data_block( + &self, + cursor: &mut u64, + footer: &mut TieredStorageFooter, + account_metas: &mut Vec, + account_pubkeys: &mut Vec, + owners_table: &mut AccountOwnerTable, + account: &StoredAccountMeta, + ) -> std::io::Result<()> { + let owner_local_id = owners_table.check_and_add(account.owner()); + let optional_fields = AccountMetaOptionalFields { + rent_epoch: Some(account.rent_epoch()), + account_hash: Some(*account.hash()), + write_version_obsolete: Some(account.write_version()), + }; + + let mut meta = T::new(); + meta.with_lamports(account.lamports()) + .with_block_offset(*cursor) + .with_owner_local_id(owner_local_id) + .with_uncompressed_data_size(account.data_len()) + .with_intra_block_offset(0) + .with_flags( + AccountMetaFlags::new() + .with_bit(AccountMetaFlags::EXECUTABLE, account.executable()) + .to_value(), + ) + .with_optional_fields(&optional_fields); + println!("cursor = {}", *cursor); + println!( + "data_len = {}, padding = {}", + account.data_len(), + meta.padding_bytes() + ); + + let mut writer = AccountDataBlockWriter::new(footer.data_block_format); + writer.write(account.data(), account.data_len() as usize)?; + if meta.padding_bytes() > 0 { + let padding = [0u8; 8]; + writer.write(&padding, meta.padding_bytes() as usize)?; + } + optional_fields.write(&mut writer)?; + + let (data, _uncompressed_len) = writer.finish().unwrap(); + println!("_uncompressed_len = {}", _uncompressed_len); + let compressed_length = data.len(); + self.storage.write_bytes(&data)?; + + account_metas.push(meta); + account_pubkeys.push(*account.pubkey()); + + *cursor += compressed_length as u64; + println!("cursor = {}", *cursor); + footer.account_meta_count += 1; + + Ok(()) + } + + #[allow(dead_code)] + pub fn write_from_append_vec(&self, append_vec: &AppendVec) -> std::io::Result<()> { + let mut footer = TieredStorageFooter::new(); + // TODO(yhchiang): make it configerable + footer.account_meta_format = self.format.account_meta_format.clone(); + footer.data_block_format = self.format.data_block_format.clone(); + footer.format_version = ACCOUNTS_DATA_STORAGE_FORMAT_VERSION; + let mut cursor = 0; + let mut account_pubkeys: Vec = vec![]; + let mut owners_table = AccountOwnerTable::new(); + let mut hash: Hash = Hash::new_unique(); + + match footer.account_meta_format { + AccountMetaFormat::Hot => { + let mut account_metas = Vec::::new(); + self.write_account_data_blocks( + &mut cursor, + &mut footer, + &mut account_metas, + &mut account_pubkeys, + &mut owners_table, + &mut hash, + &append_vec, + )?; + self.write_account_metas_block(&mut cursor, &mut footer, &account_metas)?; + } + AccountMetaFormat::Cold => { + let mut account_metas = Vec::::new(); + self.write_account_data_blocks( + &mut cursor, + &mut footer, + &mut account_metas, + &mut account_pubkeys, + &mut owners_table, + &mut hash, + &append_vec, + )?; + self.write_account_metas_block(&mut cursor, &mut footer, &account_metas)?; + } + } + self.write_account_pubkeys_block(&mut cursor, &mut footer, &account_pubkeys)?; + self.write_owners_block(&mut cursor, &mut footer, &owners_table.owners_vec)?; + + assert_eq!( + self.format.meta_entry_size as u32, + footer.account_meta_entry_size + ); + footer.write_footer_block(&self.storage)?; + + Ok(()) + } + + #[allow(dead_code)] + fn write_account_data_blocks( + &self, + cursor: &mut u64, + footer: &mut TieredStorageFooter, + account_metas: &mut Vec, + account_pubkeys: &mut Vec, + owners_table: &mut AccountOwnerTable, + // TODO(yhchiang): update hash + _hash: &mut Hash, + append_vec: &AppendVec, + ) -> std::io::Result<()> { + let mut offset = 0; + footer.account_data_block_size = ACCOUNT_DATA_BLOCK_SIZE as u64; + + let mut buffered_account_metas = Vec::::new(); + let mut buffered_account_pubkeys: Vec = vec![]; + let mut data_block_writer = self.new_data_block_writer(footer); + + while let Some((account, next_offset)) = append_vec.get_account(offset) { + offset = next_offset; + data_block_writer = self.write_stored_account_meta( + &account, + cursor, + footer, + account_metas, + account_pubkeys, + owners_table, + data_block_writer, + &mut buffered_account_metas, + &mut buffered_account_pubkeys, + _hash, + )?; + } + + // Persist the last block if any + if buffered_account_metas.len() > 0 { + self.flush_account_data_block( + cursor, + footer, + account_metas, + account_pubkeys, + &mut buffered_account_metas, + &mut buffered_account_pubkeys, + data_block_writer, + )?; + } + + assert_eq!(buffered_account_metas.len(), 0); + assert_eq!(buffered_account_pubkeys.len(), 0); + assert_eq!(footer.account_meta_count, account_metas.len() as u32); + + Ok(()) + } +} diff --git a/sdk/src/account.rs b/sdk/src/account.rs index fca8f8b71f5165..3e61018cca235f 100644 --- a/sdk/src/account.rs +++ b/sdk/src/account.rs @@ -103,15 +103,15 @@ impl Serialize for AccountSharedData { #[serde(from = "Account")] pub struct AccountSharedData { /// lamports in the account - lamports: u64, + pub lamports: u64, /// data held in this account - data: Arc>, + pub data: Arc>, /// the program that owns this account. If executable, the program that loads this account. - owner: Pubkey, + pub owner: Pubkey, /// this account's data contains a loaded program (and is now read-only) - executable: bool, + pub executable: bool, /// the epoch at which this account will next owe rent - rent_epoch: Epoch, + pub rent_epoch: Epoch, } /// Compares two ReadableAccounts