From 5b5271a33aa01391dfdef48c0453b990c866e088 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 24 Aug 2023 17:34:41 +0800 Subject: [PATCH 01/89] minor changes to benchmarking scripts --- scripts/local_bench.tmp.py | 2 +- scripts/local_client.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 714eac67..6f1ceb78 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -110,7 +110,7 @@ def bench_round(protocol, num_replicas, value_size, put_ratio, length_s): if __name__ == "__main__": do_cargo_build() - for num_replicas in (3, 7): + for num_replicas in (3, 5, 7): for value_size in (1024, 65536, 4194304): for protocol in ("MultiPaxos", "RSPaxos"): bench_round(protocol, num_replicas, value_size, 100, 60) diff --git a/scripts/local_client.py b/scripts/local_client.py index c0f46adf..04347398 100644 --- a/scripts/local_client.py +++ b/scripts/local_client.py @@ -71,7 +71,7 @@ def compose_client_cmd(protocol, manager, config, utility, params, release): # if in benchmarking mode, lower the client's CPU scheduling priority if utility == "bench": - cmd = ["nice", "-n", "15"] + cmd + cmd = ["nice", "-n", "19"] + cmd return cmd From 91d8778aae6902509ea1b3b434c07831308d44da Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 24 Aug 2023 20:06:10 +0800 Subject: [PATCH 02/89] minor updates to README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 3e2fe00a..3ccadae9 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,7 @@ Summerset is a distributed key-value store supporting a wide range of state mach | `RepNothing` | Simplest protocol w/o any replication | | `SimplePush` | Pushing to peers w/o any consistency guarantees | | `MultiPaxos` | Classic [MultiPaxos](https://www.microsoft.com/en-us/research/uploads/prod/2016/12/paxos-simple-Copy.pdf) protocol | +| `RS-Paxos` | MultiPaxos w/ Reed-Solomon erasure code sharding | Formal TLA+ specification of some protocols are provided in `tla+/`. 
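Note on the RS-Paxos row added above, whose idea is shared by the Crossword protocol introduced in the next patch: each request batch is Reed-Solomon encoded into a number of data shards equal to the majority quorum size plus parity shards for the remaining replicas, so any quorum-sized subset of shards can recover the batch. Below is a minimal standalone sketch of that encode/reconstruct cycle using the `reed_solomon_erasure` crate these protocols build on; it is not Summerset code, and the 5-replica / 3-quorum setup and toy 12-byte value are illustrative assumptions only.

    use reed_solomon_erasure::galois_8::ReedSolomon;

    fn main() {
        // population = 5 replicas => majority quorum = 3,
        // hence 3 data shards + 2 parity shards
        let rs = ReedSolomon::new(3, 2).unwrap();

        // a 12-byte "value" split into 3 equal data shards,
        // followed by 2 zeroed parity shards to be filled in
        let mut shards: Vec<Vec<u8>> = vec![
            b"aaaa".to_vec(),
            b"bbbb".to_vec(),
            b"cccc".to_vec(),
            vec![0u8; 4],
            vec![0u8; 4],
        ];
        rs.encode(&mut shards).unwrap(); // computes the 2 parity shards

        // lose any 2 shards (e.g., two replicas are unreachable)...
        let mut partial: Vec<Option<Vec<u8>>> =
            shards.into_iter().map(Some).collect();
        partial[1] = None;
        partial[4] = None;

        // ...any 3 surviving shards still reconstruct the original data
        rs.reconstruct(&mut partial).unwrap();
        assert_eq!(partial[1].as_deref(), Some(&b"bbbb"[..]));
    }
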
From 7979ddac1663a9bccc03a598f5d67bd4fb6580cd Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Mon, 28 Aug 2023 16:54:27 +0800 Subject: [PATCH 03/89] add very basic crossword impl --- scripts/local_client.py | 1 + scripts/local_cluster.py | 1 + src/lib.rs | 2 + src/protocols/crossword.rs | 1410 ++++++++++++++++++++++++++++++++++++ src/protocols/mod.rs | 18 + src/protocols/rs_paxos.rs | 2 +- 6 files changed, 1433 insertions(+), 1 deletion(-) create mode 100644 src/protocols/crossword.rs diff --git a/scripts/local_client.py b/scripts/local_client.py index 04347398..2f9c2c6d 100644 --- a/scripts/local_client.py +++ b/scripts/local_client.py @@ -26,6 +26,7 @@ def run_process(cmd): "SimplePush": "", "MultiPaxos": "", "RSPaxos": "", + "Crossword": "", } diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index a1d33351..b7dcdb25 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -40,6 +40,7 @@ def kill_all_matching(name): "SimplePush": lambda r, n: f"backer_path='/tmp/summerset.simple_push.{r}.wal'+rep_degree={n-1}", "MultiPaxos": lambda r, n: f"backer_path='/tmp/summerset.multipaxos.{r}.wal'", "RSPaxos": lambda r, n: f"backer_path='/tmp/summerset.rs_paxos.{r}.wal'+fault_tolerance={n-(n//2+1)}", + "Crossword": lambda r, n: f"backer_path='/tmp/summerset.rs_paxos.{r}.wal'+fault_tolerance=0+shards_per_replica=3", } diff --git a/src/lib.rs b/src/lib.rs index f5cf4126..40bcbf31 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -35,3 +35,5 @@ pub use crate::protocols::SmrProtocol; pub use crate::protocols::{ReplicaConfigRepNothing, ClientConfigRepNothing}; pub use crate::protocols::{ReplicaConfigSimplePush, ClientConfigSimplePush}; pub use crate::protocols::{ReplicaConfigMultiPaxos, ClientConfigMultiPaxos}; +pub use crate::protocols::{ReplicaConfigRSPaxos, ClientConfigRSPaxos}; +pub use crate::protocols::{ReplicaConfigCrossword, ClientConfigCrossword}; diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs new file mode 100644 index 00000000..dfe647ea --- /dev/null +++ b/src/protocols/crossword.rs @@ -0,0 +1,1410 @@ +//! Replication protocol: Crossword. +//! +//! MultiPaxos with flexible Reed-Solomon erasure coding that supports tunable +//! shard groups and asymmetric shard assignment. + +use std::collections::{HashMap, HashSet}; +use std::path::Path; +use std::net::SocketAddr; + +use crate::utils::{SummersetError, ReplicaMap, RSCodeword}; +use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; +use crate::server::{ + ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, + ApiRequest, ApiReply, StorageHub, LogAction, LogResult, LogActionId, + TransportHub, GenericReplica, +}; +use crate::client::{ClientId, ClientApiStub, ClientCtrlStub, GenericEndpoint}; +use crate::protocols::SmrProtocol; + +use async_trait::async_trait; + +use serde::{Serialize, Deserialize}; + +use tokio::time::Duration; + +use reed_solomon_erasure::galois_8::ReedSolomon; + +/// Configuration parameters struct. +#[derive(Debug, Deserialize)] +pub struct ReplicaConfigCrossword { + /// Client request batching interval in microsecs. + pub batch_interval_us: u64, + + /// Client request batching maximum batch size. + pub max_batch_size: usize, + + /// Path to backing file. + pub backer_path: String, + + /// Whether to call `fsync()`/`fdatasync()` on logger. + pub logger_sync: bool, + + /// Fault-tolerance level. + pub fault_tolerance: u8, + + /// Number of shards to assign to each replica. + // TODO: proper config options. 
+ pub shards_per_replica: u8, +} + +#[allow(clippy::derivable_impls)] +impl Default for ReplicaConfigCrossword { + fn default() -> Self { + ReplicaConfigCrossword { + batch_interval_us: 1000, + max_batch_size: 5000, + backer_path: "/tmp/summerset.rs_paxos.wal".into(), + logger_sync: false, + fault_tolerance: 0, + shards_per_replica: 1, + } + } +} + +/// Ballot number type. Use 0 as a null ballot number. +type Ballot = u64; + +/// Instance status enum. +#[derive( + Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Serialize, Deserialize, +)] +enum Status { + Null = 0, + Preparing = 1, + Accepting = 2, + Committed = 3, + Executed = 4, +} + +/// Request batch type (i.e., the "value" in Paxos). +type ReqBatch = Vec<(ClientId, ApiRequest)>; + +/// Leader-side bookkeeping info for each instance initiated. +#[derive(Debug, Clone)] +struct LeaderBookkeeping { + /// Replicas from which I have received Prepare confirmations. + prepare_acks: ReplicaMap, + + /// Max ballot among received Prepare replies. + prepare_max_bal: Ballot, + + /// Replicas from which I have received Accept confirmations. + accept_acks: ReplicaMap, +} + +/// Follower-side bookkeeping info for each instance received. +#[derive(Debug, Clone)] +struct ReplicaBookkeeping { + /// Source leader replica ID for replyiing to Prepares and Accepts. + source: ReplicaId, +} + +/// In-memory instance containing a complete commands batch. +#[derive(Debug, Clone)] +struct Instance { + /// Ballot number. + bal: Ballot, + + /// Instance status. + status: Status, + + /// Shards of a batch of client requests. + reqs_cw: RSCodeword, + + /// Leader-side bookkeeping info. + leader_bk: Option, + + /// Follower-side bookkeeping info. + replica_bk: Option, +} + +/// Stable storage log entry type. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +enum LogEntry { + /// Records an update to the largest prepare ballot seen. + PrepareBal { slot: usize, ballot: Ballot }, + + /// Records a newly accepted request batch data shards at slot index. + AcceptData { + slot: usize, + ballot: Ballot, + reqs_cw: RSCodeword, + }, + + /// Records an event of committing the instance at index. + CommitSlot { slot: usize }, +} + +/// Peer-peer message type. +#[derive(Debug, Clone, Serialize, Deserialize)] +enum PeerMsg { + /// Prepare message from leader to replicas. + Prepare { slot: usize, ballot: Ballot }, + + /// Prepare reply from replica to leader. + PrepareReply { + slot: usize, + ballot: Ballot, + /// The accepted ballot number for that instance and the corresponding + /// request batch value shards known by replica. + voted: Option<(Ballot, RSCodeword)>, + }, + + /// Accept message from leader to replicas. + Accept { + slot: usize, + ballot: Ballot, + reqs_cw: RSCodeword, + }, + + /// Accept reply from replica to leader. + AcceptReply { slot: usize, ballot: Ballot }, + + /// Commit notification from leader to replicas. + Commit { slot: usize }, +} + +/// Crossword server replica module. +pub struct CrosswordReplica { + /// Replica ID in cluster. + id: ReplicaId, + + /// Total number of replicas in cluster. + population: u8, + + /// Majority quorum size. + quorum_cnt: u8, + + /// Configuration parameters struct. + config: ReplicaConfigCrossword, + + /// Address string for client requests API. + _api_addr: SocketAddr, + + /// Address string for internal peer-peer communication. + _p2p_addr: SocketAddr, + + /// ControlHub module. + control_hub: ControlHub, + + /// ExternalApi module. + external_api: ExternalApi, + + /// StateMachine module. 
+ state_machine: StateMachine, + + /// StorageHub module. + storage_hub: StorageHub, + + /// TransportHub module. + transport_hub: TransportHub, + + /// Do I think I am the leader? + is_leader: bool, + + /// In-memory log of instances. + insts: Vec, + + /// Largest ballot number that a leader has sent Prepare messages in. + bal_prep_sent: Ballot, + + /// Largest ballot number that a leader knows has been safely prepared. + bal_prepared: Ballot, + + /// Largest ballot number seen as acceptor. + bal_max_seen: Ballot, + + /// Index of the first non-committed instance. + commit_bar: usize, + + /// Index of the first non-executed instance. + /// It is always true that exec_bar <= commit_bar <= insts.len() + exec_bar: usize, + + /// Current durable log file offset. + log_offset: usize, + + /// Fixed Reed-Solomon coder. + rs_coder: ReedSolomon, +} + +impl CrosswordReplica { + /// Compose a unique ballot number from base. + fn make_unique_ballot(&self, base: u64) -> Ballot { + ((base << 8) | ((self.id + 1) as u64)) as Ballot + } + + /// Compose a unique ballot number greater than the given one. + fn make_greater_ballot(&self, bal: Ballot) -> Ballot { + self.make_unique_ballot((bal >> 8) + 1) + } + + /// Compose LogActionId from slot index & entry type. + /// Uses the `Status` enum type to represent differnet entry types. + fn make_log_action_id(slot: usize, entry_type: Status) -> LogActionId { + let type_num = match entry_type { + Status::Preparing => 1, + Status::Accepting => 2, + Status::Committed => 3, + _ => panic!("unknown log entry type {:?}", entry_type), + }; + ((slot << 2) | type_num) as LogActionId + } + + /// Decompose LogActionId into slot index & entry type. + fn split_log_action_id(log_action_id: LogActionId) -> (usize, Status) { + let slot = (log_action_id >> 2) as usize; + let type_num = log_action_id & ((1 << 2) - 1); + let entry_type = match type_num { + 1 => Status::Preparing, + 2 => Status::Accepting, + 3 => Status::Committed, + _ => panic!("unknown log entry type num {}", type_num), + }; + (slot, entry_type) + } + + /// Compose CommandId from slot index & command index within. + fn make_command_id(slot: usize, cmd_idx: usize) -> CommandId { + assert!(slot <= (u32::MAX as usize)); + assert!(cmd_idx <= (u32::MAX as usize)); + ((slot << 32) | cmd_idx) as CommandId + } + + /// Decompose CommandId into slot index & command index within. + fn split_command_id(command_id: CommandId) -> (usize, usize) { + let slot = (command_id >> 32) as usize; + let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; + (slot, cmd_idx) + } + + /// TODO: maybe remove this. + fn shards_for_replica( + id: ReplicaId, + population: u8, + num_shards: u8, + ) -> HashSet { + (id..(id + num_shards)) + .map(|i| (i % population) as usize) + .collect() + } + + /// Handler of client request batch chan recv. + fn handle_req_batch( + &mut self, + req_batch: ReqBatch, + ) -> Result<(), SummersetError> { + let batch_size = req_batch.len(); + assert!(batch_size > 0); + pf_debug!(self.id; "got request batch of size {}", batch_size); + + // if I'm not a leader, ignore client requests + if !self.is_leader { + for (client, req) in req_batch { + if let ApiRequest::Req { id: req_id, .. 
} = req { + // tell the client to try on the next replica + let next_replica = (self.id + 1) % self.population; + self.external_api.send_reply( + ApiReply::Reply { + id: req_id, + result: None, + redirect: Some(next_replica), + }, + client, + )?; + pf_trace!(self.id; "redirected client {} to replica {}", + client, next_replica); + } + } + return Ok(()); + } + + // compute the complete Reed-Solomon codeword for the batch data + let mut reqs_cw = RSCodeword::from_data( + req_batch, + self.quorum_cnt as usize, + (self.population - self.quorum_cnt) as usize, + )?; + reqs_cw.compute_parity(Some(&self.rs_coder))?; + + // create a new instance in the first null slot (or append a new one + // at the end if no holes exist) + // TODO: maybe use a null_idx variable to better keep track of this + let mut slot = self.insts.len(); + for s in self.commit_bar..self.insts.len() { + if self.insts[s].status == Status::Null { + slot = s; + break; + } + } + if slot < self.insts.len() { + let old_inst = &mut self.insts[slot]; + assert_eq!(old_inst.status, Status::Null); + old_inst.reqs_cw = reqs_cw; + old_inst.leader_bk = Some(LeaderBookkeeping { + prepare_acks: ReplicaMap::new(self.population, false), + prepare_max_bal: 0, + accept_acks: ReplicaMap::new(self.population, false), + }); + } else { + let new_inst = Instance { + bal: 0, + status: Status::Null, + reqs_cw, + leader_bk: Some(LeaderBookkeeping { + prepare_acks: ReplicaMap::new(self.population, false), + prepare_max_bal: 0, + accept_acks: ReplicaMap::new(self.population, false), + }), + replica_bk: None, + }; + self.insts.push(new_inst); + } + + // decide whether we can enter fast path for this instance + // TODO: remember to reset bal_prepared to 0, update bal_max_seen, + // and re-handle all Preparing & Accepting instances in autonomous + // Prepare initiation + if self.bal_prepared == 0 { + // slow case: Prepare phase not done yet. 
Initiate a Prepare round + // if none is on the fly, or just wait for some Prepare reply to + // trigger my Accept phase + if self.bal_prep_sent == 0 { + self.bal_prep_sent = + self.make_greater_ballot(self.bal_max_seen); + self.bal_max_seen = self.bal_prep_sent; + } + + let inst = &mut self.insts[slot]; + inst.bal = self.bal_prep_sent; + inst.status = Status::Preparing; + pf_debug!(self.id; "enter Prepare phase for slot {} bal {}", + slot, inst.bal); + + // record update to largest prepare ballot + self.storage_hub.submit_action( + Self::make_log_action_id(slot, Status::Preparing), + LogAction::Append { + entry: LogEntry::PrepareBal { + slot, + ballot: self.bal_prep_sent, + }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted PrepareBal log action for slot {} bal {}", + slot, inst.bal); + + // send Prepare messages to all peers + self.transport_hub.bcast_msg( + PeerMsg::Prepare { + slot, + ballot: self.bal_prep_sent, + }, + None, + )?; + pf_trace!(self.id; "broadcast Prepare messages for slot {} bal {}", + slot, inst.bal); + } else { + // normal case: Prepare phase covered, only do the Accept phase + let inst = &mut self.insts[slot]; + inst.bal = self.bal_prepared; + inst.status = Status::Accepting; + pf_debug!(self.id; "enter Accept phase for slot {} bal {}", + slot, inst.bal); + + // record update to largest accepted ballot and corresponding data + self.storage_hub.submit_action( + Self::make_log_action_id(slot, Status::Accepting), + LogAction::Append { + entry: LogEntry::AcceptData { + slot, + ballot: inst.bal, + // persist only some shards on myself + reqs_cw: inst.reqs_cw.subset_copy( + Self::shards_for_replica( + self.id, + self.population, + self.config.shards_per_replica, + ), + false, + )?, + }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted AcceptData log action for slot {} bal {}", + slot, inst.bal); + + // send Accept messages to all peers, each getting its subset of + // shards of data + for peer in 0..self.population { + if peer == self.id { + continue; + } + self.transport_hub.send_msg( + PeerMsg::Accept { + slot, + ballot: inst.bal, + reqs_cw: inst.reqs_cw.subset_copy( + Self::shards_for_replica( + peer, + self.population, + self.config.shards_per_replica, + ), + false, + )?, + }, + peer, + )?; + } + pf_trace!(self.id; "broadcast Accept messages for slot {} bal {}", + slot, inst.bal); + } + + Ok(()) + } + + /// Handler of PrepareBal logging result chan recv. 
+ fn handle_logged_prepare_bal( + &mut self, + slot: usize, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "finished PrepareBal logging for slot {} bal {}", + slot, self.insts[slot].bal); + let inst = &self.insts[slot]; + let voted = if inst.status >= Status::Accepting { + Some((inst.bal, inst.reqs_cw.clone())) + } else { + None + }; + + if self.is_leader { + // on leader, finishing the logging of a PrepareBal entry + // is equivalent to receiving a Prepare reply from myself + // (as an acceptor role) + self.handle_msg_prepare_reply(self.id, slot, inst.bal, voted)?; + } else { + // on follower replica, finishing the logging of a + // PrepareBal entry leads to sending back a Prepare reply + assert!(inst.replica_bk.is_some()); + let source = inst.replica_bk.as_ref().unwrap().source; + self.transport_hub.send_msg( + PeerMsg::PrepareReply { + slot, + ballot: inst.bal, + voted, + }, + source, + )?; + pf_trace!(self.id; "sent PrepareReply -> {} for slot {} bal {}", + source, slot, inst.bal); + } + + Ok(()) + } + + /// Handler of AcceptData logging result chan recv. + fn handle_logged_accept_data( + &mut self, + slot: usize, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "finished AcceptData logging for slot {} bal {}", + slot, self.insts[slot].bal); + let inst = &self.insts[slot]; + + if self.is_leader { + // on leader, finishing the logging of an AcceptData entry + // is equivalent to receiving an Accept reply from myself + // (as an acceptor role) + self.handle_msg_accept_reply(self.id, slot, inst.bal)?; + } else { + // on follower replica, finishing the logging of an + // AcceptData entry leads to sending back an Accept reply + assert!(inst.replica_bk.is_some()); + let source = inst.replica_bk.as_ref().unwrap().source; + self.transport_hub.send_msg( + PeerMsg::AcceptReply { + slot, + ballot: inst.bal, + }, + source, + )?; + pf_trace!(self.id; "sent AcceptReply -> {} for slot {} bal {}", + source, slot, inst.bal); + } + + Ok(()) + } + + /// Handler of CommitSlot logging result chan recv. + fn handle_logged_commit_slot( + &mut self, + slot: usize, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "finished CommitSlot logging for slot {} bal {}", + slot, self.insts[slot].bal); + assert!(self.insts[slot].status >= Status::Committed); + + // update index of the first non-committed instance + if slot == self.commit_bar { + while self.commit_bar < self.insts.len() { + let inst = &mut self.insts[self.commit_bar]; + if inst.status < Status::Committed { + break; + } + + if inst.reqs_cw.avail_shards() < self.quorum_cnt as usize { + // can't execute if I don't have the complete request batch + pf_debug!(self.id; "postponing execution for slot {} (shards {}/{})", + slot, inst.reqs_cw.avail_shards(), self.quorum_cnt); + break; + } else if inst.reqs_cw.avail_data_shards() + < self.quorum_cnt as usize + { + // have enough shards but need reconstruction + inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; + } + let reqs = inst.reqs_cw.get_data()?; + + // submit commands in committed instance to the state machine + // for execution + if reqs.is_empty() { + inst.status = Status::Executed; + } else if inst.status == Status::Committed { + for (cmd_idx, (_, req)) in reqs.iter().enumerate() { + if let ApiRequest::Req { cmd, .. 
} = req { + self.state_machine.submit_cmd( + Self::make_command_id(self.commit_bar, cmd_idx), + cmd.clone(), + )?; + } else { + continue; // ignore other types of requests + } + } + pf_trace!(self.id; "submitted {} exec commands for slot {}", + reqs.len(), self.commit_bar); + } + + self.commit_bar += 1; + } + } + + Ok(()) + } + + /// Synthesized handler of durable logging result chan recv. + fn handle_log_result( + &mut self, + action_id: LogActionId, + log_result: LogResult, + ) -> Result<(), SummersetError> { + let (slot, entry_type) = Self::split_log_action_id(action_id); + assert!(slot < self.insts.len()); + + if let LogResult::Append { now_size } = log_result { + assert!(now_size >= self.log_offset); + self.log_offset = now_size; + } else { + return logged_err!(self.id; "unexpected log result type: {:?}", log_result); + } + + match entry_type { + Status::Preparing => self.handle_logged_prepare_bal(slot), + Status::Accepting => self.handle_logged_accept_data(slot), + Status::Committed => self.handle_logged_commit_slot(slot), + _ => { + logged_err!(self.id; "unexpected log entry type: {:?}", entry_type) + } + } + } + + /// Handler of Prepare message from leader. + fn handle_msg_prepare( + &mut self, + peer: ReplicaId, + slot: usize, + ballot: Ballot, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received Prepare <- {} for slot {} bal {}", + peer, slot, ballot); + + // if ballot is not smaller than what I have seen: + if ballot >= self.bal_max_seen { + // locate instance in memory, filling in null instances if needed + while self.insts.len() <= slot { + self.insts.push(Instance { + bal: 0, + status: Status::Null, + reqs_cw: RSCodeword::::from_null( + self.quorum_cnt as usize, + (self.population - self.quorum_cnt) as usize, + )?, + leader_bk: None, + replica_bk: None, + }); + } + let inst = &mut self.insts[slot]; + assert!(inst.bal <= ballot); + + inst.bal = ballot; + inst.status = Status::Preparing; + inst.replica_bk = Some(ReplicaBookkeeping { source: peer }); + + // update largest ballot seen + self.bal_max_seen = ballot; + + // record update to largest prepare ballot + self.storage_hub.submit_action( + Self::make_log_action_id(slot, Status::Preparing), + LogAction::Append { + entry: LogEntry::PrepareBal { slot, ballot }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted PrepareBal log action for slot {} bal {}", + slot, ballot); + } + + Ok(()) + } + + /// Handler of Prepare reply from replica. + fn handle_msg_prepare_reply( + &mut self, + peer: ReplicaId, + slot: usize, + ballot: Ballot, + voted: Option<(Ballot, RSCodeword)>, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received PrepareReply <- {} for slot {} bal {} shards {:?}", + peer, slot, ballot, + voted.as_ref().map(|(_, cw)| cw.avail_shards_set())); + + // if ballot is what I'm currently waiting on for Prepare replies: + if ballot == self.bal_prep_sent { + assert!(slot < self.insts.len()); + let inst = &mut self.insts[slot]; + + // ignore spurious duplications and outdated replies + if (inst.status != Status::Preparing) || (ballot < inst.bal) { + return Ok(()); + } + assert_eq!(inst.bal, ballot); + assert!(self.bal_max_seen >= ballot); + assert!(inst.leader_bk.is_some()); + let leader_bk = inst.leader_bk.as_mut().unwrap(); + if leader_bk.prepare_acks.get(peer)? 
{ + return Ok(()); + } + + // bookkeep this Prepare reply + leader_bk.prepare_acks.set(peer, true)?; + if let Some((bal, val)) = voted { + #[allow(clippy::comparison_chain)] + if bal > leader_bk.prepare_max_bal { + // is of ballot > current maximum, so discard the current + // codeword and take the replied codeword + leader_bk.prepare_max_bal = bal; + inst.reqs_cw = val; + } else if bal == leader_bk.prepare_max_bal { + // is of ballot == the one currently taken, so merge the + // replied codeword into the current one + inst.reqs_cw.absorb_other(val)?; + } + } + + // if quorum size reached AND enough shards are known to + // reconstruct the original data, enter Accept phase for this + // instance using the request batch value constructed using shards + // with the highest ballot number in quorum + if leader_bk.prepare_acks.count() >= self.quorum_cnt + && inst.reqs_cw.avail_data_shards() >= self.quorum_cnt as usize + { + inst.status = Status::Accepting; + pf_debug!(self.id; "enter Accept phase for slot {} bal {}", + slot, inst.bal); + + // update bal_prepared + assert!(self.bal_prepared <= ballot); + self.bal_prepared = ballot; + + // if parity shards not computed yet, compute them now + if inst.reqs_cw.avail_shards() < self.population as usize { + inst.reqs_cw.compute_parity(Some(&self.rs_coder))?; + } + + // record update to largest accepted ballot and corresponding data + self.storage_hub.submit_action( + Self::make_log_action_id(slot, Status::Accepting), + LogAction::Append { + entry: LogEntry::AcceptData { + slot, + ballot, + reqs_cw: inst.reqs_cw.subset_copy( + Self::shards_for_replica( + self.id, + self.population, + self.config.shards_per_replica, + ), + false, + )?, + }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted AcceptData log action for slot {} bal {}", + slot, ballot); + + // send Accept messages to all peers + for peer in 0..self.population { + if peer == self.id { + continue; + } + self.transport_hub.send_msg( + PeerMsg::Accept { + slot, + ballot, + reqs_cw: inst.reqs_cw.subset_copy( + Self::shards_for_replica( + peer, + self.population, + self.config.shards_per_replica, + ), + false, + )?, + }, + peer, + )?; + } + pf_trace!(self.id; "broadcast Accept messages for slot {} bal {}", + slot, ballot); + } + } + + Ok(()) + } + + /// Handler of Accept message from leader. 
+ fn handle_msg_accept( + &mut self, + peer: ReplicaId, + slot: usize, + ballot: Ballot, + reqs_cw: RSCodeword, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received Accept <- {} for slot {} bal {} shards {:?}", + peer, slot, ballot, reqs_cw.avail_shards_set()); + + // if ballot is not smaller than what I have made promises for: + if ballot >= self.bal_max_seen { + // locate instance in memory, filling in null instances if needed + while self.insts.len() <= slot { + self.insts.push(Instance { + bal: 0, + status: Status::Null, + reqs_cw: RSCodeword::::from_null( + self.quorum_cnt as usize, + (self.population - self.quorum_cnt) as usize, + )?, + leader_bk: None, + replica_bk: None, + }); + } + let inst = &mut self.insts[slot]; + assert!(inst.bal <= ballot); + + inst.bal = ballot; + inst.status = Status::Accepting; + inst.reqs_cw = reqs_cw; + inst.replica_bk = Some(ReplicaBookkeeping { source: peer }); + + // update largest ballot seen + self.bal_max_seen = ballot; + + // record update to largest prepare ballot + self.storage_hub.submit_action( + Self::make_log_action_id(slot, Status::Accepting), + LogAction::Append { + entry: LogEntry::AcceptData { + slot, + ballot, + reqs_cw: inst.reqs_cw.clone(), + }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted AcceptData log action for slot {} bal {}", + slot, ballot); + } + + Ok(()) + } + + /// Handler of Accept reply from replica. + fn handle_msg_accept_reply( + &mut self, + peer: ReplicaId, + slot: usize, + ballot: Ballot, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received AcceptReply <- {} for slot {} bal {}", + peer, slot, ballot); + + // if ballot is what I'm currently waiting on for Accept replies: + if ballot == self.bal_prepared { + assert!(slot < self.insts.len()); + let inst = &mut self.insts[slot]; + + // ignore spurious duplications and outdated replies + if (inst.status != Status::Accepting) || (ballot < inst.bal) { + return Ok(()); + } + assert_eq!(inst.bal, ballot); + assert!(self.bal_max_seen >= ballot); + assert!(inst.leader_bk.is_some()); + let leader_bk = inst.leader_bk.as_mut().unwrap(); + if leader_bk.accept_acks.get(peer)? { + return Ok(()); + } + + // bookkeep this Accept reply + leader_bk.accept_acks.set(peer, true)?; + + // if quorum size reached AND enough number of shards are + // remembered, mark this instance as committed; in RS-Paxos, this + // means accept_acks.count() >= self.quorum_cnt + fault_tolerance + if leader_bk.accept_acks.count() + >= self.quorum_cnt + self.config.fault_tolerance + { + inst.status = Status::Committed; + pf_debug!(self.id; "committed instance at slot {} bal {}", + slot, inst.bal); + + // record commit event + self.storage_hub.submit_action( + Self::make_log_action_id(slot, Status::Committed), + LogAction::Append { + entry: LogEntry::CommitSlot { slot }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted CommitSlot log action for slot {} bal {}", + slot, inst.bal); + + // send Commit messages to all peers + self.transport_hub + .bcast_msg(PeerMsg::Commit { slot }, None)?; + pf_trace!(self.id; "broadcast Commit messages for slot {} bal {}", + slot, ballot); + } + } + + Ok(()) + } + + /// Handler of Commit message from leader. 
+ /// TODO: take care of missing/lost Commit messages + fn handle_msg_commit( + &mut self, + peer: ReplicaId, + slot: usize, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received Commit <- {} for slot {}", peer, slot); + + // locate instance in memory, filling in null instances if needed + while self.insts.len() <= slot { + self.insts.push(Instance { + bal: 0, + status: Status::Null, + reqs_cw: RSCodeword::::from_null( + self.quorum_cnt as usize, + (self.population - self.quorum_cnt) as usize, + )?, + leader_bk: None, + replica_bk: None, + }); + } + let inst = &mut self.insts[slot]; + + // ignore spurious duplications + if inst.status != Status::Accepting { + return Ok(()); + } + + // mark this instance as committed + inst.status = Status::Committed; + pf_debug!(self.id; "committed instance at slot {} bal {}", + slot, inst.bal); + + // record commit event + self.storage_hub.submit_action( + Self::make_log_action_id(slot, Status::Committed), + LogAction::Append { + entry: LogEntry::CommitSlot { slot }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted CommitSlot log action for slot {} bal {}", + slot, inst.bal); + + Ok(()) + } + + /// Synthesized handler of receiving message from peer. + fn handle_msg_recv( + &mut self, + peer: ReplicaId, + msg: PeerMsg, + ) -> Result<(), SummersetError> { + match msg { + PeerMsg::Prepare { slot, ballot } => { + self.handle_msg_prepare(peer, slot, ballot) + } + PeerMsg::PrepareReply { + slot, + ballot, + voted, + } => self.handle_msg_prepare_reply(peer, slot, ballot, voted), + PeerMsg::Accept { + slot, + ballot, + reqs_cw, + } => self.handle_msg_accept(peer, slot, ballot, reqs_cw), + PeerMsg::AcceptReply { slot, ballot } => { + self.handle_msg_accept_reply(peer, slot, ballot) + } + PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), + } + } + + /// Handler of state machine exec result chan recv. + fn handle_cmd_result( + &mut self, + cmd_id: CommandId, + cmd_result: CommandResult, + ) -> Result<(), SummersetError> { + let (slot, cmd_idx) = Self::split_command_id(cmd_id); + assert!(slot < self.insts.len()); + pf_trace!(self.id; "executed cmd in instance at slot {} idx {}", + slot, cmd_idx); + + let inst = &mut self.insts[slot]; + let reqs = inst.reqs_cw.get_data()?; + assert!(cmd_idx < reqs.len()); + let (client, ref req) = reqs[cmd_idx]; + + // reply command result back to client + if let ApiRequest::Req { id: req_id, .. } = req { + if self.external_api.has_client(client) { + self.external_api.send_reply( + ApiReply::Reply { + id: *req_id, + result: Some(cmd_result), + redirect: None, + }, + client, + )?; + pf_trace!(self.id; "replied -> client {} for slot {} idx {}", + client, slot, cmd_idx); + } + } else { + return logged_err!(self.id; "unexpected API request type"); + } + + // if all commands in this instance have been executed, set status to + // Executed and update `exec_bar` + if cmd_idx == reqs.len() - 1 { + inst.status = Status::Executed; + pf_debug!(self.id; "executed all cmds in instance at slot {}", + slot); + + // update index of the first non-executed instance + if slot == self.exec_bar { + while self.exec_bar < self.insts.len() { + let inst = &mut self.insts[self.exec_bar]; + if inst.status < Status::Executed { + break; + } + self.exec_bar += 1; + } + } + } + + Ok(()) + } + + /// Synthesized handler of manager control messages. 
+ fn handle_ctrl_msg(&mut self, _msg: CtrlMsg) -> Result<(), SummersetError> { + // TODO: fill this when more control message types added + Ok(()) + } +} + +#[async_trait] +impl GenericReplica for CrosswordReplica { + async fn new_and_setup( + api_addr: SocketAddr, + p2p_addr: SocketAddr, + manager: SocketAddr, + config_str: Option<&str>, + ) -> Result { + let config = parsed_config!(config_str => ReplicaConfigCrossword; + batch_interval_us, max_batch_size, + backer_path, logger_sync, fault_tolerance, + shards_per_replica)?; + // connect to the cluster manager and get assigned a server ID + let mut control_hub = ControlHub::new_and_setup(manager).await?; + let id = control_hub.me; + + if config.batch_interval_us == 0 { + return logged_err!( + id; + "invalid config.batch_interval_us '{}'", + config.batch_interval_us + ); + } + + // ask for population number and the list of peers to proactively + // connect to + control_hub.send_ctrl(CtrlMsg::NewServerJoin { + id, + protocol: SmrProtocol::Crossword, + api_addr, + p2p_addr, + })?; + let (population, to_peers) = if let CtrlMsg::ConnectToPeers { + population, + to_peers, + } = control_hub.recv_ctrl().await? + { + (population, to_peers) + } else { + return logged_err!(id; "unexpected ctrl msg type received"); + }; + + // create a Reed-Solomon coder with num_data_shards == quorum size and + // num_parity shards == population - quorum + let quorum_cnt = (population / 2) + 1; + if config.fault_tolerance > (population - quorum_cnt) { + return logged_err!(id; "invalid config.fault_tolerance '{}'", + config.fault_tolerance); + } + if config.shards_per_replica == 0 + || config.shards_per_replica > quorum_cnt + { + return logged_err!(id; "invalid config.shards_per_replica '{}'", + config.shards_per_replica); + } + let rs_coder = ReedSolomon::new( + quorum_cnt as usize, + (population - quorum_cnt) as usize, + )?; + + let state_machine = StateMachine::new_and_setup(id).await?; + + let storage_hub = + StorageHub::new_and_setup(id, Path::new(&config.backer_path)) + .await?; + + let mut transport_hub = + TransportHub::new_and_setup(id, population, p2p_addr).await?; + + // proactively connect to some peers, then wait for all population + // have been connected with me + for (peer, addr) in to_peers { + transport_hub.connect_to_peer(peer, addr).await?; + } + transport_hub.wait_for_group(population).await?; + + let external_api = ExternalApi::new_and_setup( + id, + api_addr, + Duration::from_micros(config.batch_interval_us), + config.max_batch_size, + ) + .await?; + + Ok(CrosswordReplica { + id, + population, + quorum_cnt, + config, + _api_addr: api_addr, + _p2p_addr: p2p_addr, + control_hub, + external_api, + state_machine, + storage_hub, + transport_hub, + is_leader: false, + insts: vec![], + bal_prep_sent: 0, + bal_prepared: 0, + bal_max_seen: 0, + commit_bar: 0, + exec_bar: 0, + log_offset: 0, + rs_coder, + }) + } + + async fn run(&mut self) { + // TODO: proper leader election + if self.id == 0 { + self.is_leader = true; + } + + loop { + tokio::select! 
{ + // client request batch + req_batch = self.external_api.get_req_batch() => { + if let Err(e) = req_batch { + pf_error!(self.id; "error getting req batch: {}", e); + continue; + } + let req_batch = req_batch.unwrap(); + if let Err(e) = self.handle_req_batch(req_batch) { + pf_error!(self.id; "error handling req batch: {}", e); + } + }, + + // durable logging result + log_result = self.storage_hub.get_result() => { + if let Err(e) = log_result { + pf_error!(self.id; "error getting log result: {}", e); + continue; + } + let (action_id, log_result) = log_result.unwrap(); + if let Err(e) = self.handle_log_result(action_id, log_result) { + pf_error!(self.id; "error handling log result {}: {}", + action_id, e); + } + }, + + // message from peer + msg = self.transport_hub.recv_msg() => { + if let Err(e) = msg { + pf_error!(self.id; "error receiving peer msg: {}", e); + continue; + } + let (peer, msg) = msg.unwrap(); + if let Err(e) = self.handle_msg_recv(peer, msg) { + pf_error!(self.id; "error handling msg recv <- {}: {}", peer, e); + } + } + + // state machine execution result + cmd_result = self.state_machine.get_result() => { + if let Err(e) = cmd_result { + pf_error!(self.id; "error getting cmd result: {}", e); + continue; + } + let (cmd_id, cmd_result) = cmd_result.unwrap(); + if let Err(e) = self.handle_cmd_result(cmd_id, cmd_result) { + pf_error!(self.id; "error handling cmd result {}: {}", cmd_id, e); + } + }, + + // manager control message + ctrl_msg = self.control_hub.recv_ctrl() => { + if let Err(e) = ctrl_msg { + pf_error!(self.id; "error getting ctrl msg: {}", e); + continue; + } + let ctrl_msg = ctrl_msg.unwrap(); + if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { + pf_error!(self.id; "error handling ctrl msg: {}", e); + } + } + } + } + } +} + +/// Configuration parameters struct. +#[derive(Debug, Deserialize)] +pub struct ClientConfigCrossword { + /// Which server to pick initially. + pub init_server_id: ReplicaId, +} + +#[allow(clippy::derivable_impls)] +impl Default for ClientConfigCrossword { + fn default() -> Self { + ClientConfigCrossword { init_server_id: 0 } + } +} + +/// Crossword client-side module. +pub struct CrosswordClient { + /// Client ID. + id: ClientId, + + /// Address of the cluster manager oracle. + manager: SocketAddr, + + /// Configuration parameters struct. + _config: ClientConfigCrossword, + + /// Cached list of active servers information. + servers: HashMap, + + /// Current server ID to connect to. + server_id: ReplicaId, + + /// Control API stub to the cluster manager. + ctrl_stub: Option, + + /// API stubs for communicating with servers. 
+ api_stub: Option, +} + +#[async_trait] +impl GenericEndpoint for CrosswordClient { + fn new( + manager: SocketAddr, + config_str: Option<&str>, + ) -> Result { + let config = parsed_config!(config_str => ClientConfigCrossword; + init_server_id)?; + let init_server_id = config.init_server_id; + + Ok(CrosswordClient { + id: 255, // nil at this time + manager, + _config: config, + servers: HashMap::new(), + server_id: init_server_id, + ctrl_stub: None, + api_stub: None, + }) + } + + async fn connect(&mut self) -> Result { + // disallow reconnection without leaving + if self.api_stub.is_some() { + return logged_err!(self.id; "reconnecting without leaving"); + } + + // if ctrl_stubs not established yet, connect to the manager + if self.ctrl_stub.is_none() { + let ctrl_stub = + ClientCtrlStub::new_by_connect(self.manager).await?; + self.id = ctrl_stub.id; + self.ctrl_stub = Some(ctrl_stub); + } + let ctrl_stub = self.ctrl_stub.as_mut().unwrap(); + + // ask the manager about the list of active servers + let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + while !sent { + sent = ctrl_stub.send_req(None)?; + } + + let reply = ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::QueryInfo { servers } => { + // connect to the one with server ID in config + let api_stub = ClientApiStub::new_by_connect( + self.id, + servers[&self.server_id], + ) + .await?; + self.api_stub = Some(api_stub); + self.servers = servers; + Ok(self.id) + } + _ => logged_err!(self.id; "unexpected reply type received"), + } + } + + async fn leave(&mut self, permanent: bool) -> Result<(), SummersetError> { + // send leave notification to current connected server + if let Some(mut api_stub) = self.api_stub.take() { + let mut sent = api_stub.send_req(Some(&ApiRequest::Leave))?; + while !sent { + sent = api_stub.send_req(None)?; + } + + let reply = api_stub.recv_reply().await?; + match reply { + ApiReply::Leave => { + pf_info!(self.id; "left current server connection"); + api_stub.forget(); + } + _ => { + return logged_err!(self.id; "unexpected reply type received"); + } + } + } + + // if permanently leaving, send leave notification to the manager + if permanent { + // disallow multiple permanent leaving + if self.ctrl_stub.is_none() { + return logged_err!(self.id; "repeated permanent leaving"); + } + + if let Some(mut ctrl_stub) = self.ctrl_stub.take() { + let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; + while !sent { + sent = ctrl_stub.send_req(None)?; + } + + let reply = ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::Leave => { + pf_info!(self.id; "left current manager connection"); + ctrl_stub.forget(); + } + _ => { + return logged_err!(self.id; "unexpected reply type received"); + } + } + } + } + + Ok(()) + } + + fn send_req( + &mut self, + req: Option<&ApiRequest>, + ) -> Result { + match self.api_stub { + Some(ref mut api_stub) => api_stub.send_req(req), + None => logged_err!(self.id; "client is not set up"), + } + } + + async fn recv_reply(&mut self) -> Result { + match self.api_stub { + Some(ref mut api_stub) => { + let reply = api_stub.recv_reply().await?; + + if let ApiReply::Reply { + ref result, + ref redirect, + .. 
+ } = reply + { + // if the current server redirects me to a different server + if result.is_none() && redirect.is_some() { + let redirect_id = redirect.unwrap(); + assert!(self.servers.contains_key(&redirect_id)); + self.leave(false).await?; + self.server_id = redirect_id; + self.connect().await?; + pf_debug!(self.id; "redirected to replica {} '{}'", + redirect_id, self.servers[&redirect_id]); + } + } + + Ok(reply) + } + None => logged_err!(self.id; "client is not set up"), + } + } +} diff --git a/src/protocols/mod.rs b/src/protocols/mod.rs index 36d0ea1a..98ecf371 100644 --- a/src/protocols/mod.rs +++ b/src/protocols/mod.rs @@ -26,6 +26,10 @@ mod rs_paxos; use rs_paxos::{RSPaxosReplica, RSPaxosClient}; pub use rs_paxos::{ReplicaConfigRSPaxos, ClientConfigRSPaxos}; +mod crossword; +use crossword::{CrosswordReplica, CrosswordClient}; +pub use crossword::{ReplicaConfigCrossword, ClientConfigCrossword}; + /// Enum of supported replication protocol types. #[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] pub enum SmrProtocol { @@ -33,6 +37,7 @@ pub enum SmrProtocol { SimplePush, MultiPaxos, RSPaxos, + Crossword, } /// Helper macro for saving boilder-plate `Box` mapping in @@ -52,6 +57,7 @@ impl SmrProtocol { "SimplePush" => Some(Self::SimplePush), "MultiPaxos" => Some(Self::MultiPaxos), "RSPaxos" => Some(Self::RSPaxos), + "Crossword" => Some(Self::Crossword), _ => None, } } @@ -108,6 +114,14 @@ impl SmrProtocol { .await ) } + Self::Crossword => { + box_if_ok!( + CrosswordReplica::new_and_setup( + api_addr, p2p_addr, manager, config_str + ) + .await + ) + } } } @@ -130,6 +144,9 @@ impl SmrProtocol { Self::RSPaxos => { box_if_ok!(RSPaxosClient::new(manager, config_str)) } + Self::Crossword => { + box_if_ok!(CrosswordClient::new(manager, config_str)) + } } } } @@ -159,6 +176,7 @@ mod protocols_name_tests { valid_name_test!(SimplePush); valid_name_test!(MultiPaxos); valid_name_test!(RSPaxos); + valid_name_test!(Crossword); } #[test] diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 1dade1c0..e47993de 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -417,7 +417,7 @@ impl RSPaxosReplica { pf_trace!(self.id; "submitted AcceptData log action for slot {} bal {}", slot, inst.bal); - // send Accept messages to all peers, each getting on shard of data + // send Accept messages to all peers, each getting one shard of data for peer in 0..self.population { if peer == self.id { continue; From 2eb416c7b0452cb091f6ff43b1a2cfe6da6fa06c Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Tue, 29 Aug 2023 14:24:47 +0800 Subject: [PATCH 04/89] updates to benchmark scripts --- scripts/local_bench.tmp.py | 76 +++++++++++++++++++++++++++++------ scripts/local_client.py | 22 ++++------- scripts/local_cluster.py | 77 ++++++++++++++++++++++++++---------- scripts/set_tcp_buf_sizes.sh | 10 +++++ 4 files changed, 138 insertions(+), 47 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 6f1ceb78..c20ad6d5 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -1,5 +1,5 @@ import subprocess -import time +import itertools import statistics @@ -24,7 +24,7 @@ def kill_all_matching(name): proc.wait() -def launch_cluster(protocol, num_replicas): +def launch_cluster(protocol, num_replicas, config): cmd = [ "python3", "./scripts/local_cluster.py", @@ -34,9 +34,25 @@ def launch_cluster(protocol, num_replicas): str(num_replicas), "-r", ] + if config is not None and len(config) > 0: + cmd += ["--config", config] 
return run_process(cmd) +def wait_cluster_setup(proc, num_replicas): + accepting_clients = [False for _ in range(num_replicas)] + + for line in iter(proc.stderr.readline, b""): + l = line.decode() + if "manager" not in l and "accepting clients" in l: + replica = int(l[l.find("(") + 1 : l.find(")")]) + assert not accepting_clients[replica] + accepting_clients[replica] = True + + if accepting_clients.count(True) == num_replicas: + break + + def run_bench_client(protocol, value_size, put_ratio, length_s): cmd = [ "python3", @@ -84,7 +100,15 @@ def parse_output(output): print(f" std tpt {std_tpt:9.2f} lat {std_lat:9.2f}") -def bench_round(protocol, num_replicas, value_size, put_ratio, length_s): +def bench_round( + protocol, + num_replicas, + value_size, + put_ratio, + length_s, + fault_tolerance=None, + shards_per_replica=None, +): print( f"{protocol:<10s} n={num_replicas:1d} v={value_size:<9d} w%={put_ratio:<3d} {length_s:3d}s" ) @@ -92,8 +116,13 @@ def bench_round(protocol, num_replicas, value_size, put_ratio, length_s): kill_all_matching("summerset_server") kill_all_matching("summerset_manager") - proc_cluster = launch_cluster(protocol, num_replicas) - time.sleep(15) + configs = [] + if fault_tolerance is not None: + configs.append(f"fault_tolerance={fault_tolerance}") + if shards_per_replica is not None: + configs.append(f"shards_per_replica={shards_per_replica}") + proc_cluster = launch_cluster(protocol, num_replicas, "+".join(configs)) + wait_cluster_setup(proc_cluster, num_replicas) proc_client = run_bench_client(protocol, value_size, put_ratio, length_s) out, err = proc_client.communicate() @@ -110,10 +139,33 @@ def bench_round(protocol, num_replicas, value_size, put_ratio, length_s): if __name__ == "__main__": do_cargo_build() - for num_replicas in (3, 5, 7): - for value_size in (1024, 65536, 4194304): - for protocol in ("MultiPaxos", "RSPaxos"): - bench_round(protocol, num_replicas, value_size, 100, 60) - - bench_round("MultiPaxos", 7, 4194304, 10, 60) - bench_round("RSPaxos", 7, 4194304, 10, 60) + def all_protocol_configs(num_replicas): + quorum_cnt = num_replicas // 2 + 1 + max_fault_tolerance = num_replicas - quorum_cnt + + config_choices = [("MultiPaxos", None, None)] + for shards_per_replica in range(quorum_cnt, 0): + config_choices.append( + ("Crossword", max_fault_tolerance, shards_per_replica) + ) + config_choices.append(("Crossword", 0, 1)) + + return config_choices + + # for num_replicas in (3, 5, 7): + # for value_size in (1024, 65536, 4194304): + # for protocol, fault_tolerance, shards_per_replica in all_protocol_configs( + # num_replicas + # ): + # bench_round( + # protocol, + # num_replicas, + # value_size, + # 100, + # 60, + # fault_tolerance=fault_tolerance, + # shards_per_replica=shards_per_replica, + # ) + + bench_round("MultiPaxos", 5, 65536, 0, 60) + # bench_round("Crossword", 5, 65536, 0, 60, fault_tolerance=0, shards_per_replica=1) diff --git a/scripts/local_client.py b/scripts/local_client.py index 2f9c2c6d..f8ac5981 100644 --- a/scripts/local_client.py +++ b/scripts/local_client.py @@ -21,15 +21,6 @@ def run_process(cmd): MANAGER_CLI_PORT = 52601 -PROTOCOL_CONFIGS = { - "RepNothing": "", - "SimplePush": "", - "MultiPaxos": "", - "RSPaxos": "", - "Crossword": "", -} - - UTILITY_PARAM_NAMES = { "repl": [], "bench": ["freq_target", "value_size", "put_ratio", "length_s"], @@ -63,7 +54,7 @@ def compose_client_cmd(protocol, manager, config, utility, params, release): "-m", manager, ] - if len(config) > 0: + if config is not None and len(config) > 0: cmd += 
["--config", config] cmd += ["-u", utility] @@ -77,11 +68,11 @@ def compose_client_cmd(protocol, manager, config, utility, params, release): return cmd -def run_client(protocol, utility, params, release): +def run_client(protocol, utility, params, release, config): cmd = compose_client_cmd( protocol, f"127.0.0.1:{MANAGER_CLI_PORT}", - PROTOCOL_CONFIGS[protocol], + config, utility, params, release, @@ -97,6 +88,9 @@ def run_client(protocol, utility, params, release): "-p", "--protocol", type=str, required=True, help="protocol name" ) parser.add_argument("-r", "--release", action="store_true", help="run release mode") + parser.add_argument( + "-c", "--config", type=str, help="protocol-specific TOML config string" + ) subparsers = parser.add_subparsers( required=True, @@ -129,9 +123,6 @@ def run_client(protocol, utility, params, release): args = parser.parse_args() - if args.protocol not in PROTOCOL_CONFIGS: - raise ValueError(f"unknown protocol name '{args.protocol}'") - # build everything do_cargo_build(args.release) @@ -141,6 +132,7 @@ def run_client(protocol, utility, params, release): args.utility, glue_params_str(args, UTILITY_PARAM_NAMES[args.utility]), args.release, + args.config, ) rc = client_proc.wait() diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index b7dcdb25..ffbdff15 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -1,7 +1,6 @@ import sys import argparse import subprocess -import time from pathlib import Path @@ -14,9 +13,13 @@ def do_cargo_build(release): proc.wait() -def run_process(cmd): +def run_process(cmd, capture_stderr=False): print("Run:", " ".join(cmd)) - proc = subprocess.Popen(cmd) + proc = None + if capture_stderr: + proc = subprocess.Popen(cmd, stderr=subprocess.PIPE) + else: + proc = subprocess.Popen(cmd) return proc @@ -35,15 +38,28 @@ def kill_all_matching(name): SERVER_P2P_PORT = lambda r: 52800 + r -PROTOCOL_CONFIGS = { - "RepNothing": lambda r, n: f"backer_path='/tmp/summerset.rep_nothing.{r}.wal'", - "SimplePush": lambda r, n: f"backer_path='/tmp/summerset.simple_push.{r}.wal'+rep_degree={n-1}", - "MultiPaxos": lambda r, n: f"backer_path='/tmp/summerset.multipaxos.{r}.wal'", - "RSPaxos": lambda r, n: f"backer_path='/tmp/summerset.rs_paxos.{r}.wal'+fault_tolerance={n-(n//2+1)}", - "Crossword": lambda r, n: f"backer_path='/tmp/summerset.rs_paxos.{r}.wal'+fault_tolerance=0+shards_per_replica=3", +PROTOCOL_BACKER_PATH = { + "RepNothing": lambda r: f"backer_path='/tmp/summerset.rep_nothing.{r}.wal'", + "SimplePush": lambda r: f"backer_path='/tmp/summerset.simple_push.{r}.wal'", + "MultiPaxos": lambda r: f"backer_path='/tmp/summerset.multipaxos.{r}.wal'", + "RSPaxos": lambda r: f"backer_path='/tmp/summerset.rs_paxos.{r}.wal'", + "Crossword": lambda r: f"backer_path='/tmp/summerset.crossword.{r}.wal'", } +def config_with_backer_path(protocol, config, replica): + result_config = PROTOCOL_BACKER_PATH[protocol](replica) + + if config is not None and len(config) > 0: + if "backer_path" in config: + result_config = config # use user-supplied path + else: + result_config += "+" + result_config += config + + return result_config + + def compose_manager_cmd(protocol, srv_port, cli_port, num_replicas, release): cmd = [f"./target/{'release' if release else 'debug'}/summerset_manager"] cmd += [ @@ -67,7 +83,26 @@ def launch_manager(protocol, num_replicas, release): num_replicas, release, ) - return run_process(cmd) + return run_process(cmd, capture_stderr=True) + + +def wait_manager_setup(proc): + accepting_servers, 
accepting_clients = False, False + + for line in iter(proc.stderr.readline, b""): + sys.stderr.buffer.write(line) + sys.stderr.flush() + + l = line.decode() + if "(m) accepting servers" in l: + assert not accepting_servers + accepting_servers = True + if "(m) accepting clients" in l: + assert not accepting_clients + accepting_clients = True + + if accepting_servers and accepting_clients: + break def compose_server_cmd(protocol, api_port, p2p_port, manager, config, release): @@ -82,12 +117,12 @@ def compose_server_cmd(protocol, api_port, p2p_port, manager, config, release): "-m", manager, ] - if len(config) > 0: + if config is not None and len(config) > 0: cmd += ["--config", config] return cmd -def launch_servers(protocol, num_replicas, release): +def launch_servers(protocol, num_replicas, release, config): server_procs = [] for replica in range(num_replicas): cmd = compose_server_cmd( @@ -95,7 +130,7 @@ def launch_servers(protocol, num_replicas, release): SERVER_API_PORT(replica), SERVER_P2P_PORT(replica), f"127.0.0.1:{MANAGER_SRV_PORT}", - PROTOCOL_CONFIGS[protocol](replica, num_replicas), + config_with_backer_path(protocol, config, replica), release, ) proc = run_process(cmd) @@ -115,13 +150,11 @@ def launch_servers(protocol, num_replicas, release): parser.add_argument( "-r", "--release", action="store_true", help="if set, run release mode" ) + parser.add_argument( + "-c", "--config", type=str, help="protocol-specific TOML config string" + ) args = parser.parse_args() - if args.protocol not in PROTOCOL_CONFIGS: - raise ValueError(f"unknown protocol name '{args.protocol}'") - if args.num_replicas <= 0 or args.num_replicas > 9: - raise ValueError(f"invalid number of replicas {args.num_replicas}") - # kill all existing server and manager processes kill_all_matching("summerset_server") kill_all_matching("summerset_manager") @@ -135,10 +168,14 @@ def launch_servers(protocol, num_replicas, release): # launch cluster manager oracle first manager_proc = launch_manager(args.protocol, args.num_replicas, args.release) - time.sleep(5) + wait_manager_setup(manager_proc) # then launch server replicas - launch_servers(args.protocol, args.num_replicas, args.release) + launch_servers(args.protocol, args.num_replicas, args.release, args.config) + + for line in iter(manager_proc.stderr.readline, b""): + sys.stderr.buffer.write(line) + sys.stderr.flush() rc = manager_proc.wait() sys.exit(rc) diff --git a/scripts/set_tcp_buf_sizes.sh b/scripts/set_tcp_buf_sizes.sh index 2d3e3f21..55d8d0a4 100755 --- a/scripts/set_tcp_buf_sizes.sh +++ b/scripts/set_tcp_buf_sizes.sh @@ -1,12 +1,22 @@ #! 
/usr/bin/bash +echo "Per-socket TCP send/receive buffer:" +echo "min default max" echo "4096 131072 33554432" | sudo tee /proc/sys/net/ipv4/tcp_rmem echo "4096 131072 33554432" | sudo tee /proc/sys/net/ipv4/tcp_wmem +echo +echo "System-wide total buffer size:" +echo "min default max" echo "1538757 16413408 24620112" | sudo tee /proc/sys/net/ipv4/tcp_mem +echo +echo "Max value of setsockopt:" echo "33554432" | sudo tee /proc/sys/net/core/rmem_max echo "33554432" | sudo tee /proc/sys/net/core/wmem_max +echo +echo "Default value of network socket:" echo "131072" | sudo tee /proc/sys/net/core/rmem_default echo "131072" | sudo tee /proc/sys/net/core/wmem_default +echo From f9960edec58fe55dddbd0e2e2346636c84411c90 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Tue, 29 Aug 2023 16:44:12 +0800 Subject: [PATCH 05/89] make Bitmap a general u8-indexed map --- src/lib.rs | 2 +- src/protocols/crossword.rs | 53 +++++++------- src/protocols/multipaxos.rs | 14 ++-- src/protocols/rs_paxos.rs | 54 +++++++------- src/protocols/simple_push.rs | 8 +-- src/server/transport.rs | 12 ++-- src/utils/bitmap.rs | 86 +++++++++++++++------- src/utils/mod.rs | 2 +- src/utils/rscoding.rs | 134 ++++++++++++++++++++--------------- 9 files changed, 210 insertions(+), 155 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 40bcbf31..24a24bb6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,7 +13,7 @@ mod protocols; // Things (other than exported macros) exposed to users of this crate: #[doc(inline)] -pub use crate::utils::{SummersetError, ReplicaMap, Timer}; +pub use crate::utils::{SummersetError, Bitmap, Timer}; #[doc(inline)] pub use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply, ClusterManager}; diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index dfe647ea..7f6bc743 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -3,11 +3,11 @@ //! MultiPaxos with flexible Reed-Solomon erasure coding that supports tunable //! shard groups and asymmetric shard assignment. -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::path::Path; use std::net::SocketAddr; -use crate::utils::{SummersetError, ReplicaMap, RSCodeword}; +use crate::utils::{SummersetError, Bitmap, RSCodeword}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, @@ -84,13 +84,13 @@ type ReqBatch = Vec<(ClientId, ApiRequest)>; #[derive(Debug, Clone)] struct LeaderBookkeeping { /// Replicas from which I have received Prepare confirmations. - prepare_acks: ReplicaMap, + prepare_acks: Bitmap, /// Max ballot among received Prepare replies. prepare_max_bal: Ballot, /// Replicas from which I have received Accept confirmations. - accept_acks: ReplicaMap, + accept_acks: Bitmap, } /// Follower-side bookkeeping info for each instance received. @@ -284,10 +284,9 @@ impl CrosswordReplica { id: ReplicaId, population: u8, num_shards: u8, - ) -> HashSet { - (id..(id + num_shards)) - .map(|i| (i % population) as usize) - .collect() + ) -> Bitmap { + let ones = (id..(id + num_shards)).map(|i| (i % population)).collect(); + Bitmap::from(population, ones) } /// Handler of client request batch chan recv. 
@@ -323,8 +322,8 @@ impl CrosswordReplica { // compute the complete Reed-Solomon codeword for the batch data let mut reqs_cw = RSCodeword::from_data( req_batch, - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?; reqs_cw.compute_parity(Some(&self.rs_coder))?; @@ -343,9 +342,9 @@ impl CrosswordReplica { assert_eq!(old_inst.status, Status::Null); old_inst.reqs_cw = reqs_cw; old_inst.leader_bk = Some(LeaderBookkeeping { - prepare_acks: ReplicaMap::new(self.population, false), + prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: ReplicaMap::new(self.population, false), + accept_acks: Bitmap::new(self.population, false), }); } else { let new_inst = Instance { @@ -353,9 +352,9 @@ impl CrosswordReplica { status: Status::Null, reqs_cw, leader_bk: Some(LeaderBookkeeping { - prepare_acks: ReplicaMap::new(self.population, false), + prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: ReplicaMap::new(self.population, false), + accept_acks: Bitmap::new(self.population, false), }), replica_bk: None, }; @@ -555,14 +554,12 @@ impl CrosswordReplica { break; } - if inst.reqs_cw.avail_shards() < self.quorum_cnt as usize { + if inst.reqs_cw.avail_shards() < self.quorum_cnt { // can't execute if I don't have the complete request batch pf_debug!(self.id; "postponing execution for slot {} (shards {}/{})", slot, inst.reqs_cw.avail_shards(), self.quorum_cnt); break; - } else if inst.reqs_cw.avail_data_shards() - < self.quorum_cnt as usize - { + } else if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { // have enough shards but need reconstruction inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; } @@ -638,8 +635,8 @@ impl CrosswordReplica { bal: 0, status: Status::Null, reqs_cw: RSCodeword::::from_null( - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?, leader_bk: None, replica_bk: None, @@ -680,7 +677,7 @@ impl CrosswordReplica { ) -> Result<(), SummersetError> { pf_trace!(self.id; "received PrepareReply <- {} for slot {} bal {} shards {:?}", peer, slot, ballot, - voted.as_ref().map(|(_, cw)| cw.avail_shards_set())); + voted.as_ref().map(|(_, cw)| cw.avail_shards_map())); // if ballot is what I'm currently waiting on for Prepare replies: if ballot == self.bal_prep_sent { @@ -720,7 +717,7 @@ impl CrosswordReplica { // instance using the request batch value constructed using shards // with the highest ballot number in quorum if leader_bk.prepare_acks.count() >= self.quorum_cnt - && inst.reqs_cw.avail_data_shards() >= self.quorum_cnt as usize + && inst.reqs_cw.avail_data_shards() >= self.quorum_cnt { inst.status = Status::Accepting; pf_debug!(self.id; "enter Accept phase for slot {} bal {}", @@ -731,7 +728,7 @@ impl CrosswordReplica { self.bal_prepared = ballot; // if parity shards not computed yet, compute them now - if inst.reqs_cw.avail_shards() < self.population as usize { + if inst.reqs_cw.avail_shards() < self.population { inst.reqs_cw.compute_parity(Some(&self.rs_coder))?; } @@ -795,7 +792,7 @@ impl CrosswordReplica { reqs_cw: RSCodeword, ) -> Result<(), SummersetError> { pf_trace!(self.id; "received Accept <- {} for slot {} bal {} shards {:?}", - peer, slot, ballot, reqs_cw.avail_shards_set()); + peer, slot, ballot, reqs_cw.avail_shards_map()); // if ballot is not smaller than what I have made promises for: if ballot >= self.bal_max_seen { @@ -805,8 +802,8 @@ impl 
CrosswordReplica { bal: 0, status: Status::Null, reqs_cw: RSCodeword::::from_null( - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?, leader_bk: None, replica_bk: None, @@ -919,8 +916,8 @@ impl CrosswordReplica { bal: 0, status: Status::Null, reqs_cw: RSCodeword::::from_null( - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?, leader_bk: None, replica_bk: None, diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index c268372c..d44056f1 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -11,7 +11,7 @@ use std::collections::HashMap; use std::path::Path; use std::net::SocketAddr; -use crate::utils::{SummersetError, ReplicaMap}; +use crate::utils::{SummersetError, Bitmap}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, @@ -77,13 +77,13 @@ type ReqBatch = Vec<(ClientId, ApiRequest)>; #[derive(Debug, Clone)] struct LeaderBookkeeping { /// Replicas from which I have received Prepare confirmations. - prepare_acks: ReplicaMap, + prepare_acks: Bitmap, /// Max ballot among received Prepare replies. prepare_max_bal: Ballot, /// Replicas from which I have received Accept confirmations. - accept_acks: ReplicaMap, + accept_acks: Bitmap, } /// Follower-side bookkeeping info for each instance received. @@ -308,9 +308,9 @@ impl MultiPaxosReplica { if old_inst.status == Status::Null { old_inst.reqs = req_batch.clone(); old_inst.leader_bk = Some(LeaderBookkeeping { - prepare_acks: ReplicaMap::new(self.population, false), + prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: ReplicaMap::new(self.population, false), + accept_acks: Bitmap::new(self.population, false), }); slot = s; break; @@ -322,9 +322,9 @@ impl MultiPaxosReplica { status: Status::Null, reqs: req_batch.clone(), leader_bk: Some(LeaderBookkeeping { - prepare_acks: ReplicaMap::new(self.population, false), + prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: ReplicaMap::new(self.population, false), + accept_acks: Bitmap::new(self.population, false), }), replica_bk: None, }; diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index e47993de..b2da668d 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -3,11 +3,11 @@ //! MultiPaxos with Reed-Solomon erasure coding. References: //! - -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::path::Path; use std::net::SocketAddr; -use crate::utils::{SummersetError, ReplicaMap, RSCodeword}; +use crate::utils::{SummersetError, Bitmap, RSCodeword}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, @@ -79,13 +79,13 @@ type ReqBatch = Vec<(ClientId, ApiRequest)>; #[derive(Debug, Clone)] struct LeaderBookkeeping { /// Replicas from which I have received Prepare confirmations. - prepare_acks: ReplicaMap, + prepare_acks: Bitmap, /// Max ballot among received Prepare replies. prepare_max_bal: Ballot, /// Replicas from which I have received Accept confirmations. - accept_acks: ReplicaMap, + accept_acks: Bitmap, } /// Follower-side bookkeeping info for each instance received. 
@@ -307,8 +307,8 @@ impl RSPaxosReplica { // compute the complete Reed-Solomon codeword for the batch data let mut reqs_cw = RSCodeword::from_data( req_batch, - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?; reqs_cw.compute_parity(Some(&self.rs_coder))?; @@ -327,9 +327,9 @@ impl RSPaxosReplica { assert_eq!(old_inst.status, Status::Null); old_inst.reqs_cw = reqs_cw; old_inst.leader_bk = Some(LeaderBookkeeping { - prepare_acks: ReplicaMap::new(self.population, false), + prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: ReplicaMap::new(self.population, false), + accept_acks: Bitmap::new(self.population, false), }); } else { let new_inst = Instance { @@ -337,9 +337,9 @@ impl RSPaxosReplica { status: Status::Null, reqs_cw, leader_bk: Some(LeaderBookkeeping { - prepare_acks: ReplicaMap::new(self.population, false), + prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: ReplicaMap::new(self.population, false), + accept_acks: Bitmap::new(self.population, false), }), replica_bk: None, }; @@ -407,7 +407,7 @@ impl RSPaxosReplica { ballot: inst.bal, // persist only one shard on myself reqs_cw: inst.reqs_cw.subset_copy( - HashSet::from([self.id as usize]), + Bitmap::from(self.population, vec![self.id]), false, )?, }, @@ -427,7 +427,7 @@ impl RSPaxosReplica { slot, ballot: inst.bal, reqs_cw: inst.reqs_cw.subset_copy( - HashSet::from([peer as usize]), + Bitmap::from(self.population, vec![peer]), false, )?, }, @@ -530,14 +530,12 @@ impl RSPaxosReplica { break; } - if inst.reqs_cw.avail_shards() < self.quorum_cnt as usize { + if inst.reqs_cw.avail_shards() < self.quorum_cnt { // can't execute if I don't have the complete request batch pf_debug!(self.id; "postponing execution for slot {} (shards {}/{})", slot, inst.reqs_cw.avail_shards(), self.quorum_cnt); break; - } else if inst.reqs_cw.avail_data_shards() - < self.quorum_cnt as usize - { + } else if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { // have enough shards but need reconstruction inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; } @@ -613,8 +611,8 @@ impl RSPaxosReplica { bal: 0, status: Status::Null, reqs_cw: RSCodeword::::from_null( - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?, leader_bk: None, replica_bk: None, @@ -655,7 +653,7 @@ impl RSPaxosReplica { ) -> Result<(), SummersetError> { pf_trace!(self.id; "received PrepareReply <- {} for slot {} bal {} shards {:?}", peer, slot, ballot, - voted.as_ref().map(|(_, cw)| cw.avail_shards_set())); + voted.as_ref().map(|(_, cw)| cw.avail_shards_map())); // if ballot is what I'm currently waiting on for Prepare replies: if ballot == self.bal_prep_sent { @@ -695,7 +693,7 @@ impl RSPaxosReplica { // instance using the request batch value constructed using shards // with the highest ballot number in quorum if leader_bk.prepare_acks.count() >= self.quorum_cnt - && inst.reqs_cw.avail_data_shards() >= self.quorum_cnt as usize + && inst.reqs_cw.avail_data_shards() >= self.quorum_cnt { inst.status = Status::Accepting; pf_debug!(self.id; "enter Accept phase for slot {} bal {}", @@ -706,7 +704,7 @@ impl RSPaxosReplica { self.bal_prepared = ballot; // if parity shards not computed yet, compute them now - if inst.reqs_cw.avail_shards() < self.population as usize { + if inst.reqs_cw.avail_shards() < self.population { 
inst.reqs_cw.compute_parity(Some(&self.rs_coder))?; } @@ -718,7 +716,7 @@ impl RSPaxosReplica { slot, ballot, reqs_cw: inst.reqs_cw.subset_copy( - HashSet::from([self.id as usize]), + Bitmap::from(self.population, vec![self.id]), false, )?, }, @@ -738,7 +736,7 @@ impl RSPaxosReplica { slot, ballot, reqs_cw: inst.reqs_cw.subset_copy( - HashSet::from([peer as usize]), + Bitmap::from(self.population, vec![peer]), false, )?, }, @@ -762,7 +760,7 @@ impl RSPaxosReplica { reqs_cw: RSCodeword, ) -> Result<(), SummersetError> { pf_trace!(self.id; "received Accept <- {} for slot {} bal {} shards {:?}", - peer, slot, ballot, reqs_cw.avail_shards_set()); + peer, slot, ballot, reqs_cw.avail_shards_map()); // if ballot is not smaller than what I have made promises for: if ballot >= self.bal_max_seen { @@ -772,8 +770,8 @@ impl RSPaxosReplica { bal: 0, status: Status::Null, reqs_cw: RSCodeword::::from_null( - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?, leader_bk: None, replica_bk: None, @@ -886,8 +884,8 @@ impl RSPaxosReplica { bal: 0, status: Status::Null, reqs_cw: RSCodeword::::from_null( - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?, leader_bk: None, replica_bk: None, diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index d212f98b..eb082de3 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -7,7 +7,7 @@ use std::path::Path; use std::net::SocketAddr; -use crate::utils::{SummersetError, ReplicaMap}; +use crate::utils::{SummersetError, Bitmap}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, @@ -80,7 +80,7 @@ enum PushMsg { struct Instance { reqs: Vec<(ClientId, ApiRequest)>, durable: bool, - pending_peers: ReplicaMap, + pending_peers: Bitmap, execed: Vec, from_peer: Option<(ReplicaId, usize)>, // peer ID, peer inst_idx } @@ -148,7 +148,7 @@ impl SimplePushReplica { assert!(batch_size > 0); // target peers to push to - let mut target = ReplicaMap::new(self.population, false); + let mut target = Bitmap::new(self.population, false); let mut peer_cnt = 0; for peer in 0..self.population { if peer_cnt == self.config.rep_degree { @@ -262,7 +262,7 @@ impl SimplePushReplica { let inst = Instance { reqs: req_batch.clone(), durable: false, - pending_peers: ReplicaMap::new(self.population, false), + pending_peers: Bitmap::new(self.population, false), execed: vec![false; req_batch.len()], from_peer: Some((peer, src_inst_idx)), }; diff --git a/src/server/transport.rs b/src/server/transport.rs index fb4bae0e..10deff8b 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -3,7 +3,7 @@ use std::fmt; use std::net::SocketAddr; -use crate::utils::{SummersetError, ReplicaMap, safe_tcp_read, safe_tcp_write}; +use crate::utils::{SummersetError, Bitmap, safe_tcp_read, safe_tcp_write}; use crate::server::ReplicaId; use bytes::BytesMut; @@ -144,10 +144,10 @@ where } } - /// Gets a ReplicaMap where currently connected peers are set true. - pub fn current_peers(&self) -> Result { + /// Gets a bitmap where currently connected peers are set true. 
+ pub fn current_peers(&self) -> Result { let tx_sends_guard = self.tx_sends.guard(); - let mut peers = ReplicaMap::new(self.population, false); + let mut peers = Bitmap::new(self.population, false); for &id in tx_sends_guard.keys() { if let Err(e) = peers.set(id, true) { return logged_err!(self.me; "error setting peer {}: {}", @@ -187,7 +187,7 @@ where pub fn bcast_msg( &mut self, msg: Msg, - target: Option, + target: Option, ) -> Result<(), SummersetError> { let tx_sends_guard = self.tx_sends.guard(); for &peer in tx_sends_guard.keys() { @@ -624,7 +624,7 @@ mod transport_tests { assert!(id == 1 || id == 2); assert_eq!(msg, TestMsg("world".into())); // send another message to 1 only - let mut map = ReplicaMap::new(3, false); + let mut map = Bitmap::new(3, false); map.set(1, true)?; hub.bcast_msg(TestMsg("nice".into()), Some(map))?; // recv another message from 1 diff --git a/src/utils/bitmap.rs b/src/utils/bitmap.rs index a7f27d98..dfbb8467 100644 --- a/src/utils/bitmap.rs +++ b/src/utils/bitmap.rs @@ -1,15 +1,16 @@ //! Bitmap data structure helper. +use std::fmt; + use crate::utils::SummersetError; -use crate::server::ReplicaId; use fixedbitset::FixedBitSet; -/// Compact bitmap for replica ID -> bool mapping. -#[derive(Debug, Clone)] -pub struct ReplicaMap(FixedBitSet); +/// Compact bitmap for u8 ID -> bool mapping. +#[derive(Clone, PartialEq, Eq)] +pub struct Bitmap(FixedBitSet); -impl ReplicaMap { +impl Bitmap { /// Creates a new bitmap of given size. If `ones` is true, all slots are /// marked true initially; otherwise, all slots are initially false. pub fn new(size: u8, ones: bool) -> Self { @@ -17,18 +18,31 @@ impl ReplicaMap { panic!("invalid bitmap size {}", size); } let mut bitset = FixedBitSet::with_capacity(size as usize); + if ones { bitset.set_range(.., true); } - ReplicaMap(bitset) + + Bitmap(bitset) + } + + /// Creates a new bitmap of given size from vec literal. Indices in the + /// vec are bits to be set as true. + pub fn from(size: u8, ones: Vec) -> Self { + let mut bitmap = Self::new(size, false); + + for idx in ones { + if let Err(e) = bitmap.set(idx, true) { + panic!("{}", e); + } + } + + bitmap } /// Sets bit at index to given flag. - pub fn set( - &mut self, - idx: ReplicaId, - flag: bool, - ) -> Result<(), SummersetError> { + #[inline] + pub fn set(&mut self, idx: u8, flag: bool) -> Result<(), SummersetError> { if idx as usize >= self.0.len() { return Err(SummersetError(format!("index {} out of bound", idx))); } @@ -37,7 +51,8 @@ impl ReplicaMap { } /// Gets the bit flag at index. - pub fn get(&self, idx: ReplicaId) -> Result { + #[inline] + pub fn get(&self, idx: u8) -> Result { if idx as usize >= self.0.len() { return Err(SummersetError(format!("index {} out of bound", idx))); } @@ -45,33 +60,36 @@ impl ReplicaMap { } /// Returns the size of the bitmap. + #[inline] pub fn size(&self) -> u8 { self.0.len() as u8 } /// Returns the number of trues in the bitmap. + #[inline] pub fn count(&self) -> u8 { self.0.count_ones(..) as u8 } /// Allows `for (id, bit) in map.iter()`. - pub fn iter(&self) -> ReplicaMapIter { - ReplicaMapIter { map: self, idx: 0 } + #[inline] + pub fn iter(&self) -> BitmapIter { + BitmapIter { map: self, idx: 0 } } } -/// Iterator over `ReplicaMap`, yielding `(id, bit)` pairs. +/// Iterator over `Bitmap`, yielding `(id, bit)` pairs. 
#[derive(Debug, Clone)] -pub struct ReplicaMapIter<'m> { - map: &'m ReplicaMap, +pub struct BitmapIter<'m> { + map: &'m Bitmap, idx: usize, } -impl Iterator for ReplicaMapIter<'_> { - type Item = (ReplicaId, bool); +impl Iterator for BitmapIter<'_> { + type Item = (u8, bool); fn next(&mut self) -> Option { - let id: ReplicaId = self.idx as ReplicaId; + let id: u8 = self.idx as u8; if id < self.map.size() { self.idx += 1; Some((id, self.map.get(id).unwrap())) @@ -81,6 +99,26 @@ impl Iterator for ReplicaMapIter<'_> { } } +// Implement `Debug` trait manually for better trace printing. +impl fmt::Debug for Bitmap { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{{{}; [", self.size())?; + let mut first_idx = true; + for i in self + .iter() + .filter_map(|(i, flag)| if flag { Some(i) } else { None }) + { + if !first_idx { + write!(f, ", {}", i)?; + } else { + write!(f, "{}", i)?; + first_idx = false; + } + } + write!(f, "]}}") + } +} + #[cfg(test)] mod bitmap_tests { use super::*; @@ -88,12 +126,12 @@ mod bitmap_tests { #[test] #[should_panic] fn bitmap_new_panic() { - ReplicaMap::new(0, true); + Bitmap::new(0, true); } #[test] fn bitmap_set_get() { - let mut map = ReplicaMap::new(7, false); + let mut map = Bitmap::new(7, false); assert!(map.set(0, true).is_ok()); assert!(map.set(1, false).is_ok()); assert!(map.set(2, true).is_ok()); @@ -107,7 +145,7 @@ mod bitmap_tests { #[test] fn bitmap_count() { - let mut map = ReplicaMap::new(7, false); + let mut map = Bitmap::new(7, false); assert_eq!(map.count(), 0); assert!(map.set(0, true).is_ok()); assert!(map.set(2, true).is_ok()); @@ -118,7 +156,7 @@ mod bitmap_tests { #[test] fn bitmap_iter() { let ref_map = vec![true, true, false, true, true]; - let mut map = ReplicaMap::new(5, true); + let mut map = Bitmap::new(5, true); assert!(map.set(2, false).is_ok()); for (id, flag) in map.iter() { assert_eq!(ref_map[id as usize], flag); diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 6feb3e1e..7510b772 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -13,7 +13,7 @@ mod safetcp; mod rscoding; pub use error::SummersetError; -pub use bitmap::ReplicaMap; +pub use bitmap::Bitmap; pub use timer::Timer; pub use safetcp::{safe_tcp_read, safe_tcp_write}; pub use rscoding::RSCodeword; diff --git a/src/utils/rscoding.rs b/src/utils/rscoding.rs index 659077ac..49c008a3 100644 --- a/src/utils/rscoding.rs +++ b/src/utils/rscoding.rs @@ -2,10 +2,9 @@ use std::fmt; use std::io; -use std::collections::HashSet; use std::marker::PhantomData; -use crate::utils::SummersetError; +use crate::utils::{SummersetError, Bitmap}; use bytes::{BytesMut, BufMut}; @@ -20,10 +19,10 @@ use reed_solomon_erasure::galois_8::ReedSolomon; #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] pub struct RSCodeword { /// Number of data shards. - num_data_shards: usize, + num_data_shards: u8, /// Number of parity shards. - num_parity_shards: usize, + num_parity_shards: u8, /// Exact length of original data in bytes. 
data_len: usize, @@ -53,13 +52,13 @@ where data_copy: Option, data_bytes: Option, data_len: usize, - num_data_shards: usize, - num_parity_shards: usize, + num_data_shards: u8, + num_parity_shards: u8, ) -> Result { if num_data_shards == 0 { return Err(SummersetError("num_data_shards is zero".into())); } - if data_len != 0 && data_len < num_data_shards { + if data_len != 0 && data_len < num_data_shards as usize { return Err(SummersetError(format!( "data length too small: {}", data_len @@ -67,10 +66,10 @@ where } let num_total_shards = num_data_shards + num_parity_shards; - let shard_len = if data_len % num_data_shards == 0 { - data_len / num_data_shards + let shard_len = if data_len % num_data_shards as usize == 0 { + data_len / num_data_shards as usize } else { - (data_len / num_data_shards) + 1 + (data_len / num_data_shards as usize) + 1 }; let shards = if let Some(mut data_bytes) = data_bytes { @@ -78,11 +77,11 @@ where assert_eq!(data_bytes.len(), data_len); // pad length to multiple of num_data_shards and compute shard size - let padded_len = shard_len * num_data_shards; + let padded_len = shard_len * num_data_shards as usize; data_bytes.resize(padded_len, 0); // split the bytes representation into contiguously stored shards - let mut shards = Vec::with_capacity(num_data_shards); + let mut shards = Vec::with_capacity(num_data_shards as usize); for _ in 0..(num_data_shards - 1) { let shard = data_bytes.split_to(shard_len); assert_eq!(shard.len(), shard_len); @@ -90,15 +89,15 @@ where } assert_eq!(data_bytes.len(), shard_len); shards.push(Some(data_bytes)); // the last shard - assert_eq!(shards.len(), num_data_shards); + assert_eq!(shards.len(), num_data_shards as usize); for _ in num_data_shards..num_total_shards { shards.push(None); } - assert_eq!(shards.len(), num_total_shards); + assert_eq!(shards.len(), num_total_shards as usize); shards } else { // if newing from empty - vec![None; num_total_shards] + vec![None; num_total_shards as usize] }; Ok(RSCodeword { @@ -115,8 +114,8 @@ where /// Creates a new RSCodeword from original data. pub fn from_data( data: T, - num_data_shards: usize, - num_parity_shards: usize, + num_data_shards: u8, + num_parity_shards: u8, ) -> Result { // serialize original data into bytes let mut data_writer = BytesMut::new().writer(); @@ -133,8 +132,8 @@ where /// Creates a new RSCodeword from empty bytes. pub fn from_null( - num_data_shards: usize, - num_parity_shards: usize, + num_data_shards: u8, + num_parity_shards: u8, ) -> Result { Self::new(None, None, 0, num_data_shards, num_parity_shards) } @@ -143,15 +142,25 @@ where /// shards, and a complete copy of the original data if required. pub fn subset_copy( &self, - subset: HashSet, + subset: Bitmap, copy_data: bool, ) -> Result { if self.data_len == 0 { return Err(SummersetError("codeword is null".into())); } - let mut shards = vec![None; self.num_shards()]; - for i in subset { + let mut shards = vec![None; self.num_shards() as usize]; + for i in + subset.iter().filter_map( + |(i, flag)| { + if flag { + Some(i as usize) + } else { + None + } + }, + ) + { if i >= shards.len() { return Err(SummersetError(format!( "shard index {} out-of-bound", @@ -231,60 +240,71 @@ where } /// Gets number of data shards. - pub fn num_data_shards(&self) -> usize { + #[inline] + pub fn num_data_shards(&self) -> u8 { self.num_data_shards } /// Gets number of parity shards. 
#[allow(dead_code)] - pub fn num_parity_shards(&self) -> usize { + #[inline] + pub fn num_parity_shards(&self) -> u8 { self.num_parity_shards } /// Gets total number of shards. - pub fn num_shards(&self) -> usize { - self.shards.len() + #[inline] + pub fn num_shards(&self) -> u8 { + self.shards.len() as u8 } /// Gets number of currently available data shards. - pub fn avail_data_shards(&self) -> usize { + #[inline] + pub fn avail_data_shards(&self) -> u8 { self.shards .iter() - .take(self.num_data_shards) + .take(self.num_data_shards as usize) .filter(|s| s.is_some()) - .count() + .count() as u8 } /// Gets number of currently available parity shards. #[allow(dead_code)] - pub fn avail_parity_shards(&self) -> usize { + #[inline] + pub fn avail_parity_shards(&self) -> u8 { self.shards .iter() - .skip(self.num_data_shards) + .skip(self.num_data_shards as usize) .filter(|s| s.is_some()) - .count() + .count() as u8 } /// Gets total number of currently available shards. - pub fn avail_shards(&self) -> usize { - self.shards.iter().filter(|s| s.is_some()).count() + #[inline] + pub fn avail_shards(&self) -> u8 { + self.shards.iter().filter(|s| s.is_some()).count() as u8 } - /// Gets the set of available shard indexes. - pub fn avail_shards_set(&self) -> HashSet { - self.shards + /// Gets a bitmap of available shard indexes set true. + #[inline] + pub fn avail_shards_map(&self) -> Bitmap { + let ones: Vec = self + .shards .iter() .enumerate() - .filter_map(|(i, s)| if s.is_some() { Some(i) } else { None }) - .collect() + .filter_map(|(i, s)| if s.is_some() { Some(i as u8) } else { None }) + .collect(); + Bitmap::from(self.num_shards(), ones) } /// Gets length of original data in bytes. + #[inline] pub fn data_len(&self) -> usize { self.data_len } /// Gets length of a shard in bytes. + #[inline] pub fn shard_len(&self) -> usize { self.shard_len } @@ -295,13 +315,13 @@ where &self, rs: &ReedSolomon, ) -> Result<(), SummersetError> { - if rs.data_shard_count() != self.num_data_shards { + if rs.data_shard_count() != self.num_data_shards as usize { Err(SummersetError(format!( "num_data_shards mismatch: expected {}, rs {}", self.num_data_shards, rs.data_shard_count() ))) - } else if rs.parity_shard_count() != self.num_parity_shards { + } else if rs.parity_shard_count() != self.num_parity_shards as usize { Err(SummersetError(format!( "num_parity_shards mismatch: expected {}, rs {}", self.num_parity_shards, @@ -339,7 +359,8 @@ where } // allocate space for parity shards if haven't - for shard in self.shards.iter_mut().skip(self.num_data_shards) { + for shard in self.shards.iter_mut().skip(self.num_data_shards as usize) + { if shard.is_none() { *shard = Some(BytesMut::zeroed(self.shard_len)); } @@ -473,23 +494,23 @@ struct ShardsReader<'a> { shards: &'a Vec>, /// Number of data shards in vec. - num_data_shards: usize, + num_data_shards: u8, /// Length in bytes of a shard. shard_len: usize, /// Composite cursor: (shard_idx, byte_idx). - cursor: (usize, usize), + cursor: (u8, usize), } impl<'a> ShardsReader<'a> { /// Creates a new temporary reader. 
fn new( shards: &'a Vec>, - num_data_shards: usize, + num_data_shards: u8, shard_len: usize, ) -> Result { - for shard in shards.iter().take(num_data_shards) { + for shard in shards.iter().take(num_data_shards as usize) { if shard.is_none() { return Err(SummersetError("some data shard is None".into())); } @@ -510,8 +531,9 @@ impl<'a> io::Read for ShardsReader<'a> { let mut total_nread = 0; while self.cursor.0 < self.num_data_shards { - let mut slice = &(self.shards[self.cursor.0].as_ref().unwrap()) - [self.cursor.1..]; + let mut slice = &(self.shards[self.cursor.0 as usize] + .as_ref() + .unwrap())[self.cursor.1..]; let (_, buf_tail) = buf.split_at_mut(total_nread); let shard_nread = slice.read(buf_tail).unwrap(); @@ -569,7 +591,7 @@ mod rscoding_tests { assert_eq!(cw.avail_data_shards(), 3); assert_eq!(cw.avail_parity_shards(), 0); assert_eq!(cw.avail_shards(), 3); - assert_eq!(cw.avail_shards_set(), HashSet::from([0, 1, 2])); + assert_eq!(cw.avail_shards_map(), Bitmap::from(3, vec![0, 1, 2])); assert_eq!(cw.data_len(), data_len); assert_eq!(cw.shard_len(), shard_len); // valid with num_parity_shards > 0 @@ -580,7 +602,7 @@ mod rscoding_tests { assert_eq!(cw.avail_data_shards(), 3); assert_eq!(cw.avail_parity_shards(), 0); assert_eq!(cw.avail_shards(), 3); - assert_eq!(cw.avail_shards_set(), HashSet::from([0, 1, 2])); + assert_eq!(cw.avail_shards_map(), Bitmap::from(5, vec![0, 1, 2])); assert_eq!(cw.data_len(), data_len); assert_eq!(cw.shard_len(), shard_len); Ok(()) @@ -598,7 +620,7 @@ mod rscoding_tests { assert_eq!(cw.avail_data_shards(), 0); assert_eq!(cw.avail_parity_shards(), 0); assert_eq!(cw.avail_shards(), 0); - assert_eq!(cw.avail_shards_set(), HashSet::new()); + assert_eq!(cw.avail_shards_map(), Bitmap::new(5, false)); assert_eq!(cw.data_len(), 0); assert_eq!(cw.shard_len(), 0); Ok(()) @@ -609,21 +631,21 @@ mod rscoding_tests { let data = TestData("interesting_value".into()); let cwa = RSCodeword::from_data(data.clone(), 3, 2)?; // invalid subset - assert!(cwa.subset_copy(HashSet::from([0, 5]), false).is_err()); + assert!(cwa.subset_copy(Bitmap::from(6, vec![0, 5]), false).is_err()); // valid subsets - let cw01 = cwa.subset_copy(HashSet::from([0, 1]), false)?; + let cw01 = cwa.subset_copy(Bitmap::from(5, vec![0, 1]), false)?; assert_eq!(cw01.avail_data_shards(), 2); - let cw02 = cwa.subset_copy(HashSet::from([0, 2]), true)?; + let cw02 = cwa.subset_copy(Bitmap::from(5, vec![0, 2]), true)?; assert_eq!(cw02.avail_data_shards(), 2); assert!(cw02.data_copy.is_some()); // valid absorbing let mut cwb = RSCodeword::::from_null(3, 2)?; cwb.absorb_other(cw02)?; assert_eq!(cwb.avail_shards(), 2); - assert_eq!(cwb.avail_shards_set(), HashSet::from([0, 2])); + assert_eq!(cwb.avail_shards_map(), Bitmap::from(5, vec![0, 2])); cwb.absorb_other(cw01)?; assert_eq!(cwb.avail_shards(), 3); - assert_eq!(cwb.avail_shards_set(), HashSet::from([0, 1, 2])); + assert_eq!(cwb.avail_shards_map(), Bitmap::from(5, vec![0, 1, 2])); assert_eq!(*cwb.get_data()?, data); // invalid absorbing assert!(cwb From 7312afbb471e865e31a40e3b306a1bbfc2a224ac Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Tue, 29 Aug 2023 20:51:29 +0800 Subject: [PATCH 06/89] fix crossword ack pattern bug --- src/protocols/crossword.rs | 118 +++++++++++++++++++++++++++++-------- 1 file changed, 93 insertions(+), 25 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 7f6bc743..da59f976 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -89,8 +89,9 @@ struct LeaderBookkeeping { 
/// Max ballot among received Prepare replies. prepare_max_bal: Ballot, - /// Replicas from which I have received Accept confirmations. - accept_acks: Bitmap, + /// Replicas and their assigned shards which the received Accept + /// confirmations cover. + accept_acks: HashMap, } /// Follower-side bookkeeping info for each instance received. @@ -284,9 +285,51 @@ impl CrosswordReplica { id: ReplicaId, population: u8, num_shards: u8, - ) -> Bitmap { - let ones = (id..(id + num_shards)).map(|i| (i % population)).collect(); - Bitmap::from(population, ones) + ) -> Vec { + (id..(id + num_shards)).map(|i| (i % population)).collect() + } + + /// TODO: make better impl of this. + fn coverage_under_faults( + population: u8, + acks: &HashMap, + fault_tolerance: u8, + ) -> u8 { + if acks.len() <= fault_tolerance as usize { + return 0; + } + + // enumerate all subsets of acks excluding fault number of replicas + let cnt = (acks.len() - fault_tolerance as usize) as u32; + let servers: Vec = acks.keys().cloned().collect(); + let mut min_coverage = population; + + for n in (0..2usize.pow(servers.len() as u32)) + .filter(|n| n.count_ones() == cnt) + { + let mut coverage = Bitmap::new(population, false); + for (_, server) in servers + .iter() + .enumerate() + .filter(|&(i, _)| (n >> i) % 2 == 1) + { + for shard in acks[server].iter().filter_map(|(s, flag)| { + if flag { + Some(s) + } else { + None + } + }) { + coverage.set(shard, true).expect("impossible shard index"); + } + } + + if coverage.count() < min_coverage { + min_coverage = coverage.count(); + } + } + + min_coverage } /// Handler of client request batch chan recv. @@ -344,7 +387,7 @@ impl CrosswordReplica { old_inst.leader_bk = Some(LeaderBookkeeping { prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: Bitmap::new(self.population, false), + accept_acks: HashMap::new(), }); } else { let new_inst = Instance { @@ -354,7 +397,7 @@ impl CrosswordReplica { leader_bk: Some(LeaderBookkeeping { prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: Bitmap::new(self.population, false), + accept_acks: HashMap::new(), }), replica_bk: None, }; @@ -422,10 +465,13 @@ impl CrosswordReplica { ballot: inst.bal, // persist only some shards on myself reqs_cw: inst.reqs_cw.subset_copy( - Self::shards_for_replica( - self.id, + Bitmap::from( self.population, - self.config.shards_per_replica, + Self::shards_for_replica( + self.id, + self.population, + self.config.shards_per_replica, + ), ), false, )?, @@ -447,10 +493,13 @@ impl CrosswordReplica { slot, ballot: inst.bal, reqs_cw: inst.reqs_cw.subset_copy( - Self::shards_for_replica( - peer, + Bitmap::from( self.population, - self.config.shards_per_replica, + Self::shards_for_replica( + peer, + self.population, + self.config.shards_per_replica, + ), ), false, )?, @@ -740,10 +789,13 @@ impl CrosswordReplica { slot, ballot, reqs_cw: inst.reqs_cw.subset_copy( - Self::shards_for_replica( - self.id, + Bitmap::from( self.population, - self.config.shards_per_replica, + Self::shards_for_replica( + self.id, + self.population, + self.config.shards_per_replica, + ), ), false, )?, @@ -764,10 +816,13 @@ impl CrosswordReplica { slot, ballot, reqs_cw: inst.reqs_cw.subset_copy( - Self::shards_for_replica( - peer, + Bitmap::from( self.population, - self.config.shards_per_replica, + Self::shards_for_replica( + peer, + self.population, + self.config.shards_per_replica, + ), ), false, )?, @@ -862,18 +917,31 @@ impl CrosswordReplica { assert!(self.bal_max_seen >= ballot); 
assert!(inst.leader_bk.is_some()); let leader_bk = inst.leader_bk.as_mut().unwrap(); - if leader_bk.accept_acks.get(peer)? { + if leader_bk.accept_acks.contains_key(&peer) { return Ok(()); } // bookkeep this Accept reply - leader_bk.accept_acks.set(peer, true)?; + leader_bk.accept_acks.insert( + peer, + Bitmap::from( + self.population, + Self::shards_for_replica( + peer, + self.population, + self.config.shards_per_replica, + ), + ), + ); // if quorum size reached AND enough number of shards are - // remembered, mark this instance as committed; in RS-Paxos, this - // means accept_acks.count() >= self.quorum_cnt + fault_tolerance - if leader_bk.accept_acks.count() - >= self.quorum_cnt + self.config.fault_tolerance + // remembered, mark this instance as committed + if leader_bk.accept_acks.len() as u8 >= self.quorum_cnt + && Self::coverage_under_faults( + self.population, + &leader_bk.accept_acks, + self.config.fault_tolerance, + ) >= self.quorum_cnt { inst.status = Status::Committed; pf_debug!(self.id; "committed instance at slot {} bal {}", From 6f2e0c7e5dce3cf31e1df1aaf94b65a4416f532e Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 13:25:28 +0800 Subject: [PATCH 07/89] minor updates to bench script --- scripts/local_bench.tmp.py | 39 ++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index c20ad6d5..8aaa6374 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -144,7 +144,7 @@ def all_protocol_configs(num_replicas): max_fault_tolerance = num_replicas - quorum_cnt config_choices = [("MultiPaxos", None, None)] - for shards_per_replica in range(quorum_cnt, 0): + for shards_per_replica in range(quorum_cnt, 0, -1): config_choices.append( ("Crossword", max_fault_tolerance, shards_per_replica) ) @@ -152,20 +152,27 @@ def all_protocol_configs(num_replicas): return config_choices - # for num_replicas in (3, 5, 7): - # for value_size in (1024, 65536, 4194304): - # for protocol, fault_tolerance, shards_per_replica in all_protocol_configs( - # num_replicas - # ): - # bench_round( - # protocol, - # num_replicas, - # value_size, - # 100, - # 60, - # fault_tolerance=fault_tolerance, - # shards_per_replica=shards_per_replica, - # ) + for num_replicas in (3, 5, 7): + for value_size in (1024, 65536, 4194304): + for protocol, fault_tolerance, shards_per_replica in all_protocol_configs( + num_replicas + ): + # print( + # num_replicas, + # value_size, + # protocol, + # fault_tolerance, + # shards_per_replica, + # ) + bench_round( + protocol, + num_replicas, + value_size, + 100, + 60, + fault_tolerance=fault_tolerance, + shards_per_replica=shards_per_replica, + ) bench_round("MultiPaxos", 5, 65536, 0, 60) - # bench_round("Crossword", 5, 65536, 0, 60, fault_tolerance=0, shards_per_replica=1) + bench_round("Crossword", 5, 65536, 0, 60, fault_tolerance=0, shards_per_replica=1) From 7d11298a57d319a81f863ad4d4df002ec2795bd4 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 14:03:55 +0800 Subject: [PATCH 08/89] minor updates to bench script --- scripts/local_bench.tmp.py | 2 +- scripts/local_cluster.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 8aaa6374..ba3e2346 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -19,7 +19,7 @@ def run_process(cmd): def kill_all_matching(name): # print("Kill all:", name) assert name.count(" ") == 0 - cmd = 
["pkill", "-9", "-f", name] + cmd = ["sudo", "pkill", "-9", "-f", name] proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) proc.wait() diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index ffbdff15..c4fa4f68 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -26,7 +26,7 @@ def run_process(cmd, capture_stderr=False): def kill_all_matching(name): print("Kill all:", name) assert name.count(" ") == 0 - cmd = ["pkill", "-9", "-f", name] + cmd = ["sudo", "pkill", "-9", "-f", name] proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) proc.wait() From 8353c08c5310293e55c4dad0d5bbf2e1b081124e Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 14:11:55 +0800 Subject: [PATCH 09/89] minor updates to bench script --- scripts/local_bench.tmp.py | 6 ++---- scripts/local_cluster.py | 5 ++--- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index ba3e2346..45085437 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -1,5 +1,5 @@ +import os import subprocess -import itertools import statistics @@ -19,9 +19,7 @@ def run_process(cmd): def kill_all_matching(name): # print("Kill all:", name) assert name.count(" ") == 0 - cmd = ["sudo", "pkill", "-9", "-f", name] - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - proc.wait() + os.system(f"sudo pkill -9 -f {name}") def launch_cluster(protocol, num_replicas, config): diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index c4fa4f68..f0cc1099 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -1,4 +1,5 @@ import sys +import os import argparse import subprocess from pathlib import Path @@ -26,9 +27,7 @@ def run_process(cmd, capture_stderr=False): def kill_all_matching(name): print("Kill all:", name) assert name.count(" ") == 0 - cmd = ["sudo", "pkill", "-9", "-f", name] - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - proc.wait() + os.system(f"sudo pkill -9 -f {name}") MANAGER_SRV_PORT = 52600 From b2621cbe65f7fc0395c261a99ce39306d3ea3bad Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 14:28:58 +0800 Subject: [PATCH 10/89] fixing scripts address already in use --- scripts/local_bench.tmp.py | 2 +- scripts/local_cluster.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 45085437..3776079f 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -19,7 +19,7 @@ def run_process(cmd): def kill_all_matching(name): # print("Kill all:", name) assert name.count(" ") == 0 - os.system(f"sudo pkill -9 -f {name}") + os.system(f"killall -9 {name} > /dev/null 2>&1") def launch_cluster(protocol, num_replicas, config): diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index f0cc1099..87fd94e3 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -27,7 +27,7 @@ def run_process(cmd, capture_stderr=False): def kill_all_matching(name): print("Kill all:", name) assert name.count(" ") == 0 - os.system(f"sudo pkill -9 -f {name}") + os.system(f"killall -9 {name} > /dev/null 2>&1") MANAGER_SRV_PORT = 52600 From 0a79799d8b2b80f0c160b084602cd03aaf581230 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 14:54:13 +0800 Subject: [PATCH 11/89] fixing scripts address already in use --- scripts/local_bench.tmp.py | 5 ++++- src/manager/reactor.rs 
| 6 ++++-- src/manager/reigner.rs | 6 ++++-- src/server/external.rs | 6 ++++-- src/server/transport.rs | 6 ++++-- src/utils/mod.rs | 2 +- src/utils/safetcp.rs | 28 +++++++++++++++++++++++++--- 7 files changed, 46 insertions(+), 13 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 3776079f..ad51e517 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -108,7 +108,10 @@ def bench_round( shards_per_replica=None, ): print( - f"{protocol:<10s} n={num_replicas:1d} v={value_size:<9d} w%={put_ratio:<3d} {length_s:3d}s" + f"{protocol:<10s} n={num_replicas:1d} v={value_size:<9d} " + + f"f={fault_tolerance if fault_tolerance is not None else 'x':1d} " + + f"s={shards_per_replica if shards_per_replica is not None else 'x':1d} " + + f"w%={put_ratio:<3d} {length_s:3d}s" ) kill_all_matching("summerset_client") kill_all_matching("summerset_server") diff --git a/src/manager/reactor.rs b/src/manager/reactor.rs index 52b14f63..e3a1b198 100644 --- a/src/manager/reactor.rs +++ b/src/manager/reactor.rs @@ -3,7 +3,9 @@ use std::collections::HashMap; use std::net::SocketAddr; -use crate::utils::{SummersetError, safe_tcp_read, safe_tcp_write}; +use crate::utils::{ + SummersetError, safe_tcp_read, safe_tcp_write, tcp_bind_with_retry, +}; use crate::server::ReplicaId; use crate::client::ClientId; @@ -74,7 +76,7 @@ impl ClientReactor { let (client_responder_handles_write, client_responder_handles_read) = flashmap::new::>(); - let client_listener = TcpListener::bind(cli_addr).await?; + let client_listener = tcp_bind_with_retry(cli_addr, 10).await?; let client_acceptor_handle = tokio::spawn(Self::client_acceptor_thread( tx_req, diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index 21658f92..cff8f18f 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -3,7 +3,9 @@ use std::collections::HashMap; use std::net::SocketAddr; -use crate::utils::{SummersetError, safe_tcp_read, safe_tcp_write}; +use crate::utils::{ + SummersetError, safe_tcp_read, safe_tcp_write, tcp_bind_with_retry, +}; use crate::server::ReplicaId; use crate::protocols::SmrProtocol; @@ -71,7 +73,7 @@ impl ServerReigner { let (server_controller_handles_write, server_controller_handles_read) = flashmap::new::>(); - let server_listener = TcpListener::bind(srv_addr).await?; + let server_listener = tcp_bind_with_retry(srv_addr, 10).await?; let server_acceptor_handle = tokio::spawn(Self::server_acceptor_thread( tx_recv, diff --git a/src/server/external.rs b/src/server/external.rs index 9c50b546..3083a662 100644 --- a/src/server/external.rs +++ b/src/server/external.rs @@ -3,7 +3,9 @@ use std::net::SocketAddr; use std::sync::Arc; -use crate::utils::{SummersetError, safe_tcp_read, safe_tcp_write}; +use crate::utils::{ + SummersetError, safe_tcp_read, safe_tcp_write, tcp_bind_with_retry, +}; use crate::server::{ReplicaId, Command, CommandResult}; use crate::client::ClientId; @@ -115,7 +117,7 @@ impl ExternalApi { let (client_servant_handles_write, client_servant_handles_read) = flashmap::new::>(); - let client_listener = TcpListener::bind(api_addr).await?; + let client_listener = tcp_bind_with_retry(api_addr, 10).await?; let client_acceptor_handle = tokio::spawn(Self::client_acceptor_thread( me, diff --git a/src/server/transport.rs b/src/server/transport.rs index 10deff8b..ca121d70 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -3,7 +3,9 @@ use std::fmt; use std::net::SocketAddr; -use crate::utils::{SummersetError, Bitmap, safe_tcp_read, safe_tcp_write}; 
+use crate::utils::{ + SummersetError, Bitmap, safe_tcp_read, safe_tcp_write, tcp_bind_with_retry, +}; use crate::server::ReplicaId; use bytes::BytesMut; @@ -84,7 +86,7 @@ where let (tx_connect, rx_connect) = mpsc::unbounded_channel(); let (tx_connack, rx_connack) = mpsc::unbounded_channel(); - let peer_listener = TcpListener::bind(p2p_addr).await?; + let peer_listener = tcp_bind_with_retry(p2p_addr, 10).await?; let peer_acceptor_handle = tokio::spawn(Self::peer_acceptor_thread( me, tx_recv.clone(), diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 7510b772..23a43006 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -15,5 +15,5 @@ mod rscoding; pub use error::SummersetError; pub use bitmap::Bitmap; pub use timer::Timer; -pub use safetcp::{safe_tcp_read, safe_tcp_write}; +pub use safetcp::{safe_tcp_read, safe_tcp_write, tcp_bind_with_retry}; pub use rscoding::RSCodeword; diff --git a/src/utils/safetcp.rs b/src/utils/safetcp.rs index 2e3cad88..6a0df26a 100644 --- a/src/utils/safetcp.rs +++ b/src/utils/safetcp.rs @@ -1,7 +1,9 @@ //! Safe TCP read/write helpers that provides cancellation safety on the read -//! side and deadlock avoidance on the write side. +//! side and deadlock avoidance on the write side. Safe `TcpListener` binding +//! wrapper that provides a retrying logic. use std::io::ErrorKind; +use std::net::SocketAddr; use crate::utils::SummersetError; @@ -13,7 +15,8 @@ use rmp_serde::encode::to_vec as encode_to_vec; use rmp_serde::decode::from_read as decode_from_read; use tokio::io::AsyncReadExt; -use tokio::net::TcpStream; +use tokio::net::{TcpStream, TcpListener}; +use tokio::time::{self, Duration}; /// Receives an object of type `T` from TCP readable connection `conn_read`, /// using `read_buf` as buffer storage for partial reads. Returns: @@ -140,4 +143,23 @@ where Ok(true) } -// No unit tests for these two helpers... +/// Wrapper over tokio `TcpListener::bind()` that provides a retrying logic. +pub async fn tcp_bind_with_retry( + addr: SocketAddr, + mut retries: u8, +) -> Result { + loop { + match TcpListener::bind(addr).await { + Ok(listener) => return Ok(listener), + Err(e) => { + if retries == 0 { + return Err(e.into()); + } + retries -= 1; + time::sleep(Duration::from_secs(1)).await; + } + } + } +} + +// No unit tests for these helpers... 
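
(Illustrative sketch, not part of the patch series: the `tcp_bind_with_retry` wrapper introduced in the patch above works around the "address already in use" failures the local scripts hit when a previous run's listener port is still occupied. Below is a minimal standalone re-sketch of the same retry-on-bind idea; it re-declares the helper with a plain `io::Result` instead of the crate's `SummersetError`, and the port number, timings, and driver are hypothetical, chosen only for demonstration.)

// sketch: retry TcpListener::bind until the address frees up
use std::net::SocketAddr;
use tokio::net::TcpListener;
use tokio::time::{self, Duration};

async fn tcp_bind_with_retry(
    addr: SocketAddr,
    mut retries: u8,
) -> std::io::Result<TcpListener> {
    loop {
        match TcpListener::bind(addr).await {
            Ok(listener) => return Ok(listener),
            Err(e) => {
                if retries == 0 {
                    return Err(e);
                }
                retries -= 1;
                time::sleep(Duration::from_secs(1)).await;
            }
        }
    }
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    // hypothetical local port chosen for the demo only
    let addr: SocketAddr = "127.0.0.1:52700".parse().unwrap();

    // occupy the port, then release it after two seconds
    let first = TcpListener::bind(addr).await?;
    tokio::spawn(async move {
        time::sleep(Duration::from_secs(2)).await;
        drop(first);
    });

    // a bare bind here would fail with "address already in use";
    // with retries, this succeeds once the first listener is dropped
    let listener = tcp_bind_with_retry(addr, 10).await?;
    println!("bound to {}", listener.local_addr()?);
    Ok(())
}

(One retry per second for up to ten attempts mirrors the values the patch passes at its call sites; a fixed short sleep keeps local cluster restarts quick, though a caller could back off exponentially instead.)
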
From aec9b00d5f03c9dea0ae6dce87a9ee18539db9e6 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 15:01:37 +0800 Subject: [PATCH 12/89] fixing scripts address already in use --- scripts/local_bench.tmp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index ad51e517..96a03bf4 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -109,8 +109,8 @@ def bench_round( ): print( f"{protocol:<10s} n={num_replicas:1d} v={value_size:<9d} " - + f"f={fault_tolerance if fault_tolerance is not None else 'x':1d} " - + f"s={shards_per_replica if shards_per_replica is not None else 'x':1d} " + + f"f={fault_tolerance if fault_tolerance is not None else 'x':1} " + + f"s={shards_per_replica if shards_per_replica is not None else 'x':1} " + f"w%={put_ratio:<3d} {length_s:3d}s" ) kill_all_matching("summerset_client") From 9359dbe73e579f711c8f49eb9498c5abfec663f0 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 15:02:09 +0800 Subject: [PATCH 13/89] fixing scripts address already in use --- scripts/local_bench.tmp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 96a03bf4..1537f7c2 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -42,6 +42,7 @@ def wait_cluster_setup(proc, num_replicas): for line in iter(proc.stderr.readline, b""): l = line.decode() + print(l, end="") if "manager" not in l and "accepting clients" in l: replica = int(l[l.find("(") + 1 : l.find(")")]) assert not accepting_clients[replica] From 667f705cdf4fd7e6e60388cbd90fd83773bf5e89 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 17:29:30 +0800 Subject: [PATCH 14/89] add proper termination signals handling --- Cargo.lock | 22 ++++++++++++ Cargo.toml | 1 + scripts/local_bench.tmp.py | 12 ++++--- scripts/local_cluster.py | 29 ++++++++++++--- src/manager/clusman.rs | 23 ++++++++++-- src/protocols/crossword.rs | 17 ++++++++- src/protocols/multipaxos.rs | 17 ++++++++- src/protocols/rep_nothing.rs | 17 ++++++++- src/protocols/rs_paxos.rs | 17 ++++++++- src/protocols/simple_push.rs | 17 ++++++++- src/server/replica.rs | 7 ++-- src/server/transport.rs | 2 +- src/utils/error.rs | 1 + summerset_client/src/main.rs | 3 +- summerset_manager/src/main.rs | 5 +-- summerset_server/src/main.rs | 66 ++++++++++++++++++++++------------- 16 files changed, 209 insertions(+), 47 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 38874908..e9fc04f8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -230,6 +230,16 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +[[package]] +name = "ctrlc" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a011bbe2c35ce9c1f143b7af6f94f29a167beb4cd1d29e6740ce836f723120e" +dependencies = [ + "nix", + "windows-sys", +] + [[package]] name = "dirs" version = "4.0.0" @@ -607,6 +617,17 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "nix" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b" +dependencies = [ + "bitflags 1.3.2", + "cfg-if", + "libc", +] + [[package]] name = "nom" version = "5.1.3" @@ -1095,6 +1116,7 @@ version = "0.1.0" dependencies = [ "async-trait", "bytes", + "ctrlc", "fixedbitset", "flashmap", "futures", diff 
--git a/Cargo.toml b/Cargo.toml index 18f2bfe9..707f1150 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,3 +22,4 @@ serde = { version = "1.0", features = ["derive"] } toml = { version = "0.7", features = ["parse"] } log = "0.4" reed-solomon-erasure = { version = "6.0", features = ["simd-accel"] } +ctrlc = { version = "3.4", features = ["termination"] } diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 1537f7c2..03d9e9ba 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -16,10 +16,12 @@ def run_process(cmd): return proc -def kill_all_matching(name): +def kill_all_matching(name, force=False): # print("Kill all:", name) assert name.count(" ") == 0 - os.system(f"killall -9 {name} > /dev/null 2>&1") + cmd = "killall -9" if force else "killall" + cmd += f" {name} > /dev/null 2>&1" + os.system(cmd) def launch_cluster(protocol, num_replicas, config): @@ -114,9 +116,9 @@ def bench_round( + f"s={shards_per_replica if shards_per_replica is not None else 'x':1} " + f"w%={put_ratio:<3d} {length_s:3d}s" ) - kill_all_matching("summerset_client") - kill_all_matching("summerset_server") - kill_all_matching("summerset_manager") + kill_all_matching("summerset_client", force=True) + kill_all_matching("summerset_server", force=True) + kill_all_matching("summerset_manager", force=True) configs = [] if fault_tolerance is not None: diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index 87fd94e3..c4e0877c 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -1,5 +1,6 @@ import sys import os +import signal import argparse import subprocess from pathlib import Path @@ -24,10 +25,12 @@ def run_process(cmd, capture_stderr=False): return proc -def kill_all_matching(name): +def kill_all_matching(name, force=False): print("Kill all:", name) assert name.count(" ") == 0 - os.system(f"killall -9 {name} > /dev/null 2>&1") + cmd = "killall -9" if force else "killall" + cmd += f" {name} > /dev/null 2>&1" + os.system(cmd) MANAGER_SRV_PORT = 52600 @@ -155,8 +158,8 @@ def launch_servers(protocol, num_replicas, release, config): args = parser.parse_args() # kill all existing server and manager processes - kill_all_matching("summerset_server") - kill_all_matching("summerset_manager") + kill_all_matching("summerset_server", force=True) + kill_all_matching("summerset_manager", force=True) # remove all existing wal files for path in Path("/tmp").glob("summerset.*.wal"): @@ -170,11 +173,27 @@ def launch_servers(protocol, num_replicas, release, config): wait_manager_setup(manager_proc) # then launch server replicas - launch_servers(args.protocol, args.num_replicas, args.release, args.config) + server_procs = launch_servers( + args.protocol, args.num_replicas, args.release, args.config + ) + + # register termination signals handler + def kill_spawned_procs(*args): + for proc in server_procs: + proc.terminate() + for proc in server_procs: + proc.wait() + manager_proc.terminate() + + signal.signal(signal.SIGINT, kill_spawned_procs) + signal.signal(signal.SIGTERM, kill_spawned_procs) + signal.signal(signal.SIGHUP, kill_spawned_procs) + # since we piped manager proc's output, re-print it out for line in iter(manager_proc.stderr.readline, b""): sys.stderr.buffer.write(line) sys.stderr.flush() + # reaches here after manager proc has terminated rc = manager_proc.wait() sys.exit(rc) diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index 15c7372e..e0c6f842 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -11,6 +11,8 @@ use 
crate::server::ReplicaId; use crate::client::ClientId; use crate::protocols::SmrProtocol; +use tokio::sync::mpsc; + /// Information about an active server. // TODO: maybe add things like leader info, etc. #[derive(Debug, Clone)] @@ -73,8 +75,17 @@ impl ClusterManager { }) } - /// Main event loop logic of the cluster manager. - pub async fn run(&mut self) { + /// Main event loop logic of the cluster manager. Breaks out of the loop + /// only upon catching termination signals to the process. + pub async fn run(&mut self) -> Result<(), SummersetError> { + // set up termination signals handler + let (tx_term, mut rx_term) = mpsc::unbounded_channel(); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("m"; "error sending to term channel: {}", e); + } + })?; + loop { tokio::select! { // receiving server control message @@ -102,8 +113,16 @@ impl ClusterManager { client, e); } }, + + // receiving termination signal + _ = rx_term.recv() => { + pf_warn!("m"; "manager caught termination signal"); + break; + } } } + + Ok(()) } } diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index da59f976..db359123 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -22,6 +22,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; +use tokio::sync::mpsc; use reed_solomon_erasure::galois_8::ReedSolomon; @@ -1215,7 +1216,15 @@ impl GenericReplica for CrosswordReplica { }) } - async fn run(&mut self) { + async fn run(&mut self) -> Result { + // set up termination signals handler + let (tx_term, mut rx_term) = mpsc::unbounded_channel(); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("s"; "error sending to term channel: {}", e); + } + })?; + // TODO: proper leader election if self.id == 0 { self.is_leader = true; @@ -1282,6 +1291,12 @@ impl GenericReplica for CrosswordReplica { if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { pf_error!(self.id; "error handling ctrl msg: {}", e); } + }, + + // receiving termination signal + _ = rx_term.recv() => { + pf_warn!(self.id; "server caught termination signal"); + return Ok(false); } } } diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index d44056f1..5ed71b3d 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -26,6 +26,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; +use tokio::sync::mpsc; /// Configuration parameters struct. 
#[derive(Debug, Deserialize)] @@ -1010,7 +1011,15 @@ impl GenericReplica for MultiPaxosReplica { }) } - async fn run(&mut self) { + async fn run(&mut self) -> Result { + // set up termination signals handler + let (tx_term, mut rx_term) = mpsc::unbounded_channel(); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("s"; "error sending to term channel: {}", e); + } + })?; + // TODO: proper leader election if self.id == 0 { self.is_leader = true; @@ -1077,6 +1086,12 @@ impl GenericReplica for MultiPaxosReplica { if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { pf_error!(self.id; "error handling ctrl msg: {}", e); } + }, + + // receiving termination signal + _ = rx_term.recv() => { + pf_warn!(self.id; "server caught termination signal"); + return Ok(false); } } } diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index b071253b..f4ea852c 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -21,6 +21,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; +use tokio::sync::mpsc; /// Configuration parameters struct. #[derive(Debug, Deserialize)] @@ -295,7 +296,15 @@ impl GenericReplica for RepNothingReplica { }) } - async fn run(&mut self) { + async fn run(&mut self) -> Result { + // set up termination signals handler + let (tx_term, mut rx_term) = mpsc::unbounded_channel(); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("s"; "error sending to term channel: {}", e); + } + })?; + loop { tokio::select! { // client request batch @@ -344,6 +353,12 @@ impl GenericReplica for RepNothingReplica { if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { pf_error!(self.id; "error handling ctrl msg: {}", e); } + }, + + // receiving termination signal + _ = rx_term.recv() => { + pf_warn!(self.id; "server caught termination signal"); + return Ok(false); } } } diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index b2da668d..ce3ccc85 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -22,6 +22,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; +use tokio::sync::mpsc; use reed_solomon_erasure::galois_8::ReedSolomon; @@ -1108,7 +1109,15 @@ impl GenericReplica for RSPaxosReplica { }) } - async fn run(&mut self) { + async fn run(&mut self) -> Result { + // set up termination signals handler + let (tx_term, mut rx_term) = mpsc::unbounded_channel(); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("s"; "error sending to term channel: {}", e); + } + })?; + // TODO: proper leader election if self.id == 0 { self.is_leader = true; @@ -1175,6 +1184,12 @@ impl GenericReplica for RSPaxosReplica { if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { pf_error!(self.id; "error handling ctrl msg: {}", e); } + }, + + // receiving termination signal + _ = rx_term.recv() => { + pf_warn!(self.id; "server caught termination signal"); + return Ok(false); } } } diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index eb082de3..841d8c20 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -22,6 +22,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; +use tokio::sync::mpsc; /// Configuration parameters struct. 
#[derive(Debug, Deserialize)] @@ -470,7 +471,15 @@ impl GenericReplica for SimplePushReplica { }) } - async fn run(&mut self) { + async fn run(&mut self) -> Result { + // set up termination signals handler + let (tx_term, mut rx_term) = mpsc::unbounded_channel(); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("s"; "error sending to term channel: {}", e); + } + })?; + loop { tokio::select! { // client request batch @@ -541,6 +550,12 @@ impl GenericReplica for SimplePushReplica { if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { pf_error!(self.id; "error handling ctrl msg: {}", e); } + }, + + // receiving termination signal + _ = rx_term.recv() => { + pf_warn!(self.id; "server caught termination signal"); + return Ok(false); } } } diff --git a/src/server/replica.rs b/src/server/replica.rs index 174e60a2..cae9d042 100644 --- a/src/server/replica.rs +++ b/src/server/replica.rs @@ -24,6 +24,9 @@ pub trait GenericReplica { where Self: Sized; - /// Main event loop logic of running this replica. - async fn run(&mut self); + /// Main event loop logic of running this replica. Returns `Ok(true)` if + /// terminated normally and wants to restart (e.g., receiving a reset + /// control message) or `Ok(false)` if terminated normally and does not + /// want to restart (e.g., receiving a termination signal). + async fn run(&mut self) -> Result; } diff --git a/src/server/transport.rs b/src/server/transport.rs index ca121d70..504e32a2 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -370,7 +370,7 @@ where to_connect = rx_connect.recv() => { if to_connect.is_none() { pf_error!(me; "connect channel closed"); - continue; + break; // channel gets closed and no messages remain } let (peer, addr) = to_connect.unwrap(); if let Err(e) = Self::connect_new_peer( diff --git a/src/utils/error.rs b/src/utils/error.rs index cdef3b56..90e576c5 100644 --- a/src/utils/error.rs +++ b/src/utils/error.rs @@ -45,6 +45,7 @@ impl_from_error!( tokio::sync::mpsc::error::SendError<(ReplicaId, net::SocketAddr)> ); impl_from_error!(reed_solomon_erasure::Error); +impl_from_error!(ctrlc::Error); #[cfg(test)] mod error_tests { diff --git a/summerset_client/src/main.rs b/summerset_client/src/main.rs index bba5eae3..26346720 100644 --- a/summerset_client/src/main.rs +++ b/summerset_client/src/main.rs @@ -10,7 +10,7 @@ use env_logger::Env; use tokio::runtime::Builder; use tokio::time::Duration; -use summerset::{SmrProtocol, SummersetError, pf_error}; +use summerset::{SmrProtocol, SummersetError, pf_warn, pf_error}; mod drivers; mod clients; @@ -160,6 +160,7 @@ fn main() -> ExitCode { pf_error!("c"; "client_main exitted: {}", e); ExitCode::FAILURE } else { + pf_warn!("c"; "client_main exitted successfully"); ExitCode::SUCCESS } } diff --git a/summerset_manager/src/main.rs b/summerset_manager/src/main.rs index 87fc1d61..6b886372 100644 --- a/summerset_manager/src/main.rs +++ b/summerset_manager/src/main.rs @@ -9,7 +9,7 @@ use env_logger::Env; use tokio::runtime::Builder; -use summerset::{SmrProtocol, SummersetError, pf_error}; +use summerset::{SmrProtocol, SummersetError, pf_warn, pf_error}; /// Command line arguments definition. 
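The `impl_from_error!(ctrlc::Error)` line added to `src/utils/error.rs` is what lets `?` be applied to `ctrlc::set_handler(...)` inside functions returning `Result<_, SummersetError>`. The macro body is outside this excerpt; assuming `SummersetError` is a newtype over a `String` (as the `SummersetError("...".into())` constructions later in this series suggest), the generated impl is presumably equivalent to the hand-written sketch below:

    // stand-in definition for illustration only; the real type lives in
    // src/utils/error.rs
    pub struct SummersetError(pub String);

    // hypothetical expansion of `impl_from_error!(ctrlc::Error)`: stringify
    // the source error and wrap it in the crate-wide error type
    impl From<ctrlc::Error> for SummersetError {
        fn from(e: ctrlc::Error) -> Self {
            SummersetError(e.to_string())
        }
    }
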
#[derive(Parser, Debug)] @@ -113,7 +113,7 @@ fn manager_main() -> Result<(), SummersetError> { .new_cluster_manager_setup(srv_addr, cli_addr, args.population) .await?; - manager.run().await; + manager.run().await?; Ok::<(), SummersetError>(()) // give type hint for this async closure }) @@ -130,6 +130,7 @@ fn main() -> ExitCode { pf_error!("m"; "manager_main exitted: {}", e); ExitCode::FAILURE } else { + pf_warn!("m"; "manager_main exitted successfully"); ExitCode::SUCCESS } } diff --git a/summerset_server/src/main.rs b/summerset_server/src/main.rs index df3a736b..abbbc20d 100644 --- a/summerset_server/src/main.rs +++ b/summerset_server/src/main.rs @@ -2,6 +2,8 @@ use std::net::SocketAddr; use std::process::ExitCode; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; use clap::Parser; @@ -9,7 +11,7 @@ use env_logger::Env; use tokio::runtime::Builder; -use summerset::{SmrProtocol, SummersetError, pf_error}; +use summerset::{SmrProtocol, SummersetError, pf_warn, pf_error}; /// Command line arguments definition. #[derive(Parser, Debug)] @@ -107,29 +109,44 @@ fn server_main() -> Result<(), SummersetError> { Some(&args.config[..]) }; - // create tokio multi-threaded runtime - let runtime = Builder::new_multi_thread() - .enable_all() - .worker_threads(args.threads) - .thread_name("tokio-worker-replica") - .build()?; - - // enter tokio runtime, setup the server replica, and start the main event - // loop logic - runtime.block_on(async move { - let mut replica = protocol - .new_server_replica_setup( - api_addr, - p2p_addr, - args.manager, - config_str, - ) - .await?; - - replica.run().await; - - Ok::<(), SummersetError>(()) // give type hint for this async closure - }) + let shutdown = Arc::new(AtomicBool::new(false)); + while !shutdown.load(Ordering::SeqCst) { + let sd = shutdown.clone(); + + // create tokio multi-threaded runtime + let runtime = Builder::new_multi_thread() + .enable_all() + .worker_threads(args.threads) + .thread_name("tokio-worker-replica") + .build()?; + + // enter tokio runtime, setup the server replica, and start the main + // event loop logic + runtime.block_on(async move { + let mut replica = protocol + .new_server_replica_setup( + api_addr, + p2p_addr, + args.manager, + config_str, + ) + .await?; + + if replica.run().await? 
{ + // event loop terminated but wants to restart (e.g., when + // receiving a reset control message); just drop this runtime + // and move to the next iteration of loop + } else { + // event loop terminated and does not want to restart (e.g., + // when receiving a termination signal) + sd.store(true, Ordering::SeqCst); + } + + Ok::<(), SummersetError>(()) // give type hint for this async closure + })?; + } + + Ok(()) } fn main() -> ExitCode { @@ -143,6 +160,7 @@ fn main() -> ExitCode { pf_error!("s"; "server_main exitted: {}", e); ExitCode::FAILURE } else { + pf_warn!("s"; "server_main exitted successfully"); ExitCode::SUCCESS } } From f7d71d45aafc69bee22a11ad6d4b5ae10c5cfe51 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 17:33:21 +0800 Subject: [PATCH 15/89] fixing scripts address already in use --- scripts/local_bench.tmp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 03d9e9ba..a76b77ea 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -44,7 +44,7 @@ def wait_cluster_setup(proc, num_replicas): for line in iter(proc.stderr.readline, b""): l = line.decode() - print(l, end="") + # print(l, end="") if "manager" not in l and "accepting clients" in l: replica = int(l[l.find("(") + 1 : l.find(")")]) assert not accepting_clients[replica] From edbd4f5386a30d5eb0fa63c51632e840be71cd72 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 18:46:26 +0800 Subject: [PATCH 16/89] fix wrong NewServerJoin message send timing --- src/manager/clusman.rs | 3 ++- src/manager/reigner.rs | 15 ++++++++++-- src/protocols/crossword.rs | 45 ++++++++++++++++++++---------------- src/protocols/multipaxos.rs | 43 +++++++++++++++++++--------------- src/protocols/rep_nothing.rs | 26 ++++++++++++--------- src/protocols/rs_paxos.rs | 43 +++++++++++++++++++--------------- src/protocols/simple_push.rs | 43 +++++++++++++++++++--------------- src/server/control.rs | 9 ++++++-- 8 files changed, 134 insertions(+), 93 deletions(-) diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index e0c6f842..8890faa9 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -61,7 +61,8 @@ impl ClusterManager { return logged_err!("m"; "invalid population {}", population); } - let server_reigner = ServerReigner::new_and_setup(srv_addr).await?; + let server_reigner = + ServerReigner::new_and_setup(srv_addr, population).await?; let client_reactor = ClientReactor::new_and_setup(cli_addr).await?; Ok(ClusterManager { diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index cff8f18f..ef7d7579 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -64,6 +64,7 @@ impl ServerReigner { /// messages. pub async fn new_and_setup( srv_addr: SocketAddr, + population: u8, ) -> Result { let (tx_recv, rx_recv) = mpsc::unbounded_channel(); @@ -76,6 +77,7 @@ impl ServerReigner { let server_listener = tcp_bind_with_retry(srv_addr, 10).await?; let server_acceptor_handle = tokio::spawn(Self::server_acceptor_thread( + population, tx_recv, server_listener, tx_sends_write, @@ -128,10 +130,12 @@ impl ServerReigner { // ServerReigner server_acceptor thread implementation impl ServerReigner { /// Accepts a new server connection. 
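The reworked `server_main` above boils down to a restart loop: build a fresh Tokio runtime per incarnation, run the replica to completion, and only flip the shared shutdown flag when `run()` returns `Ok(false)`. A condensed sketch of that control flow, with `run_replica_once` as a placeholder for the `new_server_replica_setup(...)` plus `replica.run()` sequence:

    use std::sync::Arc;
    use std::sync::atomic::{AtomicBool, Ordering};

    use tokio::runtime::Builder;

    fn serve_until_shutdown() -> Result<(), Box<dyn std::error::Error>> {
        let shutdown = Arc::new(AtomicBool::new(false));
        while !shutdown.load(Ordering::SeqCst) {
            let sd = shutdown.clone();

            // a brand-new runtime per iteration, so all tasks and sockets of
            // the previous incarnation are dropped before rebooting
            let runtime = Builder::new_multi_thread().enable_all().build()?;
            runtime.block_on(async move {
                // Ok(true): replica wants a restart, e.g. after a reset
                // Ok(false): replica wants a full shutdown
                if !run_replica_once().await? {
                    sd.store(true, Ordering::SeqCst);
                }
                Ok::<(), Box<dyn std::error::Error>>(())
            })?;
        }
        Ok(())
    }

    // placeholder for setting up and running one replica incarnation
    async fn run_replica_once() -> Result<bool, Box<dyn std::error::Error>> {
        Ok(false)
    }
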
+ #[allow(clippy::too_many_arguments)] async fn accept_new_server( mut stream: TcpStream, addr: SocketAddr, id: ReplicaId, + population: u8, tx_recv: mpsc::UnboundedSender<(ReplicaId, CtrlMsg)>, tx_sends: &mut flashmap::WriteHandle< ReplicaId, @@ -143,11 +147,16 @@ impl ServerReigner { >, tx_exit: mpsc::UnboundedSender, ) -> Result<(), SummersetError> { - // send ID assignment + // first send server ID assignment if let Err(e) = stream.write_u8(id).await { return logged_err!("m"; "error assigning new server ID: {}", e); } + // then send population + if let Err(e) = stream.write_u8(population).await { + return logged_err!("m"; "error sending population: {}", e); + } + let mut tx_sends_guard = tx_sends.guard(); if let Some(sender) = tx_sends_guard.get(&id) { if sender.is_closed() { @@ -205,6 +214,7 @@ impl ServerReigner { /// Server acceptor thread function. async fn server_acceptor_thread( + population: u8, tx_recv: mpsc::UnboundedSender<(ReplicaId, CtrlMsg)>, server_listener: TcpListener, mut tx_sends: flashmap::WriteHandle< @@ -241,6 +251,7 @@ impl ServerReigner { stream, addr, next_server_id, + population, tx_recv.clone(), &mut tx_sends, &mut server_controller_handles, @@ -471,7 +482,7 @@ mod reigner_tests { }); // manager let mut reigner = - ServerReigner::new_and_setup("127.0.0.1:53600".parse()?).await?; + ServerReigner::new_and_setup("127.0.0.1:53600".parse()?, 2).await?; setup_bar.wait().await; // recv message from server 0 let (id, msg) = reigner.recv_ctrl().await?; diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index db359123..dc74afe8 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -1116,14 +1116,16 @@ impl GenericReplica for CrosswordReplica { manager: SocketAddr, config_str: Option<&str>, ) -> Result { - let config = parsed_config!(config_str => ReplicaConfigCrossword; - batch_interval_us, max_batch_size, - backer_path, logger_sync, fault_tolerance, - shards_per_replica)?; // connect to the cluster manager and get assigned a server ID let mut control_hub = ControlHub::new_and_setup(manager).await?; let id = control_hub.me; + let population = control_hub.population; + // parse protocol-specific configs + let config = parsed_config!(config_str => ReplicaConfigCrossword; + batch_interval_us, max_batch_size, + backer_path, logger_sync, fault_tolerance, + shards_per_replica)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -1132,20 +1134,31 @@ impl GenericReplica for CrosswordReplica { ); } - // ask for population number and the list of peers to proactively - // connect to + // setup state machine module + let state_machine = StateMachine::new_and_setup(id).await?; + + // setup storage hub module + let storage_hub = + StorageHub::new_and_setup(id, Path::new(&config.backer_path)) + .await?; + + // setup transport hub module + let mut transport_hub = + TransportHub::new_and_setup(id, population, p2p_addr).await?; + + // ask for the list of peers to proactively connect to. Do this after + // transport hub has been set up, so that I will be able to accept + // later peer connections control_hub.send_ctrl(CtrlMsg::NewServerJoin { id, protocol: SmrProtocol::Crossword, api_addr, p2p_addr, })?; - let (population, to_peers) = if let CtrlMsg::ConnectToPeers { - population, - to_peers, - } = control_hub.recv_ctrl().await? + let to_peers = if let CtrlMsg::ConnectToPeers { to_peers, .. } = + control_hub.recv_ctrl().await? 
{ - (population, to_peers) + to_peers } else { return logged_err!(id; "unexpected ctrl msg type received"); }; @@ -1168,15 +1181,6 @@ impl GenericReplica for CrosswordReplica { (population - quorum_cnt) as usize, )?; - let state_machine = StateMachine::new_and_setup(id).await?; - - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; - - let mut transport_hub = - TransportHub::new_and_setup(id, population, p2p_addr).await?; - // proactively connect to some peers, then wait for all population // have been connected with me for (peer, addr) in to_peers { @@ -1184,6 +1188,7 @@ impl GenericReplica for CrosswordReplica { } transport_hub.wait_for_group(population).await?; + // setup external API module, ready to take in client requests let external_api = ExternalApi::new_and_setup( id, api_addr, diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 5ed71b3d..4e50c9a3 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -931,13 +931,15 @@ impl GenericReplica for MultiPaxosReplica { manager: SocketAddr, config_str: Option<&str>, ) -> Result { - let config = parsed_config!(config_str => ReplicaConfigMultiPaxos; - batch_interval_us, max_batch_size, - backer_path, logger_sync)?; // connect to the cluster manager and get assigned a server ID let mut control_hub = ControlHub::new_and_setup(manager).await?; let id = control_hub.me; + let population = control_hub.population; + // parse protocol-specific configs + let config = parsed_config!(config_str => ReplicaConfigMultiPaxos; + batch_interval_us, max_batch_size, + backer_path, logger_sync)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -946,33 +948,35 @@ impl GenericReplica for MultiPaxosReplica { ); } - // ask for population number and the list of peers to proactively - // connect to + // setup state machine module + let state_machine = StateMachine::new_and_setup(id).await?; + + // setup storage hub module + let storage_hub = + StorageHub::new_and_setup(id, Path::new(&config.backer_path)) + .await?; + + // setup transport hub module + let mut transport_hub = + TransportHub::new_and_setup(id, population, p2p_addr).await?; + + // ask for the list of peers to proactively connect to. Do this after + // transport hub has been set up, so that I will be able to accept + // later peer connections control_hub.send_ctrl(CtrlMsg::NewServerJoin { id, protocol: SmrProtocol::MultiPaxos, api_addr, p2p_addr, })?; - let (population, to_peers) = if let CtrlMsg::ConnectToPeers { - population, - to_peers, - } = control_hub.recv_ctrl().await? + let to_peers = if let CtrlMsg::ConnectToPeers { to_peers, .. } = + control_hub.recv_ctrl().await? 
{ - (population, to_peers) + to_peers } else { return logged_err!(id; "unexpected ctrl msg type received"); }; - let state_machine = StateMachine::new_and_setup(id).await?; - - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; - - let mut transport_hub = - TransportHub::new_and_setup(id, population, p2p_addr).await?; - // proactively connect to some peers, then wait for all population // have been connected with me for (peer, addr) in to_peers { @@ -980,6 +984,7 @@ impl GenericReplica for MultiPaxosReplica { } transport_hub.wait_for_group(population).await?; + // setup external API module, ready to take in client requests let external_api = ExternalApi::new_and_setup( id, api_addr, diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index f4ea852c..bbfb79c6 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -242,13 +242,14 @@ impl GenericReplica for RepNothingReplica { manager: SocketAddr, config_str: Option<&str>, ) -> Result { - let config = parsed_config!(config_str => ReplicaConfigRepNothing; - batch_interval_us, max_batch_size, - backer_path, logger_sync)?; // connect to the cluster manager and get assigned a server ID let mut control_hub = ControlHub::new_and_setup(manager).await?; let id = control_hub.me; + // parse protocol-specific configs + let config = parsed_config!(config_str => ReplicaConfigRepNothing; + batch_interval_us, max_batch_size, + backer_path, logger_sync)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -257,6 +258,16 @@ impl GenericReplica for RepNothingReplica { ); } + // setup state machine module + let state_machine = StateMachine::new_and_setup(id).await?; + + // setup storage hub module + let storage_hub = + StorageHub::new_and_setup(id, Path::new(&config.backer_path)) + .await?; + + // TransportHub is not needed in RepNothing + // tell the manager tha I have joined control_hub.send_ctrl(CtrlMsg::NewServerJoin { id, @@ -266,14 +277,7 @@ impl GenericReplica for RepNothingReplica { })?; control_hub.recv_ctrl().await?; - let state_machine = StateMachine::new_and_setup(id).await?; - - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; - - // TransportHub is not needed in RepNothing - + // setup external API module, ready to take in client requests let external_api = ExternalApi::new_and_setup( id, api_addr, diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index ce3ccc85..1c72b41a 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -1016,13 +1016,15 @@ impl GenericReplica for RSPaxosReplica { manager: SocketAddr, config_str: Option<&str>, ) -> Result { - let config = parsed_config!(config_str => ReplicaConfigRSPaxos; - batch_interval_us, max_batch_size, - backer_path, logger_sync, fault_tolerance)?; // connect to the cluster manager and get assigned a server ID let mut control_hub = ControlHub::new_and_setup(manager).await?; let id = control_hub.me; + let population = control_hub.population; + // parse protocol-specific configs + let config = parsed_config!(config_str => ReplicaConfigRSPaxos; + batch_interval_us, max_batch_size, + backer_path, logger_sync, fault_tolerance)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -1031,20 +1033,31 @@ impl GenericReplica for RSPaxosReplica { ); } - // ask for population number and the list of peers to proactively - // connect to + // setup state machine module + let state_machine = StateMachine::new_and_setup(id).await?; + 
+ // setup storage hub module + let storage_hub = + StorageHub::new_and_setup(id, Path::new(&config.backer_path)) + .await?; + + // setup transport hub module + let mut transport_hub = + TransportHub::new_and_setup(id, population, p2p_addr).await?; + + // ask for the list of peers to proactively connect to. Do this after + // transport hub has been set up, so that I will be able to accept + // later peer connections control_hub.send_ctrl(CtrlMsg::NewServerJoin { id, protocol: SmrProtocol::RSPaxos, api_addr, p2p_addr, })?; - let (population, to_peers) = if let CtrlMsg::ConnectToPeers { - population, - to_peers, - } = control_hub.recv_ctrl().await? + let to_peers = if let CtrlMsg::ConnectToPeers { to_peers, .. } = + control_hub.recv_ctrl().await? { - (population, to_peers) + to_peers } else { return logged_err!(id; "unexpected ctrl msg type received"); }; @@ -1061,15 +1074,6 @@ impl GenericReplica for RSPaxosReplica { (population - quorum_cnt) as usize, )?; - let state_machine = StateMachine::new_and_setup(id).await?; - - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; - - let mut transport_hub = - TransportHub::new_and_setup(id, population, p2p_addr).await?; - // proactively connect to some peers, then wait for all population // have been connected with me for (peer, addr) in to_peers { @@ -1077,6 +1081,7 @@ impl GenericReplica for RSPaxosReplica { } transport_hub.wait_for_group(population).await?; + // setup external API module, ready to take in client requests let external_api = ExternalApi::new_and_setup( id, api_addr, diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index 841d8c20..b0156ad3 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -398,13 +398,15 @@ impl GenericReplica for SimplePushReplica { manager: SocketAddr, config_str: Option<&str>, ) -> Result { - let config = parsed_config!(config_str => ReplicaConfigSimplePush; - batch_interval_us, max_batch_size, - backer_path, rep_degree)?; // connect to the cluster manager and get assigned a server ID let mut control_hub = ControlHub::new_and_setup(manager).await?; let id = control_hub.me; + let population = control_hub.population; + // parse protocol-specific configs + let config = parsed_config!(config_str => ReplicaConfigSimplePush; + batch_interval_us, max_batch_size, + backer_path, rep_degree)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -413,33 +415,35 @@ impl GenericReplica for SimplePushReplica { ); } - // ask for population number and the list of peers to proactively - // connect to + // setup state machine module + let state_machine = StateMachine::new_and_setup(id).await?; + + // setup storage hub module + let storage_hub = + StorageHub::new_and_setup(id, Path::new(&config.backer_path)) + .await?; + + // setup transport hub module + let mut transport_hub = + TransportHub::new_and_setup(id, population, p2p_addr).await?; + + // ask for the list of peers to proactively connect to. Do this after + // transport hub has been set up, so that I will be able to accept + // later peer connections control_hub.send_ctrl(CtrlMsg::NewServerJoin { id, protocol: SmrProtocol::SimplePush, api_addr, p2p_addr, })?; - let (population, to_peers) = if let CtrlMsg::ConnectToPeers { - population, - to_peers, - } = control_hub.recv_ctrl().await? + let to_peers = if let CtrlMsg::ConnectToPeers { to_peers, .. } = + control_hub.recv_ctrl().await? 
{ - (population, to_peers) + to_peers } else { return logged_err!(id; "unexpected ctrl msg type received"); }; - let state_machine = StateMachine::new_and_setup(id).await?; - - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; - - let mut transport_hub = - TransportHub::new_and_setup(id, population, p2p_addr).await?; - // proactively connect to some peers, then wait for all population // have been connected with me for (peer, addr) in to_peers { @@ -447,6 +451,7 @@ impl GenericReplica for SimplePushReplica { } transport_hub.wait_for_group(population).await?; + // setup external API module, ready to take in client requests let external_api = ExternalApi::new_and_setup( id, api_addr, diff --git a/src/server/control.rs b/src/server/control.rs index 68b812fd..05627db3 100644 --- a/src/server/control.rs +++ b/src/server/control.rs @@ -19,6 +19,9 @@ pub struct ControlHub { /// My replica ID. pub me: ReplicaId, + /// Number of replicas in cluster. + pub population: u8, + /// Receiver side of the recv channel. rx_recv: mpsc::UnboundedReceiver, @@ -42,8 +45,9 @@ impl ControlHub { // connect to the cluster manager and receive my assigned server ID pf_info!("s"; "connecting to manager '{}'...", manager); let mut stream = TcpStream::connect(manager).await?; - let id = stream.read_u8().await?; // receive my server ID - pf_debug!(id; "assigned server ID: {}", id); + let id = stream.read_u8().await?; // first receive assigned server ID + let population = stream.read_u8().await?; // then receive population + pf_debug!(id; "assigned server ID: {} of {}", id, population); let (tx_recv, rx_recv) = mpsc::unbounded_channel(); let (tx_send, rx_send) = mpsc::unbounded_channel(); @@ -54,6 +58,7 @@ impl ControlHub { Ok(ControlHub { me: id, + population, rx_recv, tx_send, _control_messenger_handle: control_messenger_handle, From f1295e8dfb3dc1154cbdbb9c0ad27f0f19551cab Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 18:59:34 +0800 Subject: [PATCH 17/89] minor updates to README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3ccadae9..22eb30f3 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ git checkout -b git branch --set-upstream-to=private/main git pull private git push origin -# then, on GitHub, make a PR from branch to main +# then, on GitHub, make a squashing PR from branch to main ``` # Summerset From 056f385bb73a3dabad4e0cc643fe8ed3ca562d52 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 20:18:56 +0800 Subject: [PATCH 18/89] staging progress on reset control message --- src/manager/reactor.rs | 13 ++++++++++++- src/manager/reigner.rs | 6 +++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/manager/reactor.rs b/src/manager/reactor.rs index e3a1b198..3aba3ea2 100644 --- a/src/manager/reactor.rs +++ b/src/manager/reactor.rs @@ -1,6 +1,6 @@ //! Cluster manager client-facing reactor module implementation. -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::net::SocketAddr; use crate::utils::{ @@ -26,6 +26,14 @@ pub enum CtrlRequest { /// Query the set of active servers and their info. QueryInfo, + /// Reset the specified server(s) to initial state. + ResetServer { + /// ID of server to reset. If `None`, resets all active servers. + server: Option, + /// If false, cleans durable storage state as well. + durable: bool, + }, + /// Client leave notification. 
Leave, } @@ -38,6 +46,9 @@ pub enum CtrlReply { servers: HashMap, }, + /// Reply to server reset request. + ResetServer { servers: HashSet }, + /// Reply to client leave notification. Leave, } diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index ef7d7579..459918b2 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -21,7 +21,7 @@ use tokio::task::JoinHandle; /// Control message from/to servers. Control traffic could be bidirectional: /// some initiated by the manager and some by servers. -// TODO: add reset, pause, resume, server leave, leader change, etc. +// TODO: add pause, resume, server leave, leader change, etc. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] pub enum CtrlMsg { /// Server -> Manager: new server up, requesting a list of peers' addresses @@ -38,6 +38,10 @@ pub enum CtrlMsg { population: u8, to_peers: HashMap, }, + + /// Manager -> Server: reset to initial state. If durable is false, cleans + /// durable storage state as well. + ResetState { durable: bool }, } /// The server-facing controller API module. From bc2e22f175dc66cd3a6fbd8da212f3887474cb31 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 21:17:32 +0800 Subject: [PATCH 19/89] add tcp_connect wrapper; better server ID assign logic --- src/client/apistub.rs | 7 ++++--- src/manager/clusman.rs | 40 +++++++++++++++++++++++++++++++++-- src/manager/reigner.rs | 46 +++++++++++++++++++++++++++-------------- src/server/control.rs | 6 ++++-- src/server/transport.rs | 3 ++- src/utils/error.rs | 4 +++- src/utils/mod.rs | 4 +++- src/utils/safetcp.rs | 23 ++++++++++++++++++--- 8 files changed, 104 insertions(+), 29 deletions(-) diff --git a/src/client/apistub.rs b/src/client/apistub.rs index 8106f7f7..ea0bb14f 100644 --- a/src/client/apistub.rs +++ b/src/client/apistub.rs @@ -2,13 +2,14 @@ use std::net::SocketAddr; -use crate::utils::{SummersetError, safe_tcp_read, safe_tcp_write}; +use crate::utils::{ + SummersetError, safe_tcp_read, safe_tcp_write, tcp_connect_with_retry, +}; use crate::server::{ApiRequest, ApiReply}; use crate::client::ClientId; use bytes::BytesMut; -use tokio::net::TcpStream; use tokio::net::tcp::{OwnedReadHalf, OwnedWriteHalf}; use tokio::io::AsyncWriteExt; @@ -40,7 +41,7 @@ impl ClientApiStub { addr: SocketAddr, ) -> Result { pf_info!(id; "connecting to server '{}'...", addr); - let mut stream = TcpStream::connect(addr).await?; + let mut stream = tcp_connect_with_retry(addr, 10).await?; stream.write_u64(id).await?; // send my client ID let (read_half, write_half) = stream.into_split(); diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index 8890faa9..9a153aa3 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -1,6 +1,6 @@ //! Summerset cluster manager oracle implementation. -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::net::SocketAddr; use crate::utils::SummersetError; @@ -41,11 +41,20 @@ pub struct ClusterManager { /// ServerReigner module. server_reigner: ServerReigner, + /// Receiver side of the server ID assignment channel. + rx_id_assign: mpsc::UnboundedReceiver<()>, + + /// Sender side of the server ID assignment result channel. + tx_id_result: mpsc::UnboundedSender<(ReplicaId, u8)>, + /// ClientReactor module. client_reactor: ClientReactor, /// Information of current active servers. server_info: HashMap, + + /// Currently assigned server IDs. 
+ assigned_ids: HashSet, } impl ClusterManager { @@ -61,8 +70,12 @@ impl ClusterManager { return logged_err!("m"; "invalid population {}", population); } + let (tx_id_assign, rx_id_assign) = mpsc::unbounded_channel(); + let (tx_id_result, rx_id_result) = mpsc::unbounded_channel(); let server_reigner = - ServerReigner::new_and_setup(srv_addr, population).await?; + ServerReigner::new_and_setup(srv_addr, tx_id_assign, rx_id_result) + .await?; + let client_reactor = ClientReactor::new_and_setup(cli_addr).await?; Ok(ClusterManager { @@ -71,11 +84,27 @@ impl ClusterManager { _cli_addr: cli_addr, population, server_reigner, + rx_id_assign, + tx_id_result, client_reactor, server_info: HashMap::new(), + assigned_ids: HashSet::new(), }) } + /// Assign the first vacant server ID to a new server. + fn assign_server_id(&mut self) -> Result<(), SummersetError> { + for id in 0..self.population { + if !self.assigned_ids.contains(&id) { + self.tx_id_result.send((id, self.population))?; + self.assigned_ids.insert(id); + return Ok(()); + } + } + + logged_err!("m"; "no server ID < population left available") + } + /// Main event loop logic of the cluster manager. Breaks out of the loop /// only upon catching termination signals to the process. pub async fn run(&mut self) -> Result<(), SummersetError> { @@ -89,6 +118,13 @@ impl ClusterManager { loop { tokio::select! { + // receiving server ID assignment request + _ = self.rx_id_assign.recv() => { + if let Err(e) = self.assign_server_id() { + pf_error!("m"; "error assigning new server ID: {}", e); + } + }, + // receiving server control message ctrl_msg = self.server_reigner.recv_ctrl() => { if let Err(e) = ctrl_msg { diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index 459918b2..05436ac0 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -64,11 +64,12 @@ pub struct ServerReigner { // ServerReigner public API implementation impl ServerReigner { /// Creates a new server-facing controller module. Spawns the server - /// acceptor thread. Creates a recv channel for buffering incoming control - /// messages. + /// acceptor thread. Creates a pair of ID assignment channels. Creates + /// a recv channel for buffering incoming control messages. 
pub async fn new_and_setup( srv_addr: SocketAddr, - population: u8, + tx_id_assign: mpsc::UnboundedSender<()>, + rx_id_result: mpsc::UnboundedReceiver<(ReplicaId, u8)>, ) -> Result { let (tx_recv, rx_recv) = mpsc::unbounded_channel(); @@ -81,7 +82,8 @@ impl ServerReigner { let server_listener = tcp_bind_with_retry(srv_addr, 10).await?; let server_acceptor_handle = tokio::spawn(Self::server_acceptor_thread( - population, + tx_id_assign, + rx_id_result, tx_recv, server_listener, tx_sends_write, @@ -138,8 +140,8 @@ impl ServerReigner { async fn accept_new_server( mut stream: TcpStream, addr: SocketAddr, - id: ReplicaId, - population: u8, + tx_id_assign: &mpsc::UnboundedSender<()>, + rx_id_result: &mut mpsc::UnboundedReceiver<(ReplicaId, u8)>, tx_recv: mpsc::UnboundedSender<(ReplicaId, CtrlMsg)>, tx_sends: &mut flashmap::WriteHandle< ReplicaId, @@ -151,6 +153,12 @@ impl ServerReigner { >, tx_exit: mpsc::UnboundedSender, ) -> Result<(), SummersetError> { + // communicate with the manager's main thread to get assigned server ID + tx_id_assign.send(())?; + let (id, population) = rx_id_result.recv().await.ok_or( + SummersetError("failed to get server ID assignment".into()), + )?; + // first send server ID assignment if let Err(e) = stream.write_u8(id).await { return logged_err!("m"; "error assigning new server ID: {}", e); @@ -218,7 +226,8 @@ impl ServerReigner { /// Server acceptor thread function. async fn server_acceptor_thread( - population: u8, + tx_id_assign: mpsc::UnboundedSender<()>, + mut rx_id_result: mpsc::UnboundedReceiver<(ReplicaId, u8)>, tx_recv: mpsc::UnboundedSender<(ReplicaId, CtrlMsg)>, server_listener: TcpListener, mut tx_sends: flashmap::WriteHandle< @@ -235,9 +244,6 @@ impl ServerReigner { let local_addr = server_listener.local_addr().unwrap(); pf_info!("m"; "accepting servers on '{}'", local_addr); - // maintain a monotonically increasing server ID for new servers - let mut next_server_id: ReplicaId = 0; - // create an exit mpsc channel for getting notified about termination // of server controller threads let (tx_exit, mut rx_exit) = mpsc::unbounded_channel(); @@ -254,16 +260,14 @@ impl ServerReigner { if let Err(e) = Self::accept_new_server( stream, addr, - next_server_id, - population, + &tx_id_assign, + &mut rx_id_result, tx_recv.clone(), &mut tx_sends, &mut server_controller_handles, tx_exit.clone(), ).await { pf_error!("m"; "error accepting new server: {}", e); - } else { - next_server_id += 1; } }, @@ -485,10 +489,18 @@ mod reigner_tests { Ok::<(), SummersetError>(()) }); // manager - let mut reigner = - ServerReigner::new_and_setup("127.0.0.1:53600".parse()?, 2).await?; + let (tx_id_assign, mut rx_id_assign) = mpsc::unbounded_channel(); + let (tx_id_result, rx_id_result) = mpsc::unbounded_channel(); + let mut reigner = ServerReigner::new_and_setup( + "127.0.0.1:53600".parse()?, + tx_id_assign, + rx_id_result, + ) + .await?; setup_bar.wait().await; // recv message from server 0 + rx_id_assign.recv().await; + tx_id_result.send((0, 2))?; let (id, msg) = reigner.recv_ctrl().await?; assert_eq!(id, 0); assert_eq!( @@ -509,6 +521,8 @@ mod reigner_tests { id, )?; // recv message from server 1 + rx_id_assign.recv().await; + tx_id_result.send((1, 2))?; let (id, msg) = reigner.recv_ctrl().await?; assert_eq!(id, 1); assert_eq!( diff --git a/src/server/control.rs b/src/server/control.rs index 05627db3..ef5ff794 100644 --- a/src/server/control.rs +++ b/src/server/control.rs @@ -2,7 +2,9 @@ use std::net::SocketAddr; -use crate::utils::{SummersetError, safe_tcp_read, 
safe_tcp_write}; +use crate::utils::{ + SummersetError, safe_tcp_read, safe_tcp_write, tcp_connect_with_retry, +}; use crate::manager::CtrlMsg; use crate::server::ReplicaId; @@ -44,7 +46,7 @@ impl ControlHub { ) -> Result { // connect to the cluster manager and receive my assigned server ID pf_info!("s"; "connecting to manager '{}'...", manager); - let mut stream = TcpStream::connect(manager).await?; + let mut stream = tcp_connect_with_retry(manager, 10).await?; let id = stream.read_u8().await?; // first receive assigned server ID let population = stream.read_u8().await?; // then receive population pf_debug!(id; "assigned server ID: {} of {}", id, population); diff --git a/src/server/transport.rs b/src/server/transport.rs index 504e32a2..8f5f69cf 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -5,6 +5,7 @@ use std::net::SocketAddr; use crate::utils::{ SummersetError, Bitmap, safe_tcp_read, safe_tcp_write, tcp_bind_with_retry, + tcp_connect_with_retry, }; use crate::server::ReplicaId; @@ -253,7 +254,7 @@ where tx_exit: mpsc::UnboundedSender, ) -> Result<(), SummersetError> { pf_debug!(me; "connecting to peer {} '{}'...", id, addr); - let mut stream = TcpStream::connect(addr).await?; + let mut stream = tcp_connect_with_retry(addr, 10).await?; stream.write_u8(me).await?; // send my ID let mut peer_messenger_handles_guard = peer_messenger_handles.guard(); diff --git a/src/utils/error.rs b/src/utils/error.rs index 90e576c5..0e73dccb 100644 --- a/src/utils/error.rs +++ b/src/utils/error.rs @@ -37,13 +37,15 @@ impl_from_error!(toml::ser::Error); impl_from_error!(toml::de::Error); impl_from_error!(tokio::sync::SetError); impl_from_error!(tokio::sync::SetError); -impl_from_error!(tokio::sync::mpsc::error::TryRecvError); impl_from_error!( tokio::sync::watch::error::SendError> ); +impl_from_error!(tokio::sync::mpsc::error::TryRecvError); +impl_from_error!(tokio::sync::mpsc::error::SendError<()>); impl_from_error!( tokio::sync::mpsc::error::SendError<(ReplicaId, net::SocketAddr)> ); +impl_from_error!(tokio::sync::mpsc::error::SendError<(ReplicaId, u8)>); impl_from_error!(reed_solomon_erasure::Error); impl_from_error!(ctrlc::Error); diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 23a43006..31533217 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -15,5 +15,7 @@ mod rscoding; pub use error::SummersetError; pub use bitmap::Bitmap; pub use timer::Timer; -pub use safetcp::{safe_tcp_read, safe_tcp_write, tcp_bind_with_retry}; +pub use safetcp::{ + safe_tcp_read, safe_tcp_write, tcp_bind_with_retry, tcp_connect_with_retry, +}; pub use rscoding::RSCodeword; diff --git a/src/utils/safetcp.rs b/src/utils/safetcp.rs index 6a0df26a..2c337317 100644 --- a/src/utils/safetcp.rs +++ b/src/utils/safetcp.rs @@ -1,6 +1,4 @@ -//! Safe TCP read/write helpers that provides cancellation safety on the read -//! side and deadlock avoidance on the write side. Safe `TcpListener` binding -//! wrapper that provides a retrying logic. +//! Safe TCP bind/connect/read/write helper functions. use std::io::ErrorKind; use std::net::SocketAddr; @@ -162,4 +160,23 @@ pub async fn tcp_bind_with_retry( } } +/// Wrapper over tokio `TcpStream::connect()` that provides a retrying logic. 
+pub async fn tcp_connect_with_retry( + addr: SocketAddr, + mut retries: u8, +) -> Result { + loop { + match TcpStream::connect(addr).await { + Ok(stream) => return Ok(stream), + Err(e) => { + if retries == 0 { + return Err(e.into()); + } + retries -= 1; + time::sleep(Duration::from_secs(1)).await; + } + } + } +} + // No unit tests for these helpers... From 36d0c5bc7b57625fad4f621ef2445be54752f257 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 14:02:13 +0800 Subject: [PATCH 20/89] huge updates adding server leave and reset support --- Cargo.lock | 2 + src/client/endpoint.rs | 18 +- src/lib.rs | 2 +- src/manager/clusman.rs | 84 ++++++-- src/manager/reactor.rs | 15 +- src/manager/reigner.rs | 140 ++++++++++++- src/protocols/crossword.rs | 164 ++++++++++----- src/protocols/mod.rs | 22 +- src/protocols/multipaxos.rs | 164 ++++++++++----- src/protocols/rep_nothing.rs | 161 ++++++++++----- src/protocols/rs_paxos.rs | 164 ++++++++++----- src/protocols/simple_push.rs | 164 ++++++++++----- src/server/external.rs | 15 +- src/server/replica.rs | 10 +- src/server/transport.rs | 217 ++++++++++++++++---- summerset_client/Cargo.toml | 2 +- summerset_client/src/clients/tester.rs | 28 ++- summerset_client/src/drivers/closed_loop.rs | 17 +- summerset_client/src/drivers/open_loop.rs | 16 +- summerset_client/src/main.rs | 5 +- summerset_manager/Cargo.toml | 3 +- summerset_manager/src/main.rs | 56 +++-- summerset_server/Cargo.toml | 3 +- summerset_server/src/main.rs | 30 ++- 24 files changed, 1118 insertions(+), 384 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e9fc04f8..883efd71 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1151,6 +1151,7 @@ name = "summerset_manager" version = "0.1.0" dependencies = [ "clap", + "ctrlc", "env_logger", "log", "rand", @@ -1163,6 +1164,7 @@ name = "summerset_server" version = "0.1.0" dependencies = [ "clap", + "ctrlc", "env_logger", "log", "rand", diff --git a/src/client/endpoint.rs b/src/client/endpoint.rs index bf3cd10f..37387e2b 100644 --- a/src/client/endpoint.rs +++ b/src/client/endpoint.rs @@ -5,6 +5,7 @@ use std::net::SocketAddr; use crate::utils::SummersetError; use crate::server::{ApiRequest, ApiReply}; +use crate::client::ClientCtrlStub; use async_trait::async_trait; @@ -14,8 +15,9 @@ pub type ClientId = u64; /// Client trait to be implement by all protocol-specific client structs. #[async_trait] pub trait GenericEndpoint { - /// Creates a new client stub. - fn new( + /// Creates a new client stub and sets up required functionality modules + /// according to protocol-specific logic. + async fn new_and_setup( manager: SocketAddr, // remote address of manager oracle config_str: Option<&str>, ) -> Result @@ -23,9 +25,8 @@ pub trait GenericEndpoint { Self: Sized; /// Establishes connection to the service (or re-joins the service) - /// according to protocol-specific logic. Returns the assigned client ID - /// on success. - async fn connect(&mut self) -> Result; + /// according to protocol-specific logic. + async fn connect(&mut self) -> Result<(), SummersetError>; /// Leaves the service: forgets about the current TCP connections and send /// leave notifications according to protocol-specific logic. If `permanent` @@ -40,4 +41,11 @@ pub trait GenericEndpoint { /// Receives a reply from the service according to protocol-specific logic. async fn recv_reply(&mut self) -> Result; + + /// Gets my client ID. 
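Put together, the new `tcp_connect_with_retry` helper and the extended manager handshake (ID byte followed by a population byte) give the server-side greeting the shape below. This condenses `ControlHub::new_and_setup` from this series; the function name here is illustrative, and the retry interval is the helper's fixed one second per attempt:

    use std::net::SocketAddr;

    use tokio::io::AsyncReadExt;

    use crate::utils::{tcp_connect_with_retry, SummersetError};

    async fn greet_manager(
        manager: SocketAddr,
    ) -> Result<(u8, u8), SummersetError> {
        // up to 10 attempts, one second apart, to ride out races where the
        // manager is not yet listening on its server-facing address
        let mut stream = tcp_connect_with_retry(manager, 10).await?;
        let id = stream.read_u8().await?; // first the assigned server ID
        let population = stream.read_u8().await?; // then the population
        Ok((id, population))
    }
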
+ fn id(&self) -> ClientId; + + /// Gets a mutable reference to the control stub for sending control + /// requests and receiving control replies for testing purposes. + fn ctrl_stub(&mut self) -> &mut ClientCtrlStub; } diff --git a/src/lib.rs b/src/lib.rs index 24a24bb6..2de53e51 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -25,7 +25,7 @@ pub use crate::server::{ }; #[doc(inline)] -pub use crate::client::{ClientId, GenericEndpoint}; +pub use crate::client::{ClientId, GenericEndpoint, ClientCtrlStub}; #[doc(inline)] pub use crate::protocols::SmrProtocol; diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index 9a153aa3..de18afb5 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -11,7 +11,7 @@ use crate::server::ReplicaId; use crate::client::ClientId; use crate::protocols::SmrProtocol; -use tokio::sync::mpsc; +use tokio::sync::{mpsc, watch}; /// Information about an active server. // TODO: maybe add things like leader info, etc. @@ -107,15 +107,10 @@ impl ClusterManager { /// Main event loop logic of the cluster manager. Breaks out of the loop /// only upon catching termination signals to the process. - pub async fn run(&mut self) -> Result<(), SummersetError> { - // set up termination signals handler - let (tx_term, mut rx_term) = mpsc::unbounded_channel(); - ctrlc::set_handler(move || { - if let Err(e) = tx_term.send(true) { - pf_error!("m"; "error sending to term channel: {}", e); - } - })?; - + pub async fn run( + &mut self, + mut rx_term: watch::Receiver, + ) -> Result<(), SummersetError> { loop { tokio::select! { // receiving server ID assignment request @@ -132,7 +127,7 @@ impl ClusterManager { continue; } let (server, msg) = ctrl_msg.unwrap(); - if let Err(e) = self.handle_ctrl_msg(server, msg) { + if let Err(e) = self.handle_ctrl_msg(server, msg).await { pf_error!("m"; "error handling ctrl msg <- {}: {}", server, e); } @@ -145,14 +140,14 @@ impl ClusterManager { continue; } let (client, req) = ctrl_req.unwrap(); - if let Err(e) = self.handle_ctrl_req(client, req) { + if let Err(e) = self.handle_ctrl_req(client, req).await { pf_error!("m"; "error handling ctrl req <- {}: {}", client, e); } }, // receiving termination signal - _ = rx_term.recv() => { + _ = rx_term.changed() => { pf_warn!("m"; "manager caught termination signal"); break; } @@ -203,7 +198,7 @@ impl ClusterManager { } /// Synthesized handler of server-initiated control messages. - fn handle_ctrl_msg( + async fn handle_ctrl_msg( &mut self, server: ReplicaId, msg: CtrlMsg, @@ -249,8 +244,62 @@ impl ClusterManager { .send_reply(CtrlReply::QueryInfo { servers }, client) } + /// Handler of client ResetServer request. 
+ async fn handle_client_reset_server( + &mut self, + client: ClientId, + server: Option, + durable: bool, + ) -> Result<(), SummersetError> { + let num_replicas = self.server_info.len(); + let mut servers: Vec = if server.is_none() { + // all active servers + self.server_info.keys().copied().collect() + } else { + vec![server.unwrap()] + }; + + // reset specified server(s) + let mut reset_done = HashSet::new(); + while let Some(s) = servers.pop() { + // send reset server control message to server + self.server_reigner + .send_ctrl(CtrlMsg::ResetState { durable }, s)?; + + // remove information about this server + assert!(self.assigned_ids.contains(&s)); + assert!(self.server_info.contains_key(&s)); + self.assigned_ids.remove(&s); + self.server_info.remove(&s); + + // wait for the new server ID assignment request from it + self.rx_id_assign.recv().await; + if let Err(e) = self.assign_server_id() { + return logged_err!("m"; "error assigning new server ID: {}", e); + } + + reset_done.insert(s); + } + + // now the reset servers should be sending NewServerJoin messages to + // me. Process them until all servers joined + while self.server_info.len() < num_replicas { + let (s, msg) = self.server_reigner.recv_ctrl().await?; + if let Err(e) = self.handle_ctrl_msg(s, msg).await { + pf_error!("m"; "error handling ctrl msg <- {}: {}", s, e); + } + } + + self.client_reactor.send_reply( + CtrlReply::ResetServer { + servers: reset_done, + }, + client, + ) + } + /// Synthesized handler of client-initiated control requests. - fn handle_ctrl_req( + async fn handle_ctrl_req( &mut self, client: ClientId, req: CtrlRequest, @@ -261,6 +310,11 @@ impl ClusterManager { self.handle_client_query_info(client)?; } + CtrlRequest::ResetServer { server, durable } => { + self.handle_client_reset_server(client, server, durable) + .await?; + } + _ => {} // ignore all other types } diff --git a/src/manager/reactor.rs b/src/manager/reactor.rs index 3aba3ea2..41a0582d 100644 --- a/src/manager/reactor.rs +++ b/src/manager/reactor.rs @@ -441,9 +441,11 @@ mod reactor_tests { ClientReactor::new_and_setup("127.0.0.1:53601".parse()?) .await?; barrier2.wait().await; + // recv request from client let (client, req) = reactor.recv_req().await?; assert!(reactor.has_client(client)); assert_eq!(req, CtrlRequest::QueryInfo); + // send reply to client reactor.send_reply( CtrlReply::QueryInfo { servers: HashMap::::from([ @@ -459,7 +461,9 @@ mod reactor_tests { barrier.wait().await; let mut ctrl_stub = ClientCtrlStub::new_by_connect("127.0.0.1:53601".parse()?).await?; + // send request to manager ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + // recv reply from manager assert_eq!( ctrl_stub.recv_reply().await?, CtrlReply::QueryInfo { @@ -482,7 +486,9 @@ mod reactor_tests { let mut ctrl_stub = ClientCtrlStub::new_by_connect("127.0.0.1:54601".parse()?) .await?; + // send request to manager ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + // recv reply from manager assert_eq!( ctrl_stub.recv_reply().await?, CtrlReply::QueryInfo { @@ -492,14 +498,17 @@ mod reactor_tests { ]), } ); + // leave and come back as new client ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; assert_eq!(ctrl_stub.recv_reply().await?, CtrlReply::Leave); ctrl_stub.forget(); - time::sleep(Duration::from_millis(10)).await; + time::sleep(Duration::from_millis(100)).await; let mut ctrl_stub = ClientCtrlStub::new_by_connect("127.0.0.1:54601".parse()?) 
.await?; + // send request to manager ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + // recv reply from manager assert_eq!( ctrl_stub.recv_reply().await?, CtrlReply::QueryInfo { @@ -515,9 +524,11 @@ mod reactor_tests { let mut reactor = ClientReactor::new_and_setup("127.0.0.1:54601".parse()?).await?; barrier.wait().await; + // recv request from client let (client, req) = reactor.recv_req().await?; assert!(reactor.has_client(client)); assert_eq!(req, CtrlRequest::QueryInfo); + // send reply to client reactor.send_reply( CtrlReply::QueryInfo { servers: HashMap::::from([ @@ -527,10 +538,12 @@ mod reactor_tests { }, client, )?; + // recv request from new client let (client2, req2) = reactor.recv_req().await?; assert!(reactor.has_client(client2)); assert!(!reactor.has_client(client)); assert_eq!(req2, CtrlRequest::QueryInfo); + // send reply to new client reactor.send_reply( CtrlReply::QueryInfo { servers: HashMap::::from([ diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index 05436ac0..02e4e4c3 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -21,7 +21,7 @@ use tokio::task::JoinHandle; /// Control message from/to servers. Control traffic could be bidirectional: /// some initiated by the manager and some by servers. -// TODO: add pause, resume, server leave, leader change, etc. +// TODO: add pause, resume, leader change, membership change, etc. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] pub enum CtrlMsg { /// Server -> Manager: new server up, requesting a list of peers' addresses @@ -42,6 +42,12 @@ pub enum CtrlMsg { /// Manager -> Server: reset to initial state. If durable is false, cleans /// durable storage state as well. ResetState { durable: bool }, + + /// Server -> Manager: leave notification. + Leave, + + /// Manager -> Server: dummy leave reply. + LeaveReply, } /// The server-facing controller API module. @@ -98,6 +104,13 @@ impl ServerReigner { }) } + /// Returns whether a server ID is connected to me. + #[allow(dead_code)] + pub fn has_server(&self, server: ReplicaId) -> bool { + let tx_sends_guard = self.tx_sends.guard(); + tx_sends_guard.contains_key(&server) + } + /// Waits for the next control event message from some server. 
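On the client side, the new `ResetServer` request travels over the same `ClientCtrlStub` send/receive pattern as `QueryInfo`. A sketch of how a test driver might reset replica 0 and wipe its durable state (the function name is illustrative; everything else follows the request/reply types defined above):

    use std::collections::HashSet;

    use crate::client::ClientCtrlStub;
    use crate::manager::{CtrlRequest, CtrlReply};
    use crate::server::ReplicaId;
    use crate::utils::SummersetError;

    async fn reset_replica_zero(
        ctrl_stub: &mut ClientCtrlStub,
    ) -> Result<HashSet<ReplicaId>, SummersetError> {
        let req = CtrlRequest::ResetServer {
            server: Some(0),
            durable: false, // false: also truncate the server's backer file
        };

        // keep retrying send_req(None) until the request is fully flushed,
        // mirroring the QueryInfo round trips elsewhere in this series
        let mut sent = ctrl_stub.send_req(Some(&req))?;
        while !sent {
            sent = ctrl_stub.send_req(None)?;
        }

        match ctrl_stub.recv_reply().await? {
            CtrlReply::ResetServer { servers } => Ok(servers),
            _ => Err(SummersetError("unexpected reply type received".into())),
        }
    }
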
pub async fn recv_ctrl( &mut self, @@ -359,6 +372,22 @@ impl ServerReigner { // receives control message from server msg = Self::read_ctrl(&mut read_buf, &mut conn_read) => { match msg { + Ok(CtrlMsg::Leave) => { + // server leaving, send dummy reply and break + let msg = CtrlMsg::LeaveReply; + if let Err(e) = Self::write_ctrl( + &mut write_buf, + &mut write_buf_cursor, + &conn_write, + Some(&msg) + ) { + pf_error!("m"; "error replying -> {}: {}", id, e); + } else { // skips `WouldBlock` failure check here + pf_info!("m"; "server {} has left", id); + } + break; + }, + Ok(CtrlMsg::NewServerJoin { id, protocol, @@ -380,7 +409,7 @@ impl ServerReigner { if let Err(e) = tx_recv.send((id, msg)) { pf_error!("m"; "error sending to tx_recv for {}: {}", id, e); } - } + }, Ok(msg) => { // pf_trace!("m"; "recv <- {} ctrl {:?}", id, msg); @@ -432,6 +461,7 @@ mod reigner_tests { use std::sync::Arc; use crate::server::ControlHub; use tokio::sync::Barrier; + use tokio::time::{self, Duration}; #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn api_send_recv() -> Result<(), SummersetError> { @@ -544,4 +574,110 @@ mod reigner_tests { )?; Ok(()) } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn api_server_leave() -> Result<(), SummersetError> { + let barrier = Arc::new(Barrier::new(2)); + let barrier2 = barrier.clone(); + tokio::spawn(async move { + // replica 0 + barrier2.wait().await; + let mut hub = + ControlHub::new_and_setup("127.0.0.1:54600".parse()?).await?; + assert_eq!(hub.me, 0); + // send a message to manager + hub.send_ctrl(CtrlMsg::NewServerJoin { + id: hub.me, + protocol: SmrProtocol::SimplePush, + api_addr: "127.0.0.1:54700".parse()?, + p2p_addr: "127.0.0.1:54800".parse()?, + })?; + // recv a message from manager + assert_eq!( + hub.recv_ctrl().await?, + CtrlMsg::ConnectToPeers { + population: 1, + to_peers: HashMap::new(), + } + ); + // leave and re-join as 0 + hub.send_ctrl(CtrlMsg::Leave)?; + assert_eq!(hub.recv_ctrl().await?, CtrlMsg::LeaveReply); + time::sleep(Duration::from_millis(100)).await; + let mut hub = + ControlHub::new_and_setup("127.0.0.1:54600".parse()?).await?; + assert_eq!(hub.me, 0); + // send a message to manager + hub.send_ctrl(CtrlMsg::NewServerJoin { + id: hub.me, + protocol: SmrProtocol::SimplePush, + api_addr: "127.0.0.1:54700".parse()?, + p2p_addr: "127.0.0.1:54800".parse()?, + })?; + // recv a message from manager + assert_eq!( + hub.recv_ctrl().await?, + CtrlMsg::ConnectToPeers { + population: 1, + to_peers: HashMap::new(), + } + ); + Ok::<(), SummersetError>(()) + }); + // manager + let (tx_id_assign, mut rx_id_assign) = mpsc::unbounded_channel(); + let (tx_id_result, rx_id_result) = mpsc::unbounded_channel(); + let mut reigner = ServerReigner::new_and_setup( + "127.0.0.1:54600".parse()?, + tx_id_assign, + rx_id_result, + ) + .await?; + barrier.wait().await; + // recv message from server 0 + rx_id_assign.recv().await; + tx_id_result.send((0, 1))?; + let (id, msg) = reigner.recv_ctrl().await?; + assert_eq!(id, 0); + assert_eq!( + msg, + CtrlMsg::NewServerJoin { + id: 0, + protocol: SmrProtocol::SimplePush, + api_addr: "127.0.0.1:54700".parse()?, + p2p_addr: "127.0.0.1:54800".parse()? 
+ } + ); + // send reply to server 0 + reigner.send_ctrl( + CtrlMsg::ConnectToPeers { + population: 1, + to_peers: HashMap::new(), + }, + id, + )?; + rx_id_assign.recv().await; + tx_id_result.send((0, 1))?; + // recv message from server 0 + let (id, msg) = reigner.recv_ctrl().await?; + assert_eq!(id, 0); + assert_eq!( + msg, + CtrlMsg::NewServerJoin { + id: 0, + protocol: SmrProtocol::SimplePush, + api_addr: "127.0.0.1:54700".parse()?, + p2p_addr: "127.0.0.1:54800".parse()? + } + ); + // send reply to server 0 + reigner.send_ctrl( + CtrlMsg::ConnectToPeers { + population: 1, + to_peers: HashMap::new(), + }, + id, + )?; + Ok(()) + } } diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index dc74afe8..493213d4 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -22,7 +22,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; -use tokio::sync::mpsc; +use tokio::sync::watch; use reed_solomon_erasure::galois_8::ReedSolomon; @@ -1101,11 +1101,61 @@ impl CrosswordReplica { Ok(()) } - /// Synthesized handler of manager control messages. - fn handle_ctrl_msg(&mut self, _msg: CtrlMsg) -> Result<(), SummersetError> { - // TODO: fill this when more control message types added + /// Handler of ResetState control message. + async fn handle_ctrl_reset_state( + &mut self, + durable: bool, + ) -> Result<(), SummersetError> { + // send leave notification to peers and wait for their replies + self.transport_hub.leave().await?; + + // send leave notification to manager and wait for its reply + self.control_hub.send_ctrl(CtrlMsg::Leave)?; + while self.control_hub.recv_ctrl().await? != CtrlMsg::LeaveReply {} + + // if `durable` is false, truncate backer file + if !durable { + // use 0 as a special log action ID here + self.storage_hub + .submit_action(0, LogAction::Truncate { offset: 0 })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id == 0 { + if log_result + != (LogResult::Truncate { + offset_ok: true, + now_size: 0, + }) + { + return logged_err!(self.id; "failed to truncate log to 0"); + } else { + return Ok(()); + } + } + } + } + Ok(()) } + + /// Synthesized handler of manager control messages. If ok, returns + /// `Some(true)` if decides to terminate and reboot, `Some(false)` if + /// decides to shutdown completely, and `None` if not terminating. 
+ async fn handle_ctrl_msg( + &mut self, + msg: CtrlMsg, + ) -> Result, SummersetError> { + // TODO: fill this when more control message types added + match msg { + CtrlMsg::ResetState { durable } => { + self.handle_ctrl_reset_state(durable).await?; + Ok(Some(true)) + } + + _ => Ok(None), // ignore all other types + } + } } #[async_trait] @@ -1221,15 +1271,10 @@ impl GenericReplica for CrosswordReplica { }) } - async fn run(&mut self) -> Result { - // set up termination signals handler - let (tx_term, mut rx_term) = mpsc::unbounded_channel(); - ctrlc::set_handler(move || { - if let Err(e) = tx_term.send(true) { - pf_error!("s"; "error sending to term channel: {}", e); - } - })?; - + async fn run( + &mut self, + mut rx_term: watch::Receiver, + ) -> Result { // TODO: proper leader election if self.id == 0 { self.is_leader = true; @@ -1293,19 +1338,34 @@ impl GenericReplica for CrosswordReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { - pf_error!(self.id; "error handling ctrl msg: {}", e); + match self.handle_ctrl_msg(ctrl_msg).await { + Ok(terminate) => { + if let Some(restart) = terminate { + pf_warn!( + self.id; + "server got {} req", + if restart { "restart" } else { "shutdown" }); + return Ok(restart); + } + }, + Err(e) => { + pf_error!(self.id; "error handling ctrl msg: {}", e); + } } }, // receiving termination signal - _ = rx_term.recv() => { + _ = rx_term.changed() => { pf_warn!(self.id; "server caught termination signal"); return Ok(false); } } } } + + fn id(&self) -> ReplicaId { + self.id + } } /// Configuration parameters struct. @@ -1327,9 +1387,6 @@ pub struct CrosswordClient { /// Client ID. id: ClientId, - /// Address of the cluster manager oracle. - manager: SocketAddr, - /// Configuration parameters struct. _config: ClientConfigCrossword, @@ -1340,7 +1397,7 @@ pub struct CrosswordClient { server_id: ReplicaId, /// Control API stub to the cluster manager. - ctrl_stub: Option, + ctrl_stub: ClientCtrlStub, /// API stubs for communicating with servers. 
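With this patch the replica `run()` no longer installs its own signal handler; it takes a `watch::Receiver` (presumably `watch::Receiver<bool>`, matching the `tx_term.send(true)` on the sending side) and exits when the value changes. The binaries' side of that wiring is not part of this excerpt; presumably it looks roughly like the sketch below, with the handler installed once per process since `ctrlc::set_handler` cannot be registered twice, and `eprintln!` standing in for the logging macros:

    use tokio::sync::watch;

    use crate::server::GenericReplica;
    use crate::utils::SummersetError;

    async fn drive_replica(
        replica: &mut dyn GenericReplica,
    ) -> Result<bool, SummersetError> {
        // the ctrlc handler only publishes a new value on the watch channel;
        // run() observes it through rx_term.changed() in its select loop
        let (tx_term, rx_term) = watch::channel(false);
        ctrlc::set_handler(move || {
            if let Err(e) = tx_term.send(true) {
                eprintln!("error sending to term channel: {}", e);
            }
        })?;

        // Ok(true) asks the caller to restart the replica, Ok(false) to stop
        replica.run(rx_term).await
    }
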
api_stub: Option, @@ -1348,47 +1405,43 @@ pub struct CrosswordClient { #[async_trait] impl GenericEndpoint for CrosswordClient { - fn new( + async fn new_and_setup( manager: SocketAddr, config_str: Option<&str>, ) -> Result { + // connect to the cluster manager and get assigned a client ID + let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; + let id = ctrl_stub.id; + + // parse protocol-specific configs let config = parsed_config!(config_str => ClientConfigCrossword; init_server_id)?; let init_server_id = config.init_server_id; Ok(CrosswordClient { - id: 255, // nil at this time - manager, + id, _config: config, servers: HashMap::new(), server_id: init_server_id, - ctrl_stub: None, + ctrl_stub, api_stub: None, }) } - async fn connect(&mut self) -> Result { + async fn connect(&mut self) -> Result<(), SummersetError> { // disallow reconnection without leaving if self.api_stub.is_some() { return logged_err!(self.id; "reconnecting without leaving"); } - // if ctrl_stubs not established yet, connect to the manager - if self.ctrl_stub.is_none() { - let ctrl_stub = - ClientCtrlStub::new_by_connect(self.manager).await?; - self.id = ctrl_stub.id; - self.ctrl_stub = Some(ctrl_stub); - } - let ctrl_stub = self.ctrl_stub.as_mut().unwrap(); - // ask the manager about the list of active servers - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; while !sent { - sent = ctrl_stub.send_req(None)?; + sent = self.ctrl_stub.send_req(None)?; } - let reply = ctrl_stub.recv_reply().await?; + let reply = self.ctrl_stub.recv_reply().await?; match reply { CtrlReply::QueryInfo { servers } => { // connect to the one with server ID in config @@ -1399,7 +1452,7 @@ impl GenericEndpoint for CrosswordClient { .await?; self.api_stub = Some(api_stub); self.servers = servers; - Ok(self.id) + Ok(()) } _ => logged_err!(self.id; "unexpected reply type received"), } @@ -1427,26 +1480,19 @@ impl GenericEndpoint for CrosswordClient { // if permanently leaving, send leave notification to the manager if permanent { - // disallow multiple permanent leaving - if self.ctrl_stub.is_none() { - return logged_err!(self.id; "repeated permanent leaving"); + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; + while !sent { + sent = self.ctrl_stub.send_req(None)?; } - if let Some(mut ctrl_stub) = self.ctrl_stub.take() { - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; - while !sent { - sent = ctrl_stub.send_req(None)?; + let reply = self.ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::Leave => { + pf_info!(self.id; "left current manager connection"); } - - let reply = ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - ctrl_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } + _ => { + return logged_err!(self.id; "unexpected reply type received"); } } } @@ -1492,4 +1538,12 @@ impl GenericEndpoint for CrosswordClient { None => logged_err!(self.id; "client is not set up"), } } + + fn id(&self) -> ClientId { + self.id + } + + fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + &mut self.ctrl_stub + } } diff --git a/src/protocols/mod.rs b/src/protocols/mod.rs index 98ecf371..3aae79bf 100644 --- a/src/protocols/mod.rs +++ b/src/protocols/mod.rs @@ -126,26 +126,36 @@ impl SmrProtocol { } /// Create a client endpoint instance of this protocol on heap. 
- pub fn new_client_endpoint( + pub async fn new_client_endpoint( &self, manager: SocketAddr, config_str: Option<&str>, ) -> Result, SummersetError> { match self { Self::RepNothing => { - box_if_ok!(RepNothingClient::new(manager, config_str)) + box_if_ok!( + RepNothingClient::new_and_setup(manager, config_str).await + ) } Self::SimplePush => { - box_if_ok!(SimplePushClient::new(manager, config_str)) + box_if_ok!( + SimplePushClient::new_and_setup(manager, config_str).await + ) } Self::MultiPaxos => { - box_if_ok!(MultiPaxosClient::new(manager, config_str)) + box_if_ok!( + MultiPaxosClient::new_and_setup(manager, config_str).await + ) } Self::RSPaxos => { - box_if_ok!(RSPaxosClient::new(manager, config_str)) + box_if_ok!( + RSPaxosClient::new_and_setup(manager, config_str).await + ) } Self::Crossword => { - box_if_ok!(CrosswordClient::new(manager, config_str)) + box_if_ok!( + CrosswordClient::new_and_setup(manager, config_str).await + ) } } } diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 4e50c9a3..2431ff86 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -26,7 +26,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; -use tokio::sync::mpsc; +use tokio::sync::watch; /// Configuration parameters struct. #[derive(Debug, Deserialize)] @@ -916,11 +916,61 @@ impl MultiPaxosReplica { Ok(()) } - /// Synthesized handler of manager control messages. - fn handle_ctrl_msg(&mut self, _msg: CtrlMsg) -> Result<(), SummersetError> { - // TODO: fill this when more control message types added + /// Handler of ResetState control message. + async fn handle_ctrl_reset_state( + &mut self, + durable: bool, + ) -> Result<(), SummersetError> { + // send leave notification to peers and wait for their replies + self.transport_hub.leave().await?; + + // send leave notification to manager and wait for its reply + self.control_hub.send_ctrl(CtrlMsg::Leave)?; + while self.control_hub.recv_ctrl().await? != CtrlMsg::LeaveReply {} + + // if `durable` is false, truncate backer file + if !durable { + // use 0 as a special log action ID here + self.storage_hub + .submit_action(0, LogAction::Truncate { offset: 0 })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id == 0 { + if log_result + != (LogResult::Truncate { + offset_ok: true, + now_size: 0, + }) + { + return logged_err!(self.id; "failed to truncate log to 0"); + } else { + return Ok(()); + } + } + } + } + Ok(()) } + + /// Synthesized handler of manager control messages. If ok, returns + /// `Some(true)` if decides to terminate and reboot, `Some(false)` if + /// decides to shutdown completely, and `None` if not terminating. 
+ async fn handle_ctrl_msg( + &mut self, + msg: CtrlMsg, + ) -> Result, SummersetError> { + // TODO: fill this when more control message types added + match msg { + CtrlMsg::ResetState { durable } => { + self.handle_ctrl_reset_state(durable).await?; + Ok(Some(true)) + } + + _ => Ok(None), // ignore all other types + } + } } #[async_trait] @@ -1016,15 +1066,10 @@ impl GenericReplica for MultiPaxosReplica { }) } - async fn run(&mut self) -> Result { - // set up termination signals handler - let (tx_term, mut rx_term) = mpsc::unbounded_channel(); - ctrlc::set_handler(move || { - if let Err(e) = tx_term.send(true) { - pf_error!("s"; "error sending to term channel: {}", e); - } - })?; - + async fn run( + &mut self, + mut rx_term: watch::Receiver, + ) -> Result { // TODO: proper leader election if self.id == 0 { self.is_leader = true; @@ -1088,19 +1133,34 @@ impl GenericReplica for MultiPaxosReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { - pf_error!(self.id; "error handling ctrl msg: {}", e); + match self.handle_ctrl_msg(ctrl_msg).await { + Ok(terminate) => { + if let Some(restart) = terminate { + pf_warn!( + self.id; + "server got {} req", + if restart { "restart" } else { "shutdown" }); + return Ok(restart); + } + }, + Err(e) => { + pf_error!(self.id; "error handling ctrl msg: {}", e); + } } }, // receiving termination signal - _ = rx_term.recv() => { + _ = rx_term.changed() => { pf_warn!(self.id; "server caught termination signal"); return Ok(false); } } } } + + fn id(&self) -> ReplicaId { + self.id + } } /// Configuration parameters struct. @@ -1122,9 +1182,6 @@ pub struct MultiPaxosClient { /// Client ID. id: ClientId, - /// Address of the cluster manager oracle. - manager: SocketAddr, - /// Configuration parameters struct. _config: ClientConfigMultiPaxos, @@ -1135,7 +1192,7 @@ pub struct MultiPaxosClient { server_id: ReplicaId, /// Control API stub to the cluster manager. - ctrl_stub: Option, + ctrl_stub: ClientCtrlStub, /// API stubs for communicating with servers. 
api_stub: Option, @@ -1143,47 +1200,43 @@ pub struct MultiPaxosClient { #[async_trait] impl GenericEndpoint for MultiPaxosClient { - fn new( + async fn new_and_setup( manager: SocketAddr, config_str: Option<&str>, ) -> Result { + // connect to the cluster manager and get assigned a client ID + let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; + let id = ctrl_stub.id; + + // parse protocol-specific configs let config = parsed_config!(config_str => ClientConfigMultiPaxos; init_server_id)?; let init_server_id = config.init_server_id; Ok(MultiPaxosClient { - id: 255, // nil at this time - manager, + id, _config: config, servers: HashMap::new(), server_id: init_server_id, - ctrl_stub: None, + ctrl_stub, api_stub: None, }) } - async fn connect(&mut self) -> Result { + async fn connect(&mut self) -> Result<(), SummersetError> { // disallow reconnection without leaving if self.api_stub.is_some() { return logged_err!(self.id; "reconnecting without leaving"); } - // if ctrl_stubs not established yet, connect to the manager - if self.ctrl_stub.is_none() { - let ctrl_stub = - ClientCtrlStub::new_by_connect(self.manager).await?; - self.id = ctrl_stub.id; - self.ctrl_stub = Some(ctrl_stub); - } - let ctrl_stub = self.ctrl_stub.as_mut().unwrap(); - // ask the manager about the list of active servers - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; while !sent { - sent = ctrl_stub.send_req(None)?; + sent = self.ctrl_stub.send_req(None)?; } - let reply = ctrl_stub.recv_reply().await?; + let reply = self.ctrl_stub.recv_reply().await?; match reply { CtrlReply::QueryInfo { servers } => { // connect to the one with server ID in config @@ -1194,7 +1247,7 @@ impl GenericEndpoint for MultiPaxosClient { .await?; self.api_stub = Some(api_stub); self.servers = servers; - Ok(self.id) + Ok(()) } _ => logged_err!(self.id; "unexpected reply type received"), } @@ -1222,26 +1275,19 @@ impl GenericEndpoint for MultiPaxosClient { // if permanently leaving, send leave notification to the manager if permanent { - // disallow multiple permanent leaving - if self.ctrl_stub.is_none() { - return logged_err!(self.id; "repeated permanent leaving"); + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; + while !sent { + sent = self.ctrl_stub.send_req(None)?; } - if let Some(mut ctrl_stub) = self.ctrl_stub.take() { - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; - while !sent { - sent = ctrl_stub.send_req(None)?; + let reply = self.ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::Leave => { + pf_info!(self.id; "left current manager connection"); } - - let reply = ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - ctrl_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } + _ => { + return logged_err!(self.id; "unexpected reply type received"); } } } @@ -1287,4 +1333,12 @@ impl GenericEndpoint for MultiPaxosClient { None => logged_err!(self.id; "client is not set up"), } } + + fn id(&self) -> ClientId { + self.id + } + + fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + &mut self.ctrl_stub + } } diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index bbfb79c6..ffbc57e1 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -21,7 +21,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use 
tokio::time::Duration; -use tokio::sync::mpsc; +use tokio::sync::watch; /// Configuration parameters struct. #[derive(Debug, Deserialize)] @@ -227,11 +227,58 @@ impl RepNothingReplica { Ok(()) } - /// Synthesized handler of manager control messages. - fn handle_ctrl_msg(&mut self, _msg: CtrlMsg) -> Result<(), SummersetError> { - // TODO: fill this when more control message types added + /// Handler of ResetState control message. + async fn handle_ctrl_reset_state( + &mut self, + durable: bool, + ) -> Result<(), SummersetError> { + // send leave notification to manager and wait for its reply + self.control_hub.send_ctrl(CtrlMsg::Leave)?; + while self.control_hub.recv_ctrl().await? != CtrlMsg::LeaveReply {} + + // if `durable` is false, truncate backer file + if !durable { + // use 0 as a special log action ID here + self.storage_hub + .submit_action(0, LogAction::Truncate { offset: 0 })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id == 0 { + if log_result + != (LogResult::Truncate { + offset_ok: true, + now_size: 0, + }) + { + return logged_err!(self.id; "failed to truncate log to 0"); + } else { + return Ok(()); + } + } + } + } + Ok(()) } + + /// Synthesized handler of manager control messages. If ok, returns + /// `Some(true)` if decides to terminate and reboot, `Some(false)` if + /// decides to shutdown completely, and `None` if not terminating. + async fn handle_ctrl_msg( + &mut self, + msg: CtrlMsg, + ) -> Result, SummersetError> { + // TODO: fill this when more control message types added + match msg { + CtrlMsg::ResetState { durable } => { + self.handle_ctrl_reset_state(durable).await?; + Ok(Some(true)) + } + + _ => Ok(None), // ignore all other types + } + } } #[async_trait] @@ -300,15 +347,10 @@ impl GenericReplica for RepNothingReplica { }) } - async fn run(&mut self) -> Result { - // set up termination signals handler - let (tx_term, mut rx_term) = mpsc::unbounded_channel(); - ctrlc::set_handler(move || { - if let Err(e) = tx_term.send(true) { - pf_error!("s"; "error sending to term channel: {}", e); - } - })?; - + async fn run( + &mut self, + mut rx_term: watch::Receiver, + ) -> Result { loop { tokio::select! { // client request batch @@ -354,19 +396,34 @@ impl GenericReplica for RepNothingReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { - pf_error!(self.id; "error handling ctrl msg: {}", e); + match self.handle_ctrl_msg(ctrl_msg).await { + Ok(terminate) => { + if let Some(restart) = terminate { + pf_warn!( + self.id; + "server got {} req", + if restart { "restart" } else { "shutdown" }); + return Ok(restart); + } + }, + Err(e) => { + pf_error!(self.id; "error handling ctrl msg: {}", e); + } } }, // receiving termination signal - _ = rx_term.recv() => { + _ = rx_term.changed() => { pf_warn!(self.id; "server caught termination signal"); return Ok(false); } } } } + + fn id(&self) -> ReplicaId { + self.id + } } /// Configuration parameters struct. @@ -388,14 +445,11 @@ pub struct RepNothingClient { /// Client ID. id: ClientId, - /// Address of the cluster manager oracle. - manager: SocketAddr, - /// Configuration parameters struct. config: ClientConfigRepNothing, /// Control API stub to the cluster manager. - ctrl_stub: Option, + ctrl_stub: ClientCtrlStub, /// API stubs for communicating with servers. 
api_stub: Option, @@ -403,44 +457,40 @@ pub struct RepNothingClient { #[async_trait] impl GenericEndpoint for RepNothingClient { - fn new( + async fn new_and_setup( manager: SocketAddr, config_str: Option<&str>, ) -> Result { + // connect to the cluster manager and get assigned a client ID + let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; + let id = ctrl_stub.id; + + // parse protocol-specific configs let config = parsed_config!(config_str => ClientConfigRepNothing; server_id)?; Ok(RepNothingClient { - id: 255, // nil at this time - manager, + id, config, - ctrl_stub: None, + ctrl_stub, api_stub: None, }) } - async fn connect(&mut self) -> Result { + async fn connect(&mut self) -> Result<(), SummersetError> { // disallow reconnection without leaving if self.api_stub.is_some() { return logged_err!(self.id; "reconnecting without leaving"); } - // if ctrl_stubs not established yet, connect to the manager - if self.ctrl_stub.is_none() { - let ctrl_stub = - ClientCtrlStub::new_by_connect(self.manager).await?; - self.id = ctrl_stub.id; - self.ctrl_stub = Some(ctrl_stub); - } - let ctrl_stub = self.ctrl_stub.as_mut().unwrap(); - // ask the manager about the list of active servers - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; while !sent { - sent = ctrl_stub.send_req(None)?; + sent = self.ctrl_stub.send_req(None)?; } - let reply = ctrl_stub.recv_reply().await?; + let reply = self.ctrl_stub.recv_reply().await?; match reply { CtrlReply::QueryInfo { servers } => { // connect to the one with server ID in config @@ -450,7 +500,7 @@ impl GenericEndpoint for RepNothingClient { ) .await?; self.api_stub = Some(api_stub); - Ok(self.id) + Ok(()) } _ => logged_err!(self.id; "unexpected reply type received"), } @@ -478,26 +528,19 @@ impl GenericEndpoint for RepNothingClient { // if permanently leaving, send leave notification to the manager if permanent { - // disallow multiple permanent leaving - if self.ctrl_stub.is_none() { - return logged_err!(self.id; "repeated permanent leaving"); + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; + while !sent { + sent = self.ctrl_stub.send_req(None)?; } - if let Some(mut ctrl_stub) = self.ctrl_stub.take() { - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; - while !sent { - sent = ctrl_stub.send_req(None)?; + let reply = self.ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::Leave => { + pf_info!(self.id; "left current manager connection"); } - - let reply = ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - ctrl_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } + _ => { + return logged_err!(self.id; "unexpected reply type received"); } } } @@ -521,4 +564,12 @@ impl GenericEndpoint for RepNothingClient { None => logged_err!(self.id; "client is not set up"), } } + + fn id(&self) -> ClientId { + self.id + } + + fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + &mut self.ctrl_stub + } } diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 1c72b41a..ada30d45 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -22,7 +22,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; -use tokio::sync::mpsc; +use tokio::sync::watch; use reed_solomon_erasure::galois_8::ReedSolomon; @@ -1001,11 +1001,61 @@ impl 
RSPaxosReplica { Ok(()) } - /// Synthesized handler of manager control messages. - fn handle_ctrl_msg(&mut self, _msg: CtrlMsg) -> Result<(), SummersetError> { - // TODO: fill this when more control message types added + /// Handler of ResetState control message. + async fn handle_ctrl_reset_state( + &mut self, + durable: bool, + ) -> Result<(), SummersetError> { + // send leave notification to peers and wait for their replies + self.transport_hub.leave().await?; + + // send leave notification to manager and wait for its reply + self.control_hub.send_ctrl(CtrlMsg::Leave)?; + while self.control_hub.recv_ctrl().await? != CtrlMsg::LeaveReply {} + + // if `durable` is false, truncate backer file + if !durable { + // use 0 as a special log action ID here + self.storage_hub + .submit_action(0, LogAction::Truncate { offset: 0 })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id == 0 { + if log_result + != (LogResult::Truncate { + offset_ok: true, + now_size: 0, + }) + { + return logged_err!(self.id; "failed to truncate log to 0"); + } else { + return Ok(()); + } + } + } + } + Ok(()) } + + /// Synthesized handler of manager control messages. If ok, returns + /// `Some(true)` if decides to terminate and reboot, `Some(false)` if + /// decides to shutdown completely, and `None` if not terminating. + async fn handle_ctrl_msg( + &mut self, + msg: CtrlMsg, + ) -> Result, SummersetError> { + // TODO: fill this when more control message types added + match msg { + CtrlMsg::ResetState { durable } => { + self.handle_ctrl_reset_state(durable).await?; + Ok(Some(true)) + } + + _ => Ok(None), // ignore all other types + } + } } #[async_trait] @@ -1114,15 +1164,10 @@ impl GenericReplica for RSPaxosReplica { }) } - async fn run(&mut self) -> Result { - // set up termination signals handler - let (tx_term, mut rx_term) = mpsc::unbounded_channel(); - ctrlc::set_handler(move || { - if let Err(e) = tx_term.send(true) { - pf_error!("s"; "error sending to term channel: {}", e); - } - })?; - + async fn run( + &mut self, + mut rx_term: watch::Receiver, + ) -> Result { // TODO: proper leader election if self.id == 0 { self.is_leader = true; @@ -1186,19 +1231,34 @@ impl GenericReplica for RSPaxosReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { - pf_error!(self.id; "error handling ctrl msg: {}", e); + match self.handle_ctrl_msg(ctrl_msg).await { + Ok(terminate) => { + if let Some(restart) = terminate { + pf_warn!( + self.id; + "server got {} req", + if restart { "restart" } else { "shutdown" }); + return Ok(restart); + } + }, + Err(e) => { + pf_error!(self.id; "error handling ctrl msg: {}", e); + } } }, // receiving termination signal - _ = rx_term.recv() => { + _ = rx_term.changed() => { pf_warn!(self.id; "server caught termination signal"); return Ok(false); } } } } + + fn id(&self) -> ReplicaId { + self.id + } } /// Configuration parameters struct. @@ -1220,9 +1280,6 @@ pub struct RSPaxosClient { /// Client ID. id: ClientId, - /// Address of the cluster manager oracle. - manager: SocketAddr, - /// Configuration parameters struct. _config: ClientConfigRSPaxos, @@ -1233,7 +1290,7 @@ pub struct RSPaxosClient { server_id: ReplicaId, /// Control API stub to the cluster manager. - ctrl_stub: Option, + ctrl_stub: ClientCtrlStub, /// API stubs for communicating with servers. 
api_stub: Option, @@ -1241,47 +1298,43 @@ pub struct RSPaxosClient { #[async_trait] impl GenericEndpoint for RSPaxosClient { - fn new( + async fn new_and_setup( manager: SocketAddr, config_str: Option<&str>, ) -> Result { + // connect to the cluster manager and get assigned a client ID + let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; + let id = ctrl_stub.id; + + // parse protocol-specific configs let config = parsed_config!(config_str => ClientConfigRSPaxos; init_server_id)?; let init_server_id = config.init_server_id; Ok(RSPaxosClient { - id: 255, // nil at this time - manager, + id, _config: config, servers: HashMap::new(), server_id: init_server_id, - ctrl_stub: None, + ctrl_stub, api_stub: None, }) } - async fn connect(&mut self) -> Result { + async fn connect(&mut self) -> Result<(), SummersetError> { // disallow reconnection without leaving if self.api_stub.is_some() { return logged_err!(self.id; "reconnecting without leaving"); } - // if ctrl_stubs not established yet, connect to the manager - if self.ctrl_stub.is_none() { - let ctrl_stub = - ClientCtrlStub::new_by_connect(self.manager).await?; - self.id = ctrl_stub.id; - self.ctrl_stub = Some(ctrl_stub); - } - let ctrl_stub = self.ctrl_stub.as_mut().unwrap(); - // ask the manager about the list of active servers - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; while !sent { - sent = ctrl_stub.send_req(None)?; + sent = self.ctrl_stub.send_req(None)?; } - let reply = ctrl_stub.recv_reply().await?; + let reply = self.ctrl_stub.recv_reply().await?; match reply { CtrlReply::QueryInfo { servers } => { // connect to the one with server ID in config @@ -1292,7 +1345,7 @@ impl GenericEndpoint for RSPaxosClient { .await?; self.api_stub = Some(api_stub); self.servers = servers; - Ok(self.id) + Ok(()) } _ => logged_err!(self.id; "unexpected reply type received"), } @@ -1320,26 +1373,19 @@ impl GenericEndpoint for RSPaxosClient { // if permanently leaving, send leave notification to the manager if permanent { - // disallow multiple permanent leaving - if self.ctrl_stub.is_none() { - return logged_err!(self.id; "repeated permanent leaving"); + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; + while !sent { + sent = self.ctrl_stub.send_req(None)?; } - if let Some(mut ctrl_stub) = self.ctrl_stub.take() { - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; - while !sent { - sent = ctrl_stub.send_req(None)?; + let reply = self.ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::Leave => { + pf_info!(self.id; "left current manager connection"); } - - let reply = ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - ctrl_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } + _ => { + return logged_err!(self.id; "unexpected reply type received"); } } } @@ -1385,4 +1431,12 @@ impl GenericEndpoint for RSPaxosClient { None => logged_err!(self.id; "client is not set up"), } } + + fn id(&self) -> ClientId { + self.id + } + + fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + &mut self.ctrl_stub + } } diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index b0156ad3..7d9aa763 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -22,7 +22,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; -use 
tokio::sync::mpsc; +use tokio::sync::watch; /// Configuration parameters struct. #[derive(Debug, Deserialize)] @@ -383,11 +383,61 @@ impl SimplePushReplica { Ok(()) } - /// Synthesized handler of manager control messages. - fn handle_ctrl_msg(&mut self, _msg: CtrlMsg) -> Result<(), SummersetError> { - // TODO: fill this when more control message types added + /// Handler of ResetState control message. + async fn handle_ctrl_reset_state( + &mut self, + durable: bool, + ) -> Result<(), SummersetError> { + // send leave notification to peers and wait for their replies + self.transport_hub.leave().await?; + + // send leave notification to manager and wait for its reply + self.control_hub.send_ctrl(CtrlMsg::Leave)?; + while self.control_hub.recv_ctrl().await? != CtrlMsg::LeaveReply {} + + // if `durable` is false, truncate backer file + if !durable { + // use 0 as a special log action ID here + self.storage_hub + .submit_action(0, LogAction::Truncate { offset: 0 })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id == 0 { + if log_result + != (LogResult::Truncate { + offset_ok: true, + now_size: 0, + }) + { + return logged_err!(self.id; "failed to truncate log to 0"); + } else { + return Ok(()); + } + } + } + } + Ok(()) } + + /// Synthesized handler of manager control messages. If ok, returns + /// `Some(true)` if decides to terminate and reboot, `Some(false)` if + /// decides to shutdown completely, and `None` if not terminating. + async fn handle_ctrl_msg( + &mut self, + msg: CtrlMsg, + ) -> Result, SummersetError> { + // TODO: fill this when more control message types added + match msg { + CtrlMsg::ResetState { durable } => { + self.handle_ctrl_reset_state(durable).await?; + Ok(Some(true)) + } + + _ => Ok(None), // ignore all other types + } + } } #[async_trait] @@ -476,15 +526,10 @@ impl GenericReplica for SimplePushReplica { }) } - async fn run(&mut self) -> Result { - // set up termination signals handler - let (tx_term, mut rx_term) = mpsc::unbounded_channel(); - ctrlc::set_handler(move || { - if let Err(e) = tx_term.send(true) { - pf_error!("s"; "error sending to term channel: {}", e); - } - })?; - + async fn run( + &mut self, + mut rx_term: watch::Receiver, + ) -> Result { loop { tokio::select! { // client request batch @@ -552,19 +597,34 @@ impl GenericReplica for SimplePushReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { - pf_error!(self.id; "error handling ctrl msg: {}", e); + match self.handle_ctrl_msg(ctrl_msg).await { + Ok(terminate) => { + if let Some(restart) = terminate { + pf_warn!( + self.id; + "server got {} req", + if restart { "restart" } else { "shutdown" }); + return Ok(restart); + } + }, + Err(e) => { + pf_error!(self.id; "error handling ctrl msg: {}", e); + } } }, // receiving termination signal - _ = rx_term.recv() => { + _ = rx_term.changed() => { pf_warn!(self.id; "server caught termination signal"); return Ok(false); } } } } + + fn id(&self) -> ReplicaId { + self.id + } } /// Configuration parameters struct. @@ -586,14 +646,11 @@ pub struct SimplePushClient { /// Client ID. id: ClientId, - /// Address of the cluster manager oracle. - manager: SocketAddr, - /// Configuration parameters struct. config: ClientConfigSimplePush, /// Control API stub to the cluster manager. - ctrl_stub: Option, + ctrl_stub: ClientCtrlStub, /// API stubs for communicating with servers. 
api_stub: Option, @@ -601,44 +658,40 @@ pub struct SimplePushClient { #[async_trait] impl GenericEndpoint for SimplePushClient { - fn new( + async fn new_and_setup( manager: SocketAddr, config_str: Option<&str>, ) -> Result { + // connect to the cluster manager and get assigned a client ID + let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; + let id = ctrl_stub.id; + + // parse protocol-specific configs let config = parsed_config!(config_str => ClientConfigSimplePush; server_id)?; Ok(SimplePushClient { - id: 255, // nil at this time - manager, + id, config, - ctrl_stub: None, + ctrl_stub, api_stub: None, }) } - async fn connect(&mut self) -> Result { + async fn connect(&mut self) -> Result<(), SummersetError> { // disallow reconnection without leaving if self.api_stub.is_some() { return logged_err!(self.id; "reconnecting without leaving"); } - // if ctrl_stubs not established yet, connect to the manager - if self.ctrl_stub.is_none() { - let ctrl_stub = - ClientCtrlStub::new_by_connect(self.manager).await?; - self.id = ctrl_stub.id; - self.ctrl_stub = Some(ctrl_stub); - } - let ctrl_stub = self.ctrl_stub.as_mut().unwrap(); - // ask the manager about the list of active servers - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; while !sent { - sent = ctrl_stub.send_req(None)?; + sent = self.ctrl_stub.send_req(None)?; } - let reply = ctrl_stub.recv_reply().await?; + let reply = self.ctrl_stub.recv_reply().await?; match reply { CtrlReply::QueryInfo { servers } => { // connect to the one with server ID in config @@ -648,7 +701,7 @@ impl GenericEndpoint for SimplePushClient { ) .await?; self.api_stub = Some(api_stub); - Ok(self.id) + Ok(()) } _ => logged_err!(self.id; "unexpected reply type received"), } @@ -676,26 +729,19 @@ impl GenericEndpoint for SimplePushClient { // if permanently leaving, send leave notification to the manager if permanent { - // disallow multiple permanent leaving - if self.ctrl_stub.is_none() { - return logged_err!(self.id; "repeated permanent leaving"); + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; + while !sent { + sent = self.ctrl_stub.send_req(None)?; } - if let Some(mut ctrl_stub) = self.ctrl_stub.take() { - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; - while !sent { - sent = ctrl_stub.send_req(None)?; + let reply = self.ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::Leave => { + pf_info!(self.id; "left current manager connection"); } - - let reply = ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - ctrl_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } + _ => { + return logged_err!(self.id; "unexpected reply type received"); } } } @@ -719,4 +765,12 @@ impl GenericEndpoint for SimplePushClient { None => logged_err!(self.id; "client is not set up"), } } + + fn id(&self) -> ClientId { + self.id + } + + fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + &mut self.ctrl_stub + } } diff --git a/src/server/external.rs b/src/server/external.rs index 3083a662..c52a946c 100644 --- a/src/server/external.rs +++ b/src/server/external.rs @@ -519,6 +519,7 @@ mod external_tests { ) .await?; barrier2.wait().await; + // recv requests from client let mut reqs: Vec<(ClientId, ApiRequest)> = vec![]; while reqs.len() < 3 { let mut req_batch = api.get_req_batch().await?; @@ -551,6 +552,7 @@ mod 
external_tests { cmd: Command::Get { key: "Jose".into() }, } ); + // send replies to client api.send_reply( ApiReply::Reply { id: 0, @@ -584,6 +586,7 @@ mod external_tests { let mut api_stub = ClientApiStub::new_by_connect(2857, "127.0.0.1:53700".parse()?) .await?; + // send requests to server api_stub.send_req(Some(&ApiRequest::Req { id: 0, cmd: Command::Put { @@ -599,6 +602,7 @@ mod external_tests { id: 1, cmd: Command::Get { key: "Jose".into() }, }))?; + // recv replies from server assert_eq!( api_stub.recv_reply().await?, ApiReply::Reply { @@ -642,6 +646,7 @@ mod external_tests { ) .await?; barrier2.wait().await; + // recv request from client let mut reqs: Vec<(ClientId, ApiRequest)> = vec![]; while reqs.is_empty() { let mut req_batch = api.get_req_batch().await?; @@ -660,6 +665,7 @@ mod external_tests { }, } ); + // send reply to client api.send_reply( ApiReply::Reply { id: 0, @@ -668,6 +674,7 @@ mod external_tests { }, client, )?; + // recv request from new client reqs.clear(); while reqs.is_empty() { let mut req_batch = api.get_req_batch().await?; @@ -687,6 +694,7 @@ mod external_tests { }, } ); + // send reply to new client api.send_reply( ApiReply::Reply { id: 0, @@ -704,6 +712,7 @@ mod external_tests { let mut api_stub = ClientApiStub::new_by_connect(2857, "127.0.0.1:54700".parse()?) .await?; + // send request to server api_stub.send_req(Some(&ApiRequest::Req { id: 0, cmd: Command::Put { @@ -711,6 +720,7 @@ mod external_tests { value: "123".into(), }, }))?; + // recv reply from server assert_eq!( api_stub.recv_reply().await?, ApiReply::Reply { @@ -719,13 +729,15 @@ mod external_tests { redirect: None, } ); + // leave and come back as new client api_stub.send_req(Some(&ApiRequest::Leave))?; assert_eq!(api_stub.recv_reply().await?, ApiReply::Leave); api_stub.forget(); - time::sleep(Duration::from_millis(1)).await; + time::sleep(Duration::from_millis(100)).await; let mut api_stub = ClientApiStub::new_by_connect(2858, "127.0.0.1:54700".parse()?) .await?; + // send request to server api_stub.send_req(Some(&ApiRequest::Req { id: 0, cmd: Command::Put { @@ -733,6 +745,7 @@ mod external_tests { value: "456".into(), }, }))?; + // recv reply from server assert_eq!( api_stub.recv_reply().await?, ApiReply::Reply { diff --git a/src/server/replica.rs b/src/server/replica.rs index cae9d042..6c305ad3 100644 --- a/src/server/replica.rs +++ b/src/server/replica.rs @@ -7,6 +7,8 @@ use crate::utils::SummersetError; use async_trait::async_trait; +use tokio::sync::watch; + /// Server replica ID type. pub type ReplicaId = u8; @@ -28,5 +30,11 @@ pub trait GenericReplica { /// terminated normally and wants to restart (e.g., receiving a reset /// control message) or `Ok(false)` if terminated normally and does not /// want to restart (e.g., receiving a termination signal). - async fn run(&mut self) -> Result; + async fn run( + &mut self, + rx_term: watch::Receiver, // termination signals channel + ) -> Result; + + /// Gets my replica ID. 
+ fn id(&self) -> ReplicaId; } diff --git a/src/server/transport.rs b/src/server/transport.rs index 8f5f69cf..e7ca2998 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -11,7 +11,7 @@ use crate::server::ReplicaId; use bytes::BytesMut; -use serde::{Serialize, de::DeserializeOwned}; +use serde::{Serialize, Deserialize, de::DeserializeOwned}; use tokio::net::{TcpListener, TcpStream}; use tokio::net::tcp::{OwnedReadHalf, OwnedWriteHalf}; @@ -20,6 +20,19 @@ use tokio::sync::mpsc; use tokio::task::JoinHandle; use tokio::time::{self, Duration}; +/// Peer-peer message wrapper type that includes leave notification variants. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +enum PeerMessage { + /// Normal protocol-specific request. + Msg { msg: Msg }, + + /// Server leave notification. + Leave, + + /// Reply to leave notification. + LeaveReply, +} + /// Server internal TCP transport module. pub struct TransportHub { /// My replica ID. @@ -29,11 +42,14 @@ pub struct TransportHub { population: u8, /// Receiver side of the recv channel. - rx_recv: mpsc::UnboundedReceiver<(ReplicaId, Msg)>, + rx_recv: mpsc::UnboundedReceiver<(ReplicaId, PeerMessage)>, /// Map from peer ID -> sender side of the send channel, shared with the /// peer acceptor thread. - tx_sends: flashmap::ReadHandle>, + tx_sends: flashmap::ReadHandle< + ReplicaId, + mpsc::UnboundedSender>, + >, /// Join handle of the peer acceptor thread. _peer_acceptor_handle: JoinHandle<()>, @@ -76,8 +92,10 @@ where let (tx_recv, rx_recv) = mpsc::unbounded_channel(); - let (tx_sends_write, tx_sends_read) = - flashmap::new::>(); + let (tx_sends_write, tx_sends_read) = flashmap::new::< + ReplicaId, + mpsc::UnboundedSender>, + >(); let (peer_messenger_handles_write, peer_messenger_handles_read) = flashmap::new::>(); @@ -141,7 +159,7 @@ where logged_err!(self.me; "invalid group size {}", group) } else { while self.current_peers()?.count() + 1 < group { - time::sleep(Duration::from_millis(10)).await; + time::sleep(Duration::from_millis(100)).await; } Ok(()) } @@ -170,7 +188,7 @@ where match tx_sends_guard.get(&peer) { Some(tx_send) => { tx_send - .send(msg) + .send(PeerMessage::Msg { msg }) .map_err(|e| SummersetError(e.to_string()))?; } None => { @@ -207,7 +225,7 @@ where tx_sends_guard .get(&peer) .unwrap() - .send(msg.clone()) + .send(PeerMessage::Msg { msg: msg.clone() }) .map_err(|e| SummersetError(e.to_string()))?; } @@ -220,10 +238,47 @@ where &mut self, ) -> Result<(ReplicaId, Msg), SummersetError> { match self.rx_recv.recv().await { - Some((id, msg)) => Ok((id, msg)), + Some((id, peer_msg)) => match peer_msg { + PeerMessage::Msg { msg } => Ok((id, msg)), + _ => logged_err!(self.me; "unexpected peer message type"), + }, None => logged_err!(self.me; "recv channel has been closed"), } } + + /// Broadcasts leave notifications to all peers and waits for replies. 
+ pub async fn leave(&mut self) -> Result<(), SummersetError> { + let tx_sends_guard = self.tx_sends.guard(); + let mut num_peers = 0; + for &peer in tx_sends_guard.keys() { + if peer == self.me { + continue; + } + + // not skipped + tx_sends_guard + .get(&peer) + .unwrap() + .send(PeerMessage::Leave) + .map_err(|e| SummersetError(e.to_string()))?; + num_peers += 1; + } + + let mut replies = Bitmap::new(self.population, false); + while replies.count() < num_peers { + match self.rx_recv.recv().await { + Some((id, peer_msg)) => match peer_msg { + PeerMessage::LeaveReply => replies.set(id, true)?, + _ => continue, // ignore all other types of messages + }, + None => { + return logged_err!(self.me; "recv channel has been closed"); + } + } + } + + Ok(()) + } } // TransportHub peer_acceptor thread implementation @@ -242,10 +297,10 @@ where me: ReplicaId, id: ReplicaId, addr: SocketAddr, - tx_recv: mpsc::UnboundedSender<(ReplicaId, Msg)>, + tx_recv: mpsc::UnboundedSender<(ReplicaId, PeerMessage)>, tx_sends: &mut flashmap::WriteHandle< ReplicaId, - mpsc::UnboundedSender, + mpsc::UnboundedSender>, >, peer_messenger_handles: &mut flashmap::WriteHandle< ReplicaId, @@ -280,10 +335,10 @@ where me: ReplicaId, mut stream: TcpStream, addr: SocketAddr, - tx_recv: mpsc::UnboundedSender<(ReplicaId, Msg)>, + tx_recv: mpsc::UnboundedSender<(ReplicaId, PeerMessage)>, tx_sends: &mut flashmap::WriteHandle< ReplicaId, - mpsc::UnboundedSender, + mpsc::UnboundedSender>, >, peer_messenger_handles: &mut flashmap::WriteHandle< ReplicaId, @@ -321,7 +376,7 @@ where id: ReplicaId, tx_sends: &mut flashmap::WriteHandle< ReplicaId, - mpsc::UnboundedSender, + mpsc::UnboundedSender>, >, peer_messenger_handles: &mut flashmap::WriteHandle< ReplicaId, @@ -343,11 +398,11 @@ where /// Peer acceptor thread function. 
async fn peer_acceptor_thread( me: ReplicaId, - tx_recv: mpsc::UnboundedSender<(ReplicaId, Msg)>, + tx_recv: mpsc::UnboundedSender<(ReplicaId, PeerMessage)>, peer_listener: TcpListener, mut tx_sends: flashmap::WriteHandle< ReplicaId, - mpsc::UnboundedSender, + mpsc::UnboundedSender>, >, mut peer_messenger_handles: flashmap::WriteHandle< ReplicaId, @@ -444,7 +499,7 @@ where write_buf: &mut BytesMut, write_buf_cursor: &mut usize, conn_write: &OwnedWriteHalf, - msg: Option<&Msg>, + msg: Option<&PeerMessage>, ) -> Result { safe_tcp_write(write_buf, write_buf_cursor, conn_write, msg) } @@ -455,7 +510,7 @@ where // message itself read_buf: &mut BytesMut, conn_read: &mut OwnedReadHalf, - ) -> Result { + ) -> Result, SummersetError> { safe_tcp_read(read_buf, conn_read).await } @@ -465,8 +520,8 @@ where id: ReplicaId, // corresonding peer's ID addr: SocketAddr, // corresponding peer's address conn: TcpStream, - mut rx_send: mpsc::UnboundedReceiver, - tx_recv: mpsc::UnboundedSender<(ReplicaId, Msg)>, + mut rx_send: mpsc::UnboundedReceiver>, + tx_recv: mpsc::UnboundedSender<(ReplicaId, PeerMessage)>, tx_exit: mpsc::UnboundedSender, ) { pf_debug!(me; "peer_messenger thread for {} ({}) spawned", id, addr); @@ -482,12 +537,32 @@ where // gets a message to send out msg = rx_send.recv(), if !retrying => { match msg { - Some(msg) => { + Some(PeerMessage::Leave) => { + // I decide to leave, notify peers + let peer_msg = PeerMessage::Leave; + if let Err(e) = Self::write_msg( + &mut write_buf, + &mut write_buf_cursor, + &conn_write, + Some(&peer_msg), + ) { + pf_error!(me; "error sending -> {}: {}", id, e); + } else { // skips `WouldBlock` failure check here + pf_debug!(me; "sent leave notification -> {}", id); + } + }, + + Some(PeerMessage::LeaveReply) => { + pf_error!(me; "proactively sending LeaveReply msg"); + }, + + Some(PeerMessage::Msg { msg }) => { + let peer_msg = PeerMessage::Msg { msg }; match Self::write_msg( &mut write_buf, &mut write_buf_cursor, &conn_write, - Some(&msg), + Some(&peer_msg), ) { Ok(true) => { // pf_trace!(me; "sent -> {} msg {:?}", id, msg); @@ -501,6 +576,7 @@ where } } }, + None => break, // channel gets closed and no messages remain } }, @@ -508,9 +584,35 @@ where // receives new message from peer msg = Self::read_msg(&mut read_buf, &mut conn_read) => { match msg { - Ok(msg) => { + Ok(PeerMessage::Leave) => { + // peer leaving, send dummy reply and break + let peer_msg = PeerMessage::LeaveReply; + if let Err(e) = Self::write_msg( + &mut write_buf, + &mut write_buf_cursor, + &conn_write, + Some(&peer_msg), + ) { + pf_error!(me; "error sending -> {}: {}", id, e); + } else { // skips `WouldBlock` failure check here + pf_debug!(me; "peer {} has left", id); + } + break; + }, + + Ok(PeerMessage::LeaveReply) => { + // my leave notification is acked by peer, break + let peer_msg = PeerMessage::LeaveReply; + if let Err(e) = tx_recv.send((id, peer_msg)) { + pf_error!(me; "error sending to tx_recv for {}: {}", id, e); + } + break; + } + + Ok(PeerMessage::Msg { msg }) => { // pf_trace!(me; "recv <- {} msg {:?}", id, msg); - if let Err(e) = tx_recv.send((id, msg)) { + let peer_msg = PeerMessage::Msg { msg }; + if let Err(e) = tx_recv.send((id, peer_msg)) { pf_error!(me; "error sending to tx_recv for {}: {}", id, e); } }, @@ -570,53 +672,53 @@ mod transport_tests { tokio::spawn(async move { // replica 1 let mut hub: TransportHub = - TransportHub::new_and_setup(1, 3, "127.0.0.1:54801".parse()?) + TransportHub::new_and_setup(1, 3, "127.0.0.1:53801".parse()?) 
.await?; barrier1.wait().await; - hub.connect_to_peer(2, "127.0.0.1:54802".parse()?).await?; + hub.connect_to_peer(2, "127.0.0.1:53802".parse()?).await?; // recv a message from 0 let (id, msg) = hub.recv_msg().await?; - assert!(id == 0); + assert_eq!(id, 0); assert_eq!(msg, TestMsg("hello".into())); // send a message to 0 hub.send_msg(TestMsg("world".into()), 0)?; // recv another message from 0 let (id, msg) = hub.recv_msg().await?; - assert!(id == 0); + assert_eq!(id, 0); assert_eq!(msg, TestMsg("nice".into())); // send another message to 0 hub.send_msg(TestMsg("job!".into()), 0)?; // wait for termination message let (id, msg) = hub.recv_msg().await?; - assert!(id == 0); + assert_eq!(id, 0); assert_eq!(msg, TestMsg("terminate".into())); Ok::<(), SummersetError>(()) }); tokio::spawn(async move { // replica 2 let mut hub: TransportHub = - TransportHub::new_and_setup(2, 3, "127.0.0.1:54802".parse()?) + TransportHub::new_and_setup(2, 3, "127.0.0.1:53802".parse()?) .await?; barrier2.wait().await; // recv a message from 0 let (id, msg) = hub.recv_msg().await?; - assert!(id == 0); + assert_eq!(id, 0); assert_eq!(msg, TestMsg("hello".into())); // send a message to 0 hub.send_msg(TestMsg("world".into()), 0)?; // wait for termination message let (id, msg) = hub.recv_msg().await?; - assert!(id == 0); + assert_eq!(id, 0); assert_eq!(msg, TestMsg("terminate".into())); Ok::<(), SummersetError>(()) }); // replica 0 let mut hub: TransportHub = - TransportHub::new_and_setup(0, 3, "127.0.0.1:54800".parse()?) + TransportHub::new_and_setup(0, 3, "127.0.0.1:53800".parse()?) .await?; barrier.wait().await; - hub.connect_to_peer(1, "127.0.0.1:54801".parse()?).await?; - hub.connect_to_peer(2, "127.0.0.1:54802".parse()?).await?; + hub.connect_to_peer(1, "127.0.0.1:53801".parse()?).await?; + hub.connect_to_peer(2, "127.0.0.1:53802".parse()?).await?; // send a message to 1 and 2 hub.bcast_msg(TestMsg("hello".into()), None)?; // recv a message from both 1 and 2 @@ -638,4 +740,49 @@ mod transport_tests { hub.bcast_msg(TestMsg("terminate".into()), None)?; Ok(()) } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn api_server_leave() -> Result<(), SummersetError> { + let barrier = Arc::new(Barrier::new(2)); + let barrier2 = barrier.clone(); + tokio::spawn(async move { + // replica 1/2 + let mut hub: TransportHub = + TransportHub::new_and_setup(1, 3, "127.0.0.1:54801".parse()?) + .await?; + barrier2.wait().await; + // recv a message from 0 + let (id, msg) = hub.recv_msg().await?; + assert_eq!(id, 0); + assert!(hub.current_peers()?.get(id)?); + assert_eq!(msg, TestMsg("goodbye".into())); + // leave and come back as 2 + hub.leave().await?; + time::sleep(Duration::from_millis(100)).await; + let mut hub: TransportHub = + TransportHub::new_and_setup(2, 3, "127.0.0.1:54802".parse()?) + .await?; + hub.connect_to_peer(0, "127.0.0.1:54800".parse()?).await?; + // send a message to 0 + hub.send_msg(TestMsg("hello".into()), 0)?; + Ok::<(), SummersetError>(()) + }); + // replica 0 + let mut hub: TransportHub = + TransportHub::new_and_setup(0, 3, "127.0.0.1:54800".parse()?) 
+ .await?; + barrier.wait().await; + hub.connect_to_peer(1, "127.0.0.1:54801".parse()?).await?; + assert!(hub.current_peers()?.get(1)?); + assert!(!hub.current_peers()?.get(2)?); + // send a message to 1 + hub.send_msg(TestMsg("goodbye".into()), 1)?; + // recv a message from 2 + let (id, msg) = hub.recv_msg().await?; + assert_eq!(id, 2); + assert_eq!(msg, TestMsg("hello".into())); + assert!(!hub.current_peers()?.get(1)?); + assert!(hub.current_peers()?.get(2)?); + Ok(()) + } } diff --git a/summerset_client/Cargo.toml b/summerset_client/Cargo.toml index 9633986c..bf270e74 100644 --- a/summerset_client/Cargo.toml +++ b/summerset_client/Cargo.toml @@ -7,7 +7,7 @@ authors = ["Guanzhou Jose Hu "] [dependencies] summerset = { path = "../" } -tokio = { version = "1.29", features = ["macros", "rt-multi-thread"] } +tokio = { version = "1.29", features = ["full"] } rand = "0.8" lazy_static = "1.4" clap = { version = "4.0", features = ["derive"] } diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index 98743091..9ad2c3d6 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -18,8 +18,8 @@ use serde::Deserialize; use tokio::time::Duration; use summerset::{ - GenericEndpoint, CommandResult, RequestId, SummersetError, pf_error, - logged_err, parsed_config, + GenericEndpoint, CommandResult, RequestId, CtrlRequest, CtrlReply, + SummersetError, pf_error, logged_err, parsed_config, }; lazy_static! { @@ -215,13 +215,35 @@ impl ClientTester { } } + /// Resets all servers in the cluster to initial empty state. + async fn reset_cluster(&mut self) -> Result<(), SummersetError> { + let ctrl_stub = self.driver.ctrl_stub(); + + // send ResetServer request to manager + let req = CtrlRequest::ResetServer { + server: None, + durable: false, + }; + let mut sent = ctrl_stub.send_req(Some(&req))?; + while !sent { + sent = ctrl_stub.send_req(None)?; + } + + // wait for reply from manager + let reply = ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::ResetServer { .. } => Ok(()), + _ => logged_err!("c"; "unexpected control reply type"), + } + } + /// Runs the individual correctness test. async fn do_test_by_name( &mut self, name: &str, ) -> Result<(), SummersetError> { // reset everything to initial state at the start of each test - // TODO: reset service state here + self.reset_cluster().await?; self.driver.connect().await?; self.cached_replies.clear(); diff --git a/summerset_client/src/drivers/closed_loop.rs b/summerset_client/src/drivers/closed_loop.rs index c3b1761c..cb361cf5 100644 --- a/summerset_client/src/drivers/closed_loop.rs +++ b/summerset_client/src/drivers/closed_loop.rs @@ -4,13 +4,14 @@ use tokio::time::{Duration, Instant}; use summerset::{ GenericEndpoint, ClientId, Command, CommandResult, ApiRequest, ApiReply, - RequestId, Timer, SummersetError, pf_debug, pf_error, logged_err, + RequestId, ClientCtrlStub, Timer, SummersetError, pf_debug, pf_error, + logged_err, }; /// Closed-loop driver struct. pub struct DriverClosedLoop { /// Client ID. - id: ClientId, + pub id: ClientId, /// Protocol-specific client endpoint. endpoint: Box, @@ -29,7 +30,7 @@ impl DriverClosedLoop { /// Creates a new closed-loop client. pub fn new(endpoint: Box, timeout: Duration) -> Self { DriverClosedLoop { - id: 255, // nil at this time + id: endpoint.id(), endpoint, next_req: 0, timer: Timer::new(), @@ -39,9 +40,7 @@ impl DriverClosedLoop { /// Establishes connection with the service. 
pub async fn connect(&mut self) -> Result<(), SummersetError> { - let id = self.endpoint.connect().await?; - self.id = id; - Ok(()) + self.endpoint.connect().await } /// Sends leave notification and forgets about the current TCP connections. @@ -185,4 +184,10 @@ impl DriverClosedLoop { _ => logged_err!(self.id; "unexpected reply type received"), } } + + /// Gets a mutable reference to the endpoint's control stub. + #[allow(dead_code)] + pub fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + self.endpoint.ctrl_stub() + } } diff --git a/summerset_client/src/drivers/open_loop.rs b/summerset_client/src/drivers/open_loop.rs index d07c4351..433f68a2 100644 --- a/summerset_client/src/drivers/open_loop.rs +++ b/summerset_client/src/drivers/open_loop.rs @@ -11,13 +11,14 @@ use tokio::time::{Duration, Instant}; use summerset::{ GenericEndpoint, ClientId, Command, CommandResult, ApiRequest, ApiReply, - RequestId, Timer, SummersetError, pf_debug, pf_error, logged_err, + RequestId, ClientCtrlStub, Timer, SummersetError, pf_debug, pf_error, + logged_err, }; /// Open-loop driver struct. pub struct DriverOpenLoop { /// Client ID. - id: ClientId, + pub id: ClientId, /// Protocol-specific client endpoint. endpoint: Box, @@ -43,7 +44,7 @@ impl DriverOpenLoop { /// Creates a new open-loop client. pub fn new(endpoint: Box, timeout: Duration) -> Self { DriverOpenLoop { - id: 255, // nil at this time + id: endpoint.id(), endpoint, next_req: 0, pending_reqs: HashMap::new(), @@ -55,9 +56,7 @@ impl DriverOpenLoop { /// Establishes connection with the service. pub async fn connect(&mut self) -> Result<(), SummersetError> { - let id = self.endpoint.connect().await?; - self.id = id; - Ok(()) + self.endpoint.connect().await } /// Waits for all pending replies to be received, then sends leave @@ -211,4 +210,9 @@ impl DriverOpenLoop { _ => logged_err!(self.id; "unexpected reply type received"), } } + + /// Gets a mutable reference to the endpoint's control stub. 
+ pub fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + self.endpoint.ctrl_stub() + } } diff --git a/summerset_client/src/main.rs b/summerset_client/src/main.rs index 26346720..81fbcdba 100644 --- a/summerset_client/src/main.rs +++ b/summerset_client/src/main.rs @@ -113,8 +113,9 @@ fn client_main() -> Result<(), SummersetError> { // enter tokio runtime, connect to the service, and do work runtime.block_on(async move { - let endpoint = - protocol.new_client_endpoint(args.manager, config_str)?; + let endpoint = protocol + .new_client_endpoint(args.manager, config_str) + .await?; match mode { ClientMode::Repl => { diff --git a/summerset_manager/Cargo.toml b/summerset_manager/Cargo.toml index f0464aa8..f2920305 100644 --- a/summerset_manager/Cargo.toml +++ b/summerset_manager/Cargo.toml @@ -7,8 +7,9 @@ authors = ["Guanzhou Jose Hu "] [dependencies] summerset = { path = "../" } -tokio = { version = "1.29", features = ["macros", "rt-multi-thread"] } +tokio = { version = "1.29", features = ["full"] } rand = "0.8" clap = { version = "4.0", features = ["derive"] } log = "0.4" env_logger = "0.10" +ctrlc = { version = "3.4", features = ["termination"] } diff --git a/summerset_manager/src/main.rs b/summerset_manager/src/main.rs index 6b886372..9a08319e 100644 --- a/summerset_manager/src/main.rs +++ b/summerset_manager/src/main.rs @@ -5,11 +5,14 @@ use std::process::ExitCode; use clap::Parser; +use log::{self, LevelFilter}; + use env_logger::Env; use tokio::runtime::Builder; +use tokio::sync::watch; -use summerset::{SmrProtocol, SummersetError, pf_warn, pf_error}; +use summerset::{SmrProtocol, SummersetError, pf_error}; /// Command line arguments definition. #[derive(Parser, Debug)] @@ -99,24 +102,43 @@ fn manager_main() -> Result<(), SummersetError> { )) })?; - // create tokio multi-threaded runtime - let runtime = Builder::new_multi_thread() - .enable_all() - .worker_threads(args.threads) - .thread_name("tokio-worker-manager") - .build()?; + // set up termination signals handler + let (tx_term, rx_term) = watch::channel(false); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("m"; "error sending to term channel: {}", e); + } + })?; + + let log_level = log::max_level(); - // enter tokio runtime, setup the cluster manager, and start the main - // event loop logic - runtime.block_on(async move { - let mut manager = protocol - .new_cluster_manager_setup(srv_addr, cli_addr, args.population) - .await?; + { + // create tokio multi-threaded runtime + let runtime = Builder::new_multi_thread() + .enable_all() + .worker_threads(args.threads) + .thread_name("tokio-worker-manager") + .build()?; - manager.run().await?; + // enter tokio runtime, setup the cluster manager, and start the main + // event loop logic + runtime.block_on(async move { + let mut manager = protocol + .new_cluster_manager_setup(srv_addr, cli_addr, args.population) + .await?; + + manager.run(rx_term).await?; + + // suppress logging before dropping the runtime to avoid spurious + // error messages + log::set_max_level(LevelFilter::Off); + + Ok::<(), SummersetError>(()) // give type hint for this async closure + })?; + } - Ok::<(), SummersetError>(()) // give type hint for this async closure - }) + log::set_max_level(log_level); + Ok(()) } fn main() -> ExitCode { @@ -130,7 +152,7 @@ fn main() -> ExitCode { pf_error!("m"; "manager_main exitted: {}", e); ExitCode::FAILURE } else { - pf_warn!("m"; "manager_main exitted successfully"); + // pf_warn!("m"; "manager_main exitted successfully"); ExitCode::SUCCESS } } 
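(Editor's note, not part of the patch.) The hunks above move termination handling out of each replica's `run()` and into the binaries: `main()` installs a single ctrl-c handler that publishes into a `tokio::sync::watch` channel, and every runtime's event loop observes it via `rx_term.changed()` inside `tokio::select!`. Below is a minimal, self-contained sketch of that pattern, assuming the `tokio` ("full" features) and `ctrlc` crates added in the Cargo.toml hunks; the `event_loop` and `tick` names are placeholders standing in for the replicas' real request/message/log-result branches, not Summerset code.

```rust
use std::error::Error;

use tokio::runtime::Builder;
use tokio::sync::watch;
use tokio::time::{self, Duration};

/// Stand-in event loop: runs until the termination flag flips, mirroring how
/// the patched `run(rx_term)` methods select on `rx_term.changed()`.
async fn event_loop(mut rx_term: watch::Receiver<bool>) {
    let mut tick = time::interval(Duration::from_millis(100));
    loop {
        tokio::select! {
            // placeholder for the real client/peer/storage event branches
            _ = tick.tick() => {}
            // termination signal observed through the watch channel
            _ = rx_term.changed() => return,
        }
    }
}

fn main() -> Result<(), Box<dyn Error>> {
    // the sender half lives in the ctrl-c handler, installed once per process
    let (tx_term, rx_term) = watch::channel(false);
    ctrlc::set_handler(move || {
        let _ = tx_term.send(true);
    })?;

    // the runtime is created after the handler, as in the patched binaries
    let runtime = Builder::new_multi_thread().enable_all().build()?;
    runtime.block_on(event_loop(rx_term));
    Ok(())
}
```

A `watch` receiver can be cloned cheaply (the server binary clones it as `rx_term_clone` for each restart iteration), so one process-wide ctrl-c handler can outlive any number of dropped and recreated runtimes, which the previous per-`run()` mpsc channel setup could not.
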
diff --git a/summerset_server/Cargo.toml b/summerset_server/Cargo.toml index 0a8ad28b..3058e797 100644 --- a/summerset_server/Cargo.toml +++ b/summerset_server/Cargo.toml @@ -7,8 +7,9 @@ authors = ["Guanzhou Jose Hu "] [dependencies] summerset = { path = "../" } -tokio = { version = "1.29", features = ["macros", "rt-multi-thread"] } +tokio = { version = "1.29", features = ["full"] } rand = "0.8" clap = { version = "4.0", features = ["derive"] } log = "0.4" env_logger = "0.10" +ctrlc = { version = "3.4", features = ["termination"] } diff --git a/summerset_server/src/main.rs b/summerset_server/src/main.rs index abbbc20d..800ae9e1 100644 --- a/summerset_server/src/main.rs +++ b/summerset_server/src/main.rs @@ -7,11 +7,14 @@ use std::sync::atomic::{AtomicBool, Ordering}; use clap::Parser; +use log::{self, LevelFilter}; + use env_logger::Env; use tokio::runtime::Builder; +use tokio::sync::watch; -use summerset::{SmrProtocol, SummersetError, pf_warn, pf_error}; +use summerset::{SmrProtocol, SummersetError, pf_error}; /// Command line arguments definition. #[derive(Parser, Debug)] @@ -109,9 +112,21 @@ fn server_main() -> Result<(), SummersetError> { Some(&args.config[..]) }; + // set up termination signals handler + let (tx_term, rx_term) = watch::channel(false); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("s"; "error sending to term channel: {}", e); + } + })?; + + let log_level = log::max_level(); let shutdown = Arc::new(AtomicBool::new(false)); + while !shutdown.load(Ordering::SeqCst) { - let sd = shutdown.clone(); + log::set_max_level(log_level); + let shutdown_clone = shutdown.clone(); + let rx_term_clone = rx_term.clone(); // create tokio multi-threaded runtime let runtime = Builder::new_multi_thread() @@ -132,20 +147,25 @@ fn server_main() -> Result<(), SummersetError> { ) .await?; - if replica.run().await? { + if replica.run(rx_term_clone).await? { // event loop terminated but wants to restart (e.g., when // receiving a reset control message); just drop this runtime // and move to the next iteration of loop } else { // event loop terminated and does not want to restart (e.g., // when receiving a termination signal) - sd.store(true, Ordering::SeqCst); + shutdown_clone.store(true, Ordering::SeqCst); } + // suppress logging before dropping the runtime to avoid spurious + // error messages + log::set_max_level(LevelFilter::Off); + Ok::<(), SummersetError>(()) // give type hint for this async closure })?; } + log::set_max_level(log_level); Ok(()) } @@ -160,7 +180,7 @@ fn main() -> ExitCode { pf_error!("s"; "server_main exitted: {}", e); ExitCode::FAILURE } else { - pf_warn!("s"; "server_main exitted successfully"); + // pf_warn!("s"; "server_main exitted successfully"); ExitCode::SUCCESS } } From c2fbbb0787ae5793c4cd3bc4b518825821e23930 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 14:45:47 +0800 Subject: [PATCH 21/89] minor updates to README --- README.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 22eb30f3..974767d1 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,10 @@ This is a private mirror of [Summerset](https://github.com/josehu07/summerset). Below are a memo of development commands... 
+[![Format check](https://github.com/josehu07/summerset-private/actions/workflows/format.yml/badge.svg)](https://github.com/josehu07/summerset-private/actions?query=josehu07%3Aformat) +[![Build status](https://github.com/josehu07/summerset-private/actions/workflows/build.yml/badge.svg)](https://github.com/josehu07/summerset-private/actions?query=josehu07%3Abuild) +[![Tests status](https://github.com/josehu07/summerset-private/actions/workflows/tests.yml/badge.svg)](https://github.com/josehu07/summerset-private/actions?query=josehu07%3Atests) +[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) + To create a branch to track public repo `main`, pull new things from it, and merge into the private `main`: ```bash @@ -31,9 +36,9 @@ git push origin # Summerset -[![Format check](https://github.com/josehu07/summerset-private/actions/workflows/format.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Aformat) -[![Build status](https://github.com/josehu07/summerset-private/actions/workflows/build.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Abuild) -[![Tests status](https://github.com/josehu07/summerset-private/actions/workflows/tests.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Atests) +[![Format check](https://github.com/josehu07/summerset/actions/workflows/format.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Aformat) +[![Build status](https://github.com/josehu07/summerset/actions/workflows/build.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Abuild) +[![Tests status](https://github.com/josehu07/summerset/actions/workflows/tests.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Atests) [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) Summerset is a distributed key-value store supporting a wide range of state machine replication (SMR) protocols for research purposes. More protocols are actively being added. 
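Note on the termination-handling pattern introduced in the server/manager `main.rs` changes above: a `ctrlc` handler publishes on a `tokio::sync::watch` channel, and the event loop exits once that channel flips. Below is a minimal, self-contained sketch of the same pattern, assuming only the `ctrlc` and `tokio` dependencies added in the Cargo.toml hunks; the sleep branch is an illustrative stand-in for real event-loop work and is not Summerset code.

```rust
use tokio::runtime::Builder;
use tokio::sync::watch;
use tokio::time::{self, Duration};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // flip a watch channel when a termination signal (e.g. Ctrl-C) arrives
    let (tx_term, mut rx_term) = watch::channel(false);
    ctrlc::set_handler(move || {
        // best-effort send; the receiver may already be gone at exit
        let _ = tx_term.send(true);
    })?;

    // multi-threaded runtime, as in server_main / manager_main
    let runtime = Builder::new_multi_thread().enable_all().build()?;
    runtime.block_on(async move {
        loop {
            tokio::select! {
                // stand-in for the real event-loop work
                _ = time::sleep(Duration::from_millis(100)) => {}
                // termination signal received: break out so the runtime
                // can be dropped cleanly
                _ = rx_term.changed() => break,
            }
        }
    });
    Ok(())
}
```

Handing the receiver (or a clone of it) down into `run()`, as the patches do, lets each inner event loop observe the same signal; muting the log level just before the runtime is dropped avoids the spurious error messages from tasks torn down mid-await.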
From c2a36f76b8015da9c285fea05670e2abcb40d7f1 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 19:45:08 +0800 Subject: [PATCH 22/89] add performance delay simulation support --- Cargo.lock | 119 +++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + scripts/local_bench.tmp.py | 12 ++++ src/protocols/crossword.rs | 48 +++++++++++--- src/protocols/multipaxos.rs | 46 +++++++++++--- src/protocols/rep_nothing.rs | 26 ++++++-- src/protocols/rs_paxos.rs | 46 +++++++++++--- src/protocols/simple_push.rs | 46 +++++++++++--- src/server/external.rs | 6 +- src/server/statemach.rs | 6 +- src/server/storage.rs | 48 +++++++++++--- src/server/transport.rs | 82 ++++++++++++++++++------ src/utils/rscoding.rs | 20 ++++++ 13 files changed, 439 insertions(+), 67 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 883efd71..79437b05 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -97,6 +97,34 @@ dependencies = [ "syn 2.0.28", ] +[[package]] +name = "attribute-derive" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c124f12ade4e670107b132722d0ad1a5c9790bcbc1b265336369ea05626b4498" +dependencies = [ + "attribute-derive-macro", + "proc-macro2", + "quote", + "syn 2.0.28", +] + +[[package]] +name = "attribute-derive-macro" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b217a07446e0fb086f83401a98297e2d81492122f5874db5391bd270a185f88" +dependencies = [ + "collection_literals", + "interpolator", + "proc-macro-error", + "proc-macro-utils", + "proc-macro2", + "quote", + "quote-use", + "syn 2.0.28", +] + [[package]] name = "autocfg" version = "1.1.0" @@ -201,6 +229,12 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" +[[package]] +name = "collection_literals" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186dce98367766de751c42c4f03970fc60fc012296e706ccbb9d5df9b6c1e271" + [[package]] name = "color-print" version = "0.3.4" @@ -240,6 +274,17 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "derive-where" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "875a0460143f2dbcc71fd8a63f34b7c83ac66f14bead94054e7cd619c57bbb27" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.28", +] + [[package]] name = "dirs" version = "4.0.0" @@ -426,6 +471,26 @@ dependencies = [ "windows", ] +[[package]] +name = "get-size" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b61e2dab7eedce93a83ab3468b919873ff16bac5a3e704011ff836d22b2120" +dependencies = [ + "get-size-derive", +] + +[[package]] +name = "get-size-derive" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13a1bcfb855c1f340d5913ab542e36f25a1c56f57de79022928297632435dec2" +dependencies = [ + "attribute-derive", + "quote", + "syn 2.0.28", +] + [[package]] name = "getrandom" version = "0.2.10" @@ -495,6 +560,12 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "interpolator" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71dd52191aae121e8611f1e8dc3e324dd0dd1dee1e6dd91d10ee07a3cfb4d9d8" + [[package]] name = "is-terminal" version = "0.4.9" @@ -808,6 +879,41 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn 1.0.109", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro-utils" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f59e109e2f795a5070e69578c4dc101068139f74616778025ae1011d4cd41a8" +dependencies = [ + "proc-macro2", + "quote", + "smallvec", +] + [[package]] name = "proc-macro2" version = "1.0.66" @@ -826,6 +932,18 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "quote-use" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58e9a38ef862d7fec635661503289062bc5b3035e61859a8de3d3f81823accd2" +dependencies = [ + "derive-where", + "proc-macro2", + "quote", + "syn 2.0.28", +] + [[package]] name = "rand" version = "0.8.5" @@ -1120,6 +1238,7 @@ dependencies = [ "fixedbitset", "flashmap", "futures", + "get-size", "lazy_static", "log", "rand", diff --git a/Cargo.toml b/Cargo.toml index 707f1150..663e5da3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,3 +23,4 @@ toml = { version = "0.7", features = ["parse"] } log = "0.4" reed-solomon-erasure = { version = "6.0", features = ["simd-accel"] } ctrlc = { version = "3.4", features = ["termination"] } +get-size = { version = "0.1", features = ["derive"] } diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index a76b77ea..7107732b 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -3,6 +3,12 @@ import statistics +PERF_STORAGE_ALPHA = 0 +PERF_STORAGE_BETA = 0 +PERF_NETWORK_ALPHA = 10000 +PERF_NETWORK_BETA = 100 + + def do_cargo_build(): print("Building everything...") cmd = ["cargo", "build", "--workspace", "-r"] @@ -125,6 +131,12 @@ def bench_round( configs.append(f"fault_tolerance={fault_tolerance}") if shards_per_replica is not None: configs.append(f"shards_per_replica={shards_per_replica}") + + configs.append(f"perf_storage_a={PERF_STORAGE_ALPHA}") + configs.append(f"perf_storage_b={PERF_STORAGE_BETA}") + configs.append(f"perf_network_a={PERF_NETWORK_ALPHA}") + configs.append(f"perf_network_b={PERF_NETWORK_BETA}") + proc_cluster = launch_cluster(protocol, num_replicas, "+".join(configs)) wait_cluster_setup(proc_cluster, num_replicas) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 493213d4..5aff1728 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -19,6 +19,8 @@ use crate::protocols::SmrProtocol; use async_trait::async_trait; +use get_size::GetSize; + use serde::{Serialize, Deserialize}; use tokio::time::Duration; @@ -47,6 +49,12 @@ pub struct ReplicaConfigCrossword { /// Number of shards to assign to each replica. // TODO: proper config options. 
pub shards_per_replica: u8, + + // Performance simulation params (all zeros means no perf simulation): + pub perf_storage_a: u64, + pub perf_storage_b: u64, + pub perf_network_a: u64, + pub perf_network_b: u64, } #[allow(clippy::derivable_impls)] @@ -59,6 +67,10 @@ impl Default for ReplicaConfigCrossword { logger_sync: false, fault_tolerance: 0, shards_per_replica: 1, + perf_storage_a: 0, + perf_storage_b: 0, + perf_network_a: 0, + perf_network_b: 0, } } } @@ -122,7 +134,7 @@ struct Instance { } /// Stable storage log entry type. -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum LogEntry { /// Records an update to the largest prepare ballot seen. PrepareBal { slot: usize, ballot: Ballot }, @@ -139,7 +151,7 @@ enum LogEntry { } /// Peer-peer message type. -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, GetSize)] enum PeerMsg { /// Prepare message from leader to replicas. Prepare { slot: usize, ballot: Ballot }, @@ -1174,8 +1186,10 @@ impl GenericReplica for CrosswordReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigCrossword; batch_interval_us, max_batch_size, - backer_path, logger_sync, fault_tolerance, - shards_per_replica)?; + backer_path, logger_sync, + fault_tolerance, shards_per_replica, + perf_storage_a, perf_storage_b, + perf_network_a, perf_network_b)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -1188,13 +1202,29 @@ impl GenericReplica for CrosswordReplica { let state_machine = StateMachine::new_and_setup(id).await?; // setup storage hub module - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; + let storage_hub = StorageHub::new_and_setup( + id, + Path::new(&config.backer_path), + if config.perf_storage_a == 0 && config.perf_storage_b == 0 { + None + } else { + Some((config.perf_storage_a, config.perf_storage_b)) + }, + ) + .await?; // setup transport hub module - let mut transport_hub = - TransportHub::new_and_setup(id, population, p2p_addr).await?; + let mut transport_hub = TransportHub::new_and_setup( + id, + population, + p2p_addr, + if config.perf_network_a == 0 && config.perf_network_b == 0 { + None + } else { + Some((config.perf_network_a, config.perf_network_b)) + }, + ) + .await?; // ask for the list of peers to proactively connect to. Do this after // transport hub has been set up, so that I will be able to accept diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 2431ff86..dfcad00c 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -23,6 +23,8 @@ use crate::protocols::SmrProtocol; use async_trait::async_trait; +use get_size::GetSize; + use serde::{Serialize, Deserialize}; use tokio::time::Duration; @@ -42,6 +44,12 @@ pub struct ReplicaConfigMultiPaxos { /// Whether to call `fsync()`/`fdatasync()` on logger. pub logger_sync: bool, + + // Performance simulation params (all zeros means no perf simulation): + pub perf_storage_a: u64, + pub perf_storage_b: u64, + pub perf_network_a: u64, + pub perf_network_b: u64, } #[allow(clippy::derivable_impls)] @@ -52,6 +60,10 @@ impl Default for ReplicaConfigMultiPaxos { max_batch_size: 5000, backer_path: "/tmp/summerset.multipaxos.wal".into(), logger_sync: false, + perf_storage_a: 0, + perf_storage_b: 0, + perf_network_a: 0, + perf_network_b: 0, } } } @@ -114,7 +126,7 @@ struct Instance { } /// Stable storage log entry type. 
-#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum LogEntry { /// Records an update to the largest prepare ballot seen. PrepareBal { slot: usize, ballot: Ballot }, @@ -131,7 +143,7 @@ enum LogEntry { } /// Peer-peer message type. -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, GetSize)] enum PeerMsg { /// Prepare message from leader to replicas. Prepare { slot: usize, ballot: Ballot }, @@ -989,7 +1001,9 @@ impl GenericReplica for MultiPaxosReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigMultiPaxos; batch_interval_us, max_batch_size, - backer_path, logger_sync)?; + backer_path, logger_sync, + perf_storage_a, perf_storage_b, + perf_network_a, perf_network_b)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -1002,13 +1016,29 @@ impl GenericReplica for MultiPaxosReplica { let state_machine = StateMachine::new_and_setup(id).await?; // setup storage hub module - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; + let storage_hub = StorageHub::new_and_setup( + id, + Path::new(&config.backer_path), + if config.perf_storage_a == 0 && config.perf_storage_b == 0 { + None + } else { + Some((config.perf_storage_a, config.perf_storage_b)) + }, + ) + .await?; // setup transport hub module - let mut transport_hub = - TransportHub::new_and_setup(id, population, p2p_addr).await?; + let mut transport_hub = TransportHub::new_and_setup( + id, + population, + p2p_addr, + if config.perf_network_a == 0 && config.perf_network_b == 0 { + None + } else { + Some((config.perf_network_a, config.perf_network_b)) + }, + ) + .await?; // ask for the list of peers to proactively connect to. Do this after // transport hub has been set up, so that I will be able to accept diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index ffbc57e1..a6f19997 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -18,6 +18,8 @@ use crate::protocols::SmrProtocol; use async_trait::async_trait; +use get_size::GetSize; + use serde::{Serialize, Deserialize}; use tokio::time::Duration; @@ -37,6 +39,10 @@ pub struct ReplicaConfigRepNothing { /// Whether to call `fsync()`/`fdatasync()` on logger. pub logger_sync: bool, + + // Performance simulation params (all zeros means no perf simulation): + pub perf_storage_a: u64, + pub perf_storage_b: u64, } #[allow(clippy::derivable_impls)] @@ -47,12 +53,14 @@ impl Default for ReplicaConfigRepNothing { max_batch_size: 5000, backer_path: "/tmp/summerset.rep_nothing.wal".into(), logger_sync: false, + perf_storage_a: 0, + perf_storage_b: 0, } } } /// Log entry type. 
-#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] struct LogEntry { reqs: Vec<(ClientId, ApiRequest)>, } @@ -296,7 +304,8 @@ impl GenericReplica for RepNothingReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigRepNothing; batch_interval_us, max_batch_size, - backer_path, logger_sync)?; + backer_path, logger_sync, + perf_storage_a, perf_storage_b)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -309,9 +318,16 @@ impl GenericReplica for RepNothingReplica { let state_machine = StateMachine::new_and_setup(id).await?; // setup storage hub module - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; + let storage_hub = StorageHub::new_and_setup( + id, + Path::new(&config.backer_path), + if config.perf_storage_a == 0 && config.perf_storage_b == 0 { + None + } else { + Some((config.perf_storage_a, config.perf_storage_b)) + }, + ) + .await?; // TransportHub is not needed in RepNothing diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index ada30d45..79d025de 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -19,6 +19,8 @@ use crate::protocols::SmrProtocol; use async_trait::async_trait; +use get_size::GetSize; + use serde::{Serialize, Deserialize}; use tokio::time::Duration; @@ -43,6 +45,12 @@ pub struct ReplicaConfigRSPaxos { /// Fault-tolerance level. pub fault_tolerance: u8, + + // Performance simulation params (all zeros means no perf simulation): + pub perf_storage_a: u64, + pub perf_storage_b: u64, + pub perf_network_a: u64, + pub perf_network_b: u64, } #[allow(clippy::derivable_impls)] @@ -54,6 +62,10 @@ impl Default for ReplicaConfigRSPaxos { backer_path: "/tmp/summerset.rs_paxos.wal".into(), logger_sync: false, fault_tolerance: 0, + perf_storage_a: 0, + perf_storage_b: 0, + perf_network_a: 0, + perf_network_b: 0, } } } @@ -116,7 +128,7 @@ struct Instance { } /// Stable storage log entry type. -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum LogEntry { /// Records an update to the largest prepare ballot seen. PrepareBal { slot: usize, ballot: Ballot }, @@ -133,7 +145,7 @@ enum LogEntry { } /// Peer-peer message type. -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, GetSize)] enum PeerMsg { /// Prepare message from leader to replicas. 
Prepare { slot: usize, ballot: Ballot }, @@ -1074,7 +1086,9 @@ impl GenericReplica for RSPaxosReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigRSPaxos; batch_interval_us, max_batch_size, - backer_path, logger_sync, fault_tolerance)?; + backer_path, logger_sync, fault_tolerance, + perf_storage_a, perf_storage_b, + perf_network_a, perf_network_b)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -1087,13 +1101,29 @@ impl GenericReplica for RSPaxosReplica { let state_machine = StateMachine::new_and_setup(id).await?; // setup storage hub module - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; + let storage_hub = StorageHub::new_and_setup( + id, + Path::new(&config.backer_path), + if config.perf_storage_a == 0 && config.perf_storage_b == 0 { + None + } else { + Some((config.perf_storage_a, config.perf_storage_b)) + }, + ) + .await?; // setup transport hub module - let mut transport_hub = - TransportHub::new_and_setup(id, population, p2p_addr).await?; + let mut transport_hub = TransportHub::new_and_setup( + id, + population, + p2p_addr, + if config.perf_network_a == 0 && config.perf_network_b == 0 { + None + } else { + Some((config.perf_network_a, config.perf_network_b)) + }, + ) + .await?; // ask for the list of peers to proactively connect to. Do this after // transport hub has been set up, so that I will be able to accept diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index 7d9aa763..73c1d068 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -19,6 +19,8 @@ use crate::protocols::SmrProtocol; use async_trait::async_trait; +use get_size::GetSize; + use serde::{Serialize, Deserialize}; use tokio::time::Duration; @@ -38,6 +40,12 @@ pub struct ReplicaConfigSimplePush { /// Number of peer servers to push each command to. pub rep_degree: u8, + + // Performance simulation params (all zeros means no perf simulation): + pub perf_storage_a: u64, + pub perf_storage_b: u64, + pub perf_network_a: u64, + pub perf_network_b: u64, } #[allow(clippy::derivable_impls)] @@ -48,12 +56,16 @@ impl Default for ReplicaConfigSimplePush { max_batch_size: 5000, backer_path: "/tmp/summerset.simple_push.wal".into(), rep_degree: 2, + perf_storage_a: 0, + perf_storage_b: 0, + perf_network_a: 0, + perf_network_b: 0, } } } /// Log entry type. -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum LogEntry { FromClient { reqs: Vec<(ClientId, ApiRequest)>, @@ -65,7 +77,7 @@ enum LogEntry { } /// Peer-peer message type. 
-#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, GetSize)] enum PushMsg { Push { src_inst_idx: usize, @@ -456,7 +468,9 @@ impl GenericReplica for SimplePushReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigSimplePush; batch_interval_us, max_batch_size, - backer_path, rep_degree)?; + backer_path, rep_degree, + perf_storage_a, perf_storage_b, + perf_network_a, perf_network_b)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -469,13 +483,29 @@ impl GenericReplica for SimplePushReplica { let state_machine = StateMachine::new_and_setup(id).await?; // setup storage hub module - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; + let storage_hub = StorageHub::new_and_setup( + id, + Path::new(&config.backer_path), + if config.perf_storage_a == 0 && config.perf_storage_b == 0 { + None + } else { + Some((config.perf_storage_a, config.perf_storage_b)) + }, + ) + .await?; // setup transport hub module - let mut transport_hub = - TransportHub::new_and_setup(id, population, p2p_addr).await?; + let mut transport_hub = TransportHub::new_and_setup( + id, + population, + p2p_addr, + if config.perf_network_a == 0 && config.perf_network_b == 0 { + None + } else { + Some((config.perf_network_a, config.perf_network_b)) + }, + ) + .await?; // ask for the list of peers to proactively connect to. Do this after // transport hub has been set up, so that I will be able to accept diff --git a/src/server/external.rs b/src/server/external.rs index c52a946c..769c1cd7 100644 --- a/src/server/external.rs +++ b/src/server/external.rs @@ -9,6 +9,8 @@ use crate::utils::{ use crate::server::{ReplicaId, Command, CommandResult}; use crate::client::ClientId; +use get_size::GetSize; + use bytes::BytesMut; use serde::{Serialize, Deserialize}; @@ -26,7 +28,7 @@ pub type RequestId = u64; /// Request received from client. // TODO: add information fields such as read-only flag... -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] pub enum ApiRequest { /// Regular request. Req { @@ -42,7 +44,7 @@ pub enum ApiRequest { } /// Reply back to client. -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] pub enum ApiReply { /// Reply to regular request. Reply { diff --git a/src/server/statemach.rs b/src/server/statemach.rs index fd2c7670..47196cf4 100644 --- a/src/server/statemach.rs +++ b/src/server/statemach.rs @@ -5,6 +5,8 @@ use std::collections::HashMap; use crate::utils::SummersetError; use crate::server::ReplicaId; +use get_size::GetSize; + use serde::{Serialize, Deserialize}; use tokio::sync::mpsc; @@ -14,7 +16,7 @@ use tokio::task::JoinHandle; pub type CommandId = u64; /// Command to the state machine. -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] pub enum Command { /// Get the value of given key. Get { key: String }, @@ -24,7 +26,7 @@ pub enum Command { } /// Command execution result returned by the state machine. -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] pub enum CommandResult { /// `Some(value)` if key is found in state machine, else `None`. 
Get { value: Option }, diff --git a/src/server/storage.rs b/src/server/storage.rs index 8522175e..5d0fc3cb 100644 --- a/src/server/storage.rs +++ b/src/server/storage.rs @@ -3,10 +3,13 @@ use std::fmt; use std::path::Path; use std::io::SeekFrom; +use std::sync::Arc; use crate::utils::SummersetError; use crate::server::ReplicaId; +use get_size::GetSize; + use serde::{Serialize, Deserialize, de::DeserializeOwned}; use rmp_serde::encode::to_vec as encode_to_vec; @@ -16,13 +19,14 @@ use tokio::fs::{self, File, OpenOptions}; use tokio::io::{AsyncReadExt, AsyncWriteExt, AsyncSeekExt}; use tokio::sync::mpsc; use tokio::task::JoinHandle; +use tokio::time::{self, Duration}; /// Log action ID type. pub type LogActionId = u64; /// Action command to the logger. File cursor will be positioned at EOF after /// every action. -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize, GetSize)] pub enum LogAction { /// Read a log entry out. Read { offset: usize }, @@ -45,7 +49,7 @@ pub enum LogAction { } /// Action result returned by the logger. -#[derive(Debug, Serialize, Deserialize, PartialEq)] +#[derive(Debug, Serialize, Deserialize, PartialEq, GetSize)] pub enum LogResult { /// `Some(entry)` if successful, else `None`. Read { entry: Option }, @@ -88,6 +92,7 @@ where + Clone + Serialize + DeserializeOwned + + GetSize + Send + Sync + 'static, @@ -99,6 +104,7 @@ where pub async fn new_and_setup( me: ReplicaId, path: &Path, + perf_a_b: Option<(u64, u64)>, // performance simulation params ) -> Result { // prepare backing file if !fs::try_exists(path).await? { @@ -111,11 +117,39 @@ where OpenOptions::new().read(true).write(true).open(path).await?; backer_file.seek(SeekFrom::End(0)).await?; // seek to EOF - let (tx_log, rx_log) = mpsc::unbounded_channel(); + let (tx_log, mut rx_log) = + mpsc::unbounded_channel::<(LogActionId, LogAction)>(); let (tx_ack, rx_ack) = mpsc::unbounded_channel(); - let logger_handle = - tokio::spawn(Self::logger_thread(me, backer_file, rx_log, tx_ack)); + // if doing performance delay simulation, add on-the-fly delay to + // each message received + let rx_log_true = if let Some((perf_a, perf_b)) = perf_a_b { + let (tx_log_delayed, rx_log_delayed) = mpsc::unbounded_channel(); + let tx_log_delayed_arc = Arc::new(tx_log_delayed); + + tokio::spawn(async move { + while let Some((id, log_action)) = rx_log.recv().await { + let tx_log_delayed_clone = tx_log_delayed_arc.clone(); + tokio::spawn(async move { + let approx_size = log_action.get_size() as u64; + let delay_ns = perf_a + approx_size * perf_b; + time::sleep(Duration::from_nanos(delay_ns)).await; + tx_log_delayed_clone.send((id, log_action)).unwrap(); + }); + } + }); + + rx_log_delayed + } else { + rx_log + }; + + let logger_handle = tokio::spawn(Self::logger_thread( + me, + backer_file, + rx_log_true, + tx_ack, + )); Ok(StorageHub { me, @@ -426,7 +460,7 @@ mod storage_tests { use super::*; use rmp_serde::encode::to_vec as encode_to_vec; - #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] + #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, GetSize)] struct TestEntry(String); async fn prepare_test_file(path: &str) -> Result { @@ -649,7 +683,7 @@ mod storage_tests { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn api_log_ack() -> Result<(), SummersetError> { let path = Path::new("/tmp/test-backer-6.log"); - let mut hub = StorageHub::new_and_setup(0, path).await?; + let mut hub = StorageHub::new_and_setup(0, path, None).await?; let entry = 
TestEntry("abcdefgh".into()); let entry_bytes = encode_to_vec(&entry)?; hub.submit_action(0, LogAction::Append { entry, sync: true })?; diff --git a/src/server/transport.rs b/src/server/transport.rs index e7ca2998..a6a30ec8 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -2,6 +2,7 @@ use std::fmt; use std::net::SocketAddr; +use std::sync::Arc; use crate::utils::{ SummersetError, Bitmap, safe_tcp_read, safe_tcp_write, tcp_bind_with_retry, @@ -9,6 +10,8 @@ use crate::utils::{ }; use crate::server::ReplicaId; +use get_size::GetSize; + use bytes::BytesMut; use serde::{Serialize, Deserialize, de::DeserializeOwned}; @@ -21,7 +24,7 @@ use tokio::task::JoinHandle; use tokio::time::{self, Duration}; /// Peer-peer message wrapper type that includes leave notification variants. -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum PeerMessage { /// Normal protocol-specific request. Msg { msg: Msg }, @@ -74,6 +77,7 @@ where + Clone + Serialize + DeserializeOwned + + GetSize + Send + Sync + 'static, @@ -85,12 +89,38 @@ where me: ReplicaId, population: u8, p2p_addr: SocketAddr, + perf_a_b: Option<(u64, u64)>, // performance simulation params ) -> Result { if population <= me { return logged_err!(me; "invalid population {}", population); } - let (tx_recv, rx_recv) = mpsc::unbounded_channel(); + let (tx_recv, mut rx_recv) = + mpsc::unbounded_channel::<(ReplicaId, PeerMessage)>(); + + // if doing performance delay simulation, add on-the-fly delay to + // each message received + let rx_recv_true = if let Some((perf_a, perf_b)) = perf_a_b { + let (tx_recv_delayed, rx_recv_delayed) = mpsc::unbounded_channel(); + let tx_recv_delayed_arc = Arc::new(tx_recv_delayed); + + tokio::spawn(async move { + while let Some((id, peer_msg)) = rx_recv.recv().await { + let tx_recv_delayed_clone = tx_recv_delayed_arc.clone(); + tokio::spawn(async move { + let approx_size = peer_msg.get_size() as u64; + let delay_ns = perf_a + approx_size * perf_b; + time::sleep(Duration::from_nanos(delay_ns)).await; + tx_recv_delayed_clone.send((id, peer_msg)).unwrap(); + }); + } + pf_error!("d"; "recv channel has been closed"); + }); + + rx_recv_delayed + } else { + rx_recv + }; let (tx_sends_write, tx_sends_read) = flashmap::new::< ReplicaId, @@ -119,7 +149,7 @@ where Ok(TransportHub { me, population, - rx_recv, + rx_recv: rx_recv_true, tx_sends: tx_sends_read, _peer_acceptor_handle: peer_acceptor_handle, tx_connect, @@ -661,7 +691,7 @@ mod transport_tests { use serde::{Serialize, Deserialize}; use tokio::sync::Barrier; - #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] + #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, GetSize)] struct TestMsg(String); #[tokio::test(flavor = "multi_thread", worker_threads = 2)] @@ -671,9 +701,13 @@ mod transport_tests { let barrier2 = barrier.clone(); tokio::spawn(async move { // replica 1 - let mut hub: TransportHub = - TransportHub::new_and_setup(1, 3, "127.0.0.1:53801".parse()?) - .await?; + let mut hub: TransportHub = TransportHub::new_and_setup( + 1, + 3, + "127.0.0.1:53801".parse()?, + None, + ) + .await?; barrier1.wait().await; hub.connect_to_peer(2, "127.0.0.1:53802".parse()?).await?; // recv a message from 0 @@ -696,9 +730,13 @@ mod transport_tests { }); tokio::spawn(async move { // replica 2 - let mut hub: TransportHub = - TransportHub::new_and_setup(2, 3, "127.0.0.1:53802".parse()?) 
- .await?; + let mut hub: TransportHub = TransportHub::new_and_setup( + 2, + 3, + "127.0.0.1:53802".parse()?, + None, + ) + .await?; barrier2.wait().await; // recv a message from 0 let (id, msg) = hub.recv_msg().await?; @@ -714,7 +752,7 @@ mod transport_tests { }); // replica 0 let mut hub: TransportHub = - TransportHub::new_and_setup(0, 3, "127.0.0.1:53800".parse()?) + TransportHub::new_and_setup(0, 3, "127.0.0.1:53800".parse()?, None) .await?; barrier.wait().await; hub.connect_to_peer(1, "127.0.0.1:53801".parse()?).await?; @@ -747,9 +785,13 @@ mod transport_tests { let barrier2 = barrier.clone(); tokio::spawn(async move { // replica 1/2 - let mut hub: TransportHub = - TransportHub::new_and_setup(1, 3, "127.0.0.1:54801".parse()?) - .await?; + let mut hub: TransportHub = TransportHub::new_and_setup( + 1, + 3, + "127.0.0.1:54801".parse()?, + None, + ) + .await?; barrier2.wait().await; // recv a message from 0 let (id, msg) = hub.recv_msg().await?; @@ -759,9 +801,13 @@ mod transport_tests { // leave and come back as 2 hub.leave().await?; time::sleep(Duration::from_millis(100)).await; - let mut hub: TransportHub = - TransportHub::new_and_setup(2, 3, "127.0.0.1:54802".parse()?) - .await?; + let mut hub: TransportHub = TransportHub::new_and_setup( + 2, + 3, + "127.0.0.1:54802".parse()?, + None, + ) + .await?; hub.connect_to_peer(0, "127.0.0.1:54800".parse()?).await?; // send a message to 0 hub.send_msg(TestMsg("hello".into()), 0)?; @@ -769,7 +815,7 @@ mod transport_tests { }); // replica 0 let mut hub: TransportHub = - TransportHub::new_and_setup(0, 3, "127.0.0.1:54800".parse()?) + TransportHub::new_and_setup(0, 3, "127.0.0.1:54800".parse()?, None) .await?; barrier.wait().await; hub.connect_to_peer(1, "127.0.0.1:54801".parse()?).await?; diff --git a/src/utils/rscoding.rs b/src/utils/rscoding.rs index 49c008a3..c8461c26 100644 --- a/src/utils/rscoding.rs +++ b/src/utils/rscoding.rs @@ -6,6 +6,8 @@ use std::marker::PhantomData; use crate::utils::{SummersetError, Bitmap}; +use get_size::GetSize; + use bytes::{BytesMut, BufMut}; use serde::{Serialize, Deserialize, de::DeserializeOwned}; @@ -43,6 +45,24 @@ pub struct RSCodeword { phantom: PhantomData, } +// implement `GetSize` trait for `RSCodeword`; the heap size is approximated +// simply by the sum of sizes of present shards +impl GetSize for RSCodeword +where + T: fmt::Debug + Clone + Serialize + DeserializeOwned + Send + Sync, +{ + fn get_heap_size(&self) -> usize { + self.shards + .iter() + .map(|s| if let Some(b) = s { b.len() } else { 0 }) + .sum() + } + + fn get_size(&self) -> usize { + Self::get_stack_size() + self.get_heap_size() + } +} + impl RSCodeword where T: fmt::Debug + Clone + Serialize + DeserializeOwned + Send + Sync, From 6bb7d3556aea1ad4df6268fb075370b64a40c5d4 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 20:39:17 +0800 Subject: [PATCH 23/89] minor updates to bench script --- scripts/local_bench.tmp.py | 3 ++- scripts/local_cluster.py | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 7107732b..fef021c1 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -1,3 +1,4 @@ +import sys import os import subprocess import statistics @@ -50,7 +51,7 @@ def wait_cluster_setup(proc, num_replicas): for line in iter(proc.stderr.readline, b""): l = line.decode() - # print(l, end="") + # print(l, end="", file=sys.stderr) if "manager" not in l and "accepting clients" in l: replica = int(l[l.find("(") + 1 : l.find(")")]) 
assert not accepting_clients[replica] diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index c4e0877c..9df8fe39 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -181,8 +181,6 @@ def launch_servers(protocol, num_replicas, release, config): def kill_spawned_procs(*args): for proc in server_procs: proc.terminate() - for proc in server_procs: - proc.wait() manager_proc.terminate() signal.signal(signal.SIGINT, kill_spawned_procs) From 27a0a2442479524b5a75bf5e1dc2c46900914052 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 20:43:31 +0800 Subject: [PATCH 24/89] minor updates to bench script --- scripts/local_cluster.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index 9df8fe39..c4e0877c 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -181,6 +181,8 @@ def launch_servers(protocol, num_replicas, release, config): def kill_spawned_procs(*args): for proc in server_procs: proc.terminate() + for proc in server_procs: + proc.wait() manager_proc.terminate() signal.signal(signal.SIGINT, kill_spawned_procs) From 28a64d1c1dc504f20b3483851deccdf8201ae077 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 21:00:56 +0800 Subject: [PATCH 25/89] fixing bench script hanging issue --- scripts/local_bench.tmp.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index fef021c1..e2f6c277 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -142,6 +142,9 @@ def bench_round( wait_cluster_setup(proc_cluster, num_replicas) proc_client = run_bench_client(protocol, value_size, put_ratio, length_s) + for line in iter(proc_client.stderr.readline, b""): + l = line.decode() + print(l, end="", file=sys.stderr) out, err = proc_client.communicate() proc_cluster.terminate() From 8ace60cc4d7d7508fdbf4f110f2b45873469a5d6 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 21:04:50 +0800 Subject: [PATCH 26/89] fixing bench script hanging issue --- scripts/local_bench.tmp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index e2f6c277..b1d59e2c 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -19,7 +19,7 @@ def do_cargo_build(): def run_process(cmd): # print("Run:", " ".join(cmd)) - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) return proc @@ -49,7 +49,7 @@ def launch_cluster(protocol, num_replicas, config): def wait_cluster_setup(proc, num_replicas): accepting_clients = [False for _ in range(num_replicas)] - for line in iter(proc.stderr.readline, b""): + for line in iter(proc.stdout.readline, b""): l = line.decode() # print(l, end="", file=sys.stderr) if "manager" not in l and "accepting clients" in l: @@ -142,7 +142,7 @@ def bench_round( wait_cluster_setup(proc_cluster, num_replicas) proc_client = run_bench_client(protocol, value_size, put_ratio, length_s) - for line in iter(proc_client.stderr.readline, b""): + for line in iter(proc_client.stdout.readline, b""): l = line.decode() print(l, end="", file=sys.stderr) out, err = proc_client.communicate() From 76c3fe99ebb916e04fd2aec9a2f231911aaa6a92 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 21:28:25 +0800 Subject: [PATCH 27/89] fixing bench script hanging issue --- scripts/local_bench.tmp.py | 25 
++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index b1d59e2c..2d01e4f3 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -19,16 +19,24 @@ def do_cargo_build(): def run_process(cmd): # print("Run:", " ".join(cmd)) - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) return proc def kill_all_matching(name, force=False): # print("Kill all:", name) assert name.count(" ") == 0 - cmd = "killall -9" if force else "killall" - cmd += f" {name} > /dev/null 2>&1" - os.system(cmd) + + pgrep_cmd = ["sudo", "pgrep", "-f", name] + pids = subprocess.check_output(pgrep_cmd, shell=True).decode() + + pids = pids.strip().split("\n") + for pid in pids: + pid = pid.strip() + if len(pid) > 0: + kill_cmd = f"sudo kill -9" if force else "sudo kill" + kill_cmd += f" {int(pid)} > /dev/null 2>&1" + os.system(kill_cmd) def launch_cluster(protocol, num_replicas, config): @@ -49,9 +57,9 @@ def launch_cluster(protocol, num_replicas, config): def wait_cluster_setup(proc, num_replicas): accepting_clients = [False for _ in range(num_replicas)] - for line in iter(proc.stdout.readline, b""): + for line in iter(proc.stderr.readline, b""): l = line.decode() - # print(l, end="", file=sys.stderr) + print(l, end="", file=sys.stderr) if "manager" not in l and "accepting clients" in l: replica = int(l[l.find("(") + 1 : l.find(")")]) assert not accepting_clients[replica] @@ -123,6 +131,8 @@ def bench_round( + f"s={shards_per_replica if shards_per_replica is not None else 'x':1} " + f"w%={put_ratio:<3d} {length_s:3d}s" ) + + kill_all_matching("local_cluster.py", force=True) kill_all_matching("summerset_client", force=True) kill_all_matching("summerset_server", force=True) kill_all_matching("summerset_manager", force=True) @@ -142,9 +152,6 @@ def bench_round( wait_cluster_setup(proc_cluster, num_replicas) proc_client = run_bench_client(protocol, value_size, put_ratio, length_s) - for line in iter(proc_client.stdout.readline, b""): - l = line.decode() - print(l, end="", file=sys.stderr) out, err = proc_client.communicate() proc_cluster.terminate() From 13ecd40f9b10b77acba1ce9207789080bd65d4e7 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 21:31:09 +0800 Subject: [PATCH 28/89] fixing bench script hanging issue --- scripts/local_bench.tmp.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 2d01e4f3..e60f98e6 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -26,17 +26,18 @@ def run_process(cmd): def kill_all_matching(name, force=False): # print("Kill all:", name) assert name.count(" ") == 0 - pgrep_cmd = ["sudo", "pgrep", "-f", name] - pids = subprocess.check_output(pgrep_cmd, shell=True).decode() - - pids = pids.strip().split("\n") - for pid in pids: - pid = pid.strip() - if len(pid) > 0: - kill_cmd = f"sudo kill -9" if force else "sudo kill" - kill_cmd += f" {int(pid)} > /dev/null 2>&1" - os.system(kill_cmd) + try: + pids = subprocess.check_output(pgrep_cmd, shell=True).decode() + pids = pids.strip().split("\n") + for pid in pids: + pid = pid.strip() + if len(pid) > 0: + kill_cmd = f"sudo kill -9" if force else "sudo kill" + kill_cmd += f" {int(pid)} > /dev/null 2>&1" + os.system(kill_cmd) + except subprocess.CalledProcessError: + pass def launch_cluster(protocol, 
num_replicas, config): From 5777e84cdd79b2463bf1bda50ffa006084631fa6 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 21:33:33 +0800 Subject: [PATCH 29/89] fixing bench script hanging issue --- scripts/local_bench.tmp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index e60f98e6..13c8b4ab 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -28,6 +28,7 @@ def kill_all_matching(name, force=False): assert name.count(" ") == 0 pgrep_cmd = ["sudo", "pgrep", "-f", name] try: + print("AAA") pids = subprocess.check_output(pgrep_cmd, shell=True).decode() pids = pids.strip().split("\n") for pid in pids: @@ -35,6 +36,7 @@ def kill_all_matching(name, force=False): if len(pid) > 0: kill_cmd = f"sudo kill -9" if force else "sudo kill" kill_cmd += f" {int(pid)} > /dev/null 2>&1" + print("BBB", kill_cmd) os.system(kill_cmd) except subprocess.CalledProcessError: pass From 461c8cc889106f67aa2b9bbbbf15cee496857e60 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 21:35:31 +0800 Subject: [PATCH 30/89] fixing bench script hanging issue --- scripts/local_bench.tmp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 13c8b4ab..ed308dbb 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -28,7 +28,7 @@ def kill_all_matching(name, force=False): assert name.count(" ") == 0 pgrep_cmd = ["sudo", "pgrep", "-f", name] try: - print("AAA") + print("AAA", pgrep_cmd) pids = subprocess.check_output(pgrep_cmd, shell=True).decode() pids = pids.strip().split("\n") for pid in pids: From 7264252c055548be6055b81f994739df945283e1 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 21:36:18 +0800 Subject: [PATCH 31/89] fixing bench script hanging issue --- scripts/local_bench.tmp.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index ed308dbb..671b5497 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -26,9 +26,8 @@ def run_process(cmd): def kill_all_matching(name, force=False): # print("Kill all:", name) assert name.count(" ") == 0 - pgrep_cmd = ["sudo", "pgrep", "-f", name] try: - print("AAA", pgrep_cmd) + pgrep_cmd = f"sudo pgrep -f {name}" pids = subprocess.check_output(pgrep_cmd, shell=True).decode() pids = pids.strip().split("\n") for pid in pids: @@ -36,7 +35,6 @@ def kill_all_matching(name, force=False): if len(pid) > 0: kill_cmd = f"sudo kill -9" if force else "sudo kill" kill_cmd += f" {int(pid)} > /dev/null 2>&1" - print("BBB", kill_cmd) os.system(kill_cmd) except subprocess.CalledProcessError: pass From 3c07397c30364f0f58d993885122769c1a7d673e Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Fri, 1 Sep 2023 18:35:21 +0800 Subject: [PATCH 32/89] staging progress on log recovery --- src/protocols/crossword.rs | 1 + src/protocols/multipaxos.rs | 1 + src/protocols/rep_nothing.rs | 68 ++++++++++++++++++++++++++++++++++++ src/protocols/rs_paxos.rs | 1 + src/protocols/simple_push.rs | 1 + src/server/storage.rs | 49 +++++++++++++++----------- 6 files changed, 101 insertions(+), 20 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 5aff1728..012007b4 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -1310,6 +1310,7 @@ impl GenericReplica for CrosswordReplica { self.is_leader = true; } + // main event loop loop { tokio::select! 
{ // client request batch diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index dfcad00c..a0d06a19 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -1105,6 +1105,7 @@ impl GenericReplica for MultiPaxosReplica { self.is_leader = true; } + // main event loop loop { tokio::select! { // client request batch diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index a6f19997..849e3e05 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -287,6 +287,70 @@ impl RepNothingReplica { _ => Ok(None), // ignore all other types } } + + /// Recover state from durable storage log. + async fn recover_from_log(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.log_offset, 0); + loop { + // using 0 as a special log action ID + self.storage_hub.submit_action( + 0, + LogAction::Read { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(entry), + end_offset, + } => { + // execute all commands on state machine synchronously + for (_, req) in entry.reqs.clone() { + if let ApiRequest::Req { cmd, .. } = req { + // using 0 as a special command ID + self.state_machine.submit_cmd(0, cmd)?; + let _ = self.state_machine.get_result().await?; + } + } + // rebuild in-memory log + let num_reqs = entry.reqs.len(); + self.insts.push(Instance { + reqs: entry.reqs, + durable: true, + execed: vec![true; num_reqs], + }); + // update log offset + self.log_offset = end_offset; + } + LogResult::Read { entry: None, .. } => { + // end of log reached + break; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + } + + // do an extra Truncate to remove paritial entry at the end if any + self.storage_hub.submit_action( + 0, + LogAction::Truncate { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + if let LogResult::Truncate { + offset_ok: true, .. + } = log_result + { + Ok(()) + } else { + logged_err!(self.id; "unexpected log result type") + } + } } #[async_trait] @@ -367,6 +431,10 @@ impl GenericReplica for RepNothingReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { + // recover state from durable storage log + self.recover_from_log().await?; + + // main event loop loop { tokio::select! { // client request batch diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 79d025de..8dbbafb8 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -1203,6 +1203,7 @@ impl GenericReplica for RSPaxosReplica { self.is_leader = true; } + // main event loop loop { tokio::select! { // client request batch diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index 73c1d068..9deb6775 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -560,6 +560,7 @@ impl GenericReplica for SimplePushReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { + // main event loop loop { tokio::select! { // client request batch diff --git a/src/server/storage.rs b/src/server/storage.rs index 5d0fc3cb..99809e65 100644 --- a/src/server/storage.rs +++ b/src/server/storage.rs @@ -52,7 +52,10 @@ pub enum LogAction { #[derive(Debug, Serialize, Deserialize, PartialEq, GetSize)] pub enum LogResult { /// `Some(entry)` if successful, else `None`. - Read { entry: Option }, + Read { + entry: Option, + end_offset: usize, + }, /// `ok` is true if offset is valid, else false. 
`now_size` is the size /// of file after this. @@ -198,7 +201,7 @@ where backer: &mut File, file_size: usize, offset: usize, - ) -> Result, SummersetError> { + ) -> Result<(Option, usize), SummersetError> { if offset + 8 > file_size { pf_warn!( me; @@ -206,7 +209,7 @@ where offset + 8, file_size ); - return Ok(None); + return Ok((None, offset)); } // read entry length header @@ -216,7 +219,7 @@ where if offset_e > file_size { pf_warn!(me; "read entry invalid length {}", entry_len); backer.seek(SeekFrom::End(0)).await?; // recover cursor to EOF - return Ok(None); + return Ok((None, offset)); } // read entry content @@ -224,7 +227,7 @@ where backer.read_exact(&mut entry_buf[..]).await?; let entry = decode_from_slice(&entry_buf)?; backer.seek(SeekFrom::End(0)).await?; // recover cursor to EOF - Ok(Some(entry)) + Ok((Some(entry), offset_e)) } /// Write given entry to given offset. @@ -366,9 +369,9 @@ where ) -> Result, SummersetError> { match action { LogAction::Read { offset } => { - Self::read_entry(me, backer, *file_size, offset) - .await - .map(|entry| LogResult::Read { entry }) + Self::read_entry(me, backer, *file_size, offset).await.map( + |(entry, end_offset)| LogResult::Read { entry, end_offset }, + ) } LogAction::Write { entry, @@ -543,40 +546,45 @@ mod storage_tests { let mut backer_file = prepare_test_file("/tmp/test-backer-2.log").await?; let entry = TestEntry("test-entry-dummy-string".into()); - let now_size = + let mid_size = StorageHub::append_entry(0, &mut backer_file, 0, &entry, false) .await?; - let now_size = StorageHub::append_entry( + let end_size = StorageHub::append_entry( 0, &mut backer_file, - now_size, + mid_size, &entry, true, ) .await?; assert_eq!( - StorageHub::read_entry(0, &mut backer_file, now_size, 0).await?, - Some(TestEntry("test-entry-dummy-string".into())) + StorageHub::read_entry(0, &mut backer_file, end_size, mid_size) + .await?, + (Some(TestEntry("test-entry-dummy-string".into())), end_size) + ); + assert_eq!( + StorageHub::read_entry(0, &mut backer_file, end_size, 0).await?, + (Some(TestEntry("test-entry-dummy-string".into())), mid_size) ); assert_eq!( StorageHub::::read_entry( 0, &mut backer_file, - now_size, - now_size + 10 + end_size, + mid_size + 10 ) .await?, - None + (None, mid_size + 10) ); assert_eq!( StorageHub::::read_entry( 0, &mut backer_file, - now_size, - now_size - 4 + mid_size, + mid_size - 4 ) .await?, - None + (None, mid_size - 4) ); Ok(()) } @@ -703,7 +711,8 @@ mod storage_tests { ( 1, LogResult::Read { - entry: Some(TestEntry("abcdefgh".into())) + entry: Some(TestEntry("abcdefgh".into())), + end_offset: 8 + entry_bytes.len(), } ) ); From d3d607db36102dcfcda26ae50638109fc0f0ec50 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Fri, 1 Sep 2023 18:38:42 +0800 Subject: [PATCH 33/89] fixing bench script hanging issue --- scripts/local_bench.tmp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 671b5497..ee007de3 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -6,8 +6,8 @@ PERF_STORAGE_ALPHA = 0 PERF_STORAGE_BETA = 0 -PERF_NETWORK_ALPHA = 10000 -PERF_NETWORK_BETA = 100 +PERF_NETWORK_ALPHA = 1000 +PERF_NETWORK_BETA = 10 def do_cargo_build(): From 9e711dd334de549938cfca06c16270581a8fc7a3 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 7 Sep 2023 14:09:16 +0800 Subject: [PATCH 34/89] add crash restart test --- summerset_client/src/clients/repl.rs | 72 +++++++++++++++++++------- summerset_client/src/clients/tester.rs | 40 
+++++++++----- 2 files changed, 79 insertions(+), 33 deletions(-) diff --git a/summerset_client/src/clients/repl.rs b/summerset_client/src/clients/repl.rs index c3f29a1a..0d3b60c0 100644 --- a/summerset_client/src/clients/repl.rs +++ b/summerset_client/src/clients/repl.rs @@ -15,6 +15,24 @@ use summerset::{ /// Prompt string at the start of line. const PROMPT: &str = ">>>>> "; +/// Recognizable command types. +enum ReplCommand { + /// Normal state machine replication command. + Normal(Command), + + /// Reconnect to the service. + Reconnect, + + /// Print help message. + PrintHelp, + + /// Client exit. + Exit, + + /// Nothing read. + Nothing, +} + /// Interactive REPL-style client struct. pub struct ClientRepl { /// Closed-loop request driver. @@ -47,6 +65,7 @@ impl ClientRepl { println!("HELP: Supported commands are:"); println!(" get "); println!(" put "); + println!(" reconnect"); println!(" help"); println!(" exit"); println!( @@ -56,17 +75,16 @@ impl ClientRepl { } /// Reads in user input and parses into a command. - fn read_command(&mut self) -> Result, SummersetError> { + fn read_command(&mut self) -> Result { self.input_buf.clear(); let nread = io::stdin().read_line(&mut self.input_buf)?; if nread == 0 { - println!("Exitting..."); - return Ok(None); + return Ok(ReplCommand::Exit); } let line: &str = self.input_buf.trim(); if line.is_empty() { - return Err(SummersetError("".into())); + return Ok(ReplCommand::Nothing); } // split input line by whitespaces, getting an iterator of segments @@ -86,7 +104,7 @@ impl ClientRepl { } // keys and values are kept as-is, no case conversions - Ok(Some(Command::Get { + Ok(ReplCommand::Normal(Command::Get { key: key.unwrap().into(), })) } @@ -105,21 +123,17 @@ impl ClientRepl { return Err(err); } - Ok(Some(Command::Put { + Ok(ReplCommand::Normal(Command::Put { key: key.unwrap().into(), value: value.unwrap().into(), })) } - "help" => { - self.print_help(None); - Err(SummersetError("".into())) - } + "help" => Ok(ReplCommand::PrintHelp), - "exit" => { - println!("Exitting..."); - Ok(None) - } + "reconnect" => Ok(ReplCommand::Reconnect), + + "exit" => Ok(ReplCommand::Exit), _ => { let err = SummersetError(format!( @@ -174,14 +188,32 @@ impl ClientRepl { self.print_prompt(); let cmd = self.read_command()?; - if cmd.is_none() { - return Ok(false); - } + match cmd { + ReplCommand::Exit => { + println!("Exitting..."); + Ok(false) + } + + ReplCommand::Nothing => Ok(true), - let result = self.eval_command(cmd.unwrap()).await?; + ReplCommand::Reconnect => { + println!("Reconnecting..."); + self.driver.leave(false).await?; + self.driver.connect().await?; + Ok(true) + } - self.print_result(result); - Ok(true) + ReplCommand::PrintHelp => { + self.print_help(None); + Ok(true) + } + + ReplCommand::Normal(cmd) => { + let result = self.eval_command(cmd).await?; + self.print_result(result); + Ok(true) + } + } } /// Runs the infinite REPL loop. diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index 9ad2c3d6..b097256e 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -18,8 +18,8 @@ use serde::Deserialize; use tokio::time::Duration; use summerset::{ - GenericEndpoint, CommandResult, RequestId, CtrlRequest, CtrlReply, - SummersetError, pf_error, logged_err, parsed_config, + ReplicaId, GenericEndpoint, CommandResult, RequestId, CtrlRequest, + CtrlReply, SummersetError, pf_error, logged_err, parsed_config, }; lazy_static! 
{ @@ -215,15 +215,16 @@ impl ClientTester { } } - /// Resets all servers in the cluster to initial empty state. - async fn reset_cluster(&mut self) -> Result<(), SummersetError> { + /// Resets some server(s) in the cluster. + async fn reset_server( + &mut self, + server: Option, + durable: bool, + ) -> Result<(), SummersetError> { let ctrl_stub = self.driver.ctrl_stub(); // send ResetServer request to manager - let req = CtrlRequest::ResetServer { - server: None, - durable: false, - }; + let req = CtrlRequest::ResetServer { server, durable }; let mut sent = ctrl_stub.send_req(Some(&req))?; while !sent { sent = ctrl_stub.send_req(None)?; @@ -243,7 +244,7 @@ impl ClientTester { name: &str, ) -> Result<(), SummersetError> { // reset everything to initial state at the start of each test - self.reset_cluster().await?; + self.reset_server(None, false).await?; self.driver.connect().await?; self.cached_replies.clear(); @@ -330,18 +331,31 @@ impl ClientTester { /// Client leaves and reconnects. async fn test_reconnect(&mut self) -> Result<(), SummersetError> { - let v0 = Self::gen_rand_string(8); - let mut req_id = self.issue_put("Jose", &v0)?; + let v = Self::gen_rand_string(8); + let mut req_id = self.issue_put("Jose", &v)?; self.expect_put_reply(req_id, Some(None), 1).await?; self.driver.leave(false).await?; self.driver.connect().await?; req_id = self.issue_get("Jose")?; - self.expect_get_reply(req_id, Some(Some(&v0)), 1).await?; + self.expect_get_reply(req_id, Some(Some(&v)), 1).await?; Ok(()) } /// Replica node crashes and restarts. async fn test_crash_restart(&mut self) -> Result<(), SummersetError> { - todo!("TODO") + let v = Self::gen_rand_string(8); + let mut req_id = self.issue_put("Jose", &v)?; + self.expect_put_reply(req_id, Some(None), 1).await?; + self.driver.leave(false).await?; + self.reset_server(Some(1), true).await?; + self.driver.connect().await?; + req_id = self.issue_get("Jose")?; + self.expect_get_reply(req_id, Some(Some(&v)), 1).await?; + self.driver.leave(false).await?; + self.reset_server(Some(0), true).await?; + self.driver.connect().await?; + req_id = self.issue_get("Jose")?; + self.expect_get_reply(req_id, Some(Some(&v)), 1).await?; + Ok(()) } } From f69c8254d7aa8162e4128777af685d65fd3f508a Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 7 Sep 2023 14:25:05 +0800 Subject: [PATCH 35/89] add log recovery logic to SimplePush --- src/protocols/simple_push.rs | 79 ++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index 9deb6775..dcd5204e 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -72,6 +72,7 @@ enum LogEntry { }, PeerPushed { peer: ReplicaId, + src_inst_idx: usize, reqs: Vec<(ClientId, ApiRequest)>, }, } @@ -285,6 +286,7 @@ impl SimplePushReplica { // submit log action to make this instance durable let log_entry = LogEntry::PeerPushed { peer, + src_inst_idx, reqs: req_batch.clone(), }; self.storage_hub.submit_action( @@ -450,6 +452,80 @@ impl SimplePushReplica { _ => Ok(None), // ignore all other types } } + + /// Recover state from durable storage log. 
+ async fn recover_from_log(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.log_offset, 0); + loop { + // using 0 as a special log action ID + self.storage_hub.submit_action( + 0, + LogAction::Read { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(entry), + end_offset, + } => { + let (from_peer, reqs) = match entry { + LogEntry::FromClient { reqs } => (None, reqs), + LogEntry::PeerPushed { + peer, + src_inst_idx, + reqs, + } => (Some((peer, src_inst_idx)), reqs), + }; + // execute all commands on state machine synchronously + for (_, req) in reqs.clone() { + if let ApiRequest::Req { cmd, .. } = req { + // using 0 as a special command ID + self.state_machine.submit_cmd(0, cmd)?; + let _ = self.state_machine.get_result().await?; + } + } + // rebuild in-memory log + let num_reqs = reqs.len(); + self.insts.push(Instance { + reqs, + durable: true, + pending_peers: Bitmap::new(self.population, false), + execed: vec![true; num_reqs], + from_peer, + }); + // update log offset + self.log_offset = end_offset; + } + LogResult::Read { entry: None, .. } => { + // end of log reached + break; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + } + + // do an extra Truncate to remove paritial entry at the end if any + self.storage_hub.submit_action( + 0, + LogAction::Truncate { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + if let LogResult::Truncate { + offset_ok: true, .. + } = log_result + { + Ok(()) + } else { + logged_err!(self.id; "unexpected log result type") + } + } } #[async_trait] @@ -560,6 +636,9 @@ impl GenericReplica for SimplePushReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { + // recover state from durable storage log + self.recover_from_log().await?; + // main event loop loop { tokio::select! { From e24e464bc826ef93391c42fb7a581e29b6ef3cb9 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 7 Sep 2023 20:16:00 +0800 Subject: [PATCH 36/89] add log recovery logic to Paxos variants --- src/protocols/crossword.rs | 163 ++++++++++++++++++++++ src/protocols/multipaxos.rs | 142 +++++++++++++++++++ src/protocols/rep_nothing.rs | 2 +- src/protocols/rs_paxos.rs | 163 ++++++++++++++++++++++ src/protocols/simple_push.rs | 2 +- summerset_client/src/clients/tester.rs | 8 +- summerset_client/src/drivers/open_loop.rs | 7 +- 7 files changed, 479 insertions(+), 8 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 012007b4..9ef9b2b5 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -1168,6 +1168,166 @@ impl CrosswordReplica { _ => Ok(None), // ignore all other types } } + + /// Apply a durable storage log entry for recovery. 
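The recover_from_log routines added in these patches all follow the same replay-then-truncate shape: read WAL entries one at a time, apply each to in-memory state synchronously, keep track of how far the durable log is valid, then truncate whatever partial entry a crash may have left at the tail. A minimal self-contained Rust sketch of that shape, where ToyLog is a hypothetical stand-in and not the crate's StorageHub API:

// Toy write-ahead log: a vector of entries plus a flag standing in for a
// torn partial entry at the tail.
struct ToyLog {
    entries: Vec<u64>,
    partial_tail: bool,
}

impl ToyLog {
    // Read the entry at `offset`; return it with the next offset, or None at EOF.
    fn read(&self, offset: usize) -> Option<(u64, usize)> {
        self.entries.get(offset).map(|e| (*e, offset + 1))
    }

    // Drop anything after `offset` (here just the torn tail).
    fn truncate(&mut self, _offset: usize) {
        self.partial_tail = false;
    }
}

// Replay the whole log into in-memory state, then truncate the torn tail.
fn recover(log: &mut ToyLog) -> Vec<u64> {
    let mut state = Vec::new();
    let mut offset = 0;
    while let Some((entry, end_offset)) = log.read(offset) {
        state.push(entry);   // "execute" the entry synchronously
        offset = end_offset; // advance the valid log offset
    }
    log.truncate(offset);    // remove any partial entry left by a crash
    state
}

fn main() {
    let mut log = ToyLog { entries: vec![1, 2, 3], partial_tail: true };
    assert_eq!(recover(&mut log), vec![1, 2, 3]);
    assert!(!log.partial_tail);
}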
+ async fn recover_apply_entry( + &mut self, + entry: LogEntry, + ) -> Result<(), SummersetError> { + match entry { + LogEntry::PrepareBal { slot, ballot } => { + // locate instance in memory, filling in null instances if needed + while self.insts.len() <= slot { + self.insts.push(Instance { + bal: 0, + status: Status::Null, + reqs_cw: RSCodeword::::from_null( + self.quorum_cnt, + self.population - self.quorum_cnt, + )?, + leader_bk: None, + replica_bk: None, + }); + } + // update instance state + let inst = &mut self.insts[slot]; + inst.bal = ballot; + inst.status = Status::Preparing; + // update bal_prep_sent and bal_max_seen, reset bal_prepared + if self.bal_prep_sent < ballot { + self.bal_prep_sent = ballot; + } + if self.bal_max_seen < ballot { + self.bal_max_seen = ballot; + } + self.bal_prepared = 0; + } + + LogEntry::AcceptData { + slot, + ballot, + reqs_cw, + } => { + // locate instance in memory, filling in null instances if needed + while self.insts.len() <= slot { + self.insts.push(Instance { + bal: 0, + status: Status::Null, + reqs_cw: RSCodeword::::from_null( + self.quorum_cnt, + self.population - self.quorum_cnt, + )?, + leader_bk: None, + replica_bk: None, + }); + } + // update instance state + let inst = &mut self.insts[slot]; + inst.bal = ballot; + inst.status = Status::Accepting; + inst.reqs_cw = reqs_cw; + // update bal_prepared and bal_max_seen + if self.bal_prepared < ballot { + self.bal_prepared = ballot; + } + if self.bal_max_seen < ballot { + self.bal_max_seen = ballot; + } + assert!(self.bal_prepared <= self.bal_prep_sent); + } + + LogEntry::CommitSlot { slot } => { + assert!(slot < self.insts.len()); + // update instance state + self.insts[slot].status = Status::Committed; + // submit commands in contiguously committed instance to the + // state machine + if slot == self.commit_bar { + while self.commit_bar < self.insts.len() { + let inst = &mut self.insts[self.commit_bar]; + if inst.status < Status::Committed { + break; + } + // check number of available shards + if inst.reqs_cw.avail_shards() < self.quorum_cnt { + // can't execute if I don't have the complete request batch + break; + } else if inst.reqs_cw.avail_data_shards() + < self.quorum_cnt + { + // have enough shards but need reconstruction + inst.reqs_cw + .reconstruct_data(Some(&self.rs_coder))?; + } + // execute all commands in this instance on state machine + // synchronously + for (_, req) in inst.reqs_cw.get_data()?.clone() { + if let ApiRequest::Req { cmd, .. } = req { + // using 0 as a special command ID + self.state_machine.submit_cmd(0, cmd)?; + let _ = self.state_machine.get_result().await?; + } + } + // update commit_bar and exec_bar + self.commit_bar += 1; + self.exec_bar += 1; + } + } + } + } + + Ok(()) + } + + /// Recover state from durable storage log. + async fn recover_from_log(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.log_offset, 0); + loop { + // using 0 as a special log action ID + self.storage_hub.submit_action( + 0, + LogAction::Read { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(entry), + end_offset, + } => { + self.recover_apply_entry(entry).await?; + // update log offset + self.log_offset = end_offset; + } + LogResult::Read { entry: None, .. 
} => { + // end of log reached + break; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + } + + // do an extra Truncate to remove paritial entry at the end if any + self.storage_hub.submit_action( + 0, + LogAction::Truncate { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + if let LogResult::Truncate { + offset_ok: true, .. + } = log_result + { + Ok(()) + } else { + logged_err!(self.id; "unexpected log result type") + } + } } #[async_trait] @@ -1305,6 +1465,9 @@ impl GenericReplica for CrosswordReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { + // recover state from durable storage log + self.recover_from_log().await?; + // TODO: proper leader election if self.id == 0 { self.is_leader = true; diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index a0d06a19..5ec372b9 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -983,6 +983,145 @@ impl MultiPaxosReplica { _ => Ok(None), // ignore all other types } } + + /// Apply a durable storage log entry for recovery. + async fn recover_apply_entry( + &mut self, + entry: LogEntry, + ) -> Result<(), SummersetError> { + match entry { + LogEntry::PrepareBal { slot, ballot } => { + // locate instance in memory, filling in null instances if needed + while self.insts.len() <= slot { + self.insts.push(Instance { + bal: 0, + status: Status::Null, + reqs: Vec::new(), + leader_bk: None, + replica_bk: None, + }); + } + // update instance state + let inst = &mut self.insts[slot]; + inst.bal = ballot; + inst.status = Status::Preparing; + // update bal_prep_sent and bal_max_seen, reset bal_prepared + if self.bal_prep_sent < ballot { + self.bal_prep_sent = ballot; + } + if self.bal_max_seen < ballot { + self.bal_max_seen = ballot; + } + self.bal_prepared = 0; + } + + LogEntry::AcceptData { slot, ballot, reqs } => { + // locate instance in memory, filling in null instances if needed + while self.insts.len() <= slot { + self.insts.push(Instance { + bal: 0, + status: Status::Null, + reqs: Vec::new(), + leader_bk: None, + replica_bk: None, + }); + } + // update instance state + let inst = &mut self.insts[slot]; + inst.bal = ballot; + inst.status = Status::Accepting; + inst.reqs = reqs; + // update bal_prepared and bal_max_seen + if self.bal_prepared < ballot { + self.bal_prepared = ballot; + } + if self.bal_max_seen < ballot { + self.bal_max_seen = ballot; + } + assert!(self.bal_prepared <= self.bal_prep_sent); + } + + LogEntry::CommitSlot { slot } => { + assert!(slot < self.insts.len()); + // update instance state + self.insts[slot].status = Status::Committed; + // submit commands in contiguously committed instance to the + // state machine + if slot == self.commit_bar { + while self.commit_bar < self.insts.len() { + let inst = &mut self.insts[self.commit_bar]; + if inst.status < Status::Committed { + break; + } + // execute all commands in this instance on state machine + // synchronously + for (_, req) in inst.reqs.clone() { + if let ApiRequest::Req { cmd, .. } = req { + // using 0 as a special command ID + self.state_machine.submit_cmd(0, cmd)?; + let _ = self.state_machine.get_result().await?; + } + } + // update commit_bar and exec_bar + self.commit_bar += 1; + self.exec_bar += 1; + } + } + } + } + + Ok(()) + } + + /// Recover state from durable storage log. 
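For the Paxos variants, recover_apply_entry replays three entry kinds: PrepareBal bumps a slot to Preparing and refreshes the prepare ballots, AcceptData fills in the slot's payload and moves it to Accepting, and CommitSlot marks it Committed so that contiguously committed slots can be re-executed. A toy, self-contained replay of just the per-slot bookkeeping (the types below are simplified stand-ins, not the crate's real Instance and Status definitions):

#[derive(PartialEq, Debug)]
enum ToyStatus {
    Null,
    Preparing,
    Accepting,
    Committed,
}

enum ToyEntry {
    PrepareBal { slot: usize, ballot: u64 },
    AcceptData { slot: usize, ballot: u64 },
    CommitSlot { slot: usize },
}

// Rebuild (ballot, status) per slot from a replayed entry sequence.
fn replay(entries: &[ToyEntry]) -> Vec<(u64, ToyStatus)> {
    let mut insts: Vec<(u64, ToyStatus)> = Vec::new();
    for entry in entries {
        match entry {
            ToyEntry::PrepareBal { slot, ballot } => {
                while insts.len() <= *slot {
                    insts.push((0, ToyStatus::Null)); // fill in null instances
                }
                insts[*slot] = (*ballot, ToyStatus::Preparing);
            }
            ToyEntry::AcceptData { slot, ballot } => {
                while insts.len() <= *slot {
                    insts.push((0, ToyStatus::Null));
                }
                insts[*slot] = (*ballot, ToyStatus::Accepting);
            }
            ToyEntry::CommitSlot { slot } => {
                insts[*slot].1 = ToyStatus::Committed;
            }
        }
    }
    insts
}

fn main() {
    let wal = [
        ToyEntry::PrepareBal { slot: 0, ballot: 7 },
        ToyEntry::AcceptData { slot: 0, ballot: 7 },
        ToyEntry::CommitSlot { slot: 0 },
    ];
    assert_eq!(replay(&wal), vec![(7, ToyStatus::Committed)]);
}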
+ async fn recover_from_log(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.log_offset, 0); + loop { + // using 0 as a special log action ID + self.storage_hub.submit_action( + 0, + LogAction::Read { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(entry), + end_offset, + } => { + self.recover_apply_entry(entry).await?; + // update log offset + self.log_offset = end_offset; + } + LogResult::Read { entry: None, .. } => { + // end of log reached + break; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + } + + // do an extra Truncate to remove paritial entry at the end if any + self.storage_hub.submit_action( + 0, + LogAction::Truncate { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + if let LogResult::Truncate { + offset_ok: true, .. + } = log_result + { + Ok(()) + } else { + logged_err!(self.id; "unexpected log result type") + } + } } #[async_trait] @@ -1100,6 +1239,9 @@ impl GenericReplica for MultiPaxosReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { + // recover state from durable storage log + self.recover_from_log().await?; + // TODO: proper leader election if self.id == 0 { self.is_leader = true; diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index 849e3e05..6475de8d 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -314,7 +314,7 @@ impl RepNothingReplica { let _ = self.state_machine.get_result().await?; } } - // rebuild in-memory log + // rebuild in-memory log entry let num_reqs = entry.reqs.len(); self.insts.push(Instance { reqs: entry.reqs, diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 8dbbafb8..ed66f6ea 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -1068,6 +1068,166 @@ impl RSPaxosReplica { _ => Ok(None), // ignore all other types } } + + /// Apply a durable storage log entry for recovery. 
+ async fn recover_apply_entry( + &mut self, + entry: LogEntry, + ) -> Result<(), SummersetError> { + match entry { + LogEntry::PrepareBal { slot, ballot } => { + // locate instance in memory, filling in null instances if needed + while self.insts.len() <= slot { + self.insts.push(Instance { + bal: 0, + status: Status::Null, + reqs_cw: RSCodeword::::from_null( + self.quorum_cnt, + self.population - self.quorum_cnt, + )?, + leader_bk: None, + replica_bk: None, + }); + } + // update instance state + let inst = &mut self.insts[slot]; + inst.bal = ballot; + inst.status = Status::Preparing; + // update bal_prep_sent and bal_max_seen, reset bal_prepared + if self.bal_prep_sent < ballot { + self.bal_prep_sent = ballot; + } + if self.bal_max_seen < ballot { + self.bal_max_seen = ballot; + } + self.bal_prepared = 0; + } + + LogEntry::AcceptData { + slot, + ballot, + reqs_cw, + } => { + // locate instance in memory, filling in null instances if needed + while self.insts.len() <= slot { + self.insts.push(Instance { + bal: 0, + status: Status::Null, + reqs_cw: RSCodeword::::from_null( + self.quorum_cnt, + self.population - self.quorum_cnt, + )?, + leader_bk: None, + replica_bk: None, + }); + } + // update instance state + let inst = &mut self.insts[slot]; + inst.bal = ballot; + inst.status = Status::Accepting; + inst.reqs_cw = reqs_cw; + // update bal_prepared and bal_max_seen + if self.bal_prepared < ballot { + self.bal_prepared = ballot; + } + if self.bal_max_seen < ballot { + self.bal_max_seen = ballot; + } + assert!(self.bal_prepared <= self.bal_prep_sent); + } + + LogEntry::CommitSlot { slot } => { + assert!(slot < self.insts.len()); + // update instance state + self.insts[slot].status = Status::Committed; + // submit commands in contiguously committed instance to the + // state machine + if slot == self.commit_bar { + while self.commit_bar < self.insts.len() { + let inst = &mut self.insts[self.commit_bar]; + if inst.status < Status::Committed { + break; + } + // check number of available shards + if inst.reqs_cw.avail_shards() < self.quorum_cnt { + // can't execute if I don't have the complete request batch + break; + } else if inst.reqs_cw.avail_data_shards() + < self.quorum_cnt + { + // have enough shards but need reconstruction + inst.reqs_cw + .reconstruct_data(Some(&self.rs_coder))?; + } + // execute all commands in this instance on state machine + // synchronously + for (_, req) in inst.reqs_cw.get_data()?.clone() { + if let ApiRequest::Req { cmd, .. } = req { + // using 0 as a special command ID + self.state_machine.submit_cmd(0, cmd)?; + let _ = self.state_machine.get_result().await?; + } + } + // update commit_bar and exec_bar + self.commit_bar += 1; + self.exec_bar += 1; + } + } + } + } + + Ok(()) + } + + /// Recover state from durable storage log. + async fn recover_from_log(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.log_offset, 0); + loop { + // using 0 as a special log action ID + self.storage_hub.submit_action( + 0, + LogAction::Read { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(entry), + end_offset, + } => { + self.recover_apply_entry(entry).await?; + // update log offset + self.log_offset = end_offset; + } + LogResult::Read { entry: None, .. 
} => { + // end of log reached + break; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + } + + // do an extra Truncate to remove paritial entry at the end if any + self.storage_hub.submit_action( + 0, + LogAction::Truncate { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + if let LogResult::Truncate { + offset_ok: true, .. + } = log_result + { + Ok(()) + } else { + logged_err!(self.id; "unexpected log result type") + } + } } #[async_trait] @@ -1198,6 +1358,9 @@ impl GenericReplica for RSPaxosReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { + // recover state from durable storage log + self.recover_from_log().await?; + // TODO: proper leader election if self.id == 0 { self.is_leader = true; diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index dcd5204e..7260bc27 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -487,7 +487,7 @@ impl SimplePushReplica { let _ = self.state_machine.get_result().await?; } } - // rebuild in-memory log + // rebuild in-memory log entry let num_reqs = reqs.len(); self.insts.push(Instance { reqs, diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index b097256e..b0246688 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -255,15 +255,15 @@ impl ClientTester { _ => return logged_err!("c"; "unrecognized test name '{}'", name), }; - // send leave notification and forget about the TCP connections at the - // end of each test - self.driver.leave(false).await?; - if let Err(ref e) = result { cprintln!("{:>16} | {:^6} | {}", name, "FAIL", e); } else { cprintln!("{:>16} | {:^6} | --", name, "PASS"); } + + // send leave notification and forget about the TCP connections at the + // end of each test + self.driver.leave(false).await?; result } diff --git a/summerset_client/src/drivers/open_loop.rs b/summerset_client/src/drivers/open_loop.rs index 433f68a2..057d414f 100644 --- a/summerset_client/src/drivers/open_loop.rs +++ b/summerset_client/src/drivers/open_loop.rs @@ -11,8 +11,8 @@ use tokio::time::{Duration, Instant}; use summerset::{ GenericEndpoint, ClientId, Command, CommandResult, ApiRequest, ApiReply, - RequestId, ClientCtrlStub, Timer, SummersetError, pf_debug, pf_error, - logged_err, + RequestId, ClientCtrlStub, Timer, SummersetError, pf_trace, pf_debug, + pf_error, logged_err, }; /// Open-loop driver struct. 
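The RSPaxos and Crossword versions of this recovery differ from MultiPaxos mainly in the execution gate: a committed slot can only be executed once at least quorum_cnt shards of its codeword are locally available, reconstructing the data shards first if fewer than quorum_cnt of them survived. The arithmetic behind those thresholds, as a standalone sketch that does no real Reed-Solomon coding and assumes the usual majority-quorum sizing:

fn main() {
    let population: u8 = 5;
    let quorum_cnt = population / 2 + 1;      // majority quorum = 3 data shards
    let parity_cnt = population - quorum_cnt; // = 2 parity shards
    assert_eq!((quorum_cnt, parity_cnt), (3, 2));

    // Each request batch is coded into 3 data shards plus 2 parity shards; any
    // 3 distinct shards suffice to reconstruct the original batch. A recovering
    // replica that logged only its own AcceptData shard holds 1 shard: it can
    // mark the slot Accepting, but must obtain more shards before the slot can
    // ever be executed.
    let avail_shards: u8 = 1;
    assert!(avail_shards < quorum_cnt);
    println!(
        "need any {} of {} shards to reconstruct and execute",
        quorum_cnt, population
    );
}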
@@ -68,9 +68,12 @@ impl DriverOpenLoop { ) -> Result<(), SummersetError> { // loop until all pending replies have been received while self.should_retry { + pf_trace!(self.id; "retrying last issue at leave"); self.issue_retry()?; } while !self.pending_reqs.is_empty() { + pf_trace!(self.id; "pending {} requests at leave", + self.pending_reqs.len()); self.wait_reply().await?; } From f036df5f05a4d5c782124d17d826ece6a16a0167 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Tue, 12 Sep 2023 08:46:09 -0500 Subject: [PATCH 37/89] minor changes to ResetServers control message --- src/manager/clusman.rs | 16 +++--- src/manager/reactor.rs | 8 +-- src/protocols/multipaxos.rs | 1 - src/utils/bitmap.rs | 2 +- summerset_client/src/clients/tester.rs | 67 ++++++++++++++++++-------- 5 files changed, 59 insertions(+), 35 deletions(-) diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index de18afb5..2a523972 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -244,19 +244,19 @@ impl ClusterManager { .send_reply(CtrlReply::QueryInfo { servers }, client) } - /// Handler of client ResetServer request. - async fn handle_client_reset_server( + /// Handler of client ResetServers request. + async fn handle_client_reset_servers( &mut self, client: ClientId, - server: Option, + servers: HashSet, durable: bool, ) -> Result<(), SummersetError> { let num_replicas = self.server_info.len(); - let mut servers: Vec = if server.is_none() { + let mut servers: Vec = if servers.is_empty() { // all active servers self.server_info.keys().copied().collect() } else { - vec![server.unwrap()] + servers.into_iter().collect() }; // reset specified server(s) @@ -291,7 +291,7 @@ impl ClusterManager { } self.client_reactor.send_reply( - CtrlReply::ResetServer { + CtrlReply::ResetServers { servers: reset_done, }, client, @@ -310,8 +310,8 @@ impl ClusterManager { self.handle_client_query_info(client)?; } - CtrlRequest::ResetServer { server, durable } => { - self.handle_client_reset_server(client, server, durable) + CtrlRequest::ResetServers { servers, durable } => { + self.handle_client_reset_servers(client, servers, durable) .await?; } diff --git a/src/manager/reactor.rs b/src/manager/reactor.rs index 41a0582d..e1e388c3 100644 --- a/src/manager/reactor.rs +++ b/src/manager/reactor.rs @@ -27,9 +27,9 @@ pub enum CtrlRequest { QueryInfo, /// Reset the specified server(s) to initial state. - ResetServer { - /// ID of server to reset. If `None`, resets all active servers. - server: Option, + ResetServers { + /// IDs of servers to reset. If empty, resets all active servers. + servers: HashSet, /// If false, cleans durable storage state as well. durable: bool, }, @@ -47,7 +47,7 @@ pub enum CtrlReply { }, /// Reply to server reset request. - ResetServer { servers: HashSet }, + ResetServers { servers: HashSet }, /// Reply to client leave notification. 
Leave, diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 5ec372b9..0ebf8564 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -314,7 +314,6 @@ impl MultiPaxosReplica { // create a new instance in the first null slot (or append a new one // at the end if no holes exist) - // TODO: maybe use a null_idx variable to better keep track of this let mut slot = self.insts.len(); for s in self.commit_bar..self.insts.len() { let old_inst = &mut self.insts[s]; diff --git a/src/utils/bitmap.rs b/src/utils/bitmap.rs index dfbb8467..d5fe9c8e 100644 --- a/src/utils/bitmap.rs +++ b/src/utils/bitmap.rs @@ -155,7 +155,7 @@ mod bitmap_tests { #[test] fn bitmap_iter() { - let ref_map = vec![true, true, false, true, true]; + let ref_map = [true, true, false, true, true]; let mut map = Bitmap::new(5, true); assert!(map.set(2, false).is_ok()); for (id, flag) in map.iter() { diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index b0246688..94528700 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -1,6 +1,6 @@ //! Correctness testing client using open-loop driver. -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use crate::drivers::DriverOpenLoop; @@ -25,9 +25,11 @@ use summerset::{ lazy_static! { /// List of all tests. If the flag is true, the test is marked as basic. static ref ALL_TESTS: Vec<(&'static str, bool)> = vec![ - ("primitives", true), - ("reconnect", true), - ("crash_restart", false), + ("primitive_ops", true), + ("client_reconnect", true), + ("node_1_crash", true), + ("node_0_crash", true), + ("two_nodes_crash", false) ]; } @@ -216,15 +218,15 @@ impl ClientTester { } /// Resets some server(s) in the cluster. - async fn reset_server( + async fn reset_servers( &mut self, - server: Option, + servers: HashSet, durable: bool, ) -> Result<(), SummersetError> { let ctrl_stub = self.driver.ctrl_stub(); // send ResetServer request to manager - let req = CtrlRequest::ResetServer { server, durable }; + let req = CtrlRequest::ResetServers { servers, durable }; let mut sent = ctrl_stub.send_req(Some(&req))?; while !sent { sent = ctrl_stub.send_req(None)?; @@ -233,7 +235,7 @@ impl ClientTester { // wait for reply from manager let reply = ctrl_stub.recv_reply().await?; match reply { - CtrlReply::ResetServer { .. } => Ok(()), + CtrlReply::ResetServers { .. 
} => Ok(()), _ => logged_err!("c"; "unexpected control reply type"), } } @@ -244,21 +246,23 @@ impl ClientTester { name: &str, ) -> Result<(), SummersetError> { // reset everything to initial state at the start of each test - self.reset_server(None, false).await?; + self.reset_servers(HashSet::new(), false).await?; self.driver.connect().await?; self.cached_replies.clear(); let result = match name { - "primitives" => self.test_primitives().await, - "reconnect" => self.test_reconnect().await, - "crash_restart" => self.test_crash_restart().await, + "primitive_ops" => self.test_primitive_ops().await, + "client_reconnect" => self.test_client_reconnect().await, + "node_1_crash" => self.test_node_1_crash().await, + "node_0_crash" => self.test_node_0_crash().await, + "two_nodes_crash" => self.test_two_nodes_crash().await, _ => return logged_err!("c"; "unrecognized test name '{}'", name), }; if let Err(ref e) = result { - cprintln!("{:>16} | {:^6} | {}", name, "FAIL", e); + cprintln!("{:>20} | {:^6} | {}", name, "FAIL", e); } else { - cprintln!("{:>16} | {:^6} | --", name, "PASS"); + cprintln!("{:>20} | {:^6} | --", name, "PASS"); } // send leave notification and forget about the TCP connections at the @@ -272,7 +276,7 @@ impl ClientTester { let test_name = self.params.test_name.clone(); let mut all_pass = true; - println!("{:^16} | {:^6} | Notes", "Test Case", "Result"); + println!("{:^20} | {:^6} | Notes", "Test Case", "Result"); match &test_name[..] { "basic" => { for (name, basic) in ALL_TESTS.iter() { @@ -313,7 +317,7 @@ impl ClientTester { // List of tests: impl ClientTester { /// Basic primitive operations. - async fn test_primitives(&mut self) -> Result<(), SummersetError> { + async fn test_primitive_ops(&mut self) -> Result<(), SummersetError> { let mut req_id = self.issue_get("Jose")?; self.expect_get_reply(req_id, Some(None), 1).await?; let v0 = Self::gen_rand_string(8); @@ -330,7 +334,7 @@ impl ClientTester { } /// Client leaves and reconnects. - async fn test_reconnect(&mut self) -> Result<(), SummersetError> { + async fn test_client_reconnect(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); let mut req_id = self.issue_put("Jose", &v)?; self.expect_put_reply(req_id, Some(None), 1).await?; @@ -341,18 +345,39 @@ impl ClientTester { Ok(()) } - /// Replica node crashes and restarts. - async fn test_crash_restart(&mut self) -> Result<(), SummersetError> { + /// Replica node 1 crashes and restarts. + async fn test_node_1_crash(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); let mut req_id = self.issue_put("Jose", &v)?; self.expect_put_reply(req_id, Some(None), 1).await?; self.driver.leave(false).await?; - self.reset_server(Some(1), true).await?; + self.reset_servers(HashSet::from([1]), true).await?; self.driver.connect().await?; req_id = self.issue_get("Jose")?; self.expect_get_reply(req_id, Some(Some(&v)), 1).await?; + Ok(()) + } + + /// Replica node 0 crashes and restarts. + async fn test_node_0_crash(&mut self) -> Result<(), SummersetError> { + let v = Self::gen_rand_string(8); + let mut req_id = self.issue_put("Jose", &v)?; + self.expect_put_reply(req_id, Some(None), 1).await?; + self.driver.leave(false).await?; + self.reset_servers(HashSet::from([0]), true).await?; + self.driver.connect().await?; + req_id = self.issue_get("Jose")?; + self.expect_get_reply(req_id, Some(Some(&v)), 1).await?; + Ok(()) + } + + /// Two replica nodes crashes and restarts. 
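These crash-and-restart tests all drive the same ResetServers round trip: build the control request, push it through the control stub until it is fully sent, then block on the matching reply. Condensed into one hypothetical helper for reference; the call shapes follow the tester code above, while the exact imports are assumed from the summerset crate:

use std::collections::HashSet;

use summerset::{ClientCtrlStub, CtrlReply, CtrlRequest, SummersetError};

// Reset servers 0 and 1 while keeping their durable WAL files, so that the
// restarted nodes must actually exercise the recovery path.
async fn reset_two_nodes(
    ctrl_stub: &mut ClientCtrlStub,
) -> Result<(), SummersetError> {
    let req = CtrlRequest::ResetServers {
        servers: HashSet::from([0, 1]), // empty set would mean all active servers
        durable: true,                  // false would wipe the WAL files too
    };

    // send_req returns false when the write would block; keep retrying with
    // None until the request has been fully flushed
    let mut sent = ctrl_stub.send_req(Some(&req))?;
    while !sent {
        sent = ctrl_stub.send_req(None)?;
    }

    match ctrl_stub.recv_reply().await? {
        CtrlReply::ResetServers { servers } => {
            println!("reset done on servers: {:?}", servers);
            Ok(())
        }
        _ => Err(SummersetError("unexpected control reply type".into())),
    }
}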
+ async fn test_two_nodes_crash(&mut self) -> Result<(), SummersetError> { + let v = Self::gen_rand_string(8); + let mut req_id = self.issue_put("Jose", &v)?; + self.expect_put_reply(req_id, Some(None), 1).await?; self.driver.leave(false).await?; - self.reset_server(Some(0), true).await?; + self.reset_servers(HashSet::from([0, 1]), true).await?; self.driver.connect().await?; req_id = self.issue_get("Jose")?; self.expect_get_reply(req_id, Some(Some(&v)), 1).await?; From 8fc6a2aa07021fbd3dc399903c33feee5932114a Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Tue, 12 Sep 2023 08:51:56 -0500 Subject: [PATCH 38/89] minor updates to README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 974767d1..16944f3a 100644 --- a/README.md +++ b/README.md @@ -147,7 +147,7 @@ Complete cluster management and benchmarking scripts are available in another re - [x] cluster manager oracle impl. - [x] implementation of MultiPaxos - [x] client-side timeout/retry logic - - [ ] state persistence & restart check + - [x] state persistence & restart check - [ ] automatic leader election, backoffs - [ ] snapshotting & garbage collection - [ ] specialize read-only commands? From b6ce402086e51f1a25a08c13809adf816cfe85d1 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Tue, 12 Sep 2023 14:15:31 -0500 Subject: [PATCH 39/89] staging progress on leader timeouts --- src/protocols/multipaxos.rs | 181 +++++++++++++++++++++++-- summerset_client/src/clients/tester.rs | 4 +- 2 files changed, 174 insertions(+), 11 deletions(-) diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 0ebf8564..4690c08d 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -11,7 +11,7 @@ use std::collections::HashMap; use std::path::Path; use std::net::SocketAddr; -use crate::utils::{SummersetError, Bitmap}; +use crate::utils::{SummersetError, Bitmap, Timer}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, @@ -21,13 +21,15 @@ use crate::server::{ use crate::client::{ClientId, ClientApiStub, ClientCtrlStub, GenericEndpoint}; use crate::protocols::SmrProtocol; +use rand::prelude::*; + use async_trait::async_trait; use get_size::GetSize; use serde::{Serialize, Deserialize}; -use tokio::time::Duration; +use tokio::time::{self, Duration, Interval, MissedTickBehavior}; use tokio::sync::watch; /// Configuration parameters struct. @@ -45,6 +47,15 @@ pub struct ReplicaConfigMultiPaxos { /// Whether to call `fsync()`/`fdatasync()` on logger. pub logger_sync: bool, + /// Min timeout of not hearing any heartbeat from leader in millisecs. + pub hb_hear_timeout_min: u64, + + /// Max timeout of not hearing any heartbeat from leader in millisecs. + pub hb_hear_timeout_max: u64, + + /// Interval of leader sending heartbeats to followers. + pub hb_send_interval_ms: u64, + // Performance simulation params (all zeros means no perf simulation): pub perf_storage_a: u64, pub perf_storage_b: u64, @@ -60,6 +71,9 @@ impl Default for ReplicaConfigMultiPaxos { max_batch_size: 5000, backer_path: "/tmp/summerset.multipaxos.wal".into(), logger_sync: false, + hb_hear_timeout_min: 300, + hb_hear_timeout_max: 600, + hb_send_interval_ms: 50, perf_storage_a: 0, perf_storage_b: 0, perf_network_a: 0, @@ -169,6 +183,9 @@ enum PeerMsg { /// Commit notification from leader to replicas. Commit { slot: usize }, + + /// Leader activity heartbeat. 
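A quick sanity check on the default timing knobs introduced here: with heartbeats broadcast every 50 ms and a hearing timeout drawn uniformly from 300 to 600 ms, a follower only suspects the leader after missing roughly 6 to 12 consecutive heartbeats, which keeps spurious leader changes unlikely on a healthy local cluster:

fn main() {
    // default config values from ReplicaConfigMultiPaxos above
    let hb_send_interval_ms: u64 = 50;
    let (hear_timeout_min, hear_timeout_max): (u64, u64) = (300, 600);

    let missed_min = hear_timeout_min / hb_send_interval_ms;
    let missed_max = hear_timeout_max / hb_send_interval_ms;
    assert_eq!((missed_min, missed_max), (6, 12));

    println!(
        "a follower suspects the leader after missing {}..={} heartbeats",
        missed_min, missed_max
    );
}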
+ Heartbeat { ballot: Ballot }, } /// MultiPaxos server replica module. @@ -206,6 +223,12 @@ pub struct MultiPaxosReplica { /// TransportHub module. transport_hub: TransportHub, + /// Timer for hearing heartbeat from leader. + hb_hear_timer: Timer, + + /// Interval for sending heartbeat to followers. + hb_send_interval: Interval, + /// Do I think I am the leader? is_leader: bool, @@ -344,9 +367,6 @@ impl MultiPaxosReplica { } // decide whether we can enter fast path for this instance - // TODO: remember to reset bal_prepared to 0, update bal_max_seen, - // and re-handle all Preparing & Accepting instances in autonomous - // Prepare initiation if self.bal_prepared == 0 { // slow case: Prepare phase not done yet. Initiate a Prepare round // if none is on the fly, or just wait for some Prepare reply to @@ -869,6 +889,7 @@ impl MultiPaxosReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), + PeerMsg::Heartbeat { ballot } => self.heard_heartbeat(peer, ballot), } } @@ -927,6 +948,111 @@ impl MultiPaxosReplica { Ok(()) } + /// Becomes a leader, sends self-initiated Prepare messages to followers + /// for all in-progress instances, and starts broadcasting heartbeats. + fn become_a_leader(&mut self) -> Result<(), SummersetError> { + assert!(!self.is_leader); + self.is_leader = true; // this starts broadcasting heartbeats + pf_warn!(self.id; "becoming a leader..."); + + // broadcast a heartbeat right now + self.bcast_heartbeats()?; + + // make a greater ballot number and invalidate all in-progress instances + self.bal_prepared = 0; + self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen); + self.bal_max_seen = self.bal_prep_sent; + + // redo Prepare phase for all in-progress instances + for (slot, inst) in self.insts.iter_mut().enumerate() { + if inst.status < Status::Committed { + inst.bal = self.bal_prep_sent; + inst.status = Status::Preparing; + pf_debug!(self.id; "enter Prepare phase for slot {} bal {}", + slot, inst.bal); + + // record update to largest prepare ballot + self.storage_hub.submit_action( + Self::make_log_action_id(slot, Status::Preparing), + LogAction::Append { + entry: LogEntry::PrepareBal { + slot, + ballot: self.bal_prep_sent, + }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted PrepareBal log action for slot {} bal {}", + slot, inst.bal); + + // send Prepare messages to all peers + self.transport_hub.bcast_msg( + PeerMsg::Prepare { + slot, + ballot: self.bal_prep_sent, + }, + None, + )?; + pf_trace!(self.id; "broadcast Prepare messages for slot {} bal {}", + slot, inst.bal); + } + } + + Ok(()) + } + + /// Broadcasts heartbeats to all replicas. + fn bcast_heartbeats(&mut self) -> Result<(), SummersetError> { + self.transport_hub.bcast_msg( + PeerMsg::Heartbeat { + ballot: self.bal_prep_sent, + }, + None, + )?; + self.heard_heartbeat(self.id, self.bal_prep_sent)?; + + // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent); + Ok(()) + } + + /// Chooses a random hb_hear_timeout from the min-max range and kicks off + /// the hb_hear_timer. + fn kickoff_hb_hear_timer(&mut self) -> Result<(), SummersetError> { + let timeout_ms = thread_rng().gen_range( + self.config.hb_hear_timeout_min..=self.config.hb_hear_timeout_max, + ); + + // pf_trace!(self.id; "kickoff hb_hear_timer @ {} ms", timeout_ms); + self.hb_hear_timer + .kickoff(Duration::from_millis(timeout_ms))?; + Ok(()) + } + + /// Heard a heartbeat from some other replica. 
If the heartbeat carries a + /// high enough ballot number, refreshes my hearing timer and clears my + /// leader status if I currently think I'm a leader. + fn heard_heartbeat( + &mut self, + _peer: ReplicaId, + ballot: Ballot, + ) -> Result<(), SummersetError> { + // ignore outdated hearbeat + if ballot < self.bal_max_seen { + return Ok(()); + } + + // reset hearing timer + self.kickoff_hb_hear_timer()?; + + // clear my leader status if it carries a higher ballot number + if self.is_leader && ballot > self.bal_max_seen { + self.is_leader = false; + } + + // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); + Ok(()) + } + /// Handler of ResetState control message. async fn handle_ctrl_reset_state( &mut self, @@ -1140,6 +1266,8 @@ impl GenericReplica for MultiPaxosReplica { let config = parsed_config!(config_str => ReplicaConfigMultiPaxos; batch_interval_us, max_batch_size, backer_path, logger_sync, + hb_hear_timeout_min, hb_hear_timeout_max, + hb_send_interval_ms, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; if config.batch_interval_us == 0 { @@ -1149,6 +1277,27 @@ impl GenericReplica for MultiPaxosReplica { config.batch_interval_us ); } + if config.hb_hear_timeout_min < 100 { + return logged_err!( + id; + "invalid config.hb_hear_timeout_min '{}'", + config.hb_hear_timeout_min + ); + } + if config.hb_hear_timeout_max < config.hb_hear_timeout_min + 100 { + return logged_err!( + id; + "invalid config.hb_hear_timeout_max '{}'", + config.hb_hear_timeout_max + ); + } + if config.hb_send_interval_ms == 0 { + return logged_err!( + id; + "invalid config.hb_send_interval_ms '{}'", + config.hb_send_interval_ms + ); + } // setup state machine module let state_machine = StateMachine::new_and_setup(id).await?; @@ -1211,6 +1360,10 @@ impl GenericReplica for MultiPaxosReplica { ) .await?; + let mut hb_send_interval = + time::interval(Duration::from_millis(config.hb_send_interval_ms)); + hb_send_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + Ok(MultiPaxosReplica { id, population, @@ -1223,6 +1376,8 @@ impl GenericReplica for MultiPaxosReplica { state_machine, storage_hub, transport_hub, + hb_hear_timer: Timer::new(), + hb_send_interval, is_leader: false, insts: vec![], bal_prep_sent: 0, @@ -1241,10 +1396,8 @@ impl GenericReplica for MultiPaxosReplica { // recover state from durable storage log self.recover_from_log().await?; - // TODO: proper leader election - if self.id == 0 { - self.is_leader = true; - } + // kick off leader activity hearing timer + self.kickoff_hb_hear_timer()?; // main event loop loop { @@ -1298,6 +1451,16 @@ impl GenericReplica for MultiPaxosReplica { } }, + // leader inactivity timeout + _ = self.hb_hear_timer.timeout() => { + self.become_a_leader()?; + }, + + // leader sending heartbeat + _ = self.hb_send_interval.tick(), if self.is_leader => { + self.bcast_heartbeats()?; + } + // manager control message ctrl_msg = self.control_hub.recv_ctrl() => { if let Err(e) = ctrl_msg { diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index 94528700..e7e6092e 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -169,7 +169,7 @@ impl ClientTester { &mut self, req_id: RequestId, expect_value: Option>, - // maximum number of tries if repeatedly getting `WouldBlock` failure + // maximum number of tries if repeatedly getting `Ok(None)` reply max_tries: u8, ) -> Result<(), SummersetError> { let cmd_result = self.wait_reply(req_id, max_tries).await?; @@ 
-196,7 +196,7 @@ impl ClientTester { &mut self, req_id: RequestId, expect_old_value: Option>, - // maximum number of tries if repeatedly getting `WouldBlock` failure + // maximum number of tries if repeatedly getting `Ok(None)` reply max_tries: u8, ) -> Result<(), SummersetError> { let cmd_result = self.wait_reply(req_id, max_tries).await?; From dadc65c2773c1d34ea6181ff02ea53b859b54d01 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 13 Sep 2023 06:02:59 -0500 Subject: [PATCH 40/89] better client driver side API --- scripts/local_bench.tmp.py | 1 + summerset_client/src/clients/bench.rs | 12 +- summerset_client/src/clients/repl.rs | 73 +++-- summerset_client/src/clients/tester.rs | 302 ++++++++++---------- summerset_client/src/drivers/closed_loop.rs | 66 +++-- summerset_client/src/drivers/mod.rs | 26 ++ summerset_client/src/drivers/open_loop.rs | 35 +-- 7 files changed, 286 insertions(+), 229 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index ee007de3..96170e8c 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -133,6 +133,7 @@ def bench_round( + f"w%={put_ratio:<3d} {length_s:3d}s" ) + kill_all_matching("local_client.py", force=True) kill_all_matching("local_cluster.py", force=True) kill_all_matching("summerset_client", force=True) kill_all_matching("summerset_server", force=True) diff --git a/summerset_client/src/clients/bench.rs b/summerset_client/src/clients/bench.rs index c3f42cb7..6b6ea18a 100644 --- a/summerset_client/src/clients/bench.rs +++ b/summerset_client/src/clients/bench.rs @@ -1,6 +1,6 @@ //! Benchmarking client using open-loop driver. -use crate::drivers::DriverOpenLoop; +use crate::drivers::{DriverReply, DriverOpenLoop}; use lazy_static::lazy_static; @@ -168,7 +168,6 @@ impl ClientBench { } /// Runs one iteration action of closed-loop style benchmark. - #[allow(clippy::too_many_arguments)] async fn closed_loop_iter(&mut self) -> Result<(), SummersetError> { // send next request let req_id = if self.retrying { @@ -186,10 +185,10 @@ impl ClientBench { if self.total_cnt > self.reply_cnt { let result = self.driver.wait_reply().await?; - if let Some((_, _, lat)) = result { + if let DriverReply::Success { latency, .. } = result { self.reply_cnt += 1; self.chunk_cnt += 1; - let lat_us = lat.as_secs_f64() * 1000000.0; + let lat_us = latency.as_secs_f64() * 1000000.0; self.chunk_lats.push(lat_us); } } @@ -198,7 +197,6 @@ impl ClientBench { } /// Runs one iteration action of open-loop style benchmark. - #[allow(clippy::too_many_arguments)] async fn open_loop_iter(&mut self) -> Result<(), SummersetError> { tokio::select! { // prioritize receiving reply @@ -206,10 +204,10 @@ impl ClientBench { // receive next reply result = self.driver.wait_reply() => { - if let Some((_, _, lat)) = result? { + if let DriverReply::Success { latency, .. } = result? 
{ self.reply_cnt += 1; self.chunk_cnt += 1; - let lat_us = lat.as_secs_f64() * 1000000.0; + let lat_us = latency.as_secs_f64() * 1000000.0; self.chunk_lats.push(lat_us); if self.slowdown > 0 { diff --git a/summerset_client/src/clients/repl.rs b/summerset_client/src/clients/repl.rs index 0d3b60c0..09e4f330 100644 --- a/summerset_client/src/clients/repl.rs +++ b/summerset_client/src/clients/repl.rs @@ -2,15 +2,13 @@ use std::io::{self, Write}; -use crate::drivers::DriverClosedLoop; +use crate::drivers::{DriverReply, DriverClosedLoop}; -use color_print::cprint; +use color_print::{cprint, cprintln}; use tokio::time::Duration; -use summerset::{ - GenericEndpoint, Command, CommandResult, RequestId, SummersetError, -}; +use summerset::{GenericEndpoint, Command, SummersetError}; /// Prompt string at the start of line. const PROMPT: &str = ">>>>> "; @@ -38,6 +36,9 @@ pub struct ClientRepl { /// Closed-loop request driver. driver: DriverClosedLoop, + /// Timeout duration setting. + timeout: Duration, + /// User input buffer. input_buf: String, } @@ -47,6 +48,7 @@ impl ClientRepl { pub fn new(endpoint: Box, timeout: Duration) -> Self { ClientRepl { driver: DriverClosedLoop::new(endpoint, timeout), + timeout, input_buf: String::new(), } } @@ -60,7 +62,7 @@ impl ClientRepl { /// Prints (optionally) an error message and the help message. fn print_help(&mut self, err: Option<&SummersetError>) { if let Some(e) = err { - println!("ERROR: {}", e); + cprintln!("✗ {}", e); } println!("HELP: Supported commands are:"); println!(" get "); @@ -150,36 +152,51 @@ impl ClientRepl { async fn eval_command( &mut self, cmd: Command, - ) -> Result, SummersetError> - { + ) -> Result { match cmd { - Command::Get { key } => { - Ok(self.driver.get(&key).await?.map(|(req_id, value, lat)| { - (req_id, CommandResult::Get { value }, lat) - })) - } - + Command::Get { key } => Ok(self.driver.get(&key).await?), Command::Put { key, value } => { - Ok(self.driver.put(&key, &value).await?.map( - |(req_id, old_value, lat)| { - (req_id, CommandResult::Put { old_value }, lat) - }, - )) + Ok(self.driver.put(&key, &value).await?) } } } /// Prints command execution result. - fn print_result( - &mut self, - result: Option<(RequestId, CommandResult, Duration)>, - ) { - if let Some((req_id, cmd_result, lat)) = result { - let lat_ms = lat.as_secs_f64() * 1000.0; - println!("({}) {:?} ", req_id, cmd_result, lat_ms); - } else { - println!("Unsuccessful: wrong leader or timeout?"); + fn print_result(&mut self, result: DriverReply) { + match result { + DriverReply::Success { + req_id, + cmd_result, + latency, + } => { + let lat_ms = latency.as_secs_f64() * 1000.0; + cprintln!( + "✓ ({}) {:?} <>", + req_id, + cmd_result, + lat_ms + ); + } + + DriverReply::Failure => { + cprintln!("✗ service replied unknown error"); + } + + DriverReply::Redirect { server } => { + cprintln!( + "✗ service redirected me to server {}", + server + ); + } + + DriverReply::Timeout => { + cprintln!( + "✗ client-side timeout {} ms", + self.timeout.as_millis() + ); + } } + io::stdout().flush().unwrap(); } diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index e7e6092e..04b71a5b 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -1,8 +1,8 @@ -//! Correctness testing client using open-loop driver. +//! Correctness testing client using closed-loop driver. 
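A unit bookkeeping note for the driver refactor: the benchmark client stores latencies as f64 microseconds (latency.as_secs_f64() * 1000000.0) in its per-chunk vector, while the REPL prints milliseconds. A tiny standalone check of those conversions:

use std::time::Duration;

fn main() {
    let latency = Duration::from_micros(1500);

    // microseconds, as pushed into the benchmark's per-chunk latency vector
    let lat_us = latency.as_secs_f64() * 1_000_000.0;
    // milliseconds, as printed by the REPL client
    let lat_ms = latency.as_secs_f64() * 1000.0;

    assert!((lat_us - 1500.0).abs() < 1e-6);
    assert!((lat_ms - 1.5).abs() < 1e-9);
    println!("{:.0} us = {:.3} ms", lat_us, lat_ms);
}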
-use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; -use crate::drivers::DriverOpenLoop; +use crate::drivers::{DriverReply, DriverClosedLoop}; use color_print::cprintln; @@ -15,11 +15,11 @@ use rand::distributions::Alphanumeric; use serde::Deserialize; -use tokio::time::Duration; +use tokio::time::{self, Duration}; use summerset::{ - ReplicaId, GenericEndpoint, CommandResult, RequestId, CtrlRequest, - CtrlReply, SummersetError, pf_error, logged_err, parsed_config, + ReplicaId, GenericEndpoint, CommandResult, CtrlRequest, CtrlReply, + SummersetError, pf_error, logged_err, parsed_config, }; lazy_static! { @@ -27,9 +27,8 @@ lazy_static! { static ref ALL_TESTS: Vec<(&'static str, bool)> = vec![ ("primitive_ops", true), ("client_reconnect", true), - ("node_1_crash", true), - ("node_0_crash", true), - ("two_nodes_crash", false) + ("one_node_reset", true), + ("two_nodes_reset", false) ]; } @@ -60,14 +59,14 @@ impl Default for ModeParamsTester { /// Correctness testing client struct. pub struct ClientTester { - /// Open-loop request driver. - driver: DriverOpenLoop, + /// Closed-loop request driver. + driver: DriverClosedLoop, + + /// Timeout duration setting. + timeout: Duration, /// Mode parameters struct. params: ModeParamsTester, - - /// Replies received but not yet used. - cached_replies: HashMap, } impl ClientTester { @@ -86,9 +85,9 @@ impl ClientTester { } Ok(ClientTester { - driver: DriverOpenLoop::new(endpoint, timeout), + driver: DriverClosedLoop::new(endpoint, timeout), + timeout, params, - cached_replies: HashMap::new(), }) } @@ -106,114 +105,128 @@ impl ClientTester { s.as_deref() == *expect } - /// Issues a Get request, retrying immediately on `WouldBlock` failures. - fn issue_get(&mut self, key: &str) -> Result { - let mut req_id = self.driver.issue_get(key)?; - while req_id.is_none() { - req_id = self.driver.issue_retry()?; - } - Ok(req_id.unwrap()) - } - - /// Issues a Put request, retrying immediately on `WouldBlock` failures. - fn issue_put( + /// Issues a Get request and checks its reply value against given one if + /// not `None`. Retries immediately upon getting redirection error. + async fn checked_get( &mut self, key: &str, - value: &str, - ) -> Result { - let mut req_id = self.driver.issue_put(key, value)?; - while req_id.is_none() { - req_id = self.driver.issue_retry()?; - } - Ok(req_id.unwrap()) - } + expect_value: Option>, + ) -> Result<(), SummersetError> { + loop { + let result = self.driver.get(key).await?; + match result { + DriverReply::Success { cmd_result, .. } => { + if let CommandResult::Get { ref value } = cmd_result { + if let Some(ref expect_value) = expect_value { + if !Self::strings_match(value, expect_value) { + return logged_err!( + self.driver.id; + "Get value mismatch: expect {:?}, got {:?}", + expect_value, value + ); + } + } + return Ok(()); + } else { + return logged_err!( + self.driver.id; + "CommandResult type mismatch: expect Get" + ); + } + } - /// Waits for the next reply from service with the given request ID. If - /// non-match replies received, cache them up for future references. 
- async fn wait_reply( - &mut self, - req_id: RequestId, - // maximum number of tries if repeatedly getting `Ok(None)` reply - max_tries: u8, - ) -> Result { - assert!(max_tries > 0); - let mut num_tries = 0; - - // look up cached_replies first - if let Some(cmd_result) = self.cached_replies.remove(&req_id) { - return Ok(cmd_result); - } + DriverReply::Failure => { + return logged_err!( + self.driver.id; + "service replied unknown error" + ); + } - let mut result = self.driver.wait_reply().await?; - while result.is_none() || result.as_ref().unwrap().0 != req_id { - if let Some((id, cmd_result, _)) = result { - self.cached_replies.insert(id, cmd_result); - } else { - num_tries += 1; - if num_tries == max_tries { - return Err(SummersetError(format!( - "exhausted {} tries expecting req {}", - max_tries, req_id, - ))); + DriverReply::Redirect { .. } => {} // re-issue immediately + + DriverReply::Timeout => { + return logged_err!( + self.driver.id; + "client-side timeout {} ms", + self.timeout.as_millis() + ) } } - result = self.driver.wait_reply().await?; } - - Ok(result.unwrap().1) } - /// Waits for the reply of given request ID, expecting the given Get value - /// if not `None`. - async fn expect_get_reply( + /// Issues a Put request and checks its reply old_value against given one + /// if not `None`. Retries immediately upon getting redirection error. + async fn checked_put( &mut self, - req_id: RequestId, - expect_value: Option>, - // maximum number of tries if repeatedly getting `Ok(None)` reply - max_tries: u8, + key: &str, + value: &str, + expect_old_value: Option>, ) -> Result<(), SummersetError> { - let cmd_result = self.wait_reply(req_id, max_tries).await?; - if let CommandResult::Get { ref value } = cmd_result { - if let Some(ref expect_value) = expect_value { - if !Self::strings_match(value, expect_value) { - return Err(SummersetError(format!( - "Get value mismatch: expect {:?}, got {:?}", - expect_value, value - ))); + loop { + let result = self.driver.put(key, value).await?; + match result { + DriverReply::Success { cmd_result, .. } => { + if let CommandResult::Put { ref old_value } = cmd_result { + if let Some(ref expect_old_value) = expect_old_value { + if !Self::strings_match(old_value, expect_old_value) + { + return logged_err!( + self.driver.id; + "Put old_value mismatch: expect {:?}, got {:?}", + expect_old_value, old_value + ); + } + } + return Ok(()); + } else { + return logged_err!( + self.driver.id; + "CommandResult type mismatch: expect Put" + ); + } + } + + DriverReply::Failure => { + return logged_err!( + self.driver.id; + "service replied unknown error" + ); + } + + DriverReply::Redirect { .. } => {} // re-issue immediately + + DriverReply::Timeout => { + return logged_err!( + self.driver.id; + "client-side timeout {} ms", + self.timeout.as_millis() + ) } } - Ok(()) - } else { - Err(SummersetError( - "CommandResult type mismatch: expect Get".into(), - )) } } - /// Waits for the reply of given request ID, expecting the given Put - /// old_value if not `None`. - async fn expect_put_reply( + /// Query the list of servers in the cluster. 
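checked_get and checked_put share one skeleton: loop on the closed-loop driver, validate the result on Success, retry in place on Redirect, and bail out on Failure or a client-side Timeout. That skeleton reduced to a self-contained sketch, where ToyReply mirrors, but is not, the crate's DriverReply type:

enum ToyReply {
    Success(String),
    Redirect(u8),
    Failure,
    Timeout,
}

// One "issue" of a request; redirects twice, then succeeds.
fn issue(attempt: &mut u32) -> ToyReply {
    *attempt += 1;
    if *attempt <= 2 {
        ToyReply::Redirect(0)
    } else {
        ToyReply::Success("value".into())
    }
}

// Retry in place on redirection; stop on success, failure, or timeout.
fn checked_issue() -> Result<String, String> {
    let mut attempt = 0;
    loop {
        match issue(&mut attempt) {
            ToyReply::Success(v) => return Ok(v),
            ToyReply::Redirect(_) => continue, // re-issue the same request
            ToyReply::Failure => return Err("service replied unknown error".into()),
            ToyReply::Timeout => return Err("client-side timeout".into()),
        }
    }
}

fn main() {
    assert_eq!(checked_issue().unwrap(), "value");
}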
+ async fn query_servers( &mut self, - req_id: RequestId, - expect_old_value: Option>, - // maximum number of tries if repeatedly getting `Ok(None)` reply - max_tries: u8, - ) -> Result<(), SummersetError> { - let cmd_result = self.wait_reply(req_id, max_tries).await?; - if let CommandResult::Put { ref old_value } = cmd_result { - if let Some(ref expect_old_value) = expect_old_value { - if !Self::strings_match(old_value, expect_old_value) { - return Err(SummersetError(format!( - "Put old_value mismatch: expect {:?}, got {:?}", - expect_old_value, old_value - ))); - } + ) -> Result, SummersetError> { + let ctrl_stub = self.driver.ctrl_stub(); + + // send QueryInfo request to manager + let req = CtrlRequest::QueryInfo; + let mut sent = ctrl_stub.send_req(Some(&req))?; + while !sent { + sent = ctrl_stub.send_req(None)?; + } + + // wait for reply from manager + let reply = ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::QueryInfo { servers } => { + Ok(servers.keys().copied().collect()) } - Ok(()) - } else { - Err(SummersetError( - "CommandResult type mismatch: expect Put".into(), - )) + _ => logged_err!(self.driver.id; ""), } } @@ -236,7 +249,7 @@ impl ClientTester { let reply = ctrl_stub.recv_reply().await?; match reply { CtrlReply::ResetServers { .. } => Ok(()), - _ => logged_err!("c"; "unexpected control reply type"), + _ => logged_err!(self.driver.id; "unexpected control reply type"), } } @@ -248,15 +261,16 @@ impl ClientTester { // reset everything to initial state at the start of each test self.reset_servers(HashSet::new(), false).await?; self.driver.connect().await?; - self.cached_replies.clear(); let result = match name { "primitive_ops" => self.test_primitive_ops().await, "client_reconnect" => self.test_client_reconnect().await, - "node_1_crash" => self.test_node_1_crash().await, - "node_0_crash" => self.test_node_0_crash().await, - "two_nodes_crash" => self.test_two_nodes_crash().await, - _ => return logged_err!("c"; "unrecognized test name '{}'", name), + "one_node_reset" => self.test_one_node_reset().await, + "two_nodes_reset" => self.test_two_nodes_reset().await, + _ => { + return logged_err!(self.driver.id; "unrecognized test name '{}'", + name); + } }; if let Err(ref e) = result { @@ -318,69 +332,49 @@ impl ClientTester { impl ClientTester { /// Basic primitive operations. async fn test_primitive_ops(&mut self) -> Result<(), SummersetError> { - let mut req_id = self.issue_get("Jose")?; - self.expect_get_reply(req_id, Some(None), 1).await?; + self.checked_get("Jose", Some(None)).await?; let v0 = Self::gen_rand_string(8); - req_id = self.issue_put("Jose", &v0)?; - self.expect_put_reply(req_id, Some(None), 1).await?; - req_id = self.issue_get("Jose")?; - self.expect_get_reply(req_id, Some(Some(&v0)), 1).await?; + self.checked_put("Jose", &v0, Some(None)).await?; + self.checked_get("Jose", Some(Some(&v0))).await?; let v1 = Self::gen_rand_string(16); - req_id = self.issue_put("Jose", &v1)?; - self.expect_put_reply(req_id, Some(Some(&v0)), 1).await?; - req_id = self.issue_get("Jose")?; - self.expect_get_reply(req_id, Some(Some(&v1)), 1).await?; + self.checked_put("Jose", &v1, Some(Some(&v0))).await?; + self.checked_get("Jose", Some(Some(&v1))).await?; Ok(()) } /// Client leaves and reconnects. 
async fn test_client_reconnect(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); - let mut req_id = self.issue_put("Jose", &v)?; - self.expect_put_reply(req_id, Some(None), 1).await?; - self.driver.leave(false).await?; - self.driver.connect().await?; - req_id = self.issue_get("Jose")?; - self.expect_get_reply(req_id, Some(Some(&v)), 1).await?; - Ok(()) - } - - /// Replica node 1 crashes and restarts. - async fn test_node_1_crash(&mut self) -> Result<(), SummersetError> { - let v = Self::gen_rand_string(8); - let mut req_id = self.issue_put("Jose", &v)?; - self.expect_put_reply(req_id, Some(None), 1).await?; + self.checked_put("Jose", &v, Some(None)).await?; self.driver.leave(false).await?; - self.reset_servers(HashSet::from([1]), true).await?; self.driver.connect().await?; - req_id = self.issue_get("Jose")?; - self.expect_get_reply(req_id, Some(Some(&v)), 1).await?; + self.checked_get("Jose", Some(Some(&v))).await?; Ok(()) } - /// Replica node 0 crashes and restarts. - async fn test_node_0_crash(&mut self) -> Result<(), SummersetError> { + /// Single replica node crashes and restarts. + async fn test_one_node_reset(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); - let mut req_id = self.issue_put("Jose", &v)?; - self.expect_put_reply(req_id, Some(None), 1).await?; - self.driver.leave(false).await?; - self.reset_servers(HashSet::from([0]), true).await?; - self.driver.connect().await?; - req_id = self.issue_get("Jose")?; - self.expect_get_reply(req_id, Some(Some(&v)), 1).await?; + self.checked_put("Jose", &v, Some(None)).await?; + for s in self.query_servers().await? { + self.driver.leave(false).await?; + self.reset_servers(HashSet::from([s]), true).await?; + time::sleep(Duration::from_millis(100)).await; + self.driver.connect().await?; + self.checked_get("Jose", Some(Some(&v))).await?; + } Ok(()) } - /// Two replica nodes crashes and restarts. - async fn test_two_nodes_crash(&mut self) -> Result<(), SummersetError> { + /// Two replica nodes crash and restart. + async fn test_two_nodes_reset(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); - let mut req_id = self.issue_put("Jose", &v)?; - self.expect_put_reply(req_id, Some(None), 1).await?; + self.checked_put("Jose", &v, Some(None)).await?; self.driver.leave(false).await?; self.reset_servers(HashSet::from([0, 1]), true).await?; + time::sleep(Duration::from_millis(100)).await; self.driver.connect().await?; - req_id = self.issue_get("Jose")?; - self.expect_get_reply(req_id, Some(Some(&v)), 1).await?; + self.checked_get("Jose", Some(Some(&v))).await?; Ok(()) } } diff --git a/summerset_client/src/drivers/closed_loop.rs b/summerset_client/src/drivers/closed_loop.rs index cb361cf5..a0a96e87 100644 --- a/summerset_client/src/drivers/closed_loop.rs +++ b/summerset_client/src/drivers/closed_loop.rs @@ -1,5 +1,7 @@ //! Closed-loop client-side driver implementation. +use crate::drivers::DriverReply; + use tokio::time::{Duration, Instant}; use summerset::{ @@ -83,16 +85,11 @@ impl DriverClosedLoop { } } - /// Send a Get request and wait for its reply. Returns: - /// - `Ok(Some((id, Some(value), latency)))` if successful and key exists - /// - `Ok(Some((id, None, latency)))` if successful and key does not exist - /// - `Ok(None)` if request unsuccessful, e.g., wrong leader or timeout - /// - `Err(err)` if any unexpected error occurs + /// Send a Get request and wait for its reply. 
pub async fn get( &mut self, key: &str, - ) -> Result, Duration)>, SummersetError> - { + ) -> Result { let req_id = self.next_req; self.next_req += 1; @@ -107,18 +104,31 @@ impl DriverClosedLoop { Some(ApiReply::Reply { id: reply_id, result: cmd_result, - .. + redirect, }) => { if reply_id != req_id { logged_err!(self.id; "request ID mismatch: expected {}, replied {}", req_id, reply_id) } else { match cmd_result { - None => Ok(None), + None => { + if let Some(server) = redirect { + Ok(DriverReply::Redirect { server }) + } else { + Ok(DriverReply::Failure) + } + } + Some(CommandResult::Get { value }) => { - let lat = Instant::now().duration_since(issue_ts); - Ok(Some((req_id, value, lat))) + let latency = + Instant::now().duration_since(issue_ts); + Ok(DriverReply::Success { + req_id, + cmd_result: CommandResult::Get { value }, + latency, + }) } + _ => { logged_err!(self.id; "command type mismatch: expected Get") } @@ -126,23 +136,18 @@ impl DriverClosedLoop { } } - None => Ok(None), // timed-out + None => Ok(DriverReply::Timeout), _ => logged_err!(self.id; "unexpected reply type received"), } } - /// Send a Put request and wait for its reply. Returns: - /// - `Ok(Some((id, Some(old_value), latency)))` if successful and key exists - /// - `Ok(Some((id, None, latency)))` if successful and key did not exist - /// - `Ok(None)` if request unsuccessful, e.g., wrong leader or timeout - /// - `Err(err)` if any unexpected error occurs + /// Send a Put request and wait for its reply. pub async fn put( &mut self, key: &str, value: &str, - ) -> Result, Duration)>, SummersetError> - { + ) -> Result { let req_id = self.next_req; self.next_req += 1; @@ -160,18 +165,31 @@ impl DriverClosedLoop { Some(ApiReply::Reply { id: reply_id, result: cmd_result, - .. + redirect, }) => { if reply_id != req_id { logged_err!(self.id; "request ID mismatch: expected {}, replied {}", req_id, reply_id) } else { match cmd_result { - None => Ok(None), + None => { + if let Some(server) = redirect { + Ok(DriverReply::Redirect { server }) + } else { + Ok(DriverReply::Failure) + } + } + Some(CommandResult::Put { old_value }) => { - let lat = Instant::now().duration_since(issue_ts); - Ok(Some((req_id, old_value, lat))) + let latency = + Instant::now().duration_since(issue_ts); + Ok(DriverReply::Success { + req_id, + cmd_result: CommandResult::Put { old_value }, + latency, + }) } + _ => { logged_err!(self.id; "command type mismatch: expected Put") } @@ -179,7 +197,7 @@ impl DriverClosedLoop { } } - None => Ok(None), // timed-out + None => Ok(DriverReply::Timeout), _ => logged_err!(self.id; "unexpected reply type received"), } diff --git a/summerset_client/src/drivers/mod.rs b/summerset_client/src/drivers/mod.rs index 7ad5a9b3..3ed0bc44 100644 --- a/summerset_client/src/drivers/mod.rs +++ b/summerset_client/src/drivers/mod.rs @@ -1,7 +1,33 @@ //! Closed-loop & Open-loop client-side driver implementations. +use tokio::time::Duration; + +use summerset::{ReplicaId, RequestId, CommandResult}; + mod closed_loop; mod open_loop; pub use closed_loop::DriverClosedLoop; pub use open_loop::DriverOpenLoop; + +/// Reply result type, common across the two driver styles. +pub enum DriverReply { + /// Successful reply. + Success { + /// Request ID. + req_id: RequestId, + /// Command result. + cmd_result: CommandResult, + /// Latency duration. + latency: Duration, + }, + + /// Service indicated redirection. + Redirect { server: ReplicaId }, + + /// Unknown failure. + Failure, + + /// Client-side timer timeout. 
+ Timeout, +} diff --git a/summerset_client/src/drivers/open_loop.rs b/summerset_client/src/drivers/open_loop.rs index 057d414f..382091f1 100644 --- a/summerset_client/src/drivers/open_loop.rs +++ b/summerset_client/src/drivers/open_loop.rs @@ -7,12 +7,14 @@ use std::collections::HashMap; +use crate::drivers::DriverReply; + use tokio::time::{Duration, Instant}; use summerset::{ - GenericEndpoint, ClientId, Command, CommandResult, ApiRequest, ApiReply, - RequestId, ClientCtrlStub, Timer, SummersetError, pf_trace, pf_debug, - pf_error, logged_err, + GenericEndpoint, ClientId, Command, ApiRequest, ApiReply, RequestId, + ClientCtrlStub, Timer, SummersetError, pf_trace, pf_debug, pf_error, + logged_err, }; /// Open-loop driver struct. @@ -178,43 +180,44 @@ impl DriverOpenLoop { } } - /// Waits for the next reply. Returns the request ID and: - /// - `Ok(Some((id, cmd_result, latency)))` if request successful - /// - `Ok(None)` if request unsuccessful, e.g., wrong leader or timeout - /// - `Err(err)` if any unexpected error occurs - pub async fn wait_reply( - &mut self, - ) -> Result, SummersetError> - { + /// Waits for the next reply. + pub async fn wait_reply(&mut self) -> Result { let reply = self.recv_reply_with_timeout().await?; match reply { Some(ApiReply::Reply { id: reply_id, result: cmd_result, - .. + redirect, }) => { if !self.pending_reqs.contains_key(&reply_id) { logged_err!(self.id; "request ID {} not in pending set", reply_id) } else { let issue_ts = self.pending_reqs.remove(&reply_id).unwrap(); - let lat = Instant::now().duration_since(issue_ts); + let latency = Instant::now().duration_since(issue_ts); if let Some(res) = cmd_result { - Ok(Some((reply_id, res, lat))) + Ok(DriverReply::Success { + req_id: reply_id, + cmd_result: res, + latency, + }) + } else if let Some(server) = redirect { + Ok(DriverReply::Redirect { server }) } else { - Ok(None) + Ok(DriverReply::Failure) } } } - None => Ok(None), // timed-out + None => Ok(DriverReply::Timeout), _ => logged_err!(self.id; "unexpected reply type received"), } } /// Gets a mutable reference to the endpoint's control stub. + #[allow(dead_code)] pub fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { self.endpoint.ctrl_stub() } From 71abe89bdc0da3f24296f4af85433334be1724be Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 13 Sep 2023 06:08:11 -0500 Subject: [PATCH 41/89] better client driver side API --- summerset_client/src/clients/tester.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index 04b71a5b..4666f309 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -226,7 +226,7 @@ impl ClientTester { CtrlReply::QueryInfo { servers } => { Ok(servers.keys().copied().collect()) } - _ => logged_err!(self.driver.id; ""), + _ => logged_err!(self.driver.id; "unexpected control reply type"), } } From 8658d92be365c896999319eda57c7934489a0b4f Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 13 Sep 2023 11:19:48 -0500 Subject: [PATCH 42/89] add kill procs helper script --- scripts/kill_all_local_procs.sh | 14 ++++++++++++++ scripts/set_tcp_buf_sizes.sh | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) create mode 100755 scripts/kill_all_local_procs.sh diff --git a/scripts/kill_all_local_procs.sh b/scripts/kill_all_local_procs.sh new file mode 100755 index 00000000..a0bdc9db --- /dev/null +++ b/scripts/kill_all_local_procs.sh @@ -0,0 +1,14 @@ +#! 
/bin/bash + +kill_all_matching () { + for pid in $(sudo pgrep -f $1) + do + sudo kill -9 $pid + done +} + +kill_all_matching summerset_server +kill_all_matching summerset_client +kill_all_matching summerset_manager +kill_all_matching local_cluster.py +kill_all_matching local_client.py diff --git a/scripts/set_tcp_buf_sizes.sh b/scripts/set_tcp_buf_sizes.sh index 55d8d0a4..e95ca4e0 100755 --- a/scripts/set_tcp_buf_sizes.sh +++ b/scripts/set_tcp_buf_sizes.sh @@ -1,4 +1,4 @@ -#! /usr/bin/bash +#! /bin/bash echo "Per-socket TCP send/receive buffer:" echo "min default max" From b4db22d801ceae81c93d6abdc949314d1929686d Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 13 Sep 2023 11:43:46 -0500 Subject: [PATCH 43/89] minor updates to tester client --- summerset_client/src/clients/tester.rs | 33 ++++++++++++++++++-------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index 4666f309..607dbfb0 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -106,7 +106,7 @@ impl ClientTester { } /// Issues a Get request and checks its reply value against given one if - /// not `None`. Retries immediately upon getting redirection error. + /// not `None`. Retries in-place upon getting redirection error. async fn checked_get( &mut self, key: &str, @@ -142,7 +142,10 @@ impl ClientTester { ); } - DriverReply::Redirect { .. } => {} // re-issue immediately + DriverReply::Redirect { .. } => { + time::sleep(Duration::from_millis(500)).await; + // retry + } DriverReply::Timeout => { return logged_err!( @@ -156,7 +159,7 @@ impl ClientTester { } /// Issues a Put request and checks its reply old_value against given one - /// if not `None`. Retries immediately upon getting redirection error. + /// if not `None`. Retries in-place upon getting redirection error. async fn checked_put( &mut self, key: &str, @@ -194,7 +197,10 @@ impl ClientTester { ); } - DriverReply::Redirect { .. } => {} // re-issue immediately + DriverReply::Redirect { .. } => { + time::sleep(Duration::from_millis(500)).await; + // retry + } DriverReply::Timeout => { return logged_err!( @@ -359,7 +365,7 @@ impl ClientTester { for s in self.query_servers().await? 
{ self.driver.leave(false).await?; self.reset_servers(HashSet::from([s]), true).await?; - time::sleep(Duration::from_millis(100)).await; + time::sleep(Duration::from_millis(500)).await; self.driver.connect().await?; self.checked_get("Jose", Some(Some(&v))).await?; } @@ -370,11 +376,18 @@ impl ClientTester { async fn test_two_nodes_reset(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); self.checked_put("Jose", &v, Some(None)).await?; - self.driver.leave(false).await?; - self.reset_servers(HashSet::from([0, 1]), true).await?; - time::sleep(Duration::from_millis(100)).await; - self.driver.connect().await?; - self.checked_get("Jose", Some(Some(&v))).await?; + let servers = self.query_servers().await?; + for &s in &servers { + self.driver.leave(false).await?; + self.reset_servers( + HashSet::from([s, (s + 1) % (servers.len() as u8)]), + true, + ) + .await?; + time::sleep(Duration::from_millis(500)).await; + self.driver.connect().await?; + self.checked_get("Jose", Some(Some(&v))).await?; + } Ok(()) } } From 1a82a7f276c21001a85c837538b9f518331acc50 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 14 Sep 2023 14:50:45 -0500 Subject: [PATCH 44/89] add autonomous leader timeouts and step-up --- README.md | 4 +- src/client/apistub.rs | 1 - src/client/ctrlstub.rs | 1 - src/manager/reactor.rs | 4 +- src/manager/reigner.rs | 4 +- src/protocols/crossword.rs | 304 +++++++++++++++++----- src/protocols/multipaxos.rs | 122 ++++----- src/protocols/rep_nothing.rs | 51 ++-- src/protocols/rs_paxos.rs | 304 +++++++++++++++++----- src/protocols/simple_push.rs | 51 ++-- src/server/external.rs | 4 +- src/server/transport.rs | 4 +- summerset_client/src/drivers/open_loop.rs | 18 +- 13 files changed, 585 insertions(+), 287 deletions(-) diff --git a/README.md b/README.md index 16944f3a..0d008572 100644 --- a/README.md +++ b/README.md @@ -148,13 +148,15 @@ Complete cluster management and benchmarking scripts are available in another re - [x] implementation of MultiPaxos - [x] client-side timeout/retry logic - [x] state persistence & restart check - - [ ] automatic leader election, backoffs + - [x] automatic leader election, backoffs - [ ] snapshotting & garbage collection - [ ] specialize read-only commands? - [ ] separate commit vs. exec responses? 
- [ ] membership discovery & view changes - [ ] implementation of Raft - [ ] implementation of Crossword prototype + - [ ] fault recovery reads + - [ ] follower gossiping - [x] client-side utilities - [x] REPL-style client - [x] random benchmarking client diff --git a/src/client/apistub.rs b/src/client/apistub.rs index ea0bb14f..54fc1604 100644 --- a/src/client/apistub.rs +++ b/src/client/apistub.rs @@ -40,7 +40,6 @@ impl ClientApiStub { id: ClientId, addr: SocketAddr, ) -> Result { - pf_info!(id; "connecting to server '{}'...", addr); let mut stream = tcp_connect_with_retry(addr, 10).await?; stream.write_u64(id).await?; // send my client ID let (read_half, write_half) = stream.into_split(); diff --git a/src/client/ctrlstub.rs b/src/client/ctrlstub.rs index e1a28e75..f1e79481 100644 --- a/src/client/ctrlstub.rs +++ b/src/client/ctrlstub.rs @@ -38,7 +38,6 @@ impl ClientCtrlStub { pub async fn new_by_connect( manager: SocketAddr, ) -> Result { - pf_info!("c"; "connecting to manager '{}'...", manager); let mut stream = TcpStream::connect(manager).await?; let id = stream.read_u64().await?; // receive my client ID let (read_half, write_half) = stream.into_split(); diff --git a/src/manager/reactor.rs b/src/manager/reactor.rs index e1e388c3..1942f591 100644 --- a/src/manager/reactor.rs +++ b/src/manager/reactor.rs @@ -324,7 +324,7 @@ impl ClientReactor { mut rx_reply: mpsc::UnboundedReceiver, tx_exit: mpsc::UnboundedSender, ) { - pf_debug!("m"; "client_responder thread for {} ({}) spawned", id, addr); + pf_debug!("m"; "client_responder thread for {} '{}' spawned", id, addr); let (mut conn_read, conn_write) = conn.into_split(); let mut req_buf = BytesMut::with_capacity(8 + 1024); @@ -419,7 +419,7 @@ impl ClientReactor { if let Err(e) = tx_exit.send(id) { pf_error!("m"; "error sending exit signal for {}: {}", id, e); } - pf_debug!("m"; "client_responder thread for {} ({}) exitted", id, addr); + pf_debug!("m"; "client_responder thread for {} '{}' exitted", id, addr); } } diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index 02e4e4c3..b28b9262 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -333,7 +333,7 @@ impl ServerReigner { mut rx_send: mpsc::UnboundedReceiver, tx_exit: mpsc::UnboundedSender, ) { - pf_debug!("m"; "server_controller thread for {} ({}) spawned", id, addr); + pf_debug!("m"; "server_controller thread for {} '{}' spawned", id, addr); let (mut conn_read, conn_write) = conn.into_split(); let mut read_buf = BytesMut::new(); @@ -451,7 +451,7 @@ impl ServerReigner { if let Err(e) = tx_exit.send(id) { pf_error!("m"; "error sending exit signal for {}: {}", id, e); } - pf_debug!("m"; "server_controller thread for {} ({}) exitted", id, addr); + pf_debug!("m"; "server_controller thread for {} '{}' exitted", id, addr); } } diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 9ef9b2b5..1140a69c 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -7,7 +7,7 @@ use std::collections::HashMap; use std::path::Path; use std::net::SocketAddr; -use crate::utils::{SummersetError, Bitmap, RSCodeword}; +use crate::utils::{SummersetError, Bitmap, Timer, RSCodeword}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, @@ -17,13 +17,15 @@ use crate::server::{ use crate::client::{ClientId, ClientApiStub, ClientCtrlStub, GenericEndpoint}; use crate::protocols::SmrProtocol; +use rand::prelude::*; + use async_trait::async_trait; use 
get_size::GetSize; use serde::{Serialize, Deserialize}; -use tokio::time::Duration; +use tokio::time::{self, Duration, Interval, MissedTickBehavior}; use tokio::sync::watch; use reed_solomon_erasure::galois_8::ReedSolomon; @@ -43,6 +45,15 @@ pub struct ReplicaConfigCrossword { /// Whether to call `fsync()`/`fdatasync()` on logger. pub logger_sync: bool, + /// Min timeout of not hearing any heartbeat from leader in millisecs. + pub hb_hear_timeout_min: u64, + + /// Max timeout of not hearing any heartbeat from leader in millisecs. + pub hb_hear_timeout_max: u64, + + /// Interval of leader sending heartbeats to followers. + pub hb_send_interval_ms: u64, + /// Fault-tolerance level. pub fault_tolerance: u8, @@ -65,6 +76,9 @@ impl Default for ReplicaConfigCrossword { max_batch_size: 5000, backer_path: "/tmp/summerset.rs_paxos.wal".into(), logger_sync: false, + hb_hear_timeout_min: 300, + hb_hear_timeout_max: 600, + hb_send_interval_ms: 50, fault_tolerance: 0, shards_per_replica: 1, perf_storage_a: 0, @@ -131,6 +145,9 @@ struct Instance { /// Follower-side bookkeeping info. replica_bk: Option, + + /// True if from external client, else false. + external: bool, } /// Stable storage log entry type. @@ -177,6 +194,9 @@ enum PeerMsg { /// Commit notification from leader to replicas. Commit { slot: usize }, + + /// Leader activity heartbeat. + Heartbeat { ballot: Ballot }, } /// Crossword server replica module. @@ -214,6 +234,12 @@ pub struct CrosswordReplica { /// TransportHub module. transport_hub: TransportHub, + /// Timer for hearing heartbeat from leader. + hb_hear_timer: Timer, + + /// Interval for sending heartbeat to followers. + hb_send_interval: Interval, + /// Do I think I am the leader? is_leader: bool, @@ -385,7 +411,6 @@ impl CrosswordReplica { // create a new instance in the first null slot (or append a new one // at the end if no holes exist) - // TODO: maybe use a null_idx variable to better keep track of this let mut slot = self.insts.len(); for s in self.commit_bar..self.insts.len() { if self.insts[s].status == Status::Null { @@ -413,14 +438,12 @@ impl CrosswordReplica { accept_acks: HashMap::new(), }), replica_bk: None, + external: true, }; self.insts.push(new_inst); } // decide whether we can enter fast path for this instance - // TODO: remember to reset bal_prepared to 0, update bal_max_seen, - // and re-handle all Preparing & Accepting instances in autonomous - // Prepare initiation if self.bal_prepared == 0 { // slow case: Prepare phase not done yet. Initiate a Prepare round // if none is on the fly, or just wait for some Prepare reply to @@ -702,6 +725,7 @@ impl CrosswordReplica { )?, leader_bk: None, replica_bk: None, + external: false, }); } let inst = &mut self.insts[slot]; @@ -875,6 +899,7 @@ impl CrosswordReplica { )?, leader_bk: None, replica_bk: None, + external: false, }); } let inst = &mut self.insts[slot]; @@ -1002,6 +1027,7 @@ impl CrosswordReplica { )?, leader_bk: None, replica_bk: None, + external: false, }); } let inst = &mut self.insts[slot]; @@ -1054,6 +1080,7 @@ impl CrosswordReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), + PeerMsg::Heartbeat { ballot } => self.heard_heartbeat(peer, ballot), } } @@ -1075,7 +1102,7 @@ impl CrosswordReplica { // reply command result back to client if let ApiRequest::Req { id: req_id, .. 
} = req {
-                if self.external_api.has_client(client) {
+                if inst.external && self.external_api.has_client(client) {
                     self.external_api.send_reply(
                         ApiReply::Reply {
                             id: *req_id,
                             result: Some(cmd_result),
                             redirect: None,
                         },
                         client,
                     )?;
                     pf_trace!(self.id; "replied -> client {} for slot {} idx {}",
                               client, slot, cmd_idx);
                 }
             }
         }
         Ok(())
     }
+    /// Becomes a leader, sends self-initiated Prepare messages to followers
+    /// for all in-progress instances, and starts broadcasting heartbeats.
+    fn become_a_leader(&mut self) -> Result<(), SummersetError> {
+        assert!(!self.is_leader);
+        self.is_leader = true; // this starts broadcasting heartbeats
+        pf_warn!(self.id; "becoming a leader...");
+
+        // broadcast a heartbeat right now
+        self.bcast_heartbeats()?;
+
+        // make a greater ballot number and invalidate all in-progress instances
+        self.bal_prepared = 0;
+        self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen);
+        self.bal_max_seen = self.bal_prep_sent;
+
+        // redo Prepare phase for all in-progress instances
+        for (slot, inst) in self.insts.iter_mut().enumerate() {
+            if inst.status < Status::Committed {
+                inst.bal = self.bal_prep_sent;
+                inst.status = Status::Preparing;
+                pf_debug!(self.id; "enter Prepare phase for slot {} bal {}",
+                                   slot, inst.bal);
+
+                // record update to largest prepare ballot
+                self.storage_hub.submit_action(
+                    Self::make_log_action_id(slot, Status::Preparing),
+                    LogAction::Append {
+                        entry: LogEntry::PrepareBal {
+                            slot,
+                            ballot: self.bal_prep_sent,
+                        },
+                        sync: self.config.logger_sync,
+                    },
+                )?;
+                pf_trace!(self.id; "submitted PrepareBal log action for slot {} bal {}",
+                                   slot, inst.bal);
+
+                // send Prepare messages to all peers
+                self.transport_hub.bcast_msg(
+                    PeerMsg::Prepare {
+                        slot,
+                        ballot: self.bal_prep_sent,
+                    },
+                    None,
+                )?;
+                pf_trace!(self.id; "broadcast Prepare messages for slot {} bal {}",
+                                   slot, inst.bal);
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Broadcasts heartbeats to all replicas.
+    fn bcast_heartbeats(&mut self) -> Result<(), SummersetError> {
+        self.transport_hub.bcast_msg(
+            PeerMsg::Heartbeat {
+                ballot: self.bal_prep_sent,
+            },
+            None,
+        )?;
+        self.heard_heartbeat(self.id, self.bal_prep_sent)?;
+
+        // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent);
+        Ok(())
+    }
+
+    /// Chooses a random hb_hear_timeout from the min-max range and kicks off
+    /// the hb_hear_timer.
+    fn kickoff_hb_hear_timer(&mut self) -> Result<(), SummersetError> {
+        let timeout_ms = thread_rng().gen_range(
+            self.config.hb_hear_timeout_min..=self.config.hb_hear_timeout_max,
+        );
+
+        // pf_trace!(self.id; "kickoff hb_hear_timer @ {} ms", timeout_ms);
+        self.hb_hear_timer
+            .kickoff(Duration::from_millis(timeout_ms))?;
+        Ok(())
+    }
+
+    /// Heard a heartbeat from some other replica. If the heartbeat carries a
+    /// high enough ballot number, refreshes my hearing timer and clears my
+    /// leader status if I currently think I'm a leader.
+    fn heard_heartbeat(
+        &mut self,
+        _peer: ReplicaId,
+        ballot: Ballot,
+    ) -> Result<(), SummersetError> {
+        // ignore outdated heartbeat
+        if ballot < self.bal_max_seen {
+            return Ok(());
+        }
+
+        // reset hearing timer
+        self.kickoff_hb_hear_timer()?;
+
+        // clear my leader status if it carries a higher ballot number
+        if self.is_leader && ballot > self.bal_max_seen {
+            self.is_leader = false;
+        }
+
+        // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot);
+        Ok(())
+    }
+
     /// Handler of ResetState control message.
async fn handle_ctrl_reset_state( &mut self, @@ -1187,6 +1319,7 @@ impl CrosswordReplica { )?, leader_bk: None, replica_bk: None, + external: false, }); } // update instance state @@ -1219,6 +1352,7 @@ impl CrosswordReplica { )?, leader_bk: None, replica_bk: None, + external: false, }); } // update instance state @@ -1347,6 +1481,8 @@ impl GenericReplica for CrosswordReplica { let config = parsed_config!(config_str => ReplicaConfigCrossword; batch_interval_us, max_batch_size, backer_path, logger_sync, + hb_hear_timeout_min, hb_hear_timeout_max, + hb_send_interval_ms, fault_tolerance, shards_per_replica, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; @@ -1357,6 +1493,27 @@ impl GenericReplica for CrosswordReplica { config.batch_interval_us ); } + if config.hb_hear_timeout_min < 100 { + return logged_err!( + id; + "invalid config.hb_hear_timeout_min '{}'", + config.hb_hear_timeout_min + ); + } + if config.hb_hear_timeout_max < config.hb_hear_timeout_min + 100 { + return logged_err!( + id; + "invalid config.hb_hear_timeout_max '{}'", + config.hb_hear_timeout_max + ); + } + if config.hb_send_interval_ms == 0 { + return logged_err!( + id; + "invalid config.hb_send_interval_ms '{}'", + config.hb_send_interval_ms + ); + } // setup state machine module let state_machine = StateMachine::new_and_setup(id).await?; @@ -1437,6 +1594,10 @@ impl GenericReplica for CrosswordReplica { ) .await?; + let mut hb_send_interval = + time::interval(Duration::from_millis(config.hb_send_interval_ms)); + hb_send_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + Ok(CrosswordReplica { id, population, @@ -1449,6 +1610,8 @@ impl GenericReplica for CrosswordReplica { state_machine, storage_hub, transport_hub, + hb_hear_timer: Timer::new(), + hb_send_interval, is_leader: false, insts: vec![], bal_prep_sent: 0, @@ -1468,10 +1631,8 @@ impl GenericReplica for CrosswordReplica { // recover state from durable storage log self.recover_from_log().await?; - // TODO: proper leader election - if self.id == 0 { - self.is_leader = true; - } + // kick off leader activity hearing timer + self.kickoff_hb_hear_timer()?; // main event loop loop { @@ -1525,6 +1686,16 @@ impl GenericReplica for CrosswordReplica { } }, + // leader inactivity timeout + _ = self.hb_hear_timer.timeout() => { + self.become_a_leader()?; + }, + + // leader sending heartbeat + _ = self.hb_send_interval.tick(), if self.is_leader => { + self.bcast_heartbeats()?; + } + // manager control message ctrl_msg = self.control_hub.recv_ctrl() => { if let Err(e) = ctrl_msg { @@ -1584,17 +1755,17 @@ pub struct CrosswordClient { /// Configuration parameters struct. _config: ClientConfigCrossword, - /// Cached list of active servers information. + /// List of active servers information. servers: HashMap, - /// Current server ID to connect to. + /// Current server ID to talk to. server_id: ReplicaId, /// Control API stub to the cluster manager. ctrl_stub: ClientCtrlStub, /// API stubs for communicating with servers. 
- api_stub: Option, + api_stubs: HashMap, } #[async_trait] @@ -1604,6 +1775,7 @@ impl GenericEndpoint for CrosswordClient { config_str: Option<&str>, ) -> Result { // connect to the cluster manager and get assigned a client ID + pf_info!("c"; "connecting to manager '{}'...", manager); let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; let id = ctrl_stub.id; @@ -1618,13 +1790,13 @@ impl GenericEndpoint for CrosswordClient { servers: HashMap::new(), server_id: init_server_id, ctrl_stub, - api_stub: None, + api_stubs: HashMap::new(), }) } async fn connect(&mut self) -> Result<(), SummersetError> { // disallow reconnection without leaving - if self.api_stub.is_some() { + if !self.api_stubs.is_empty() { return logged_err!(self.id; "reconnecting without leaving"); } @@ -1638,13 +1810,13 @@ impl GenericEndpoint for CrosswordClient { let reply = self.ctrl_stub.recv_reply().await?; match reply { CtrlReply::QueryInfo { servers } => { - // connect to the one with server ID in config - let api_stub = ClientApiStub::new_by_connect( - self.id, - servers[&self.server_id], - ) - .await?; - self.api_stub = Some(api_stub); + // establish connection to all servers + for (&id, &server) in &servers { + pf_info!(self.id; "connecting to server {} '{}'...", id, server); + let api_stub = + ClientApiStub::new_by_connect(self.id, server).await?; + self.api_stubs.insert(id, api_stub); + } self.servers = servers; Ok(()) } @@ -1653,23 +1825,16 @@ impl GenericEndpoint for CrosswordClient { } async fn leave(&mut self, permanent: bool) -> Result<(), SummersetError> { - // send leave notification to current connected server - if let Some(mut api_stub) = self.api_stub.take() { + // send leave notification to all servers + for (id, mut api_stub) in self.api_stubs.drain() { let mut sent = api_stub.send_req(Some(&ApiRequest::Leave))?; while !sent { sent = api_stub.send_req(None)?; } - let reply = api_stub.recv_reply().await?; - match reply { - ApiReply::Leave => { - pf_info!(self.id; "left current server connection"); - api_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } - } + while api_stub.recv_reply().await? != ApiReply::Leave {} + pf_info!(self.id; "left server connection {}", id); + api_stub.forget(); } // if permanently leaving, send leave notification to the manager @@ -1680,15 +1845,8 @@ impl GenericEndpoint for CrosswordClient { sent = self.ctrl_stub.send_req(None)?; } - let reply = self.ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } - } + while self.ctrl_stub.recv_reply().await? != CtrlReply::Leave {} + pf_info!(self.id; "left current manager connection"); } Ok(()) @@ -1698,38 +1856,44 @@ impl GenericEndpoint for CrosswordClient { &mut self, req: Option<&ApiRequest>, ) -> Result { - match self.api_stub { - Some(ref mut api_stub) => api_stub.send_req(req), - None => logged_err!(self.id; "client is not set up"), + if self.api_stubs.contains_key(&self.server_id) { + self.api_stubs + .get_mut(&self.server_id) + .unwrap() + .send_req(req) + } else { + Err(SummersetError("client not set up".into())) } } async fn recv_reply(&mut self) -> Result { - match self.api_stub { - Some(ref mut api_stub) => { - let reply = api_stub.recv_reply().await?; - - if let ApiReply::Reply { - ref result, - ref redirect, - .. 
- } = reply - { - // if the current server redirects me to a different server - if result.is_none() && redirect.is_some() { - let redirect_id = redirect.unwrap(); - assert!(self.servers.contains_key(&redirect_id)); - self.leave(false).await?; - self.server_id = redirect_id; - self.connect().await?; - pf_debug!(self.id; "redirected to replica {} '{}'", - redirect_id, self.servers[&redirect_id]); - } - } + if self.api_stubs.contains_key(&self.server_id) { + let reply = self + .api_stubs + .get_mut(&self.server_id) + .unwrap() + .recv_reply() + .await?; - Ok(reply) + if let ApiReply::Reply { + ref result, + ref redirect, + .. + } = reply + { + // if the current server redirects me to a different server + if result.is_none() && redirect.is_some() { + let redirect_id = redirect.unwrap(); + assert!(self.servers.contains_key(&redirect_id)); + self.server_id = redirect_id; + pf_debug!(self.id; "redirected to replica {} '{}'", + redirect_id, self.servers[&redirect_id]); + } } - None => logged_err!(self.id; "client is not set up"), + + Ok(reply) + } else { + Err(SummersetError("client not set up".into())) } } diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 4690c08d..130a2f45 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -137,6 +137,9 @@ struct Instance { /// Follower-side bookkeeping info. replica_bk: Option, + + /// True if from external client, else false. + external: bool, } /// Stable storage log entry type. @@ -362,6 +365,7 @@ impl MultiPaxosReplica { accept_acks: Bitmap::new(self.population, false), }), replica_bk: None, + external: true, }; self.insts.push(new_inst); } @@ -607,6 +611,7 @@ impl MultiPaxosReplica { reqs: Vec::new(), leader_bk: None, replica_bk: None, + external: false, }); } let inst = &mut self.insts[slot]; @@ -736,6 +741,7 @@ impl MultiPaxosReplica { reqs: Vec::new(), leader_bk: None, replica_bk: None, + external: false, }); } let inst = &mut self.insts[slot]; @@ -839,6 +845,7 @@ impl MultiPaxosReplica { reqs: Vec::new(), leader_bk: None, replica_bk: None, + external: false, }); } let inst = &mut self.insts[slot]; @@ -910,7 +917,7 @@ impl MultiPaxosReplica { // reply command result back to client if let ApiRequest::Req { id: req_id, .. } = req { - if self.external_api.has_client(client) { + if inst.external && self.external_api.has_client(client) { self.external_api.send_reply( ApiReply::Reply { id: *req_id, @@ -1124,6 +1131,7 @@ impl MultiPaxosReplica { reqs: Vec::new(), leader_bk: None, replica_bk: None, + external: false, }); } // update instance state @@ -1149,6 +1157,7 @@ impl MultiPaxosReplica { reqs: Vec::new(), leader_bk: None, replica_bk: None, + external: false, }); } // update instance state @@ -1520,17 +1529,17 @@ pub struct MultiPaxosClient { /// Configuration parameters struct. _config: ClientConfigMultiPaxos, - /// Cached list of active servers information. + /// List of active servers information. servers: HashMap, - /// Current server ID to connect to. + /// Current server ID to talk to. server_id: ReplicaId, /// Control API stub to the cluster manager. ctrl_stub: ClientCtrlStub, /// API stubs for communicating with servers. 
- api_stub: Option, + api_stubs: HashMap, } #[async_trait] @@ -1540,6 +1549,7 @@ impl GenericEndpoint for MultiPaxosClient { config_str: Option<&str>, ) -> Result { // connect to the cluster manager and get assigned a client ID + pf_info!("c"; "connecting to manager '{}'...", manager); let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; let id = ctrl_stub.id; @@ -1554,13 +1564,13 @@ impl GenericEndpoint for MultiPaxosClient { servers: HashMap::new(), server_id: init_server_id, ctrl_stub, - api_stub: None, + api_stubs: HashMap::new(), }) } async fn connect(&mut self) -> Result<(), SummersetError> { // disallow reconnection without leaving - if self.api_stub.is_some() { + if !self.api_stubs.is_empty() { return logged_err!(self.id; "reconnecting without leaving"); } @@ -1574,13 +1584,13 @@ impl GenericEndpoint for MultiPaxosClient { let reply = self.ctrl_stub.recv_reply().await?; match reply { CtrlReply::QueryInfo { servers } => { - // connect to the one with server ID in config - let api_stub = ClientApiStub::new_by_connect( - self.id, - servers[&self.server_id], - ) - .await?; - self.api_stub = Some(api_stub); + // establish connection to all servers + for (&id, &server) in &servers { + pf_info!(self.id; "connecting to server {} '{}'...", id, server); + let api_stub = + ClientApiStub::new_by_connect(self.id, server).await?; + self.api_stubs.insert(id, api_stub); + } self.servers = servers; Ok(()) } @@ -1589,23 +1599,16 @@ impl GenericEndpoint for MultiPaxosClient { } async fn leave(&mut self, permanent: bool) -> Result<(), SummersetError> { - // send leave notification to current connected server - if let Some(mut api_stub) = self.api_stub.take() { + // send leave notification to all servers + for (id, mut api_stub) in self.api_stubs.drain() { let mut sent = api_stub.send_req(Some(&ApiRequest::Leave))?; while !sent { sent = api_stub.send_req(None)?; } - let reply = api_stub.recv_reply().await?; - match reply { - ApiReply::Leave => { - pf_info!(self.id; "left current server connection"); - api_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } - } + while api_stub.recv_reply().await? != ApiReply::Leave {} + pf_info!(self.id; "left server connection {}", id); + api_stub.forget(); } // if permanently leaving, send leave notification to the manager @@ -1616,15 +1619,8 @@ impl GenericEndpoint for MultiPaxosClient { sent = self.ctrl_stub.send_req(None)?; } - let reply = self.ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } - } + while self.ctrl_stub.recv_reply().await? != CtrlReply::Leave {} + pf_info!(self.id; "left manager connection"); } Ok(()) @@ -1634,38 +1630,44 @@ impl GenericEndpoint for MultiPaxosClient { &mut self, req: Option<&ApiRequest>, ) -> Result { - match self.api_stub { - Some(ref mut api_stub) => api_stub.send_req(req), - None => logged_err!(self.id; "client is not set up"), + if self.api_stubs.contains_key(&self.server_id) { + self.api_stubs + .get_mut(&self.server_id) + .unwrap() + .send_req(req) + } else { + Err(SummersetError("client not set up".into())) } } async fn recv_reply(&mut self) -> Result { - match self.api_stub { - Some(ref mut api_stub) => { - let reply = api_stub.recv_reply().await?; - - if let ApiReply::Reply { - ref result, - ref redirect, - .. 
- } = reply - { - // if the current server redirects me to a different server - if result.is_none() && redirect.is_some() { - let redirect_id = redirect.unwrap(); - assert!(self.servers.contains_key(&redirect_id)); - self.leave(false).await?; - self.server_id = redirect_id; - self.connect().await?; - pf_debug!(self.id; "redirected to replica {} '{}'", - redirect_id, self.servers[&redirect_id]); - } - } + if self.api_stubs.contains_key(&self.server_id) { + let reply = self + .api_stubs + .get_mut(&self.server_id) + .unwrap() + .recv_reply() + .await?; - Ok(reply) + if let ApiReply::Reply { + ref result, + ref redirect, + .. + } = reply + { + // if the current server redirects me to a different server + if result.is_none() && redirect.is_some() { + let redirect_id = redirect.unwrap(); + assert!(self.servers.contains_key(&redirect_id)); + self.server_id = redirect_id; + pf_debug!(self.id; "redirected to replica {} '{}'", + redirect_id, self.servers[&redirect_id]); + } } - None => logged_err!(self.id; "client is not set up"), + + Ok(reply) + } else { + Err(SummersetError("client not set up".into())) } } diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index 6475de8d..643cdf7a 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -218,14 +218,16 @@ impl RepNothingReplica { let (client, req) = &inst.reqs[cmd_idx]; match req { ApiRequest::Req { id: req_id, .. } => { - self.external_api.send_reply( - ApiReply::Reply { - id: *req_id, - result: Some(cmd_result), - redirect: None, - }, - *client, - )?; + if self.external_api.has_client(*client) { + self.external_api.send_reply( + ApiReply::Reply { + id: *req_id, + result: Some(cmd_result), + redirect: None, + }, + *client, + )?; + } } _ => { return logged_err!(self.id; "unknown request type at {}|{}", inst_idx, cmd_idx) @@ -535,7 +537,7 @@ pub struct RepNothingClient { /// Control API stub to the cluster manager. ctrl_stub: ClientCtrlStub, - /// API stubs for communicating with servers. + /// API stub for communicating with the current server. api_stub: Option, } @@ -546,6 +548,7 @@ impl GenericEndpoint for RepNothingClient { config_str: Option<&str>, ) -> Result { // connect to the cluster manager and get assigned a client ID + pf_info!("c"; "connecting to manager '{}'...", manager); let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; let id = ctrl_stub.id; @@ -578,6 +581,8 @@ impl GenericEndpoint for RepNothingClient { match reply { CtrlReply::QueryInfo { servers } => { // connect to the one with server ID in config + pf_info!(self.id; "connecting to server {} '{}'...", + self.config.server_id, servers[&self.config.server_id]); let api_stub = ClientApiStub::new_by_connect( self.id, servers[&self.config.server_id], @@ -598,16 +603,9 @@ impl GenericEndpoint for RepNothingClient { sent = api_stub.send_req(None)?; } - let reply = api_stub.recv_reply().await?; - match reply { - ApiReply::Leave => { - pf_info!(self.id; "left current server connection"); - api_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } - } + while api_stub.recv_reply().await? 
!= ApiReply::Leave {} + pf_info!(self.id; "left current server connection"); + api_stub.forget(); } // if permanently leaving, send leave notification to the manager @@ -618,15 +616,8 @@ impl GenericEndpoint for RepNothingClient { sent = self.ctrl_stub.send_req(None)?; } - let reply = self.ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } - } + while self.ctrl_stub.recv_reply().await? != CtrlReply::Leave {} + pf_info!(self.id; "left manager connection"); } Ok(()) @@ -638,14 +629,14 @@ impl GenericEndpoint for RepNothingClient { ) -> Result { match self.api_stub { Some(ref mut api_stub) => api_stub.send_req(req), - None => logged_err!(self.id; "client is not set up"), + None => Err(SummersetError("client not set up".into())), } } async fn recv_reply(&mut self) -> Result { match self.api_stub { Some(ref mut api_stub) => api_stub.recv_reply().await, - None => logged_err!(self.id; "client is not set up"), + None => Err(SummersetError("client not set up".into())), } } diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index ed66f6ea..172c9771 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -7,7 +7,7 @@ use std::collections::HashMap; use std::path::Path; use std::net::SocketAddr; -use crate::utils::{SummersetError, Bitmap, RSCodeword}; +use crate::utils::{SummersetError, Bitmap, Timer, RSCodeword}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, @@ -17,13 +17,15 @@ use crate::server::{ use crate::client::{ClientId, ClientApiStub, ClientCtrlStub, GenericEndpoint}; use crate::protocols::SmrProtocol; +use rand::prelude::*; + use async_trait::async_trait; use get_size::GetSize; use serde::{Serialize, Deserialize}; -use tokio::time::Duration; +use tokio::time::{self, Duration, Interval, MissedTickBehavior}; use tokio::sync::watch; use reed_solomon_erasure::galois_8::ReedSolomon; @@ -43,6 +45,15 @@ pub struct ReplicaConfigRSPaxos { /// Whether to call `fsync()`/`fdatasync()` on logger. pub logger_sync: bool, + /// Min timeout of not hearing any heartbeat from leader in millisecs. + pub hb_hear_timeout_min: u64, + + /// Max timeout of not hearing any heartbeat from leader in millisecs. + pub hb_hear_timeout_max: u64, + + /// Interval of leader sending heartbeats to followers. + pub hb_send_interval_ms: u64, + /// Fault-tolerance level. pub fault_tolerance: u8, @@ -61,6 +72,9 @@ impl Default for ReplicaConfigRSPaxos { max_batch_size: 5000, backer_path: "/tmp/summerset.rs_paxos.wal".into(), logger_sync: false, + hb_hear_timeout_min: 300, + hb_hear_timeout_max: 600, + hb_send_interval_ms: 50, fault_tolerance: 0, perf_storage_a: 0, perf_storage_b: 0, @@ -125,6 +139,9 @@ struct Instance { /// Follower-side bookkeeping info. replica_bk: Option, + + /// True if from external client, else false. + external: bool, } /// Stable storage log entry type. @@ -171,6 +188,9 @@ enum PeerMsg { /// Commit notification from leader to replicas. Commit { slot: usize }, + + /// Leader activity heartbeat. + Heartbeat { ballot: Ballot }, } /// RSPaxos server replica module. @@ -208,6 +228,12 @@ pub struct RSPaxosReplica { /// TransportHub module. transport_hub: TransportHub, + /// Timer for hearing heartbeat from leader. + hb_hear_timer: Timer, + + /// Interval for sending heartbeat to followers. 
+ hb_send_interval: Interval, + /// Do I think I am the leader? is_leader: bool, @@ -327,7 +353,6 @@ impl RSPaxosReplica { // create a new instance in the first null slot (or append a new one // at the end if no holes exist) - // TODO: maybe use a null_idx variable to better keep track of this let mut slot = self.insts.len(); for s in self.commit_bar..self.insts.len() { if self.insts[s].status == Status::Null { @@ -355,14 +380,12 @@ impl RSPaxosReplica { accept_acks: Bitmap::new(self.population, false), }), replica_bk: None, + external: true, }; self.insts.push(new_inst); } // decide whether we can enter fast path for this instance - // TODO: remember to reset bal_prepared to 0, update bal_max_seen, - // and re-handle all Preparing & Accepting instances in autonomous - // Prepare initiation if self.bal_prepared == 0 { // slow case: Prepare phase not done yet. Initiate a Prepare round // if none is on the fly, or just wait for some Prepare reply to @@ -629,6 +652,7 @@ impl RSPaxosReplica { )?, leader_bk: None, replica_bk: None, + external: false, }); } let inst = &mut self.insts[slot]; @@ -788,6 +812,7 @@ impl RSPaxosReplica { )?, leader_bk: None, replica_bk: None, + external: false, }); } let inst = &mut self.insts[slot]; @@ -902,6 +927,7 @@ impl RSPaxosReplica { )?, leader_bk: None, replica_bk: None, + external: false, }); } let inst = &mut self.insts[slot]; @@ -954,6 +980,7 @@ impl RSPaxosReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), + PeerMsg::Heartbeat { ballot } => self.heard_heartbeat(peer, ballot), } } @@ -975,7 +1002,7 @@ impl RSPaxosReplica { // reply command result back to client if let ApiRequest::Req { id: req_id, .. } = req { - if self.external_api.has_client(client) { + if inst.external && self.external_api.has_client(client) { self.external_api.send_reply( ApiReply::Reply { id: *req_id, @@ -1013,6 +1040,111 @@ impl RSPaxosReplica { Ok(()) } + /// Becomes a leader, sends self-initiated Prepare messages to followers + /// for all in-progress instances, and starts broadcasting heartbeats. 
+ fn become_a_leader(&mut self) -> Result<(), SummersetError> {
+        assert!(!self.is_leader);
+        self.is_leader = true; // this starts broadcasting heartbeats
+        pf_warn!(self.id; "becoming a leader...");
+
+        // broadcast a heartbeat right now
+        self.bcast_heartbeats()?;
+
+        // make a greater ballot number and invalidate all in-progress instances
+        self.bal_prepared = 0;
+        self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen);
+        self.bal_max_seen = self.bal_prep_sent;
+
+        // redo Prepare phase for all in-progress instances
+        for (slot, inst) in self.insts.iter_mut().enumerate() {
+            if inst.status < Status::Committed {
+                inst.bal = self.bal_prep_sent;
+                inst.status = Status::Preparing;
+                pf_debug!(self.id; "enter Prepare phase for slot {} bal {}",
+                                   slot, inst.bal);
+
+                // record update to largest prepare ballot
+                self.storage_hub.submit_action(
+                    Self::make_log_action_id(slot, Status::Preparing),
+                    LogAction::Append {
+                        entry: LogEntry::PrepareBal {
+                            slot,
+                            ballot: self.bal_prep_sent,
+                        },
+                        sync: self.config.logger_sync,
+                    },
+                )?;
+                pf_trace!(self.id; "submitted PrepareBal log action for slot {} bal {}",
+                                   slot, inst.bal);
+
+                // send Prepare messages to all peers
+                self.transport_hub.bcast_msg(
+                    PeerMsg::Prepare {
+                        slot,
+                        ballot: self.bal_prep_sent,
+                    },
+                    None,
+                )?;
+                pf_trace!(self.id; "broadcast Prepare messages for slot {} bal {}",
+                                   slot, inst.bal);
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Broadcasts heartbeats to all replicas.
+    fn bcast_heartbeats(&mut self) -> Result<(), SummersetError> {
+        self.transport_hub.bcast_msg(
+            PeerMsg::Heartbeat {
+                ballot: self.bal_prep_sent,
+            },
+            None,
+        )?;
+        self.heard_heartbeat(self.id, self.bal_prep_sent)?;
+
+        // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent);
+        Ok(())
+    }
+
+    /// Chooses a random hb_hear_timeout from the min-max range and kicks off
+    /// the hb_hear_timer.
+    fn kickoff_hb_hear_timer(&mut self) -> Result<(), SummersetError> {
+        let timeout_ms = thread_rng().gen_range(
+            self.config.hb_hear_timeout_min..=self.config.hb_hear_timeout_max,
+        );
+
+        // pf_trace!(self.id; "kickoff hb_hear_timer @ {} ms", timeout_ms);
+        self.hb_hear_timer
+            .kickoff(Duration::from_millis(timeout_ms))?;
+        Ok(())
+    }
+
+    /// Heard a heartbeat from some other replica. If the heartbeat carries a
+    /// high enough ballot number, refreshes my hearing timer and clears my
+    /// leader status if I currently think I'm a leader.
+    fn heard_heartbeat(
+        &mut self,
+        _peer: ReplicaId,
+        ballot: Ballot,
+    ) -> Result<(), SummersetError> {
+        // ignore outdated heartbeat
+        if ballot < self.bal_max_seen {
+            return Ok(());
+        }
+
+        // reset hearing timer
+        self.kickoff_hb_hear_timer()?;
+
+        // clear my leader status if it carries a higher ballot number
+        if self.is_leader && ballot > self.bal_max_seen {
+            self.is_leader = false;
+        }
+
+        // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot);
+        Ok(())
+    }
+
     /// Handler of ResetState control message.
async fn handle_ctrl_reset_state( &mut self, @@ -1087,6 +1219,7 @@ impl RSPaxosReplica { )?, leader_bk: None, replica_bk: None, + external: false, }); } // update instance state @@ -1119,6 +1252,7 @@ impl RSPaxosReplica { )?, leader_bk: None, replica_bk: None, + external: false, }); } // update instance state @@ -1246,7 +1380,9 @@ impl GenericReplica for RSPaxosReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigRSPaxos; batch_interval_us, max_batch_size, - backer_path, logger_sync, fault_tolerance, + backer_path, logger_sync, + hb_hear_timeout_min, hb_hear_timeout_max, + hb_send_interval_ms, fault_tolerance, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; if config.batch_interval_us == 0 { @@ -1256,6 +1392,27 @@ impl GenericReplica for RSPaxosReplica { config.batch_interval_us ); } + if config.hb_hear_timeout_min < 100 { + return logged_err!( + id; + "invalid config.hb_hear_timeout_min '{}'", + config.hb_hear_timeout_min + ); + } + if config.hb_hear_timeout_max < config.hb_hear_timeout_min + 100 { + return logged_err!( + id; + "invalid config.hb_hear_timeout_max '{}'", + config.hb_hear_timeout_max + ); + } + if config.hb_send_interval_ms == 0 { + return logged_err!( + id; + "invalid config.hb_send_interval_ms '{}'", + config.hb_send_interval_ms + ); + } // setup state machine module let state_machine = StateMachine::new_and_setup(id).await?; @@ -1330,6 +1487,10 @@ impl GenericReplica for RSPaxosReplica { ) .await?; + let mut hb_send_interval = + time::interval(Duration::from_millis(config.hb_send_interval_ms)); + hb_send_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + Ok(RSPaxosReplica { id, population, @@ -1342,6 +1503,8 @@ impl GenericReplica for RSPaxosReplica { state_machine, storage_hub, transport_hub, + hb_hear_timer: Timer::new(), + hb_send_interval, is_leader: false, insts: vec![], bal_prep_sent: 0, @@ -1361,10 +1524,8 @@ impl GenericReplica for RSPaxosReplica { // recover state from durable storage log self.recover_from_log().await?; - // TODO: proper leader election - if self.id == 0 { - self.is_leader = true; - } + // kick off leader activity hearing timer + self.kickoff_hb_hear_timer()?; // main event loop loop { @@ -1418,6 +1579,16 @@ impl GenericReplica for RSPaxosReplica { } }, + // leader inactivity timeout + _ = self.hb_hear_timer.timeout() => { + self.become_a_leader()?; + }, + + // leader sending heartbeat + _ = self.hb_send_interval.tick(), if self.is_leader => { + self.bcast_heartbeats()?; + } + // manager control message ctrl_msg = self.control_hub.recv_ctrl() => { if let Err(e) = ctrl_msg { @@ -1477,17 +1648,17 @@ pub struct RSPaxosClient { /// Configuration parameters struct. _config: ClientConfigRSPaxos, - /// Cached list of active servers information. + /// List of active servers information. servers: HashMap, - /// Current server ID to connect to. + /// Current server ID to talk to. server_id: ReplicaId, /// Control API stub to the cluster manager. ctrl_stub: ClientCtrlStub, /// API stubs for communicating with servers. 
- api_stub: Option, + api_stubs: HashMap, } #[async_trait] @@ -1497,6 +1668,7 @@ impl GenericEndpoint for RSPaxosClient { config_str: Option<&str>, ) -> Result { // connect to the cluster manager and get assigned a client ID + pf_info!("c"; "connecting to manager '{}'...", manager); let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; let id = ctrl_stub.id; @@ -1511,13 +1683,13 @@ impl GenericEndpoint for RSPaxosClient { servers: HashMap::new(), server_id: init_server_id, ctrl_stub, - api_stub: None, + api_stubs: HashMap::new(), }) } async fn connect(&mut self) -> Result<(), SummersetError> { // disallow reconnection without leaving - if self.api_stub.is_some() { + if !self.api_stubs.is_empty() { return logged_err!(self.id; "reconnecting without leaving"); } @@ -1531,13 +1703,13 @@ impl GenericEndpoint for RSPaxosClient { let reply = self.ctrl_stub.recv_reply().await?; match reply { CtrlReply::QueryInfo { servers } => { - // connect to the one with server ID in config - let api_stub = ClientApiStub::new_by_connect( - self.id, - servers[&self.server_id], - ) - .await?; - self.api_stub = Some(api_stub); + // establish connection to all servers + for (&id, &server) in &servers { + pf_info!(self.id; "connecting to server {} '{}'...", id, server); + let api_stub = + ClientApiStub::new_by_connect(self.id, server).await?; + self.api_stubs.insert(id, api_stub); + } self.servers = servers; Ok(()) } @@ -1547,22 +1719,15 @@ impl GenericEndpoint for RSPaxosClient { async fn leave(&mut self, permanent: bool) -> Result<(), SummersetError> { // send leave notification to current connected server - if let Some(mut api_stub) = self.api_stub.take() { + for (id, mut api_stub) in self.api_stubs.drain() { let mut sent = api_stub.send_req(Some(&ApiRequest::Leave))?; while !sent { sent = api_stub.send_req(None)?; } - let reply = api_stub.recv_reply().await?; - match reply { - ApiReply::Leave => { - pf_info!(self.id; "left current server connection"); - api_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } - } + while api_stub.recv_reply().await? != ApiReply::Leave {} + pf_info!(self.id; "left server connection {}", id); + api_stub.forget(); } // if permanently leaving, send leave notification to the manager @@ -1573,15 +1738,8 @@ impl GenericEndpoint for RSPaxosClient { sent = self.ctrl_stub.send_req(None)?; } - let reply = self.ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } - } + while self.ctrl_stub.recv_reply().await? != CtrlReply::Leave {} + pf_info!(self.id; "left current manager connection"); } Ok(()) @@ -1591,38 +1749,44 @@ impl GenericEndpoint for RSPaxosClient { &mut self, req: Option<&ApiRequest>, ) -> Result { - match self.api_stub { - Some(ref mut api_stub) => api_stub.send_req(req), - None => logged_err!(self.id; "client is not set up"), + if self.api_stubs.contains_key(&self.server_id) { + self.api_stubs + .get_mut(&self.server_id) + .unwrap() + .send_req(req) + } else { + Err(SummersetError("client not set up".into())) } } async fn recv_reply(&mut self) -> Result { - match self.api_stub { - Some(ref mut api_stub) => { - let reply = api_stub.recv_reply().await?; - - if let ApiReply::Reply { - ref result, - ref redirect, - .. 
- } = reply - { - // if the current server redirects me to a different server - if result.is_none() && redirect.is_some() { - let redirect_id = redirect.unwrap(); - assert!(self.servers.contains_key(&redirect_id)); - self.leave(false).await?; - self.server_id = redirect_id; - self.connect().await?; - pf_debug!(self.id; "redirected to replica {} '{}'", - redirect_id, self.servers[&redirect_id]); - } - } + if self.api_stubs.contains_key(&self.server_id) { + let reply = self + .api_stubs + .get_mut(&self.server_id) + .unwrap() + .recv_reply() + .await?; - Ok(reply) + if let ApiReply::Reply { + ref result, + ref redirect, + .. + } = reply + { + // if the current server redirects me to a different server + if result.is_none() && redirect.is_some() { + let redirect_id = redirect.unwrap(); + assert!(self.servers.contains_key(&redirect_id)); + self.server_id = redirect_id; + pf_debug!(self.id; "redirected to replica {} '{}'", + redirect_id, self.servers[&redirect_id]); + } } - None => logged_err!(self.id; "client is not set up"), + + Ok(reply) + } else { + Err(SummersetError("client not set up".into())) } } diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index 7260bc27..593cbd27 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -379,14 +379,16 @@ impl SimplePushReplica { let (client, req) = &inst.reqs[cmd_idx]; match req { ApiRequest::Req { id: req_id, .. } => { - self.external_api.send_reply( - ApiReply::Reply { - id: *req_id, - result: Some(cmd_result), - redirect: None, - }, - *client, - )?; + if self.external_api.has_client(*client) { + self.external_api.send_reply( + ApiReply::Reply { + id: *req_id, + result: Some(cmd_result), + redirect: None, + }, + *client, + )?; + } } _ => { return logged_err!(self.id; "unknown request type at {}|{}", inst_idx, cmd_idx) @@ -762,7 +764,7 @@ pub struct SimplePushClient { /// Control API stub to the cluster manager. ctrl_stub: ClientCtrlStub, - /// API stubs for communicating with servers. + /// API stub for communicating with the current server. api_stub: Option, } @@ -773,6 +775,7 @@ impl GenericEndpoint for SimplePushClient { config_str: Option<&str>, ) -> Result { // connect to the cluster manager and get assigned a client ID + pf_info!("c"; "connecting to manager '{}'...", manager); let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; let id = ctrl_stub.id; @@ -805,6 +808,8 @@ impl GenericEndpoint for SimplePushClient { match reply { CtrlReply::QueryInfo { servers } => { // connect to the one with server ID in config + pf_info!(self.id; "connecting to server {} '{}'...", + self.config.server_id, servers[&self.config.server_id]); let api_stub = ClientApiStub::new_by_connect( self.id, servers[&self.config.server_id], @@ -825,16 +830,9 @@ impl GenericEndpoint for SimplePushClient { sent = api_stub.send_req(None)?; } - let reply = api_stub.recv_reply().await?; - match reply { - ApiReply::Leave => { - pf_info!(self.id; "left current server connection"); - api_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } - } + while api_stub.recv_reply().await? 
!= ApiReply::Leave {} + pf_info!(self.id; "left current server connection"); + api_stub.forget(); } // if permanently leaving, send leave notification to the manager @@ -845,15 +843,8 @@ impl GenericEndpoint for SimplePushClient { sent = self.ctrl_stub.send_req(None)?; } - let reply = self.ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } - } + while self.ctrl_stub.recv_reply().await? != CtrlReply::Leave {} + pf_info!(self.id; "left manager connection"); } Ok(()) @@ -865,14 +856,14 @@ impl GenericEndpoint for SimplePushClient { ) -> Result { match self.api_stub { Some(ref mut api_stub) => api_stub.send_req(req), - None => logged_err!(self.id; "client is not set up"), + None => Err(SummersetError("client not set up".into())), } } async fn recv_reply(&mut self) -> Result { match self.api_stub { Some(ref mut api_stub) => api_stub.recv_reply().await, - None => logged_err!(self.id; "client is not set up"), + None => Err(SummersetError("client not set up".into())), } } diff --git a/src/server/external.rs b/src/server/external.rs index 769c1cd7..cc820c00 100644 --- a/src/server/external.rs +++ b/src/server/external.rs @@ -378,7 +378,7 @@ impl ExternalApi { mut rx_reply: mpsc::UnboundedReceiver, tx_exit: mpsc::UnboundedSender, ) { - pf_debug!(me; "client_servant thread for {} ({}) spawned", id, addr); + pf_debug!(me; "client_servant thread for {} '{}' spawned", id, addr); let (mut conn_read, conn_write) = conn.into_split(); let mut req_buf = BytesMut::with_capacity(8 + 1024); @@ -477,7 +477,7 @@ impl ExternalApi { if let Err(e) = tx_exit.send(id) { pf_error!(me; "error sending exit signal for {}: {}", id, e); } - pf_debug!(me; "client_servant thread for {} ({}) exitted", id, addr); + pf_debug!(me; "client_servant thread for {} '{}' exitted", id, addr); } } diff --git a/src/server/transport.rs b/src/server/transport.rs index a6a30ec8..18caa475 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -554,7 +554,7 @@ where tx_recv: mpsc::UnboundedSender<(ReplicaId, PeerMessage)>, tx_exit: mpsc::UnboundedSender, ) { - pf_debug!(me; "peer_messenger thread for {} ({}) spawned", id, addr); + pf_debug!(me; "peer_messenger thread for {} '{}' spawned", id, addr); let (mut conn_read, conn_write) = conn.into_split(); let mut read_buf = BytesMut::with_capacity(8 + 1024); @@ -680,7 +680,7 @@ where if let Err(e) = tx_exit.send(id) { pf_error!(me; "error sending exit signal for {}: {}", id, e); } - pf_debug!(me; "peer_messenger thread for {} ({}) exitted", id, addr); + pf_debug!(me; "peer_messenger thread for {} '{}' exitted", id, addr); } } diff --git a/summerset_client/src/drivers/open_loop.rs b/summerset_client/src/drivers/open_loop.rs index 382091f1..8e49c107 100644 --- a/summerset_client/src/drivers/open_loop.rs +++ b/summerset_client/src/drivers/open_loop.rs @@ -13,8 +13,7 @@ use tokio::time::{Duration, Instant}; use summerset::{ GenericEndpoint, ClientId, Command, ApiRequest, ApiReply, RequestId, - ClientCtrlStub, Timer, SummersetError, pf_trace, pf_debug, pf_error, - logged_err, + ClientCtrlStub, Timer, SummersetError, pf_debug, pf_error, logged_err, }; /// Open-loop driver struct. @@ -61,24 +60,11 @@ impl DriverOpenLoop { self.endpoint.connect().await } - /// Waits for all pending replies to be received, then sends leave - /// notification and forgets about the current TCP connections. The leave - /// action is left synchronous. 
+ /// Sends leave notification and forgets about the current TCP connections. pub async fn leave( &mut self, permanent: bool, ) -> Result<(), SummersetError> { - // loop until all pending replies have been received - while self.should_retry { - pf_trace!(self.id; "retrying last issue at leave"); - self.issue_retry()?; - } - while !self.pending_reqs.is_empty() { - pf_trace!(self.id; "pending {} requests at leave", - self.pending_reqs.len()); - self.wait_reply().await?; - } - self.endpoint.leave(permanent).await } From 8beefa7228538d91ffde966544d5578e02822f04 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 14 Sep 2023 15:55:28 -0500 Subject: [PATCH 45/89] add leader info to manager for testing --- src/manager/clusman.rs | 44 +++++++++++++-- src/manager/reactor.rs | 40 ++++++------- src/manager/reigner.rs | 3 + src/protocols/crossword.rs | 11 +++- src/protocols/multipaxos.rs | 11 +++- src/protocols/rep_nothing.rs | 4 +- src/protocols/rs_paxos.rs | 11 +++- src/protocols/simple_push.rs | 4 +- summerset_client/src/clients/tester.rs | 77 ++++++++++++++++++-------- 9 files changed, 148 insertions(+), 57 deletions(-) diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index 2a523972..62b39a60 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -14,7 +14,6 @@ use crate::protocols::SmrProtocol; use tokio::sync::{mpsc, watch}; /// Information about an active server. -// TODO: maybe add things like leader info, etc. #[derive(Debug, Clone)] struct ServerInfo { /// The server's client-facing API address. @@ -22,6 +21,9 @@ struct ServerInfo { /// The server's internal peer-peer API address. p2p_addr: SocketAddr, + + /// This server is a leader (leader could be non-unique). + is_leader: bool, } /// Standalone cluster manager oracle. @@ -192,11 +194,39 @@ impl ClusterManager { )?; // save new server's info - self.server_info - .insert(server, ServerInfo { api_addr, p2p_addr }); + self.server_info.insert( + server, + ServerInfo { + api_addr, + p2p_addr, + is_leader: false, + }, + ); Ok(()) } + /// Handler of LeaderStatus message. + fn handle_leader_status( + &mut self, + server: ReplicaId, + step_up: bool, + ) -> Result<(), SummersetError> { + if !self.server_info.contains_key(&server) { + return logged_err!("m"; "leader status got unknown ID: {}", server); + } + + // update this server's info + let info = self.server_info.get_mut(&server).unwrap(); + if step_up && info.is_leader { + logged_err!("m"; "server {} is already marked as leader", server) + } else if !step_up && !info.is_leader { + logged_err!("m"; "server {} is already marked as non-leader", server) + } else { + info.is_leader = step_up; + Ok(()) + } + } + /// Synthesized handler of server-initiated control messages. 
async fn handle_ctrl_msg( &mut self, @@ -220,6 +250,10 @@ impl ClusterManager { )?; } + CtrlMsg::LeaderStatus { step_up } => { + self.handle_leader_status(server, step_up)?; + } + _ => {} // ignore all other types } @@ -235,10 +269,10 @@ impl ClusterManager { client: ClientId, ) -> Result<(), SummersetError> { // gather public addresses of all active servers - let servers: HashMap = self + let servers: HashMap = self .server_info .iter() - .map(|(&server, info)| (server, info.api_addr)) + .map(|(&server, info)| (server, (info.api_addr, info.is_leader))) .collect(); self.client_reactor .send_reply(CtrlReply::QueryInfo { servers }, client) diff --git a/src/manager/reactor.rs b/src/manager/reactor.rs index 1942f591..4a2f5bd6 100644 --- a/src/manager/reactor.rs +++ b/src/manager/reactor.rs @@ -20,7 +20,6 @@ use tokio::sync::mpsc; use tokio::task::JoinHandle; /// Control event request from client. -// TODO: maybe add things like leader info, etc. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] pub enum CtrlRequest { /// Query the set of active servers and their info. @@ -43,7 +42,8 @@ pub enum CtrlRequest { pub enum CtrlReply { /// Reply to server info query. QueryInfo { - servers: HashMap, + /// Map from replica ID -> (addr, is_leader). + servers: HashMap, }, /// Reply to server reset request. @@ -448,9 +448,9 @@ mod reactor_tests { // send reply to client reactor.send_reply( CtrlReply::QueryInfo { - servers: HashMap::::from([ - (0, "127.0.0.1:53700".parse()?), - (1, "127.0.0.1:53701".parse()?), + servers: HashMap::::from([ + (0, ("127.0.0.1:53700".parse()?, true)), + (1, ("127.0.0.1:53701".parse()?, false)), ]), }, client, @@ -467,9 +467,9 @@ mod reactor_tests { assert_eq!( ctrl_stub.recv_reply().await?, CtrlReply::QueryInfo { - servers: HashMap::::from([ - (0, "127.0.0.1:53700".parse()?), - (1, "127.0.0.1:53701".parse()?), + servers: HashMap::::from([ + (0, ("127.0.0.1:53700".parse()?, true)), + (1, ("127.0.0.1:53701".parse()?, false)), ]), } ); @@ -492,9 +492,9 @@ mod reactor_tests { assert_eq!( ctrl_stub.recv_reply().await?, CtrlReply::QueryInfo { - servers: HashMap::::from([ - (0, "127.0.0.1:54700".parse()?), - (1, "127.0.0.1:54701".parse()?), + servers: HashMap::::from([ + (0, ("127.0.0.1:54700".parse()?, true)), + (1, ("127.0.0.1:54701".parse()?, false)), ]), } ); @@ -512,9 +512,9 @@ mod reactor_tests { assert_eq!( ctrl_stub.recv_reply().await?, CtrlReply::QueryInfo { - servers: HashMap::::from([ - (0, "127.0.0.1:54710".parse()?), - (1, "127.0.0.1:54711".parse()?), + servers: HashMap::::from([ + (0, ("127.0.0.1:54710".parse()?, true)), + (1, ("127.0.0.1:54711".parse()?, false)), ]), } ); @@ -531,9 +531,9 @@ mod reactor_tests { // send reply to client reactor.send_reply( CtrlReply::QueryInfo { - servers: HashMap::::from([ - (0, "127.0.0.1:54700".parse()?), - (1, "127.0.0.1:54701".parse()?), + servers: HashMap::::from([ + (0, ("127.0.0.1:54700".parse()?, true)), + (1, ("127.0.0.1:54701".parse()?, false)), ]), }, client, @@ -546,9 +546,9 @@ mod reactor_tests { // send reply to new client reactor.send_reply( CtrlReply::QueryInfo { - servers: HashMap::::from([ - (0, "127.0.0.1:54710".parse()?), - (1, "127.0.0.1:54711".parse()?), + servers: HashMap::::from([ + (0, ("127.0.0.1:54710".parse()?, true)), + (1, ("127.0.0.1:54711".parse()?, false)), ]), }, client2, diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index b28b9262..a5a04450 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -39,6 +39,9 @@ pub enum CtrlMsg { to_peers: HashMap, }, + /// Server 
-> Manager: tell the manager that I steped-up/down as leader. + LeaderStatus { step_up: bool }, + /// Manager -> Server: reset to initial state. If durable is false, cleans /// durable storage state as well. ResetState { durable: bool }, diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 1140a69c..9d4c3100 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -1145,6 +1145,8 @@ impl CrosswordReplica { fn become_a_leader(&mut self) -> Result<(), SummersetError> { assert!(!self.is_leader); self.is_leader = true; // this starts broadcasting heartbeats + self.control_hub + .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; pf_warn!(self.id; "becoming a leader..."); // broadcast a heartbeat right now @@ -1239,6 +1241,8 @@ impl CrosswordReplica { // clear my leader status if it carries a higher ballot number if self.is_leader && ballot > self.bal_max_seen { self.is_leader = false; + self.control_hub + .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; } // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); @@ -1811,13 +1815,16 @@ impl GenericEndpoint for CrosswordClient { match reply { CtrlReply::QueryInfo { servers } => { // establish connection to all servers - for (&id, &server) in &servers { + self.servers = servers + .into_iter() + .map(|(id, info)| (id, info.0)) + .collect(); + for (&id, &server) in &self.servers { pf_info!(self.id; "connecting to server {} '{}'...", id, server); let api_stub = ClientApiStub::new_by_connect(self.id, server).await?; self.api_stubs.insert(id, api_stub); } - self.servers = servers; Ok(()) } _ => logged_err!(self.id; "unexpected reply type received"), diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 130a2f45..520ff556 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -960,6 +960,8 @@ impl MultiPaxosReplica { fn become_a_leader(&mut self) -> Result<(), SummersetError> { assert!(!self.is_leader); self.is_leader = true; // this starts broadcasting heartbeats + self.control_hub + .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; pf_warn!(self.id; "becoming a leader..."); // broadcast a heartbeat right now @@ -1054,6 +1056,8 @@ impl MultiPaxosReplica { // clear my leader status if it carries a higher ballot number if self.is_leader && ballot > self.bal_max_seen { self.is_leader = false; + self.control_hub + .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; } // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); @@ -1585,13 +1589,16 @@ impl GenericEndpoint for MultiPaxosClient { match reply { CtrlReply::QueryInfo { servers } => { // establish connection to all servers - for (&id, &server) in &servers { + self.servers = servers + .into_iter() + .map(|(id, info)| (id, info.0)) + .collect(); + for (&id, &server) in &self.servers { pf_info!(self.id; "connecting to server {} '{}'...", id, server); let api_stub = ClientApiStub::new_by_connect(self.id, server).await?; self.api_stubs.insert(id, api_stub); } - self.servers = servers; Ok(()) } _ => logged_err!(self.id; "unexpected reply type received"), diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index 643cdf7a..ba73a21a 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -582,10 +582,10 @@ impl GenericEndpoint for RepNothingClient { CtrlReply::QueryInfo { servers } => { // connect to the one with server ID in config pf_info!(self.id; "connecting to server {} '{}'...", - self.config.server_id, 
servers[&self.config.server_id]); + self.config.server_id, servers[&self.config.server_id].0); let api_stub = ClientApiStub::new_by_connect( self.id, - servers[&self.config.server_id], + servers[&self.config.server_id].0, ) .await?; self.api_stub = Some(api_stub); diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 172c9771..ce987f33 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -1045,6 +1045,8 @@ impl RSPaxosReplica { fn become_a_leader(&mut self) -> Result<(), SummersetError> { assert!(!self.is_leader); self.is_leader = true; // this starts broadcasting heartbeats + self.control_hub + .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; pf_warn!(self.id; "becoming a leader..."); // broadcast a heartbeat right now @@ -1139,6 +1141,8 @@ impl RSPaxosReplica { // clear my leader status if it carries a higher ballot number if self.is_leader && ballot > self.bal_max_seen { self.is_leader = false; + self.control_hub + .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; } // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); @@ -1704,13 +1708,16 @@ impl GenericEndpoint for RSPaxosClient { match reply { CtrlReply::QueryInfo { servers } => { // establish connection to all servers - for (&id, &server) in &servers { + self.servers = servers + .into_iter() + .map(|(id, info)| (id, info.0)) + .collect(); + for (&id, &server) in &self.servers { pf_info!(self.id; "connecting to server {} '{}'...", id, server); let api_stub = ClientApiStub::new_by_connect(self.id, server).await?; self.api_stubs.insert(id, api_stub); } - self.servers = servers; Ok(()) } _ => logged_err!(self.id; "unexpected reply type received"), diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index 593cbd27..56b28414 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -809,10 +809,10 @@ impl GenericEndpoint for SimplePushClient { CtrlReply::QueryInfo { servers } => { // connect to the one with server ID in config pf_info!(self.id; "connecting to server {} '{}'...", - self.config.server_id, servers[&self.config.server_id]); + self.config.server_id, servers[&self.config.server_id].0); let api_stub = ClientApiStub::new_by_connect( self.id, - servers[&self.config.server_id], + servers[&self.config.server_id].0, ) .await?; self.api_stub = Some(api_stub); diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index 607dbfb0..a3c1ab64 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -1,6 +1,6 @@ //! Correctness testing client using closed-loop driver. -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use crate::drivers::{DriverReply, DriverClosedLoop}; @@ -27,7 +27,8 @@ lazy_static! { static ref ALL_TESTS: Vec<(&'static str, bool)> = vec![ ("primitive_ops", true), ("client_reconnect", true), - ("one_node_reset", true), + ("leader_node_reset", true), + ("non_leader_reset", true), ("two_nodes_reset", false) ]; } @@ -213,10 +214,11 @@ impl ClientTester { } } - /// Query the list of servers in the cluster. + /// Query the list of servers in the cluster. Returns a map from replica ID + /// -> is_leader status. 
async fn query_servers( &mut self, - ) -> Result, SummersetError> { + ) -> Result, SummersetError> { let ctrl_stub = self.driver.ctrl_stub(); // send QueryInfo request to manager @@ -230,7 +232,7 @@ impl ClientTester { let reply = ctrl_stub.recv_reply().await?; match reply { CtrlReply::QueryInfo { servers } => { - Ok(servers.keys().copied().collect()) + Ok(servers.into_iter().map(|(id, info)| (id, info.1)).collect()) } _ => logged_err!(self.driver.id; "unexpected control reply type"), } @@ -271,7 +273,8 @@ impl ClientTester { let result = match name { "primitive_ops" => self.test_primitive_ops().await, "client_reconnect" => self.test_client_reconnect().await, - "one_node_reset" => self.test_one_node_reset().await, + "leader_node_reset" => self.test_leader_node_reset().await, + "non_leader_reset" => self.test_non_leader_reset().await, "two_nodes_reset" => self.test_two_nodes_reset().await, _ => { return logged_err!(self.driver.id; "unrecognized test name '{}'", @@ -358,32 +361,62 @@ impl ClientTester { Ok(()) } - /// Single replica node crashes and restarts. - async fn test_one_node_reset(&mut self) -> Result<(), SummersetError> { + /// Single leader replica node crashes and restarts. + async fn test_leader_node_reset(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); self.checked_put("Jose", &v, Some(None)).await?; - for s in self.query_servers().await? { - self.driver.leave(false).await?; - self.reset_servers(HashSet::from([s]), true).await?; - time::sleep(Duration::from_millis(500)).await; - self.driver.connect().await?; - self.checked_get("Jose", Some(Some(&v))).await?; + for (s, is_leader) in self.query_servers().await? { + if is_leader { + self.driver.leave(false).await?; + self.reset_servers(HashSet::from([s]), true).await?; + time::sleep(Duration::from_millis(500)).await; + self.driver.connect().await?; + self.checked_get("Jose", Some(Some(&v))).await?; + break; + } + } + Ok(()) + } + + /// Single leader replica node crashes and restarts. + async fn test_non_leader_reset(&mut self) -> Result<(), SummersetError> { + let v = Self::gen_rand_string(8); + self.checked_put("Jose", &v, Some(None)).await?; + for (s, is_leader) in self.query_servers().await? { + if !is_leader { + self.driver.leave(false).await?; + self.reset_servers(HashSet::from([s]), true).await?; + time::sleep(Duration::from_millis(500)).await; + self.driver.connect().await?; + self.checked_get("Jose", Some(Some(&v))).await?; + break; + } } Ok(()) } - /// Two replica nodes crash and restart. + /// Two replica nodes (leader + non-leader) crash and restart. async fn test_two_nodes_reset(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); self.checked_put("Jose", &v, Some(None)).await?; - let servers = self.query_servers().await?; - for &s in &servers { + let mut resets = HashSet::new(); + let (mut l, mut nl) = (false, false); + for (s, is_leader) in self.query_servers().await? 
{ + if !l && is_leader { + resets.insert(s); + l = true; + } + if !nl && !is_leader { + resets.insert(s); + nl = true; + } + if l && nl { + break; + } + } + if resets.len() == 2 { self.driver.leave(false).await?; - self.reset_servers( - HashSet::from([s, (s + 1) % (servers.len() as u8)]), - true, - ) - .await?; + self.reset_servers(resets, true).await?; time::sleep(Duration::from_millis(500)).await; self.driver.connect().await?; self.checked_get("Jose", Some(Some(&v))).await?; From 3a3e00d88e0b94e4299265fa6b144439dfeff6b0 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 14 Sep 2023 21:24:01 -0500 Subject: [PATCH 46/89] add pause & resume control requests --- src/manager/clusman.rs | 94 ++++++++++++++++++++- src/manager/reactor.rs | 18 ++++ src/manager/reigner.rs | 14 +++- src/protocols/crossword.rs | 56 +++++++++++-- src/protocols/multipaxos.rs | 56 +++++++++++-- src/protocols/rep_nothing.rs | 43 ++++++++-- src/protocols/rs_paxos.rs | 56 +++++++++++-- src/protocols/simple_push.rs | 45 ++++++++-- summerset_client/src/clients/tester.rs | 110 ++++++++++++++++++++++--- 9 files changed, 442 insertions(+), 50 deletions(-) diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index 62b39a60..0eeb1f1f 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -22,8 +22,11 @@ struct ServerInfo { /// The server's internal peer-peer API address. p2p_addr: SocketAddr, - /// This server is a leader (leader could be non-unique). + /// This server is a leader? (leader could be non-unique) is_leader: bool, + + /// This server is currently paused? + is_paused: bool, } /// Standalone cluster manager oracle. @@ -200,6 +203,7 @@ impl ClusterManager { api_addr, p2p_addr, is_leader: false, + is_paused: false, }, ); Ok(()) @@ -332,6 +336,86 @@ impl ClusterManager { ) } + /// Handler of client PauseServers request. + async fn handle_client_pause_servers( + &mut self, + client: ClientId, + servers: HashSet, + ) -> Result<(), SummersetError> { + let mut servers: Vec = if servers.is_empty() { + // all active servers + self.server_info.keys().copied().collect() + } else { + servers.into_iter().collect() + }; + + // pause specified server(s) + let mut pause_done = HashSet::new(); + while let Some(s) = servers.pop() { + // send puase server control message to server + self.server_reigner.send_ctrl(CtrlMsg::Pause, s)?; + + // set the is_paused flag + assert!(self.server_info.contains_key(&s)); + self.server_info.get_mut(&s).unwrap().is_paused = true; + + // wait for dummy reply + let (_, reply) = self.server_reigner.recv_ctrl().await?; + if reply != CtrlMsg::PauseReply { + return logged_err!("m"; "unexpected reply type received"); + } + + pause_done.insert(s); + } + + self.client_reactor.send_reply( + CtrlReply::PauseServers { + servers: pause_done, + }, + client, + ) + } + + /// Handler of client ResumeServers request. 
+ async fn handle_client_resume_servers( + &mut self, + client: ClientId, + servers: HashSet, + ) -> Result<(), SummersetError> { + let mut servers: Vec = if servers.is_empty() { + // all active servers + self.server_info.keys().copied().collect() + } else { + servers.into_iter().collect() + }; + + // resume specified server(s) + let mut resume_done = HashSet::new(); + while let Some(s) = servers.pop() { + // send puase server control message to server + self.server_reigner.send_ctrl(CtrlMsg::Resume, s)?; + + // clear the is_paused flag + assert!(self.server_info.contains_key(&s)); + self.server_info.get_mut(&s).unwrap().is_paused = false; + + // wait for dummy reply + let (_, reply) = self.server_reigner.recv_ctrl().await?; + if reply != CtrlMsg::ResumeReply { + return logged_err!("m"; "unexpected reply type received"); + } + + resume_done.insert(s); + } + + self.client_reactor.send_reply( + CtrlReply::ResumeServers { + servers: resume_done, + }, + client, + ) + } + /// Synthesized handler of client-initiated control requests. async fn handle_ctrl_req( &mut self, @@ -349,6 +433,14 @@ impl ClusterManager { .await?; } + CtrlRequest::PauseServers { servers } => { + self.handle_client_pause_servers(client, servers).await?; + } + + CtrlRequest::ResumeServers { servers } => { + self.handle_client_resume_servers(client, servers).await?; + } + _ => {} // ignore all other types } diff --git a/src/manager/reactor.rs b/src/manager/reactor.rs index 4a2f5bd6..4df04e36 100644 --- a/src/manager/reactor.rs +++ b/src/manager/reactor.rs @@ -33,6 +33,18 @@ pub enum CtrlRequest { durable: bool, }, + /// Pause the specified server(s)' event loop execution. + PauseServers { + /// IDs of servers to pause. If empty, pauses all active servers. + servers: HashSet, + }, + + /// Resume the specified server(s)' event loop execution. + ResumeServers { + /// IDs of servers to resume. If empty, resumes all active servers. + servers: HashSet, + }, + /// Client leave notification. Leave, } @@ -49,6 +61,12 @@ pub enum CtrlReply { /// Reply to server reset request. ResetServers { servers: HashSet }, + /// Reply to server pause request. + PauseServers { servers: HashSet }, + + /// Reply to server resume request. + ResumeServers { servers: HashSet }, + /// Reply to client leave notification. Leave, } diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index a5a04450..c3551e30 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -21,7 +21,7 @@ use tokio::task::JoinHandle; /// Control message from/to servers. Control traffic could be bidirectional: /// some initiated by the manager and some by servers. -// TODO: add pause, resume, leader change, membership change, etc. +// TODO: later add leader change, membership change, etc. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] pub enum CtrlMsg { /// Server -> Manager: new server up, requesting a list of peers' addresses @@ -46,6 +46,18 @@ pub enum CtrlMsg { /// durable storage state as well. ResetState { durable: bool }, + /// Manager -> Server: pause server event loop execution. + Pause, + + /// Server -> Manager: dummy pause reply. + PauseReply, + + /// Manager -> Server: resume server event loop execution. + Resume, + + /// Server -> Manager: dummy resume reply. + ResumeReply, + /// Server -> Manager: leave notification. 
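// Taken together, the request/reply pairs above give external tooling a full
// pause round trip: a client sends CtrlRequest::PauseServers to the manager,
// the manager exchanges CtrlMsg::Pause / CtrlMsg::PauseReply with each target
// server, and the client receives CtrlReply::PauseServers back. A sketch of
// the client side, following the same send_req()/recv_reply() retry pattern
// the tester uses further below (the helper and its error text are
// illustrative, not part of the patch):
async fn pause_one_server(
    ctrl_stub: &mut ClientCtrlStub,
    server: ReplicaId,
) -> Result<(), SummersetError> {
    let req = CtrlRequest::PauseServers {
        servers: std::collections::HashSet::from([server]),
    };
    // send_req(Some(..)) may report the request as not yet fully sent; keep
    // nudging it with send_req(None) until it goes out
    let mut sent = ctrl_stub.send_req(Some(&req))?;
    while !sent {
        sent = ctrl_stub.send_req(None)?;
    }
    match ctrl_stub.recv_reply().await? {
        CtrlReply::PauseServers { servers } if servers.contains(&server) => Ok(()),
        _ => Err(SummersetError("unexpected reply to PauseServers".into())),
    }
}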
Leave, diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 9d4c3100..cdfe2517 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -1147,7 +1147,7 @@ impl CrosswordReplica { self.is_leader = true; // this starts broadcasting heartbeats self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; - pf_warn!(self.id; "becoming a leader..."); + pf_info!(self.id; "becoming a leader..."); // broadcast a heartbeat right now self.bcast_heartbeats()?; @@ -1243,6 +1243,7 @@ impl CrosswordReplica { self.is_leader = false; self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; + pf_info!(self.id; "no longer a leader..."); } // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); @@ -1287,20 +1288,56 @@ impl CrosswordReplica { Ok(()) } + /// Handler of Pause control message. + fn handle_ctrl_pause( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got pause req"); + *paused = true; + self.control_hub.send_ctrl(CtrlMsg::PauseReply)?; + Ok(()) + } + + /// Handler of Resume control message. + fn handle_ctrl_resume( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got resume req"); + *paused = false; + self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; + + // reset leader heartbeat timer + self.kickoff_hb_hear_timer()?; + + Ok(()) + } + /// Synthesized handler of manager control messages. If ok, returns /// `Some(true)` if decides to terminate and reboot, `Some(false)` if /// decides to shutdown completely, and `None` if not terminating. async fn handle_ctrl_msg( &mut self, msg: CtrlMsg, + paused: &mut bool, ) -> Result, SummersetError> { - // TODO: fill this when more control message types added match msg { CtrlMsg::ResetState { durable } => { self.handle_ctrl_reset_state(durable).await?; Ok(Some(true)) } + CtrlMsg::Pause => { + self.handle_ctrl_pause(paused)?; + Ok(None) + } + + CtrlMsg::Resume => { + self.handle_ctrl_resume(paused)?; + Ok(None) + } + _ => Ok(None), // ignore all other types } } @@ -1639,10 +1676,11 @@ impl GenericReplica for CrosswordReplica { self.kickoff_hb_hear_timer()?; // main event loop + let mut paused = false; loop { tokio::select! 
{ // client request batch - req_batch = self.external_api.get_req_batch() => { + req_batch = self.external_api.get_req_batch(), if !paused => { if let Err(e) = req_batch { pf_error!(self.id; "error getting req batch: {}", e); continue; @@ -1654,7 +1692,7 @@ impl GenericReplica for CrosswordReplica { }, // durable logging result - log_result = self.storage_hub.get_result() => { + log_result = self.storage_hub.get_result(), if !paused => { if let Err(e) = log_result { pf_error!(self.id; "error getting log result: {}", e); continue; @@ -1667,7 +1705,7 @@ impl GenericReplica for CrosswordReplica { }, // message from peer - msg = self.transport_hub.recv_msg() => { + msg = self.transport_hub.recv_msg(), if !paused => { if let Err(e) = msg { pf_error!(self.id; "error receiving peer msg: {}", e); continue; @@ -1679,7 +1717,7 @@ impl GenericReplica for CrosswordReplica { } // state machine execution result - cmd_result = self.state_machine.get_result() => { + cmd_result = self.state_machine.get_result(), if !paused => { if let Err(e) = cmd_result { pf_error!(self.id; "error getting cmd result: {}", e); continue; @@ -1691,12 +1729,12 @@ impl GenericReplica for CrosswordReplica { }, // leader inactivity timeout - _ = self.hb_hear_timer.timeout() => { + _ = self.hb_hear_timer.timeout(), if !paused => { self.become_a_leader()?; }, // leader sending heartbeat - _ = self.hb_send_interval.tick(), if self.is_leader => { + _ = self.hb_send_interval.tick(), if !paused && self.is_leader => { self.bcast_heartbeats()?; } @@ -1707,7 +1745,7 @@ impl GenericReplica for CrosswordReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - match self.handle_ctrl_msg(ctrl_msg).await { + match self.handle_ctrl_msg(ctrl_msg, &mut paused).await { Ok(terminate) => { if let Some(restart) = terminate { pf_warn!( diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 520ff556..759e7cf3 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -962,7 +962,7 @@ impl MultiPaxosReplica { self.is_leader = true; // this starts broadcasting heartbeats self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; - pf_warn!(self.id; "becoming a leader..."); + pf_info!(self.id; "becoming a leader..."); // broadcast a heartbeat right now self.bcast_heartbeats()?; @@ -1058,6 +1058,7 @@ impl MultiPaxosReplica { self.is_leader = false; self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; + pf_info!(self.id; "no longer a leader..."); } // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); @@ -1102,20 +1103,56 @@ impl MultiPaxosReplica { Ok(()) } + /// Handler of Pause control message. + fn handle_ctrl_pause( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got pause req"); + *paused = true; + self.control_hub.send_ctrl(CtrlMsg::PauseReply)?; + Ok(()) + } + + /// Handler of Resume control message. + fn handle_ctrl_resume( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got resume req"); + *paused = false; + self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; + + // reset leader heartbeat timer + self.kickoff_hb_hear_timer()?; + + Ok(()) + } + /// Synthesized handler of manager control messages. If ok, returns /// `Some(true)` if decides to terminate and reboot, `Some(false)` if /// decides to shutdown completely, and `None` if not terminating. 
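// The event-loop edits above follow one pattern that repeats in every
// protocol below: each data-path select! branch gains an `if !paused`
// precondition, while the branch that receives manager control messages stays
// ungated so a paused replica can still hear Resume. A self-contained sketch
// of that gating; the channel names and the bool "control message" are
// placeholders rather than Summerset types:
async fn paused_gated_loop(
    mut ctrl_rx: tokio::sync::mpsc::UnboundedReceiver<bool>,
    mut data_rx: tokio::sync::mpsc::UnboundedReceiver<u64>,
) {
    let mut paused = false;
    loop {
        tokio::select! {
            // normal work: this branch is disabled entirely while paused
            Some(req) = data_rx.recv(), if !paused => {
                let _ = req; // ... handle request batch / log result / etc.
            },
            // control traffic: never gated, so Resume always gets through
            Some(pause) = ctrl_rx.recv() => {
                paused = pause; // true = Pause, false = Resume
            },
            else => break, // both channels closed
        }
    }
}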
async fn handle_ctrl_msg( &mut self, msg: CtrlMsg, + paused: &mut bool, ) -> Result, SummersetError> { - // TODO: fill this when more control message types added match msg { CtrlMsg::ResetState { durable } => { self.handle_ctrl_reset_state(durable).await?; Ok(Some(true)) } + CtrlMsg::Pause => { + self.handle_ctrl_pause(paused)?; + Ok(None) + } + + CtrlMsg::Resume => { + self.handle_ctrl_resume(paused)?; + Ok(None) + } + _ => Ok(None), // ignore all other types } } @@ -1413,10 +1450,11 @@ impl GenericReplica for MultiPaxosReplica { self.kickoff_hb_hear_timer()?; // main event loop + let mut paused = false; loop { tokio::select! { // client request batch - req_batch = self.external_api.get_req_batch() => { + req_batch = self.external_api.get_req_batch(), if !paused => { if let Err(e) = req_batch { pf_error!(self.id; "error getting req batch: {}", e); continue; @@ -1428,7 +1466,7 @@ impl GenericReplica for MultiPaxosReplica { }, // durable logging result - log_result = self.storage_hub.get_result() => { + log_result = self.storage_hub.get_result(), if !paused => { if let Err(e) = log_result { pf_error!(self.id; "error getting log result: {}", e); continue; @@ -1441,7 +1479,7 @@ impl GenericReplica for MultiPaxosReplica { }, // message from peer - msg = self.transport_hub.recv_msg() => { + msg = self.transport_hub.recv_msg(), if !paused => { if let Err(e) = msg { pf_error!(self.id; "error receiving peer msg: {}", e); continue; @@ -1453,7 +1491,7 @@ impl GenericReplica for MultiPaxosReplica { } // state machine execution result - cmd_result = self.state_machine.get_result() => { + cmd_result = self.state_machine.get_result(), if !paused => { if let Err(e) = cmd_result { pf_error!(self.id; "error getting cmd result: {}", e); continue; @@ -1465,12 +1503,12 @@ impl GenericReplica for MultiPaxosReplica { }, // leader inactivity timeout - _ = self.hb_hear_timer.timeout() => { + _ = self.hb_hear_timer.timeout(), if !paused => { self.become_a_leader()?; }, // leader sending heartbeat - _ = self.hb_send_interval.tick(), if self.is_leader => { + _ = self.hb_send_interval.tick(), if !paused && self.is_leader => { self.bcast_heartbeats()?; } @@ -1481,7 +1519,7 @@ impl GenericReplica for MultiPaxosReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - match self.handle_ctrl_msg(ctrl_msg).await { + match self.handle_ctrl_msg(ctrl_msg, &mut paused).await { Ok(terminate) => { if let Some(restart) = terminate { pf_warn!( diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index ba73a21a..97469f5f 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -272,20 +272,52 @@ impl RepNothingReplica { Ok(()) } + /// Handler of Pause control message. + fn handle_ctrl_pause( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got pause req"); + *paused = true; + self.control_hub.send_ctrl(CtrlMsg::PauseReply)?; + Ok(()) + } + + /// Handler of Resume control message. + fn handle_ctrl_resume( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got resume req"); + *paused = false; + self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; + Ok(()) + } + /// Synthesized handler of manager control messages. If ok, returns /// `Some(true)` if decides to terminate and reboot, `Some(false)` if /// decides to shutdown completely, and `None` if not terminating. 
async fn handle_ctrl_msg( &mut self, msg: CtrlMsg, + paused: &mut bool, ) -> Result, SummersetError> { - // TODO: fill this when more control message types added match msg { CtrlMsg::ResetState { durable } => { self.handle_ctrl_reset_state(durable).await?; Ok(Some(true)) } + CtrlMsg::Pause => { + self.handle_ctrl_pause(paused)?; + Ok(None) + } + + CtrlMsg::Resume => { + self.handle_ctrl_resume(paused)?; + Ok(None) + } + _ => Ok(None), // ignore all other types } } @@ -437,10 +469,11 @@ impl GenericReplica for RepNothingReplica { self.recover_from_log().await?; // main event loop + let mut paused = false; loop { tokio::select! { // client request batch - req_batch = self.external_api.get_req_batch() => { + req_batch = self.external_api.get_req_batch(), if !paused => { if let Err(e) = req_batch { pf_error!(self.id; "error getting req batch: {}", e); continue; @@ -452,7 +485,7 @@ impl GenericReplica for RepNothingReplica { }, // durable logging result - log_result = self.storage_hub.get_result() => { + log_result = self.storage_hub.get_result(), if !paused => { if let Err(e) = log_result { pf_error!(self.id; "error getting log result: {}", e); continue; @@ -464,7 +497,7 @@ impl GenericReplica for RepNothingReplica { }, // state machine execution result - cmd_result = self.state_machine.get_result() => { + cmd_result = self.state_machine.get_result(), if !paused => { if let Err(e) = cmd_result { pf_error!(self.id; "error getting cmd result: {}", e); continue; @@ -482,7 +515,7 @@ impl GenericReplica for RepNothingReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - match self.handle_ctrl_msg(ctrl_msg).await { + match self.handle_ctrl_msg(ctrl_msg, &mut paused).await { Ok(terminate) => { if let Some(restart) = terminate { pf_warn!( diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index ce987f33..d963a7b8 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -1047,7 +1047,7 @@ impl RSPaxosReplica { self.is_leader = true; // this starts broadcasting heartbeats self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; - pf_warn!(self.id; "becoming a leader..."); + pf_info!(self.id; "becoming a leader..."); // broadcast a heartbeat right now self.bcast_heartbeats()?; @@ -1143,6 +1143,7 @@ impl RSPaxosReplica { self.is_leader = false; self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; + pf_info!(self.id; "no longer a leader..."); } // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); @@ -1187,20 +1188,56 @@ impl RSPaxosReplica { Ok(()) } + /// Handler of Pause control message. + fn handle_ctrl_pause( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got pause req"); + *paused = true; + self.control_hub.send_ctrl(CtrlMsg::PauseReply)?; + Ok(()) + } + + /// Handler of Resume control message. + fn handle_ctrl_resume( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got resume req"); + *paused = false; + self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; + + // reset leader heartbeat timer + self.kickoff_hb_hear_timer()?; + + Ok(()) + } + /// Synthesized handler of manager control messages. If ok, returns /// `Some(true)` if decides to terminate and reboot, `Some(false)` if /// decides to shutdown completely, and `None` if not terminating. 
async fn handle_ctrl_msg( &mut self, msg: CtrlMsg, + paused: &mut bool, ) -> Result, SummersetError> { - // TODO: fill this when more control message types added match msg { CtrlMsg::ResetState { durable } => { self.handle_ctrl_reset_state(durable).await?; Ok(Some(true)) } + CtrlMsg::Pause => { + self.handle_ctrl_pause(paused)?; + Ok(None) + } + + CtrlMsg::Resume => { + self.handle_ctrl_resume(paused)?; + Ok(None) + } + _ => Ok(None), // ignore all other types } } @@ -1532,10 +1569,11 @@ impl GenericReplica for RSPaxosReplica { self.kickoff_hb_hear_timer()?; // main event loop + let mut paused = false; loop { tokio::select! { // client request batch - req_batch = self.external_api.get_req_batch() => { + req_batch = self.external_api.get_req_batch(), if !paused => { if let Err(e) = req_batch { pf_error!(self.id; "error getting req batch: {}", e); continue; @@ -1547,7 +1585,7 @@ impl GenericReplica for RSPaxosReplica { }, // durable logging result - log_result = self.storage_hub.get_result() => { + log_result = self.storage_hub.get_result(), if !paused => { if let Err(e) = log_result { pf_error!(self.id; "error getting log result: {}", e); continue; @@ -1560,7 +1598,7 @@ impl GenericReplica for RSPaxosReplica { }, // message from peer - msg = self.transport_hub.recv_msg() => { + msg = self.transport_hub.recv_msg(), if !paused => { if let Err(e) = msg { pf_error!(self.id; "error receiving peer msg: {}", e); continue; @@ -1572,7 +1610,7 @@ impl GenericReplica for RSPaxosReplica { } // state machine execution result - cmd_result = self.state_machine.get_result() => { + cmd_result = self.state_machine.get_result(), if !paused => { if let Err(e) = cmd_result { pf_error!(self.id; "error getting cmd result: {}", e); continue; @@ -1584,12 +1622,12 @@ impl GenericReplica for RSPaxosReplica { }, // leader inactivity timeout - _ = self.hb_hear_timer.timeout() => { + _ = self.hb_hear_timer.timeout(), if !paused => { self.become_a_leader()?; }, // leader sending heartbeat - _ = self.hb_send_interval.tick(), if self.is_leader => { + _ = self.hb_send_interval.tick(), if !paused && self.is_leader => { self.bcast_heartbeats()?; } @@ -1600,7 +1638,7 @@ impl GenericReplica for RSPaxosReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - match self.handle_ctrl_msg(ctrl_msg).await { + match self.handle_ctrl_msg(ctrl_msg, &mut paused).await { Ok(terminate) => { if let Some(restart) = terminate { pf_warn!( diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index 56b28414..c42be3bc 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -437,20 +437,52 @@ impl SimplePushReplica { Ok(()) } + /// Handler of Pause control message. + fn handle_ctrl_pause( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got pause req"); + *paused = true; + self.control_hub.send_ctrl(CtrlMsg::PauseReply)?; + Ok(()) + } + + /// Handler of Resume control message. + fn handle_ctrl_resume( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got resume req"); + *paused = false; + self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; + Ok(()) + } + /// Synthesized handler of manager control messages. If ok, returns /// `Some(true)` if decides to terminate and reboot, `Some(false)` if /// decides to shutdown completely, and `None` if not terminating. 
async fn handle_ctrl_msg( &mut self, msg: CtrlMsg, + paused: &mut bool, ) -> Result, SummersetError> { - // TODO: fill this when more control message types added match msg { CtrlMsg::ResetState { durable } => { self.handle_ctrl_reset_state(durable).await?; Ok(Some(true)) } + CtrlMsg::Pause => { + self.handle_ctrl_pause(paused)?; + Ok(None) + } + + CtrlMsg::Resume => { + self.handle_ctrl_resume(paused)?; + Ok(None) + } + _ => Ok(None), // ignore all other types } } @@ -642,10 +674,11 @@ impl GenericReplica for SimplePushReplica { self.recover_from_log().await?; // main event loop + let mut paused = false; loop { tokio::select! { // client request batch - req_batch = self.external_api.get_req_batch() => { + req_batch = self.external_api.get_req_batch(), if !paused => { if let Err(e) = req_batch { pf_error!(self.id; "error getting req batch: {}", e); continue; @@ -657,7 +690,7 @@ impl GenericReplica for SimplePushReplica { }, // durable logging result - log_result = self.storage_hub.get_result() => { + log_result = self.storage_hub.get_result(), if !paused => { if let Err(e) = log_result { pf_error!(self.id; "error getting log result: {}", e); continue; @@ -669,7 +702,7 @@ impl GenericReplica for SimplePushReplica { }, // message from peer - msg = self.transport_hub.recv_msg() => { + msg = self.transport_hub.recv_msg(), if !paused => { if let Err(e) = msg { pf_error!(self.id; "error receiving peer msg: {}", e); continue; @@ -691,7 +724,7 @@ impl GenericReplica for SimplePushReplica { } // state machine execution result - cmd_result = self.state_machine.get_result() => { + cmd_result = self.state_machine.get_result(), if !paused => { if let Err(e) = cmd_result { pf_error!(self.id; "error getting cmd result: {}", e); continue; @@ -709,7 +742,7 @@ impl GenericReplica for SimplePushReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - match self.handle_ctrl_msg(ctrl_msg).await { + match self.handle_ctrl_msg(ctrl_msg, &mut paused).await { Ok(terminate) => { if let Some(restart) = terminate { pf_warn!( diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index a3c1ab64..d5b1b8cd 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -27,9 +27,11 @@ lazy_static! { static ref ALL_TESTS: Vec<(&'static str, bool)> = vec![ ("primitive_ops", true), ("client_reconnect", true), - ("leader_node_reset", true), ("non_leader_reset", true), - ("two_nodes_reset", false) + ("leader_node_reset", true), + ("two_nodes_reset", false), + ("non_leader_pause", false), + ("leader_node_pause", false), ]; } @@ -246,7 +248,7 @@ impl ClientTester { ) -> Result<(), SummersetError> { let ctrl_stub = self.driver.ctrl_stub(); - // send ResetServer request to manager + // send ResetServers request to manager let req = CtrlRequest::ResetServers { servers, durable }; let mut sent = ctrl_stub.send_req(Some(&req))?; while !sent { @@ -261,21 +263,69 @@ impl ClientTester { } } + /// Pauses some server(s) in the cluster. + async fn pause_servers( + &mut self, + servers: HashSet, + ) -> Result<(), SummersetError> { + let ctrl_stub = self.driver.ctrl_stub(); + + // send PauseServers request to manager + let req = CtrlRequest::PauseServers { servers }; + let mut sent = ctrl_stub.send_req(Some(&req))?; + while !sent { + sent = ctrl_stub.send_req(None)?; + } + + // wait for reply from manager + let reply = ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::PauseServers { .. 
} => Ok(()), + _ => logged_err!(self.driver.id; "unexpected control reply type"), + } + } + + /// Resume some server(s) in the cluster. + #[allow(dead_code)] + async fn resume_servers( + &mut self, + servers: HashSet, + ) -> Result<(), SummersetError> { + let ctrl_stub = self.driver.ctrl_stub(); + + // send ResumeServers request to manager + let req = CtrlRequest::ResumeServers { servers }; + let mut sent = ctrl_stub.send_req(Some(&req))?; + while !sent { + sent = ctrl_stub.send_req(None)?; + } + + // wait for reply from manager + let reply = ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::ResumeServers { .. } => Ok(()), + _ => logged_err!(self.driver.id; "unexpected control reply type"), + } + } + /// Runs the individual correctness test. async fn do_test_by_name( &mut self, name: &str, ) -> Result<(), SummersetError> { // reset everything to initial state at the start of each test - self.reset_servers(HashSet::new(), false).await?; + // self.reset_servers(HashSet::new(), false).await?; + // time::sleep(Duration::from_secs(1)).await; self.driver.connect().await?; let result = match name { "primitive_ops" => self.test_primitive_ops().await, "client_reconnect" => self.test_client_reconnect().await, - "leader_node_reset" => self.test_leader_node_reset().await, "non_leader_reset" => self.test_non_leader_reset().await, + "leader_node_reset" => self.test_leader_node_reset().await, "two_nodes_reset" => self.test_two_nodes_reset().await, + "non_leader_pause" => self.test_non_leader_pause().await, + "leader_node_pause" => self.test_leader_node_pause().await, _ => { return logged_err!(self.driver.id; "unrecognized test name '{}'", name); @@ -361,12 +411,12 @@ impl ClientTester { Ok(()) } - /// Single leader replica node crashes and restarts. - async fn test_leader_node_reset(&mut self) -> Result<(), SummersetError> { + /// Single non-leader replica node crashes and restarts. + async fn test_non_leader_reset(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); self.checked_put("Jose", &v, Some(None)).await?; for (s, is_leader) in self.query_servers().await? { - if is_leader { + if !is_leader { self.driver.leave(false).await?; self.reset_servers(HashSet::from([s]), true).await?; time::sleep(Duration::from_millis(500)).await; @@ -379,11 +429,11 @@ impl ClientTester { } /// Single leader replica node crashes and restarts. - async fn test_non_leader_reset(&mut self) -> Result<(), SummersetError> { + async fn test_leader_node_reset(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); self.checked_put("Jose", &v, Some(None)).await?; for (s, is_leader) in self.query_servers().await? { - if !is_leader { + if is_leader { self.driver.leave(false).await?; self.reset_servers(HashSet::from([s]), true).await?; time::sleep(Duration::from_millis(500)).await; @@ -423,4 +473,44 @@ impl ClientTester { } Ok(()) } + + /// Single non-leader replica node paused. + async fn test_non_leader_pause(&mut self) -> Result<(), SummersetError> { + let v0 = Self::gen_rand_string(8); + self.checked_put("Jose", &v0, Some(None)).await?; + time::sleep(Duration::from_millis(300)).await; + for (s, is_leader) in self.query_servers().await? 
{ + if !is_leader { + self.driver.leave(false).await?; + self.pause_servers(HashSet::from([s])).await?; + time::sleep(Duration::from_secs(1)).await; + self.driver.connect().await?; + self.checked_get("Jose", Some(Some(&v0))).await?; + let v1 = Self::gen_rand_string(8); + self.checked_put("Jose", &v1, Some(Some(&v0))).await?; + break; + } + } + Ok(()) + } + + /// Single leader replica node paused. + async fn test_leader_node_pause(&mut self) -> Result<(), SummersetError> { + let v0 = Self::gen_rand_string(8); + self.checked_put("Jose", &v0, Some(None)).await?; + time::sleep(Duration::from_millis(300)).await; + for (s, is_leader) in self.query_servers().await? { + if is_leader { + self.driver.leave(false).await?; + self.pause_servers(HashSet::from([s])).await?; + time::sleep(Duration::from_secs(1)).await; + self.driver.connect().await?; + self.checked_get("Jose", Some(Some(&v0))).await?; + let v1 = Self::gen_rand_string(8); + self.checked_put("Jose", &v1, Some(Some(&v0))).await?; + break; + } + } + Ok(()) + } } From 5d2273af52d6d1e5812e1f17e220ac30e65e0818 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 17 Sep 2023 12:16:41 -0500 Subject: [PATCH 47/89] minor updates to tester client --- summerset_client/src/clients/tester.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index d5b1b8cd..e3c27cc0 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -314,8 +314,8 @@ impl ClientTester { name: &str, ) -> Result<(), SummersetError> { // reset everything to initial state at the start of each test - // self.reset_servers(HashSet::new(), false).await?; - // time::sleep(Duration::from_secs(1)).await; + self.reset_servers(HashSet::new(), false).await?; + time::sleep(Duration::from_secs(1)).await; self.driver.connect().await?; let result = match name { From e8bb281d2186579389de8127e3139580d286c0f0 Mon Sep 17 00:00:00 2001 From: josehu Date: Sun, 17 Sep 2023 21:09:13 +0000 Subject: [PATCH 48/89] fix MultiPaxos prepare reply voted bug --- src/manager/clusman.rs | 18 +++- src/manager/reactor.rs | 16 +++- src/protocols/crossword.rs | 26 +++++- src/protocols/multipaxos.rs | 48 ++++++++-- src/protocols/rep_nothing.rs | 13 ++- src/protocols/rs_paxos.rs | 26 +++++- src/protocols/simple_push.rs | 13 ++- src/server/storage.rs | 16 ++-- summerset_client/src/clients/tester.rs | 121 +++++++++++++++++++------ 9 files changed, 230 insertions(+), 67 deletions(-) diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index 0eeb1f1f..7b08e4d9 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -276,10 +276,22 @@ impl ClusterManager { let servers: HashMap = self .server_info .iter() - .map(|(&server, info)| (server, (info.api_addr, info.is_leader))) + .filter_map(|(&server, info)| { + if info.is_paused { + None // ignore paused servers + } else { + Some((server, (info.api_addr, info.is_leader))) + } + }) .collect(); - self.client_reactor - .send_reply(CtrlReply::QueryInfo { servers }, client) + + self.client_reactor.send_reply( + CtrlReply::QueryInfo { + population: self.population, + servers, + }, + client, + ) } /// Handler of client ResetServers request. diff --git a/src/manager/reactor.rs b/src/manager/reactor.rs index 4df04e36..3273c54f 100644 --- a/src/manager/reactor.rs +++ b/src/manager/reactor.rs @@ -54,6 +54,8 @@ pub enum CtrlRequest { pub enum CtrlReply { /// Reply to server info query. QueryInfo { + /// Number of replicas in cluster. 
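// The population count is added so that a client can still find a server to
// talk to when its preferred replica is filtered out of this reply (the
// manager now omits paused servers, per the clusman.rs change above). The
// client-side handling added later in this patch rotates through replica IDs;
// a distilled sketch of that rotation (helper name illustrative, types as
// used in this reply):
fn pick_active_server(
    mut preferred: ReplicaId,
    population: u8,
    servers: &std::collections::HashMap<ReplicaId, (std::net::SocketAddr, bool)>,
) -> ReplicaId {
    assert!(!servers.is_empty());
    // walk forward modulo population until an active (non-paused) server is hit
    while !servers.contains_key(&preferred) {
        preferred = (preferred + 1) % population;
    }
    preferred
}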
+ population: u8, /// Map from replica ID -> (addr, is_leader). servers: HashMap, }, @@ -466,6 +468,7 @@ mod reactor_tests { // send reply to client reactor.send_reply( CtrlReply::QueryInfo { + population: 2, servers: HashMap::::from([ (0, ("127.0.0.1:53700".parse()?, true)), (1, ("127.0.0.1:53701".parse()?, false)), @@ -485,6 +488,7 @@ mod reactor_tests { assert_eq!( ctrl_stub.recv_reply().await?, CtrlReply::QueryInfo { + population: 2, servers: HashMap::::from([ (0, ("127.0.0.1:53700".parse()?, true)), (1, ("127.0.0.1:53701".parse()?, false)), @@ -510,6 +514,7 @@ mod reactor_tests { assert_eq!( ctrl_stub.recv_reply().await?, CtrlReply::QueryInfo { + population: 2, servers: HashMap::::from([ (0, ("127.0.0.1:54700".parse()?, true)), (1, ("127.0.0.1:54701".parse()?, false)), @@ -530,9 +535,10 @@ mod reactor_tests { assert_eq!( ctrl_stub.recv_reply().await?, CtrlReply::QueryInfo { + population: 2, servers: HashMap::::from([ - (0, ("127.0.0.1:54710".parse()?, true)), - (1, ("127.0.0.1:54711".parse()?, false)), + (0, ("127.0.0.1:54700".parse()?, true)), + (1, ("127.0.0.1:54701".parse()?, false)), ]), } ); @@ -549,6 +555,7 @@ mod reactor_tests { // send reply to client reactor.send_reply( CtrlReply::QueryInfo { + population: 2, servers: HashMap::::from([ (0, ("127.0.0.1:54700".parse()?, true)), (1, ("127.0.0.1:54701".parse()?, false)), @@ -564,9 +571,10 @@ mod reactor_tests { // send reply to new client reactor.send_reply( CtrlReply::QueryInfo { + population: 2, servers: HashMap::::from([ - (0, ("127.0.0.1:54710".parse()?, true)), - (1, ("127.0.0.1:54711".parse()?, false)), + (0, ("127.0.0.1:54700".parse()?, true)), + (1, ("127.0.0.1:54701".parse()?, false)), ]), }, client2, diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index cdfe2517..529b35c8 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -1008,7 +1008,6 @@ impl CrosswordReplica { } /// Handler of Commit message from leader. - /// TODO: take care of missing/lost Commit messages fn handle_msg_commit( &mut self, peer: ReplicaId, @@ -1143,7 +1142,10 @@ impl CrosswordReplica { /// Becomes a leader, sends self-initiated Prepare messages to followers /// for all in-progress instances, and starts broadcasting heartbeats. 
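// Note on the change just below: become_a_leader() now returns early when the
// replica already considers itself leader, instead of asserting. This appears
// to guard against the heartbeat-hearing timeout firing again on a replica
// that is already leading (for example around the pause/resume paths added
// earlier), where the previous assert!(!self.is_leader) would have panicked.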
fn become_a_leader(&mut self) -> Result<(), SummersetError> { - assert!(!self.is_leader); + if self.is_leader { + return Ok(()); + } + self.is_leader = true; // this starts broadcasting heartbeats self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; @@ -1851,7 +1853,15 @@ impl GenericEndpoint for CrosswordClient { let reply = self.ctrl_stub.recv_reply().await?; match reply { - CtrlReply::QueryInfo { servers } => { + CtrlReply::QueryInfo { + population, + servers, + } => { + // shift to a new server_id if current one not active + assert!(!servers.is_empty()); + while !servers.contains_key(&self.server_id) { + self.server_id = (self.server_id + 1) % population; + } // establish connection to all servers self.servers = servers .into_iter() @@ -1907,7 +1917,10 @@ impl GenericEndpoint for CrosswordClient { .unwrap() .send_req(req) } else { - Err(SummersetError("client not set up".into())) + Err(SummersetError(format!( + "server_id {} not in api_stubs", + self.server_id + ))) } } @@ -1938,7 +1951,10 @@ impl GenericEndpoint for CrosswordClient { Ok(reply) } else { - Err(SummersetError("client not set up".into())) + Err(SummersetError(format!( + "server_id {} not in api_stubs", + self.server_id + ))) } } diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 759e7cf3..974b95d4 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -132,6 +132,9 @@ struct Instance { /// Batch of client requests. reqs: ReqBatch, + /// Highest ballot and associated value I have accepted. + voted: (Ballot, ReqBatch), + /// Leader-side bookkeeping info. leader_bk: Option, @@ -359,6 +362,7 @@ impl MultiPaxosReplica { bal: 0, status: Status::Null, reqs: req_batch.clone(), + voted: (0, Vec::new()), leader_bk: Some(LeaderBookkeeping { prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, @@ -420,6 +424,7 @@ impl MultiPaxosReplica { slot, inst.bal); // record update to largest accepted ballot and corresponding data + inst.voted = (inst.bal, req_batch.clone()); self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { @@ -458,8 +463,8 @@ impl MultiPaxosReplica { pf_trace!(self.id; "finished PrepareBal logging for slot {} bal {}", slot, self.insts[slot].bal); let inst = &self.insts[slot]; - let voted = if inst.status >= Status::Accepting { - Some((inst.bal, inst.reqs.clone())) + let voted = if inst.voted.0 > 0 { + Some(inst.voted.clone()) } else { None }; @@ -609,6 +614,7 @@ impl MultiPaxosReplica { bal: 0, status: Status::Null, reqs: Vec::new(), + voted: (0, Vec::new()), leader_bk: None, replica_bk: None, external: false, @@ -739,6 +745,7 @@ impl MultiPaxosReplica { bal: 0, status: Status::Null, reqs: Vec::new(), + voted: (0, Vec::new()), leader_bk: None, replica_bk: None, external: false, @@ -756,6 +763,7 @@ impl MultiPaxosReplica { self.bal_max_seen = ballot; // record update to largest prepare ballot + inst.voted = (ballot, reqs.clone()); self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { @@ -829,7 +837,6 @@ impl MultiPaxosReplica { } /// Handler of Commit message from leader. 
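// The heart of the "prepare reply voted" fix in this patch: a PrepareReply
// must report the highest ballot at which this replica actually accepted a
// value, together with that value. Pairing the instance's current ballot with
// its current request batch (as the code did before) can misreport, since the
// ballot may have advanced without a corresponding accept. The new
// per-instance `voted` field records exactly that (ballot, value) pair, and
// the reply-side decision reduces to the sketch below (mirrors the handler
// change above; Instance, Ballot, and ReqBatch are the types from this file):
fn voted_for_prepare_reply(inst: &Instance) -> Option<(Ballot, ReqBatch)> {
    if inst.voted.0 > 0 {
        // something was accepted in this slot before: hand it to the new
        // leader so it re-proposes the correct value
        Some(inst.voted.clone())
    } else {
        // nothing ever accepted here: no voted information to report
        None
    }
}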
- /// TODO: take care of missing/lost Commit messages fn handle_msg_commit( &mut self, peer: ReplicaId, @@ -843,6 +850,7 @@ impl MultiPaxosReplica { bal: 0, status: Status::Null, reqs: Vec::new(), + voted: (0, Vec::new()), leader_bk: None, replica_bk: None, external: false, @@ -958,7 +966,10 @@ impl MultiPaxosReplica { /// Becomes a leader, sends self-initiated Prepare messages to followers /// for all in-progress instances, and starts broadcasting heartbeats. fn become_a_leader(&mut self) -> Result<(), SummersetError> { - assert!(!self.is_leader); + if self.is_leader { + return Ok(()); + } + self.is_leader = true; // this starts broadcasting heartbeats self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; @@ -1120,12 +1131,12 @@ impl MultiPaxosReplica { paused: &mut bool, ) -> Result<(), SummersetError> { pf_warn!(self.id; "server got resume req"); - *paused = false; - self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; // reset leader heartbeat timer self.kickoff_hb_hear_timer()?; + *paused = false; + self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; Ok(()) } @@ -1170,6 +1181,7 @@ impl MultiPaxosReplica { bal: 0, status: Status::Null, reqs: Vec::new(), + voted: (0, Vec::new()), leader_bk: None, replica_bk: None, external: false, @@ -1196,6 +1208,7 @@ impl MultiPaxosReplica { bal: 0, status: Status::Null, reqs: Vec::new(), + voted: (0, Vec::new()), leader_bk: None, replica_bk: None, external: false, @@ -1205,7 +1218,8 @@ impl MultiPaxosReplica { let inst = &mut self.insts[slot]; inst.bal = ballot; inst.status = Status::Accepting; - inst.reqs = reqs; + inst.reqs = reqs.clone(); + inst.voted = (ballot, reqs); // update bal_prepared and bal_max_seen if self.bal_prepared < ballot { self.bal_prepared = ballot; @@ -1625,7 +1639,15 @@ impl GenericEndpoint for MultiPaxosClient { let reply = self.ctrl_stub.recv_reply().await?; match reply { - CtrlReply::QueryInfo { servers } => { + CtrlReply::QueryInfo { + population, + servers, + } => { + // shift to a new server_id if current one not active + assert!(!servers.is_empty()); + while !servers.contains_key(&self.server_id) { + self.server_id = (self.server_id + 1) % population; + } // establish connection to all servers self.servers = servers .into_iter() @@ -1681,7 +1703,10 @@ impl GenericEndpoint for MultiPaxosClient { .unwrap() .send_req(req) } else { - Err(SummersetError("client not set up".into())) + Err(SummersetError(format!( + "server_id {} not in api_stubs", + self.server_id + ))) } } @@ -1712,7 +1737,10 @@ impl GenericEndpoint for MultiPaxosClient { Ok(reply) } else { - Err(SummersetError("client not set up".into())) + Err(SummersetError(format!( + "server_id {} not in api_stubs", + self.server_id + ))) } } diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index 97469f5f..e5c6b0dd 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -612,8 +612,17 @@ impl GenericEndpoint for RepNothingClient { let reply = self.ctrl_stub.recv_reply().await?; match reply { - CtrlReply::QueryInfo { servers } => { - // connect to the one with server ID in config + CtrlReply::QueryInfo { + population, + servers, + } => { + // find a server to connect to, starting from provided server_id + assert!(!servers.is_empty()); + while !servers.contains_key(&self.config.server_id) { + self.config.server_id = + (self.config.server_id + 1) % population; + } + // connect to that server pf_info!(self.id; "connecting to server {} '{}'...", self.config.server_id, servers[&self.config.server_id].0); let api_stub 
= ClientApiStub::new_by_connect( diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index d963a7b8..151470c7 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -908,7 +908,6 @@ impl RSPaxosReplica { } /// Handler of Commit message from leader. - /// TODO: take care of missing/lost Commit messages fn handle_msg_commit( &mut self, peer: ReplicaId, @@ -1043,7 +1042,10 @@ impl RSPaxosReplica { /// Becomes a leader, sends self-initiated Prepare messages to followers /// for all in-progress instances, and starts broadcasting heartbeats. fn become_a_leader(&mut self) -> Result<(), SummersetError> { - assert!(!self.is_leader); + if self.is_leader { + return Ok(()); + } + self.is_leader = true; // this starts broadcasting heartbeats self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; @@ -1744,7 +1746,15 @@ impl GenericEndpoint for RSPaxosClient { let reply = self.ctrl_stub.recv_reply().await?; match reply { - CtrlReply::QueryInfo { servers } => { + CtrlReply::QueryInfo { + population, + servers, + } => { + // shift to a new server_id if current one not active + assert!(!servers.is_empty()); + while !servers.contains_key(&self.server_id) { + self.server_id = (self.server_id + 1) % population; + } // establish connection to all servers self.servers = servers .into_iter() @@ -1800,7 +1810,10 @@ impl GenericEndpoint for RSPaxosClient { .unwrap() .send_req(req) } else { - Err(SummersetError("client not set up".into())) + Err(SummersetError(format!( + "server_id {} not in api_stubs", + self.server_id + ))) } } @@ -1831,7 +1844,10 @@ impl GenericEndpoint for RSPaxosClient { Ok(reply) } else { - Err(SummersetError("client not set up".into())) + Err(SummersetError(format!( + "server_id {} not in api_stubs", + self.server_id + ))) } } diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index c42be3bc..5d8baeec 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -839,8 +839,17 @@ impl GenericEndpoint for SimplePushClient { let reply = self.ctrl_stub.recv_reply().await?; match reply { - CtrlReply::QueryInfo { servers } => { - // connect to the one with server ID in config + CtrlReply::QueryInfo { + population, + servers, + } => { + // find a server to connect to, starting from provided server_id + assert!(!servers.is_empty()); + while !servers.contains_key(&self.config.server_id) { + self.config.server_id = + (self.config.server_id + 1) % population; + } + // connect to that server pf_info!(self.id; "connecting to server {} '{}'...", self.config.server_id, servers[&self.config.server_id].0); let api_stub = ClientApiStub::new_by_connect( diff --git a/src/server/storage.rs b/src/server/storage.rs index 99809e65..d14bd2bf 100644 --- a/src/server/storage.rs +++ b/src/server/storage.rs @@ -203,12 +203,16 @@ where offset: usize, ) -> Result<(Option, usize), SummersetError> { if offset + 8 > file_size { - pf_warn!( - me; - "read header end offset {} out of file bound {}", - offset + 8, - file_size - ); + if offset < file_size { + // suppress warning if offset == file_size to avoid excessive + // log lines during recovery + pf_warn!( + me; + "read header end offset {} out of file bound {}", + offset + 8, + file_size + ); + } return Ok((None, offset)); } diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index e3c27cc0..7da4d5df 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -19,7 +19,7 @@ use tokio::time::{self, 
Duration}; use summerset::{ ReplicaId, GenericEndpoint, CommandResult, CtrlRequest, CtrlReply, - SummersetError, pf_error, logged_err, parsed_config, + SummersetError, pf_debug, pf_error, logged_err, parsed_config, }; lazy_static! { @@ -29,9 +29,10 @@ lazy_static! { ("client_reconnect", true), ("non_leader_reset", true), ("leader_node_reset", true), - ("two_nodes_reset", false), + ("two_nodes_reset", true), ("non_leader_pause", false), ("leader_node_pause", false), + ("node_pause_resume", false), ]; } @@ -109,13 +110,16 @@ impl ClientTester { } /// Issues a Get request and checks its reply value against given one if - /// not `None`. Retries in-place upon getting redirection error. + /// not `None`. Retries in-place upon getting redirection error. Retries + /// at most max_timeouts times upon getting timeouts. async fn checked_get( &mut self, key: &str, expect_value: Option>, + max_timeouts: u8, ) -> Result<(), SummersetError> { - loop { + let mut timeouts = 0; + while timeouts <= max_timeouts { let result = self.driver.get(key).await?; match result { DriverReply::Success { cmd_result, .. } => { @@ -151,25 +155,36 @@ impl ClientTester { } DriverReply::Timeout => { - return logged_err!( + timeouts += 1; + pf_debug!( self.driver.id; "client-side timeout {} ms", self.timeout.as_millis() - ) + ); } } } + + logged_err!( + self.driver.id; + "client-side timeout {} ms {} times", + self.timeout.as_millis(), + max_timeouts + ) } /// Issues a Put request and checks its reply old_value against given one - /// if not `None`. Retries in-place upon getting redirection error. + /// if not `None`. Retries in-place upon getting redirection error. Retries + /// at most max_timeouts times upon getting timeouts. async fn checked_put( &mut self, key: &str, value: &str, expect_old_value: Option>, + max_timeouts: u8, ) -> Result<(), SummersetError> { - loop { + let mut timeouts = 0; + while timeouts <= max_timeouts { let result = self.driver.put(key, value).await?; match result { DriverReply::Success { cmd_result, .. } => { @@ -206,14 +221,22 @@ impl ClientTester { } DriverReply::Timeout => { - return logged_err!( + timeouts += 1; + pf_debug!( self.driver.id; "client-side timeout {} ms", self.timeout.as_millis() - ) + ); } } } + + logged_err!( + self.driver.id; + "client-side timeout {} ms {} times", + self.timeout.as_millis(), + max_timeouts + ) } /// Query the list of servers in the cluster. Returns a map from replica ID @@ -233,7 +256,7 @@ impl ClientTester { // wait for reply from manager let reply = ctrl_stub.recv_reply().await?; match reply { - CtrlReply::QueryInfo { servers } => { + CtrlReply::QueryInfo { servers, .. } => { Ok(servers.into_iter().map(|(id, info)| (id, info.1)).collect()) } _ => logged_err!(self.driver.id; "unexpected control reply type"), @@ -326,6 +349,7 @@ impl ClientTester { "two_nodes_reset" => self.test_two_nodes_reset().await, "non_leader_pause" => self.test_non_leader_pause().await, "leader_node_pause" => self.test_leader_node_pause().await, + "node_pause_resume" => self.test_node_pause_resume().await, _ => { return logged_err!(self.driver.id; "unrecognized test name '{}'", name); @@ -391,37 +415,37 @@ impl ClientTester { impl ClientTester { /// Basic primitive operations. 
async fn test_primitive_ops(&mut self) -> Result<(), SummersetError> { - self.checked_get("Jose", Some(None)).await?; + self.checked_get("Jose", Some(None), 0).await?; let v0 = Self::gen_rand_string(8); - self.checked_put("Jose", &v0, Some(None)).await?; - self.checked_get("Jose", Some(Some(&v0))).await?; + self.checked_put("Jose", &v0, Some(None), 0).await?; + self.checked_get("Jose", Some(Some(&v0)), 0).await?; let v1 = Self::gen_rand_string(16); - self.checked_put("Jose", &v1, Some(Some(&v0))).await?; - self.checked_get("Jose", Some(Some(&v1))).await?; + self.checked_put("Jose", &v1, Some(Some(&v0)), 0).await?; + self.checked_get("Jose", Some(Some(&v1)), 0).await?; Ok(()) } /// Client leaves and reconnects. async fn test_client_reconnect(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); - self.checked_put("Jose", &v, Some(None)).await?; + self.checked_put("Jose", &v, Some(None), 0).await?; self.driver.leave(false).await?; self.driver.connect().await?; - self.checked_get("Jose", Some(Some(&v))).await?; + self.checked_get("Jose", Some(Some(&v)), 0).await?; Ok(()) } /// Single non-leader replica node crashes and restarts. async fn test_non_leader_reset(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); - self.checked_put("Jose", &v, Some(None)).await?; + self.checked_put("Jose", &v, Some(None), 0).await?; for (s, is_leader) in self.query_servers().await? { if !is_leader { self.driver.leave(false).await?; self.reset_servers(HashSet::from([s]), true).await?; time::sleep(Duration::from_millis(500)).await; self.driver.connect().await?; - self.checked_get("Jose", Some(Some(&v))).await?; + self.checked_get("Jose", Some(Some(&v)), 0).await?; break; } } @@ -431,14 +455,14 @@ impl ClientTester { /// Single leader replica node crashes and restarts. async fn test_leader_node_reset(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); - self.checked_put("Jose", &v, Some(None)).await?; + self.checked_put("Jose", &v, Some(None), 0).await?; for (s, is_leader) in self.query_servers().await? { if is_leader { self.driver.leave(false).await?; self.reset_servers(HashSet::from([s]), true).await?; time::sleep(Duration::from_millis(500)).await; self.driver.connect().await?; - self.checked_get("Jose", Some(Some(&v))).await?; + self.checked_get("Jose", Some(Some(&v)), 0).await?; break; } } @@ -448,7 +472,7 @@ impl ClientTester { /// Two replica nodes (leader + non-leader) crash and restart. async fn test_two_nodes_reset(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); - self.checked_put("Jose", &v, Some(None)).await?; + self.checked_put("Jose", &v, Some(None), 0).await?; let mut resets = HashSet::new(); let (mut l, mut nl) = (false, false); for (s, is_leader) in self.query_servers().await? { @@ -469,7 +493,7 @@ impl ClientTester { self.reset_servers(resets, true).await?; time::sleep(Duration::from_millis(500)).await; self.driver.connect().await?; - self.checked_get("Jose", Some(Some(&v))).await?; + self.checked_get("Jose", Some(Some(&v)), 0).await?; } Ok(()) } @@ -477,7 +501,7 @@ impl ClientTester { /// Single non-leader replica node paused. async fn test_non_leader_pause(&mut self) -> Result<(), SummersetError> { let v0 = Self::gen_rand_string(8); - self.checked_put("Jose", &v0, Some(None)).await?; + self.checked_put("Jose", &v0, Some(None), 0).await?; time::sleep(Duration::from_millis(300)).await; for (s, is_leader) in self.query_servers().await? 
{ if !is_leader { @@ -485,9 +509,9 @@ impl ClientTester { self.pause_servers(HashSet::from([s])).await?; time::sleep(Duration::from_secs(1)).await; self.driver.connect().await?; - self.checked_get("Jose", Some(Some(&v0))).await?; + self.checked_get("Jose", Some(Some(&v0)), 0).await?; let v1 = Self::gen_rand_string(8); - self.checked_put("Jose", &v1, Some(Some(&v0))).await?; + self.checked_put("Jose", &v1, Some(Some(&v0)), 0).await?; break; } } @@ -497,7 +521,7 @@ impl ClientTester { /// Single leader replica node paused. async fn test_leader_node_pause(&mut self) -> Result<(), SummersetError> { let v0 = Self::gen_rand_string(8); - self.checked_put("Jose", &v0, Some(None)).await?; + self.checked_put("Jose", &v0, Some(None), 0).await?; time::sleep(Duration::from_millis(300)).await; for (s, is_leader) in self.query_servers().await? { if is_leader { @@ -505,9 +529,46 @@ impl ClientTester { self.pause_servers(HashSet::from([s])).await?; time::sleep(Duration::from_secs(1)).await; self.driver.connect().await?; - self.checked_get("Jose", Some(Some(&v0))).await?; + self.checked_get("Jose", Some(Some(&v0)), 0).await?; let v1 = Self::gen_rand_string(8); - self.checked_put("Jose", &v1, Some(Some(&v0))).await?; + self.checked_put("Jose", &v1, Some(Some(&v0)), 0).await?; + break; + } + } + Ok(()) + } + + /// Leader replica node paused and then resumed, twice. + async fn test_node_pause_resume(&mut self) -> Result<(), SummersetError> { + let v0 = Self::gen_rand_string(8); + self.checked_put("Jose", &v0, Some(None), 0).await?; + time::sleep(Duration::from_millis(300)).await; + for (s, is_leader) in self.query_servers().await? { + if is_leader { + self.driver.leave(false).await?; + self.pause_servers(HashSet::from([s])).await?; + time::sleep(Duration::from_secs(1)).await; + self.driver.connect().await?; + let v1 = Self::gen_rand_string(8); + self.checked_put("Jose", &v1, Some(Some(&v0)), 0).await?; + self.driver.leave(false).await?; + self.resume_servers(HashSet::from([s])).await?; + time::sleep(Duration::from_secs(1)).await; + self.driver.connect().await?; + let v2 = Self::gen_rand_string(8); + self.checked_put("Jose", &v2, Some(Some(&v1)), 1).await?; + self.driver.leave(false).await?; + self.pause_servers(HashSet::from([s])).await?; + time::sleep(Duration::from_secs(1)).await; + self.driver.connect().await?; + let v3 = Self::gen_rand_string(8); + self.checked_put("Jose", &v3, Some(Some(&v2)), 0).await?; + self.driver.leave(false).await?; + self.resume_servers(HashSet::from([s])).await?; + time::sleep(Duration::from_secs(1)).await; + self.driver.connect().await?; + let v4 = Self::gen_rand_string(8); + self.checked_put("Jose", &v4, Some(Some(&v3)), 1).await?; break; } } From 694b3b1dcfa50286a581d69e61aae9b069797564 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 17 Sep 2023 21:46:17 +0000 Subject: [PATCH 49/89] fix prepare reply voted bug for other Paxos variants --- src/protocols/crossword.rs | 165 ++++++++++++++++-------------------- src/protocols/multipaxos.rs | 84 ++++++------------ src/protocols/rs_paxos.rs | 137 +++++++++++++----------------- 3 files changed, 154 insertions(+), 232 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 529b35c8..7bb12ff8 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -140,6 +140,9 @@ struct Instance { /// Shards of a batch of client requests. reqs_cw: RSCodeword, + /// Highest ballot and associated value I have accepted. + voted: (Ballot, RSCodeword), + /// Leader-side bookkeeping info. 
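The `voted` field added above captures the acceptor obligation this patch fixes: a PrepareReply must report the highest ballot at which the replica actually accepted a value, together with that value, rather than whatever payload it currently holds, so that a new leader re-proposes any possibly chosen value. A minimal sketch of that rule, using simplified standalone types rather than this crate's own:

// Minimal sketch (simplified types, not this crate's API): an acceptor's
// prepare handler reports its highest accepted (ballot, value) pair.
type Ballot = u64;

struct Acceptor<V> {
    promised: Ballot,            // highest ballot promised so far
    voted: Option<(Ballot, V)>,  // highest ballot actually accepted, with its value
}

impl<V: Clone> Acceptor<V> {
    /// Handle Prepare(b): promise b if it is new enough, and return the
    /// highest accepted pair so the new leader can re-propose it.
    fn handle_prepare(&mut self, bal: Ballot) -> Option<Option<(Ballot, V)>> {
        if bal < self.promised {
            return None; // reject stale prepare
        }
        self.promised = bal;
        Some(self.voted.clone())
    }

    /// Handle Accept(b, v): record it as the new highest vote if not stale.
    fn handle_accept(&mut self, bal: Ballot, val: V) -> bool {
        if bal < self.promised {
            return false;
        }
        self.promised = bal;
        self.voted = Some((bal, val));
        true
    }
}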
leader_bk: Option, @@ -270,6 +273,28 @@ pub struct CrosswordReplica { } impl CrosswordReplica { + /// Create an empty null instance. + fn null_instance(&self) -> Result { + Ok(Instance { + bal: 0, + status: Status::Null, + reqs_cw: RSCodeword::::from_null( + self.quorum_cnt, + self.population - self.quorum_cnt, + )?, + voted: ( + 0, + RSCodeword::::from_null( + self.quorum_cnt, + self.population - self.quorum_cnt, + )?, + ), + leader_bk: None, + replica_bk: None, + external: false, + }) + } + /// Compose a unique ballot number from base. fn make_unique_ballot(&self, base: u64) -> Ballot { ((base << 8) | ((self.id + 1) as u64)) as Ballot @@ -428,18 +453,14 @@ impl CrosswordReplica { accept_acks: HashMap::new(), }); } else { - let new_inst = Instance { - bal: 0, - status: Status::Null, - reqs_cw, - leader_bk: Some(LeaderBookkeeping { - prepare_acks: Bitmap::new(self.population, false), - prepare_max_bal: 0, - accept_acks: HashMap::new(), - }), - replica_bk: None, - external: true, - }; + let mut new_inst = self.null_instance()?; + new_inst.reqs_cw = reqs_cw; + new_inst.leader_bk = Some(LeaderBookkeeping { + prepare_acks: Bitmap::new(self.population, false), + prepare_max_bal: 0, + accept_acks: HashMap::new(), + }); + new_inst.external = true; self.insts.push(new_inst); } @@ -493,6 +514,18 @@ impl CrosswordReplica { slot, inst.bal); // record update to largest accepted ballot and corresponding data + let subset_copy = inst.reqs_cw.subset_copy( + Bitmap::from( + self.population, + Self::shards_for_replica( + self.id, + self.population, + self.config.shards_per_replica, + ), + ), + false, + )?; + inst.voted = (inst.bal, subset_copy.clone()); self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { @@ -500,17 +533,7 @@ impl CrosswordReplica { slot, ballot: inst.bal, // persist only some shards on myself - reqs_cw: inst.reqs_cw.subset_copy( - Bitmap::from( - self.population, - Self::shards_for_replica( - self.id, - self.population, - self.config.shards_per_replica, - ), - ), - false, - )?, + reqs_cw: subset_copy, }, sync: self.config.logger_sync, }, @@ -558,8 +581,8 @@ impl CrosswordReplica { pf_trace!(self.id; "finished PrepareBal logging for slot {} bal {}", slot, self.insts[slot].bal); let inst = &self.insts[slot]; - let voted = if inst.status >= Status::Accepting { - Some((inst.bal, inst.reqs_cw.clone())) + let voted = if inst.voted.0 > 0 { + Some(inst.voted.clone()) } else { None }; @@ -716,17 +739,7 @@ impl CrosswordReplica { if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, - )?, - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()?); } let inst = &mut self.insts[slot]; assert!(inst.bal <= ballot); @@ -819,23 +832,25 @@ impl CrosswordReplica { } // record update to largest accepted ballot and corresponding data + let subset_copy = inst.reqs_cw.subset_copy( + Bitmap::from( + self.population, + Self::shards_for_replica( + self.id, + self.population, + self.config.shards_per_replica, + ), + ), + false, + )?; + inst.voted = (ballot, subset_copy.clone()); self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { entry: LogEntry::AcceptData { slot, ballot, - reqs_cw: inst.reqs_cw.subset_copy( - Bitmap::from( - 
self.population, - Self::shards_for_replica( - self.id, - self.population, - self.config.shards_per_replica, - ), - ), - false, - )?, + reqs_cw: subset_copy, }, sync: self.config.logger_sync, }, @@ -890,17 +905,7 @@ impl CrosswordReplica { if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, - )?, - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()?); } let inst = &mut self.insts[slot]; assert!(inst.bal <= ballot); @@ -914,6 +919,7 @@ impl CrosswordReplica { self.bal_max_seen = ballot; // record update to largest prepare ballot + inst.voted = (ballot, inst.reqs_cw.clone()); self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { @@ -1017,17 +1023,7 @@ impl CrosswordReplica { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, - )?, - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()?); } let inst = &mut self.insts[slot]; @@ -1307,12 +1303,12 @@ impl CrosswordReplica { paused: &mut bool, ) -> Result<(), SummersetError> { pf_warn!(self.id; "server got resume req"); - *paused = false; - self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; // reset leader heartbeat timer self.kickoff_hb_hear_timer()?; + *paused = false; + self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; Ok(()) } @@ -1353,17 +1349,7 @@ impl CrosswordReplica { LogEntry::PrepareBal { slot, ballot } => { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, - )?, - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()?); } // update instance state let inst = &mut self.insts[slot]; @@ -1386,23 +1372,14 @@ impl CrosswordReplica { } => { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, - )?, - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()?); } // update instance state let inst = &mut self.insts[slot]; inst.bal = ballot; inst.status = Status::Accepting; - inst.reqs_cw = reqs_cw; + inst.reqs_cw = reqs_cw.clone(); + inst.voted = (ballot, reqs_cw); // update bal_prepared and bal_max_seen if self.bal_prepared < ballot { self.bal_prepared = ballot; diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 974b95d4..e5024071 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -262,6 +262,19 @@ pub struct MultiPaxosReplica { } impl MultiPaxosReplica { + /// Create an empty null instance. 
+ fn null_instance(&self) -> Instance { + Instance { + bal: 0, + status: Status::Null, + reqs: Vec::new(), + voted: (0, Vec::new()), + leader_bk: None, + replica_bk: None, + external: false, + } + } + /// Compose a unique ballot number from base. fn make_unique_ballot(&self, base: u64) -> Ballot { ((base << 8) | ((self.id + 1) as u64)) as Ballot @@ -358,19 +371,14 @@ impl MultiPaxosReplica { } } if slot == self.insts.len() { - let new_inst = Instance { - bal: 0, - status: Status::Null, - reqs: req_batch.clone(), - voted: (0, Vec::new()), - leader_bk: Some(LeaderBookkeeping { - prepare_acks: Bitmap::new(self.population, false), - prepare_max_bal: 0, - accept_acks: Bitmap::new(self.population, false), - }), - replica_bk: None, - external: true, - }; + let mut new_inst = self.null_instance(); + new_inst.reqs = req_batch.clone(); + new_inst.leader_bk = Some(LeaderBookkeeping { + prepare_acks: Bitmap::new(self.population, false), + prepare_max_bal: 0, + accept_acks: Bitmap::new(self.population, false), + }); + new_inst.external = true; self.insts.push(new_inst); } @@ -610,15 +618,7 @@ impl MultiPaxosReplica { if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs: Vec::new(), - voted: (0, Vec::new()), - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()); } let inst = &mut self.insts[slot]; assert!(inst.bal <= ballot); @@ -741,15 +741,7 @@ impl MultiPaxosReplica { if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs: Vec::new(), - voted: (0, Vec::new()), - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()); } let inst = &mut self.insts[slot]; assert!(inst.bal <= ballot); @@ -846,15 +838,7 @@ impl MultiPaxosReplica { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs: Vec::new(), - voted: (0, Vec::new()), - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()); } let inst = &mut self.insts[slot]; @@ -1177,15 +1161,7 @@ impl MultiPaxosReplica { LogEntry::PrepareBal { slot, ballot } => { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs: Vec::new(), - voted: (0, Vec::new()), - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()); } // update instance state let inst = &mut self.insts[slot]; @@ -1204,15 +1180,7 @@ impl MultiPaxosReplica { LogEntry::AcceptData { slot, ballot, reqs } => { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs: Vec::new(), - voted: (0, Vec::new()), - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()); } // update instance state let inst = &mut self.insts[slot]; diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 151470c7..f0320a9e 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -134,6 +134,9 @@ struct Instance { /// Shards of a batch of client 
requests. reqs_cw: RSCodeword, + /// Highest ballot and associated value I have accepted. + voted: (Ballot, RSCodeword), + /// Leader-side bookkeeping info. leader_bk: Option, @@ -264,6 +267,28 @@ pub struct RSPaxosReplica { } impl RSPaxosReplica { + /// Create an empty null instance. + fn null_instance(&self) -> Result { + Ok(Instance { + bal: 0, + status: Status::Null, + reqs_cw: RSCodeword::::from_null( + self.quorum_cnt, + self.population - self.quorum_cnt, + )?, + voted: ( + 0, + RSCodeword::::from_null( + self.quorum_cnt, + self.population - self.quorum_cnt, + )?, + ), + leader_bk: None, + replica_bk: None, + external: false, + }) + } + /// Compose a unique ballot number from base. fn make_unique_ballot(&self, base: u64) -> Ballot { ((base << 8) | ((self.id + 1) as u64)) as Ballot @@ -370,18 +395,14 @@ impl RSPaxosReplica { accept_acks: Bitmap::new(self.population, false), }); } else { - let new_inst = Instance { - bal: 0, - status: Status::Null, - reqs_cw, - leader_bk: Some(LeaderBookkeeping { - prepare_acks: Bitmap::new(self.population, false), - prepare_max_bal: 0, - accept_acks: Bitmap::new(self.population, false), - }), - replica_bk: None, - external: true, - }; + let mut new_inst = self.null_instance()?; + new_inst.reqs_cw = reqs_cw; + new_inst.leader_bk = Some(LeaderBookkeeping { + prepare_acks: Bitmap::new(self.population, false), + prepare_max_bal: 0, + accept_acks: Bitmap::new(self.population, false), + }); + new_inst.external = true; self.insts.push(new_inst); } @@ -435,6 +456,11 @@ impl RSPaxosReplica { slot, inst.bal); // record update to largest accepted ballot and corresponding data + let subset_copy = inst.reqs_cw.subset_copy( + Bitmap::from(self.population, vec![self.id]), + false, + )?; + inst.voted = (inst.bal, subset_copy.clone()); self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { @@ -442,10 +468,7 @@ impl RSPaxosReplica { slot, ballot: inst.bal, // persist only one shard on myself - reqs_cw: inst.reqs_cw.subset_copy( - Bitmap::from(self.population, vec![self.id]), - false, - )?, + reqs_cw: subset_copy, }, sync: self.config.logger_sync, }, @@ -485,8 +508,8 @@ impl RSPaxosReplica { pf_trace!(self.id; "finished PrepareBal logging for slot {} bal {}", slot, self.insts[slot].bal); let inst = &self.insts[slot]; - let voted = if inst.status >= Status::Accepting { - Some((inst.bal, inst.reqs_cw.clone())) + let voted = if inst.voted.0 > 0 { + Some(inst.voted.clone()) } else { None }; @@ -643,17 +666,7 @@ impl RSPaxosReplica { if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, - )?, - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()?); } let inst = &mut self.insts[slot]; assert!(inst.bal <= ballot); @@ -746,16 +759,18 @@ impl RSPaxosReplica { } // record update to largest accepted ballot and corresponding data + let subset_copy = inst.reqs_cw.subset_copy( + Bitmap::from(self.population, vec![self.id]), + false, + )?; + inst.voted = (ballot, subset_copy.clone()); self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { entry: LogEntry::AcceptData { slot, ballot, - reqs_cw: inst.reqs_cw.subset_copy( - Bitmap::from(self.population, vec![self.id]), - false, - )?, + reqs_cw: subset_copy, }, sync: 
self.config.logger_sync, }, @@ -803,17 +818,7 @@ impl RSPaxosReplica { if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, - )?, - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()?); } let inst = &mut self.insts[slot]; assert!(inst.bal <= ballot); @@ -827,6 +832,7 @@ impl RSPaxosReplica { self.bal_max_seen = ballot; // record update to largest prepare ballot + inst.voted = (ballot, inst.reqs_cw.clone()); self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { @@ -917,17 +923,7 @@ impl RSPaxosReplica { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, - )?, - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()?); } let inst = &mut self.insts[slot]; @@ -1207,12 +1203,12 @@ impl RSPaxosReplica { paused: &mut bool, ) -> Result<(), SummersetError> { pf_warn!(self.id; "server got resume req"); - *paused = false; - self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; // reset leader heartbeat timer self.kickoff_hb_hear_timer()?; + *paused = false; + self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; Ok(()) } @@ -1253,17 +1249,7 @@ impl RSPaxosReplica { LogEntry::PrepareBal { slot, ballot } => { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, - )?, - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()?); } // update instance state let inst = &mut self.insts[slot]; @@ -1286,23 +1272,14 @@ impl RSPaxosReplica { } => { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, - )?, - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()?); } // update instance state let inst = &mut self.insts[slot]; inst.bal = ballot; inst.status = Status::Accepting; - inst.reqs_cw = reqs_cw; + inst.reqs_cw = reqs_cw.clone(); + inst.voted = (ballot, reqs_cw); // update bal_prepared and bal_max_seen if self.bal_prepared < ballot { self.bal_prepared = ballot; From 33920992bec932aaee807f00040db271cbca8756 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 17 Sep 2023 23:54:14 +0000 Subject: [PATCH 50/89] add recovery read msgs & fix sharding bugs --- src/protocols/crossword.rs | 135 ++++++++++++++++++++++++- src/protocols/rs_paxos.rs | 135 ++++++++++++++++++++++++- summerset_client/src/clients/tester.rs | 2 +- 3 files changed, 267 insertions(+), 5 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 7bb12ff8..c9c6a8da 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -198,6 +198,16 @@ enum PeerMsg { /// Commit notification from leader to replicas. 
Commit { slot: usize }, + /// Recovery read from new leader to replicas. + Recover { slot: usize }, + + /// Recovery read reply from replica to leader. + RecoverReply { + slot: usize, + ballot: Ballot, + reqs_cw: RSCodeword, + }, + /// Leader activity heartbeat. Heartbeat { ballot: Ballot }, } @@ -816,8 +826,13 @@ impl CrosswordReplica { // instance using the request batch value constructed using shards // with the highest ballot number in quorum if leader_bk.prepare_acks.count() >= self.quorum_cnt - && inst.reqs_cw.avail_data_shards() >= self.quorum_cnt + && inst.reqs_cw.avail_shards() >= self.quorum_cnt { + if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + // have enough shards but need reconstruction + inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; + } + inst.status = Status::Accepting; pf_debug!(self.id; "enter Accept phase for slot {} bal {}", slot, inst.bal); @@ -1051,6 +1066,105 @@ impl CrosswordReplica { Ok(()) } + /// Handler of Recover message from leader. + fn handle_msg_recover( + &mut self, + peer: ReplicaId, + slot: usize, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received Recover <- {} for slot {}", peer, slot); + + // locate instance in memory, filling in null instances if needed + while self.insts.len() <= slot { + self.insts.push(self.null_instance()?); + } + let inst = &mut self.insts[slot]; + + // ignore spurious duplications; also ignore if I have nothing to send back + if inst.status < Status::Accepting || inst.reqs_cw.avail_shards() == 0 { + return Ok(()); + } + + // send back my ballot for this slot and the available shards + self.transport_hub.send_msg( + PeerMsg::RecoverReply { + slot, + ballot: inst.bal, + reqs_cw: inst.reqs_cw.clone(), + }, + peer, + )?; + pf_trace!(self.id; "sent RecoverReply message for slot {} bal {}", slot, inst.bal); + + Ok(()) + } + + /// Handler of Recover reply from replica. + fn handle_msg_recover_reply( + &mut self, + peer: ReplicaId, + slot: usize, + ballot: Ballot, + reqs_cw: RSCodeword, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received RecoverReply <- {} for slot {} bal {} shards {:?}", + peer, slot, ballot, reqs_cw.avail_shards_map()); + assert!(slot < self.insts.len()); + assert!(self.insts[slot].status >= Status::Committed); + let num_insts = self.insts.len(); + let inst = &mut self.insts[slot]; + + // if reply not outdated and ballot is up-to-date + if inst.status < Status::Executed && ballot >= inst.bal { + // absorb the shards from this replica + inst.reqs_cw.absorb_other(reqs_cw)?; + + // if enough shards have been gathered, can push execution forward + if slot == self.commit_bar { + while self.commit_bar < num_insts { + let inst = &mut self.insts[self.commit_bar]; + if inst.status < Status::Committed + || inst.reqs_cw.avail_shards() < self.quorum_cnt + { + break; + } + + if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + // have enough shards but need reconstruction + inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; + } + let reqs = inst.reqs_cw.get_data()?; + + // submit commands in committed instance to the state machine + // for execution + if reqs.is_empty() { + inst.status = Status::Executed; + } else { + for (cmd_idx, (_, req)) in reqs.iter().enumerate() { + if let ApiRequest::Req { cmd, .. 
} = req { + self.state_machine.submit_cmd( + Self::make_command_id( + self.commit_bar, + cmd_idx, + ), + cmd.clone(), + )?; + } else { + continue; // ignore other types of requests + } + } + pf_trace!(self.id; "submitted {} exec commands for slot {}", + reqs.len(), self.commit_bar); + } + + self.commit_bar += 1; + } + } + } + + Ok(()) + } + /// Synthesized handler of receiving message from peer. fn handle_msg_recv( &mut self, @@ -1075,6 +1189,12 @@ impl CrosswordReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), + PeerMsg::Recover { slot } => self.handle_msg_recover(peer, slot), + PeerMsg::RecoverReply { + slot, + ballot, + reqs_cw, + } => self.handle_msg_recover_reply(peer, slot, ballot, reqs_cw), PeerMsg::Heartbeat { ballot } => self.heard_heartbeat(peer, ballot), } } @@ -1155,8 +1275,8 @@ impl CrosswordReplica { self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen); self.bal_max_seen = self.bal_prep_sent; - // redo Prepare phase for all in-progress instances for (slot, inst) in self.insts.iter_mut().enumerate() { + // redo Prepare phase for all in-progress instances if inst.status < Status::Committed { inst.bal = self.bal_prep_sent; inst.status = Status::Preparing; @@ -1188,6 +1308,17 @@ impl CrosswordReplica { pf_trace!(self.id; "broadcast Prepare messages for slot {} bal {}", slot, inst.bal); } + + // do recovery reads for all committed instances that do not + // hold enough available shards for reconstruction + if inst.status == Status::Committed + && inst.reqs_cw.avail_shards() < self.quorum_cnt + { + self.transport_hub + .bcast_msg(PeerMsg::Recover { slot }, None)?; + pf_trace!(self.id; "broadcast Recover messages for slot {} bal {} shards {:?}", + slot, inst.bal, inst.reqs_cw.avail_shards_map()); + } } Ok(()) diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index f0320a9e..9a79da2e 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -192,6 +192,16 @@ enum PeerMsg { /// Commit notification from leader to replicas. Commit { slot: usize }, + /// Recovery read from new leader to replicas. + Recover { slot: usize }, + + /// Recovery read reply from replica to leader. + RecoverReply { + slot: usize, + ballot: Ballot, + reqs_cw: RSCodeword, + }, + /// Leader activity heartbeat. Heartbeat { ballot: Ballot }, } @@ -743,8 +753,13 @@ impl RSPaxosReplica { // instance using the request batch value constructed using shards // with the highest ballot number in quorum if leader_bk.prepare_acks.count() >= self.quorum_cnt - && inst.reqs_cw.avail_data_shards() >= self.quorum_cnt + && inst.reqs_cw.avail_shards() >= self.quorum_cnt { + if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + // have enough shards but need reconstruction + inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; + } + inst.status = Status::Accepting; pf_debug!(self.id; "enter Accept phase for slot {} bal {}", slot, inst.bal); @@ -951,6 +966,105 @@ impl RSPaxosReplica { Ok(()) } + /// Handler of Recover message from leader. 
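The Recover / RecoverReply exchange lets a new leader regather shards for slots it knows are committed but cannot yet reconstruct: replies are absorbed until at least `quorum_cnt` distinct shards are available, after which the data shards are rebuilt and execution can proceed. A minimal sketch of that threshold check, assuming plain `Option<Vec<u8>>` shards rather than the crate's `RSCodeword` wrapper (payload padding handling omitted):

// Minimal sketch (assumed simplified setup): once enough shards of a slot
// have been gathered from RecoverReply messages, rebuild the original data.
use reed_solomon_erasure::galois_8::ReedSolomon;

fn try_reconstruct(
    shards: &mut [Option<Vec<u8>>], // one entry per shard index; None = missing
    data_shards: usize,
    parity_shards: usize,
) -> Result<Option<Vec<u8>>, reed_solomon_erasure::Error> {
    let avail = shards.iter().filter(|s| s.is_some()).count();
    if avail < data_shards {
        return Ok(None); // not enough shards yet; keep waiting for replies
    }
    let rs = ReedSolomon::new(data_shards, parity_shards)?;
    rs.reconstruct_data(shards)?; // fill in missing data shards in place
    // concatenate the data shards back into the original byte payload
    let mut data = Vec::new();
    for shard in shards.iter().take(data_shards) {
        data.extend_from_slice(shard.as_ref().unwrap());
    }
    Ok(Some(data))
}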
+ fn handle_msg_recover( + &mut self, + peer: ReplicaId, + slot: usize, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received Recover <- {} for slot {}", peer, slot); + + // locate instance in memory, filling in null instances if needed + while self.insts.len() <= slot { + self.insts.push(self.null_instance()?); + } + let inst = &mut self.insts[slot]; + + // ignore spurious duplications; also ignore if I have nothing to send back + if inst.status < Status::Accepting || inst.reqs_cw.avail_shards() == 0 { + return Ok(()); + } + + // send back my ballot for this slot and the available shards + self.transport_hub.send_msg( + PeerMsg::RecoverReply { + slot, + ballot: inst.bal, + reqs_cw: inst.reqs_cw.clone(), + }, + peer, + )?; + pf_trace!(self.id; "sent RecoverReply message for slot {} bal {}", slot, inst.bal); + + Ok(()) + } + + /// Handler of Recover reply from replica. + fn handle_msg_recover_reply( + &mut self, + peer: ReplicaId, + slot: usize, + ballot: Ballot, + reqs_cw: RSCodeword, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received RecoverReply <- {} for slot {} bal {} shards {:?}", + peer, slot, ballot, reqs_cw.avail_shards_map()); + assert!(slot < self.insts.len()); + assert!(self.insts[slot].status >= Status::Committed); + let num_insts = self.insts.len(); + let inst = &mut self.insts[slot]; + + // if reply not outdated and ballot is up-to-date + if inst.status < Status::Executed && ballot >= inst.bal { + // absorb the shards from this replica + inst.reqs_cw.absorb_other(reqs_cw)?; + + // if enough shards have been gathered, can push execution forward + if slot == self.commit_bar { + while self.commit_bar < num_insts { + let inst = &mut self.insts[self.commit_bar]; + if inst.status < Status::Committed + || inst.reqs_cw.avail_shards() < self.quorum_cnt + { + break; + } + + if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + // have enough shards but need reconstruction + inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; + } + let reqs = inst.reqs_cw.get_data()?; + + // submit commands in committed instance to the state machine + // for execution + if reqs.is_empty() { + inst.status = Status::Executed; + } else { + for (cmd_idx, (_, req)) in reqs.iter().enumerate() { + if let ApiRequest::Req { cmd, .. } = req { + self.state_machine.submit_cmd( + Self::make_command_id( + self.commit_bar, + cmd_idx, + ), + cmd.clone(), + )?; + } else { + continue; // ignore other types of requests + } + } + pf_trace!(self.id; "submitted {} exec commands for slot {}", + reqs.len(), self.commit_bar); + } + + self.commit_bar += 1; + } + } + } + + Ok(()) + } + /// Synthesized handler of receiving message from peer. 
fn handle_msg_recv( &mut self, @@ -975,6 +1089,12 @@ impl RSPaxosReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), + PeerMsg::Recover { slot } => self.handle_msg_recover(peer, slot), + PeerMsg::RecoverReply { + slot, + ballot, + reqs_cw, + } => self.handle_msg_recover_reply(peer, slot, ballot, reqs_cw), PeerMsg::Heartbeat { ballot } => self.heard_heartbeat(peer, ballot), } } @@ -1055,8 +1175,8 @@ impl RSPaxosReplica { self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen); self.bal_max_seen = self.bal_prep_sent; - // redo Prepare phase for all in-progress instances for (slot, inst) in self.insts.iter_mut().enumerate() { + // redo Prepare phase for all in-progress instances if inst.status < Status::Committed { inst.bal = self.bal_prep_sent; inst.status = Status::Preparing; @@ -1088,6 +1208,17 @@ impl RSPaxosReplica { pf_trace!(self.id; "broadcast Prepare messages for slot {} bal {}", slot, inst.bal); } + + // do recovery reads for all committed instances that do not + // hold enough available shards for reconstruction + if inst.status == Status::Committed + && inst.reqs_cw.avail_shards() < self.quorum_cnt + { + self.transport_hub + .bcast_msg(PeerMsg::Recover { slot }, None)?; + pf_trace!(self.id; "broadcast Recover messages for slot {} bal {} shards {:?}", + slot, inst.bal, inst.reqs_cw.avail_shards_map()); + } } Ok(()) diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index 7da4d5df..240a18ff 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -235,7 +235,7 @@ impl ClientTester { self.driver.id; "client-side timeout {} ms {} times", self.timeout.as_millis(), - max_timeouts + max_timeouts + 1 ) } From e416476563e4f34698f3964336b298b7b9421d08 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Mon, 18 Sep 2023 00:18:44 +0000 Subject: [PATCH 51/89] add scripted tests to github workflow beside unit tests --- .github/workflows/tests_proc.yml | 20 ++++ .../workflows/{tests.yml => tests_unit.yml} | 2 +- README.md | 5 +- scripts/workflow_test.py | 109 ++++++++++++++++++ 4 files changed, 133 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/tests_proc.yml rename .github/workflows/{tests.yml => tests_unit.yml} (90%) create mode 100644 scripts/workflow_test.py diff --git a/.github/workflows/tests_proc.yml b/.github/workflows/tests_proc.yml new file mode 100644 index 00000000..dc8c63d9 --- /dev/null +++ b/.github/workflows/tests_proc.yml @@ -0,0 +1,20 @@ +name: Tests + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +env: + CARGO_TERM_COLOR: always + +jobs: + tests: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Run proc tests + run: python3 scripts/workflow_test.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests_unit.yml similarity index 90% rename from .github/workflows/tests.yml rename to .github/workflows/tests_unit.yml index 5c4a843d..91d6ca77 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests_unit.yml @@ -16,5 +16,5 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Run tests + - name: Run unit tests run: cargo test --workspace --verbose diff --git a/README.md b/README.md index 0d008572..a7250245 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,8 @@ This is a private mirror of [Summerset](https://github.com/josehu07/summerset). 
[![Format check](https://github.com/josehu07/summerset-private/actions/workflows/format.yml/badge.svg)](https://github.com/josehu07/summerset-private/actions?query=josehu07%3Aformat) [![Build status](https://github.com/josehu07/summerset-private/actions/workflows/build.yml/badge.svg)](https://github.com/josehu07/summerset-private/actions?query=josehu07%3Abuild) -[![Tests status](https://github.com/josehu07/summerset-private/actions/workflows/tests.yml/badge.svg)](https://github.com/josehu07/summerset-private/actions?query=josehu07%3Atests) +[![Unit tests status](https://github.com/josehu07/summerset-private/actions/workflows/tests_unit.yml/badge.svg)](https://github.com/josehu07/summerset-private/actions?query=josehu07%3Atests_unit) +[![Proc tests status](https://github.com/josehu07/summerset-private/actions/workflows/tests_proc.yml/badge.svg)](https://github.com/josehu07/summerset-private/actions?query=josehu07%3Atests_proc) [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) To create a branch to track public repo `main`, pull new things from it, and merge into the private `main`: @@ -155,7 +156,7 @@ Complete cluster management and benchmarking scripts are available in another re - [ ] membership discovery & view changes - [ ] implementation of Raft - [ ] implementation of Crossword prototype - - [ ] fault recovery reads + - [x] fault recovery reads - [ ] follower gossiping - [x] client-side utilities - [x] REPL-style client diff --git a/scripts/workflow_test.py b/scripts/workflow_test.py new file mode 100644 index 00000000..33484aca --- /dev/null +++ b/scripts/workflow_test.py @@ -0,0 +1,109 @@ +import sys +import os +import subprocess + + +def do_cargo_build(): + print("Building everything...") + cmd = ["cargo", "build", "--workspace"] + proc = subprocess.Popen(cmd) + proc.wait() + + +def run_process(cmd): + # print("Run:", " ".join(cmd)) + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + return proc + + +def kill_all_matching(name, force=False): + # print("Kill all:", name) + assert name.count(" ") == 0 + try: + pgrep_cmd = f"sudo pgrep -f {name}" + pids = subprocess.check_output(pgrep_cmd, shell=True).decode() + pids = pids.strip().split("\n") + for pid in pids: + pid = pid.strip() + if len(pid) > 0: + kill_cmd = f"sudo kill -9" if force else "sudo kill" + kill_cmd += f" {int(pid)} > /dev/null 2>&1" + os.system(kill_cmd) + except subprocess.CalledProcessError: + pass + + +def launch_cluster(protocol, num_replicas, config): + cmd = [ + "python3", + "./scripts/local_cluster.py", + "-p", + protocol, + "-n", + str(num_replicas), + ] + if config is not None and len(config) > 0: + cmd += ["--config", config] + return run_process(cmd) + + +def wait_cluster_setup(proc, num_replicas): + accepting_clients = [False for _ in range(num_replicas)] + + for line in iter(proc.stderr.readline, b""): + l = line.decode() + print(l, end="", file=sys.stderr) + if "manager" not in l and "accepting clients" in l: + replica = int(l[l.find("(") + 1 : l.find(")")]) + assert not accepting_clients[replica] + accepting_clients[replica] = True + + if accepting_clients.count(True) == num_replicas: + break + + +def run_tester_client(protocol, test_name): + cmd = [ + "python3", + "./scripts/local_client.py", + "-p", + protocol, + "tester", + "-t", + test_name, + ] + return run_process(cmd) + + +if __name__ == "__main__": + do_cargo_build() + + kill_all_matching("local_client.py", force=True) + kill_all_matching("local_cluster.py", 
force=True) + kill_all_matching("summerset_client", force=True) + kill_all_matching("summerset_server", force=True) + kill_all_matching("summerset_manager", force=True) + + PROTOCOL = "MultiPaxos" + NUM_REPLICAS = 3 + TEST_NAME = "primitive_ops" + TIMEOUT = 300 + + proc_cluster = launch_cluster(PROTOCOL, NUM_REPLICAS, config=None) + wait_cluster_setup(proc_cluster, NUM_REPLICAS) + + proc_client = run_tester_client(PROTOCOL, TEST_NAME) + + try: + client_rc = proc_client.wait(timeout=TIMEOUT) + except subprocess.TimeoutExpired: + print(f"Client tester did not finish in {TIMEOUT} secs") + exit(1) + + proc_cluster.terminate() + + if client_rc != 0: + print(f"Client tester exitted with {client_rc}") + exit(client_rc) + else: + exit(0) From 70e6093561ab76c75ed60d5a280ad5da180ef203 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Mon, 18 Sep 2023 00:21:55 +0000 Subject: [PATCH 52/89] minor updates to workflow job names --- .github/workflows/tests_proc.yml | 4 ++-- .github/workflows/tests_unit.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/tests_proc.yml b/.github/workflows/tests_proc.yml index dc8c63d9..e8fbde3f 100644 --- a/.github/workflows/tests_proc.yml +++ b/.github/workflows/tests_proc.yml @@ -1,4 +1,4 @@ -name: Tests +name: Proc tests on: push: @@ -10,7 +10,7 @@ env: CARGO_TERM_COLOR: always jobs: - tests: + tests_proc: runs-on: ubuntu-latest diff --git a/.github/workflows/tests_unit.yml b/.github/workflows/tests_unit.yml index 91d6ca77..0a1fd8d6 100644 --- a/.github/workflows/tests_unit.yml +++ b/.github/workflows/tests_unit.yml @@ -1,4 +1,4 @@ -name: Tests +name: Unit tests on: push: @@ -10,7 +10,7 @@ env: CARGO_TERM_COLOR: always jobs: - tests: + tests_unit: runs-on: ubuntu-latest From de724566524e784ca3f1850aa18ba58616c1ee04 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sat, 23 Sep 2023 22:58:42 -0500 Subject: [PATCH 53/89] staging progress on snapshotting --- scripts/local_cluster.py | 14 +- src/manager/clusman.rs | 89 ++++++++- src/manager/reactor.rs | 12 ++ src/manager/reigner.rs | 6 + src/protocols/crossword.rs | 14 +- src/protocols/multipaxos.rs | 257 +++++++++++++++++++++++++- src/protocols/rs_paxos.rs | 14 +- src/protocols/simple_push.rs | 2 +- src/server/transport.rs | 5 + summerset_client/src/clients/bench.rs | 1 - 10 files changed, 383 insertions(+), 31 deletions(-) diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index c4e0877c..5b0b0658 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -48,13 +48,21 @@ def kill_all_matching(name, force=False): "Crossword": lambda r: f"backer_path='/tmp/summerset.crossword.{r}.wal'", } +PROTOCOL_SNAPSHOT_PATH = { + "MultiPaxos": lambda r: f"snapshot_path='/tmp/summerset.multipaxos.{r}.snap'", +} + -def config_with_backer_path(protocol, config, replica): +def config_with_file_paths(protocol, config, replica): result_config = PROTOCOL_BACKER_PATH[protocol](replica) + if protocol in PROTOCOL_SNAPSHOT_PATH: + result_config += "+" + result_config += PROTOCOL_SNAPSHOT_PATH[protocol](replica) if config is not None and len(config) > 0: - if "backer_path" in config: + if "backer_path" in config or "snapshot_path" in config: result_config = config # use user-supplied path + # NOTE: ignores the other one else: result_config += "+" result_config += config @@ -132,7 +140,7 @@ def launch_servers(protocol, num_replicas, release, config): SERVER_API_PORT(replica), SERVER_P2P_PORT(replica), f"127.0.0.1:{MANAGER_SRV_PORT}", - config_with_backer_path(protocol, config, 
replica), + config_with_file_paths(protocol, config, replica), release, ) proc = run_process(cmd) diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index 7b08e4d9..58b4a3b3 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -27,6 +27,9 @@ struct ServerInfo { /// This server is currently paused? is_paused: bool, + + /// In-mem log start index after latest snapshot. + start_slot: usize, } /// Standalone cluster manager oracle. @@ -204,6 +207,7 @@ impl ClusterManager { p2p_addr, is_leader: false, is_paused: false, + start_slot: 0, }, ); Ok(()) @@ -231,6 +235,28 @@ impl ClusterManager { } } + /// Handler of autonomous SnapshotUpTo message. + fn handle_snapshot_up_to( + &mut self, + server: ReplicaId, + new_start: usize, + ) -> Result<(), SummersetError> { + if !self.server_info.contains_key(&server) { + return logged_err!("m"; "snapshot up to got unknown ID: {}", server); + } + + // update this server's info + let info = self.server_info.get_mut(&server).unwrap(); + if new_start < info.start_slot { + logged_err!("m"; "server {} snapshot up to {} < {}", + server, new_start, + self.server_info[&server].start_slot) + } else { + info.start_slot = new_start; + Ok(()) + } + } + /// Synthesized handler of server-initiated control messages. async fn handle_ctrl_msg( &mut self, @@ -258,6 +284,10 @@ impl ClusterManager { self.handle_leader_status(server, step_up)?; } + CtrlMsg::SnapshotUpTo { new_start } => { + self.handle_snapshot_up_to(server, new_start)?; + } + _ => {} // ignore all other types } @@ -364,7 +394,7 @@ impl ClusterManager { // pause specified server(s) let mut pause_done = HashSet::new(); while let Some(s) = servers.pop() { - // send puase server control message to server + // send pause server control message to server self.server_reigner.send_ctrl(CtrlMsg::Pause, s)?; // set the is_paused flag @@ -404,19 +434,19 @@ impl ClusterManager { // resume specified server(s) let mut resume_done = HashSet::new(); while let Some(s) = servers.pop() { - // send puase server control message to server + // send resume server control message to server self.server_reigner.send_ctrl(CtrlMsg::Resume, s)?; - // clear the is_paused flag - assert!(self.server_info.contains_key(&s)); - self.server_info.get_mut(&s).unwrap().is_paused = false; - // wait for dummy reply let (_, reply) = self.server_reigner.recv_ctrl().await?; if reply != CtrlMsg::ResumeReply { return logged_err!("m"; "unexpected reply type received"); } + // clear the is_paused flag + assert!(self.server_info.contains_key(&s)); + self.server_info.get_mut(&s).unwrap().is_paused = false; + resume_done.insert(s); } @@ -428,6 +458,49 @@ impl ClusterManager { ) } + /// Handler of client TakeSnapshot rquest. 
+ async fn handle_client_take_snapshot( + &mut self, + client: ClientId, + servers: HashSet, + ) -> Result<(), SummersetError> { + let mut servers: Vec = if servers.is_empty() { + // all active servers + self.server_info.keys().copied().collect() + } else { + servers.into_iter().collect() + }; + + // tell specified server(s) + let mut snapshot_up_to = HashMap::new(); + while let Some(s) = servers.pop() { + // send take snapshot control message to server + self.server_reigner.send_ctrl(CtrlMsg::TakeSnapshot, s)?; + + // wait for reply + let (_, reply) = self.server_reigner.recv_ctrl().await?; + if let CtrlMsg::SnapshotUpTo { new_start } = reply { + // update the log start index info + assert!(self.server_info.contains_key(&s)); + if new_start < self.server_info[&s].start_slot { + return logged_err!("m"; "server {} snapshot up to {} < {}", + s, new_start, + self.server_info[&s].start_slot); + } else { + self.server_info.get_mut(&s).unwrap().start_slot = + new_start; + } + + snapshot_up_to.insert(s, new_start); + } else { + return logged_err!("m"; "unexpected reply type received"); + } + } + + self.client_reactor + .send_reply(CtrlReply::TakeSnapshot { snapshot_up_to }, client) + } + /// Synthesized handler of client-initiated control requests. async fn handle_ctrl_req( &mut self, @@ -453,6 +526,10 @@ impl ClusterManager { self.handle_client_resume_servers(client, servers).await?; } + CtrlRequest::TakeSnapshot { servers } => { + self.handle_client_take_snapshot(client, servers).await?; + } + _ => {} // ignore all other types } diff --git a/src/manager/reactor.rs b/src/manager/reactor.rs index 3273c54f..64ede623 100644 --- a/src/manager/reactor.rs +++ b/src/manager/reactor.rs @@ -45,6 +45,12 @@ pub enum CtrlRequest { servers: HashSet, }, + /// Tell the servers to take a snapshot now. + TakeSnapshot { + /// IDs of servers to take snapshot. If empty, tells all servers. + servers: HashSet, + }, + /// Client leave notification. Leave, } @@ -69,6 +75,12 @@ pub enum CtrlReply { /// Reply to server resume request. ResumeServers { servers: HashSet }, + /// Reply to take snapshot request. + TakeSnapshot { + /// Map from replica ID -> new log start index. + snapshot_up_to: HashMap, + }, + /// Reply to client leave notification. Leave, } diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index c3551e30..41ae38ec 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -58,6 +58,12 @@ pub enum CtrlMsg { /// Server -> Manager: dummy resume reply. ResumeReply, + /// Manager -> Server: tell server to take a snapshot now. + TakeSnapshot, + + /// Server -> Manager: server took snapshot up to log index. + SnapshotUpTo { new_start: usize }, + /// Server -> Manager: leave notification. 
Leave, diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index c9c6a8da..424b0cc8 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -1610,7 +1610,7 @@ impl CrosswordReplica { { Ok(()) } else { - logged_err!(self.id; "unexpected log result type") + logged_err!(self.id; "unexpected log result type or failed truncate") } } } @@ -1824,7 +1824,7 @@ impl GenericReplica for CrosswordReplica { if let Err(e) = self.handle_msg_recv(peer, msg) { pf_error!(self.id; "error handling msg recv <- {}: {}", peer, e); } - } + }, // state machine execution result cmd_result = self.state_machine.get_result(), if !paused => { @@ -1840,13 +1840,17 @@ impl GenericReplica for CrosswordReplica { // leader inactivity timeout _ = self.hb_hear_timer.timeout(), if !paused => { - self.become_a_leader()?; + if let Err(e) = self.become_a_leader() { + pf_error!(self.id; "error becoming a leader: {}", e); + } }, // leader sending heartbeat _ = self.hb_send_interval.tick(), if !paused && self.is_leader => { - self.bcast_heartbeats()?; - } + if let Err(e) = self.bcast_heartbeats() { + pf_error!(self.id; "error broadcasting heartbeats: {}", e); + } + }, // manager control message ctrl_msg = self.control_hub.recv_ctrl() => { diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index e5024071..58fbaa84 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -14,9 +14,9 @@ use std::net::SocketAddr; use crate::utils::{SummersetError, Bitmap, Timer}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ - ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, - ApiRequest, ApiReply, StorageHub, LogAction, LogResult, LogActionId, - TransportHub, GenericReplica, + ReplicaId, ControlHub, StateMachine, Command, CommandResult, CommandId, + ExternalApi, ApiRequest, ApiReply, StorageHub, LogAction, LogResult, + LogActionId, TransportHub, GenericReplica, }; use crate::client::{ClientId, ClientApiStub, ClientCtrlStub, GenericEndpoint}; use crate::protocols::SmrProtocol; @@ -41,7 +41,7 @@ pub struct ReplicaConfigMultiPaxos { /// Client request batching maximum batch size. pub max_batch_size: usize, - /// Path to backing file. + /// Path to backing log file. pub backer_path: String, /// Whether to call `fsync()`/`fdatasync()` on logger. @@ -56,6 +56,13 @@ pub struct ReplicaConfigMultiPaxos { /// Interval of leader sending heartbeats to followers. pub hb_send_interval_ms: u64, + /// Path to snapshot file. + pub snapshot_path: String, + + /// Snapshot self-triggering interval in secs. 0 means never trigger + /// snapshotting autonomously. + pub snapshot_interval_s: u64, + // Performance simulation params (all zeros means no perf simulation): pub perf_storage_a: u64, pub perf_storage_b: u64, @@ -74,6 +81,8 @@ impl Default for ReplicaConfigMultiPaxos { hb_hear_timeout_min: 300, hb_hear_timeout_max: 600, hb_send_interval_ms: 50, + snapshot_path: "/tmp/summerset.multipaxos.snap".into(), + snapshot_interval_s: 0, perf_storage_a: 0, perf_storage_b: 0, perf_network_a: 0, @@ -162,6 +171,17 @@ enum LogEntry { CommitSlot { slot: usize }, } +/// Snapshot file entry type. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] +enum SnapEntry { + /// First entry at the start of file: number of log instances covered by + /// this snapshot file == the start slot index of in-mem log. + StartSlot { slot: usize }, + + /// Key-value pair entry to apply to the state. 
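The snapshot file is thus a start-slot header followed by one entry per key-value pair written by executed Put commands; recovery replays those pairs into the state machine and then treats the recorded slot as the new in-memory log start. A minimal sketch of that replay, using simplified standalone types rather than the crate's StorageHub and state machine modules:

// Minimal sketch (simplified standalone types): replaying a snapshot file
// rebuilds the key-value state and yields the new log start index.
use std::collections::HashMap;

enum SnapEntry {
    StartSlot { slot: usize },
    NewKVPair { key: String, value: String },
}

fn replay_snapshot(entries: &[SnapEntry]) -> (usize, HashMap<String, String>) {
    let mut start_slot = 0;
    let mut state = HashMap::new();
    for entry in entries {
        match entry {
            // number of log instances already covered by this snapshot
            SnapEntry::StartSlot { slot } => start_slot = *slot,
            // apply each dumped Put to the in-memory state machine image
            SnapEntry::NewKVPair { key, value } => {
                state.insert(key.clone(), value.clone());
            }
        }
    }
    (start_slot, state)
}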
+ NewKVPair { key: String, value: String }, +} + /// Peer-peer message type. #[derive(Debug, Clone, Serialize, Deserialize, GetSize)] enum PeerMsg { @@ -226,6 +246,9 @@ pub struct MultiPaxosReplica { /// StorageHub module. storage_hub: StorageHub, + /// StorageHub module for the snapshot file. + snapshot_hub: StorageHub, + /// TransportHub module. transport_hub: TransportHub, @@ -241,6 +264,12 @@ pub struct MultiPaxosReplica { /// In-memory log of instances. insts: Vec, + /// Start slot index of in-mem log after latest snapshot. + start_slot: usize, + + /// Timer for taking a new autonomous snapshot. + snapshot_interval: Option, + /// Largest ballot number that a leader has sent Prepare messages in. bal_prep_sent: Ballot, @@ -259,6 +288,9 @@ pub struct MultiPaxosReplica { /// Current durable log file offset. log_offset: usize, + + /// Current durable snapshot file offset. + snap_offset: usize, } impl MultiPaxosReplica { @@ -1124,6 +1156,19 @@ impl MultiPaxosReplica { Ok(()) } + /// Handler of TakeSnapshot control message. + async fn handle_ctrl_take_snapshot( + &mut self, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server told to take snapshot"); + self.take_new_snapshot().await?; + + self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { + new_start: self.start_slot, + })?; + Ok(()) + } + /// Synthesized handler of manager control messages. If ok, returns /// `Some(true)` if decides to terminate and reboot, `Some(false)` if /// decides to shutdown completely, and `None` if not terminating. @@ -1148,6 +1193,11 @@ impl MultiPaxosReplica { Ok(None) } + CtrlMsg::TakeSnapshot => { + self.handle_ctrl_take_snapshot().await?; + Ok(None) + } + _ => Ok(None), // ignore all other types } } @@ -1276,7 +1326,154 @@ impl MultiPaxosReplica { { Ok(()) } else { - logged_err!(self.id; "unexpected log result type") + logged_err!(self.id; "unexpected log result type or failed truncate") + } + } + + /// Take a snapshot up to current exec_idx, then discard the in-mem log up + /// to that index, and squash the durable WAL log file. + /// + /// NOTE: the current implementation does not guard against crashes in the + /// middle of taking a snapshot. + async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { + pf_debug!(self.id; "taking a new snapshot: start {} exec {}", + self.start_slot, self.exec_bar); + assert!(self.exec_bar >= self.start_slot); + if self.exec_bar == self.start_slot { + return Ok(()); + } + + // dump all Puts in executed instances + for slot in self.start_slot..self.exec_bar { + let inst = &self.insts[slot - self.start_slot]; + for (_, req) in &inst.reqs { + if let ApiRequest::Req { + cmd: Command::Put { key, value }, + .. 
+ } = req + { + self.snapshot_hub.submit_action( + 0, // using 0 as dummy log action ID + LogAction::Append { + entry: SnapEntry::NewKVPair { + key: key.clone(), + value: value.clone(), + }, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = + self.snapshot_hub.get_result().await?; + if let LogResult::Write { + offset_ok: true, + now_size, + } = log_result + { + self.snap_offset = now_size; + } else { + return logged_err!( + self.id; + "unexpected log result type or failed write" + ); + } + } + } + } + + // update start_slot and discard all in-memory log instances up to exec_bar + self.insts.drain(0..(self.exec_bar - self.start_slot)); + self.start_slot = self.exec_bar; + + // TODO: squash the durable WAL log + + pf_debug!(self.id; "took new snapshot up to: start {}", self.start_slot); + Ok(()) + } + + /// Recover initial state from durable storage snapshot file. + async fn recover_from_snapshot(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.snap_offset, 0); + + // first, try to read the first several bytes, which should record the + // start_slot index + self.snapshot_hub + .submit_action(0, LogAction::Read { offset: 0 })?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(SnapEntry::StartSlot { slot }), + end_offset, + } => { + self.snap_offset = end_offset; + self.start_slot = slot; // get start slot index of in-mem log + + // repeatedly apply key-value pairs + loop { + self.snapshot_hub.submit_action( + 0, + LogAction::Read { + offset: self.snap_offset, + }, + )?; + let (_, log_result) = + self.snapshot_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(SnapEntry::NewKVPair { key, value }), + end_offset, + } => { + // execute a Put command on state machine + self.state_machine + .submit_cmd(0, Command::Put { key, value })?; + let _ = self.state_machine.get_result().await?; + // update snapshot file offset + self.snap_offset = end_offset; + } + LogResult::Read { entry: None, .. } => { + // end of log reached + break; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + } + + // tell manager about my start_slot index + self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { + new_start: self.start_slot, + })?; + Ok(()) + } + + LogResult::Read { entry: None, .. } => { + // snapshot file is empty. 
Write a 0 as start_slot and return + self.snapshot_hub.submit_action( + 0, + LogAction::Write { + entry: SnapEntry::StartSlot { slot: 0 }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + if let LogResult::Write { + offset_ok: true, + now_size, + } = log_result + { + self.snap_offset = now_size; + Ok(()) + } else { + logged_err!(self.id; "unexpected log result type or failed truncate") + } + } + + _ => { + logged_err!(self.id; "unexpected log result type") + } } } } @@ -1383,6 +1580,14 @@ impl GenericReplica for MultiPaxosReplica { } transport_hub.wait_for_group(population).await?; + // setup snapshot hub module + let snapshot_hub = StorageHub::new_and_setup( + id, + Path::new(&config.snapshot_path), + None, + ) + .await?; + // setup external API module, ready to take in client requests let external_api = ExternalApi::new_and_setup( id, @@ -1396,6 +1601,15 @@ impl GenericReplica for MultiPaxosReplica { time::interval(Duration::from_millis(config.hb_send_interval_ms)); hb_send_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + let snapshot_interval = if config.snapshot_interval_s == 0 { + None + } else { + let mut si = + time::interval(Duration::from_secs(config.snapshot_interval_s)); + si.set_missed_tick_behavior(MissedTickBehavior::Skip); + Some(si) + }; + Ok(MultiPaxosReplica { id, population, @@ -1407,17 +1621,21 @@ impl GenericReplica for MultiPaxosReplica { external_api, state_machine, storage_hub, + snapshot_hub, transport_hub, hb_hear_timer: Timer::new(), hb_send_interval, is_leader: false, insts: vec![], + start_slot: 0, + snapshot_interval, bal_prep_sent: 0, bal_prepared: 0, bal_max_seen: 0, commit_bar: 0, exec_bar: 0, log_offset: 0, + snap_offset: 0, }) } @@ -1425,7 +1643,10 @@ impl GenericReplica for MultiPaxosReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { - // recover state from durable storage log + // recover state from durable snapshot file + self.recover_from_snapshot().await?; + + // recover the tail-piece memory log & state from durable storage log self.recover_from_log().await?; // kick off leader activity hearing timer @@ -1470,7 +1691,7 @@ impl GenericReplica for MultiPaxosReplica { if let Err(e) = self.handle_msg_recv(peer, msg) { pf_error!(self.id; "error handling msg recv <- {}: {}", peer, e); } - } + }, // state machine execution result cmd_result = self.state_machine.get_result(), if !paused => { @@ -1486,13 +1707,29 @@ impl GenericReplica for MultiPaxosReplica { // leader inactivity timeout _ = self.hb_hear_timer.timeout(), if !paused => { - self.become_a_leader()?; + if let Err(e) = self.become_a_leader() { + pf_error!(self.id; "error becoming a leader: {}", e); + } }, // leader sending heartbeat _ = self.hb_send_interval.tick(), if !paused && self.is_leader => { - self.bcast_heartbeats()?; - } + if let Err(e) = self.bcast_heartbeats() { + pf_error!(self.id; "error broadcasting heartbeats: {}", e); + } + }, + + // autonomous snapshot taking timeout + _ = self.snapshot_interval.as_mut().unwrap().tick(), if !paused + && self.snapshot_interval.is_some() => { + if let Err(e) = self.take_new_snapshot().await { + pf_error!(self.id; "error taking a new snapshot: {}", e); + } else { + self.control_hub.send_ctrl( + CtrlMsg::SnapshotUpTo { new_start: self.start_slot } + )?; + } + }, // manager control message ctrl_msg = self.control_hub.recv_ctrl() => { diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 9a79da2e..6df73631 100644 --- 
a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -1510,7 +1510,7 @@ impl RSPaxosReplica { { Ok(()) } else { - logged_err!(self.id; "unexpected log result type") + logged_err!(self.id; "unexpected log result type or failed truncate") } } } @@ -1717,7 +1717,7 @@ impl GenericReplica for RSPaxosReplica { if let Err(e) = self.handle_msg_recv(peer, msg) { pf_error!(self.id; "error handling msg recv <- {}: {}", peer, e); } - } + }, // state machine execution result cmd_result = self.state_machine.get_result(), if !paused => { @@ -1733,13 +1733,17 @@ impl GenericReplica for RSPaxosReplica { // leader inactivity timeout _ = self.hb_hear_timer.timeout(), if !paused => { - self.become_a_leader()?; + if let Err(e) = self.become_a_leader() { + pf_error!(self.id; "error becoming a leader: {}", e); + } }, // leader sending heartbeat _ = self.hb_send_interval.tick(), if !paused && self.is_leader => { - self.bcast_heartbeats()?; - } + if let Err(e) = self.bcast_heartbeats() { + pf_error!(self.id; "error broadcasting heartbeats: {}", e); + } + }, // manager control message ctrl_msg = self.control_hub.recv_ctrl() => { diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index 5d8baeec..ce89c7d1 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -721,7 +721,7 @@ impl GenericReplica for SimplePushReplica { }, } - } + }, // state machine execution result cmd_result = self.state_machine.get_result(), if !paused => { diff --git a/src/server/transport.rs b/src/server/transport.rs index 18caa475..6f2a2752 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -1,4 +1,9 @@ //! Summerset server internal TCP transport module implementation. +//! +//! In concept, all messages are sent through unstable communication channels, +//! and are retried if the sender did not receive an ACK in a timely manner. +//! Here, we use TCP as the communication protocol to get the same effect of +//! "every message a sender wants to send will eventually be delivered". use std::fmt; use std::net::SocketAddr; diff --git a/summerset_client/src/clients/bench.rs b/summerset_client/src/clients/bench.rs index 6b6ea18a..a2a7066e 100644 --- a/summerset_client/src/clients/bench.rs +++ b/summerset_client/src/clients/bench.rs @@ -19,7 +19,6 @@ use summerset::{ lazy_static! { /// Pool of keys to choose from. - // TODO: enable using a dynamic pool of keys static ref KEYS_POOL: Vec = { let mut pool = vec![]; for _ in 0..5 { From d7d091ab4278c28aac41bb495d4d8b77cb80324a Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 24 Sep 2023 13:00:15 -0500 Subject: [PATCH 54/89] staging progress on snapshotting --- src/protocols/multipaxos.rs | 248 ++++++++++++++++++++++++++---------- src/server/statemach.rs | 11 ++ src/server/storage.rs | 11 ++ src/server/transport.rs | 12 ++ 4 files changed, 217 insertions(+), 65 deletions(-) diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 58fbaa84..22f02fc1 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -283,7 +283,7 @@ pub struct MultiPaxosReplica { commit_bar: usize, /// Index of the first non-executed instance. - /// It is always true that exec_bar <= commit_bar <= insts.len() + /// It is always true that exec_bar <= commit_bar <= start_slot + insts.len() exec_bar: usize, /// Current durable log file offset. 
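// Editor's illustrative sketch (not part of the patch): the hunks below apply one
// mechanical convention throughout MultiPaxosReplica -- slot numbers stay global and
// monotonically increasing, while `start_slot` records how many slots the latest
// snapshot has truncated from the in-memory `insts` vector, so the physical index is
// always `slot - start_slot` and anything with `slot < start_slot` is simply ignored.
// The names here are simplified stand-ins, not the real struct.

struct Log<T> {
    start_slot: usize, // first logical slot still kept in memory
    insts: Vec<T>,     // in-memory instances for slots start_slot..
}

impl<T> Log<T> {
    /// Translate a global slot number into an index into `insts`,
    /// returning None for slots already covered by a snapshot.
    fn get(&self, slot: usize) -> Option<&T> {
        if slot < self.start_slot {
            None // outdated: this slot was truncated by a snapshot
        } else {
            self.insts.get(slot - self.start_slot)
        }
    }

    /// Logical index one past the last known slot, i.e. the value the
    /// patch writes as `start_slot + insts.len()`.
    fn next_slot(&self) -> usize {
        self.start_slot + self.insts.len()
    }
}

fn main() {
    let log = Log { start_slot: 5, insts: vec!["e5", "e6", "e7"] };
    assert!(log.get(3).is_none());       // snapshotted slot is ignored
    assert_eq!(log.get(6), Some(&"e6")); // slot 6 lives at in-memory index 1
    assert_eq!(log.next_slot(), 8);      // matches start_slot + insts.len()
}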
@@ -388,9 +388,9 @@ impl MultiPaxosReplica { // create a new instance in the first null slot (or append a new one // at the end if no holes exist) - let mut slot = self.insts.len(); - for s in self.commit_bar..self.insts.len() { - let old_inst = &mut self.insts[s]; + let mut slot = self.start_slot + self.insts.len(); + for s in self.commit_bar..(self.start_slot + self.insts.len()) { + let old_inst = &mut self.insts[s - self.start_slot]; if old_inst.status == Status::Null { old_inst.reqs = req_batch.clone(); old_inst.leader_bk = Some(LeaderBookkeeping { @@ -402,7 +402,7 @@ impl MultiPaxosReplica { break; } } - if slot == self.insts.len() { + if slot == self.start_slot + self.insts.len() { let mut new_inst = self.null_instance(); new_inst.reqs = req_batch.clone(); new_inst.leader_bk = Some(LeaderBookkeeping { @@ -425,7 +425,7 @@ impl MultiPaxosReplica { self.bal_max_seen = self.bal_prep_sent; } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = self.bal_prep_sent; inst.status = Status::Preparing; pf_debug!(self.id; "enter Prepare phase for slot {} bal {}", @@ -457,7 +457,7 @@ impl MultiPaxosReplica { slot, inst.bal); } else { // normal case: Prepare phase covered, only do the Accept phase - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = self.bal_prepared; inst.status = Status::Accepting; pf_debug!(self.id; "enter Accept phase for slot {} bal {}", @@ -500,9 +500,12 @@ impl MultiPaxosReplica { &mut self, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "finished PrepareBal logging for slot {} bal {}", - slot, self.insts[slot].bal); - let inst = &self.insts[slot]; + slot, self.insts[slot - self.start_slot].bal); + let inst = &self.insts[slot - self.start_slot]; let voted = if inst.voted.0 > 0 { Some(inst.voted.clone()) } else { @@ -539,9 +542,12 @@ impl MultiPaxosReplica { &mut self, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "finished AcceptData logging for slot {} bal {}", - slot, self.insts[slot].bal); - let inst = &self.insts[slot]; + slot, self.insts[slot - self.start_slot].bal); + let inst = &self.insts[slot - self.start_slot]; if self.is_leader { // on leader, finishing the logging of an AcceptData entry @@ -572,14 +578,17 @@ impl MultiPaxosReplica { &mut self, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "finished CommitSlot logging for slot {} bal {}", - slot, self.insts[slot].bal); - assert!(self.insts[slot].status >= Status::Committed); + slot, self.insts[slot - self.start_slot].bal); + assert!(self.insts[slot - self.start_slot].status >= Status::Committed); // update index of the first non-committed instance if slot == self.commit_bar { - while self.commit_bar < self.insts.len() { - let inst = &mut self.insts[self.commit_bar]; + while self.commit_bar < self.start_slot + self.insts.len() { + let inst = &mut self.insts[self.commit_bar - self.start_slot]; if inst.status < Status::Committed { break; } @@ -617,7 +626,10 @@ impl MultiPaxosReplica { log_result: LogResult, ) -> Result<(), SummersetError> { let (slot, entry_type) = Self::split_log_action_id(action_id); - assert!(slot < self.insts.len()); + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + 
} + assert!(slot < self.start_slot + self.insts.len()); if let LogResult::Append { now_size } = log_result { assert!(now_size >= self.log_offset); @@ -643,16 +655,19 @@ impl MultiPaxosReplica { slot: usize, ballot: Ballot, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received Prepare <- {} for slot {} bal {}", peer, slot, ballot); // if ballot is not smaller than what I have seen: if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()); } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; assert!(inst.bal <= ballot); inst.bal = ballot; @@ -685,13 +700,16 @@ impl MultiPaxosReplica { ballot: Ballot, voted: Option<(Ballot, ReqBatch)>, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received PrepareReply <- {} for slot {} bal {}", peer, slot, ballot); // if ballot is what I'm currently waiting on for Prepare replies: if ballot == self.bal_prep_sent { - assert!(slot < self.insts.len()); - let inst = &mut self.insts[slot]; + assert!(slot < self.start_slot + self.insts.len()); + let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies if (inst.status != Status::Preparing) || (ballot < inst.bal) { @@ -766,16 +784,19 @@ impl MultiPaxosReplica { ballot: Ballot, reqs: ReqBatch, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received Accept <- {} for slot {} bal {}", peer, slot, ballot); // if ballot is not smaller than what I have made promises for: if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()); } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; assert!(inst.bal <= ballot); inst.bal = ballot; @@ -809,13 +830,16 @@ impl MultiPaxosReplica { slot: usize, ballot: Ballot, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received AcceptReply <- {} for slot {} bal {}", peer, slot, ballot); // if ballot is what I'm currently waiting on for Accept replies: if ballot == self.bal_prepared { - assert!(slot < self.insts.len()); - let inst = &mut self.insts[slot]; + assert!(slot < self.start_slot + self.insts.len()); + let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies if (inst.status != Status::Accepting) || (ballot < inst.bal) { @@ -866,13 +890,16 @@ impl MultiPaxosReplica { peer: ReplicaId, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received Commit <- {} for slot {}", peer, slot); // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()); } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications if inst.status != Status::Accepting { 
@@ -931,11 +958,14 @@ impl MultiPaxosReplica { cmd_result: CommandResult, ) -> Result<(), SummersetError> { let (slot, cmd_idx) = Self::split_command_id(cmd_id); - assert!(slot < self.insts.len()); + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } + assert!(slot < self.start_slot + self.insts.len()); pf_trace!(self.id; "executed cmd in instance at slot {} idx {}", slot, cmd_idx); - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; assert!(cmd_idx < inst.reqs.len()); let (client, ref req) = inst.reqs[cmd_idx]; @@ -966,8 +996,8 @@ impl MultiPaxosReplica { // update index of the first non-executed instance if slot == self.exec_bar { - while self.exec_bar < self.insts.len() { - let inst = &mut self.insts[self.exec_bar]; + while self.exec_bar < self.start_slot + self.insts.len() { + let inst = &mut self.insts[self.exec_bar - self.start_slot]; if inst.status < Status::Executed { break; } @@ -1000,7 +1030,12 @@ impl MultiPaxosReplica { self.bal_max_seen = self.bal_prep_sent; // redo Prepare phase for all in-progress instances - for (slot, inst) in self.insts.iter_mut().enumerate() { + for (slot, inst) in self + .insts + .iter_mut() + .enumerate() + .map(|(s, i)| (self.start_slot + s, i)) + { if inst.status < Status::Committed { inst.bal = self.bal_prep_sent; inst.status = Status::Preparing; @@ -1210,11 +1245,11 @@ impl MultiPaxosReplica { match entry { LogEntry::PrepareBal { slot, ballot } => { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()); } // update instance state - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = ballot; inst.status = Status::Preparing; // update bal_prep_sent and bal_max_seen, reset bal_prepared @@ -1229,11 +1264,11 @@ impl MultiPaxosReplica { LogEntry::AcceptData { slot, ballot, reqs } => { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()); } // update instance state - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = ballot; inst.status = Status::Accepting; inst.reqs = reqs.clone(); @@ -1249,14 +1284,15 @@ impl MultiPaxosReplica { } LogEntry::CommitSlot { slot } => { - assert!(slot < self.insts.len()); + assert!(slot < self.start_slot + self.insts.len()); // update instance state - self.insts[slot].status = Status::Committed; + self.insts[slot - self.start_slot].status = Status::Committed; // submit commands in contiguously committed instance to the // state machine if slot == self.commit_bar { - while self.commit_bar < self.insts.len() { - let inst = &mut self.insts[self.commit_bar]; + while self.commit_bar < self.start_slot + self.insts.len() { + let inst = + &mut self.insts[self.commit_bar - self.start_slot]; if inst.status < Status::Committed { break; } @@ -1330,6 +1366,110 @@ impl MultiPaxosReplica { } } + /// Dump a new key-value pair to snapshot file. 
+ async fn snapshot_dump_kv_pair( + &mut self, + key: String, + value: String, + ) -> Result<(), SummersetError> { + self.snapshot_hub.submit_action( + 0, // using 0 as dummy log action ID + LogAction::Append { + entry: SnapEntry::NewKVPair { key, value }, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + if let LogResult::Write { + offset_ok: true, + now_size, + } = log_result + { + self.snap_offset = now_size; + Ok(()) + } else { + logged_err!( + self.id; + "unexpected log result type or failed write" + ) + } + } + + /// Squash the durable WAL log, discarding everything older than start_slot. + async fn snapshot_squash_log(&mut self) -> Result<(), SummersetError> { + // read entries until one >= start_slot found + let mut cut_offset = 0; + loop { + self.storage_hub.submit_action( + 0, // using 0 as dummy log action ID + LogAction::Read { offset: cut_offset }, + )?; + + let mut found = false; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id != 0 { + // normal log action previously in queue; process it + self.handle_log_result(action_id, log_result)?; + } else { + match log_result { + LogResult::Read { + entry: Some(entry), + end_offset, + } => { + let slot = match entry { + LogEntry::PrepareBal { slot, .. } => slot, + LogEntry::AcceptData { slot, .. } => slot, + LogEntry::CommitSlot { slot } => slot, + }; + if slot >= self.start_slot { + // first entry >= start_slot found + found = true; + } else { + // not found yet + cut_offset = end_offset; + } + } + LogResult::Read { entry: None, .. } => { + // end of WAL log + found = true; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + break; + } + } + + if found { + break; + } + } + + // discard the log before cut_offset + if cut_offset > 0 { + self.storage_hub + .submit_action(0, LogAction::Discard { offset: cut_offset })?; + let (_, log_result) = self.storage_hub.get_result().await?; + if let LogResult::Discard { + offset_ok: true, + now_size, + } = log_result + { + assert_eq!(self.log_offset - cut_offset, now_size); + self.log_offset = now_size; + } else { + return logged_err!( + self.id; + "unexpected log result type or failed discard" + ); + } + } + Ok(()) + } + /// Take a snapshot up to current exec_idx, then discard the in-mem log up /// to that index, and squash the durable WAL log file. /// @@ -1346,36 +1486,13 @@ impl MultiPaxosReplica { // dump all Puts in executed instances for slot in self.start_slot..self.exec_bar { let inst = &self.insts[slot - self.start_slot]; - for (_, req) in &inst.reqs { + for (_, req) in inst.reqs.clone() { if let ApiRequest::Req { cmd: Command::Put { key, value }, .. 
} = req { - self.snapshot_hub.submit_action( - 0, // using 0 as dummy log action ID - LogAction::Append { - entry: SnapEntry::NewKVPair { - key: key.clone(), - value: value.clone(), - }, - sync: self.config.logger_sync, - }, - )?; - let (_, log_result) = - self.snapshot_hub.get_result().await?; - if let LogResult::Write { - offset_ok: true, - now_size, - } = log_result - { - self.snap_offset = now_size; - } else { - return logged_err!( - self.id; - "unexpected log result type or failed write" - ); - } + self.snapshot_dump_kv_pair(key, value).await?; } } } @@ -1384,7 +1501,8 @@ impl MultiPaxosReplica { self.insts.drain(0..(self.exec_bar - self.start_slot)); self.start_slot = self.exec_bar; - // TODO: squash the durable WAL log + // squash the durable WAL log, discarding everything older than start_slot + self.snapshot_squash_log().await?; pf_debug!(self.id; "took new snapshot up to: start {}", self.start_slot); Ok(()) diff --git a/src/server/statemach.rs b/src/server/statemach.rs index 47196cf4..452b682e 100644 --- a/src/server/statemach.rs +++ b/src/server/statemach.rs @@ -94,6 +94,17 @@ impl StateMachine { None => logged_err!(self.me; "ack channel has been closed"), } } + + /// Try to get the next execution result using `try_recv()`. + #[allow(dead_code)] + pub fn try_get_result( + &mut self, + ) -> Result<(CommandId, CommandResult), SummersetError> { + match self.rx_ack.try_recv() { + Ok((id, result)) => Ok((id, result)), + Err(e) => Err(SummersetError(e.to_string())), + } + } } // StateMachine executor thread implementation diff --git a/src/server/storage.rs b/src/server/storage.rs index d14bd2bf..a11d6ba6 100644 --- a/src/server/storage.rs +++ b/src/server/storage.rs @@ -182,6 +182,17 @@ where None => logged_err!(self.me; "ack channel has been closed"), } } + + /// Try to get the next logging result using `try_recv()`. + #[allow(dead_code)] + pub fn try_get_result( + &mut self, + ) -> Result<(LogActionId, LogResult), SummersetError> { + match self.rx_ack.try_recv() { + Ok((id, result)) => Ok((id, result)), + Err(e) => Err(SummersetError(e.to_string())), + } + } } // StorageHub logger thread implementation diff --git a/src/server/transport.rs b/src/server/transport.rs index 6f2a2752..699697d8 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -281,6 +281,18 @@ where } } + /// Try to receive the next message using `try_recv()`. + #[allow(dead_code)] + pub fn try_recv_msg(&mut self) -> Result<(ReplicaId, Msg), SummersetError> { + match self.rx_recv.try_recv() { + Ok((id, peer_msg)) => match peer_msg { + PeerMessage::Msg { msg } => Ok((id, msg)), + _ => logged_err!(self.me; "unexpected peer message type"), + }, + Err(e) => Err(SummersetError(e.to_string())), + } + } + /// Broadcasts leave notifications to all peers and waits for replies. 
pub async fn leave(&mut self) -> Result<(), SummersetError> { let tx_sends_guard = self.tx_sends.guard(); From cac7a52cf98c1b6f31cb143e7fc21483ab5467f7 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 24 Sep 2023 16:41:01 -0500 Subject: [PATCH 55/89] finished snapshotting impl for MultiPaxos --- scripts/local_client.py | 7 +- scripts/local_cluster.py | 11 +- src/manager/clusman.rs | 4 + src/protocols/multipaxos.rs | 280 +++++++++++++------------ summerset_client/src/clients/tester.rs | 70 ++++++- 5 files changed, 225 insertions(+), 147 deletions(-) diff --git a/scripts/local_client.py b/scripts/local_client.py index f8ac5981..87e1d9eb 100644 --- a/scripts/local_client.py +++ b/scripts/local_client.py @@ -9,7 +9,7 @@ def do_cargo_build(release): if release: cmd.append("-r") proc = subprocess.Popen(cmd) - proc.wait() + return proc.wait() def run_process(cmd): @@ -124,7 +124,10 @@ def run_client(protocol, utility, params, release, config): args = parser.parse_args() # build everything - do_cargo_build(args.release) + rc = do_cargo_build(args.release) + if rc != 0: + print("ERROR: cargo build failed") + sys.exit(rc) # run client executable client_proc = run_client( diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index 5b0b0658..6d23db83 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -12,7 +12,7 @@ def do_cargo_build(release): if release: cmd.append("-r") proc = subprocess.Popen(cmd) - proc.wait() + return proc.wait() def run_process(cmd, capture_stderr=False): @@ -169,12 +169,17 @@ def launch_servers(protocol, num_replicas, release, config): kill_all_matching("summerset_server", force=True) kill_all_matching("summerset_manager", force=True) - # remove all existing wal files + # remove all existing wal log & snapshot files for path in Path("/tmp").glob("summerset.*.wal"): path.unlink() + for path in Path("/tmp").glob("summerset.*.snap"): + path.unlink() # build everything - do_cargo_build(args.release) + rc = do_cargo_build(args.release) + if rc != 0: + print("ERROR: cargo build failed") + sys.exit(rc) # launch cluster manager oracle first manager_proc = launch_manager(args.protocol, args.num_replicas, args.release) diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index 58b4a3b3..f1a496c9 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -12,6 +12,7 @@ use crate::client::ClientId; use crate::protocols::SmrProtocol; use tokio::sync::{mpsc, watch}; +use tokio::time::{self, Duration}; /// Information about an active server. #[derive(Debug, Clone)] @@ -358,6 +359,9 @@ impl ClusterManager { return logged_err!("m"; "error assigning new server ID: {}", e); } + // wait a while to ensure the server's transport hub is setup + time::sleep(Duration::from_millis(300)).await; + reset_done.insert(s); } diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 22f02fc1..a6123a8a 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -78,8 +78,8 @@ impl Default for ReplicaConfigMultiPaxos { max_batch_size: 5000, backer_path: "/tmp/summerset.multipaxos.wal".into(), logger_sync: false, - hb_hear_timeout_min: 300, - hb_hear_timeout_max: 600, + hb_hear_timeout_min: 600, + hb_hear_timeout_max: 900, hb_send_interval_ms: 50, snapshot_path: "/tmp/summerset.multipaxos.snap".into(), snapshot_interval_s: 0, @@ -152,6 +152,9 @@ struct Instance { /// True if from external client, else false. external: bool, + + /// Offset of first durable WAL log entry related to this instance. 
+ log_offset: usize, } /// Stable storage log entry type. @@ -178,8 +181,8 @@ enum SnapEntry { /// this snapshot file == the start slot index of in-mem log. StartSlot { slot: usize }, - /// Key-value pair entry to apply to the state. - NewKVPair { key: String, value: String }, + /// Set of key-value pairs to apply to the state. + KVPairSet { pairs: HashMap }, } /// Peer-peer message type. @@ -268,7 +271,7 @@ pub struct MultiPaxosReplica { start_slot: usize, /// Timer for taking a new autonomous snapshot. - snapshot_interval: Option, + snapshot_interval: Interval, /// Largest ballot number that a leader has sent Prepare messages in. bal_prep_sent: Ballot, @@ -304,6 +307,7 @@ impl MultiPaxosReplica { leader_bk: None, replica_bk: None, external: false, + log_offset: 0, } } @@ -520,18 +524,18 @@ impl MultiPaxosReplica { } else { // on follower replica, finishing the logging of a // PrepareBal entry leads to sending back a Prepare reply - assert!(inst.replica_bk.is_some()); - let source = inst.replica_bk.as_ref().unwrap().source; - self.transport_hub.send_msg( - PeerMsg::PrepareReply { - slot, - ballot: inst.bal, - voted, - }, - source, - )?; - pf_trace!(self.id; "sent PrepareReply -> {} for slot {} bal {}", - source, slot, inst.bal); + if let Some(ReplicaBookkeeping { source }) = inst.replica_bk { + self.transport_hub.send_msg( + PeerMsg::PrepareReply { + slot, + ballot: inst.bal, + voted, + }, + source, + )?; + pf_trace!(self.id; "sent PrepareReply -> {} for slot {} bal {}", + source, slot, inst.bal); + } } Ok(()) @@ -557,17 +561,17 @@ impl MultiPaxosReplica { } else { // on follower replica, finishing the logging of an // AcceptData entry leads to sending back an Accept reply - assert!(inst.replica_bk.is_some()); - let source = inst.replica_bk.as_ref().unwrap().source; - self.transport_hub.send_msg( - PeerMsg::AcceptReply { - slot, - ballot: inst.bal, - }, - source, - )?; - pf_trace!(self.id; "sent AcceptReply -> {} for slot {} bal {}", - source, slot, inst.bal); + if let Some(ReplicaBookkeeping { source }) = inst.replica_bk { + self.transport_hub.send_msg( + PeerMsg::AcceptReply { + slot, + ballot: inst.bal, + }, + source, + )?; + pf_trace!(self.id; "sent AcceptReply -> {} for slot {} bal {}", + source, slot, inst.bal); + } } Ok(()) @@ -583,7 +587,6 @@ impl MultiPaxosReplica { } pf_trace!(self.id; "finished CommitSlot logging for slot {} bal {}", slot, self.insts[slot - self.start_slot].bal); - assert!(self.insts[slot - self.start_slot].status >= Status::Committed); // update index of the first non-committed instance if slot == self.commit_bar { @@ -633,6 +636,13 @@ impl MultiPaxosReplica { if let LogResult::Append { now_size } = log_result { assert!(now_size >= self.log_offset); + // update first log_offset of slot + let inst = &mut self.insts[slot - self.start_slot]; + if inst.log_offset == 0 { + inst.log_offset = self.log_offset; + } + assert!(inst.log_offset <= self.log_offset); + // then update self.log_offset self.log_offset = now_size; } else { return logged_err!(self.id; "unexpected log result type: {:?}", log_result); @@ -712,7 +722,10 @@ impl MultiPaxosReplica { let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if (inst.status != Status::Preparing) || (ballot < inst.bal) { + if !self.is_leader + || (inst.status != Status::Preparing) + || (ballot < inst.bal) + { return Ok(()); } assert_eq!(inst.bal, ballot); @@ -842,7 +855,10 @@ impl MultiPaxosReplica { let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious 
duplications and outdated replies - if (inst.status != Status::Accepting) || (ballot < inst.bal) { + if !self.is_leader + || (inst.status != Status::Accepting) + || (ballot < inst.bal) + { return Ok(()); } assert_eq!(inst.bal, ballot); @@ -1039,6 +1055,11 @@ impl MultiPaxosReplica { if inst.status < Status::Committed { inst.bal = self.bal_prep_sent; inst.status = Status::Preparing; + inst.leader_bk = Some(LeaderBookkeeping { + prepare_acks: Bitmap::new(self.population, false), + prepare_max_bal: 0, + accept_acks: Bitmap::new(self.population, false), + }); pf_debug!(self.id; "enter Prepare phase for slot {} bal {}", slot, inst.bal); @@ -1132,6 +1153,8 @@ impl MultiPaxosReplica { &mut self, durable: bool, ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got restart req"); + // send leave notification to peers and wait for their replies self.transport_hub.leave().await?; @@ -1273,6 +1296,11 @@ impl MultiPaxosReplica { inst.status = Status::Accepting; inst.reqs = reqs.clone(); inst.voted = (ballot, reqs); + // it could be the case that the PrepareBal action for this + // ballot has been snapshotted + if self.bal_prep_sent < ballot { + self.bal_prep_sent = ballot; + } // update bal_prepared and bal_max_seen if self.bal_prepared < ballot { self.bal_prepared = ballot; @@ -1367,45 +1395,54 @@ impl MultiPaxosReplica { } /// Dump a new key-value pair to snapshot file. - async fn snapshot_dump_kv_pair( - &mut self, - key: String, - value: String, - ) -> Result<(), SummersetError> { + async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { + // collect all key-value pairs put up to exec_bar + let mut pairs = HashMap::new(); + for slot in self.start_slot..self.exec_bar { + let inst = &self.insts[slot - self.start_slot]; + for (_, req) in inst.reqs.clone() { + if let ApiRequest::Req { + cmd: Command::Put { key, value }, + .. + } = req + { + pairs.insert(key, value); + } + } + } + + // write the collection to snapshot file self.snapshot_hub.submit_action( 0, // using 0 as dummy log action ID LogAction::Append { - entry: SnapEntry::NewKVPair { key, value }, + entry: SnapEntry::KVPairSet { pairs }, sync: self.config.logger_sync, }, )?; let (_, log_result) = self.snapshot_hub.get_result().await?; - if let LogResult::Write { - offset_ok: true, - now_size, - } = log_result - { + if let LogResult::Append { now_size } = log_result { self.snap_offset = now_size; Ok(()) } else { logged_err!( self.id; - "unexpected log result type or failed write" + "unexpected log result type" ) } } - /// Squash the durable WAL log, discarding everything older than start_slot. - async fn snapshot_squash_log(&mut self) -> Result<(), SummersetError> { - // read entries until one >= start_slot found - let mut cut_offset = 0; - loop { - self.storage_hub.submit_action( - 0, // using 0 as dummy log action ID - LogAction::Read { offset: cut_offset }, - )?; + /// Discard everything older than start_slot in durable WAL log. 
+ async fn snapshot_discard_log(&mut self) -> Result<(), SummersetError> { + let cut_offset = if !self.insts.is_empty() { + self.insts[0].log_offset + } else { + self.log_offset + }; - let mut found = false; + // discard the log before cut_offset + if cut_offset > 0 { + self.storage_hub + .submit_action(0, LogAction::Discard { offset: cut_offset })?; loop { let (action_id, log_result) = self.storage_hub.get_result().await?; @@ -1413,98 +1450,70 @@ impl MultiPaxosReplica { // normal log action previously in queue; process it self.handle_log_result(action_id, log_result)?; } else { - match log_result { - LogResult::Read { - entry: Some(entry), - end_offset, - } => { - let slot = match entry { - LogEntry::PrepareBal { slot, .. } => slot, - LogEntry::AcceptData { slot, .. } => slot, - LogEntry::CommitSlot { slot } => slot, - }; - if slot >= self.start_slot { - // first entry >= start_slot found - found = true; - } else { - // not found yet - cut_offset = end_offset; - } - } - LogResult::Read { entry: None, .. } => { - // end of WAL log - found = true; - } - _ => { - return logged_err!(self.id; "unexpected log result type"); - } + if let LogResult::Discard { + offset_ok: true, + now_size, + } = log_result + { + assert_eq!(self.log_offset - cut_offset, now_size); + self.log_offset = now_size; + } else { + return logged_err!( + self.id; + "unexpected log result type or failed discard" + ); } break; } } - - if found { - break; - } } - // discard the log before cut_offset - if cut_offset > 0 { - self.storage_hub - .submit_action(0, LogAction::Discard { offset: cut_offset })?; - let (_, log_result) = self.storage_hub.get_result().await?; - if let LogResult::Discard { - offset_ok: true, - now_size, - } = log_result - { - assert_eq!(self.log_offset - cut_offset, now_size); - self.log_offset = now_size; - } else { - return logged_err!( - self.id; - "unexpected log result type or failed discard" - ); + // update inst.log_offset for all remaining in-mem instances + for inst in &mut self.insts { + if inst.log_offset > 0 { + assert!(inst.log_offset >= cut_offset); + inst.log_offset -= cut_offset; } } + Ok(()) } /// Take a snapshot up to current exec_idx, then discard the in-mem log up - /// to that index, and squash the durable WAL log file. + /// to that index as well as outdate entries in the durable WAL log file. /// /// NOTE: the current implementation does not guard against crashes in the /// middle of taking a snapshot. async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { - pf_debug!(self.id; "taking a new snapshot: start {} exec {}", + pf_debug!(self.id; "taking new snapshot: start {} exec {}", self.start_slot, self.exec_bar); assert!(self.exec_bar >= self.start_slot); if self.exec_bar == self.start_slot { return Ok(()); } - // dump all Puts in executed instances - for slot in self.start_slot..self.exec_bar { - let inst = &self.insts[slot - self.start_slot]; - for (_, req) in inst.reqs.clone() { - if let ApiRequest::Req { - cmd: Command::Put { key, value }, - .. 
- } = req - { - self.snapshot_dump_kv_pair(key, value).await?; - } - } + // collect and dump all Puts in executed instances + if self.is_leader { + // NOTE: broadcast heartbeats here to appease followers + self.bcast_heartbeats()?; } + self.snapshot_dump_kv_pairs().await?; // update start_slot and discard all in-memory log instances up to exec_bar self.insts.drain(0..(self.exec_bar - self.start_slot)); self.start_slot = self.exec_bar; - // squash the durable WAL log, discarding everything older than start_slot - self.snapshot_squash_log().await?; + // discarding everything older than start_slot in WAL log + if self.is_leader { + // NOTE: broadcast heartbeats here to appease followers + self.bcast_heartbeats()?; + } + self.snapshot_discard_log().await?; - pf_debug!(self.id; "took new snapshot up to: start {}", self.start_slot); + // reset the leader heartbeat hear timer + self.kickoff_hb_hear_timer()?; + + pf_info!(self.id; "took snapshot up to: start {}", self.start_slot); Ok(()) } @@ -1539,13 +1548,17 @@ impl MultiPaxosReplica { match log_result { LogResult::Read { - entry: Some(SnapEntry::NewKVPair { key, value }), + entry: Some(SnapEntry::KVPairSet { pairs }), end_offset, } => { - // execute a Put command on state machine - self.state_machine - .submit_cmd(0, Command::Put { key, value })?; - let _ = self.state_machine.get_result().await?; + // execute Put commands on state machine + for (key, value) in pairs { + self.state_machine.submit_cmd( + 0, + Command::Put { key, value }, + )?; + let _ = self.state_machine.get_result().await?; + } // update snapshot file offset self.snap_offset = end_offset; } @@ -1615,6 +1628,7 @@ impl GenericReplica for MultiPaxosReplica { backer_path, logger_sync, hb_hear_timeout_min, hb_hear_timeout_max, hb_send_interval_ms, + snapshot_path, snapshot_interval_s, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; if config.batch_interval_us == 0 { @@ -1719,14 +1733,14 @@ impl GenericReplica for MultiPaxosReplica { time::interval(Duration::from_millis(config.hb_send_interval_ms)); hb_send_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); - let snapshot_interval = if config.snapshot_interval_s == 0 { - None - } else { - let mut si = - time::interval(Duration::from_secs(config.snapshot_interval_s)); - si.set_missed_tick_behavior(MissedTickBehavior::Skip); - Some(si) - }; + let mut snapshot_interval = time::interval(Duration::from_secs( + if config.snapshot_interval_s > 0 { + config.snapshot_interval_s + } else { + 60 // dummy non-zero value to make `time::interval` happy + }, + )); + snapshot_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); Ok(MultiPaxosReplica { id, @@ -1838,8 +1852,8 @@ impl GenericReplica for MultiPaxosReplica { }, // autonomous snapshot taking timeout - _ = self.snapshot_interval.as_mut().unwrap().tick(), if !paused - && self.snapshot_interval.is_some() => { + _ = self.snapshot_interval.tick(), if !paused + && self.config.snapshot_interval_s > 0 => { if let Err(e) = self.take_new_snapshot().await { pf_error!(self.id; "error taking a new snapshot: {}", e); } else { @@ -1859,10 +1873,6 @@ impl GenericReplica for MultiPaxosReplica { match self.handle_ctrl_msg(ctrl_msg, &mut paused).await { Ok(terminate) => { if let Some(restart) = terminate { - pf_warn!( - self.id; - "server got {} req", - if restart { "restart" } else { "shutdown" }); return Ok(restart); } }, diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index 240a18ff..378256b7 100644 --- 
a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -29,10 +29,12 @@ lazy_static! { ("client_reconnect", true), ("non_leader_reset", true), ("leader_node_reset", true), - ("two_nodes_reset", true), + ("two_nodes_reset", false), + ("all_nodes_reset", false), ("non_leader_pause", false), ("leader_node_pause", false), ("node_pause_resume", false), + ("snapshot_reset", false), ]; } @@ -308,6 +310,28 @@ impl ClientTester { } } + /// Force some server(s) to take a new snapshot. + async fn force_snapshot( + &mut self, + servers: HashSet, + ) -> Result<(), SummersetError> { + let ctrl_stub = self.driver.ctrl_stub(); + + // send TakeSnapshot request to manager + let req = CtrlRequest::TakeSnapshot { servers }; + let mut sent = ctrl_stub.send_req(Some(&req))?; + while !sent { + sent = ctrl_stub.send_req(None)?; + } + + // wat for reply from manager + let reply = ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::TakeSnapshot { .. } => Ok(()), + _ => logged_err!(self.driver.id; "unexpected control reply type"), + } + } + /// Resume some server(s) in the cluster. #[allow(dead_code)] async fn resume_servers( @@ -347,9 +371,11 @@ impl ClientTester { "non_leader_reset" => self.test_non_leader_reset().await, "leader_node_reset" => self.test_leader_node_reset().await, "two_nodes_reset" => self.test_two_nodes_reset().await, + "all_nodes_reset" => self.test_all_nodes_reset().await, "non_leader_pause" => self.test_non_leader_pause().await, "leader_node_pause" => self.test_leader_node_pause().await, "node_pause_resume" => self.test_node_pause_resume().await, + "snapshot_reset" => self.test_snapshot_reset().await, _ => { return logged_err!(self.driver.id; "unrecognized test name '{}'", name); @@ -443,7 +469,7 @@ impl ClientTester { if !is_leader { self.driver.leave(false).await?; self.reset_servers(HashSet::from([s]), true).await?; - time::sleep(Duration::from_millis(500)).await; + time::sleep(Duration::from_secs(1)).await; self.driver.connect().await?; self.checked_get("Jose", Some(Some(&v)), 0).await?; break; @@ -460,7 +486,7 @@ impl ClientTester { if is_leader { self.driver.leave(false).await?; self.reset_servers(HashSet::from([s]), true).await?; - time::sleep(Duration::from_millis(500)).await; + time::sleep(Duration::from_secs(1)).await; self.driver.connect().await?; self.checked_get("Jose", Some(Some(&v)), 0).await?; break; @@ -491,18 +517,30 @@ impl ClientTester { if resets.len() == 2 { self.driver.leave(false).await?; self.reset_servers(resets, true).await?; - time::sleep(Duration::from_millis(500)).await; + time::sleep(Duration::from_secs(1)).await; self.driver.connect().await?; self.checked_get("Jose", Some(Some(&v)), 0).await?; } Ok(()) } + /// All replica nodes crash and restart at the same time. + async fn test_all_nodes_reset(&mut self) -> Result<(), SummersetError> { + let v = Self::gen_rand_string(8); + self.checked_put("Jose", &v, Some(None), 0).await?; + self.driver.leave(false).await?; + self.reset_servers(HashSet::new(), true).await?; + time::sleep(Duration::from_secs(1)).await; + self.driver.connect().await?; + self.checked_get("Jose", Some(Some(&v)), 0).await?; + Ok(()) + } + /// Single non-leader replica node paused. async fn test_non_leader_pause(&mut self) -> Result<(), SummersetError> { let v0 = Self::gen_rand_string(8); self.checked_put("Jose", &v0, Some(None), 0).await?; - time::sleep(Duration::from_millis(300)).await; + time::sleep(Duration::from_millis(500)).await; for (s, is_leader) in self.query_servers().await? 
{ if !is_leader { self.driver.leave(false).await?; @@ -522,7 +560,7 @@ impl ClientTester { async fn test_leader_node_pause(&mut self) -> Result<(), SummersetError> { let v0 = Self::gen_rand_string(8); self.checked_put("Jose", &v0, Some(None), 0).await?; - time::sleep(Duration::from_millis(300)).await; + time::sleep(Duration::from_millis(500)).await; for (s, is_leader) in self.query_servers().await? { if is_leader { self.driver.leave(false).await?; @@ -542,7 +580,7 @@ impl ClientTester { async fn test_node_pause_resume(&mut self) -> Result<(), SummersetError> { let v0 = Self::gen_rand_string(8); self.checked_put("Jose", &v0, Some(None), 0).await?; - time::sleep(Duration::from_millis(300)).await; + time::sleep(Duration::from_millis(500)).await; for (s, is_leader) in self.query_servers().await? { if is_leader { self.driver.leave(false).await?; @@ -574,4 +612,22 @@ impl ClientTester { } Ok(()) } + + /// Take snapshot and reset, check previously put key-value. + async fn test_snapshot_reset(&mut self) -> Result<(), SummersetError> { + let v0 = Self::gen_rand_string(8); + self.checked_put("Jose", &v0, Some(None), 0).await?; + let v1 = Self::gen_rand_string(8); + self.checked_put("Shawn", &v1, Some(None), 0).await?; + time::sleep(Duration::from_millis(500)).await; + self.force_snapshot(HashSet::new()).await?; + self.checked_put("Jose", &v1, Some(Some(&v0)), 0).await?; + self.driver.leave(false).await?; + self.reset_servers(HashSet::new(), true).await?; + time::sleep(Duration::from_secs(1)).await; + self.driver.connect().await?; + self.checked_get("Shawn", Some(Some(&v1)), 0).await?; + self.checked_get("Jose", Some(Some(&v1)), 0).await?; + Ok(()) + } } From 79d4e4b9e614f537c1913a59985864dbcfebb5a5 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 24 Sep 2023 17:40:22 -0500 Subject: [PATCH 56/89] finished snapshotting impl for RSPaxos --- README.md | 5 +- src/protocols/crossword.rs | 4 +- src/protocols/multipaxos.rs | 2 +- src/protocols/rs_paxos.rs | 582 +++++++++++++++++++++++++++++------- 4 files changed, 483 insertions(+), 110 deletions(-) diff --git a/README.md b/README.md index a7250245..6a6224ce 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,8 @@ git push origin [![Format check](https://github.com/josehu07/summerset/actions/workflows/format.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Aformat) [![Build status](https://github.com/josehu07/summerset/actions/workflows/build.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Abuild) -[![Tests status](https://github.com/josehu07/summerset/actions/workflows/tests.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Atests) +[![Unit tests status](https://github.com/josehu07/summerset/actions/workflows/tests_unit.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Atests_unit) +[![Proc tests status](https://github.com/josehu07/summerset/actions/workflows/tests_proc.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Atests_proc) [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) Summerset is a distributed key-value store supporting a wide range of state machine replication (SMR) protocols for research purposes. More protocols are actively being added. 
@@ -150,7 +151,7 @@ Complete cluster management and benchmarking scripts are available in another re - [x] client-side timeout/retry logic - [x] state persistence & restart check - [x] automatic leader election, backoffs - - [ ] snapshotting & garbage collection + - [x] snapshotting & garbage collection - [ ] specialize read-only commands? - [ ] separate commit vs. exec responses? - [ ] membership discovery & view changes diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 424b0cc8..a98bbacb 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -128,7 +128,7 @@ struct ReplicaBookkeeping { source: ReplicaId, } -/// In-memory instance containing a complete commands batch. +/// In-memory instance containing a (possibly partial) commands batch. #[derive(Debug, Clone)] struct Instance { /// Ballot number. @@ -2013,7 +2013,7 @@ impl GenericEndpoint for CrosswordClient { } while self.ctrl_stub.recv_reply().await? != CtrlReply::Leave {} - pf_info!(self.id; "left current manager connection"); + pf_info!(self.id; "left manager connection"); } Ok(()) diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index a6123a8a..d9d5cd76 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -638,7 +638,7 @@ impl MultiPaxosReplica { assert!(now_size >= self.log_offset); // update first log_offset of slot let inst = &mut self.insts[slot - self.start_slot]; - if inst.log_offset == 0 { + if inst.log_offset == 0 || inst.log_offset > self.log_offset { inst.log_offset = self.log_offset; } assert!(inst.log_offset <= self.log_offset); diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 6df73631..68e09acf 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -10,9 +10,9 @@ use std::net::SocketAddr; use crate::utils::{SummersetError, Bitmap, Timer, RSCodeword}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ - ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, - ApiRequest, ApiReply, StorageHub, LogAction, LogResult, LogActionId, - TransportHub, GenericReplica, + ReplicaId, ControlHub, StateMachine, Command, CommandResult, CommandId, + ExternalApi, ApiRequest, ApiReply, StorageHub, LogAction, LogResult, + LogActionId, TransportHub, GenericReplica, }; use crate::client::{ClientId, ClientApiStub, ClientCtrlStub, GenericEndpoint}; use crate::protocols::SmrProtocol; @@ -39,7 +39,7 @@ pub struct ReplicaConfigRSPaxos { /// Client request batching maximum batch size. pub max_batch_size: usize, - /// Path to backing file. + /// Path to backing log file. pub backer_path: String, /// Whether to call `fsync()`/`fdatasync()` on logger. @@ -54,6 +54,13 @@ pub struct ReplicaConfigRSPaxos { /// Interval of leader sending heartbeats to followers. pub hb_send_interval_ms: u64, + /// Path to snapshot file. + pub snapshot_path: String, + + /// Snapshot self-triggering interval in secs. 0 means never trigger + /// snapshotting autonomously. + pub snapshot_interval_s: u64, + /// Fault-tolerance level. 
pub fault_tolerance: u8, @@ -72,9 +79,11 @@ impl Default for ReplicaConfigRSPaxos { max_batch_size: 5000, backer_path: "/tmp/summerset.rs_paxos.wal".into(), logger_sync: false, - hb_hear_timeout_min: 300, - hb_hear_timeout_max: 600, + hb_hear_timeout_min: 600, + hb_hear_timeout_max: 900, hb_send_interval_ms: 50, + snapshot_path: "/tmp/summerset.rs_paxos.snap".into(), + snapshot_interval_s: 0, fault_tolerance: 0, perf_storage_a: 0, perf_storage_b: 0, @@ -122,7 +131,7 @@ struct ReplicaBookkeeping { source: ReplicaId, } -/// In-memory instance containing a complete commands batch. +/// In-memory instance containing a (possibly partial) commands batch. #[derive(Debug, Clone)] struct Instance { /// Ballot number. @@ -145,6 +154,9 @@ struct Instance { /// True if from external client, else false. external: bool, + + /// Offset of first durable WAL log entry related to this instance. + log_offset: usize, } /// Stable storage log entry type. @@ -164,6 +176,17 @@ enum LogEntry { CommitSlot { slot: usize }, } +/// Snapshot file entry type. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] +enum SnapEntry { + /// First entry at the start of file: number of log instances covered by + /// this snapshot file == the start slot index of in-mem log. + StartSlot { slot: usize }, + + /// Set of key-value pairs to apply to the state. + KVPairSet { pairs: HashMap }, +} + /// Peer-peer message type. #[derive(Debug, Clone, Serialize, Deserialize, GetSize)] enum PeerMsg { @@ -192,11 +215,11 @@ enum PeerMsg { /// Commit notification from leader to replicas. Commit { slot: usize }, - /// Recovery read from new leader to replicas. - Recover { slot: usize }, + /// Reconstruction read from new leader to replicas. + Reconstruct { slot: usize }, - /// Recovery read reply from replica to leader. - RecoverReply { + /// Reconstruction read reply from replica to leader. + ReconstructReply { slot: usize, ballot: Ballot, reqs_cw: RSCodeword, @@ -238,6 +261,9 @@ pub struct RSPaxosReplica { /// StorageHub module. storage_hub: StorageHub, + /// StorageHub module for the snapshot file. + snapshot_hub: StorageHub, + /// TransportHub module. transport_hub: TransportHub, @@ -253,6 +279,12 @@ pub struct RSPaxosReplica { /// In-memory log of instances. insts: Vec, + /// Start slot index of in-mem log after latest snapshot. + start_slot: usize, + + /// Timer for taking a new autonomous snapshot. + snapshot_interval: Interval, + /// Largest ballot number that a leader has sent Prepare messages in. bal_prep_sent: Ballot, @@ -266,12 +298,15 @@ pub struct RSPaxosReplica { commit_bar: usize, /// Index of the first non-executed instance. - /// It is always true that exec_bar <= commit_bar <= insts.len() + /// It is always true that exec_bar <= commit_bar <= start_slot + insts.len() exec_bar: usize, /// Current durable log file offset. log_offset: usize, + /// Current durable snapshot file offset. + snap_offset: usize, + /// Fixed Reed-Solomon coder. 
rs_coder: ReedSolomon, } @@ -296,6 +331,7 @@ impl RSPaxosReplica { leader_bk: None, replica_bk: None, external: false, + log_offset: 0, }) } @@ -388,15 +424,15 @@ impl RSPaxosReplica { // create a new instance in the first null slot (or append a new one // at the end if no holes exist) - let mut slot = self.insts.len(); - for s in self.commit_bar..self.insts.len() { - if self.insts[s].status == Status::Null { + let mut slot = self.start_slot + self.insts.len(); + for s in self.commit_bar..(self.start_slot + self.insts.len()) { + if self.insts[s - self.start_slot].status == Status::Null { slot = s; break; } } - if slot < self.insts.len() { - let old_inst = &mut self.insts[slot]; + if slot < self.start_slot + self.insts.len() { + let old_inst = &mut self.insts[slot - self.start_slot]; assert_eq!(old_inst.status, Status::Null); old_inst.reqs_cw = reqs_cw; old_inst.leader_bk = Some(LeaderBookkeeping { @@ -427,7 +463,7 @@ impl RSPaxosReplica { self.bal_max_seen = self.bal_prep_sent; } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = self.bal_prep_sent; inst.status = Status::Preparing; pf_debug!(self.id; "enter Prepare phase for slot {} bal {}", @@ -459,7 +495,7 @@ impl RSPaxosReplica { slot, inst.bal); } else { // normal case: Prepare phase covered, only do the Accept phase - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = self.bal_prepared; inst.status = Status::Accepting; pf_debug!(self.id; "enter Accept phase for slot {} bal {}", @@ -515,9 +551,12 @@ impl RSPaxosReplica { &mut self, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "finished PrepareBal logging for slot {} bal {}", - slot, self.insts[slot].bal); - let inst = &self.insts[slot]; + slot, self.insts[slot - self.start_slot].bal); + let inst = &self.insts[slot - self.start_slot]; let voted = if inst.voted.0 > 0 { Some(inst.voted.clone()) } else { @@ -532,18 +571,18 @@ impl RSPaxosReplica { } else { // on follower replica, finishing the logging of a // PrepareBal entry leads to sending back a Prepare reply - assert!(inst.replica_bk.is_some()); - let source = inst.replica_bk.as_ref().unwrap().source; - self.transport_hub.send_msg( - PeerMsg::PrepareReply { - slot, - ballot: inst.bal, - voted, - }, - source, - )?; - pf_trace!(self.id; "sent PrepareReply -> {} for slot {} bal {}", - source, slot, inst.bal); + if let Some(ReplicaBookkeeping { source }) = inst.replica_bk { + self.transport_hub.send_msg( + PeerMsg::PrepareReply { + slot, + ballot: inst.bal, + voted, + }, + source, + )?; + pf_trace!(self.id; "sent PrepareReply -> {} for slot {} bal {}", + source, slot, inst.bal); + } } Ok(()) @@ -554,9 +593,12 @@ impl RSPaxosReplica { &mut self, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "finished AcceptData logging for slot {} bal {}", - slot, self.insts[slot].bal); - let inst = &self.insts[slot]; + slot, self.insts[slot - self.start_slot].bal); + let inst = &self.insts[slot - self.start_slot]; if self.is_leader { // on leader, finishing the logging of an AcceptData entry @@ -566,17 +608,17 @@ impl RSPaxosReplica { } else { // on follower replica, finishing the logging of an // AcceptData entry leads to sending back an Accept reply - assert!(inst.replica_bk.is_some()); - let source = inst.replica_bk.as_ref().unwrap().source; - 
self.transport_hub.send_msg( - PeerMsg::AcceptReply { - slot, - ballot: inst.bal, - }, - source, - )?; - pf_trace!(self.id; "sent AcceptReply -> {} for slot {} bal {}", - source, slot, inst.bal); + if let Some(ReplicaBookkeeping { source }) = inst.replica_bk { + self.transport_hub.send_msg( + PeerMsg::AcceptReply { + slot, + ballot: inst.bal, + }, + source, + )?; + pf_trace!(self.id; "sent AcceptReply -> {} for slot {} bal {}", + source, slot, inst.bal); + } } Ok(()) @@ -587,14 +629,16 @@ impl RSPaxosReplica { &mut self, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "finished CommitSlot logging for slot {} bal {}", - slot, self.insts[slot].bal); - assert!(self.insts[slot].status >= Status::Committed); + slot, self.insts[slot - self.start_slot].bal); // update index of the first non-committed instance if slot == self.commit_bar { - while self.commit_bar < self.insts.len() { - let inst = &mut self.insts[self.commit_bar]; + while self.commit_bar < self.start_slot + self.insts.len() { + let inst = &mut self.insts[self.commit_bar - self.start_slot]; if inst.status < Status::Committed { break; } @@ -643,10 +687,20 @@ impl RSPaxosReplica { log_result: LogResult, ) -> Result<(), SummersetError> { let (slot, entry_type) = Self::split_log_action_id(action_id); - assert!(slot < self.insts.len()); + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } + assert!(slot < self.start_slot + self.insts.len()); if let LogResult::Append { now_size } = log_result { assert!(now_size >= self.log_offset); + // update first log_offset of slot + let inst = &mut self.insts[slot - self.start_slot]; + if inst.log_offset == 0 || inst.log_offset > self.log_offset { + inst.log_offset = self.log_offset; + } + assert!(inst.log_offset <= self.log_offset); + // then update self.log_offset self.log_offset = now_size; } else { return logged_err!(self.id; "unexpected log result type: {:?}", log_result); @@ -669,16 +723,19 @@ impl RSPaxosReplica { slot: usize, ballot: Ballot, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received Prepare <- {} for slot {} bal {}", peer, slot, ballot); // if ballot is not smaller than what I have seen: if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; assert!(inst.bal <= ballot); inst.bal = ballot; @@ -711,17 +768,23 @@ impl RSPaxosReplica { ballot: Ballot, voted: Option<(Ballot, RSCodeword)>, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received PrepareReply <- {} for slot {} bal {} shards {:?}", peer, slot, ballot, voted.as_ref().map(|(_, cw)| cw.avail_shards_map())); // if ballot is what I'm currently waiting on for Prepare replies: if ballot == self.bal_prep_sent { - assert!(slot < self.insts.len()); - let inst = &mut self.insts[slot]; + assert!(slot < self.start_slot + self.insts.len()); + let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if (inst.status != Status::Preparing) || (ballot < inst.bal) { + if !self.is_leader + || (inst.status != Status::Preparing) 
+ || (ballot < inst.bal) + { return Ok(()); } assert_eq!(inst.bal, ballot); @@ -826,16 +889,19 @@ impl RSPaxosReplica { ballot: Ballot, reqs_cw: RSCodeword, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received Accept <- {} for slot {} bal {} shards {:?}", peer, slot, ballot, reqs_cw.avail_shards_map()); // if ballot is not smaller than what I have made promises for: if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; assert!(inst.bal <= ballot); inst.bal = ballot; @@ -873,16 +939,22 @@ impl RSPaxosReplica { slot: usize, ballot: Ballot, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received AcceptReply <- {} for slot {} bal {}", peer, slot, ballot); // if ballot is what I'm currently waiting on for Accept replies: if ballot == self.bal_prepared { - assert!(slot < self.insts.len()); - let inst = &mut self.insts[slot]; + assert!(slot < self.start_slot + self.insts.len()); + let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if (inst.status != Status::Accepting) || (ballot < inst.bal) { + if !self.is_leader + || (inst.status != Status::Accepting) + || (ballot < inst.bal) + { return Ok(()); } assert_eq!(inst.bal, ballot); @@ -934,13 +1006,16 @@ impl RSPaxosReplica { peer: ReplicaId, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received Commit <- {} for slot {}", peer, slot); // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications if inst.status != Status::Accepting { @@ -966,19 +1041,22 @@ impl RSPaxosReplica { Ok(()) } - /// Handler of Recover message from leader. - fn handle_msg_recover( + /// Handler of Reconstruct message from leader. 
+ fn handle_msg_reconstruct( &mut self, peer: ReplicaId, slot: usize, ) -> Result<(), SummersetError> { - pf_trace!(self.id; "received Recover <- {} for slot {}", peer, slot); + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } + pf_trace!(self.id; "received Reconstruct <- {} for slot {}", peer, slot); // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications; also ignore if I have nothing to send back if inst.status < Status::Accepting || inst.reqs_cw.avail_shards() == 0 { @@ -987,32 +1065,35 @@ impl RSPaxosReplica { // send back my ballot for this slot and the available shards self.transport_hub.send_msg( - PeerMsg::RecoverReply { + PeerMsg::ReconstructReply { slot, ballot: inst.bal, reqs_cw: inst.reqs_cw.clone(), }, peer, )?; - pf_trace!(self.id; "sent RecoverReply message for slot {} bal {}", slot, inst.bal); + pf_trace!(self.id; "sent ReconstructReply message for slot {} bal {}", slot, inst.bal); Ok(()) } - /// Handler of Recover reply from replica. - fn handle_msg_recover_reply( + /// Handler of Reconstruct reply from replica. + fn handle_msg_reconstruct_reply( &mut self, peer: ReplicaId, slot: usize, ballot: Ballot, reqs_cw: RSCodeword, ) -> Result<(), SummersetError> { - pf_trace!(self.id; "received RecoverReply <- {} for slot {} bal {} shards {:?}", + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } + pf_trace!(self.id; "received ReconstructReply <- {} for slot {} bal {} shards {:?}", peer, slot, ballot, reqs_cw.avail_shards_map()); - assert!(slot < self.insts.len()); - assert!(self.insts[slot].status >= Status::Committed); - let num_insts = self.insts.len(); - let inst = &mut self.insts[slot]; + assert!(slot < self.start_slot + self.insts.len()); + assert!(self.insts[slot - self.start_slot].status >= Status::Committed); + let num_insts = self.start_slot + self.insts.len(); + let inst = &mut self.insts[slot - self.start_slot]; // if reply not outdated and ballot is up-to-date if inst.status < Status::Executed && ballot >= inst.bal { @@ -1022,7 +1103,8 @@ impl RSPaxosReplica { // if enough shards have been gathered, can push execution forward if slot == self.commit_bar { while self.commit_bar < num_insts { - let inst = &mut self.insts[self.commit_bar]; + let inst = + &mut self.insts[self.commit_bar - self.start_slot]; if inst.status < Status::Committed || inst.reqs_cw.avail_shards() < self.quorum_cnt { @@ -1089,12 +1171,14 @@ impl RSPaxosReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), - PeerMsg::Recover { slot } => self.handle_msg_recover(peer, slot), - PeerMsg::RecoverReply { + PeerMsg::Reconstruct { slot } => { + self.handle_msg_reconstruct(peer, slot) + } + PeerMsg::ReconstructReply { slot, ballot, reqs_cw, - } => self.handle_msg_recover_reply(peer, slot, ballot, reqs_cw), + } => self.handle_msg_reconstruct_reply(peer, slot, ballot, reqs_cw), PeerMsg::Heartbeat { ballot } => self.heard_heartbeat(peer, ballot), } } @@ -1106,11 +1190,14 @@ impl RSPaxosReplica { cmd_result: CommandResult, ) -> Result<(), SummersetError> { let (slot, cmd_idx) = Self::split_command_id(cmd_id); - assert!(slot < self.insts.len()); + if slot < self.start_slot { + return Ok(()); // ignore if slot index 
outdated + } + assert!(slot < self.start_slot + self.insts.len()); pf_trace!(self.id; "executed cmd in instance at slot {} idx {}", slot, cmd_idx); - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; let reqs = inst.reqs_cw.get_data()?; assert!(cmd_idx < reqs.len()); let (client, ref req) = reqs[cmd_idx]; @@ -1142,8 +1229,8 @@ impl RSPaxosReplica { // update index of the first non-executed instance if slot == self.exec_bar { - while self.exec_bar < self.insts.len() { - let inst = &mut self.insts[self.exec_bar]; + while self.exec_bar < self.start_slot + self.insts.len() { + let inst = &mut self.insts[self.exec_bar - self.start_slot]; if inst.status < Status::Executed { break; } @@ -1175,11 +1262,21 @@ impl RSPaxosReplica { self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen); self.bal_max_seen = self.bal_prep_sent; - for (slot, inst) in self.insts.iter_mut().enumerate() { + for (slot, inst) in self + .insts + .iter_mut() + .enumerate() + .map(|(s, i)| (self.start_slot + s, i)) + { // redo Prepare phase for all in-progress instances if inst.status < Status::Committed { inst.bal = self.bal_prep_sent; inst.status = Status::Preparing; + inst.leader_bk = Some(LeaderBookkeeping { + prepare_acks: Bitmap::new(self.population, false), + prepare_max_bal: 0, + accept_acks: Bitmap::new(self.population, false), + }); pf_debug!(self.id; "enter Prepare phase for slot {} bal {}", slot, inst.bal); @@ -1209,14 +1306,14 @@ impl RSPaxosReplica { slot, inst.bal); } - // do recovery reads for all committed instances that do not + // do reconstruction reads for all committed instances that do not // hold enough available shards for reconstruction if inst.status == Status::Committed && inst.reqs_cw.avail_shards() < self.quorum_cnt { self.transport_hub - .bcast_msg(PeerMsg::Recover { slot }, None)?; - pf_trace!(self.id; "broadcast Recover messages for slot {} bal {} shards {:?}", + .bcast_msg(PeerMsg::Reconstruct { slot }, None)?; + pf_trace!(self.id; "broadcast Reconstruct messages for slot {} bal {} shards {:?}", slot, inst.bal, inst.reqs_cw.avail_shards_map()); } } @@ -1284,6 +1381,8 @@ impl RSPaxosReplica { &mut self, durable: bool, ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got restart req"); + // send leave notification to peers and wait for their replies self.transport_hub.leave().await?; @@ -1343,6 +1442,19 @@ impl RSPaxosReplica { Ok(()) } + /// Handler of TakeSnapshot control message. + async fn handle_ctrl_take_snapshot( + &mut self, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server told to take snapshot"); + self.take_new_snapshot().await?; + + self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { + new_start: self.start_slot, + })?; + Ok(()) + } + /// Synthesized handler of manager control messages. If ok, returns /// `Some(true)` if decides to terminate and reboot, `Some(false)` if /// decides to shutdown completely, and `None` if not terminating. 
@@ -1367,6 +1479,11 @@ impl RSPaxosReplica { Ok(None) } + CtrlMsg::TakeSnapshot => { + self.handle_ctrl_take_snapshot().await?; + Ok(None) + } + _ => Ok(None), // ignore all other types } } @@ -1379,11 +1496,11 @@ impl RSPaxosReplica { match entry { LogEntry::PrepareBal { slot, ballot } => { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } // update instance state - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = ballot; inst.status = Status::Preparing; // update bal_prep_sent and bal_max_seen, reset bal_prepared @@ -1402,15 +1519,20 @@ impl RSPaxosReplica { reqs_cw, } => { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } // update instance state - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = ballot; inst.status = Status::Accepting; inst.reqs_cw = reqs_cw.clone(); inst.voted = (ballot, reqs_cw); + // it could be the case that the PrepareBal action for this + // ballot has been snapshotted + if self.bal_prep_sent < ballot { + self.bal_prep_sent = ballot; + } // update bal_prepared and bal_max_seen if self.bal_prepared < ballot { self.bal_prepared = ballot; @@ -1422,14 +1544,15 @@ impl RSPaxosReplica { } LogEntry::CommitSlot { slot } => { - assert!(slot < self.insts.len()); + assert!(slot < self.start_slot + self.insts.len()); // update instance state - self.insts[slot].status = Status::Committed; + self.insts[slot - self.start_slot].status = Status::Committed; // submit commands in contiguously committed instance to the // state machine if slot == self.commit_bar { - while self.commit_bar < self.insts.len() { - let inst = &mut self.insts[self.commit_bar]; + while self.commit_bar < self.start_slot + self.insts.len() { + let inst = + &mut self.insts[self.commit_bar - self.start_slot]; if inst.status < Status::Committed { break; } @@ -1513,6 +1636,221 @@ impl RSPaxosReplica { logged_err!(self.id; "unexpected log result type or failed truncate") } } + + /// Dump a new key-value pair to snapshot file. + async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { + // collect all key-value pairs put up to exec_bar + let mut pairs = HashMap::new(); + for slot in self.start_slot..self.exec_bar { + let inst = &mut self.insts[slot - self.start_slot]; + assert!(inst.reqs_cw.avail_data_shards() >= self.quorum_cnt); + for (_, req) in inst.reqs_cw.get_data()?.clone() { + if let ApiRequest::Req { + cmd: Command::Put { key, value }, + .. + } = req + { + pairs.insert(key, value); + } + } + } + + // write the collection to snapshot file + self.snapshot_hub.submit_action( + 0, // using 0 as dummy log action ID + LogAction::Append { + entry: SnapEntry::KVPairSet { pairs }, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + if let LogResult::Append { now_size } = log_result { + self.snap_offset = now_size; + Ok(()) + } else { + logged_err!( + self.id; + "unexpected log result type" + ) + } + } + + /// Discard everything older than start_slot in durable WAL log. 
+ async fn snapshot_discard_log(&mut self) -> Result<(), SummersetError> { + let cut_offset = if !self.insts.is_empty() { + self.insts[0].log_offset + } else { + self.log_offset + }; + + // discard the log before cut_offset + if cut_offset > 0 { + self.storage_hub + .submit_action(0, LogAction::Discard { offset: cut_offset })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id != 0 { + // normal log action previously in queue; process it + self.handle_log_result(action_id, log_result)?; + } else { + if let LogResult::Discard { + offset_ok: true, + now_size, + } = log_result + { + assert_eq!(self.log_offset - cut_offset, now_size); + self.log_offset = now_size; + } else { + return logged_err!( + self.id; + "unexpected log result type or failed discard" + ); + } + break; + } + } + } + + // update inst.log_offset for all remaining in-mem instances + for inst in &mut self.insts { + if inst.log_offset > 0 { + assert!(inst.log_offset >= cut_offset); + inst.log_offset -= cut_offset; + } + } + + Ok(()) + } + + /// Take a snapshot up to current exec_idx, then discard the in-mem log up + /// to that index as well as outdate entries in the durable WAL log file. + /// + /// NOTE: the current implementation does not guard against crashes in the + /// middle of taking a snapshot. + async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { + pf_debug!(self.id; "taking new snapshot: start {} exec {}", + self.start_slot, self.exec_bar); + assert!(self.exec_bar >= self.start_slot); + if self.exec_bar == self.start_slot { + return Ok(()); + } + + // collect and dump all Puts in executed instances + if self.is_leader { + // NOTE: broadcast heartbeats here to appease followers + self.bcast_heartbeats()?; + } + self.snapshot_dump_kv_pairs().await?; + + // update start_slot and discard all in-memory log instances up to exec_bar + self.insts.drain(0..(self.exec_bar - self.start_slot)); + self.start_slot = self.exec_bar; + + // discarding everything older than start_slot in WAL log + if self.is_leader { + // NOTE: broadcast heartbeats here to appease followers + self.bcast_heartbeats()?; + } + self.snapshot_discard_log().await?; + + // reset the leader heartbeat hear timer + self.kickoff_hb_hear_timer()?; + + pf_info!(self.id; "took snapshot up to: start {}", self.start_slot); + Ok(()) + } + + /// Recover initial state from durable storage snapshot file. 
+ async fn recover_from_snapshot(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.snap_offset, 0); + + // first, try to read the first several bytes, which should record the + // start_slot index + self.snapshot_hub + .submit_action(0, LogAction::Read { offset: 0 })?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(SnapEntry::StartSlot { slot }), + end_offset, + } => { + self.snap_offset = end_offset; + self.start_slot = slot; // get start slot index of in-mem log + + // repeatedly apply key-value pairs + loop { + self.snapshot_hub.submit_action( + 0, + LogAction::Read { + offset: self.snap_offset, + }, + )?; + let (_, log_result) = + self.snapshot_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(SnapEntry::KVPairSet { pairs }), + end_offset, + } => { + // execute Put commands on state machine + for (key, value) in pairs { + self.state_machine.submit_cmd( + 0, + Command::Put { key, value }, + )?; + let _ = self.state_machine.get_result().await?; + } + // update snapshot file offset + self.snap_offset = end_offset; + } + LogResult::Read { entry: None, .. } => { + // end of log reached + break; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + } + + // tell manager about my start_slot index + self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { + new_start: self.start_slot, + })?; + Ok(()) + } + + LogResult::Read { entry: None, .. } => { + // snapshot file is empty. Write a 0 as start_slot and return + self.snapshot_hub.submit_action( + 0, + LogAction::Write { + entry: SnapEntry::StartSlot { slot: 0 }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + if let LogResult::Write { + offset_ok: true, + now_size, + } = log_result + { + self.snap_offset = now_size; + Ok(()) + } else { + logged_err!(self.id; "unexpected log result type or failed truncate") + } + } + + _ => { + logged_err!(self.id; "unexpected log result type") + } + } + } } #[async_trait] @@ -1533,7 +1871,9 @@ impl GenericReplica for RSPaxosReplica { batch_interval_us, max_batch_size, backer_path, logger_sync, hb_hear_timeout_min, hb_hear_timeout_max, - hb_send_interval_ms, fault_tolerance, + hb_send_interval_ms, + snapshot_path, snapshot_interval_s, + fault_tolerance, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; if config.batch_interval_us == 0 { @@ -1629,6 +1969,14 @@ impl GenericReplica for RSPaxosReplica { } transport_hub.wait_for_group(population).await?; + // setup snapshot hub module + let snapshot_hub = StorageHub::new_and_setup( + id, + Path::new(&config.snapshot_path), + None, + ) + .await?; + // setup external API module, ready to take in client requests let external_api = ExternalApi::new_and_setup( id, @@ -1642,6 +1990,15 @@ impl GenericReplica for RSPaxosReplica { time::interval(Duration::from_millis(config.hb_send_interval_ms)); hb_send_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + let mut snapshot_interval = time::interval(Duration::from_secs( + if config.snapshot_interval_s > 0 { + config.snapshot_interval_s + } else { + 60 // dummy non-zero value to make `time::interval` happy + }, + )); + snapshot_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + Ok(RSPaxosReplica { id, population, @@ -1653,17 +2010,21 @@ impl GenericReplica for RSPaxosReplica { external_api, state_machine, storage_hub, + snapshot_hub, transport_hub, hb_hear_timer: Timer::new(), 
hb_send_interval, is_leader: false, insts: vec![], + start_slot: 0, + snapshot_interval, bal_prep_sent: 0, bal_prepared: 0, bal_max_seen: 0, commit_bar: 0, exec_bar: 0, log_offset: 0, + snap_offset: 0, rs_coder, }) } @@ -1672,7 +2033,10 @@ impl GenericReplica for RSPaxosReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { - // recover state from durable storage log + // recover state from durable snapshot file + self.recover_from_snapshot().await?; + + // recover the tail-piece memory log & state from durable storage log self.recover_from_log().await?; // kick off leader activity hearing timer @@ -1745,6 +2109,18 @@ impl GenericReplica for RSPaxosReplica { } }, + // autonomous snapshot taking timeout + _ = self.snapshot_interval.tick(), if !paused + && self.config.snapshot_interval_s > 0 => { + if let Err(e) = self.take_new_snapshot().await { + pf_error!(self.id; "error taking a new snapshot: {}", e); + } else { + self.control_hub.send_ctrl( + CtrlMsg::SnapshotUpTo { new_start: self.start_slot } + )?; + } + }, + // manager control message ctrl_msg = self.control_hub.recv_ctrl() => { if let Err(e) = ctrl_msg { @@ -1755,10 +2131,6 @@ impl GenericReplica for RSPaxosReplica { match self.handle_ctrl_msg(ctrl_msg, &mut paused).await { Ok(terminate) => { if let Some(restart) = terminate { - pf_warn!( - self.id; - "server got {} req", - if restart { "restart" } else { "shutdown" }); return Ok(restart); } }, @@ -1885,7 +2257,7 @@ impl GenericEndpoint for RSPaxosClient { } async fn leave(&mut self, permanent: bool) -> Result<(), SummersetError> { - // send leave notification to current connected server + // send leave notification to all servers for (id, mut api_stub) in self.api_stubs.drain() { let mut sent = api_stub.send_req(Some(&ApiRequest::Leave))?; while !sent { @@ -1906,7 +2278,7 @@ impl GenericEndpoint for RSPaxosClient { } while self.ctrl_stub.recv_reply().await? 
!= CtrlReply::Leave {} - pf_info!(self.id; "left current manager connection"); + pf_info!(self.id; "left manager connection"); } Ok(()) From c77fc742effd39ae938dc97c10c682eb81767e5b Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 24 Sep 2023 17:55:43 -0500 Subject: [PATCH 57/89] finished snapshotting impl for RSPaxos --- src/protocols/crossword.rs | 3 ++- src/protocols/rs_paxos.rs | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index a98bbacb..02f26853 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -1094,7 +1094,8 @@ impl CrosswordReplica { }, peer, )?; - pf_trace!(self.id; "sent RecoverReply message for slot {} bal {}", slot, inst.bal); + pf_trace!(self.id; "sent RecoverReply message for slot {} bal {}", + slot, inst.bal); Ok(()) } diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 68e09acf..4aa5a81f 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -1072,7 +1072,8 @@ impl RSPaxosReplica { }, peer, )?; - pf_trace!(self.id; "sent ReconstructReply message for slot {} bal {}", slot, inst.bal); + pf_trace!(self.id; "sent ReconstructReply message for slot {} bal {}", + slot, inst.bal); Ok(()) } From 693d26ad88622b99e821944baf95921feaa4bb43 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 24 Sep 2023 22:02:47 -0500 Subject: [PATCH 58/89] staging progress on snapshotting --- scripts/local_cluster.py | 2 + src/protocols/crossword.rs | 576 ++++++++++++++++++++++++++++++------- 2 files changed, 476 insertions(+), 102 deletions(-) diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index 6d23db83..a088ed81 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -50,6 +50,8 @@ def kill_all_matching(name, force=False): PROTOCOL_SNAPSHOT_PATH = { "MultiPaxos": lambda r: f"snapshot_path='/tmp/summerset.multipaxos.{r}.snap'", + "RSPaxos": lambda r: f"snapshot_path='/tmp/summerset.rs_paxos.{r}.snap'", + "Crossword": lambda r: f"snapshot_path='/tmp/summerset.crossword.{r}.snap'", } diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 02f26853..f8dda64d 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -1,7 +1,8 @@ //! Replication protocol: Crossword. //! //! MultiPaxos with flexible Reed-Solomon erasure coding that supports tunable -//! shard groups and asymmetric shard assignment. +//! shard groups, asymmetric shard assignment, and follower gossiping for actual +//! usability. use std::collections::HashMap; use std::path::Path; @@ -10,9 +11,9 @@ use std::net::SocketAddr; use crate::utils::{SummersetError, Bitmap, Timer, RSCodeword}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ - ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, - ApiRequest, ApiReply, StorageHub, LogAction, LogResult, LogActionId, - TransportHub, GenericReplica, + ReplicaId, ControlHub, StateMachine, Command, CommandResult, CommandId, + ExternalApi, ApiRequest, ApiReply, StorageHub, LogAction, LogResult, + LogActionId, TransportHub, GenericReplica, }; use crate::client::{ClientId, ClientApiStub, ClientCtrlStub, GenericEndpoint}; use crate::protocols::SmrProtocol; @@ -39,7 +40,7 @@ pub struct ReplicaConfigCrossword { /// Client request batching maximum batch size. pub max_batch_size: usize, - /// Path to backing file. + /// Path to backing log file. pub backer_path: String, /// Whether to call `fsync()`/`fdatasync()` on logger. 
@@ -54,6 +55,13 @@ pub struct ReplicaConfigCrossword { /// Interval of leader sending heartbeats to followers. pub hb_send_interval_ms: u64, + /// Path to snapshot file. + pub snapshot_path: String, + + /// Snapshot self-triggering interval in secs. 0 means never trigger + /// snapshotting autonomously. + pub snapshot_interval_s: u64, + /// Fault-tolerance level. pub fault_tolerance: u8, @@ -76,9 +84,11 @@ impl Default for ReplicaConfigCrossword { max_batch_size: 5000, backer_path: "/tmp/summerset.rs_paxos.wal".into(), logger_sync: false, - hb_hear_timeout_min: 300, - hb_hear_timeout_max: 600, + hb_hear_timeout_min: 600, + hb_hear_timeout_max: 900, hb_send_interval_ms: 50, + snapshot_path: "/tmp/summerset.rs_paxos.snap".into(), + snapshot_interval_s: 0, fault_tolerance: 0, shards_per_replica: 1, perf_storage_a: 0, @@ -151,6 +161,9 @@ struct Instance { /// True if from external client, else false. external: bool, + + /// Offset of first durable WAL log entry related to this instance. + log_offset: usize, } /// Stable storage log entry type. @@ -170,6 +183,17 @@ enum LogEntry { CommitSlot { slot: usize }, } +/// Snapshot file entry type. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] +enum SnapEntry { + /// First entry at the start of file: number of log instances covered by + /// this snapshot file == the start slot index of in-mem log. + StartSlot { slot: usize }, + + /// Set of key-value pairs to apply to the state. + KVPairSet { pairs: HashMap }, +} + /// Peer-peer message type. #[derive(Debug, Clone, Serialize, Deserialize, GetSize)] enum PeerMsg { @@ -198,11 +222,11 @@ enum PeerMsg { /// Commit notification from leader to replicas. Commit { slot: usize }, - /// Recovery read from new leader to replicas. - Recover { slot: usize }, + /// Reconstruction read from new leader to replicas. + Reconstruct { slot: usize }, - /// Recovery read reply from replica to leader. - RecoverReply { + /// Reconstruction read reply from replica to leader. + ReconstructReply { slot: usize, ballot: Ballot, reqs_cw: RSCodeword, @@ -244,6 +268,9 @@ pub struct CrosswordReplica { /// StorageHub module. storage_hub: StorageHub, + /// StorageHub module for the snapshot file. + snapshot_hub: StorageHub, + /// TransportHub module. transport_hub: TransportHub, @@ -259,6 +286,12 @@ pub struct CrosswordReplica { /// In-memory log of instances. insts: Vec, + /// Start slot index of in-mem log after latest snapshot. + start_slot: usize, + + /// Timer for taking a new autonomous snapshot. + snapshot_interval: Interval, + /// Largest ballot number that a leader has sent Prepare messages in. bal_prep_sent: Ballot, @@ -272,12 +305,15 @@ pub struct CrosswordReplica { commit_bar: usize, /// Index of the first non-executed instance. - /// It is always true that exec_bar <= commit_bar <= insts.len() + /// It is always true that exec_bar <= commit_bar <= start_slot + insts.len() exec_bar: usize, /// Current durable log file offset. log_offset: usize, + /// Current durable snapshot file offset. + snap_offset: usize, + /// Fixed Reed-Solomon coder. 
rs_coder: ReedSolomon, } @@ -302,6 +338,7 @@ impl CrosswordReplica { leader_bk: None, replica_bk: None, external: false, + log_offset: 0, }) } @@ -446,15 +483,15 @@ impl CrosswordReplica { // create a new instance in the first null slot (or append a new one // at the end if no holes exist) - let mut slot = self.insts.len(); - for s in self.commit_bar..self.insts.len() { - if self.insts[s].status == Status::Null { + let mut slot = self.start_slot + self.insts.len(); + for s in self.commit_bar..(self.start_slot + self.insts.len()) { + if self.insts[s - self.start_slot].status == Status::Null { slot = s; break; } } - if slot < self.insts.len() { - let old_inst = &mut self.insts[slot]; + if slot < self.start_slot + self.insts.len() { + let old_inst = &mut self.insts[slot - self.start_slot]; assert_eq!(old_inst.status, Status::Null); old_inst.reqs_cw = reqs_cw; old_inst.leader_bk = Some(LeaderBookkeeping { @@ -485,7 +522,7 @@ impl CrosswordReplica { self.bal_max_seen = self.bal_prep_sent; } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = self.bal_prep_sent; inst.status = Status::Preparing; pf_debug!(self.id; "enter Prepare phase for slot {} bal {}", @@ -517,7 +554,7 @@ impl CrosswordReplica { slot, inst.bal); } else { // normal case: Prepare phase covered, only do the Accept phase - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = self.bal_prepared; inst.status = Status::Accepting; pf_debug!(self.id; "enter Accept phase for slot {} bal {}", @@ -588,9 +625,12 @@ impl CrosswordReplica { &mut self, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "finished PrepareBal logging for slot {} bal {}", - slot, self.insts[slot].bal); - let inst = &self.insts[slot]; + slot, self.insts[slot - self.start_slot].bal); + let inst = &self.insts[slot - self.start_slot]; let voted = if inst.voted.0 > 0 { Some(inst.voted.clone()) } else { @@ -605,18 +645,18 @@ impl CrosswordReplica { } else { // on follower replica, finishing the logging of a // PrepareBal entry leads to sending back a Prepare reply - assert!(inst.replica_bk.is_some()); - let source = inst.replica_bk.as_ref().unwrap().source; - self.transport_hub.send_msg( - PeerMsg::PrepareReply { - slot, - ballot: inst.bal, - voted, - }, - source, - )?; - pf_trace!(self.id; "sent PrepareReply -> {} for slot {} bal {}", - source, slot, inst.bal); + if let Some(ReplicaBookkeeping { source }) = inst.replica_bk { + self.transport_hub.send_msg( + PeerMsg::PrepareReply { + slot, + ballot: inst.bal, + voted, + }, + source, + )?; + pf_trace!(self.id; "sent PrepareReply -> {} for slot {} bal {}", + source, slot, inst.bal); + } } Ok(()) @@ -627,9 +667,12 @@ impl CrosswordReplica { &mut self, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "finished AcceptData logging for slot {} bal {}", - slot, self.insts[slot].bal); - let inst = &self.insts[slot]; + slot, self.insts[slot - self.start_slot].bal); + let inst = &self.insts[slot - self.start_slot]; if self.is_leader { // on leader, finishing the logging of an AcceptData entry @@ -639,17 +682,17 @@ impl CrosswordReplica { } else { // on follower replica, finishing the logging of an // AcceptData entry leads to sending back an Accept reply - assert!(inst.replica_bk.is_some()); - let source = 
inst.replica_bk.as_ref().unwrap().source; - self.transport_hub.send_msg( - PeerMsg::AcceptReply { - slot, - ballot: inst.bal, - }, - source, - )?; - pf_trace!(self.id; "sent AcceptReply -> {} for slot {} bal {}", - source, slot, inst.bal); + if let Some(ReplicaBookkeeping { source }) = inst.replica_bk { + self.transport_hub.send_msg( + PeerMsg::AcceptReply { + slot, + ballot: inst.bal, + }, + source, + )?; + pf_trace!(self.id; "sent AcceptReply -> {} for slot {} bal {}", + source, slot, inst.bal); + } } Ok(()) @@ -660,14 +703,16 @@ impl CrosswordReplica { &mut self, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "finished CommitSlot logging for slot {} bal {}", - slot, self.insts[slot].bal); - assert!(self.insts[slot].status >= Status::Committed); + slot, self.insts[slot - self.start_slot].bal); // update index of the first non-committed instance if slot == self.commit_bar { - while self.commit_bar < self.insts.len() { - let inst = &mut self.insts[self.commit_bar]; + while self.commit_bar < self.start_slot + self.insts.len() { + let inst = &mut self.insts[self.commit_bar - self.start_slot]; if inst.status < Status::Committed { break; } @@ -716,10 +761,20 @@ impl CrosswordReplica { log_result: LogResult, ) -> Result<(), SummersetError> { let (slot, entry_type) = Self::split_log_action_id(action_id); - assert!(slot < self.insts.len()); + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } + assert!(slot < self.start_slot + self.insts.len()); if let LogResult::Append { now_size } = log_result { assert!(now_size >= self.log_offset); + // update first log_offset of slot + let inst = &mut self.insts[slot - self.start_slot]; + if inst.log_offset == 0 || inst.log_offset > self.log_offset { + inst.log_offset = self.log_offset; + } + assert!(inst.log_offset <= self.log_offset); + // then update self.log_offset self.log_offset = now_size; } else { return logged_err!(self.id; "unexpected log result type: {:?}", log_result); @@ -742,16 +797,19 @@ impl CrosswordReplica { slot: usize, ballot: Ballot, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received Prepare <- {} for slot {} bal {}", peer, slot, ballot); // if ballot is not smaller than what I have seen: if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; assert!(inst.bal <= ballot); inst.bal = ballot; @@ -784,17 +842,23 @@ impl CrosswordReplica { ballot: Ballot, voted: Option<(Ballot, RSCodeword)>, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received PrepareReply <- {} for slot {} bal {} shards {:?}", peer, slot, ballot, voted.as_ref().map(|(_, cw)| cw.avail_shards_map())); // if ballot is what I'm currently waiting on for Prepare replies: if ballot == self.bal_prep_sent { - assert!(slot < self.insts.len()); - let inst = &mut self.insts[slot]; + assert!(slot < self.start_slot + self.insts.len()); + let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if (inst.status != Status::Preparing) || (ballot < inst.bal) { + if 
!self.is_leader + || (inst.status != Status::Preparing) + || (ballot < inst.bal) + { return Ok(()); } assert_eq!(inst.bal, ballot); @@ -913,16 +977,19 @@ impl CrosswordReplica { ballot: Ballot, reqs_cw: RSCodeword, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received Accept <- {} for slot {} bal {} shards {:?}", peer, slot, ballot, reqs_cw.avail_shards_map()); // if ballot is not smaller than what I have made promises for: if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; assert!(inst.bal <= ballot); inst.bal = ballot; @@ -960,16 +1027,22 @@ impl CrosswordReplica { slot: usize, ballot: Ballot, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received AcceptReply <- {} for slot {} bal {}", peer, slot, ballot); // if ballot is what I'm currently waiting on for Accept replies: if ballot == self.bal_prepared { - assert!(slot < self.insts.len()); - let inst = &mut self.insts[slot]; + assert!(slot < self.start_slot + self.insts.len()); + let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if (inst.status != Status::Accepting) || (ballot < inst.bal) { + if !self.is_leader + || (inst.status != Status::Accepting) + || (ballot < inst.bal) + { return Ok(()); } assert_eq!(inst.bal, ballot); @@ -1034,13 +1107,16 @@ impl CrosswordReplica { peer: ReplicaId, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received Commit <- {} for slot {}", peer, slot); // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications if inst.status != Status::Accepting { @@ -1066,19 +1142,22 @@ impl CrosswordReplica { Ok(()) } - /// Handler of Recover message from leader. - fn handle_msg_recover( + /// Handler of Reconstruct message from leader. 
+ fn handle_msg_reconstruct( &mut self, peer: ReplicaId, slot: usize, ) -> Result<(), SummersetError> { - pf_trace!(self.id; "received Recover <- {} for slot {}", peer, slot); + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } + pf_trace!(self.id; "received Reconstruct <- {} for slot {}", peer, slot); // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications; also ignore if I have nothing to send back if inst.status < Status::Accepting || inst.reqs_cw.avail_shards() == 0 { @@ -1087,33 +1166,36 @@ impl CrosswordReplica { // send back my ballot for this slot and the available shards self.transport_hub.send_msg( - PeerMsg::RecoverReply { + PeerMsg::ReconstructReply { slot, ballot: inst.bal, reqs_cw: inst.reqs_cw.clone(), }, peer, )?; - pf_trace!(self.id; "sent RecoverReply message for slot {} bal {}", + pf_trace!(self.id; "sent ReconstructReply message for slot {} bal {}", slot, inst.bal); Ok(()) } - /// Handler of Recover reply from replica. - fn handle_msg_recover_reply( + /// Handler of Reconstruct reply from replica. + fn handle_msg_reconstruct_reply( &mut self, peer: ReplicaId, slot: usize, ballot: Ballot, reqs_cw: RSCodeword, ) -> Result<(), SummersetError> { - pf_trace!(self.id; "received RecoverReply <- {} for slot {} bal {} shards {:?}", + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } + pf_trace!(self.id; "received ReconstructReply <- {} for slot {} bal {} shards {:?}", peer, slot, ballot, reqs_cw.avail_shards_map()); - assert!(slot < self.insts.len()); - assert!(self.insts[slot].status >= Status::Committed); - let num_insts = self.insts.len(); - let inst = &mut self.insts[slot]; + assert!(slot < self.start_slot + self.insts.len()); + assert!(self.insts[slot - self.start_slot].status >= Status::Committed); + let num_insts = self.start_slot + self.insts.len(); + let inst = &mut self.insts[slot - self.start_slot]; // if reply not outdated and ballot is up-to-date if inst.status < Status::Executed && ballot >= inst.bal { @@ -1123,7 +1205,8 @@ impl CrosswordReplica { // if enough shards have been gathered, can push execution forward if slot == self.commit_bar { while self.commit_bar < num_insts { - let inst = &mut self.insts[self.commit_bar]; + let inst = + &mut self.insts[self.commit_bar - self.start_slot]; if inst.status < Status::Committed || inst.reqs_cw.avail_shards() < self.quorum_cnt { @@ -1190,12 +1273,14 @@ impl CrosswordReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), - PeerMsg::Recover { slot } => self.handle_msg_recover(peer, slot), - PeerMsg::RecoverReply { + PeerMsg::Reconstruct { slot } => { + self.handle_msg_reconstruct(peer, slot) + } + PeerMsg::ReconstructReply { slot, ballot, reqs_cw, - } => self.handle_msg_recover_reply(peer, slot, ballot, reqs_cw), + } => self.handle_msg_reconstruct_reply(peer, slot, ballot, reqs_cw), PeerMsg::Heartbeat { ballot } => self.heard_heartbeat(peer, ballot), } } @@ -1207,11 +1292,14 @@ impl CrosswordReplica { cmd_result: CommandResult, ) -> Result<(), SummersetError> { let (slot, cmd_idx) = Self::split_command_id(cmd_id); - assert!(slot < self.insts.len()); + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } + 
assert!(slot < self.start_slot + self.insts.len()); pf_trace!(self.id; "executed cmd in instance at slot {} idx {}", slot, cmd_idx); - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; let reqs = inst.reqs_cw.get_data()?; assert!(cmd_idx < reqs.len()); let (client, ref req) = reqs[cmd_idx]; @@ -1243,8 +1331,8 @@ impl CrosswordReplica { // update index of the first non-executed instance if slot == self.exec_bar { - while self.exec_bar < self.insts.len() { - let inst = &mut self.insts[self.exec_bar]; + while self.exec_bar < self.start_slot + self.insts.len() { + let inst = &mut self.insts[self.exec_bar - self.start_slot]; if inst.status < Status::Executed { break; } @@ -1276,11 +1364,21 @@ impl CrosswordReplica { self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen); self.bal_max_seen = self.bal_prep_sent; - for (slot, inst) in self.insts.iter_mut().enumerate() { + for (slot, inst) in self + .insts + .iter_mut() + .enumerate() + .map(|(s, i)| (self.start_slot + s, i)) + { // redo Prepare phase for all in-progress instances if inst.status < Status::Committed { inst.bal = self.bal_prep_sent; inst.status = Status::Preparing; + inst.leader_bk = Some(LeaderBookkeeping { + prepare_acks: Bitmap::new(self.population, false), + prepare_max_bal: 0, + accept_acks: HashMap::new(), + }); pf_debug!(self.id; "enter Prepare phase for slot {} bal {}", slot, inst.bal); @@ -1310,14 +1408,14 @@ impl CrosswordReplica { slot, inst.bal); } - // do recovery reads for all committed instances that do not + // do reconstruction reads for all committed instances that do not // hold enough available shards for reconstruction if inst.status == Status::Committed && inst.reqs_cw.avail_shards() < self.quorum_cnt { self.transport_hub - .bcast_msg(PeerMsg::Recover { slot }, None)?; - pf_trace!(self.id; "broadcast Recover messages for slot {} bal {} shards {:?}", + .bcast_msg(PeerMsg::Reconstruct { slot }, None)?; + pf_trace!(self.id; "broadcast Reconstruct messages for slot {} bal {} shards {:?}", slot, inst.bal, inst.reqs_cw.avail_shards_map()); } } @@ -1385,6 +1483,8 @@ impl CrosswordReplica { &mut self, durable: bool, ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got restart req"); + // send leave notification to peers and wait for their replies self.transport_hub.leave().await?; @@ -1444,6 +1544,19 @@ impl CrosswordReplica { Ok(()) } + /// Handler of TakeSnapshot control message. + async fn handle_ctrl_take_snapshot( + &mut self, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server told to take snapshot"); + self.take_new_snapshot().await?; + + self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { + new_start: self.start_slot, + })?; + Ok(()) + } + /// Synthesized handler of manager control messages. If ok, returns /// `Some(true)` if decides to terminate and reboot, `Some(false)` if /// decides to shutdown completely, and `None` if not terminating. 
@@ -1468,6 +1581,11 @@ impl CrosswordReplica { Ok(None) } + CtrlMsg::TakeSnapshot => { + self.handle_ctrl_take_snapshot().await?; + Ok(None) + } + _ => Ok(None), // ignore all other types } } @@ -1480,11 +1598,11 @@ impl CrosswordReplica { match entry { LogEntry::PrepareBal { slot, ballot } => { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } // update instance state - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = ballot; inst.status = Status::Preparing; // update bal_prep_sent and bal_max_seen, reset bal_prepared @@ -1503,15 +1621,20 @@ impl CrosswordReplica { reqs_cw, } => { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } // update instance state - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = ballot; inst.status = Status::Accepting; inst.reqs_cw = reqs_cw.clone(); inst.voted = (ballot, reqs_cw); + // it could be the case that the PrepareBal action for this + // ballot has been snapshotted + if self.bal_prep_sent < ballot { + self.bal_prep_sent = ballot; + } // update bal_prepared and bal_max_seen if self.bal_prepared < ballot { self.bal_prepared = ballot; @@ -1523,14 +1646,15 @@ impl CrosswordReplica { } LogEntry::CommitSlot { slot } => { - assert!(slot < self.insts.len()); + assert!(slot < self.start_slot + self.insts.len()); // update instance state - self.insts[slot].status = Status::Committed; + self.insts[slot - self.start_slot].status = Status::Committed; // submit commands in contiguously committed instance to the // state machine if slot == self.commit_bar { - while self.commit_bar < self.insts.len() { - let inst = &mut self.insts[self.commit_bar]; + while self.commit_bar < self.start_slot + self.insts.len() { + let inst = + &mut self.insts[self.commit_bar - self.start_slot]; if inst.status < Status::Committed { break; } @@ -1614,6 +1738,221 @@ impl CrosswordReplica { logged_err!(self.id; "unexpected log result type or failed truncate") } } + + /// Dump a new key-value pair to snapshot file. + async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { + // collect all key-value pairs put up to exec_bar + let mut pairs = HashMap::new(); + for slot in self.start_slot..self.exec_bar { + let inst = &mut self.insts[slot - self.start_slot]; + assert!(inst.reqs_cw.avail_data_shards() >= self.quorum_cnt); + for (_, req) in inst.reqs_cw.get_data()?.clone() { + if let ApiRequest::Req { + cmd: Command::Put { key, value }, + .. + } = req + { + pairs.insert(key, value); + } + } + } + + // write the collection to snapshot file + self.snapshot_hub.submit_action( + 0, // using 0 as dummy log action ID + LogAction::Append { + entry: SnapEntry::KVPairSet { pairs }, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + if let LogResult::Append { now_size } = log_result { + self.snap_offset = now_size; + Ok(()) + } else { + logged_err!( + self.id; + "unexpected log result type" + ) + } + } + + /// Discard everything older than start_slot in durable WAL log. 
+ async fn snapshot_discard_log(&mut self) -> Result<(), SummersetError> { + let cut_offset = if !self.insts.is_empty() { + self.insts[0].log_offset + } else { + self.log_offset + }; + + // discard the log before cut_offset + if cut_offset > 0 { + self.storage_hub + .submit_action(0, LogAction::Discard { offset: cut_offset })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id != 0 { + // normal log action previously in queue; process it + self.handle_log_result(action_id, log_result)?; + } else { + if let LogResult::Discard { + offset_ok: true, + now_size, + } = log_result + { + assert_eq!(self.log_offset - cut_offset, now_size); + self.log_offset = now_size; + } else { + return logged_err!( + self.id; + "unexpected log result type or failed discard" + ); + } + break; + } + } + } + + // update inst.log_offset for all remaining in-mem instances + for inst in &mut self.insts { + if inst.log_offset > 0 { + assert!(inst.log_offset >= cut_offset); + inst.log_offset -= cut_offset; + } + } + + Ok(()) + } + + /// Take a snapshot up to current exec_idx, then discard the in-mem log up + /// to that index as well as outdate entries in the durable WAL log file. + /// + /// NOTE: the current implementation does not guard against crashes in the + /// middle of taking a snapshot. + async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { + pf_debug!(self.id; "taking new snapshot: start {} exec {}", + self.start_slot, self.exec_bar); + assert!(self.exec_bar >= self.start_slot); + if self.exec_bar == self.start_slot { + return Ok(()); + } + + // collect and dump all Puts in executed instances + if self.is_leader { + // NOTE: broadcast heartbeats here to appease followers + self.bcast_heartbeats()?; + } + self.snapshot_dump_kv_pairs().await?; + + // update start_slot and discard all in-memory log instances up to exec_bar + self.insts.drain(0..(self.exec_bar - self.start_slot)); + self.start_slot = self.exec_bar; + + // discarding everything older than start_slot in WAL log + if self.is_leader { + // NOTE: broadcast heartbeats here to appease followers + self.bcast_heartbeats()?; + } + self.snapshot_discard_log().await?; + + // reset the leader heartbeat hear timer + self.kickoff_hb_hear_timer()?; + + pf_info!(self.id; "took snapshot up to: start {}", self.start_slot); + Ok(()) + } + + /// Recover initial state from durable storage snapshot file. 
+ async fn recover_from_snapshot(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.snap_offset, 0); + + // first, try to read the first several bytes, which should record the + // start_slot index + self.snapshot_hub + .submit_action(0, LogAction::Read { offset: 0 })?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(SnapEntry::StartSlot { slot }), + end_offset, + } => { + self.snap_offset = end_offset; + self.start_slot = slot; // get start slot index of in-mem log + + // repeatedly apply key-value pairs + loop { + self.snapshot_hub.submit_action( + 0, + LogAction::Read { + offset: self.snap_offset, + }, + )?; + let (_, log_result) = + self.snapshot_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(SnapEntry::KVPairSet { pairs }), + end_offset, + } => { + // execute Put commands on state machine + for (key, value) in pairs { + self.state_machine.submit_cmd( + 0, + Command::Put { key, value }, + )?; + let _ = self.state_machine.get_result().await?; + } + // update snapshot file offset + self.snap_offset = end_offset; + } + LogResult::Read { entry: None, .. } => { + // end of log reached + break; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + } + + // tell manager about my start_slot index + self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { + new_start: self.start_slot, + })?; + Ok(()) + } + + LogResult::Read { entry: None, .. } => { + // snapshot file is empty. Write a 0 as start_slot and return + self.snapshot_hub.submit_action( + 0, + LogAction::Write { + entry: SnapEntry::StartSlot { slot: 0 }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + if let LogResult::Write { + offset_ok: true, + now_size, + } = log_result + { + self.snap_offset = now_size; + Ok(()) + } else { + logged_err!(self.id; "unexpected log result type or failed truncate") + } + } + + _ => { + logged_err!(self.id; "unexpected log result type") + } + } + } } #[async_trait] @@ -1635,6 +1974,7 @@ impl GenericReplica for CrosswordReplica { backer_path, logger_sync, hb_hear_timeout_min, hb_hear_timeout_max, hb_send_interval_ms, + snapshot_path, snapshot_interval_s, fault_tolerance, shards_per_replica, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; @@ -1737,6 +2077,14 @@ impl GenericReplica for CrosswordReplica { } transport_hub.wait_for_group(population).await?; + // setup snapshot hub module + let snapshot_hub = StorageHub::new_and_setup( + id, + Path::new(&config.snapshot_path), + None, + ) + .await?; + // setup external API module, ready to take in client requests let external_api = ExternalApi::new_and_setup( id, @@ -1750,6 +2098,15 @@ impl GenericReplica for CrosswordReplica { time::interval(Duration::from_millis(config.hb_send_interval_ms)); hb_send_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + let mut snapshot_interval = time::interval(Duration::from_secs( + if config.snapshot_interval_s > 0 { + config.snapshot_interval_s + } else { + 60 // dummy non-zero value to make `time::interval` happy + }, + )); + snapshot_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + Ok(CrosswordReplica { id, population, @@ -1761,17 +2118,21 @@ impl GenericReplica for CrosswordReplica { external_api, state_machine, storage_hub, + snapshot_hub, transport_hub, hb_hear_timer: Timer::new(), hb_send_interval, is_leader: false, insts: vec![], + start_slot: 0, + 
snapshot_interval, bal_prep_sent: 0, bal_prepared: 0, bal_max_seen: 0, commit_bar: 0, exec_bar: 0, log_offset: 0, + snap_offset: 0, rs_coder, }) } @@ -1780,7 +2141,10 @@ impl GenericReplica for CrosswordReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { - // recover state from durable storage log + // recover state from durable snapshot file + self.recover_from_snapshot().await?; + + // recover the tail-piece memory log & state from durable storage log self.recover_from_log().await?; // kick off leader activity hearing timer @@ -1853,6 +2217,18 @@ impl GenericReplica for CrosswordReplica { } }, + // autonomous snapshot taking timeout + _ = self.snapshot_interval.tick(), if !paused + && self.config.snapshot_interval_s > 0 => { + if let Err(e) = self.take_new_snapshot().await { + pf_error!(self.id; "error taking a new snapshot: {}", e); + } else { + self.control_hub.send_ctrl( + CtrlMsg::SnapshotUpTo { new_start: self.start_slot } + )?; + } + }, + // manager control message ctrl_msg = self.control_hub.recv_ctrl() => { if let Err(e) = ctrl_msg { @@ -1863,10 +2239,6 @@ impl GenericReplica for CrosswordReplica { match self.handle_ctrl_msg(ctrl_msg, &mut paused).await { Ok(terminate) => { if let Some(restart) = terminate { - pf_warn!( - self.id; - "server got {} req", - if restart { "restart" } else { "shutdown" }); return Ok(restart); } }, From 70fb71f0853e2e0feacd9ce109ed945181534bdc Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 24 Sep 2023 23:16:17 -0500 Subject: [PATCH 59/89] finish implementation of snapshotting --- src/manager/clusman.rs | 2 +- src/protocols/crossword.rs | 14 +++++++++----- src/protocols/multipaxos.rs | 14 +++++++++----- src/protocols/rs_paxos.rs | 14 +++++++++----- src/server/transport.rs | 29 ++++++++++++++++------------- 5 files changed, 44 insertions(+), 29 deletions(-) diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index f1a496c9..a21ef9c7 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -360,7 +360,7 @@ impl ClusterManager { } // wait a while to ensure the server's transport hub is setup - time::sleep(Duration::from_millis(300)).await; + time::sleep(Duration::from_millis(500)).await; reset_done.insert(s); } diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index f8dda64d..3964c9bf 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -233,7 +233,7 @@ enum PeerMsg { }, /// Leader activity heartbeat. - Heartbeat { ballot: Ballot }, + Heartbeat { ballot: Ballot, exec_bar: usize }, } /// Crossword server replica module. 
@@ -1281,7 +1281,9 @@ impl CrosswordReplica { ballot, reqs_cw, } => self.handle_msg_reconstruct_reply(peer, slot, ballot, reqs_cw), - PeerMsg::Heartbeat { ballot } => self.heard_heartbeat(peer, ballot), + PeerMsg::Heartbeat { ballot, exec_bar } => { + self.heard_heartbeat(peer, ballot, exec_bar) + } } } @@ -1428,10 +1430,11 @@ impl CrosswordReplica { self.transport_hub.bcast_msg( PeerMsg::Heartbeat { ballot: self.bal_prep_sent, + exec_bar: self.exec_bar, }, None, )?; - self.heard_heartbeat(self.id, self.bal_prep_sent)?; + self.heard_heartbeat(self.id, self.bal_prep_sent, self.exec_bar)?; // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent); Ok(()) @@ -1457,9 +1460,10 @@ impl CrosswordReplica { &mut self, _peer: ReplicaId, ballot: Ballot, + exec_bar: usize, ) -> Result<(), SummersetError> { - // ignore outdated hearbeat - if ballot < self.bal_max_seen { + // ignore outdated heartbeats and those from peers with exec_bar < mine + if ballot < self.bal_max_seen || exec_bar < self.exec_bar { return Ok(()); } diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index d9d5cd76..00e5f964 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -214,7 +214,7 @@ enum PeerMsg { Commit { slot: usize }, /// Leader activity heartbeat. - Heartbeat { ballot: Ballot }, + Heartbeat { ballot: Ballot, exec_bar: usize }, } /// MultiPaxos server replica module. @@ -963,7 +963,9 @@ impl MultiPaxosReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), - PeerMsg::Heartbeat { ballot } => self.heard_heartbeat(peer, ballot), + PeerMsg::Heartbeat { ballot, exec_bar } => { + self.heard_heartbeat(peer, ballot, exec_bar) + } } } @@ -1098,10 +1100,11 @@ impl MultiPaxosReplica { self.transport_hub.bcast_msg( PeerMsg::Heartbeat { ballot: self.bal_prep_sent, + exec_bar: self.exec_bar, }, None, )?; - self.heard_heartbeat(self.id, self.bal_prep_sent)?; + self.heard_heartbeat(self.id, self.bal_prep_sent, self.exec_bar)?; // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent); Ok(()) @@ -1127,9 +1130,10 @@ impl MultiPaxosReplica { &mut self, _peer: ReplicaId, ballot: Ballot, + exec_bar: usize, ) -> Result<(), SummersetError> { - // ignore outdated hearbeat - if ballot < self.bal_max_seen { + // ignore outdated heartbeats and those from peers with exec_bar < mine + if ballot < self.bal_max_seen || exec_bar < self.exec_bar { return Ok(()); } diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 4aa5a81f..82990870 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -226,7 +226,7 @@ enum PeerMsg { }, /// Leader activity heartbeat. - Heartbeat { ballot: Ballot }, + Heartbeat { ballot: Ballot, exec_bar: usize }, } /// RSPaxos server replica module. 
@@ -1180,7 +1180,9 @@ impl RSPaxosReplica { ballot, reqs_cw, } => self.handle_msg_reconstruct_reply(peer, slot, ballot, reqs_cw), - PeerMsg::Heartbeat { ballot } => self.heard_heartbeat(peer, ballot), + PeerMsg::Heartbeat { ballot, exec_bar } => { + self.heard_heartbeat(peer, ballot, exec_bar) + } } } @@ -1327,10 +1329,11 @@ impl RSPaxosReplica { self.transport_hub.bcast_msg( PeerMsg::Heartbeat { ballot: self.bal_prep_sent, + exec_bar: self.exec_bar, }, None, )?; - self.heard_heartbeat(self.id, self.bal_prep_sent)?; + self.heard_heartbeat(self.id, self.bal_prep_sent, self.exec_bar)?; // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent); Ok(()) @@ -1356,9 +1359,10 @@ impl RSPaxosReplica { &mut self, _peer: ReplicaId, ballot: Ballot, + exec_bar: usize, ) -> Result<(), SummersetError> { - // ignore outdated hearbeat - if ballot < self.bal_max_seen { + // ignore outdated heartbeats and those from peers with exec_bar < mine + if ballot < self.bal_max_seen || exec_bar < self.exec_bar { return Ok(()); } diff --git a/src/server/transport.rs b/src/server/transport.rs index 699697d8..ba0e1e8b 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -295,8 +295,9 @@ where /// Broadcasts leave notifications to all peers and waits for replies. pub async fn leave(&mut self) -> Result<(), SummersetError> { - let tx_sends_guard = self.tx_sends.guard(); + #[allow(unused_variables)] let mut num_peers = 0; + let tx_sends_guard = self.tx_sends.guard(); for &peer in tx_sends_guard.keys() { if peer == self.me { continue; @@ -311,18 +312,20 @@ where num_peers += 1; } - let mut replies = Bitmap::new(self.population, false); - while replies.count() < num_peers { - match self.rx_recv.recv().await { - Some((id, peer_msg)) => match peer_msg { - PeerMessage::LeaveReply => replies.set(id, true)?, - _ => continue, // ignore all other types of messages - }, - None => { - return logged_err!(self.me; "recv channel has been closed"); - } - } - } + // NOTE: commenting out the following to avoid rare blocking during + // tester resets + // let mut replies = Bitmap::new(self.population, false); + // while replies.count() < num_peers { + // match self.rx_recv.recv().await { + // Some((id, peer_msg)) => match peer_msg { + // PeerMessage::LeaveReply => replies.set(id, true)?, + // _ => continue, // ignore all other types of messages + // }, + // None => { + // return logged_err!(self.me; "recv channel has been closed"); + // } + // } + // } Ok(()) } From dee1ef06e1ff872eb67391dac9b9d3c3ca0faa37 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Mon, 25 Sep 2023 12:49:01 -0500 Subject: [PATCH 60/89] exclude unwanted shards in Reconstruct --- src/protocols/crossword.rs | 39 +++++++++++++++++++++++++++++++------- src/utils/bitmap.rs | 14 ++++++++++++++ 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 3964c9bf..0cd89f2e 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -223,7 +223,7 @@ enum PeerMsg { Commit { slot: usize }, /// Reconstruction read from new leader to replicas. - Reconstruct { slot: usize }, + Reconstruct { slot: usize, exclude: Vec }, /// Reconstruction read reply from replica to leader. 
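    /// Carries only shards the requester is still missing: the requester lists the
    /// shards it already holds in `exclude`, and the responder replies with whatever
    /// it holds outside that set (see handle_msg_reconstruct's subset_copy).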
ReconstructReply { @@ -1147,6 +1147,7 @@ impl CrosswordReplica { &mut self, peer: ReplicaId, slot: usize, + exclude: Vec, ) -> Result<(), SummersetError> { if slot < self.start_slot { return Ok(()); // ignore if slot index outdated @@ -1160,7 +1161,13 @@ impl CrosswordReplica { let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications; also ignore if I have nothing to send back - if inst.status < Status::Accepting || inst.reqs_cw.avail_shards() == 0 { + if inst.status < Status::Accepting { + return Ok(()); + } + let mut subset = Bitmap::from(inst.reqs_cw.num_shards(), exclude); + subset.flip(); // exclude unwanted shards the sender already has + let reply_cw = inst.reqs_cw.subset_copy(subset, false)?; + if reply_cw.avail_shards() == 0 { return Ok(()); } @@ -1169,7 +1176,7 @@ impl CrosswordReplica { PeerMsg::ReconstructReply { slot, ballot: inst.bal, - reqs_cw: inst.reqs_cw.clone(), + reqs_cw: reply_cw.clone(), }, peer, )?; @@ -1273,8 +1280,8 @@ impl CrosswordReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), - PeerMsg::Reconstruct { slot } => { - self.handle_msg_reconstruct(peer, slot) + PeerMsg::Reconstruct { slot, exclude } => { + self.handle_msg_reconstruct(peer, slot, exclude) } PeerMsg::ReconstructReply { slot, @@ -1415,8 +1422,26 @@ impl CrosswordReplica { if inst.status == Status::Committed && inst.reqs_cw.avail_shards() < self.quorum_cnt { - self.transport_hub - .bcast_msg(PeerMsg::Reconstruct { slot }, None)?; + self.transport_hub.bcast_msg( + PeerMsg::Reconstruct { + slot, + exclude: inst + .reqs_cw + .avail_shards_map() + .iter() + .filter_map( + |(idx, flag)| { + if flag { + Some(idx) + } else { + None + } + }, + ) + .collect(), + }, + None, + )?; pf_trace!(self.id; "broadcast Reconstruct messages for slot {} bal {} shards {:?}", slot, inst.bal, inst.reqs_cw.avail_shards_map()); } diff --git a/src/utils/bitmap.rs b/src/utils/bitmap.rs index d5fe9c8e..592ddb0c 100644 --- a/src/utils/bitmap.rs +++ b/src/utils/bitmap.rs @@ -71,6 +71,12 @@ impl Bitmap { self.0.count_ones(..) as u8 } + /// Flips all flags in the bitmap. + #[inline] + pub fn flip(&mut self) { + self.0.toggle_range(..) + } + /// Allows `for (id, bit) in map.iter()`. #[inline] pub fn iter(&self) -> BitmapIter { @@ -143,6 +149,14 @@ mod bitmap_tests { assert!(map.get(7).is_err()); } + #[test] + fn bitmap_flip() { + let mut map = Bitmap::new(5, false); + assert!(map.set(1, true).is_ok()); + map.flip(); + assert_eq!(map, Bitmap::from(5, vec![0, 2, 3, 4])); + } + #[test] fn bitmap_count() { let mut map = Bitmap::new(7, false); From 9fce9cb6daa7876e0308a32a5f66bb3fb3850233 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Mon, 25 Sep 2023 15:01:43 -0500 Subject: [PATCH 61/89] add primitive follower gossiping --- src/protocols/crossword.rs | 122 ++++++++++++++++++++++++++++++++++- src/protocols/multipaxos.rs | 26 +++++++- src/protocols/rep_nothing.rs | 16 +++++ src/protocols/rs_paxos.rs | 26 +++++++- src/protocols/simple_push.rs | 19 ++++++ 5 files changed, 204 insertions(+), 5 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 0cd89f2e..95f044c1 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -48,7 +48,6 @@ pub struct ReplicaConfigCrossword { /// Min timeout of not hearing any heartbeat from leader in millisecs. pub hb_hear_timeout_min: u64, - /// Max timeout of not hearing any heartbeat from leader in millisecs. 
pub hb_hear_timeout_max: u64, @@ -62,6 +61,11 @@ pub struct ReplicaConfigCrossword { /// snapshotting autonomously. pub snapshot_interval_s: u64, + /// Min timeout of follower gossiping trigger in millisecs. + pub gossip_timeout_min: u64, + /// Max timeout of follower gossiping trigger in millisecs. + pub gossip_timeout_max: u64, + /// Fault-tolerance level. pub fault_tolerance: u8, @@ -89,6 +93,8 @@ impl Default for ReplicaConfigCrossword { hb_send_interval_ms: 50, snapshot_path: "/tmp/summerset.rs_paxos.snap".into(), snapshot_interval_s: 0, + gossip_timeout_min: 100, + gossip_timeout_max: 300, fault_tolerance: 0, shards_per_replica: 1, perf_storage_a: 0, @@ -292,6 +298,9 @@ pub struct CrosswordReplica { /// Timer for taking a new autonomous snapshot. snapshot_interval: Interval, + /// Titer for trigger follower gossiping. + gossip_timer: Timer, + /// Largest ballot number that a leader has sent Prepare messages in. bal_prep_sent: Ballot, @@ -318,6 +327,7 @@ pub struct CrosswordReplica { rs_coder: ReedSolomon, } +// CrosswordReplica common helpers impl CrosswordReplica { /// Create an empty null instance. fn null_instance(&self) -> Result { @@ -442,7 +452,10 @@ impl CrosswordReplica { min_coverage } +} +// CrosswordReplica client requests entrance +impl CrosswordReplica { /// Handler of client request batch chan recv. fn handle_req_batch( &mut self, @@ -619,7 +632,10 @@ impl CrosswordReplica { Ok(()) } +} +// CrosswordReplica durable WAL logging +impl CrosswordReplica { /// Handler of PrepareBal logging result chan recv. fn handle_logged_prepare_bal( &mut self, @@ -789,7 +805,10 @@ impl CrosswordReplica { } } } +} +// CrosswordReplica peer-peer messages handling +impl CrosswordReplica { /// Handler of Prepare message from leader. fn handle_msg_prepare( &mut self, @@ -1293,7 +1312,10 @@ impl CrosswordReplica { } } } +} +// CrosswordReplica state machine execution +impl CrosswordReplica { /// Handler of state machine exec result chan recv. fn handle_cmd_result( &mut self, @@ -1352,7 +1374,10 @@ impl CrosswordReplica { Ok(()) } +} +// CrosswordReplica leadership related logic +impl CrosswordReplica { /// Becomes a leader, sends self-initiated Prepare messages to followers /// for all in-progress instances, and starts broadcasting heartbeats. fn become_a_leader(&mut self) -> Result<(), SummersetError> { @@ -1506,7 +1531,80 @@ impl CrosswordReplica { // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); Ok(()) } +} + +// CrosswordReplica follower gossiping +impl CrosswordReplica { + /// Chooses a random gossip_timeout from the min-max range and kicks off + /// the gossip_timer. + fn kickoff_gossip_timer(&mut self) -> Result<(), SummersetError> { + let timeout_ms = thread_rng().gen_range( + self.config.gossip_timeout_min..=self.config.gossip_timeout_max, + ); + + // pf_trace!(self.id; "kickoff gossip_timer @ {} ms", timeout_ms); + self.gossip_timer + .kickoff(Duration::from_millis(timeout_ms))?; + Ok(()) + } + /// Triggers gossiping for my missing shards in committed but not-yet- + /// executed instances: fetch missing shards from peers, preferring + /// follower peers that hold data shards. + fn trigger_gossiping(&mut self) -> Result<(), SummersetError> { + // TODO: want cleverer design than this! 
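+        // Primitive strategy: walk slots from exec_bar up to the log tail; for each
+        // instance that is committed but not yet executed and still lacks enough
+        // shards (avail_shards < quorum_cnt), broadcast Reconstruct to every peer
+        // except the leader that originally replicated the instance to me, listing
+        // my already-held shards in `exclude` so peers only send back what is missing.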
+ let mut slot_up_to = self.exec_bar; + for slot in self.exec_bar..(self.start_slot + self.insts.len()) { + slot_up_to = slot; + let inst = &self.insts[slot - self.start_slot]; + if inst.status >= Status::Executed { + continue; + } else if inst.status < Status::Committed { + break; + } + + if inst.reqs_cw.avail_shards() < self.quorum_cnt { + let mut target = Bitmap::new(self.population, true); + if let Some(ReplicaBookkeeping { source }) = inst.replica_bk { + // skip leader who initially replicated this instance to me + target.set(source, false)?; + } + self.transport_hub.bcast_msg( + PeerMsg::Reconstruct { + slot, + exclude: inst + .reqs_cw + .avail_shards_map() + .iter() + .filter_map( + |(idx, flag)| { + if flag { + Some(idx) + } else { + None + } + }, + ) + .collect(), + }, + Some(target), + )?; + } + } + + // reset gossip trigger timer + self.kickoff_gossip_timer()?; + + if slot_up_to > self.exec_bar { + pf_debug!(self.id; "triggered gossiping: slots {} - {}", + self.exec_bar, slot_up_to); + } + Ok(()) + } +} + +// CrosswordReplica control messages handling +impl CrosswordReplica { /// Handler of ResetState control message. async fn handle_ctrl_reset_state( &mut self, @@ -1618,7 +1716,10 @@ impl CrosswordReplica { _ => Ok(None), // ignore all other types } } +} +// CrosswordReplica recovery from WAL log +impl CrosswordReplica { /// Apply a durable storage log entry for recovery. async fn recover_apply_entry( &mut self, @@ -1767,7 +1868,10 @@ impl CrosswordReplica { logged_err!(self.id; "unexpected log result type or failed truncate") } } +} +// CrosswordReplica snapshotting & GC logic +impl CrosswordReplica { /// Dump a new key-value pair to snapshot file. async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { // collect all key-value pairs put up to exec_bar @@ -1860,12 +1964,12 @@ impl CrosswordReplica { /// NOTE: the current implementation does not guard against crashes in the /// middle of taking a snapshot. 
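    /// Taking a snapshot dumps the key-value pairs applied by executed instances
    /// (everything up to exec_bar) into the snapshot file; the main loop then
    /// reports the updated start_slot to the manager via CtrlMsg::SnapshotUpTo.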
async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { - pf_debug!(self.id; "taking new snapshot: start {} exec {}", - self.start_slot, self.exec_bar); assert!(self.exec_bar >= self.start_slot); if self.exec_bar == self.start_slot { return Ok(()); } + pf_debug!(self.id; "taking new snapshot: start {} exec {}", + self.start_slot, self.exec_bar); // collect and dump all Puts in executed instances if self.is_leader { @@ -2004,6 +2108,7 @@ impl GenericReplica for CrosswordReplica { hb_hear_timeout_min, hb_hear_timeout_max, hb_send_interval_ms, snapshot_path, snapshot_interval_s, + gossip_timeout_min, gossip_timeout_max, fault_tolerance, shards_per_replica, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; @@ -2155,6 +2260,7 @@ impl GenericReplica for CrosswordReplica { insts: vec![], start_slot: 0, snapshot_interval, + gossip_timer: Timer::new(), bal_prep_sent: 0, bal_prepared: 0, bal_max_seen: 0, @@ -2179,6 +2285,9 @@ impl GenericReplica for CrosswordReplica { // kick off leader activity hearing timer self.kickoff_hb_hear_timer()?; + // kick off follower gossiping trigger timer + self.kickoff_gossip_timer()?; + // main event loop let mut paused = false; loop { @@ -2258,6 +2367,13 @@ impl GenericReplica for CrosswordReplica { } }, + // follower gossiping trigger + _ = self.gossip_timer.timeout(), if !paused && !self.is_leader => { + if let Err(e) = self.trigger_gossiping() { + pf_error!(self.id; "error triggering gossiping: {}", e); + } + }, + // manager control message ctrl_msg = self.control_hub.recv_ctrl() => { if let Err(e) = ctrl_msg { diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 00e5f964..4783f4f7 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -49,7 +49,6 @@ pub struct ReplicaConfigMultiPaxos { /// Min timeout of not hearing any heartbeat from leader in millisecs. pub hb_hear_timeout_min: u64, - /// Max timeout of not hearing any heartbeat from leader in millisecs. pub hb_hear_timeout_max: u64, @@ -296,6 +295,7 @@ pub struct MultiPaxosReplica { snap_offset: usize, } +// MultiPaxosReplica common helpers impl MultiPaxosReplica { /// Create an empty null instance. fn null_instance(&self) -> Instance { @@ -359,7 +359,10 @@ impl MultiPaxosReplica { let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; (slot, cmd_idx) } +} +// MultiPaxosReplica client requests entrance +impl MultiPaxosReplica { /// Handler of client request batch chan recv. fn handle_req_batch( &mut self, @@ -498,7 +501,10 @@ impl MultiPaxosReplica { Ok(()) } +} +// MultiPaxosReplica durable WAL logging +impl MultiPaxosReplica { /// Handler of PrepareBal logging result chan recv. fn handle_logged_prepare_bal( &mut self, @@ -657,7 +663,10 @@ impl MultiPaxosReplica { } } } +} +// MultiPaxosReplica peer-peer messages handling +impl MultiPaxosReplica { /// Handler of Prepare message from leader. fn handle_msg_prepare( &mut self, @@ -968,7 +977,10 @@ impl MultiPaxosReplica { } } } +} +// MultiPaxosReplica state machine execution +impl MultiPaxosReplica { /// Handler of state machine exec result chan recv. fn handle_cmd_result( &mut self, @@ -1026,7 +1038,10 @@ impl MultiPaxosReplica { Ok(()) } +} +// MultiPaxosReplica leadership related logic +impl MultiPaxosReplica { /// Becomes a leader, sends self-initiated Prepare messages to followers /// for all in-progress instances, and starts broadcasting heartbeats. 
fn become_a_leader(&mut self) -> Result<(), SummersetError> { @@ -1151,7 +1166,10 @@ impl MultiPaxosReplica { // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); Ok(()) } +} +// MultiPaxosReplica control messages handling +impl MultiPaxosReplica { /// Handler of ResetState control message. async fn handle_ctrl_reset_state( &mut self, @@ -1263,7 +1281,10 @@ impl MultiPaxosReplica { _ => Ok(None), // ignore all other types } } +} +// MultiPaxosReplica recovery from WAL log +impl MultiPaxosReplica { /// Apply a durable storage log entry for recovery. async fn recover_apply_entry( &mut self, @@ -1397,7 +1418,10 @@ impl MultiPaxosReplica { logged_err!(self.id; "unexpected log result type or failed truncate") } } +} +// MultiPaxosReplica snapshotting & GC logic +impl MultiPaxosReplica { /// Dump a new key-value pair to snapshot file. async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { // collect all key-value pairs put up to exec_bar diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index e5c6b0dd..74ea31a7 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -106,6 +106,7 @@ pub struct RepNothingReplica { log_offset: usize, } +// RepNothingReplica common helpers impl RepNothingReplica { /// Compose CommandId from instance index & command index within. fn make_command_id(inst_idx: usize, cmd_idx: usize) -> CommandId { @@ -120,7 +121,10 @@ impl RepNothingReplica { let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; (inst_idx, cmd_idx) } +} +// RepNothingReplica client requests entrance +impl RepNothingReplica { /// Handler of client request batch chan recv. fn handle_req_batch( &mut self, @@ -149,7 +153,10 @@ impl RepNothingReplica { Ok(()) } +} +// RepNothingReplica durable WAL logging +impl RepNothingReplica { /// Handler of durable logging result chan recv. fn handle_log_result( &mut self, @@ -190,7 +197,10 @@ impl RepNothingReplica { Ok(()) } +} +// RepNothingReplica state machine execution +impl RepNothingReplica { /// Handler of state machine exec result chan recv. fn handle_cmd_result( &mut self, @@ -236,7 +246,10 @@ impl RepNothingReplica { Ok(()) } +} +// RepNothingReplica control messages handling +impl RepNothingReplica { /// Handler of ResetState control message. async fn handle_ctrl_reset_state( &mut self, @@ -321,7 +334,10 @@ impl RepNothingReplica { _ => Ok(None), // ignore all other types } } +} +// RepNothingReplica recovery from WAL log +impl RepNothingReplica { /// Recover state from durable storage log. async fn recover_from_log(&mut self) -> Result<(), SummersetError> { assert_eq!(self.log_offset, 0); diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 82990870..5d2fb450 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -47,7 +47,6 @@ pub struct ReplicaConfigRSPaxos { /// Min timeout of not hearing any heartbeat from leader in millisecs. pub hb_hear_timeout_min: u64, - /// Max timeout of not hearing any heartbeat from leader in millisecs. pub hb_hear_timeout_max: u64, @@ -311,6 +310,7 @@ pub struct RSPaxosReplica { rs_coder: ReedSolomon, } +// RSPaxosReplica common helpers impl RSPaxosReplica { /// Create an empty null instance. fn null_instance(&self) -> Result { @@ -383,7 +383,10 @@ impl RSPaxosReplica { let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; (slot, cmd_idx) } +} +// RSPaxosReplica client requests entrance +impl RSPaxosReplica { /// Handler of client request batch chan recv. 
fn handle_req_batch( &mut self, @@ -545,7 +548,10 @@ impl RSPaxosReplica { Ok(()) } +} +// RSPaxosReplica durable WAL logging +impl RSPaxosReplica { /// Handler of PrepareBal logging result chan recv. fn handle_logged_prepare_bal( &mut self, @@ -715,7 +721,10 @@ impl RSPaxosReplica { } } } +} +// RSPaxosReplica peer-peer messages handling +impl RSPaxosReplica { /// Handler of Prepare message from leader. fn handle_msg_prepare( &mut self, @@ -1185,7 +1194,10 @@ impl RSPaxosReplica { } } } +} +// RSPaxosReplica state machine execution +impl RSPaxosReplica { /// Handler of state machine exec result chan recv. fn handle_cmd_result( &mut self, @@ -1244,7 +1256,10 @@ impl RSPaxosReplica { Ok(()) } +} +// RSPaxosReplica leadership related logic +impl RSPaxosReplica { /// Becomes a leader, sends self-initiated Prepare messages to followers /// for all in-progress instances, and starts broadcasting heartbeats. fn become_a_leader(&mut self) -> Result<(), SummersetError> { @@ -1380,7 +1395,10 @@ impl RSPaxosReplica { // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); Ok(()) } +} +// RSPaxosReplica control messages handling +impl RSPaxosReplica { /// Handler of ResetState control message. async fn handle_ctrl_reset_state( &mut self, @@ -1492,7 +1510,10 @@ impl RSPaxosReplica { _ => Ok(None), // ignore all other types } } +} +// RSPaxosReplica recovery from WAL log +impl RSPaxosReplica { /// Apply a durable storage log entry for recovery. async fn recover_apply_entry( &mut self, @@ -1641,7 +1662,10 @@ impl RSPaxosReplica { logged_err!(self.id; "unexpected log result type or failed truncate") } } +} +// RSPaxosReplica snapshotting & GC logic +impl RSPaxosReplica { /// Dump a new key-value pair to snapshot file. async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { // collect all key-value pairs put up to exec_bar diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index ce89c7d1..c6a283c4 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -138,6 +138,7 @@ pub struct SimplePushReplica { log_offset: usize, } +// SimplePushReplica common helpers impl SimplePushReplica { /// Compose CommandId from instance index & command index within. fn make_command_id(inst_idx: usize, cmd_idx: usize) -> CommandId { @@ -152,7 +153,10 @@ impl SimplePushReplica { let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; (inst_idx, cmd_idx) } +} +// SimplePushReplica client requests entrance +impl SimplePushReplica { /// Handler of client request batch chan recv. fn handle_req_batch( &mut self, @@ -208,7 +212,10 @@ impl SimplePushReplica { Ok(()) } +} +// SimplePushReplica durable WAL logging +impl SimplePushReplica { /// Handler of durable logging result chan recv. fn handle_log_result( &mut self, @@ -265,7 +272,10 @@ impl SimplePushReplica { Ok(()) } +} +// SimplePushReplica peer-peer messages handling +impl SimplePushReplica { /// Handler of push message from peer. fn handle_push_msg( &mut self, @@ -346,7 +356,10 @@ impl SimplePushReplica { Ok(()) } +} +// SimplePushReplica state machine execution +impl SimplePushReplica { /// Handler of state machine exec result chan recv. fn handle_cmd_result( &mut self, @@ -398,7 +411,10 @@ impl SimplePushReplica { Ok(()) } +} +// SimplePushReplica control messages handling +impl SimplePushReplica { /// Handler of ResetState control message. 
async fn handle_ctrl_reset_state( &mut self, @@ -486,7 +502,10 @@ impl SimplePushReplica { _ => Ok(None), // ignore all other types } } +} +// SimplePushReplica recovery from WAL log +impl SimplePushReplica { /// Recover state from durable storage log. async fn recover_from_log(&mut self) -> Result<(), SummersetError> { assert_eq!(self.log_offset, 0); From 59666514e62bf07f7a3d2d3f6abc0eee98064187 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Mon, 25 Sep 2023 19:07:51 -0500 Subject: [PATCH 62/89] add perf sim skeleton --- perf_sim/perf_sim.py | 70 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 perf_sim/perf_sim.py diff --git a/perf_sim/perf_sim.py b/perf_sim/perf_sim.py new file mode 100644 index 00000000..c2cbc40f --- /dev/null +++ b/perf_sim/perf_sim.py @@ -0,0 +1,70 @@ +import simpy + + +class Data: + def __init__(self, mark, size): + self.mark = mark + self.size = size + + +class NetLink: + def __init__(self, env, a, b): + self.env = env + self.a = a + self.b = b + self.store = simpy.Store(env) + + def delay(self, data): + delay = self.a + self.b * data.size + yield self.env.timeout(delay) + self.store.put(data) + + def send(self, data): + self.env.process(self.delay(data)) + + def recv(self): + return self.store.get() + + +class DiskDev: + def __init__(self, env, a, b): + self.env = env + self.a = a + self.b = b + + def delay(self, data): + delay = self.a + self.b * data.size + yield self.env.timeout(delay) + + def save(self, data): + self.env.process(self.delay(data)) + + +class Replica: + def __init__(self, env, is_leader, disk_a, disk_b): + self.env = env + self.is_leader = is_leader + self.disk_dev = DiskDev(env, disk_a, disk_b) + self.peers = dict() + self.net_links = dict() + + def add_peer(self, name, peer, net_a, net_b): + self.peers[name] = peer + self.net_links[name] = NetLink(self.env, net_a, net_b) + + def run(self): + while True: + yield self.env.timeout(3) + print(f"req") + yield self.disk_dev.save(Data("d", 2)) + print(f"saved") + + +class Cluster: + def __init__(self, num_replicas): + pass + + +if __name__ == "__main__": + env = simpy.Environment() + env.run(until=15) From 6e341671003338ce58cb8b34639fe406ff20e594 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 28 Sep 2023 18:37:36 -0500 Subject: [PATCH 63/89] add constraint boundary figure script --- perf_sim/perf_sim.py | 2 +- perf_sim/plot_cstr_bounds.py | 189 +++++++++++++++++++++++++++++++++++ 2 files changed, 190 insertions(+), 1 deletion(-) create mode 100644 perf_sim/plot_cstr_bounds.py diff --git a/perf_sim/perf_sim.py b/perf_sim/perf_sim.py index c2cbc40f..40d85d00 100644 --- a/perf_sim/perf_sim.py +++ b/perf_sim/perf_sim.py @@ -1,4 +1,4 @@ -import simpy +import simpy # type: ignore class Data: diff --git a/perf_sim/plot_cstr_bounds.py b/perf_sim/plot_cstr_bounds.py new file mode 100644 index 00000000..091f6f70 --- /dev/null +++ b/perf_sim/plot_cstr_bounds.py @@ -0,0 +1,189 @@ +import matplotlib # type: ignore + +matplotlib.use("Agg") + +import matplotlib.pyplot as plt # type: ignore +import matplotlib.patches as mpatches # type: ignore +from matplotlib.legend_handler import HandlerPatch # type: ignore +import math + + +SUBPLOT_ARG = lambda idx: 141 + idx + +CLUSTER_SIZES = [3, 5, 7, 9] +SIZE_COLOR_MAP = { + 3: ("orange", "bisque"), + 5: ("seagreen", "lightgreen"), + 7: ("steelblue", "skyblue"), + 9: ("chocolate", "mistyrose"), +} + +X_TICKS = list(range(1, 10)) +Y_TICKS = list(range(1, 6)) + + +def plot_cstr_bound(idx, cluster_size): + ax = 
plt.subplot(SUBPLOT_ARG(idx)) + + n = cluster_size + f = n // 2 + m = n - f + + line_color, fill_color = SIZE_COLOR_MAP[cluster_size] + + # Classic Paxos/Raft point + plt.scatter( + m, m, marker="D", s=100, color="dimgray", label="Classic Paxos/Raft", zorder=10 + ) + + # CRaft point + craft_q = math.ceil((n + m) / 2) + plt.scatter( + craft_q, + 1, + marker="X", + s=100, + color="lightcoral", + label="RS-Paxos/CRaft", + zorder=10, + ) + + # boundary lines + xs = [x for x in range(m, n + 1)] + ys = [x for x in range(m, 0, -1)] + plt.plot( + xs, + ys, + linewidth=2, + marker="o", + markersize=7, + color=line_color, + label="Crossword configs", + zorder=20, + ) + plt.vlines(m, ymin=m, ymax=m + 1.5, linestyles="--", color=line_color, zorder=20) + plt.vlines(n, ymin=1, ymax=2.5, linestyles="--", color=line_color, zorder=20) + + # correct region + xs = [m, m, n, n] + ys = [m, m + 1, 2, 1] + plt.fill(xs, ys, color=fill_color, label="Correct region", zorder=0) + + # latency & throughput optimized arrows + plt.arrow( + m + 0.3, + m + 1.7, + -0.9, + 0.9, + linewidth=1, + color="dimgray", + length_includes_head=True, + head_width=0.3, + overhang=0.5, + label="Tradeoff decisions", + ) + plt.text( + m + 0.18 if n == 3 else m + 0.5 if n == 9 else m + 0.4, + m + 2.78 if n == 3 else m + 2.0 if n == 9 else m + 2.4, + "Lat.\noptim.", + horizontalalignment="left", + verticalalignment="center", + color="dimgray", + ) + plt.arrow( + n - 0.3, + 3.3, + 0.9, + -0.9, + linewidth=1, + color="dimgray", + length_includes_head=True, + head_width=0.3, + overhang=0.5, + ) + plt.text( + n + 0.8 if n == 3 else n + 0.0 if n == 9 else n + 0.4, + 1 + 1.5 if n == 3 else 1 + 2.9 if n == 9 else 1 + 2.6, + "Tput.\noptim.", + horizontalalignment="left", + verticalalignment="center", + color="dimgray", + ) + + plt.axis("scaled") + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + + plt.xlim((0, X_TICKS[-1] + 0.7)) + plt.ylim((0, Y_TICKS[-1] + 2.7)) + plt.xticks(X_TICKS, list(map(str, X_TICKS))) + plt.yticks(Y_TICKS, list(map(str, Y_TICKS))) + + plt.xlabel("|Quorum|", loc="right") + plt.ylabel("#Shards\n/replica", loc="top", rotation=0, backgroundcolor="white") + ax.xaxis.set_label_coords(1.05, -0.18) + ax.yaxis.set_label_coords(0.2, 0.8) + + plt.title( + f"|Cluster|={n} f={f}", + x=0.27, + y=-0.38, + fontsize=10, + fontweight="bold", + backgroundcolor=fill_color, + ) + + return ax + + +def plot_all_cstr_bounds(): + matplotlib.rcParams.update( + { + "figure.figsize": (10, 3), + "font.size": 10, + "axes.axisbelow": False, + } + ) + fig = plt.figure() + + handles, labels = None, None + for idx, cluster_size in enumerate(CLUSTER_SIZES): + ax = plot_cstr_bound(idx, cluster_size) + if idx == len(CLUSTER_SIZES) - 1: + handles, labels = ax.get_legend_handles_labels() + + def make_legend_arrow( + legend, orig_handle, xdescent, ydescent, width, height, fontsize + ): + return mpatches.FancyArrow( + 0, + 0.5 * height, + width, + 0, + linewidth=1, + color="dimgray", + length_includes_head=True, + head_width=0.75 * height, + overhang=0.3, + ) + + # single legend group on top + handles = handles[-2:] + handles[:-2] + labels = labels[-2:] + labels[:-2] + fig.legend( + handles, + labels, + loc="lower center", + bbox_to_anchor=(0.5, 0.78), + ncol=len(handles), + handlelength=1.5, + handletextpad=0.5, + handler_map={mpatches.FancyArrow: HandlerPatch(patch_func=make_legend_arrow)}, + ) + + plt.tight_layout() + plt.savefig(f"results/cstr_bounds.png", dpi=300) + + +if __name__ == "__main__": + plot_all_cstr_bounds() From 
4514ac611dcecb7b3bf76aedd21bf501783915a1 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 28 Sep 2023 22:41:43 -0500 Subject: [PATCH 64/89] update constraint boundary figure script --- perf_sim/plot_cstr_bounds.py | 107 +++++++++++++++++++++++------------ 1 file changed, 70 insertions(+), 37 deletions(-) diff --git a/perf_sim/plot_cstr_bounds.py b/perf_sim/plot_cstr_bounds.py index 091f6f70..a9c49cfc 100644 --- a/perf_sim/plot_cstr_bounds.py +++ b/perf_sim/plot_cstr_bounds.py @@ -2,6 +2,7 @@ matplotlib.use("Agg") +import numpy as np # type: ignore import matplotlib.pyplot as plt # type: ignore import matplotlib.patches as mpatches # type: ignore from matplotlib.legend_handler import HandlerPatch # type: ignore @@ -12,9 +13,9 @@ CLUSTER_SIZES = [3, 5, 7, 9] SIZE_COLOR_MAP = { - 3: ("orange", "bisque"), - 5: ("seagreen", "lightgreen"), - 7: ("steelblue", "skyblue"), + 3: ("seagreen", "palegreen"), + 5: ("orange", "bisque"), + 7: ("steelblue", "powderblue"), 9: ("chocolate", "mistyrose"), } @@ -33,7 +34,7 @@ def plot_cstr_bound(idx, cluster_size): # Classic Paxos/Raft point plt.scatter( - m, m, marker="D", s=100, color="dimgray", label="Classic Paxos/Raft", zorder=10 + m, m, marker="s", s=100, color="black", label="Classic Paxos/Raft", zorder=10 ) # CRaft point @@ -42,7 +43,7 @@ def plot_cstr_bound(idx, cluster_size): craft_q, 1, marker="X", - s=100, + s=110, color="lightcoral", label="RS-Paxos/CRaft", zorder=10, @@ -61,13 +62,13 @@ def plot_cstr_bound(idx, cluster_size): label="Crossword configs", zorder=20, ) - plt.vlines(m, ymin=m, ymax=m + 1.5, linestyles="--", color=line_color, zorder=20) - plt.vlines(n, ymin=1, ymax=2.5, linestyles="--", color=line_color, zorder=20) + plt.vlines(m, ymin=m, ymax=m + 1.5, linestyles="-", color=line_color, zorder=20) + plt.vlines(n, ymin=1, ymax=2.5, linestyles="-", color=line_color, zorder=20) # correct region xs = [m, m, n, n] ys = [m, m + 1, 2, 1] - plt.fill(xs, ys, color=fill_color, label="Correct region", zorder=0) + plt.fill(xs, ys, color=fill_color, label="Region of fault-tolerance=f", zorder=0) # latency & throughput optimized arrows plt.arrow( @@ -83,8 +84,8 @@ def plot_cstr_bound(idx, cluster_size): label="Tradeoff decisions", ) plt.text( - m + 0.18 if n == 3 else m + 0.5 if n == 9 else m + 0.4, - m + 2.78 if n == 3 else m + 2.0 if n == 9 else m + 2.4, + m + 0.18 if n <= 5 else m + 0.5 if n == 9 else m + 0.4, + m + 2.78 if n <= 5 else m + 2.0 if n == 9 else m + 2.4, "Lat.\noptim.", horizontalalignment="left", verticalalignment="center", @@ -102,8 +103,8 @@ def plot_cstr_bound(idx, cluster_size): overhang=0.5, ) plt.text( - n + 0.8 if n == 3 else n + 0.0 if n == 9 else n + 0.4, - 1 + 1.5 if n == 3 else 1 + 2.9 if n == 9 else 1 + 2.6, + n + 0.8 if n <= 5 else n + 0.0 if n == 9 else n + 0.4, + 1 + 1.5 if n <= 5 else 1 + 2.9 if n == 9 else 1 + 2.6, "Tput.\noptim.", horizontalalignment="left", verticalalignment="center", @@ -126,7 +127,7 @@ def plot_cstr_bound(idx, cluster_size): plt.title( f"|Cluster|={n} f={f}", - x=0.27, + x=0.3, y=-0.38, fontsize=10, fontweight="bold", @@ -136,22 +137,7 @@ def plot_cstr_bound(idx, cluster_size): return ax -def plot_all_cstr_bounds(): - matplotlib.rcParams.update( - { - "figure.figsize": (10, 3), - "font.size": 10, - "axes.axisbelow": False, - } - ) - fig = plt.figure() - - handles, labels = None, None - for idx, cluster_size in enumerate(CLUSTER_SIZES): - ax = plot_cstr_bound(idx, cluster_size) - if idx == len(CLUSTER_SIZES) - 1: - handles, labels = ax.get_legend_handles_labels() - +def make_legend(fig, 
handles, labels): def make_legend_arrow( legend, orig_handle, xdescent, ydescent, width, height, fontsize ): @@ -163,23 +149,70 @@ def make_legend_arrow( linewidth=1, color="dimgray", length_includes_head=True, - head_width=0.75 * height, + head_width=0.6 * height, overhang=0.3, ) - # single legend group on top - handles = handles[-2:] + handles[:-2] - labels = labels[-2:] + labels[:-2] - fig.legend( - handles, - labels, + def make_legend_polygon( + legend, orig_handle, xdescent, ydescent, width, height, fontsize + ): + return mpatches.Polygon( + xy=np.array( + [ + [0.2 * width, 0.5 * height], + [0.2 * width, 1.2 * height], + [0.8 * width, 0.5 * height], + [0.8 * width, -0.2 * height], + ] + ), + closed=True, + color="dimgray", + ) + + order = [] + for s in ("Classic", "RS-", "Crossword", "Region", "Tradeoff"): + for i, l in enumerate(labels): + if s in l: + order.append(i) + break + sorted_handles = [handles[i] for i in order] + sorted_labels = [labels[i] for i in order] + + leg = fig.legend( + sorted_handles, + sorted_labels, loc="lower center", bbox_to_anchor=(0.5, 0.78), ncol=len(handles), handlelength=1.5, handletextpad=0.5, - handler_map={mpatches.FancyArrow: HandlerPatch(patch_func=make_legend_arrow)}, + handler_map={ + mpatches.FancyArrow: HandlerPatch(patch_func=make_legend_arrow), + mpatches.Polygon: HandlerPatch(patch_func=make_legend_polygon), + }, + ) + for h in leg.legend_handles[2:]: + h.set_color("dimgray") + + +def plot_all_cstr_bounds(): + matplotlib.rcParams.update( + { + "figure.figsize": (10, 3), + "font.size": 10, + "axes.axisbelow": False, + } ) + fig = plt.figure() + + handles, labels = None, None + for idx, cluster_size in enumerate(CLUSTER_SIZES): + ax = plot_cstr_bound(idx, cluster_size) + if idx == len(CLUSTER_SIZES) - 1: + handles, labels = ax.get_legend_handles_labels() + + # single legend group on top + make_legend(fig, handles, labels) plt.tight_layout() plt.savefig(f"results/cstr_bounds.png", dpi=300) From 94906503a5b1a8f571ed51a39e60dde6ef67c237 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sat, 30 Sep 2023 07:57:27 -0500 Subject: [PATCH 65/89] rename perf_sim/ to models/ --- {perf_sim => models}/perf_sim.py | 0 {perf_sim => models}/plot_cstr_bounds.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {perf_sim => models}/perf_sim.py (100%) rename {perf_sim => models}/plot_cstr_bounds.py (100%) diff --git a/perf_sim/perf_sim.py b/models/perf_sim.py similarity index 100% rename from perf_sim/perf_sim.py rename to models/perf_sim.py diff --git a/perf_sim/plot_cstr_bounds.py b/models/plot_cstr_bounds.py similarity index 100% rename from perf_sim/plot_cstr_bounds.py rename to models/plot_cstr_bounds.py From ae8cb3bc1d381d315198bfe7dd4b39bc73ea5ab3 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sat, 30 Sep 2023 09:00:00 -0500 Subject: [PATCH 66/89] staging progress on perf sim --- models/perf_sim.py | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/models/perf_sim.py b/models/perf_sim.py index 40d85d00..3a776b1c 100644 --- a/models/perf_sim.py +++ b/models/perf_sim.py @@ -41,16 +41,15 @@ def save(self, data): class Replica: - def __init__(self, env, is_leader, disk_a, disk_b): + def __init__(self, env, rid, disk_ab): self.env = env - self.is_leader = is_leader - self.disk_dev = DiskDev(env, disk_a, disk_b) - self.peers = dict() + self.rid = rid + self.api = NetLink(env, 0, 0) + self.disk_dev = DiskDev(env, disk_ab[0], disk_ab[1]) self.net_links = dict() - def add_peer(self, name, peer, net_a, 
net_b): - self.peers[name] = peer - self.net_links[name] = NetLink(self.env, net_a, net_b) + def add_peer(self, rid, net_ab): + self.net_links[rid] = NetLink(self.env, net_ab[0], net_ab[1]) def run(self): while True: @@ -59,10 +58,30 @@ def run(self): yield self.disk_dev.save(Data("d", 2)) print(f"saved") + def req(self, data): + self.api.send(data) + class Cluster: - def __init__(self, num_replicas): - pass + def __init__(self, env, num_replicas, disk_perf_map, net_perf_map): + self.env = env + self.replicas = [ + Replica( + env, + rid, + disk_perf_map[rid], + ) + for rid in range(num_replicas) + ] + self.leader = self.replicas[0] + + for replica in self.replicas: + for rid in range(num_replicas): + if rid != replica.rid: + replica.add_peer( + rid, + net_perf_map[(replica.rid, rid)], + ) if __name__ == "__main__": From 94b8f94111aa868b950a16328ea90f6c796e506c Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sat, 30 Sep 2023 13:19:03 -0500 Subject: [PATCH 67/89] staging progress on perf sim --- models/perf_sim.py | 218 ++++++++++++++++++++++++++++++++++------- src/server/external.rs | 3 +- 2 files changed, 183 insertions(+), 38 deletions(-) diff --git a/models/perf_sim.py b/models/perf_sim.py index 3a776b1c..512b6ef5 100644 --- a/models/perf_sim.py +++ b/models/perf_sim.py @@ -1,4 +1,10 @@ import simpy # type: ignore +from enum import Enum # type: ignore + + +############## +# Data types # +############## class Data: @@ -6,82 +12,220 @@ def __init__(self, mark, size): self.mark = mark self.size = size + def __str__(self): + return f"<{self.mark};{self.size}>" + + +class Req(Data): + def __init__(self, mark, size): + super().__init__(mark, size) + + +class Batch(Data): + def __init__(self, mark, vec): + self.vec = vec + size = sum((data.size for data in vec)) + super().__init__(mark, size) + + +############### +# Event types # +############### + + +class EType(Enum): + NetRecved = 1 + DiskSaved = 2 + ApiBatch = 3 + + +class Event: + def __init__(self, enum, info, value): + self.enum = enum + self.info = info + self.value = value + + def __str__(self): + return f"{{{self.enum}|{self.info}|{self.value}}}" -class NetLink: - def __init__(self, env, a, b): + +class NetRecved(Event): + def __init__(self, src, msg): + super().__init__(EType.NetRecved, src, msg) + + +class DiskSaved(Event): + def __init__(self, mark): + super().__init__(EType.DiskSaved, None, mark) + + +class ApiBatch(Event): + def __init__(self, batch): + super().__init__(EType.ApiBatch, None, batch) + + +################### +# Component types # +################### + + +class Device: + def __init__(self, env, l, t, v): self.env = env - self.a = a - self.b = b - self.store = simpy.Store(env) + self.l = l + self.t = t + self.v = v # TODO: use this + self.pipe = simpy.Store(env) def delay(self, data): - delay = self.a + self.b * data.size + delay = self.l + self.t * data.size yield self.env.timeout(delay) - self.store.put(data) + self.pipe.put(data) - def send(self, data): - self.env.process(self.delay(data)) + +class NetLink(Device): + def __init__(self, env, l, t, v, src, dst): + self.src = src + self.dst = dst + super().__init__(env, l, t, v) + + def send(self, msg): + self.env.process(self.delay(msg)) def recv(self): - return self.store.get() + msg = yield self.pipe.get() + return NetRecved(self.src, msg) -class DiskDev: - def __init__(self, env, a, b): +class DiskDev(Device): + def __init__(self, env, l, t, v, rid): + self.rid = rid + super().__init__(env, l, t, v) + + def write(self, ent): + self.env.process(self.delay(ent)) + 
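+    # (model note) completion is observed through saved(): a DiskSaved event pops
+    # out l + t * size time units after write() was called (e.g. with l=1, t=1 and
+    # a 2-unit entry, 3 units later). `v` is reserved for adding variance to this
+    # delay (see the TODO in Device.__init__).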
+ def saved(self): + ent = yield self.pipe.get() + return DiskSaved(ent.mark) + + +class ExtlApi: + def __init__(self, env, b, rid): self.env = env - self.a = a self.b = b + self.rid = rid + self.mark = 0 + self.ibuf = [] + self.tick = simpy.Container() + + self.env.process(self.ticker()) + + def ticker(self): + while True: + yield self.env.timeout(self.b) + self.tick.put(1) + + def req(self, req): + self.ibuf.append(req) + + def batch(self): + yield self.tick.get(1) + self.tick.level = 0 + + batch = Batch(self.mark, self.ibuf) + self.mark += 1 + self.ibuf = [] + return ApiBatch(batch) - def delay(self, data): - delay = self.a + self.b * data.size - yield self.env.timeout(delay) - def save(self, data): - self.env.process(self.delay(data)) +##################### +# Replica & Cluster # +##################### class Replica: - def __init__(self, env, rid, disk_ab): + def __init__(self, env, rid, api_b, disk_ltv): self.env = env self.rid = rid - self.api = NetLink(env, 0, 0) - self.disk_dev = DiskDev(env, disk_ab[0], disk_ab[1]) - self.net_links = dict() - - def add_peer(self, rid, net_ab): - self.net_links[rid] = NetLink(self.env, net_ab[0], net_ab[1]) + self.extl_api = ExtlApi(env, api_b) + self.disk_dev = DiskDev(env, disk_ltv[0], disk_ltv[1], disk_ltv[2]) + self.send_links = dict() + self.recv_links = dict() + + def add_peer(self, peer, net_ltv): + s2p_link = NetLink( + self.env, net_ltv[0], net_ltv[1], net_ltv[2], self.rid, peer.rid + ) + p2s_link = NetLink( + self.env, net_ltv[0], net_ltv[1], net_ltv[2], peer.rid, self.rid + ) + self.send_links[peer.rid] = s2p_link + self.recv_links[peer.rid] = p2s_link + peer.send_links[self.rid] = p2s_link + peer.recv_links[self.rid] = s2p_link def run(self): while True: - yield self.env.timeout(3) - print(f"req") - yield self.disk_dev.save(Data("d", 2)) - print(f"saved") + events = [ + self.env.process(self.extl_api.batch()), + self.env.process(self.disk_dev.saved()), + ] + for link in self.recv_links: + events.append(self.env.process(link.recv())) + event = yield self.env.any_of(events) + + print(event) def req(self, data): - self.api.send(data) + self.extl_api.req(data) class Cluster: - def __init__(self, env, num_replicas, disk_perf_map, net_perf_map): + def __init__(self, env, num_replicas, api_b, disk_perf_map, net_perf_map): self.env = env self.replicas = [ Replica( env, rid, + api_b, disk_perf_map[rid], ) for rid in range(num_replicas) ] self.leader = self.replicas[0] + for rid, replica in enumerate(self.replicas): + for peerid in range(rid + 1, num_replicas): + peer = self.replicas[peerid] + replica.add_peer(peer, net_perf_map[{rid, peerid}]) + + def launch(self): for replica in self.replicas: - for rid in range(num_replicas): - if rid != replica.rid: - replica.add_peer( - rid, - net_perf_map[(replica.rid, rid)], - ) + self.env.process(replica.run()) + + def req(self, data): + self.leader.req(data) + + +########## +# Client # +########## + + +class Client: + def __init__(self, env, cluster, freq): + self.env = env + self.service = cluster + self.gap = 1.0 / freq + + def driver(self): + while True: + yield self.env.timeout(self.gap) + self.service.req(Req("TODO", 8)) + + def start(self): + self.env.process(self.driver()) if __name__ == "__main__": diff --git a/src/server/external.rs b/src/server/external.rs index cc820c00..a0728861 100644 --- a/src/server/external.rs +++ b/src/server/external.rs @@ -21,7 +21,7 @@ use tokio::io::AsyncReadExt; use tokio::sync::{mpsc, Notify}; use tokio::sync::mpsc::error::TryRecvError; use tokio::task::JoinHandle; 
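// NOTE: with MissedTickBehavior::Skip, a batching tick that fires late is simply
// skipped rather than followed by a burst of catch-up ticks.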
-use tokio::time::{self, Duration}; +use tokio::time::{self, Duration, MissedTickBehavior}; /// External API request ID type. pub type RequestId = u64; @@ -490,6 +490,7 @@ impl ExternalApi { batch_notify: Arc, ) { let mut interval = time::interval(batch_interval); + interval.set_missed_tick_behavior(MissedTickBehavior::Skip); loop { interval.tick().await; From 41e1ee857abd3d451c7aca2df10cb49ea23e5760 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sat, 30 Sep 2023 20:20:01 -0500 Subject: [PATCH 68/89] staging progress on perf sim --- models/perf_sim.py | 233 ------------------- models/perf_simulation.py | 477 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 477 insertions(+), 233 deletions(-) delete mode 100644 models/perf_sim.py create mode 100644 models/perf_simulation.py diff --git a/models/perf_sim.py b/models/perf_sim.py deleted file mode 100644 index 512b6ef5..00000000 --- a/models/perf_sim.py +++ /dev/null @@ -1,233 +0,0 @@ -import simpy # type: ignore -from enum import Enum # type: ignore - - -############## -# Data types # -############## - - -class Data: - def __init__(self, mark, size): - self.mark = mark - self.size = size - - def __str__(self): - return f"<{self.mark};{self.size}>" - - -class Req(Data): - def __init__(self, mark, size): - super().__init__(mark, size) - - -class Batch(Data): - def __init__(self, mark, vec): - self.vec = vec - size = sum((data.size for data in vec)) - super().__init__(mark, size) - - -############### -# Event types # -############### - - -class EType(Enum): - NetRecved = 1 - DiskSaved = 2 - ApiBatch = 3 - - -class Event: - def __init__(self, enum, info, value): - self.enum = enum - self.info = info - self.value = value - - def __str__(self): - return f"{{{self.enum}|{self.info}|{self.value}}}" - - -class NetRecved(Event): - def __init__(self, src, msg): - super().__init__(EType.NetRecved, src, msg) - - -class DiskSaved(Event): - def __init__(self, mark): - super().__init__(EType.DiskSaved, None, mark) - - -class ApiBatch(Event): - def __init__(self, batch): - super().__init__(EType.ApiBatch, None, batch) - - -################### -# Component types # -################### - - -class Device: - def __init__(self, env, l, t, v): - self.env = env - self.l = l - self.t = t - self.v = v # TODO: use this - self.pipe = simpy.Store(env) - - def delay(self, data): - delay = self.l + self.t * data.size - yield self.env.timeout(delay) - self.pipe.put(data) - - -class NetLink(Device): - def __init__(self, env, l, t, v, src, dst): - self.src = src - self.dst = dst - super().__init__(env, l, t, v) - - def send(self, msg): - self.env.process(self.delay(msg)) - - def recv(self): - msg = yield self.pipe.get() - return NetRecved(self.src, msg) - - -class DiskDev(Device): - def __init__(self, env, l, t, v, rid): - self.rid = rid - super().__init__(env, l, t, v) - - def write(self, ent): - self.env.process(self.delay(ent)) - - def saved(self): - ent = yield self.pipe.get() - return DiskSaved(ent.mark) - - -class ExtlApi: - def __init__(self, env, b, rid): - self.env = env - self.b = b - self.rid = rid - self.mark = 0 - self.ibuf = [] - self.tick = simpy.Container() - - self.env.process(self.ticker()) - - def ticker(self): - while True: - yield self.env.timeout(self.b) - self.tick.put(1) - - def req(self, req): - self.ibuf.append(req) - - def batch(self): - yield self.tick.get(1) - self.tick.level = 0 - - batch = Batch(self.mark, self.ibuf) - self.mark += 1 - self.ibuf = [] - return ApiBatch(batch) - - -##################### -# Replica & Cluster # 
-##################### - - -class Replica: - def __init__(self, env, rid, api_b, disk_ltv): - self.env = env - self.rid = rid - self.extl_api = ExtlApi(env, api_b) - self.disk_dev = DiskDev(env, disk_ltv[0], disk_ltv[1], disk_ltv[2]) - self.send_links = dict() - self.recv_links = dict() - - def add_peer(self, peer, net_ltv): - s2p_link = NetLink( - self.env, net_ltv[0], net_ltv[1], net_ltv[2], self.rid, peer.rid - ) - p2s_link = NetLink( - self.env, net_ltv[0], net_ltv[1], net_ltv[2], peer.rid, self.rid - ) - self.send_links[peer.rid] = s2p_link - self.recv_links[peer.rid] = p2s_link - peer.send_links[self.rid] = p2s_link - peer.recv_links[self.rid] = s2p_link - - def run(self): - while True: - events = [ - self.env.process(self.extl_api.batch()), - self.env.process(self.disk_dev.saved()), - ] - for link in self.recv_links: - events.append(self.env.process(link.recv())) - event = yield self.env.any_of(events) - - print(event) - - def req(self, data): - self.extl_api.req(data) - - -class Cluster: - def __init__(self, env, num_replicas, api_b, disk_perf_map, net_perf_map): - self.env = env - self.replicas = [ - Replica( - env, - rid, - api_b, - disk_perf_map[rid], - ) - for rid in range(num_replicas) - ] - self.leader = self.replicas[0] - - for rid, replica in enumerate(self.replicas): - for peerid in range(rid + 1, num_replicas): - peer = self.replicas[peerid] - replica.add_peer(peer, net_perf_map[{rid, peerid}]) - - def launch(self): - for replica in self.replicas: - self.env.process(replica.run()) - - def req(self, data): - self.leader.req(data) - - -########## -# Client # -########## - - -class Client: - def __init__(self, env, cluster, freq): - self.env = env - self.service = cluster - self.gap = 1.0 / freq - - def driver(self): - while True: - yield self.env.timeout(self.gap) - self.service.req(Req("TODO", 8)) - - def start(self): - self.env.process(self.driver()) - - -if __name__ == "__main__": - env = simpy.Environment() - env.run(until=15) diff --git a/models/perf_simulation.py b/models/perf_simulation.py new file mode 100644 index 00000000..6d5f29b2 --- /dev/null +++ b/models/perf_simulation.py @@ -0,0 +1,477 @@ +import simpy # type: ignore +from enum import Enum # type: ignore + + +############## +# Data types # +############## + + +class Data: + def __init__(self, mark, size): + self.mark = mark + self.size = size + + def __str__(self): + return f"<{self.mark};{self.size}>" + + +class Req(Data): + def __init__(self, cid, mark, size): + self.cid = cid + super().__init__(mark, size) + + +class Batch(Data): + def __init__(self, mark, vec): + self.vec = vec + size = sum((data.size for data in vec)) + super().__init__(mark, size) + + +############### +# Event types # +############### + + +class EType(Enum): + NetRecved = 1 + DiskSaved = 2 + ApiBatch = 3 + SendNewReq = 4 + + +class Event: + def __init__(self, enum, info, value): + self.enum = enum + self.info = info + self.value = value + + def __str__(self): + return f"{{{self.enum}|{self.info}|{self.value}}}" + + +class NetRecved(Event): + def __init__(self, src, msg): + super().__init__(EType.NetRecved, src, msg) + + +class DiskSaved(Event): + def __init__(self, mark): + super().__init__(EType.DiskSaved, None, mark) + + +class ApiBatch(Event): + def __init__(self, batch): + super().__init__(EType.ApiBatch, None, batch) + + +class SendNewReq(Event): + def __init__(self, mark): + super().__init__(EType.SendNewReq, None, mark) + + +################### +# Component types # +################### + + +class Device: + def __init__(self, 
env, l, t, v): + self.env = env + self.l = l + self.t = t + self.v = v # TODO: use this + self.pipe = simpy.Store(env) + + def delay(self, data): + delay = self.l + self.t * data.size + yield self.env.timeout(delay) + self.pipe.put(data) + + +class NetLink(Device): + def __init__(self, env, l, t, v, src, dst): + self.src = src + self.dst = dst + super().__init__(env, l, t, v) + + def send(self, msg): + if self.src == 1 and self.dst == 0: + print("!!!", msg) + self.env.process(self.delay(msg)) + + def recv(self): + if self.src == 1 and self.dst == 0: + print("???", self.env.now, self.pipe.items) + msg = yield self.pipe.get() + return NetRecved(self.src, msg) + + +class DiskDev(Device): + def __init__(self, env, l, t, v, rid): + self.rid = rid + super().__init__(env, l, t, v) + + def write(self, ent): + self.env.process(self.delay(ent)) + + def saved(self): + ent = yield self.pipe.get() + return DiskSaved(ent.mark) + + +class ExtlApi: + def __init__(self, env, l, t, v, b, rid): + self.env = env + self.l = l + self.t = t + self.v = v + self.b = b + + self.rid = rid + self.req_links = dict() + self.ack_links = dict() + + self.mark = 0 + self.tick = simpy.Container(env, capacity=1) + + self.env.process(self.ticker()) + + def connect(self, client): + req_link = NetLink(self.env, self.l, self.t, self.v, client.cid, self.rid) + ack_link = NetLink(self.env, self.l, self.t, self.v, self.rid, client.cid) + self.req_links[client.cid] = req_link + self.ack_links[client.cid] = ack_link + return (req_link, ack_link) + + def ticker(self): + while True: + yield self.env.timeout(self.b) + if self.tick.level == 0: + self.tick.put(1) + + def batch(self): + while True: + yield self.tick.get(1) + + reqs = [] + for link in self.req_links.values(): + if len(link.pipe.items) > 0: + reqs += link.pipe.items + link.pipe.items = [] + + # do not return if no reqs available at this tick + if len(reqs) == 0: + continue + + batch = Batch(self.mark, reqs) + self.mark += 1 + return ApiBatch(batch) + + def ack(self, cid, mark): + if cid not in self.ack_links: + raise RuntimeError(f"cid {cid} not in connected") + self.ack_links[cid].send(mark) + + +##################### +# Replica & Cluster # +##################### + + +class Replica: + def __init__(self, env, rid, api_ltvb, disk_ltv, protocol, **protocol_args): + self.env = env + self.rid = rid + self.extl_api = ExtlApi( + env, api_ltvb[0], api_ltvb[1], api_ltvb[2], api_ltvb[3], rid + ) + self.disk_dev = DiskDev(env, disk_ltv[0], disk_ltv[1], disk_ltv[2], rid) + self.send_links = dict() + self.recv_links = dict() + + # protocol-specific fields & event handlers + self.protocol = protocol(self, **protocol_args) + + def add_peer(self, peer, net_ltv): + s2p_link = NetLink( + self.env, net_ltv[0], net_ltv[1], net_ltv[2], self.rid, peer.rid + ) + p2s_link = NetLink( + self.env, net_ltv[0], net_ltv[1], net_ltv[2], peer.rid, self.rid + ) + self.send_links[peer.rid] = s2p_link + self.recv_links[peer.rid] = p2s_link + peer.send_links[self.rid] = p2s_link + peer.recv_links[self.rid] = s2p_link + + def run(self): + while True: + events = [ + self.env.process(self.extl_api.batch()), + self.env.process(self.disk_dev.saved()), + ] + for link in self.recv_links.values(): + events.append(self.env.process(link.recv())) + print("XXX", self.rid, len(events)) + + # could get multiple completed triggers at this yield + conds = yield self.env.any_of(events) + for event in conds.values(): + print(f"{self.env.now}: R{self.rid} {event}") + if event.enum == EType.ApiBatch: + 
self.protocol.handle_api_batch(event.value) + elif event.enum == EType.DiskSaved: + self.protocol.handle_disk_saved(event.value) + elif event.enum == EType.NetRecved: + self.protocol.handle_net_recved(event.info, event.value) + else: + raise RuntimeError(f"unrecognized event type: {event}") + + def connect(self, client): + return self.extl_api.connect(client) + + +class Cluster: + def __init__( + self, + env, + num_replicas, + api_ltvb, + disk_ltv_map, + net_ltv_map, + protocol, + **protocol_args, + ): + self.env = env + self.replicas = [ + Replica( + env, + rid, + api_ltvb, + disk_ltv_map[rid], + protocol, + **protocol_args, + ) + for rid in range(num_replicas) + ] + self.leader = self.replicas[0] + + for rid, replica in enumerate(self.replicas): + for peerid in range(rid + 1, num_replicas): + peer = self.replicas[peerid] + replica.add_peer(peer, net_ltv_map[(rid, peerid)]) + + def launch(self): + for replica in self.replicas: + self.env.process(replica.run()) + + def connect(self, client): + return self.leader.connect(client) + + +############# +# Protocols # +############# + + +class Protocol: + def __init__(self, replica): + self.replica = replica + + +class MultiPaxos(Protocol): + def __init__(self, replica, quorum_size): + super().__init__(replica) + + self.quorum_size = quorum_size + self.insts = [] + + class Instance: + def __init__(self, batch=None): + self.batch = batch + self.num_replies = 0 + self.from_peer = 0 + self.client_acked = False + + class AcceptMsg(Data): + def __init__(self, slot, batch): + super().__init__(f"a-{slot}", batch.size + 8) + self.batch = batch + + class AcceptReply(Data): + def __init__(self, slot): + super().__init__(f"r-{slot}", 8) + + def handle_api_batch(self, batch): + self.insts.append(self.Instance(batch)) + slot = len(self.insts) - 1 + + for link in self.replica.send_links.values(): + link.send(self.AcceptMsg(slot, batch)) + + self.replica.disk_dev.write(self.AcceptMsg(slot, batch)) + + def handle_disk_saved(self, mark): + if not mark.startswith("a-"): + raise RuntimeError(f"unrecognized ent mark: {mark}") + + slot = int(mark[2:]) + assert slot < len(self.insts) + self.insts[slot].num_replies += 1 + + if ( + not self.insts[slot].client_acked + and self.insts[slot].num_replies >= self.quorum_size + ): + self.ack_client_reqs(slot) + + def handle_net_recved(self, peer, msg): + if msg.mark.startswith("a-"): + slot = int(msg.mark[2:]) + while slot >= len(self.insts): + self.insts.append(self.Instance()) + self.insts[slot].from_peer = peer + self.insts[slot].batch = msg.batch + + self.replica.send_links[peer].send(self.AcceptReply(slot)) + + elif msg.mark.startswith("r-"): + print("!!!", slot) + slot = int(msg.mark[2:]) + assert slot < len(self.insts) + self.insts[slot].num_replies += 1 + + if ( + not self.insts[slot].client_acked + and self.insts[slot].num_replies >= self.quorum_size + ): + self.ack_client_reqs(slot) + + else: + raise RuntimeError(f"unrecognized msg mark: {msg.mark}") + + def ack_client_reqs(self, slot): + assert not self.insts[slot].client_acked + + for req in self.insts[slot].batch.vec: + self.replica.extl_api.ack(req.cid, req.mark) + + self.insts[slot].client_acked = True + + +########## +# Client # +########## + + +class Stats: + def __init__(self, env): + self.env = env + self.total_cnt = 0 + self.req_times = dict() + self.ack_times = dict() + + def add_req(self, mark): + assert mark not in self.req_times + self.req_times[mark] = self.env.now + + def add_ack(self, mark): + assert mark in self.req_times + assert mark not in 
self.ack_times + self.total_cnt += 1 + self.ack_times[mark] = self.env.now + print("!!!", self.env.now) + + def clear(self): + for mark in self.ack_times: + del self.req_times[mark] + self.ack_times = dict() + + +class Client: + def __init__(self, env, cluster, cid, freq, vsize): + self.env = env + self.cid = cid + self.service = cluster + self.req_link, self.ack_link = self.service.connect(self) + + self.gap = 1.0 / freq + self.vsize = vsize + + self.mark = 0 + self.tick = simpy.Container(env, capacity=1) + self.stats = Stats(env) + + self.env.process(self.ticker()) + + def ticker(self): + while True: + yield self.env.timeout(self.gap) + if self.tick.level == 0: + self.tick.put(1) + + def new_req(self): + yield self.tick.get(1) + self.mark += 1 + return SendNewReq(self.mark) + + def driver(self): + while True: + events = [ + self.env.process(self.new_req()), + self.env.process(self.ack_link.recv()), + ] + + # could get multiple completed triggers at this yield + conds = yield self.env.any_of(events) + for event in conds.values(): + print(f"{self.env.now}: C{self.cid} {event}") + if event.enum == EType.SendNewReq: + mark = event.value + self.req_link.send(Req(self.cid, mark, self.vsize)) + self.stats.add_req(mark) + elif event.enum == EType.NetRecved: + mark = event.value + self.stats.add_ack(mark) + else: + raise RuntimeError(f"unrecognized event type: {event}") + + def start(self): + self.env.process(self.driver()) + + +################# +# Main entrance # +################# + + +if __name__ == "__main__": + num_replicas = 5 + api_ltvb = (1, 1, 0, 3) + disk_ltv_map = {rid: (1, 1, 0) for rid in range(num_replicas)} + net_ltv_map = dict() + for rid in range(num_replicas): + for peerid in range(rid + 1, num_replicas): + net_ltv_map[(rid, peerid)] = (1, 1, 0) + freq = 1 + vsize = 1 + + env = simpy.Environment() + + cluster = Cluster( + env, + 5, + api_ltvb, + disk_ltv_map, + net_ltv_map, + MultiPaxos, + quorum_size=3, + ) + cluster.launch() + + client = Client(env, cluster, 2957, freq, vsize) + client.start() + + env.run(until=60) From 04b5c9619069bef66efb423b3afd0af51d136ab6 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sat, 30 Sep 2023 21:22:10 -0500 Subject: [PATCH 69/89] staging progress on perf sim --- models/perf_simulation.py | 72 ++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 28 deletions(-) diff --git a/models/perf_simulation.py b/models/perf_simulation.py index 6d5f29b2..f4ec0436 100644 --- a/models/perf_simulation.py +++ b/models/perf_simulation.py @@ -22,6 +22,12 @@ def __init__(self, cid, mark, size): super().__init__(mark, size) +class Ack(Data): + def __init__(self, cid, mark): + self.cid = cid + super().__init__(mark, 8) + + class Batch(Data): def __init__(self, mark, vec): self.vec = vec @@ -97,13 +103,9 @@ def __init__(self, env, l, t, v, src, dst): super().__init__(env, l, t, v) def send(self, msg): - if self.src == 1 and self.dst == 0: - print("!!!", msg) self.env.process(self.delay(msg)) def recv(self): - if self.src == 1 and self.dst == 0: - print("???", self.env.now, self.pipe.items) msg = yield self.pipe.get() return NetRecved(self.src, msg) @@ -172,7 +174,7 @@ def batch(self): def ack(self, cid, mark): if cid not in self.ack_links: raise RuntimeError(f"cid {cid} not in connected") - self.ack_links[cid].send(mark) + self.ack_links[cid].send(Ack(cid, mark)) ##################### @@ -207,25 +209,36 @@ def add_peer(self, peer, net_ltv): peer.recv_links[self.rid] = s2p_link def run(self): - while True: - events = [ - 
self.env.process(self.extl_api.batch()), - self.env.process(self.disk_dev.saved()), - ] - for link in self.recv_links.values(): - events.append(self.env.process(link.recv())) - print("XXX", self.rid, len(events)) + events = { + "api_batch": self.env.process(self.extl_api.batch()), + "disk_saved": self.env.process(self.disk_dev.saved()), + } + for peer, link in self.recv_links.items(): + events[("net_recved", peer)] = self.env.process(link.recv()) + while True: # could get multiple completed triggers at this yield - conds = yield self.env.any_of(events) + conds = yield self.env.any_of(events.values()) for event in conds.values(): - print(f"{self.env.now}: R{self.rid} {event}") + # print(f"{self.env.now}: R{self.rid} {event}") + if event.enum == EType.ApiBatch: - self.protocol.handle_api_batch(event.value) + batch = event.value + self.protocol.handle_api_batch(batch) + events["api_batch"] = self.env.process(self.extl_api.batch()) + elif event.enum == EType.DiskSaved: - self.protocol.handle_disk_saved(event.value) + mark = event.value + self.protocol.handle_disk_saved(mark) + events["disk_saved"] = self.env.process(self.disk_dev.saved()) + elif event.enum == EType.NetRecved: - self.protocol.handle_net_recved(event.info, event.value) + peer, msg = event.info, event.value + self.protocol.handle_net_recved(peer, msg) + events[("net_recved", peer)] = self.env.process( + self.recv_links[peer].recv() + ) + else: raise RuntimeError(f"unrecognized event type: {event}") @@ -338,7 +351,6 @@ def handle_net_recved(self, peer, msg): self.replica.send_links[peer].send(self.AcceptReply(slot)) elif msg.mark.startswith("r-"): - print("!!!", slot) slot = int(msg.mark[2:]) assert slot < len(self.insts) self.insts[slot].num_replies += 1 @@ -382,7 +394,6 @@ def add_ack(self, mark): assert mark not in self.ack_times self.total_cnt += 1 self.ack_times[mark] = self.env.now - print("!!!", self.env.now) def clear(self): for mark in self.ack_times: @@ -418,23 +429,28 @@ def new_req(self): return SendNewReq(self.mark) def driver(self): - while True: - events = [ - self.env.process(self.new_req()), - self.env.process(self.ack_link.recv()), - ] + events = { + "req": self.env.process(self.new_req()), + "ack": self.env.process(self.ack_link.recv()), + } + while True: # could get multiple completed triggers at this yield - conds = yield self.env.any_of(events) + conds = yield self.env.any_of(events.values()) for event in conds.values(): - print(f"{self.env.now}: C{self.cid} {event}") + # print(f"{self.env.now}: C{self.cid} {event}") + if event.enum == EType.SendNewReq: mark = event.value self.req_link.send(Req(self.cid, mark, self.vsize)) self.stats.add_req(mark) + events["req"] = self.env.process(self.new_req()) + elif event.enum == EType.NetRecved: - mark = event.value + mark = event.value.mark self.stats.add_ack(mark) + events["ack"] = self.env.process(self.ack_link.recv()) + else: raise RuntimeError(f"unrecognized event type: {event}") From 8f32971e8c7acd20293b3bf19f2e169910e9e95f Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sat, 30 Sep 2023 22:01:09 -0500 Subject: [PATCH 70/89] staging progress on perf sim --- models/perf_simulation.py | 67 ++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 18 deletions(-) diff --git a/models/perf_simulation.py b/models/perf_simulation.py index f4ec0436..def8d3d9 100644 --- a/models/perf_simulation.py +++ b/models/perf_simulation.py @@ -302,10 +302,10 @@ def __init__(self, replica, quorum_size): self.insts = [] class Instance: - def __init__(self, batch=None): 
- self.batch = batch + def __init__(self): + self.batch = None self.num_replies = 0 - self.from_peer = 0 + self.from_peer = -1 self.client_acked = False class AcceptMsg(Data): @@ -318,8 +318,9 @@ def __init__(self, slot): super().__init__(f"r-{slot}", 8) def handle_api_batch(self, batch): - self.insts.append(self.Instance(batch)) + self.insts.append(self.Instance()) slot = len(self.insts) - 1 + self.insts[slot].batch = batch for link in self.replica.send_links.values(): link.send(self.AcceptMsg(slot, batch)) @@ -329,28 +330,38 @@ def handle_api_batch(self, batch): def handle_disk_saved(self, mark): if not mark.startswith("a-"): raise RuntimeError(f"unrecognized ent mark: {mark}") - slot = int(mark[2:]) assert slot < len(self.insts) - self.insts[slot].num_replies += 1 - if ( - not self.insts[slot].client_acked - and self.insts[slot].num_replies >= self.quorum_size - ): - self.ack_client_reqs(slot) + if self.insts[slot].from_peer < 0: + # disk save on leader + self.insts[slot].num_replies += 1 + + if ( + not self.insts[slot].client_acked + and self.insts[slot].num_replies >= self.quorum_size + ): + self.ack_client_reqs(slot) + + else: + # disk save on follower + self.replica.send_links[self.insts[slot].from_peer].send( + self.AcceptReply(slot) + ) def handle_net_recved(self, peer, msg): if msg.mark.startswith("a-"): + # net recv on follower slot = int(msg.mark[2:]) while slot >= len(self.insts): self.insts.append(self.Instance()) self.insts[slot].from_peer = peer self.insts[slot].batch = msg.batch - self.replica.send_links[peer].send(self.AcceptReply(slot)) + self.replica.disk_dev.write(self.AcceptMsg(slot, msg.batch)) elif msg.mark.startswith("r-"): + # net recv on leader slot = int(msg.mark[2:]) assert slot < len(self.insts) self.insts[slot].num_replies += 1 @@ -381,20 +392,28 @@ def ack_client_reqs(self, slot): class Stats: def __init__(self, env): self.env = env - self.total_cnt = 0 + self.total_sent = 0 + self.total_acks = 0 self.req_times = dict() self.ack_times = dict() def add_req(self, mark): assert mark not in self.req_times + self.total_sent += 1 self.req_times[mark] = self.env.now def add_ack(self, mark): assert mark in self.req_times assert mark not in self.ack_times - self.total_cnt += 1 + self.total_acks += 1 self.ack_times[mark] = self.env.now + def display(self, chunk_time): + lats = [self.ack_times[m] - self.req_times[m] for m in self.ack_times] + avg_tput = len(lats) / chunk_time + avg_lat = sum(lats) / len(lats) if len(lats) > 0 else 0.0 + return f"{avg_tput:>9.2f} {avg_lat:>9.2f} {len(lats):>7d} {self.total_acks:>8d} / {self.total_sent:<8d}" + def clear(self): for mark in self.ack_times: del self.req_times[mark] @@ -402,7 +421,7 @@ def clear(self): class Client: - def __init__(self, env, cluster, cid, freq, vsize): + def __init__(self, env, cluster, cid, freq, vsize, chunk_time): self.env = env self.cid = cid self.service = cluster @@ -413,7 +432,10 @@ def __init__(self, env, cluster, cid, freq, vsize): self.mark = 0 self.tick = simpy.Container(env, capacity=1) + self.stats = Stats(env) + self.last_print = 0 + self.chunk_time = chunk_time self.env.process(self.ticker()) @@ -428,12 +450,15 @@ def new_req(self): self.mark += 1 return SendNewReq(self.mark) - def driver(self): + def loop(self): events = { "req": self.env.process(self.new_req()), "ack": self.env.process(self.ack_link.recv()), } + print( + f"{'Time':>5s}: {'Tput':>9s} {'Lat':>9s} {'Chunk':>7s} {'Reply':>8s} / {'Total':<8s}" + ) while True: # could get multiple completed triggers at this yield conds = yield 
self.env.any_of(events.values()) @@ -454,8 +479,14 @@ def driver(self): else: raise RuntimeError(f"unrecognized event type: {event}") + # print chunk-average stats + if self.env.now - self.last_print > self.chunk_time: + print(f"{self.env.now:>5.1f}: {self.stats.display(self.chunk_time)}") + self.stats.clear() + self.last_print = self.env.now + def start(self): - self.env.process(self.driver()) + self.env.process(self.loop()) ################# @@ -487,7 +518,7 @@ def start(self): ) cluster.launch() - client = Client(env, cluster, 2957, freq, vsize) + client = Client(env, cluster, 2957, freq, vsize, 10) client.start() env.run(until=60) From a6623d2afbc00c86d55738e5173a61e5ad58f4ba Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Mon, 2 Oct 2023 00:13:27 -0500 Subject: [PATCH 71/89] staging progress on perf sim --- README.md | 9 +- models/perf_simulation.py | 668 ++++++++++++++++++++++++++++++------- models/plot_sim_results.py | 105 ++++++ 3 files changed, 651 insertions(+), 131 deletions(-) create mode 100644 models/plot_sim_results.py diff --git a/README.md b/README.md index 6a6224ce..fb43e7d9 100644 --- a/README.md +++ b/README.md @@ -155,15 +155,20 @@ Complete cluster management and benchmarking scripts are available in another re - [ ] specialize read-only commands? - [ ] separate commit vs. exec responses? - [ ] membership discovery & view changes +- [x] implementation of RS-Paxos - [ ] implementation of Raft -- [ ] implementation of Crossword prototype +- [ ] implementation of CRaft +- [x] implementation of Crossword prototype - [x] fault recovery reads - [ ] follower gossiping + - [ ] fall-back mechanism + - [ ] workload adaptiveness + - [ ] unbalanced assignment - [x] client-side utilities - [x] REPL-style client - [x] random benchmarking client - [x] testing client - - [ ] benchmarking with YCSB input + - [ ] YCSB-driven client - [ ] better README & documentation --- diff --git a/models/perf_simulation.py b/models/perf_simulation.py index def8d3d9..bfb6734a 100644 --- a/models/perf_simulation.py +++ b/models/perf_simulation.py @@ -1,5 +1,15 @@ import simpy # type: ignore -from enum import Enum # type: ignore +from enum import Enum +import random +import statistics +import math +import pickle + +import matplotlib + +matplotlib.use("Agg") + +import matplotlib.pyplot as plt ############## @@ -28,11 +38,16 @@ def __init__(self, cid, mark): super().__init__(mark, 8) -class Batch(Data): - def __init__(self, mark, vec): - self.vec = vec - size = sum((data.size for data in vec)) - super().__init__(mark, size) +class Codeword(Data): + def __init__(self, req, n, m, flags): + assert len(flags) > 0 + assert len(flags) <= n + self.req = req + self.m = m + self.n = n + self.flags = flags + shard_size = req.size / m + super().__init__(req.mark, shard_size * len(flags)) ############### @@ -43,7 +58,7 @@ def __init__(self, mark, vec): class EType(Enum): NetRecved = 1 DiskSaved = 2 - ApiBatch = 3 + ApiGotReq = 3 SendNewReq = 4 @@ -67,9 +82,9 @@ def __init__(self, mark): super().__init__(EType.DiskSaved, None, mark) -class ApiBatch(Event): - def __init__(self, batch): - super().__init__(EType.ApiBatch, None, batch) +class ApiGotReq(Event): + def __init__(self, cid, req): + super().__init__(EType.ApiGotReq, cid, req) class SendNewReq(Event): @@ -83,24 +98,32 @@ def __init__(self, mark): class Device: - def __init__(self, env, l, t, v): + def __init__(self, env, l, t, lv, tv): self.env = env - self.l = l - self.t = t - self.v = v # TODO: use this + self.l = l # latency factor in ms + self.t = t # ms to 
transfer 1 MB + self.lv = lv # max variation multiplier for l + self.tv = tv # max variation multiplier for t self.pipe = simpy.Store(env) + if self.lv < 1: + raise RuntimeError(f"invalid variation ratio {self.lv}") + if self.tv < 1: + raise RuntimeError(f"invalid variation ratio {self.tv}") + def delay(self, data): - delay = self.l + self.t * data.size + l = self.l * random.uniform(1, self.lv) + t = self.t * random.uniform(1, self.tv) + delay = l + t * (data.size / 1000000.0) yield self.env.timeout(delay) self.pipe.put(data) class NetLink(Device): - def __init__(self, env, l, t, v, src, dst): + def __init__(self, env, l, t, lv, tv, src, dst): self.src = src self.dst = dst - super().__init__(env, l, t, v) + super().__init__(env, l, t, lv, tv) def send(self, msg): self.env.process(self.delay(msg)) @@ -111,9 +134,9 @@ def recv(self): class DiskDev(Device): - def __init__(self, env, l, t, v, rid): + def __init__(self, env, l, t, lv, tv, rid): self.rid = rid - super().__init__(env, l, t, v) + super().__init__(env, l, t, lv, tv) def write(self, ent): self.env.process(self.delay(ent)) @@ -124,52 +147,32 @@ def saved(self): class ExtlApi: - def __init__(self, env, l, t, v, b, rid): + def __init__(self, env, l, t, lv, tv, rid): self.env = env + self.rid = rid self.l = l self.t = t - self.v = v - self.b = b - - self.rid = rid + self.lv = lv + self.tv = tv self.req_links = dict() self.ack_links = dict() - self.mark = 0 - self.tick = simpy.Container(env, capacity=1) - - self.env.process(self.ticker()) - def connect(self, client): - req_link = NetLink(self.env, self.l, self.t, self.v, client.cid, self.rid) - ack_link = NetLink(self.env, self.l, self.t, self.v, self.rid, client.cid) + req_link = NetLink( + self.env, self.l, self.t, self.lv, self.tv, client.cid, self.rid + ) + ack_link = NetLink( + self.env, self.l, self.t, self.lv, self.tv, self.rid, client.cid + ) self.req_links[client.cid] = req_link self.ack_links[client.cid] = ack_link return (req_link, ack_link) - def ticker(self): - while True: - yield self.env.timeout(self.b) - if self.tick.level == 0: - self.tick.put(1) - - def batch(self): - while True: - yield self.tick.get(1) - - reqs = [] - for link in self.req_links.values(): - if len(link.pipe.items) > 0: - reqs += link.pipe.items - link.pipe.items = [] - - # do not return if no reqs available at this tick - if len(reqs) == 0: - continue - - batch = Batch(self.mark, reqs) - self.mark += 1 - return ApiBatch(batch) + def req(self): + # NOTE: hardcode assuming only one client connected + event = yield self.env.process(self.req_links[2957].recv()) + req = event.value + return ApiGotReq(2957, req) def ack(self, cid, mark): if cid not in self.ack_links: @@ -183,13 +186,15 @@ def ack(self, cid, mark): class Replica: - def __init__(self, env, rid, api_ltvb, disk_ltv, protocol, **protocol_args): + def __init__(self, env, rid, api_ltv, disk_ltv, protocol, **protocol_args): self.env = env self.rid = rid self.extl_api = ExtlApi( - env, api_ltvb[0], api_ltvb[1], api_ltvb[2], api_ltvb[3], rid + env, api_ltv[0], api_ltv[1], api_ltv[2], api_ltv[3], rid + ) + self.disk_dev = DiskDev( + env, disk_ltv[0], disk_ltv[1], disk_ltv[2], disk_ltv[3], rid ) - self.disk_dev = DiskDev(env, disk_ltv[0], disk_ltv[1], disk_ltv[2], rid) self.send_links = dict() self.recv_links = dict() @@ -198,10 +203,10 @@ def __init__(self, env, rid, api_ltvb, disk_ltv, protocol, **protocol_args): def add_peer(self, peer, net_ltv): s2p_link = NetLink( - self.env, net_ltv[0], net_ltv[1], net_ltv[2], self.rid, peer.rid + self.env, 
net_ltv[0], net_ltv[1], net_ltv[2], net_ltv[3], self.rid, peer.rid ) p2s_link = NetLink( - self.env, net_ltv[0], net_ltv[1], net_ltv[2], peer.rid, self.rid + self.env, net_ltv[0], net_ltv[1], net_ltv[2], net_ltv[3], peer.rid, self.rid ) self.send_links[peer.rid] = s2p_link self.recv_links[peer.rid] = p2s_link @@ -210,31 +215,34 @@ def add_peer(self, peer, net_ltv): def run(self): events = { - "api_batch": self.env.process(self.extl_api.batch()), "disk_saved": self.env.process(self.disk_dev.saved()), } for peer, link in self.recv_links.items(): events[("net_recved", peer)] = self.env.process(link.recv()) + # NOTE: hardcoding to have non-leader not do api_got_req + if self.rid == 0: + events["api_got_req"] = self.env.process(self.extl_api.req()) + while True: # could get multiple completed triggers at this yield conds = yield self.env.any_of(events.values()) for event in conds.values(): # print(f"{self.env.now}: R{self.rid} {event}") - if event.enum == EType.ApiBatch: - batch = event.value - self.protocol.handle_api_batch(batch) - events["api_batch"] = self.env.process(self.extl_api.batch()) + if event.enum == EType.ApiGotReq: + req = event.value + yield self.env.process(self.protocol.handle_api_got_req(req)) + events["api_got_req"] = self.env.process(self.extl_api.req()) elif event.enum == EType.DiskSaved: mark = event.value - self.protocol.handle_disk_saved(mark) + yield self.env.process(self.protocol.handle_disk_saved(mark)) events["disk_saved"] = self.env.process(self.disk_dev.saved()) elif event.enum == EType.NetRecved: peer, msg = event.info, event.value - self.protocol.handle_net_recved(peer, msg) + yield self.env.process(self.protocol.handle_net_recved(peer, msg)) events[("net_recved", peer)] = self.env.process( self.recv_links[peer].recv() ) @@ -251,7 +259,7 @@ def __init__( self, env, num_replicas, - api_ltvb, + api_ltv, disk_ltv_map, net_ltv_map, protocol, @@ -262,7 +270,7 @@ def __init__( Replica( env, rid, - api_ltvb, + api_ltv, disk_ltv_map[rid], protocol, **protocol_args, @@ -295,37 +303,44 @@ def __init__(self, replica): class MultiPaxos(Protocol): - def __init__(self, replica, quorum_size): + def __init__(self, replica, cluster_size): super().__init__(replica) - self.quorum_size = quorum_size + self.q = cluster_size // 2 + 1 + self.insts = [] + @classmethod + def name_str(cls, cluster_size): + return "MultiPaxos/Raft" + class Instance: def __init__(self): - self.batch = None + self.req = None self.num_replies = 0 self.from_peer = -1 self.client_acked = False class AcceptMsg(Data): - def __init__(self, slot, batch): - super().__init__(f"a-{slot}", batch.size + 8) - self.batch = batch + def __init__(self, slot, req): + super().__init__(f"a-{slot}", req.size + 8) + self.req = req class AcceptReply(Data): def __init__(self, slot): super().__init__(f"r-{slot}", 8) - def handle_api_batch(self, batch): + def handle_api_got_req(self, req): self.insts.append(self.Instance()) slot = len(self.insts) - 1 - self.insts[slot].batch = batch + self.insts[slot].req = req for link in self.replica.send_links.values(): - link.send(self.AcceptMsg(slot, batch)) + link.send(self.AcceptMsg(slot, req)) + + self.replica.disk_dev.write(self.AcceptMsg(slot, req)) - self.replica.disk_dev.write(self.AcceptMsg(slot, batch)) + yield from [] def handle_disk_saved(self, mark): if not mark.startswith("a-"): @@ -339,7 +354,7 @@ def handle_disk_saved(self, mark): if ( not self.insts[slot].client_acked - and self.insts[slot].num_replies >= self.quorum_size + and self.insts[slot].num_replies >= self.q ): 
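+                # the leader's own durable write counts as one of the replies, so
+                # once num_replies reaches q = cluster_size // 2 + 1 the slot is
+                # majority-replicated and the pending request can be acked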
self.ack_client_reqs(slot) @@ -349,6 +364,8 @@ def handle_disk_saved(self, mark): self.AcceptReply(slot) ) + yield from [] + def handle_net_recved(self, peer, msg): if msg.mark.startswith("a-"): # net recv on follower @@ -356,9 +373,9 @@ def handle_net_recved(self, peer, msg): while slot >= len(self.insts): self.insts.append(self.Instance()) self.insts[slot].from_peer = peer - self.insts[slot].batch = msg.batch + self.insts[slot].req = msg.req - self.replica.disk_dev.write(self.AcceptMsg(slot, msg.batch)) + self.replica.disk_dev.write(self.AcceptMsg(slot, msg.req)) elif msg.mark.startswith("r-"): # net recv on leader @@ -368,19 +385,312 @@ def handle_net_recved(self, peer, msg): if ( not self.insts[slot].client_acked - and self.insts[slot].num_replies >= self.quorum_size + and self.insts[slot].num_replies >= self.q ): self.ack_client_reqs(slot) else: raise RuntimeError(f"unrecognized msg mark: {msg.mark}") + yield from [] + def ack_client_reqs(self, slot): assert not self.insts[slot].client_acked + req = self.insts[slot].req + self.replica.extl_api.ack(req.cid, req.mark) + self.insts[slot].client_acked = True + + +class RSPaxos(Protocol): + def __init__( + self, + replica, + cluster_size, + same_liveness, + comp_delay=0, + ): + super().__init__(replica) + + self.cluster_size = cluster_size + self.comp_delay = comp_delay + + self.m = cluster_size // 2 + 1 + if same_liveness: + self.q = cluster_size + self.f = cluster_size - self.m + else: + self.q = math.ceil((cluster_size + self.m) // 2) + self.f = self.q - self.m + + self.insts = [] + + @classmethod + def name_str(cls, cluster_size, same_liveness, comp_delay=0): + if same_liveness: + return f"RS-Paxos/CRaft (f-forced)" + else: + return f"RS-Paxos/CRaft (original)" + + class Instance: + def __init__(self): + self.req = None + self.num_replies = 0 + self.from_peer = -1 + self.client_acked = False + + class AcceptMsg(Data): + def __init__(self, slot, shard): + super().__init__(f"a-{slot}", shard.size + 8) + self.shard = shard + + class AcceptReply(Data): + def __init__(self, slot): + super().__init__(f"r-{slot}", 8) + + def handle_api_got_req(self, req): + self.insts.append(self.Instance()) + slot = len(self.insts) - 1 + self.insts[slot].req = req + + # add EC computation delay + comp_time = self.comp_delay * (float(req.size) / 1000000.0) + yield self.replica.env.timeout(comp_time) + + for peer, link in self.replica.send_links.items(): + codeword = Codeword(req, self.cluster_size, self.m, {peer}) + link.send(self.AcceptMsg(slot, codeword)) + + codeword = Codeword(req, self.cluster_size, self.m, {self.replica.rid}) + self.replica.disk_dev.write(self.AcceptMsg(slot, codeword)) + + yield from [] + + def handle_disk_saved(self, mark): + if not mark.startswith("a-"): + raise RuntimeError(f"unrecognized ent mark: {mark}") + slot = int(mark[2:]) + assert slot < len(self.insts) - for req in self.insts[slot].batch.vec: - self.replica.extl_api.ack(req.cid, req.mark) + if self.insts[slot].from_peer < 0: + # disk save on leader + self.insts[slot].num_replies += 1 + + if ( + not self.insts[slot].client_acked + and self.insts[slot].num_replies >= self.q + ): + self.ack_client_reqs(slot) + + else: + # disk save on follower + self.replica.send_links[self.insts[slot].from_peer].send( + self.AcceptReply(slot) + ) + yield from [] + + def handle_net_recved(self, peer, msg): + if msg.mark.startswith("a-"): + # net recv on follower + slot = int(msg.mark[2:]) + while slot >= len(self.insts): + self.insts.append(self.Instance()) + self.insts[slot].from_peer = peer 
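+            # the follower persists only its own shard of the codeword (roughly
+            # req.size / m bytes plus the 8-byte header), never the full request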
+ self.insts[slot].req = msg.shard + + self.replica.disk_dev.write(self.AcceptMsg(slot, msg.shard)) + + elif msg.mark.startswith("r-"): + # net recv on leader + slot = int(msg.mark[2:]) + assert slot < len(self.insts) + self.insts[slot].num_replies += 1 + + if ( + not self.insts[slot].client_acked + and self.insts[slot].num_replies >= self.q + ): + self.ack_client_reqs(slot) + + else: + raise RuntimeError(f"unrecognized msg mark: {msg.mark}") + + yield from [] + + def ack_client_reqs(self, slot): + assert not self.insts[slot].client_acked + req = self.insts[slot].req + self.replica.extl_api.ack(req.cid, req.mark) + self.insts[slot].client_acked = True + + +class Crossword(Protocol): + def __init__( + self, + replica, + cluster_size, + comp_delay=0, + shards_per_replica=1, # NOTE: a "cheating" approach to adaptiveness + ): + super().__init__(replica) + + self.cluster_size = cluster_size + self.comp_delay = comp_delay + + self.m = cluster_size // 2 + 1 + f = cluster_size - self.m + assert shards_per_replica >= 1 + assert shards_per_replica <= self.m + self.l = shards_per_replica + self.q = self.m + f + 1 - self.l + + self.insts = [] + + @classmethod + def name_str(cls, cluster_size, comp_delay=0, shards_per_replica=1): + return f"Crossword" + + # def update_perf_number(self, lat): + # self.perf_tries[self.l].append(lat) + # if len(self.perf_tries[self.l]) > 100: + # del self.perf_tries[self.l][0] + + # if not self.all_tried: + # if len(self.perf_tries[self.l]) >= 100: + # if self.l == 1: + # self.all_tried = True + # else: + # self.l -= 1 + + # def choose_best_config(self): + # if self.all_tried and not self.ql_picked: + # m = self.cluster_size // 2 + 1 + # f = self.cluster_size - m + + # avg_lats = dict() + # for l, lats in self.perf_tries.items(): + # sorted_lats = sorted(lats)[:-10] + # avg_lats[l] = sum(sorted_lats) / len(sorted_lats) + # self.l = min(avg_lats, key=avg_lats.get) + # self.q = m + f - self.l + 1 + + # print(" picked", self.l) + # self.ql_picked = True + + class Instance: + def __init__(self): + self.req = None + self.num_replies = 0 + self.from_peer = -1 + self.client_acked = False + + class AcceptMsg(Data): + def __init__(self, slot, shard): + super().__init__(f"a-{slot}", shard.size + 8) + self.shard = shard + + class AcceptReply(Data): + def __init__(self, slot): + super().__init__(f"r-{slot}", 8) + + def handle_api_got_req(self, req): + self.insts.append(self.Instance()) + slot = len(self.insts) - 1 + self.insts[slot].req = req + + # add EC computation delay + comp_time = self.comp_delay * (float(req.size) / 1000000.0) + yield self.replica.env.timeout(comp_time) + + # pick the best config if haven't yet + # self.choose_best_config() + + # record this req's starting time + # self.curr_reqs[req.mark] = self.replica.env.now + + for peer, link in self.replica.send_links.items(): + codeword = Codeword( + req, + self.cluster_size, + self.m, + {(p % self.cluster_size) for p in range(peer, peer + self.l)}, + ) + link.send(self.AcceptMsg(slot, codeword)) + + me = self.replica.rid + codeword = Codeword( + req, + self.cluster_size, + self.m, + {(p % self.cluster_size) for p in range(me, me + self.l)}, + ) + self.replica.disk_dev.write(self.AcceptMsg(slot, codeword)) + + yield from [] + + def handle_disk_saved(self, mark): + if not mark.startswith("a-"): + raise RuntimeError(f"unrecognized ent mark: {mark}") + slot = int(mark[2:]) + assert slot < len(self.insts) + + if self.insts[slot].from_peer < 0: + # disk save on leader + self.insts[slot].num_replies += 1 + + if ( + not 
self.insts[slot].client_acked + and self.insts[slot].num_replies >= self.q + ): + self.ack_client_reqs(slot) + + else: + # disk save on follower + self.replica.send_links[self.insts[slot].from_peer].send( + self.AcceptReply(slot) + ) + + yield from [] + + def handle_net_recved(self, peer, msg): + if msg.mark.startswith("a-"): + # net recv on follower + slot = int(msg.mark[2:]) + while slot >= len(self.insts): + self.insts.append(self.Instance()) + self.insts[slot].from_peer = peer + self.insts[slot].req = msg.shard + + self.replica.disk_dev.write(self.AcceptMsg(slot, msg.shard)) + + elif msg.mark.startswith("r-"): + # net recv on leader + slot = int(msg.mark[2:]) + assert slot < len(self.insts) + self.insts[slot].num_replies += 1 + + if ( + not self.insts[slot].client_acked + and self.insts[slot].num_replies >= self.q + ): + self.ack_client_reqs(slot) + + else: + raise RuntimeError(f"unrecognized msg mark: {msg.mark}") + + yield from [] + + def ack_client_reqs(self, slot): + assert not self.insts[slot].client_acked + req = self.insts[slot].req + + # update perf records + # assert req.mark in self.curr_reqs + # lat = self.replica.env.now - self.curr_reqs[req.mark] + # self.update_perf_number(lat) + # del self.curr_reqs[req.mark] + + self.replica.extl_api.ack(req.cid, req.mark) self.insts[slot].client_acked = True @@ -408,11 +718,19 @@ def add_ack(self, mark): self.total_acks += 1 self.ack_times[mark] = self.env.now - def display(self, chunk_time): + def summary(self): lats = [self.ack_times[m] - self.req_times[m] for m in self.ack_times] - avg_tput = len(lats) / chunk_time + lats.sort() + assert len(lats) > 100 + + chunk_cnt = len(lats) + med_lat = lats[len(lats) // 2] + + lats = lats[:-100] avg_lat = sum(lats) / len(lats) if len(lats) > 0 else 0.0 - return f"{avg_tput:>9.2f} {avg_lat:>9.2f} {len(lats):>7d} {self.total_acks:>8d} / {self.total_sent:<8d}" + std_lat = statistics.stdev(lats) + + return (med_lat, avg_lat, std_lat, chunk_cnt, self.total_acks, self.total_sent) def clear(self): for mark in self.ack_times: @@ -421,7 +739,7 @@ def clear(self): class Client: - def __init__(self, env, cluster, cid, freq, vsize, chunk_time): + def __init__(self, env, cluster, cid, freq, vsize): self.env = env self.cid = cid self.service = cluster @@ -429,14 +747,11 @@ def __init__(self, env, cluster, cid, freq, vsize, chunk_time): self.gap = 1.0 / freq self.vsize = vsize + self.stats = Stats(env) self.mark = 0 self.tick = simpy.Container(env, capacity=1) - self.stats = Stats(env) - self.last_print = 0 - self.chunk_time = chunk_time - self.env.process(self.ticker()) def ticker(self): @@ -450,15 +765,12 @@ def new_req(self): self.mark += 1 return SendNewReq(self.mark) - def loop(self): + def loop(self, num_reqs=None): events = { "req": self.env.process(self.new_req()), "ack": self.env.process(self.ack_link.recv()), } - print( - f"{'Time':>5s}: {'Tput':>9s} {'Lat':>9s} {'Chunk':>7s} {'Reply':>8s} / {'Total':<8s}" - ) while True: # could get multiple completed triggers at this yield conds = yield self.env.any_of(events.values()) @@ -469,7 +781,11 @@ def loop(self): mark = event.value self.req_link.send(Req(self.cid, mark, self.vsize)) self.stats.add_req(mark) - events["req"] = self.env.process(self.new_req()) + # if num_reqs given, only issue this many reqs + if num_reqs is None or self.stats.total_sent < num_reqs: + events["req"] = self.env.process(self.new_req()) + else: + del events["req"] elif event.enum == EType.NetRecved: mark = event.value.mark @@ -479,14 +795,14 @@ def loop(self): else: raise 
RuntimeError(f"unrecognized event type: {event}") - # print chunk-average stats - if self.env.now - self.last_print > self.chunk_time: - print(f"{self.env.now:>5.1f}: {self.stats.display(self.chunk_time)}") - self.stats.clear() - self.last_print = self.env.now + # if num_reqs given, only issue this many reqs + if num_reqs is not None and self.stats.total_acks == num_reqs: + break + + return self.stats - def start(self): - self.env.process(self.loop()) + def start(self, num_reqs=None): + return self.env.process(self.loop(num_reqs=num_reqs)) ################# @@ -494,31 +810,125 @@ def start(self): ################# +class HomoParams: + def __init__(self, num_replicas, api_ltv, disk_ltv, net_ltv, vsize): + self.num_replicas = num_replicas + + self.api_ltv = api_ltv + self.disk_ltv_map = {rid: disk_ltv for rid in range(num_replicas)} + self.net_ltv_map = dict() + for rid in range(num_replicas): + for peerid in range(rid + 1, num_replicas): + self.net_ltv_map[(rid, peerid)] = net_ltv + + # NOTE: a "cheating" approach to adaptiveness + shards_per_replica = 1 + if net_ltv[0] >= 10: + shards_per_replica = 3 + elif net_ltv[0] >= 5: + if vsize <= 1900 * 1000: + shards_per_replica = 3 + elif vsize <= 2300 * 1000: + shards_per_replica = 2 + + self.protocol_configs = [ + (MultiPaxos, {"cluster_size": num_replicas}), + (RSPaxos, {"cluster_size": num_replicas, "same_liveness": True}), + (RSPaxos, {"cluster_size": num_replicas, "same_liveness": False}), + ( + Crossword, + { + "cluster_size": num_replicas, + "shards_per_replica": shards_per_replica, + }, + ), + ] + + self.vsize = vsize + + +class ParamsLatBounded(HomoParams): + def __init__(self, num_replicas, vsize): + api_ltv = (1, 1, 1, 1) + disk_ltv = (2, 0.5, 20, 1.5) + net_ltv = (10, 2.5, 20, 1.5) + super().__init__(num_replicas, api_ltv, disk_ltv, net_ltv, vsize) + + +class ParamsTputBounded(HomoParams): + def __init__(self, num_replicas, vsize): + api_ltv = (1, 1, 1, 1) + disk_ltv = (0.1, 10, 20, 1.5) + net_ltv = (0.5, 50, 20, 1.5) + super().__init__(num_replicas, api_ltv, disk_ltv, net_ltv, vsize) + + +class ParamsLatTputMix(HomoParams): + def __init__(self, num_replicas, vsize): + api_ltv = (1, 1, 1, 1) + disk_ltv = (1, 5, 20, 1.5) + net_ltv = (5, 25, 20, 1.5) + super().__init__(num_replicas, api_ltv, disk_ltv, net_ltv, vsize) + + +def simulate(params): + results = dict() + for protocol, protocol_args in params.protocol_configs: + env = simpy.Environment() + cluster = Cluster( + env, + params.num_replicas, + params.api_ltv, + params.disk_ltv_map, + params.net_ltv_map, + protocol, + **protocol_args, + ) + client = Client(env, cluster, 2957, freq=0.002, vsize=params.vsize) + + cluster.launch() + done = client.start(num_reqs=1000) + stats = env.run(until=done) + + med_lat, avg_lat, std_lat, _, _, _ = stats.summary() + name_str = protocol.name_str(**protocol_args) + results[name_str] = (med_lat, avg_lat, std_lat) + + return results + + if __name__ == "__main__": - num_replicas = 5 - api_ltvb = (1, 1, 0, 3) - disk_ltv_map = {rid: (1, 1, 0) for rid in range(num_replicas)} - net_ltv_map = dict() - for rid in range(num_replicas): - for peerid in range(rid + 1, num_replicas): - net_ltv_map[(rid, peerid)] = (1, 1, 0) - freq = 1 - vsize = 1 - - env = simpy.Environment() - - cluster = Cluster( - env, - 5, - api_ltvb, - disk_ltv_map, - net_ltv_map, - MultiPaxos, - quorum_size=3, - ) - cluster.launch() + random.seed() + + # TODO: real adaptiveness design + print("NOTE: adaptiveness hardcoded for 5!") + + # for num_replicas in (3, 5, 7, 9): + for num_replicas 
in (5,): + results = { + "vsizes": [], + "lat_bounded": [], + "tput_bounded": [], + "lat_tput_mix": [], + } + + vsizes = [v * 1000 for v in (2**p for p in range(3, 11))] + vsizes += [v * 1000 for v in (100 * i for i in range(1, 51))] + vsizes.sort() - client = Client(env, cluster, 2957, freq, vsize, 10) - client.start() + for vsize in vsizes: + results["vsizes"].append(vsize) + results["lat_bounded"].append( + simulate(ParamsLatBounded(num_replicas, vsize)) + ) + results["tput_bounded"].append( + simulate(ParamsTputBounded(num_replicas, vsize)) + ) + results["lat_tput_mix"].append( + simulate(ParamsLatTputMix(num_replicas, vsize)) + ) + print(f"Ran: {num_replicas} {vsize // 1000}") - env.run(until=60) + with open(f"results/sim.x_vsize.r_{num_replicas}.pkl", "wb") as fpkl: + pickle.dump(results, fpkl) + print(f"Dumped: {num_replicas}") diff --git a/models/plot_sim_results.py b/models/plot_sim_results.py new file mode 100644 index 00000000..18e6c6f1 --- /dev/null +++ b/models/plot_sim_results.py @@ -0,0 +1,105 @@ +import matplotlib + +matplotlib.use("Agg") + +import pickle +import math +import matplotlib.pyplot as plt + + +def protocol_style(protocol, cluster_size): + m = cluster_size // 2 + 1 + f = cluster_size - m + if "MultiPaxos" in protocol: + return ("-", "dimgray", "s", f"MultiPaxos/Raft\nf={f} |Q|={m} l={m}") + elif "RS-Paxos" in protocol: + if "forced" in protocol: + return ( + "-", + "red", + "x", + f"RS-Paxos/CRaft (f-forced)\nf={f} |Q|={cluster_size} l=1", + ) + else: + q = math.ceil((cluster_size + m) // 2) + lower_f = q - m + return ( + ":", + "orange", + "x", + f"RS-Paxos/CRaft (original)\nf={lower_f} |Q|={q} l=1", + ) + elif "Crossword" in protocol: + return ("-", "steelblue", "o", f"Crossword\nf={f} |Q|,l=adaptive") + else: + raise RuntimeError(f"unrecognized protocol {protocol}") + + +def params_display(params): + if params == "lat_bounded": + return "Latency bounded" + elif params == "tput_bounded": + return "Throughput bounded" + elif params == "lat_tput_mix": + return "Both moderate" + else: + raise RuntimeError(f"unrecognized params {params}") + + +def plot_x_vsize(num_replicas, results): + matplotlib.rcParams.update( + { + "figure.figsize": (11, 3), + "font.size": 10, + } + ) + + plt.figure() + + xs = list(map(lambda s: s / 1000, results["vsizes"])) + protocols = results["lat_bounded"][0].keys() + + for idx, params in enumerate(("lat_bounded", "lat_tput_mix", "tput_bounded")): + plt.subplot(131 + idx) + + for protocol in protocols: + ys = [r[protocol][0] for r in results[params]] + yerrs = [r[protocol][2] for r in results[params]] + linestyle, color, marker, label = protocol_style(protocol, num_replicas) + + plt.errorbar( + xs, + ys, + # yerr=yerrs, + label=label, + linestyle=linestyle, + linewidth=2, + color=color, + # marker=marker, + # markersize=3, + ecolor="darkgray", + elinewidth=1, + capsize=2, + ) + + plt.ylim(0, 420) + + plt.xlabel("Instance size (kB)") + plt.ylabel("Response time (ms)") + + title = params_display(params) + plt.title(title) + + plt.legend(loc="center left", bbox_to_anchor=(1.1, 0.5), labelspacing=1.2) + + plt.tight_layout() + + plt.savefig(f"results/sim.x_vsize.r_{num_replicas}.png", dpi=300) + plt.close() + + +if __name__ == "__main__": + for num_replicas in (5,): + with open(f"results/sim.x_vsize.r_{num_replicas}.pkl", "rb") as fpkl: + results = pickle.load(fpkl) + plot_x_vsize(num_replicas, results) From 9e7ad6be2fd5bd7733f62f26ad88c5aa083c44e4 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Mon, 2 Oct 2023 17:06:18 -0500 Subject: [PATCH 
72/89] polish constraint boundary figure --- models/plot_cstr_bounds.py | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/models/plot_cstr_bounds.py b/models/plot_cstr_bounds.py index a9c49cfc..30a6335c 100644 --- a/models/plot_cstr_bounds.py +++ b/models/plot_cstr_bounds.py @@ -70,6 +70,14 @@ def plot_cstr_bound(idx, cluster_size): ys = [m, m + 1, 2, 1] plt.fill(xs, ys, color=fill_color, label="Region of fault-tolerance=f", zorder=0) + # unused x-axis range + if cluster_size < CLUSTER_SIZES[-1]: + xs = [n + 0.9, X_TICKS[-1] + 0.35, X_TICKS[-1] + 0.35, n + 0.8] + ys = [0.3, 0.3, 0, 0] + plt.fill( + xs, ys, hatch="///", fill=False, edgecolor=None, linewidth=0, zorder=10 + ) + # latency & throughput optimized arrows plt.arrow( m + 0.3, @@ -117,22 +125,27 @@ def plot_cstr_bound(idx, cluster_size): plt.xlim((0, X_TICKS[-1] + 0.7)) plt.ylim((0, Y_TICKS[-1] + 2.7)) - plt.xticks(X_TICKS, list(map(str, X_TICKS))) + plt.xticks(X_TICKS[:cluster_size], list(map(str, X_TICKS))[:cluster_size]) plt.yticks(Y_TICKS, list(map(str, Y_TICKS))) plt.xlabel("|Quorum|", loc="right") plt.ylabel("#Shards\n/replica", loc="top", rotation=0, backgroundcolor="white") - ax.xaxis.set_label_coords(1.05, -0.18) + if idx < 2: + ax.xaxis.set_label_coords(1.05, -0.1) + else: + ax.xaxis.set_label_coords(1.05, -0.18) ax.yaxis.set_label_coords(0.2, 0.8) - plt.title( - f"|Cluster|={n} f={f}", - x=0.3, - y=-0.38, - fontsize=10, - fontweight="bold", - backgroundcolor=fill_color, - ) + # plt.title( + # f"|Cluster|={n} f={f}", + # x=0.5, + # y=-0.48, + # fontsize=11, + # # fontweight="bold", + # # backgroundcolor=fill_color, + # ) + plt.text(2.2, -3.2, f"|Cluster|={n} f={f}", fontsize=11) + plt.text(1, -3.2, "▬", fontsize=11, color=line_color) return ax @@ -182,7 +195,7 @@ def make_legend_polygon( sorted_handles, sorted_labels, loc="lower center", - bbox_to_anchor=(0.5, 0.78), + bbox_to_anchor=(0.5, 0.81), ncol=len(handles), handlelength=1.5, handletextpad=0.5, @@ -214,7 +227,7 @@ def plot_all_cstr_bounds(): # single legend group on top make_legend(fig, handles, labels) - plt.tight_layout() + plt.tight_layout(pad=1.0) plt.savefig(f"results/cstr_bounds.png", dpi=300) From c48cc82a5b676db4301c23e805b0336003ad2208 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Tue, 3 Oct 2023 18:06:47 -0500 Subject: [PATCH 73/89] fixing commit_bar and exec_bar bugs --- src/manager/clusman.rs | 18 +++--- src/protocols/crossword.rs | 107 +++++++++++++++++++++-------------- src/protocols/multipaxos.rs | 60 ++++++++++++-------- src/protocols/rep_nothing.rs | 2 + src/protocols/rs_paxos.rs | 91 +++++++++++++++-------------- src/protocols/simple_push.rs | 2 + 6 files changed, 164 insertions(+), 116 deletions(-) diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index a21ef9c7..89f2700d 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -186,19 +186,12 @@ impl ClusterManager { protocol); } - // tell it to connect to all existing known servers + // gather the list of all existing known servers let to_peers: HashMap = self .server_info .iter() .map(|(&server, info)| (server, info.p2p_addr)) .collect(); - self.server_reigner.send_ctrl( - CtrlMsg::ConnectToPeers { - population: self.population, - to_peers, - }, - server, - )?; // save new server's info self.server_info.insert( @@ -211,6 +204,15 @@ impl ClusterManager { start_slot: 0, }, ); + + // tell it to connect to all other existing known servers + self.server_reigner.send_ctrl( + CtrlMsg::ConnectToPeers { + population: 
self.population, + to_peers, + }, + server, + )?; Ok(()) } diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 95f044c1..11a76c6a 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -330,6 +330,7 @@ pub struct CrosswordReplica { // CrosswordReplica common helpers impl CrosswordReplica { /// Create an empty null instance. + #[inline] fn null_instance(&self) -> Result { Ok(Instance { bal: 0, @@ -352,18 +353,32 @@ impl CrosswordReplica { }) } + /// Locate the first null slot or append a null instance if no holes exist. + fn first_null_slot(&mut self) -> Result { + for s in self.commit_bar..(self.start_slot + self.insts.len()) { + if self.insts[s - self.start_slot].status == Status::Null { + return Ok(s); + } + } + self.insts.push(self.null_instance()?); + Ok(self.start_slot + self.insts.len() - 1) + } + /// Compose a unique ballot number from base. + #[inline] fn make_unique_ballot(&self, base: u64) -> Ballot { ((base << 8) | ((self.id + 1) as u64)) as Ballot } /// Compose a unique ballot number greater than the given one. + #[inline] fn make_greater_ballot(&self, bal: Ballot) -> Ballot { self.make_unique_ballot((bal >> 8) + 1) } /// Compose LogActionId from slot index & entry type. /// Uses the `Status` enum type to represent differnet entry types. + #[inline] fn make_log_action_id(slot: usize, entry_type: Status) -> LogActionId { let type_num = match entry_type { Status::Preparing => 1, @@ -375,6 +390,7 @@ impl CrosswordReplica { } /// Decompose LogActionId into slot index & entry type. + #[inline] fn split_log_action_id(log_action_id: LogActionId) -> (usize, Status) { let slot = (log_action_id >> 2) as usize; let type_num = log_action_id & ((1 << 2) - 1); @@ -388,6 +404,7 @@ impl CrosswordReplica { } /// Compose CommandId from slot index & command index within. + #[inline] fn make_command_id(slot: usize, cmd_idx: usize) -> CommandId { assert!(slot <= (u32::MAX as usize)); assert!(cmd_idx <= (u32::MAX as usize)); @@ -395,6 +412,7 @@ impl CrosswordReplica { } /// Decompose CommandId into slot index & command index within. + #[inline] fn split_command_id(command_id: CommandId) -> (usize, usize) { let slot = (command_id >> 32) as usize; let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; @@ -402,12 +420,17 @@ impl CrosswordReplica { } /// TODO: maybe remove this. + #[inline] fn shards_for_replica( + slot: usize, id: ReplicaId, population: u8, num_shards: u8, ) -> Vec { - (id..(id + num_shards)).map(|i| (i % population)).collect() + let first: u8 = ((id as usize + slot) % population as usize) as u8; + (first..(first + num_shards)) + .map(|i| (i % population)) + .collect() } /// TODO: make better impl of this. 
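
For reference, a minimal Python sketch of the rotated shard assignment that the
updated `shards_for_replica` helper above computes (helper and variable names here
are illustrative only):

    def shards_for_replica(slot, rid, population, num_shards):
        # the starting shard rotates with the slot index, so shard ownership
        # spreads evenly across replicas over consecutive slots
        first = (rid + slot) % population
        return [(first + i) % population for i in range(num_shards)]

    # e.g. with population=5 and num_shards=3, replica 0 is assigned shards
    # [0, 1, 2] at slot 0 and [1, 2, 3] at slot 1
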
@@ -495,33 +518,18 @@ impl CrosswordReplica { reqs_cw.compute_parity(Some(&self.rs_coder))?; // create a new instance in the first null slot (or append a new one - // at the end if no holes exist) - let mut slot = self.start_slot + self.insts.len(); - for s in self.commit_bar..(self.start_slot + self.insts.len()) { - if self.insts[s - self.start_slot].status == Status::Null { - slot = s; - break; - } - } - if slot < self.start_slot + self.insts.len() { - let old_inst = &mut self.insts[slot - self.start_slot]; - assert_eq!(old_inst.status, Status::Null); - old_inst.reqs_cw = reqs_cw; - old_inst.leader_bk = Some(LeaderBookkeeping { - prepare_acks: Bitmap::new(self.population, false), - prepare_max_bal: 0, - accept_acks: HashMap::new(), - }); - } else { - let mut new_inst = self.null_instance()?; - new_inst.reqs_cw = reqs_cw; - new_inst.leader_bk = Some(LeaderBookkeeping { + // at the end if no holes exist); fill it up with incoming data + let slot = self.first_null_slot()?; + { + let inst = &mut self.insts[slot - self.start_slot]; + assert_eq!(inst.status, Status::Null); + inst.reqs_cw = reqs_cw; + inst.leader_bk = Some(LeaderBookkeeping { prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, accept_acks: HashMap::new(), }); - new_inst.external = true; - self.insts.push(new_inst); + inst.external = true; } // decide whether we can enter fast path for this instance @@ -578,6 +586,7 @@ impl CrosswordReplica { Bitmap::from( self.population, Self::shards_for_replica( + slot, self.id, self.population, self.config.shards_per_replica, @@ -615,6 +624,7 @@ impl CrosswordReplica { Bitmap::from( self.population, Self::shards_for_replica( + slot, peer, self.population, self.config.shards_per_replica, @@ -732,6 +742,8 @@ impl CrosswordReplica { if inst.status < Status::Committed { break; } + let now_slot = self.commit_bar; + self.commit_bar += 1; if inst.reqs_cw.avail_shards() < self.quorum_cnt { // can't execute if I don't have the complete request batch @@ -752,7 +764,7 @@ impl CrosswordReplica { for (cmd_idx, (_, req)) in reqs.iter().enumerate() { if let ApiRequest::Req { cmd, .. 
} = req { self.state_machine.submit_cmd( - Self::make_command_id(self.commit_bar, cmd_idx), + Self::make_command_id(now_slot, cmd_idx), cmd.clone(), )?; } else { @@ -760,10 +772,8 @@ impl CrosswordReplica { } } pf_trace!(self.id; "submitted {} exec commands for slot {}", - reqs.len(), self.commit_bar); + reqs.len(), now_slot); } - - self.commit_bar += 1; } } @@ -934,6 +944,7 @@ impl CrosswordReplica { Bitmap::from( self.population, Self::shards_for_replica( + slot, self.id, self.population, self.config.shards_per_replica, @@ -969,6 +980,7 @@ impl CrosswordReplica { Bitmap::from( self.population, Self::shards_for_replica( + slot, peer, self.population, self.config.shards_per_replica, @@ -1078,6 +1090,7 @@ impl CrosswordReplica { Bitmap::from( self.population, Self::shards_for_replica( + slot, peer, self.population, self.config.shards_per_replica, @@ -1220,7 +1233,6 @@ impl CrosswordReplica { peer, slot, ballot, reqs_cw.avail_shards_map()); assert!(slot < self.start_slot + self.insts.len()); assert!(self.insts[slot - self.start_slot].status >= Status::Committed); - let num_insts = self.start_slot + self.insts.len(); let inst = &mut self.insts[slot - self.start_slot]; // if reply not outdated and ballot is up-to-date @@ -1229,10 +1241,10 @@ impl CrosswordReplica { inst.reqs_cw.absorb_other(reqs_cw)?; // if enough shards have been gathered, can push execution forward - if slot == self.commit_bar { - while self.commit_bar < num_insts { - let inst = - &mut self.insts[self.commit_bar - self.start_slot]; + if slot == self.exec_bar { + let mut now_slot = self.exec_bar; + while now_slot < self.start_slot + self.insts.len() { + let inst = &mut self.insts[now_slot - self.start_slot]; if inst.status < Status::Committed || inst.reqs_cw.avail_shards() < self.quorum_cnt { @@ -1253,10 +1265,7 @@ impl CrosswordReplica { for (cmd_idx, (_, req)) in reqs.iter().enumerate() { if let ApiRequest::Req { cmd, .. } = req { self.state_machine.submit_cmd( - Self::make_command_id( - self.commit_bar, - cmd_idx, - ), + Self::make_command_id(now_slot, cmd_idx), cmd.clone(), )?; } else { @@ -1264,10 +1273,10 @@ impl CrosswordReplica { } } pf_trace!(self.id; "submitted {} exec commands for slot {}", - reqs.len(), self.commit_bar); + reqs.len(), now_slot); } - self.commit_bar += 1; + now_slot += 1; } } } @@ -1777,7 +1786,7 @@ impl CrosswordReplica { LogEntry::CommitSlot { slot } => { assert!(slot < self.start_slot + self.insts.len()); - // update instance state + // update instance status self.insts[slot - self.start_slot].status = Status::Committed; // submit commands in contiguously committed instance to the // state machine @@ -1788,6 +1797,8 @@ impl CrosswordReplica { if inst.status < Status::Committed { break; } + // update commit_bar + self.commit_bar += 1; // check number of available shards if inst.reqs_cw.avail_shards() < self.quorum_cnt { // can't execute if I don't have the complete request batch @@ -1808,9 +1819,9 @@ impl CrosswordReplica { let _ = self.state_machine.get_result().await?; } } - // update commit_bar and exec_bar - self.commit_bar += 1; + // update instance status and exec_bar self.exec_bar += 1; + inst.status = Status::Executed; } } } @@ -1863,6 +1874,10 @@ impl CrosswordReplica { offset_ok: true, .. 
} = log_result { + if self.log_offset > 0 { + pf_info!(self.id; "recovered from wal log: commit {} exec {}", + self.commit_bar, self.exec_bar); + } Ok(()) } else { logged_err!(self.id; "unexpected log result type or failed truncate") @@ -1964,12 +1979,12 @@ impl CrosswordReplica { /// NOTE: the current implementation does not guard against crashes in the /// middle of taking a snapshot. async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { + pf_debug!(self.id; "taking new snapshot: start {} exec {}", + self.start_slot, self.exec_bar); assert!(self.exec_bar >= self.start_slot); if self.exec_bar == self.start_slot { return Ok(()); } - pf_debug!(self.id; "taking new snapshot: start {} exec {}", - self.start_slot, self.exec_bar); // collect and dump all Puts in executed instances if self.is_leader { @@ -2055,6 +2070,10 @@ impl CrosswordReplica { self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { new_start: self.start_slot, })?; + + if self.start_slot > 0 { + pf_info!(self.id; "recovered from snapshot: start {}", self.start_slot); + } Ok(()) } diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 4783f4f7..85f01231 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -298,6 +298,7 @@ pub struct MultiPaxosReplica { // MultiPaxosReplica common helpers impl MultiPaxosReplica { /// Create an empty null instance. + #[inline] fn null_instance(&self) -> Instance { Instance { bal: 0, @@ -311,18 +312,32 @@ impl MultiPaxosReplica { } } + /// Locate the first null slot or append a null instance if no holes exist. + fn first_null_slot(&mut self) -> usize { + for s in self.commit_bar..(self.start_slot + self.insts.len()) { + if self.insts[s - self.start_slot].status == Status::Null { + return s; + } + } + self.insts.push(self.null_instance()); + self.start_slot + self.insts.len() - 1 + } + /// Compose a unique ballot number from base. + #[inline] fn make_unique_ballot(&self, base: u64) -> Ballot { ((base << 8) | ((self.id + 1) as u64)) as Ballot } /// Compose a unique ballot number greater than the given one. + #[inline] fn make_greater_ballot(&self, bal: Ballot) -> Ballot { self.make_unique_ballot((bal >> 8) + 1) } /// Compose LogActionId from slot index & entry type. /// Uses the `Status` enum type to represent differnet entry types. + #[inline] fn make_log_action_id(slot: usize, entry_type: Status) -> LogActionId { let type_num = match entry_type { Status::Preparing => 1, @@ -334,6 +349,7 @@ impl MultiPaxosReplica { } /// Decompose LogActionId into slot index & entry type. + #[inline] fn split_log_action_id(log_action_id: LogActionId) -> (usize, Status) { let slot = (log_action_id >> 2) as usize; let type_num = log_action_id & ((1 << 2) - 1); @@ -347,6 +363,7 @@ impl MultiPaxosReplica { } /// Compose CommandId from slot index & command index within. + #[inline] fn make_command_id(slot: usize, cmd_idx: usize) -> CommandId { assert!(slot <= (u32::MAX as usize)); assert!(cmd_idx <= (u32::MAX as usize)); @@ -354,6 +371,7 @@ impl MultiPaxosReplica { } /// Decompose CommandId into slot index & command index within. 
+ #[inline] fn split_command_id(command_id: CommandId) -> (usize, usize) { let slot = (command_id >> 32) as usize; let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; @@ -394,31 +412,18 @@ impl MultiPaxosReplica { } // create a new instance in the first null slot (or append a new one - // at the end if no holes exist) - let mut slot = self.start_slot + self.insts.len(); - for s in self.commit_bar..(self.start_slot + self.insts.len()) { - let old_inst = &mut self.insts[s - self.start_slot]; - if old_inst.status == Status::Null { - old_inst.reqs = req_batch.clone(); - old_inst.leader_bk = Some(LeaderBookkeeping { - prepare_acks: Bitmap::new(self.population, false), - prepare_max_bal: 0, - accept_acks: Bitmap::new(self.population, false), - }); - slot = s; - break; - } - } - if slot == self.start_slot + self.insts.len() { - let mut new_inst = self.null_instance(); - new_inst.reqs = req_batch.clone(); - new_inst.leader_bk = Some(LeaderBookkeeping { + // at the end if no holes exist); fill it up with incoming data + let slot = self.first_null_slot(); + { + let inst = &mut self.insts[slot - self.start_slot]; + assert_eq!(inst.status, Status::Null); + inst.reqs = req_batch.clone(); + inst.leader_bk = Some(LeaderBookkeeping { prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, accept_acks: Bitmap::new(self.population, false), }); - new_inst.external = true; - self.insts.push(new_inst); + inst.external = true; } // decide whether we can enter fast path for this instance @@ -1338,7 +1343,7 @@ impl MultiPaxosReplica { LogEntry::CommitSlot { slot } => { assert!(slot < self.start_slot + self.insts.len()); - // update instance state + // update instance status self.insts[slot - self.start_slot].status = Status::Committed; // submit commands in contiguously committed instance to the // state machine @@ -1358,9 +1363,10 @@ impl MultiPaxosReplica { let _ = self.state_machine.get_result().await?; } } - // update commit_bar and exec_bar + // update instance status, commit_bar and exec_bar self.commit_bar += 1; self.exec_bar += 1; + inst.status = Status::Executed; } } } @@ -1413,6 +1419,10 @@ impl MultiPaxosReplica { offset_ok: true, .. } = log_result { + if self.log_offset > 0 { + pf_info!(self.id; "recovered from wal log: commit {} exec {}", + self.commit_bar, self.exec_bar); + } Ok(()) } else { logged_err!(self.id; "unexpected log result type or failed truncate") @@ -1604,6 +1614,10 @@ impl MultiPaxosReplica { self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { new_start: self.start_slot, })?; + + if self.start_slot > 0 { + pf_info!(self.id; "recovered from snapshot: start {}", self.start_slot); + } Ok(()) } diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index 74ea31a7..af46cc69 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -109,6 +109,7 @@ pub struct RepNothingReplica { // RepNothingReplica common helpers impl RepNothingReplica { /// Compose CommandId from instance index & command index within. + #[inline] fn make_command_id(inst_idx: usize, cmd_idx: usize) -> CommandId { assert!(inst_idx <= (u32::MAX as usize)); assert!(cmd_idx <= (u32::MAX as usize)); @@ -116,6 +117,7 @@ impl RepNothingReplica { } /// Decompose CommandId into instance index & command index within. 
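// ---------------------------------------------------------------------------
// Aside: a minimal, self-contained sketch (not part of the patches above) of the
// bit-packing that the make_*/split_* helpers in these hunks perform. A Ballot
// keeps the proposer's replica ID (+1) in its low 8 bits, and a CommandId keeps
// the slot index in the high 32 bits with the command's index within the batch
// in the low 32 bits.
fn packing_example() {
    let id: u64 = 2; // this replica's ID
    let ballot = (5u64 << 8) | (id + 1); // make_unique_ballot(5)      -> 1283
    let greater = (((ballot >> 8) + 1) << 8) | (id + 1); // make_greater_ballot -> 1539
    assert_eq!((ballot, greater), (1283, 1539));

    let command_id = (7u64 << 32) | 4; // make_command_id(slot 7, cmd_idx 4)
    assert_eq!(command_id >> 32, 7); // split_command_id: slot
    assert_eq!(command_id & ((1 << 32) - 1), 4); // split_command_id: cmd_idx
}
// ---------------------------------------------------------------------------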
+ #[inline] fn split_command_id(command_id: CommandId) -> (usize, usize) { let inst_idx = (command_id >> 32) as usize; let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 5d2fb450..73e0efac 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -313,6 +313,7 @@ pub struct RSPaxosReplica { // RSPaxosReplica common helpers impl RSPaxosReplica { /// Create an empty null instance. + #[inline] fn null_instance(&self) -> Result { Ok(Instance { bal: 0, @@ -335,18 +336,32 @@ impl RSPaxosReplica { }) } + /// Locate the first null slot or append a null instance if no holes exist. + fn first_null_slot(&mut self) -> Result { + for s in self.commit_bar..(self.start_slot + self.insts.len()) { + if self.insts[s - self.start_slot].status == Status::Null { + return Ok(s); + } + } + self.insts.push(self.null_instance()?); + Ok(self.start_slot + self.insts.len() - 1) + } + /// Compose a unique ballot number from base. + #[inline] fn make_unique_ballot(&self, base: u64) -> Ballot { ((base << 8) | ((self.id + 1) as u64)) as Ballot } /// Compose a unique ballot number greater than the given one. + #[inline] fn make_greater_ballot(&self, bal: Ballot) -> Ballot { self.make_unique_ballot((bal >> 8) + 1) } /// Compose LogActionId from slot index & entry type. /// Uses the `Status` enum type to represent differnet entry types. + #[inline] fn make_log_action_id(slot: usize, entry_type: Status) -> LogActionId { let type_num = match entry_type { Status::Preparing => 1, @@ -358,6 +373,7 @@ impl RSPaxosReplica { } /// Decompose LogActionId into slot index & entry type. + #[inline] fn split_log_action_id(log_action_id: LogActionId) -> (usize, Status) { let slot = (log_action_id >> 2) as usize; let type_num = log_action_id & ((1 << 2) - 1); @@ -371,6 +387,7 @@ impl RSPaxosReplica { } /// Compose CommandId from slot index & command index within. + #[inline] fn make_command_id(slot: usize, cmd_idx: usize) -> CommandId { assert!(slot <= (u32::MAX as usize)); assert!(cmd_idx <= (u32::MAX as usize)); @@ -378,6 +395,7 @@ impl RSPaxosReplica { } /// Decompose CommandId into slot index & command index within. 
+ #[inline] fn split_command_id(command_id: CommandId) -> (usize, usize) { let slot = (command_id >> 32) as usize; let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; @@ -426,33 +444,18 @@ impl RSPaxosReplica { reqs_cw.compute_parity(Some(&self.rs_coder))?; // create a new instance in the first null slot (or append a new one - // at the end if no holes exist) - let mut slot = self.start_slot + self.insts.len(); - for s in self.commit_bar..(self.start_slot + self.insts.len()) { - if self.insts[s - self.start_slot].status == Status::Null { - slot = s; - break; - } - } - if slot < self.start_slot + self.insts.len() { - let old_inst = &mut self.insts[slot - self.start_slot]; - assert_eq!(old_inst.status, Status::Null); - old_inst.reqs_cw = reqs_cw; - old_inst.leader_bk = Some(LeaderBookkeeping { - prepare_acks: Bitmap::new(self.population, false), - prepare_max_bal: 0, - accept_acks: Bitmap::new(self.population, false), - }); - } else { - let mut new_inst = self.null_instance()?; - new_inst.reqs_cw = reqs_cw; - new_inst.leader_bk = Some(LeaderBookkeeping { + // at the end if no holes exist); fill it up with incoming data + let slot = self.first_null_slot()?; + { + let inst = &mut self.insts[slot - self.start_slot]; + assert_eq!(inst.status, Status::Null); + inst.reqs_cw = reqs_cw; + inst.leader_bk = Some(LeaderBookkeeping { prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, accept_acks: Bitmap::new(self.population, false), }); - new_inst.external = true; - self.insts.push(new_inst); + inst.external = true; } // decide whether we can enter fast path for this instance @@ -648,6 +651,8 @@ impl RSPaxosReplica { if inst.status < Status::Committed { break; } + let now_slot = self.commit_bar; + self.commit_bar += 1; if inst.reqs_cw.avail_shards() < self.quorum_cnt { // can't execute if I don't have the complete request batch @@ -668,7 +673,7 @@ impl RSPaxosReplica { for (cmd_idx, (_, req)) in reqs.iter().enumerate() { if let ApiRequest::Req { cmd, .. } = req { self.state_machine.submit_cmd( - Self::make_command_id(self.commit_bar, cmd_idx), + Self::make_command_id(now_slot, cmd_idx), cmd.clone(), )?; } else { @@ -676,10 +681,8 @@ impl RSPaxosReplica { } } pf_trace!(self.id; "submitted {} exec commands for slot {}", - reqs.len(), self.commit_bar); + reqs.len(), now_slot); } - - self.commit_bar += 1; } } @@ -1102,7 +1105,6 @@ impl RSPaxosReplica { peer, slot, ballot, reqs_cw.avail_shards_map()); assert!(slot < self.start_slot + self.insts.len()); assert!(self.insts[slot - self.start_slot].status >= Status::Committed); - let num_insts = self.start_slot + self.insts.len(); let inst = &mut self.insts[slot - self.start_slot]; // if reply not outdated and ballot is up-to-date @@ -1111,10 +1113,10 @@ impl RSPaxosReplica { inst.reqs_cw.absorb_other(reqs_cw)?; // if enough shards have been gathered, can push execution forward - if slot == self.commit_bar { - while self.commit_bar < num_insts { - let inst = - &mut self.insts[self.commit_bar - self.start_slot]; + if slot == self.exec_bar { + let mut now_slot = self.exec_bar; + while now_slot < self.start_slot + self.insts.len() { + let inst = &mut self.insts[now_slot - self.start_slot]; if inst.status < Status::Committed || inst.reqs_cw.avail_shards() < self.quorum_cnt { @@ -1135,10 +1137,7 @@ impl RSPaxosReplica { for (cmd_idx, (_, req)) in reqs.iter().enumerate() { if let ApiRequest::Req { cmd, .. 
} = req { self.state_machine.submit_cmd( - Self::make_command_id( - self.commit_bar, - cmd_idx, - ), + Self::make_command_id(now_slot, cmd_idx), cmd.clone(), )?; } else { @@ -1146,10 +1145,10 @@ impl RSPaxosReplica { } } pf_trace!(self.id; "submitted {} exec commands for slot {}", - reqs.len(), self.commit_bar); + reqs.len(), now_slot); } - self.commit_bar += 1; + now_slot += 1; } } } @@ -1571,7 +1570,7 @@ impl RSPaxosReplica { LogEntry::CommitSlot { slot } => { assert!(slot < self.start_slot + self.insts.len()); - // update instance state + // update instance status self.insts[slot - self.start_slot].status = Status::Committed; // submit commands in contiguously committed instance to the // state machine @@ -1582,6 +1581,8 @@ impl RSPaxosReplica { if inst.status < Status::Committed { break; } + // update commit_bar + self.commit_bar += 1; // check number of available shards if inst.reqs_cw.avail_shards() < self.quorum_cnt { // can't execute if I don't have the complete request batch @@ -1602,9 +1603,9 @@ impl RSPaxosReplica { let _ = self.state_machine.get_result().await?; } } - // update commit_bar and exec_bar - self.commit_bar += 1; + // update instance status and exec_bar self.exec_bar += 1; + inst.status = Status::Executed; } } } @@ -1657,6 +1658,10 @@ impl RSPaxosReplica { offset_ok: true, .. } = log_result { + if self.log_offset > 0 { + pf_info!(self.id; "recovered from wal log: commit {} exec {}", + self.commit_bar, self.exec_bar); + } Ok(()) } else { logged_err!(self.id; "unexpected log result type or failed truncate") @@ -1849,6 +1854,10 @@ impl RSPaxosReplica { self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { new_start: self.start_slot, })?; + + if self.start_slot > 0 { + pf_info!(self.id; "recovered from snapshot: start {}", self.start_slot); + } Ok(()) } diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index c6a283c4..a0345d7e 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -141,6 +141,7 @@ pub struct SimplePushReplica { // SimplePushReplica common helpers impl SimplePushReplica { /// Compose CommandId from instance index & command index within. + #[inline] fn make_command_id(inst_idx: usize, cmd_idx: usize) -> CommandId { assert!(inst_idx <= (u32::MAX as usize)); assert!(cmd_idx <= (u32::MAX as usize)); @@ -148,6 +149,7 @@ impl SimplePushReplica { } /// Decompose CommandId into instance index & command index within. + #[inline] fn split_command_id(command_id: CommandId) -> (usize, usize) { let inst_idx = (command_id >> 32) as usize; let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; From f30dd2b7cc3befb72b3e55df979946a399e741d5 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Tue, 3 Oct 2023 23:02:33 -0500 Subject: [PATCH 74/89] fixed snapshotting commit_bar bug --- src/protocols/crossword.rs | 56 ++++++++++++++++++++++---- src/protocols/multipaxos.rs | 56 ++++++++++++++++++++++---- src/protocols/rs_paxos.rs | 56 ++++++++++++++++++++++---- summerset_client/src/clients/tester.rs | 21 ++++++++++ 4 files changed, 165 insertions(+), 24 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 11a76c6a..9c5aa4d6 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -192,9 +192,14 @@ enum LogEntry { /// Snapshot file entry type. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum SnapEntry { - /// First entry at the start of file: number of log instances covered by - /// this snapshot file == the start slot index of in-mem log. 
- StartSlot { slot: usize }, + /// Necessary slot indices to remember. + SlotInfo { + /// First entry at the start of file: number of log instances covered + /// by this snapshot file == the start slot index of in-mem log. + start_slot: usize, + /// Index of the first non-committed slot. + commit_bar: usize, + }, /// Set of key-value pairs to apply to the state. KVPairSet { pairs: HashMap }, @@ -1977,7 +1982,8 @@ impl CrosswordReplica { /// to that index as well as outdate entries in the durable WAL log file. /// /// NOTE: the current implementation does not guard against crashes in the - /// middle of taking a snapshot. + /// middle of taking a snapshot. Production quality implementations should + /// make the snapshotting action "atomic". async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { pf_debug!(self.id; "taking new snapshot: start {} exec {}", self.start_slot, self.exec_bar); @@ -1993,6 +1999,28 @@ impl CrosswordReplica { } self.snapshot_dump_kv_pairs().await?; + // write new slot info entry to the head of snapshot + self.snapshot_hub.submit_action( + 0, + LogAction::Write { + entry: SnapEntry::SlotInfo { + start_slot: self.exec_bar, + commit_bar: self.commit_bar, + }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + match log_result { + LogResult::Write { + offset_ok: true, .. + } => {} + _ => { + return logged_err!(self.id; "unexpected log result type or failed truncate"); + } + } + // update start_slot and discard all in-memory log instances up to exec_bar self.insts.drain(0..(self.exec_bar - self.start_slot)); self.start_slot = self.exec_bar; @@ -2023,11 +2051,19 @@ impl CrosswordReplica { match log_result { LogResult::Read { - entry: Some(SnapEntry::StartSlot { slot }), + entry: + Some(SnapEntry::SlotInfo { + start_slot, + commit_bar, + }), end_offset, } => { self.snap_offset = end_offset; - self.start_slot = slot; // get start slot index of in-mem log + + // recover necessary slot indices info + self.start_slot = start_slot; + self.commit_bar = commit_bar; + self.exec_bar = start_slot; // repeatedly apply key-value pairs loop { @@ -2072,7 +2108,8 @@ impl CrosswordReplica { })?; if self.start_slot > 0 { - pf_info!(self.id; "recovered from snapshot: start {}", self.start_slot); + pf_info!(self.id; "recovered from snapshot: start {} commit {} exec {}", + self.start_slot, self.commit_bar, self.exec_bar); } Ok(()) } @@ -2082,7 +2119,10 @@ impl CrosswordReplica { self.snapshot_hub.submit_action( 0, LogAction::Write { - entry: SnapEntry::StartSlot { slot: 0 }, + entry: SnapEntry::SlotInfo { + start_slot: 0, + commit_bar: 0, + }, offset: 0, sync: self.config.logger_sync, }, diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 85f01231..fbcfc682 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -176,9 +176,14 @@ enum LogEntry { /// Snapshot file entry type. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum SnapEntry { - /// First entry at the start of file: number of log instances covered by - /// this snapshot file == the start slot index of in-mem log. - StartSlot { slot: usize }, + /// Necessary slot indices to remember. + SlotInfo { + /// First entry at the start of file: number of log instances covered + /// by this snapshot file == the start slot index of in-mem log. + start_slot: usize, + /// Index of the first non-committed slot. + commit_bar: usize, + }, /// Set of key-value pairs to apply to the state. 
KVPairSet { pairs: HashMap }, @@ -1521,7 +1526,8 @@ impl MultiPaxosReplica { /// to that index as well as outdate entries in the durable WAL log file. /// /// NOTE: the current implementation does not guard against crashes in the - /// middle of taking a snapshot. + /// middle of taking a snapshot. Production quality implementations should + /// make the snapshotting action "atomic". async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { pf_debug!(self.id; "taking new snapshot: start {} exec {}", self.start_slot, self.exec_bar); @@ -1537,6 +1543,28 @@ impl MultiPaxosReplica { } self.snapshot_dump_kv_pairs().await?; + // write new slot info entry to the head of snapshot + self.snapshot_hub.submit_action( + 0, + LogAction::Write { + entry: SnapEntry::SlotInfo { + start_slot: self.exec_bar, + commit_bar: self.commit_bar, + }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + match log_result { + LogResult::Write { + offset_ok: true, .. + } => {} + _ => { + return logged_err!(self.id; "unexpected log result type or failed truncate"); + } + } + // update start_slot and discard all in-memory log instances up to exec_bar self.insts.drain(0..(self.exec_bar - self.start_slot)); self.start_slot = self.exec_bar; @@ -1567,11 +1595,19 @@ impl MultiPaxosReplica { match log_result { LogResult::Read { - entry: Some(SnapEntry::StartSlot { slot }), + entry: + Some(SnapEntry::SlotInfo { + start_slot, + commit_bar, + }), end_offset, } => { self.snap_offset = end_offset; - self.start_slot = slot; // get start slot index of in-mem log + + // recover necessary slot indices info + self.start_slot = start_slot; + self.commit_bar = commit_bar; + self.exec_bar = start_slot; // repeatedly apply key-value pairs loop { @@ -1616,7 +1652,8 @@ impl MultiPaxosReplica { })?; if self.start_slot > 0 { - pf_info!(self.id; "recovered from snapshot: start {}", self.start_slot); + pf_info!(self.id; "recovered from snapshot: start {} commit {} exec {}", + self.start_slot, self.commit_bar, self.exec_bar); } Ok(()) } @@ -1626,7 +1663,10 @@ impl MultiPaxosReplica { self.snapshot_hub.submit_action( 0, LogAction::Write { - entry: SnapEntry::StartSlot { slot: 0 }, + entry: SnapEntry::SlotInfo { + start_slot: 0, + commit_bar: 0, + }, offset: 0, sync: self.config.logger_sync, }, diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 73e0efac..3b5c25b2 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -178,9 +178,14 @@ enum LogEntry { /// Snapshot file entry type. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum SnapEntry { - /// First entry at the start of file: number of log instances covered by - /// this snapshot file == the start slot index of in-mem log. - StartSlot { slot: usize }, + /// Necessary slot indices to remember. + SlotInfo { + /// First entry at the start of file: number of log instances covered + /// by this snapshot file == the start slot index of in-mem log. + start_slot: usize, + /// Index of the first non-committed slot. + commit_bar: usize, + }, /// Set of key-value pairs to apply to the state. KVPairSet { pairs: HashMap }, @@ -1761,7 +1766,8 @@ impl RSPaxosReplica { /// to that index as well as outdate entries in the durable WAL log file. /// /// NOTE: the current implementation does not guard against crashes in the - /// middle of taking a snapshot. + /// middle of taking a snapshot. 
Production quality implementations should + /// make the snapshotting action "atomic". async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { pf_debug!(self.id; "taking new snapshot: start {} exec {}", self.start_slot, self.exec_bar); @@ -1777,6 +1783,28 @@ impl RSPaxosReplica { } self.snapshot_dump_kv_pairs().await?; + // write new slot info entry to the head of snapshot + self.snapshot_hub.submit_action( + 0, + LogAction::Write { + entry: SnapEntry::SlotInfo { + start_slot: self.exec_bar, + commit_bar: self.commit_bar, + }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + match log_result { + LogResult::Write { + offset_ok: true, .. + } => {} + _ => { + return logged_err!(self.id; "unexpected log result type or failed truncate"); + } + } + // update start_slot and discard all in-memory log instances up to exec_bar self.insts.drain(0..(self.exec_bar - self.start_slot)); self.start_slot = self.exec_bar; @@ -1807,11 +1835,19 @@ impl RSPaxosReplica { match log_result { LogResult::Read { - entry: Some(SnapEntry::StartSlot { slot }), + entry: + Some(SnapEntry::SlotInfo { + start_slot, + commit_bar, + }), end_offset, } => { self.snap_offset = end_offset; - self.start_slot = slot; // get start slot index of in-mem log + + // recover necessary slot indices info + self.start_slot = start_slot; + self.commit_bar = commit_bar; + self.exec_bar = start_slot; // repeatedly apply key-value pairs loop { @@ -1856,7 +1892,8 @@ impl RSPaxosReplica { })?; if self.start_slot > 0 { - pf_info!(self.id; "recovered from snapshot: start {}", self.start_slot); + pf_info!(self.id; "recovered from snapshot: start {} commit {} exec {}", + self.start_slot, self.commit_bar, self.exec_bar); } Ok(()) } @@ -1866,7 +1903,10 @@ impl RSPaxosReplica { self.snapshot_hub.submit_action( 0, LogAction::Write { - entry: SnapEntry::StartSlot { slot: 0 }, + entry: SnapEntry::SlotInfo { + start_slot: 0, + commit_bar: 0, + }, offset: 0, sync: self.config.logger_sync, }, diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index 378256b7..4fb021e0 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -467,6 +467,7 @@ impl ClientTester { self.checked_put("Jose", &v, Some(None), 0).await?; for (s, is_leader) in self.query_servers().await? { if !is_leader { + // picked a non-leader replica self.driver.leave(false).await?; self.reset_servers(HashSet::from([s]), true).await?; time::sleep(Duration::from_secs(1)).await; @@ -484,6 +485,7 @@ impl ClientTester { self.checked_put("Jose", &v, Some(None), 0).await?; for (s, is_leader) in self.query_servers().await? { if is_leader { + // picked a leader replica self.driver.leave(false).await?; self.reset_servers(HashSet::from([s]), true).await?; time::sleep(Duration::from_secs(1)).await; @@ -515,6 +517,7 @@ impl ClientTester { } } if resets.len() == 2 { + // picked two replicas, one leader and one non-leader self.driver.leave(false).await?; self.reset_servers(resets, true).await?; time::sleep(Duration::from_secs(1)).await; @@ -543,6 +546,7 @@ impl ClientTester { time::sleep(Duration::from_millis(500)).await; for (s, is_leader) in self.query_servers().await? 
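// ---------------------------------------------------------------------------
// Aside: a compact recap (illustrative, not part of the diffs above) of what the
// new SnapEntry::SlotInfo head entry buys during recovery. take_new_snapshot()
// writes it at offset 0 with start_slot = exec_bar, so a restarting replica can
// restore all three log cursors instead of only the start slot, e.g.:
//
//     on disk : SlotInfo { start_slot: 42, commit_bar: 57 }
//     recovery: self.start_slot = 42;  // in-mem log now begins at slot 42
//               self.exec_bar   = 42;  // slots below 42 were already executed
//               self.commit_bar = 57;  // committed-but-unexecuted cursor survives
//
// Persisting commit_bar alongside start_slot is the change this commit's
// "snapshotting commit_bar bug" fix introduces; previously only the start slot
// was written to the snapshot head.
// ---------------------------------------------------------------------------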
{ if !is_leader { + // picked a non-leader replica self.driver.leave(false).await?; self.pause_servers(HashSet::from([s])).await?; time::sleep(Duration::from_secs(1)).await; @@ -563,6 +567,7 @@ impl ClientTester { time::sleep(Duration::from_millis(500)).await; for (s, is_leader) in self.query_servers().await? { if is_leader { + // picked a leader replica self.driver.leave(false).await?; self.pause_servers(HashSet::from([s])).await?; time::sleep(Duration::from_secs(1)).await; @@ -583,24 +588,28 @@ impl ClientTester { time::sleep(Duration::from_millis(500)).await; for (s, is_leader) in self.query_servers().await? { if is_leader { + // picked a leader replica self.driver.leave(false).await?; self.pause_servers(HashSet::from([s])).await?; time::sleep(Duration::from_secs(1)).await; self.driver.connect().await?; let v1 = Self::gen_rand_string(8); self.checked_put("Jose", &v1, Some(Some(&v0)), 0).await?; + // resuming old leader replica self.driver.leave(false).await?; self.resume_servers(HashSet::from([s])).await?; time::sleep(Duration::from_secs(1)).await; self.driver.connect().await?; let v2 = Self::gen_rand_string(8); self.checked_put("Jose", &v2, Some(Some(&v1)), 1).await?; + // pausing that replica again self.driver.leave(false).await?; self.pause_servers(HashSet::from([s])).await?; time::sleep(Duration::from_secs(1)).await; self.driver.connect().await?; let v3 = Self::gen_rand_string(8); self.checked_put("Jose", &v3, Some(Some(&v2)), 0).await?; + // resuming that replica again self.driver.leave(false).await?; self.resume_servers(HashSet::from([s])).await?; time::sleep(Duration::from_secs(1)).await; @@ -619,9 +628,21 @@ impl ClientTester { self.checked_put("Jose", &v0, Some(None), 0).await?; let v1 = Self::gen_rand_string(8); self.checked_put("Shawn", &v1, Some(None), 0).await?; + // forcing all nodes to take snapshot time::sleep(Duration::from_millis(500)).await; self.force_snapshot(HashSet::new()).await?; self.checked_put("Jose", &v1, Some(Some(&v0)), 0).await?; + // reseting all nodes and see if things are there + self.driver.leave(false).await?; + self.reset_servers(HashSet::new(), true).await?; + time::sleep(Duration::from_secs(1)).await; + self.driver.connect().await?; + self.checked_get("Shawn", Some(Some(&v1)), 0).await?; + self.checked_get("Jose", Some(Some(&v1)), 0).await?; + // forcing all nodes to take snapshot again + time::sleep(Duration::from_millis(500)).await; + self.force_snapshot(HashSet::new()).await?; + // reseting all nodes again and check again self.driver.leave(false).await?; self.reset_servers(HashSet::new(), true).await?; time::sleep(Duration::from_secs(1)).await; From 8dd10a28a923e6da3f0b26aa8d903a8b9cc1386c Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 4 Oct 2023 01:21:39 -0500 Subject: [PATCH 75/89] add chunking to reconstruction read messages --- src/protocols/crossword.rs | 328 +++++++++++++++++++++++-------------- src/protocols/rs_paxos.rs | 200 ++++++++++++---------- src/server/transport.rs | 11 +- 3 files changed, 318 insertions(+), 221 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 9c5aa4d6..1963f016 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -69,6 +69,9 @@ pub struct ReplicaConfigCrossword { /// Fault-tolerance level. pub fault_tolerance: u8, + /// Maximum chunk size of a ReconstructRead message. + pub recon_chunk_size: usize, + /// Number of shards to assign to each replica. // TODO: proper config options. 
pub shards_per_replica: u8, @@ -96,6 +99,7 @@ impl Default for ReplicaConfigCrossword { gossip_timeout_min: 100, gossip_timeout_max: 300, fault_tolerance: 0, + recon_chunk_size: 1000, shards_per_replica: 1, perf_storage_a: 0, perf_storage_b: 0, @@ -234,13 +238,15 @@ enum PeerMsg { Commit { slot: usize }, /// Reconstruction read from new leader to replicas. - Reconstruct { slot: usize, exclude: Vec }, + Reconstruct { + /// Map from slot -> shards to exclude. + slots_excl: HashMap>, + }, /// Reconstruction read reply from replica to leader. ReconstructReply { - slot: usize, - ballot: Ballot, - reqs_cw: RSCodeword, + /// Map from slot -> (ballot, peer shards). + slots_data: HashMap)>, }, /// Leader activity heartbeat. @@ -1183,43 +1189,44 @@ impl CrosswordReplica { fn handle_msg_reconstruct( &mut self, peer: ReplicaId, - slot: usize, - exclude: Vec, + slots_excl: HashMap>, ) -> Result<(), SummersetError> { - if slot < self.start_slot { - return Ok(()); // ignore if slot index outdated - } - pf_trace!(self.id; "received Reconstruct <- {} for slot {}", peer, slot); + pf_trace!(self.id; "received Reconstruct <- {} for {} slots", + peer, slots_excl.len()); + let mut slots_data = HashMap::new(); - // locate instance in memory, filling in null instances if needed - while self.start_slot + self.insts.len() <= slot { - self.insts.push(self.null_instance()?); - } - let inst = &mut self.insts[slot - self.start_slot]; + for (slot, exclude) in slots_excl { + if slot < self.start_slot { + continue; // ignore if slot index outdated + } - // ignore spurious duplications; also ignore if I have nothing to send back - if inst.status < Status::Accepting { - return Ok(()); - } - let mut subset = Bitmap::from(inst.reqs_cw.num_shards(), exclude); - subset.flip(); // exclude unwanted shards the sender already has - let reply_cw = inst.reqs_cw.subset_copy(subset, false)?; - if reply_cw.avail_shards() == 0 { - return Ok(()); - } + // locate instance in memory, filling in null instances if needed + while self.start_slot + self.insts.len() <= slot { + self.insts.push(self.null_instance()?); + } + let inst = &mut self.insts[slot - self.start_slot]; - // send back my ballot for this slot and the available shards - self.transport_hub.send_msg( - PeerMsg::ReconstructReply { - slot, - ballot: inst.bal, - reqs_cw: reply_cw.clone(), - }, - peer, - )?; - pf_trace!(self.id; "sent ReconstructReply message for slot {} bal {}", - slot, inst.bal); + // ignore spurious duplications; also ignore if I have nothing to send back + if inst.status < Status::Accepting { + continue; + } + let mut subset = Bitmap::from(inst.reqs_cw.num_shards(), exclude); + subset.flip(); // exclude unwanted shards the sender already has + let reply_cw = inst.reqs_cw.subset_copy(subset, false)?; + if reply_cw.avail_shards() == 0 { + continue; + } + // send back my ballot for this slot and the available shards + slots_data.insert(slot, (inst.bal, reply_cw)); + } + + if !slots_data.is_empty() { + let num_slots = slots_data.len(); + self.transport_hub + .send_msg(PeerMsg::ReconstructReply { slots_data }, peer)?; + pf_trace!(self.id; "sent ReconstructReply message for {} slots", num_slots); + } Ok(()) } @@ -1227,61 +1234,66 @@ impl CrosswordReplica { fn handle_msg_reconstruct_reply( &mut self, peer: ReplicaId, - slot: usize, - ballot: Ballot, - reqs_cw: RSCodeword, + slots_data: HashMap)>, ) -> Result<(), SummersetError> { - if slot < self.start_slot { - return Ok(()); // ignore if slot index outdated - } - pf_trace!(self.id; "received ReconstructReply <- {} for 
slot {} bal {} shards {:?}", - peer, slot, ballot, reqs_cw.avail_shards_map()); - assert!(slot < self.start_slot + self.insts.len()); - assert!(self.insts[slot - self.start_slot].status >= Status::Committed); - let inst = &mut self.insts[slot - self.start_slot]; - - // if reply not outdated and ballot is up-to-date - if inst.status < Status::Executed && ballot >= inst.bal { - // absorb the shards from this replica - inst.reqs_cw.absorb_other(reqs_cw)?; + for (slot, (ballot, reqs_cw)) in slots_data { + if slot < self.start_slot { + continue; // ignore if slot index outdated + } + pf_trace!(self.id; "in ReconstructReply <- {} for slot {} bal {} shards {:?}", + peer, slot, ballot, reqs_cw.avail_shards_map()); + assert!(slot < self.start_slot + self.insts.len()); + assert!( + self.insts[slot - self.start_slot].status >= Status::Committed + ); + let inst = &mut self.insts[slot - self.start_slot]; - // if enough shards have been gathered, can push execution forward - if slot == self.exec_bar { - let mut now_slot = self.exec_bar; - while now_slot < self.start_slot + self.insts.len() { - let inst = &mut self.insts[now_slot - self.start_slot]; - if inst.status < Status::Committed - || inst.reqs_cw.avail_shards() < self.quorum_cnt - { - break; - } + // if reply not outdated and ballot is up-to-date + if inst.status < Status::Executed && ballot >= inst.bal { + // absorb the shards from this replica + inst.reqs_cw.absorb_other(reqs_cw)?; + + // if enough shards have been gathered, can push execution forward + if slot == self.exec_bar { + let mut now_slot = self.exec_bar; + while now_slot < self.start_slot + self.insts.len() { + let inst = &mut self.insts[now_slot - self.start_slot]; + if inst.status < Status::Committed + || inst.reqs_cw.avail_shards() < self.quorum_cnt + { + break; + } - if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { - // have enough shards but need reconstruction - inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; - } - let reqs = inst.reqs_cw.get_data()?; + if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + // have enough shards but need reconstruction + inst.reqs_cw + .reconstruct_data(Some(&self.rs_coder))?; + } + let reqs = inst.reqs_cw.get_data()?; - // submit commands in committed instance to the state machine - // for execution - if reqs.is_empty() { - inst.status = Status::Executed; - } else { - for (cmd_idx, (_, req)) in reqs.iter().enumerate() { - if let ApiRequest::Req { cmd, .. } = req { - self.state_machine.submit_cmd( - Self::make_command_id(now_slot, cmd_idx), - cmd.clone(), - )?; - } else { - continue; // ignore other types of requests + // submit commands in committed instance to the state machine + // for execution + if reqs.is_empty() { + inst.status = Status::Executed; + } else { + for (cmd_idx, (_, req)) in reqs.iter().enumerate() { + if let ApiRequest::Req { cmd, .. 
} = req { + self.state_machine.submit_cmd( + Self::make_command_id( + now_slot, cmd_idx, + ), + cmd.clone(), + )?; + } else { + continue; // ignore other types of requests + } } + pf_trace!(self.id; "submitted {} exec commands for slot {}", + reqs.len(), now_slot); } - pf_trace!(self.id; "submitted {} exec commands for slot {}", - reqs.len(), now_slot); - } - now_slot += 1; + now_slot += 1; + } } } } @@ -1313,14 +1325,12 @@ impl CrosswordReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), - PeerMsg::Reconstruct { slot, exclude } => { - self.handle_msg_reconstruct(peer, slot, exclude) + PeerMsg::Reconstruct { slots_excl } => { + self.handle_msg_reconstruct(peer, slots_excl) + } + PeerMsg::ReconstructReply { slots_data } => { + self.handle_msg_reconstruct_reply(peer, slots_data) } - PeerMsg::ReconstructReply { - slot, - ballot, - reqs_cw, - } => self.handle_msg_reconstruct_reply(peer, slot, ballot, reqs_cw), PeerMsg::Heartbeat { ballot, exec_bar } => { self.heard_heartbeat(peer, ballot, exec_bar) } @@ -1412,6 +1422,7 @@ impl CrosswordReplica { self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen); self.bal_max_seen = self.bal_prep_sent; + let mut recon_slots: HashMap> = HashMap::new(); for (slot, inst) in self .insts .iter_mut() @@ -1461,31 +1472,48 @@ impl CrosswordReplica { if inst.status == Status::Committed && inst.reqs_cw.avail_shards() < self.quorum_cnt { - self.transport_hub.bcast_msg( - PeerMsg::Reconstruct { - slot, - exclude: inst - .reqs_cw - .avail_shards_map() - .iter() - .filter_map( - |(idx, flag)| { - if flag { - Some(idx) - } else { - None - } - }, - ) - .collect(), - }, - None, - )?; - pf_trace!(self.id; "broadcast Reconstruct messages for slot {} bal {} shards {:?}", - slot, inst.bal, inst.reqs_cw.avail_shards_map()); + recon_slots.insert( + slot, + inst.reqs_cw + .avail_shards_map() + .iter() + .filter_map( + |(idx, flag)| { + if flag { + Some(idx) + } else { + None + } + }, + ) + .collect(), + ); + + // send reconstruction read messages in chunks + if recon_slots.len() == self.config.recon_chunk_size { + self.transport_hub.bcast_msg( + PeerMsg::Reconstruct { + slots_excl: std::mem::take(&mut recon_slots), + }, + None, + )?; + pf_trace!(self.id; "broadcast Reconstruct messages for {} slots", + self.config.recon_chunk_size); + } } } + // send reconstruction read message for remaining slots + if !recon_slots.is_empty() { + let num_slots = recon_slots.len(); + self.transport_hub.bcast_msg( + PeerMsg::Reconstruct { + slots_excl: recon_slots, + }, + None, + )?; + pf_trace!(self.id; "broadcast Reconstruct messages for {} slots", num_slots); + } Ok(()) } @@ -1565,8 +1593,17 @@ impl CrosswordReplica { /// Triggers gossiping for my missing shards in committed but not-yet- /// executed instances: fetch missing shards from peers, preferring /// follower peers that hold data shards. + // TODO: prefer replicas with original data shards first fn trigger_gossiping(&mut self) -> Result<(), SummersetError> { - // TODO: want cleverer design than this! 
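// ---------------------------------------------------------------------------
// Aside: a runnable sketch (not from the patch) of the batching policy that the
// new recon_chunk_size config enforces: reconstruction reads for many slots are
// grouped into Reconstruct messages carrying at most recon_chunk_size entries
// each, instead of one message per slot.
fn chunking_example() {
    let recon_chunk_size = 2; // tiny value for illustration; defaults above are in the thousands
    let pending_slots: Vec<usize> = vec![10, 11, 13, 17, 18];
    let messages: Vec<Vec<usize>> = pending_slots
        .chunks(recon_chunk_size)
        .map(|chunk| chunk.to_vec())
        .collect();
    // five pending slots become three Reconstruct messages
    assert_eq!(messages, vec![vec![10, 11], vec![13, 17], vec![18]]);
}
// ---------------------------------------------------------------------------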
+ // maintain a map from peer ID to send to -> slots_excl to send + let mut recon_slots: HashMap>> = + HashMap::new(); + for peer in 0..self.population { + if peer != self.id { + recon_slots.insert(peer, HashMap::new()); + } + } + let mut slot_up_to = self.exec_bar; for slot in self.exec_bar..(self.start_slot + self.insts.len()) { slot_up_to = slot; @@ -1578,16 +1615,21 @@ impl CrosswordReplica { } if inst.reqs_cw.avail_shards() < self.quorum_cnt { - let mut target = Bitmap::new(self.population, true); - if let Some(ReplicaBookkeeping { source }) = inst.replica_bk { - // skip leader who initially replicated this instance to me - target.set(source, false)?; - } - self.transport_hub.bcast_msg( - PeerMsg::Reconstruct { + for peer in 0..self.population { + if peer == self.id { + continue; + } + if let Some(ReplicaBookkeeping { source }) = inst.replica_bk + { + if peer == source { + // skip leader who initially replicated this instance to me + continue; + } + } + + recon_slots.get_mut(&peer).unwrap().insert( slot, - exclude: inst - .reqs_cw + inst.reqs_cw .avail_shards_map() .iter() .filter_map( @@ -1600,9 +1642,33 @@ impl CrosswordReplica { }, ) .collect(), - }, - Some(target), - )?; + ); + + // send reconstruction read messages in chunks + if recon_slots[&peer].len() == self.config.recon_chunk_size + { + self.transport_hub.send_msg( + PeerMsg::Reconstruct { + slots_excl: std::mem::take( + recon_slots.get_mut(&peer).unwrap(), + ), + }, + peer, + )?; + pf_trace!(self.id; "sent Reconstruct -> {} for {} slots", + peer, self.config.recon_chunk_size); + } + } + } + } + + // send reconstruction read message for remaining slots + for (peer, slots_excl) in recon_slots.drain() { + if !slots_excl.is_empty() { + let num_slots = slots_excl.len(); + self.transport_hub + .send_msg(PeerMsg::Reconstruct { slots_excl }, peer)?; + pf_trace!(self.id; "sent Reconstruct -> {} for {} slots", peer, num_slots); } } @@ -2168,7 +2234,8 @@ impl GenericReplica for CrosswordReplica { hb_send_interval_ms, snapshot_path, snapshot_interval_s, gossip_timeout_min, gossip_timeout_max, - fault_tolerance, shards_per_replica, + fault_tolerance, recon_chunk_size, + shards_per_replica, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; if config.batch_interval_us == 0 { @@ -2199,6 +2266,13 @@ impl GenericReplica for CrosswordReplica { config.hb_send_interval_ms ); } + if config.recon_chunk_size == 0 { + return logged_err!( + id; + "invalid config.recon_chunk_size '{}'", + config.recon_chunk_size + ); + } // setup state machine module let state_machine = StateMachine::new_and_setup(id).await?; diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 3b5c25b2..19399535 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -63,6 +63,9 @@ pub struct ReplicaConfigRSPaxos { /// Fault-tolerance level. pub fault_tolerance: u8, + /// Maximum chunk size of a ReconstructRead message. + pub recon_chunk_size: usize, + // Performance simulation params (all zeros means no perf simulation): pub perf_storage_a: u64, pub perf_storage_b: u64, @@ -84,6 +87,7 @@ impl Default for ReplicaConfigRSPaxos { snapshot_path: "/tmp/summerset.rs_paxos.snap".into(), snapshot_interval_s: 0, fault_tolerance: 0, + recon_chunk_size: 1000, perf_storage_a: 0, perf_storage_b: 0, perf_network_a: 0, @@ -220,13 +224,12 @@ enum PeerMsg { Commit { slot: usize }, /// Reconstruction read from new leader to replicas. - Reconstruct { slot: usize }, + Reconstruct { slots: Vec }, /// Reconstruction read reply from replica to leader. 
ReconstructReply { - slot: usize, - ballot: Ballot, - reqs_cw: RSCodeword, + /// Map from slot -> (ballot, peer shards). + slots_data: HashMap)>, }, /// Leader activity heartbeat. @@ -1062,36 +1065,39 @@ impl RSPaxosReplica { fn handle_msg_reconstruct( &mut self, peer: ReplicaId, - slot: usize, + slots: Vec, ) -> Result<(), SummersetError> { - if slot < self.start_slot { - return Ok(()); // ignore if slot index outdated - } - pf_trace!(self.id; "received Reconstruct <- {} for slot {}", peer, slot); + pf_trace!(self.id; "received Reconstruct <- {} for slots {:?}", peer, slots); + let mut slots_data = HashMap::new(); - // locate instance in memory, filling in null instances if needed - while self.start_slot + self.insts.len() <= slot { - self.insts.push(self.null_instance()?); - } - let inst = &mut self.insts[slot - self.start_slot]; + for slot in slots { + if slot < self.start_slot { + continue; // ignore if slot index outdated + } - // ignore spurious duplications; also ignore if I have nothing to send back - if inst.status < Status::Accepting || inst.reqs_cw.avail_shards() == 0 { - return Ok(()); - } + // locate instance in memory, filling in null instances if needed + while self.start_slot + self.insts.len() <= slot { + self.insts.push(self.null_instance()?); + } + let inst = &mut self.insts[slot - self.start_slot]; - // send back my ballot for this slot and the available shards - self.transport_hub.send_msg( - PeerMsg::ReconstructReply { - slot, - ballot: inst.bal, - reqs_cw: inst.reqs_cw.clone(), - }, - peer, - )?; - pf_trace!(self.id; "sent ReconstructReply message for slot {} bal {}", - slot, inst.bal); + // ignore spurious duplications; also ignore if I have nothing to send back + if inst.status < Status::Accepting + || inst.reqs_cw.avail_shards() == 0 + { + continue; + } + + // send back my ballot for this slot and the available shards + slots_data.insert(slot, (inst.bal, inst.reqs_cw.clone())); + } + if !slots_data.is_empty() { + let num_slots = slots_data.len(); + self.transport_hub + .send_msg(PeerMsg::ReconstructReply { slots_data }, peer)?; + pf_trace!(self.id; "sent ReconstructReply message for {} slots", num_slots); + } Ok(()) } @@ -1099,61 +1105,66 @@ impl RSPaxosReplica { fn handle_msg_reconstruct_reply( &mut self, peer: ReplicaId, - slot: usize, - ballot: Ballot, - reqs_cw: RSCodeword, + slots_data: HashMap)>, ) -> Result<(), SummersetError> { - if slot < self.start_slot { - return Ok(()); // ignore if slot index outdated - } - pf_trace!(self.id; "received ReconstructReply <- {} for slot {} bal {} shards {:?}", - peer, slot, ballot, reqs_cw.avail_shards_map()); - assert!(slot < self.start_slot + self.insts.len()); - assert!(self.insts[slot - self.start_slot].status >= Status::Committed); - let inst = &mut self.insts[slot - self.start_slot]; - - // if reply not outdated and ballot is up-to-date - if inst.status < Status::Executed && ballot >= inst.bal { - // absorb the shards from this replica - inst.reqs_cw.absorb_other(reqs_cw)?; + for (slot, (ballot, reqs_cw)) in slots_data { + if slot < self.start_slot { + continue; // ignore if slot index outdated + } + pf_trace!(self.id; "in ReconstructReply <- {} for slot {} bal {} shards {:?}", + peer, slot, ballot, reqs_cw.avail_shards_map()); + assert!(slot < self.start_slot + self.insts.len()); + assert!( + self.insts[slot - self.start_slot].status >= Status::Committed + ); + let inst = &mut self.insts[slot - self.start_slot]; - // if enough shards have been gathered, can push execution forward - if slot == self.exec_bar { - let 
mut now_slot = self.exec_bar; - while now_slot < self.start_slot + self.insts.len() { - let inst = &mut self.insts[now_slot - self.start_slot]; - if inst.status < Status::Committed - || inst.reqs_cw.avail_shards() < self.quorum_cnt - { - break; - } + // if reply not outdated and ballot is up-to-date + if inst.status < Status::Executed && ballot >= inst.bal { + // absorb the shards from this replica + inst.reqs_cw.absorb_other(reqs_cw)?; + + // if enough shards have been gathered, can push execution forward + if slot == self.exec_bar { + let mut now_slot = self.exec_bar; + while now_slot < self.start_slot + self.insts.len() { + let inst = &mut self.insts[now_slot - self.start_slot]; + if inst.status < Status::Committed + || inst.reqs_cw.avail_shards() < self.quorum_cnt + { + break; + } - if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { - // have enough shards but need reconstruction - inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; - } - let reqs = inst.reqs_cw.get_data()?; + if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + // have enough shards but need reconstruction + inst.reqs_cw + .reconstruct_data(Some(&self.rs_coder))?; + } + let reqs = inst.reqs_cw.get_data()?; - // submit commands in committed instance to the state machine - // for execution - if reqs.is_empty() { - inst.status = Status::Executed; - } else { - for (cmd_idx, (_, req)) in reqs.iter().enumerate() { - if let ApiRequest::Req { cmd, .. } = req { - self.state_machine.submit_cmd( - Self::make_command_id(now_slot, cmd_idx), - cmd.clone(), - )?; - } else { - continue; // ignore other types of requests + // submit commands in committed instance to the state machine + // for execution + if reqs.is_empty() { + inst.status = Status::Executed; + } else { + for (cmd_idx, (_, req)) in reqs.iter().enumerate() { + if let ApiRequest::Req { cmd, .. 
} = req { + self.state_machine.submit_cmd( + Self::make_command_id( + now_slot, cmd_idx, + ), + cmd.clone(), + )?; + } else { + continue; // ignore other types of requests + } } + pf_trace!(self.id; "submitted {} exec commands for slot {}", + reqs.len(), now_slot); } - pf_trace!(self.id; "submitted {} exec commands for slot {}", - reqs.len(), now_slot); - } - now_slot += 1; + now_slot += 1; + } } } } @@ -1185,14 +1196,12 @@ impl RSPaxosReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), - PeerMsg::Reconstruct { slot } => { - self.handle_msg_reconstruct(peer, slot) + PeerMsg::Reconstruct { slots } => { + self.handle_msg_reconstruct(peer, slots) + } + PeerMsg::ReconstructReply { slots_data } => { + self.handle_msg_reconstruct_reply(peer, slots_data) } - PeerMsg::ReconstructReply { - slot, - ballot, - reqs_cw, - } => self.handle_msg_reconstruct_reply(peer, slot, ballot, reqs_cw), PeerMsg::Heartbeat { ballot, exec_bar } => { self.heard_heartbeat(peer, ballot, exec_bar) } @@ -1284,6 +1293,7 @@ impl RSPaxosReplica { self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen); self.bal_max_seen = self.bal_prep_sent; + let mut recon_slots = Vec::new(); for (slot, inst) in self .insts .iter_mut() @@ -1333,13 +1343,18 @@ impl RSPaxosReplica { if inst.status == Status::Committed && inst.reqs_cw.avail_shards() < self.quorum_cnt { - self.transport_hub - .bcast_msg(PeerMsg::Reconstruct { slot }, None)?; - pf_trace!(self.id; "broadcast Reconstruct messages for slot {} bal {} shards {:?}", - slot, inst.bal, inst.reqs_cw.avail_shards_map()); + recon_slots.push(slot); } } + // send reconstruction read messages in chunks + for chunk in recon_slots.chunks(self.config.recon_chunk_size) { + let slots = chunk.to_vec(); + let num_slots = slots.len(); + self.transport_hub + .bcast_msg(PeerMsg::Reconstruct { slots }, None)?; + pf_trace!(self.id; "broadcast Reconstruct messages for {} slots", num_slots); + } Ok(()) } @@ -1951,7 +1966,7 @@ impl GenericReplica for RSPaxosReplica { hb_hear_timeout_min, hb_hear_timeout_max, hb_send_interval_ms, snapshot_path, snapshot_interval_s, - fault_tolerance, + fault_tolerance, recon_chunk_size, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; if config.batch_interval_us == 0 { @@ -1982,6 +1997,13 @@ impl GenericReplica for RSPaxosReplica { config.hb_send_interval_ms ); } + if config.recon_chunk_size == 0 { + return logged_err!( + id; + "invalid config.recon_chunk_size '{}'", + config.recon_chunk_size + ); + } // setup state machine module let state_machine = StateMachine::new_and_setup(id).await?; diff --git a/src/server/transport.rs b/src/server/transport.rs index ba0e1e8b..e3b464f4 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -227,11 +227,12 @@ where .map_err(|e| SummersetError(e.to_string()))?; } None => { - pf_error!( - self.me; - "peer ID {} not found among connected ones", - peer - ); + // NOTE: commented out to avoid spurious error messages + // pf_error!( + // self.me; + // "peer ID {} not found among connected ones", + // peer + // ); } } From 8751839fd57fbf2156ef3bcfba09dc9f56266927 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 5 Oct 2023 13:54:59 -0500 Subject: [PATCH 76/89] update counstraint boundary figure --- models/plot_cstr_bounds.py | 50 ++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/models/plot_cstr_bounds.py b/models/plot_cstr_bounds.py index 30a6335c..a5e87eea 100644 --- 
a/models/plot_cstr_bounds.py +++ b/models/plot_cstr_bounds.py @@ -62,57 +62,59 @@ def plot_cstr_bound(idx, cluster_size): label="Crossword configs", zorder=20, ) - plt.vlines(m, ymin=m, ymax=m + 1.5, linestyles="-", color=line_color, zorder=20) - plt.vlines(n, ymin=1, ymax=2.5, linestyles="-", color=line_color, zorder=20) + plt.vlines(m, ymin=m, ymax=m + 1.4, linestyles="-", color=line_color, zorder=20) + plt.vlines(n, ymin=1, ymax=2.4, linestyles="-", color=line_color, zorder=20) # correct region xs = [m, m, n, n] - ys = [m, m + 1, 2, 1] + ys = [m, m + 1.7, 2.7, 1] plt.fill(xs, ys, color=fill_color, label="Region of fault-tolerance=f", zorder=0) - # unused x-axis range + # unused x-axis ranges + xs = [0.42, m - 0.5, m - 0.8, 0.12] + ys = [0.3, 0.3, 0, 0] + plt.fill(xs, ys, hatch="///", fill=False, linewidth=0, zorder=10) if cluster_size < CLUSTER_SIZES[-1]: - xs = [n + 0.9, X_TICKS[-1] + 0.35, X_TICKS[-1] + 0.35, n + 0.8] - ys = [0.3, 0.3, 0, 0] - plt.fill( - xs, ys, hatch="///", fill=False, edgecolor=None, linewidth=0, zorder=10 - ) + xs = [n + 1.1, X_TICKS[-1] + 0.4, X_TICKS[-1] + 0.1, n + 0.8] + plt.fill(xs, ys, hatch="///", fill=False, linewidth=0, zorder=10) # latency & throughput optimized arrows plt.arrow( - m + 0.3, - m + 1.7, - -0.9, - 0.9, + m + 0.1, + m + 2.4, + -1.3, + 0, linewidth=1, color="dimgray", length_includes_head=True, head_width=0.3, overhang=0.5, + clip_on=False, label="Tradeoff decisions", ) plt.text( - m + 0.18 if n <= 5 else m + 0.5 if n == 9 else m + 0.4, - m + 2.78 if n <= 5 else m + 2.0 if n == 9 else m + 2.4, + m + 0.3 if n < 9 else m + 0.6, + m + 2.5 if n < 9 else m + 2.2, "Lat.\noptim.", horizontalalignment="left", verticalalignment="center", color="dimgray", ) plt.arrow( - n - 0.3, - 3.3, - 0.9, - -0.9, + n + 1, + 2, + 0, + -1.3, linewidth=1, color="dimgray", length_includes_head=True, head_width=0.3, overhang=0.5, + clip_on=False, ) plt.text( - n + 0.8 if n <= 5 else n + 0.0 if n == 9 else n + 0.4, - 1 + 1.5 if n <= 5 else 1 + 2.9 if n == 9 else 1 + 2.6, + n + 1.3 if n < 7 else n + 0.4, + 1 + 1.1 if n < 7 else 1 + 2.1, "Tput.\noptim.", horizontalalignment="left", verticalalignment="center", @@ -125,7 +127,9 @@ def plot_cstr_bound(idx, cluster_size): plt.xlim((0, X_TICKS[-1] + 0.7)) plt.ylim((0, Y_TICKS[-1] + 2.7)) - plt.xticks(X_TICKS[:cluster_size], list(map(str, X_TICKS))[:cluster_size]) + plt.xticks( + X_TICKS[m - 1 : cluster_size], list(map(str, X_TICKS))[m - 1 : cluster_size] + ) plt.yticks(Y_TICKS, list(map(str, Y_TICKS))) plt.xlabel("|Quorum|", loc="right") @@ -163,7 +167,7 @@ def make_legend_arrow( color="dimgray", length_includes_head=True, head_width=0.6 * height, - overhang=0.3, + overhang=0.2, ) def make_legend_polygon( From e7a4e96464a4afc1b2949a24dd1016b4d3535a36 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 5 Oct 2023 16:22:45 -0500 Subject: [PATCH 77/89] make follower gossiping optimal --- src/protocols/crossword.rs | 177 ++++++++++++++++++++++++------------- src/utils/bitmap.rs | 10 +++ src/utils/rscoding.rs | 15 ++-- 3 files changed, 136 insertions(+), 66 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 1963f016..c86549c7 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -96,10 +96,10 @@ impl Default for ReplicaConfigCrossword { hb_send_interval_ms: 50, snapshot_path: "/tmp/summerset.rs_paxos.snap".into(), snapshot_interval_s: 0, - gossip_timeout_min: 100, - gossip_timeout_max: 300, + gossip_timeout_min: 1200, + gossip_timeout_max: 1800, fault_tolerance: 0, - 
recon_chunk_size: 1000, + recon_chunk_size: 2000, shards_per_replica: 1, perf_storage_a: 0, perf_storage_b: 0, @@ -146,6 +146,9 @@ struct LeaderBookkeeping { struct ReplicaBookkeeping { /// Source leader replica ID for replyiing to Prepares and Accepts. source: ReplicaId, + + /// Have I tried gossiping for this instance at least once? + gossip_tried: bool, } /// In-memory instance containing a (possibly partial) commands batch. @@ -436,14 +439,84 @@ impl CrosswordReplica { slot: usize, id: ReplicaId, population: u8, - num_shards: u8, + shards_per_replica: u8, ) -> Vec { let first: u8 = ((id as usize + slot) % population as usize) as u8; - (first..(first + num_shards)) + (first..(first + shards_per_replica)) .map(|i| (i % population)) .collect() } + /// TODO: should let leader incorporate assignment metadata in Accept + /// messages. With more complex assignment policies, a follower probably + /// does not know the assignment. + fn gossip_targets_excl( + slot: usize, + me: ReplicaId, + population: u8, + quorum_cnt: u8, + shards_per_replica: u8, + mut avail_shards_map: Bitmap, + replica_bk: &mut Option, + ) -> HashMap> { + let mut src_peer = me; + let mut first_try = false; + if let Some(ReplicaBookkeeping { + source, + gossip_tried, + }) = replica_bk + { + if !*gossip_tried { + src_peer = *source; + first_try = true; + // first try: exclude all parity shards + for idx in quorum_cnt..population { + avail_shards_map.set(idx, true).unwrap(); + } + *gossip_tried = true; + } + } + + // greedily considers my peers, starting from the one with my ID + 1, + // until all data shards covered + let mut targets_excl = HashMap::new(); + for p in (me + 1)..(population + me) { + let peer = p % population; + if peer == src_peer { + // skip leader who initially replicated this instance to me + continue; + } + + if !first_try { + // first try probably did not succeed, so do it conservatively + targets_excl.insert(peer, avail_shards_map.to_vec()); + } else { + // first try: only ask for a minimum number of data shards + let mut useful_shards = Vec::new(); + for idx in Self::shards_for_replica( + slot, + peer, + population, + shards_per_replica, + ) { + if !avail_shards_map.get(idx).unwrap() { + useful_shards.push(idx); + } + } + // if this peer has data shards which I don't have right now + // and I have not asked others for in this round + if !useful_shards.is_empty() { + targets_excl.insert(peer, avail_shards_map.to_vec()); + for idx in useful_shards { + avail_shards_map.set(idx, true).unwrap(); + } + } + } + } + + targets_excl + } + /// TODO: make better impl of this. fn coverage_under_faults( population: u8, @@ -682,7 +755,7 @@ impl CrosswordReplica { } else { // on follower replica, finishing the logging of a // PrepareBal entry leads to sending back a Prepare reply - if let Some(ReplicaBookkeeping { source }) = inst.replica_bk { + if let Some(ReplicaBookkeeping { source, .. }) = inst.replica_bk { self.transport_hub.send_msg( PeerMsg::PrepareReply { slot, @@ -719,7 +792,7 @@ impl CrosswordReplica { } else { // on follower replica, finishing the logging of an // AcceptData entry leads to sending back an Accept reply - if let Some(ReplicaBookkeeping { source }) = inst.replica_bk { + if let Some(ReplicaBookkeeping { source, .. 
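// ---------------------------------------------------------------------------
// Aside: a runnable sketch (not part of the patch) of the rotated round-robin
// assignment that shards_for_replica() above computes now that `slot` is an
// input: replica `id` covers shards starting at (id + slot) % population,
// wrapping around the shard ring, so shard ownership rotates from slot to slot.
fn assignment_example() {
    // assumed population = 5, shards_per_replica = 2 (values picked for illustration)
    let shards = |slot: usize, id: u8| -> Vec<u8> {
        let population = 5u8;
        let first = ((id as usize + slot) % population as usize) as u8;
        (first..(first + 2)).map(|i| i % population).collect()
    };
    assert_eq!(shards(3, 0), vec![3, 4]);
    assert_eq!(shards(3, 2), vec![0, 1]);
    assert_eq!(shards(1, 3), vec![4, 0]); // wraps around past shard index 4
}
// ---------------------------------------------------------------------------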
}) = inst.replica_bk { self.transport_hub.send_msg( PeerMsg::AcceptReply { slot, @@ -854,7 +927,10 @@ impl CrosswordReplica { inst.bal = ballot; inst.status = Status::Preparing; - inst.replica_bk = Some(ReplicaBookkeeping { source: peer }); + inst.replica_bk = Some(ReplicaBookkeeping { + source: peer, + gossip_tried: false, + }); // update largest ballot seen self.bal_max_seen = ballot; @@ -1037,7 +1113,10 @@ impl CrosswordReplica { inst.bal = ballot; inst.status = Status::Accepting; inst.reqs_cw = reqs_cw; - inst.replica_bk = Some(ReplicaBookkeeping { source: peer }); + inst.replica_bk = Some(ReplicaBookkeeping { + source: peer, + gossip_tried: false, + }); // update largest ballot seen self.bal_max_seen = ballot; @@ -1468,26 +1547,13 @@ impl CrosswordReplica { } // do reconstruction reads for all committed instances that do not - // hold enough available shards for reconstruction + // hold enough available shards for reconstruction. It would be too + // complicated and slow to do the "data shards only" optimization + // during fail-over, so just do this conservatively here if inst.status == Status::Committed && inst.reqs_cw.avail_shards() < self.quorum_cnt { - recon_slots.insert( - slot, - inst.reqs_cw - .avail_shards_map() - .iter() - .filter_map( - |(idx, flag)| { - if flag { - Some(idx) - } else { - None - } - }, - ) - .collect(), - ); + recon_slots.insert(slot, inst.reqs_cw.avail_shards_vec()); // send reconstruction read messages in chunks if recon_slots.len() == self.config.recon_chunk_size { @@ -1593,11 +1659,10 @@ impl CrosswordReplica { /// Triggers gossiping for my missing shards in committed but not-yet- /// executed instances: fetch missing shards from peers, preferring /// follower peers that hold data shards. - // TODO: prefer replicas with original data shards first fn trigger_gossiping(&mut self) -> Result<(), SummersetError> { // maintain a map from peer ID to send to -> slots_excl to send let mut recon_slots: HashMap>> = - HashMap::new(); + HashMap::with_capacity(self.population as usize - 1); for peer in 0..self.population { if peer != self.id { recon_slots.insert(peer, HashMap::new()); @@ -1607,42 +1672,32 @@ impl CrosswordReplica { let mut slot_up_to = self.exec_bar; for slot in self.exec_bar..(self.start_slot + self.insts.len()) { slot_up_to = slot; - let inst = &self.insts[slot - self.start_slot]; - if inst.status >= Status::Executed { - continue; - } else if inst.status < Status::Committed { - break; + { + let inst = &self.insts[slot - self.start_slot]; + if inst.status >= Status::Executed { + continue; + } else if inst.status < Status::Committed { + break; + } } - if inst.reqs_cw.avail_shards() < self.quorum_cnt { - for peer in 0..self.population { - if peer == self.id { - continue; - } - if let Some(ReplicaBookkeeping { source }) = inst.replica_bk - { - if peer == source { - // skip leader who initially replicated this instance to me - continue; - } - } + let avail_shards_map = self.insts[slot - self.start_slot] + .reqs_cw + .avail_shards_map(); + if avail_shards_map.count() < self.quorum_cnt { + // decide which peers to ask for which shards from + let targets_excl = Self::gossip_targets_excl( + slot, + self.id, + self.population, + self.quorum_cnt, + self.config.shards_per_replica, + avail_shards_map, + &mut self.insts[slot - self.start_slot].replica_bk, + ); - recon_slots.get_mut(&peer).unwrap().insert( - slot, - inst.reqs_cw - .avail_shards_map() - .iter() - .filter_map( - |(idx, flag)| { - if flag { - Some(idx) - } else { - None - } - }, - ) - .collect(), - 
);
+                for (peer, exclude) in targets_excl {
+                    recon_slots.get_mut(&peer).unwrap().insert(slot, exclude);
 
                     // send reconstruction read messages in chunks
                     if recon_slots[&peer].len() == self.config.recon_chunk_size
diff --git a/src/utils/bitmap.rs b/src/utils/bitmap.rs
index 592ddb0c..5211e9f9 100644
--- a/src/utils/bitmap.rs
+++ b/src/utils/bitmap.rs
@@ -82,6 +82,15 @@ impl Bitmap {
     pub fn iter(&self) -> BitmapIter {
         BitmapIter { map: self, idx: 0 }
     }
+
+    /// Convenience method for converting the bitmap to a vec of indexes where
+    /// the flag is true.
+    #[inline]
+    pub fn to_vec(&self) -> Vec<u8> {
+        self.iter()
+            .filter_map(|(idx, flag)| if flag { Some(idx) } else { None })
+            .collect()
+    }
 }
 
 /// Iterator over `Bitmap`, yielding `(id, bit)` pairs.
@@ -175,5 +184,6 @@ mod bitmap_tests {
         for (id, flag) in map.iter() {
             assert_eq!(ref_map[id as usize], flag);
         }
+        assert_eq!(map.to_vec(), [0, 1, 3, 4]);
     }
 }
diff --git a/src/utils/rscoding.rs b/src/utils/rscoding.rs
index c8461c26..35fdc97c 100644
--- a/src/utils/rscoding.rs
+++ b/src/utils/rscoding.rs
@@ -305,15 +305,20 @@ where
         self.shards.iter().filter(|s| s.is_some()).count() as u8
     }
 
-    /// Gets a bitmap of available shard indexes set true.
+    /// Gets a vec of available shard indexes.
     #[inline]
-    pub fn avail_shards_map(&self) -> Bitmap {
-        let ones: Vec<u8> = self
-            .shards
+    pub fn avail_shards_vec(&self) -> Vec<u8> {
+        self.shards
             .iter()
             .enumerate()
             .filter_map(|(i, s)| if s.is_some() { Some(i as u8) } else { None })
-            .collect();
+            .collect()
+    }
+
+    /// Gets a bitmap of available shard indexes set true.
+    #[inline]
+    pub fn avail_shards_map(&self) -> Bitmap {
+        let ones = self.avail_shards_vec();
         Bitmap::from(self.num_shards(), ones)
     }
 
From ca6f52e2766c86b9fe75cdaaa7912d228170ab1f Mon Sep 17 00:00:00 2001
From: Guanzhou Hu
Date: Thu, 5 Oct 2023 23:33:50 -0500
Subject: [PATCH 78/89] staging progress on peer health tracking

---
 src/manager/clusman.rs               |  55 +++++----
 src/protocols/multipaxos.rs          | 137 ++++++++++++++++++----
 src/utils/error.rs                   |   2 +
 summerset_client/src/clients/repl.rs | 167 +++++++++++++++++++++------
 4 files changed, 277 insertions(+), 84 deletions(-)

diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs
index 89f2700d..e3bc0103 100644
--- a/src/manager/clusman.rs
+++ b/src/manager/clusman.rs
@@ -408,9 +408,13 @@ impl ClusterManager {
             self.server_info.get_mut(&s).unwrap().is_paused = true;
 
             // wait for dummy reply
-            let (_, reply) = self.server_reigner.recv_ctrl().await?;
-            if reply != CtrlMsg::PauseReply {
-                return logged_err!("m"; "unexpected reply type received");
+            loop {
+                let (server, reply) = self.server_reigner.recv_ctrl().await?;
+                if server != s || reply != CtrlMsg::PauseReply {
+                    self.handle_ctrl_msg(server, reply).await?;
+                } else {
+                    break;
+                }
             }
 
             pause_done.insert(s);
@@ -444,9 +448,13 @@ impl ClusterManager {
             self.server_reigner.send_ctrl(CtrlMsg::Resume, s)?;
 
             // wait for dummy reply
-            let (_, reply) = self.server_reigner.recv_ctrl().await?;
-            if reply != CtrlMsg::ResumeReply {
-                return logged_err!("m"; "unexpected reply type received");
+            loop {
+                let (server, reply) = self.server_reigner.recv_ctrl().await?;
+                if server != s || reply != CtrlMsg::ResumeReply {
+                    self.handle_ctrl_msg(server, reply).await?;
+                } else {
+                    break;
+                }
             }
 
             // clear the is_paused flag
@@ -484,22 +492,27 @@ impl ClusterManager {
             self.server_reigner.send_ctrl(CtrlMsg::TakeSnapshot, s)?;
 
             // wait for reply
-            let (_, reply) = self.server_reigner.recv_ctrl().await?;
-            if let CtrlMsg::SnapshotUpTo { new_start } = reply {
-                // update the log start
index info
-                assert!(self.server_info.contains_key(&s));
-                if new_start < self.server_info[&s].start_slot {
-                    return logged_err!("m"; "server {} snapshot up to {} < {}",
-                                            s, new_start,
-                                            self.server_info[&s].start_slot);
-                } else {
-                    self.server_info.get_mut(&s).unwrap().start_slot =
-                        new_start;
-                }
+            loop {
+                let (server, reply) = self.server_reigner.recv_ctrl().await?;
+                match reply {
+                    CtrlMsg::SnapshotUpTo { new_start } if server == s => {
+                        // update the log start index info
+                        assert!(self.server_info.contains_key(&s));
+                        if new_start < self.server_info[&s].start_slot {
+                            return logged_err!("m"; "server {} snapshot up to {} < {}",
+                                                    s, new_start,
+                                                    self.server_info[&s].start_slot);
+                        } else {
+                            self.server_info.get_mut(&s).unwrap().start_slot =
+                                new_start;
+                        }
+
+                        snapshot_up_to.insert(s, new_start);
+                        break;
+                    }
 
-                snapshot_up_to.insert(s, new_start);
-            } else {
-                return logged_err!("m"; "unexpected reply type received");
+                    _ => self.handle_ctrl_msg(server, reply).await?,
+                }
             }
         }
 
diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs
index fbcfc682..f1899544 100644
--- a/src/protocols/multipaxos.rs
+++ b/src/protocols/multipaxos.rs
@@ -259,14 +259,21 @@ pub struct MultiPaxosReplica {
     /// TransportHub module.
     transport_hub: TransportHub<PeerMsg>,
 
+    /// Who do I think is the effective leader of the cluster right now?
+    leader: Option<ReplicaId>,
+
     /// Timer for hearing heartbeat from leader.
     hb_hear_timer: Timer,
 
     /// Interval for sending heartbeat to followers.
     hb_send_interval: Interval,
 
-    /// Do I think I am the leader?
-    is_leader: bool,
+    /// Heartbeat reply counters for approximate detection of follower health.
+    /// Tuple of (#hb_replied, #hb_replied seen at last send, repetition).
+    hb_reply_cnts: HashMap<ReplicaId, (u64, u64, u8)>,
+
+    /// Approximate health status tracking of peer replicas.
+    peer_alive: Bitmap,
 
     /// In-memory log of instances.
     insts: Vec<Instance>,
@@ -302,6 +309,12 @@ pub struct MultiPaxosReplica {
 
 // MultiPaxosReplica common helpers
 impl MultiPaxosReplica {
+    /// Do I think I am the current effective leader?
+    #[inline]
+    fn is_leader(&self) -> bool {
+        self.leader == Some(self.id)
+    }
+
     /// Create an empty null instance.
     #[inline]
     fn null_instance(&self) -> Instance {
@@ -396,21 +409,26 @@ impl MultiPaxosReplica {
         pf_debug!(self.id; "got request batch of size {}", batch_size);
 
         // if I'm not a leader, ignore client requests
-        if !self.is_leader {
+        if !self.is_leader() {
             for (client, req) in req_batch {
                 if let ApiRequest::Req {
                     id: req_id, ..
} = req { - // tell the client to try on the next replica - let next_replica = (self.id + 1) % self.population; + // tell the client to try on known leader or just the + // next ID replica + let target = if let Some(peer) = self.leader { + peer + } else { + (self.id + 1) % self.population + }; self.external_api.send_reply( ApiReply::Reply { id: req_id, result: None, - redirect: Some(next_replica), + redirect: Some(target), }, client, )?; pf_trace!(self.id; "redirected client {} to replica {}", - client, next_replica); + client, target); } } return Ok(()); @@ -532,7 +550,7 @@ impl MultiPaxosReplica { None }; - if self.is_leader { + if self.is_leader() { // on leader, finishing the logging of a PrepareBal entry // is equivalent to receiving a Prepare reply from myself // (as an acceptor role) @@ -569,7 +587,7 @@ impl MultiPaxosReplica { slot, self.insts[slot - self.start_slot].bal); let inst = &self.insts[slot - self.start_slot]; - if self.is_leader { + if self.is_leader() { // on leader, finishing the logging of an AcceptData entry // is equivalent to receiving an Accept reply from myself // (as an acceptor role) @@ -738,10 +756,11 @@ impl MultiPaxosReplica { // if ballot is what I'm currently waiting on for Prepare replies: if ballot == self.bal_prep_sent { assert!(slot < self.start_slot + self.insts.len()); + let is_leader = self.is_leader(); let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if !self.is_leader + if !is_leader || (inst.status != Status::Preparing) || (ballot < inst.bal) { @@ -871,10 +890,11 @@ impl MultiPaxosReplica { // if ballot is what I'm currently waiting on for Accept replies: if ballot == self.bal_prepared { assert!(slot < self.start_slot + self.insts.len()); + let is_leader = self.is_leader(); let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if !self.is_leader + if !is_leader || (inst.status != Status::Accepting) || (ballot < inst.bal) { @@ -1055,16 +1075,25 @@ impl MultiPaxosReplica { /// Becomes a leader, sends self-initiated Prepare messages to followers /// for all in-progress instances, and starts broadcasting heartbeats. fn become_a_leader(&mut self) -> Result<(), SummersetError> { - if self.is_leader { + if self.is_leader() { return Ok(()); + } else if let Some(peer) = self.leader { + // mark old leader as dead + if self.peer_alive.get(peer)? 
{ + self.peer_alive.set(peer, false)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } } - self.is_leader = true; // this starts broadcasting heartbeats + self.leader = Some(self.id); // this starts broadcasting heartbeats self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; pf_info!(self.id; "becoming a leader..."); - // broadcast a heartbeat right now + // clear peers' heartbeat reply counters, and broadcast a heartbeat now + for cnts in self.hb_reply_cnts.values_mut() { + *cnts = (1, 0, 0); + } self.bcast_heartbeats()?; // make a greater ballot number and invalidate all in-progress instances @@ -1129,6 +1158,33 @@ impl MultiPaxosReplica { }, None, )?; + + // update max heartbeat reply counters and their repetitions seen + for (&peer, cnts) in self.hb_reply_cnts.iter_mut() { + if cnts.0 > cnts.1 { + // more hb replies have been received from this peer; it is + // probably alive + cnts.1 = cnts.0; + cnts.2 = 0; + } else { + // did not receive hb reply from this peer at least for the + // last sent hb from me; increment repetition count + cnts.2 += 1; + let repeat_threshold = (self.config.hb_hear_timeout_min + / self.config.hb_send_interval_ms) + as u8; + if cnts.2 > repeat_threshold { + // did not receive hb reply from this peer for too many + // past hbs sent from me; this peer is probably dead + if self.peer_alive.get(peer)? { + self.peer_alive.set(peer, false)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + } + } + + // I also heard this heartbeat from myself self.heard_heartbeat(self.id, self.bal_prep_sent, self.exec_bar)?; // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent); @@ -1143,6 +1199,7 @@ impl MultiPaxosReplica { ); // pf_trace!(self.id; "kickoff hb_hear_timer @ {} ms", timeout_ms); + self.hb_hear_timer.cancel()?; self.hb_hear_timer .kickoff(Duration::from_millis(timeout_ms))?; Ok(()) @@ -1153,10 +1210,18 @@ impl MultiPaxosReplica { /// leader status if I currently think I'm a leader. fn heard_heartbeat( &mut self, - _peer: ReplicaId, + peer: ReplicaId, ballot: Ballot, exec_bar: usize, ) -> Result<(), SummersetError> { + if peer != self.id { + self.hb_reply_cnts.get_mut(&peer).unwrap().0 += 1; + if !self.peer_alive.get(peer)? 
{ + self.peer_alive.set(peer, true)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + // ignore outdated heartbeats and those from peers with exec_bar < mine if ballot < self.bal_max_seen || exec_bar < self.exec_bar { return Ok(()); @@ -1165,12 +1230,28 @@ impl MultiPaxosReplica { // reset hearing timer self.kickoff_hb_hear_timer()?; - // clear my leader status if it carries a higher ballot number - if self.is_leader && ballot > self.bal_max_seen { - self.is_leader = false; - self.control_hub - .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; - pf_info!(self.id; "no longer a leader..."); + if peer != self.id { + // reply back with a Heartbeat message + self.transport_hub.send_msg( + PeerMsg::Heartbeat { + ballot: self.bal_max_seen, + exec_bar: self.exec_bar, + }, + peer, + )?; + + // if the peer has made a higher ballot number + if ballot > self.bal_max_seen { + // clear my leader status if I was one + if self.is_leader() && ballot > self.bal_max_seen { + self.control_hub + .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; + pf_info!(self.id; "no longer a leader..."); + } + + // set this peer to be the believed leader + self.leader = Some(peer); + } } // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); @@ -1537,7 +1618,7 @@ impl MultiPaxosReplica { } // collect and dump all Puts in executed instances - if self.is_leader { + if self.is_leader() { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } @@ -1570,7 +1651,7 @@ impl MultiPaxosReplica { self.start_slot = self.exec_bar; // discarding everything older than start_slot in WAL log - if self.is_leader { + if self.is_leader() { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } @@ -1824,6 +1905,10 @@ impl GenericReplica for MultiPaxosReplica { )); snapshot_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + let hb_reply_cnts = (0..population) + .filter_map(|p| if p == id { None } else { Some((p, (1, 0, 0))) }) + .collect(); + Ok(MultiPaxosReplica { id, population, @@ -1837,9 +1922,11 @@ impl GenericReplica for MultiPaxosReplica { storage_hub, snapshot_hub, transport_hub, + leader: None, hb_hear_timer: Timer::new(), hb_send_interval, - is_leader: false, + hb_reply_cnts, + peer_alive: Bitmap::new(population, true), insts: vec![], start_slot: 0, snapshot_interval, @@ -1927,7 +2014,7 @@ impl GenericReplica for MultiPaxosReplica { }, // leader sending heartbeat - _ = self.hb_send_interval.tick(), if !paused && self.is_leader => { + _ = self.hb_send_interval.tick(), if !paused && self.is_leader() => { if let Err(e) = self.bcast_heartbeats() { pf_error!(self.id; "error broadcasting heartbeats: {}", e); } diff --git a/src/utils/error.rs b/src/utils/error.rs index 0e73dccb..6c0907a0 100644 --- a/src/utils/error.rs +++ b/src/utils/error.rs @@ -3,6 +3,7 @@ use std::fmt; use std::io; use std::net; +use std::num; use crate::server::ReplicaId; @@ -30,6 +31,7 @@ macro_rules! impl_from_error { } impl_from_error!(io::Error); +impl_from_error!(num::ParseIntError); impl_from_error!(net::AddrParseError); impl_from_error!(rmp_serde::encode::Error); impl_from_error!(rmp_serde::decode::Error); diff --git a/summerset_client/src/clients/repl.rs b/summerset_client/src/clients/repl.rs index 09e4f330..88e0cfbb 100644 --- a/summerset_client/src/clients/repl.rs +++ b/summerset_client/src/clients/repl.rs @@ -1,6 +1,8 @@ //! Interactive REPL-style command-line interface client. 
+use std::collections::HashSet; use std::io::{self, Write}; +use std::str::SplitWhitespace; use crate::drivers::{DriverReply, DriverClosedLoop}; @@ -8,7 +10,9 @@ use color_print::{cprint, cprintln}; use tokio::time::Duration; -use summerset::{GenericEndpoint, Command, SummersetError}; +use summerset::{ + ReplicaId, GenericEndpoint, Command, CtrlRequest, CtrlReply, SummersetError, +}; /// Prompt string at the start of line. const PROMPT: &str = ">>>>> "; @@ -24,6 +28,9 @@ enum ReplCommand { /// Print help message. PrintHelp, + /// Control request to the manager. + Control(CtrlRequest), + /// Client exit. Exit, @@ -54,28 +61,61 @@ impl ClientRepl { } /// Prints the prompt string. - fn print_prompt(&mut self) { + #[inline] + fn print_prompt() { cprint!("{}", PROMPT); io::stdout().flush().unwrap(); } /// Prints (optionally) an error message and the help message. - fn print_help(&mut self, err: Option<&SummersetError>) { + fn print_help(err: Option<&SummersetError>) { if let Some(e) = err { cprintln!("✗ {}", e); } - println!("HELP: Supported commands are:"); - println!(" get "); - println!(" put "); - println!(" reconnect"); - println!(" help"); - println!(" exit"); + println!("HELP: Supported normal commands are:"); + println!(" get "); + println!(" put "); + println!(" help"); + println!(" exit"); + println!(" Commands for control/testing:"); + println!(" reconnect"); + println!(" reset [servers]"); + println!(" pause [servers]"); + println!(" resume [servers]"); + println!(" snapshot [servers]"); println!( " Keys and values currently cannot contain any whitespaces" ); io::stdout().flush().unwrap(); } + /// Expect to get the next segment string from parsed segs. + #[inline] + fn expect_next_seg<'s>( + segs: &mut SplitWhitespace<'s>, + ) -> Result<&'s str, SummersetError> { + if let Some(seg) = segs.next() { + Ok(seg) + } else { + let err = SummersetError("not enough args".into()); + Self::print_help(Some(&err)); + Err(err) + } + } + + /// Drain all of the remaining segments into a hash set and interpret as + /// replica IDs. + #[inline] + fn drain_server_ids( + segs: &mut SplitWhitespace, + ) -> Result, SummersetError> { + let mut servers = HashSet::new(); + for seg in segs { + servers.insert(seg.parse::()?); + } + Ok(servers) + } + /// Reads in user input and parses into a command. fn read_command(&mut self) -> Result { self.input_buf.clear(); @@ -98,36 +138,18 @@ impl ClientRepl { match &cmd_type.unwrap().to_lowercase()[..] 
{ "get" => { - let key = segs.next(); - if key.is_none() { - let err = SummersetError("not enough args".into()); - self.print_help(Some(&err)); - return Err(err); - } - - // keys and values are kept as-is, no case conversions - Ok(ReplCommand::Normal(Command::Get { - key: key.unwrap().into(), - })) + // keys are kept as-is, no case conversions + let key = Self::expect_next_seg(&mut segs)?; + Ok(ReplCommand::Normal(Command::Get { key: key.into() })) } "put" => { - let key = segs.next(); - if key.is_none() { - let err = SummersetError("not enough args".into()); - self.print_help(Some(&err)); - return Err(err); - } - let value = segs.next(); - if value.is_none() { - let err = SummersetError("not enough args".into()); - self.print_help(Some(&err)); - return Err(err); - } - + // keys and values are kept as-is, no case conversions + let key = Self::expect_next_seg(&mut segs)?; + let value = Self::expect_next_seg(&mut segs)?; Ok(ReplCommand::Normal(Command::Put { - key: key.unwrap().into(), - value: value.unwrap().into(), + key: key.into(), + value: value.into(), })) } @@ -135,6 +157,29 @@ impl ClientRepl { "reconnect" => Ok(ReplCommand::Reconnect), + "reset" => { + let servers = Self::drain_server_ids(&mut segs)?; + Ok(ReplCommand::Control(CtrlRequest::ResetServers { + servers, + durable: true, + })) + } + + "pause" => { + let servers = Self::drain_server_ids(&mut segs)?; + Ok(ReplCommand::Control(CtrlRequest::PauseServers { servers })) + } + + "resume" => { + let servers = Self::drain_server_ids(&mut segs)?; + Ok(ReplCommand::Control(CtrlRequest::ResumeServers { servers })) + } + + "snapshot" => { + let servers = Self::drain_server_ids(&mut segs)?; + Ok(ReplCommand::Control(CtrlRequest::TakeSnapshot { servers })) + } + "exit" => Ok(ReplCommand::Exit), _ => { @@ -142,7 +187,7 @@ impl ClientRepl { "unrecognized command: {}", cmd_type.unwrap() )); - self.print_help(Some(&err)); + Self::print_help(Some(&err)); Err(err) } } @@ -200,9 +245,49 @@ impl ClientRepl { io::stdout().flush().unwrap(); } + /// Makes a control request to the manager and wait for the reply. + async fn make_ctrl_req( + &mut self, + req: CtrlRequest, + ) -> Result { + let mut sent = self.driver.ctrl_stub().send_req(Some(&req))?; + while !sent { + sent = self.driver.ctrl_stub().send_req(None)?; + } + self.driver.ctrl_stub().recv_reply().await + } + + /// Prints control request reply. + fn print_ctrl_reply(&mut self, reply: CtrlReply) { + match reply { + CtrlReply::ResetServers { servers } => { + cprintln!("# reset servers {:?}", servers); + } + + CtrlReply::PauseServers { servers } => { + cprintln!("# paused servers {:?}", servers); + } + + CtrlReply::ResumeServers { servers } => { + cprintln!("# resumed servers {:?}", servers); + } + + CtrlReply::TakeSnapshot { snapshot_up_to } => { + cprintln!( + "# servers snapshot up to {:?}", + snapshot_up_to + ); + } + + _ => { + cprintln!("✗ unexpected ctrl reply type"); + } + } + } + /// One iteration of the REPL loop. 
async fn iter(&mut self) -> Result { - self.print_prompt(); + Self::print_prompt(); let cmd = self.read_command()?; match cmd { @@ -221,7 +306,7 @@ impl ClientRepl { } ReplCommand::PrintHelp => { - self.print_help(None); + Self::print_help(None); Ok(true) } @@ -230,6 +315,12 @@ impl ClientRepl { self.print_result(result); Ok(true) } + + ReplCommand::Control(req) => { + let reply = self.make_ctrl_req(req).await?; + self.print_ctrl_reply(reply); + Ok(true) + } } } From 5b1c107aa51db3f084c2881cccd050acda8013f9 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Fri, 6 Oct 2023 00:06:01 -0500 Subject: [PATCH 79/89] finish peer health tracking --- src/protocols/crossword.rs | 141 +++++++++++++++++++++++++++++------- src/protocols/multipaxos.rs | 8 +- src/protocols/rs_paxos.rs | 139 ++++++++++++++++++++++++++++------- 3 files changed, 234 insertions(+), 54 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index c86549c7..37e82ba8 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -294,14 +294,21 @@ pub struct CrosswordReplica { /// TransportHub module. transport_hub: TransportHub, + /// Who do I think is the effective leader of the cluster right now? + leader: Option, + /// Timer for hearing heartbeat from leader. hb_hear_timer: Timer, /// Interval for sending heartbeat to followers. hb_send_interval: Interval, - /// Do I think I am the leader? - is_leader: bool, + /// Heartbeat reply counters for approximate detection of follower health. + /// Tuple of (#hb_replied, #hb_replied seen at last send, repetition). + hb_reply_cnts: HashMap, + + /// Approximate health status tracking of peer replicas. + peer_alive: Bitmap, /// In-memory log of instances. insts: Vec, @@ -343,6 +350,12 @@ pub struct CrosswordReplica { // CrosswordReplica common helpers impl CrosswordReplica { + /// Do I think I am the current effective leader? + #[inline] + fn is_leader(&self) -> bool { + self.leader == Some(self.id) + } + /// Create an empty null instance. #[inline] fn null_instance(&self) -> Result { @@ -573,21 +586,26 @@ impl CrosswordReplica { pf_debug!(self.id; "got request batch of size {}", batch_size); // if I'm not a leader, ignore client requests - if !self.is_leader { + if !self.is_leader() { for (client, req) in req_batch { if let ApiRequest::Req { id: req_id, .. 
} = req { - // tell the client to try on the next replica - let next_replica = (self.id + 1) % self.population; + // tell the client to try on known leader or just the + // next ID replica + let target = if let Some(peer) = self.leader { + peer + } else { + (self.id + 1) % self.population + }; self.external_api.send_reply( ApiReply::Reply { id: req_id, result: None, - redirect: Some(next_replica), + redirect: Some(target), }, client, )?; pf_trace!(self.id; "redirected client {} to replica {}", - client, next_replica); + client, target); } } return Ok(()); @@ -747,7 +765,7 @@ impl CrosswordReplica { None }; - if self.is_leader { + if self.is_leader() { // on leader, finishing the logging of a PrepareBal entry // is equivalent to receiving a Prepare reply from myself // (as an acceptor role) @@ -784,7 +802,7 @@ impl CrosswordReplica { slot, self.insts[slot - self.start_slot].bal); let inst = &self.insts[slot - self.start_slot]; - if self.is_leader { + if self.is_leader() { // on leader, finishing the logging of an AcceptData entry // is equivalent to receiving an Accept reply from myself // (as an acceptor role) @@ -968,10 +986,11 @@ impl CrosswordReplica { // if ballot is what I'm currently waiting on for Prepare replies: if ballot == self.bal_prep_sent { assert!(slot < self.start_slot + self.insts.len()); + let is_leader = self.is_leader(); let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if !self.is_leader + if !is_leader || (inst.status != Status::Preparing) || (ballot < inst.bal) { @@ -1157,10 +1176,11 @@ impl CrosswordReplica { // if ballot is what I'm currently waiting on for Accept replies: if ballot == self.bal_prepared { assert!(slot < self.start_slot + self.insts.len()); + let is_leader = self.is_leader(); let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if !self.is_leader + if !is_leader || (inst.status != Status::Accepting) || (ballot < inst.bal) { @@ -1484,16 +1504,25 @@ impl CrosswordReplica { /// Becomes a leader, sends self-initiated Prepare messages to followers /// for all in-progress instances, and starts broadcasting heartbeats. fn become_a_leader(&mut self) -> Result<(), SummersetError> { - if self.is_leader { + if self.is_leader() { return Ok(()); + } else if let Some(peer) = self.leader { + // mark old leader as dead + if self.peer_alive.get(peer)? 
{ + self.peer_alive.set(peer, false)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } } - self.is_leader = true; // this starts broadcasting heartbeats + self.leader = Some(self.id); // this starts broadcasting heartbeats self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; pf_info!(self.id; "becoming a leader..."); - // broadcast a heartbeat right now + // clear peers' heartbeat reply counters, and broadcast a heartbeat now + for cnts in self.hb_reply_cnts.values_mut() { + *cnts = (1, 0, 0); + } self.bcast_heartbeats()?; // make a greater ballot number and invalidate all in-progress instances @@ -1592,6 +1621,33 @@ impl CrosswordReplica { }, None, )?; + + // update max heartbeat reply counters and their repetitions seen + for (&peer, cnts) in self.hb_reply_cnts.iter_mut() { + if cnts.0 > cnts.1 { + // more hb replies have been received from this peer; it is + // probably alive + cnts.1 = cnts.0; + cnts.2 = 0; + } else { + // did not receive hb reply from this peer at least for the + // last sent hb from me; increment repetition count + cnts.2 += 1; + let repeat_threshold = (self.config.hb_hear_timeout_min + / self.config.hb_send_interval_ms) + as u8; + if cnts.2 > repeat_threshold { + // did not receive hb reply from this peer for too many + // past hbs sent from me; this peer is probably dead + if self.peer_alive.get(peer)? { + self.peer_alive.set(peer, false)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + } + } + + // I also heard this heartbeat from myself self.heard_heartbeat(self.id, self.bal_prep_sent, self.exec_bar)?; // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent); @@ -1616,10 +1672,18 @@ impl CrosswordReplica { /// leader status if I currently think I'm a leader. fn heard_heartbeat( &mut self, - _peer: ReplicaId, + peer: ReplicaId, ballot: Ballot, exec_bar: usize, ) -> Result<(), SummersetError> { + if peer != self.id { + self.hb_reply_cnts.get_mut(&peer).unwrap().0 += 1; + if !self.peer_alive.get(peer)? 
{ + self.peer_alive.set(peer, true)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + // ignore outdated heartbeats and those from peers with exec_bar < mine if ballot < self.bal_max_seen || exec_bar < self.exec_bar { return Ok(()); @@ -1628,12 +1692,30 @@ impl CrosswordReplica { // reset hearing timer self.kickoff_hb_hear_timer()?; - // clear my leader status if it carries a higher ballot number - if self.is_leader && ballot > self.bal_max_seen { - self.is_leader = false; - self.control_hub - .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; - pf_info!(self.id; "no longer a leader..."); + if peer != self.id { + // reply back with a Heartbeat message + self.transport_hub.send_msg( + PeerMsg::Heartbeat { + ballot, + exec_bar: self.exec_bar, + }, + peer, + )?; + + // if the peer has made a higher ballot number + if ballot > self.bal_max_seen { + self.bal_max_seen = ballot; + + // clear my leader status if I was one + if self.is_leader() { + self.control_hub + .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; + pf_info!(self.id; "no longer a leader..."); + } + + // set this peer to be the believed leader + self.leader = Some(peer); + } } // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); @@ -1799,6 +1881,7 @@ impl CrosswordReplica { pf_warn!(self.id; "server got resume req"); // reset leader heartbeat timer + self.hb_hear_timer.cancel()?; self.kickoff_hb_hear_timer()?; *paused = false; @@ -2114,7 +2197,7 @@ impl CrosswordReplica { } // collect and dump all Puts in executed instances - if self.is_leader { + if self.is_leader() { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } @@ -2147,7 +2230,7 @@ impl CrosswordReplica { self.start_slot = self.exec_bar; // discarding everything older than start_slot in WAL log - if self.is_leader { + if self.is_leader() { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } @@ -2429,6 +2512,10 @@ impl GenericReplica for CrosswordReplica { )); snapshot_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + let hb_reply_cnts = (0..population) + .filter_map(|p| if p == id { None } else { Some((p, (1, 0, 0))) }) + .collect(); + Ok(CrosswordReplica { id, population, @@ -2442,9 +2529,11 @@ impl GenericReplica for CrosswordReplica { storage_hub, snapshot_hub, transport_hub, + leader: None, hb_hear_timer: Timer::new(), hb_send_interval, - is_leader: false, + hb_reply_cnts, + peer_alive: Bitmap::new(population, true), insts: vec![], start_slot: 0, snapshot_interval, @@ -2537,7 +2626,7 @@ impl GenericReplica for CrosswordReplica { }, // leader sending heartbeat - _ = self.hb_send_interval.tick(), if !paused && self.is_leader => { + _ = self.hb_send_interval.tick(), if !paused && self.is_leader() => { if let Err(e) = self.bcast_heartbeats() { pf_error!(self.id; "error broadcasting heartbeats: {}", e); } @@ -2556,7 +2645,7 @@ impl GenericReplica for CrosswordReplica { }, // follower gossiping trigger - _ = self.gossip_timer.timeout(), if !paused && !self.is_leader => { + _ = self.gossip_timer.timeout(), if !paused && !self.is_leader() => { if let Err(e) = self.trigger_gossiping() { pf_error!(self.id; "error triggering gossiping: {}", e); } diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index f1899544..7f1fb73b 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -1199,7 +1199,6 @@ impl MultiPaxosReplica { ); // pf_trace!(self.id; "kickoff hb_hear_timer @ {} ms", timeout_ms); - 
self.hb_hear_timer.cancel()?; self.hb_hear_timer .kickoff(Duration::from_millis(timeout_ms))?; Ok(()) @@ -1234,7 +1233,7 @@ impl MultiPaxosReplica { // reply back with a Heartbeat message self.transport_hub.send_msg( PeerMsg::Heartbeat { - ballot: self.bal_max_seen, + ballot, exec_bar: self.exec_bar, }, peer, @@ -1242,8 +1241,10 @@ impl MultiPaxosReplica { // if the peer has made a higher ballot number if ballot > self.bal_max_seen { + self.bal_max_seen = ballot; + // clear my leader status if I was one - if self.is_leader() && ballot > self.bal_max_seen { + if self.is_leader() { self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; pf_info!(self.id; "no longer a leader..."); @@ -1320,6 +1321,7 @@ impl MultiPaxosReplica { pf_warn!(self.id; "server got resume req"); // reset leader heartbeat timer + self.hb_hear_timer.cancel()?; self.kickoff_hb_hear_timer()?; *paused = false; diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 19399535..d6e79486 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -274,14 +274,21 @@ pub struct RSPaxosReplica { /// TransportHub module. transport_hub: TransportHub, + /// Who do I think is the effective leader of the cluster right now? + leader: Option, + /// Timer for hearing heartbeat from leader. hb_hear_timer: Timer, /// Interval for sending heartbeat to followers. hb_send_interval: Interval, - /// Do I think I am the leader? - is_leader: bool, + /// Heartbeat reply counters for approximate detection of follower health. + /// Tuple of (#hb_replied, #hb_replied seen at last send, repetition). + hb_reply_cnts: HashMap, + + /// Approximate health status tracking of peer replicas. + peer_alive: Bitmap, /// In-memory log of instances. insts: Vec, @@ -320,6 +327,12 @@ pub struct RSPaxosReplica { // RSPaxosReplica common helpers impl RSPaxosReplica { + /// Do I think I am the current effective leader? + #[inline] + fn is_leader(&self) -> bool { + self.leader == Some(self.id) + } + /// Create an empty null instance. #[inline] fn null_instance(&self) -> Result { @@ -423,21 +436,26 @@ impl RSPaxosReplica { pf_debug!(self.id; "got request batch of size {}", batch_size); // if I'm not a leader, ignore client requests - if !self.is_leader { + if !self.is_leader() { for (client, req) in req_batch { if let ApiRequest::Req { id: req_id, .. 
} = req { - // tell the client to try on the next replica - let next_replica = (self.id + 1) % self.population; + // tell the client to try on known leader or just the + // next ID replica + let target = if let Some(peer) = self.leader { + peer + } else { + (self.id + 1) % self.population + }; self.external_api.send_reply( ApiReply::Reply { id: req_id, result: None, - redirect: Some(next_replica), + redirect: Some(target), }, client, )?; pf_trace!(self.id; "redirected client {} to replica {}", - client, next_replica); + client, target); } } return Ok(()); @@ -580,7 +598,7 @@ impl RSPaxosReplica { None }; - if self.is_leader { + if self.is_leader() { // on leader, finishing the logging of a PrepareBal entry // is equivalent to receiving a Prepare reply from myself // (as an acceptor role) @@ -617,7 +635,7 @@ impl RSPaxosReplica { slot, self.insts[slot - self.start_slot].bal); let inst = &self.insts[slot - self.start_slot]; - if self.is_leader { + if self.is_leader() { // on leader, finishing the logging of an AcceptData entry // is equivalent to receiving an Accept reply from myself // (as an acceptor role) @@ -798,10 +816,11 @@ impl RSPaxosReplica { // if ballot is what I'm currently waiting on for Prepare replies: if ballot == self.bal_prep_sent { assert!(slot < self.start_slot + self.insts.len()); + let is_leader = self.is_leader(); let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if !self.is_leader + if !is_leader || (inst.status != Status::Preparing) || (ballot < inst.bal) { @@ -968,10 +987,11 @@ impl RSPaxosReplica { // if ballot is what I'm currently waiting on for Accept replies: if ballot == self.bal_prepared { assert!(slot < self.start_slot + self.insts.len()); + let is_leader = self.is_leader(); let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if !self.is_leader + if !is_leader || (inst.status != Status::Accepting) || (ballot < inst.bal) { @@ -1276,16 +1296,25 @@ impl RSPaxosReplica { /// Becomes a leader, sends self-initiated Prepare messages to followers /// for all in-progress instances, and starts broadcasting heartbeats. fn become_a_leader(&mut self) -> Result<(), SummersetError> { - if self.is_leader { + if self.is_leader() { return Ok(()); + } else if let Some(peer) = self.leader { + // mark old leader as dead + if self.peer_alive.get(peer)? 
{ + self.peer_alive.set(peer, false)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } } - self.is_leader = true; // this starts broadcasting heartbeats + self.leader = Some(self.id); // this starts broadcasting heartbeats self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; pf_info!(self.id; "becoming a leader..."); - // broadcast a heartbeat right now + // clear peers' heartbeat reply counters, and broadcast a heartbeat now + for cnts in self.hb_reply_cnts.values_mut() { + *cnts = (1, 0, 0); + } self.bcast_heartbeats()?; // make a greater ballot number and invalidate all in-progress instances @@ -1367,6 +1396,33 @@ impl RSPaxosReplica { }, None, )?; + + // update max heartbeat reply counters and their repetitions seen + for (&peer, cnts) in self.hb_reply_cnts.iter_mut() { + if cnts.0 > cnts.1 { + // more hb replies have been received from this peer; it is + // probably alive + cnts.1 = cnts.0; + cnts.2 = 0; + } else { + // did not receive hb reply from this peer at least for the + // last sent hb from me; increment repetition count + cnts.2 += 1; + let repeat_threshold = (self.config.hb_hear_timeout_min + / self.config.hb_send_interval_ms) + as u8; + if cnts.2 > repeat_threshold { + // did not receive hb reply from this peer for too many + // past hbs sent from me; this peer is probably dead + if self.peer_alive.get(peer)? { + self.peer_alive.set(peer, false)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + } + } + + // I also heard this heartbeat from myself self.heard_heartbeat(self.id, self.bal_prep_sent, self.exec_bar)?; // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent); @@ -1391,10 +1447,18 @@ impl RSPaxosReplica { /// leader status if I currently think I'm a leader. fn heard_heartbeat( &mut self, - _peer: ReplicaId, + peer: ReplicaId, ballot: Ballot, exec_bar: usize, ) -> Result<(), SummersetError> { + if peer != self.id { + self.hb_reply_cnts.get_mut(&peer).unwrap().0 += 1; + if !self.peer_alive.get(peer)? 
{ + self.peer_alive.set(peer, true)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + // ignore outdated heartbeats and those from peers with exec_bar < mine if ballot < self.bal_max_seen || exec_bar < self.exec_bar { return Ok(()); @@ -1403,12 +1467,30 @@ impl RSPaxosReplica { // reset hearing timer self.kickoff_hb_hear_timer()?; - // clear my leader status if it carries a higher ballot number - if self.is_leader && ballot > self.bal_max_seen { - self.is_leader = false; - self.control_hub - .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; - pf_info!(self.id; "no longer a leader..."); + if peer != self.id { + // reply back with a Heartbeat message + self.transport_hub.send_msg( + PeerMsg::Heartbeat { + ballot, + exec_bar: self.exec_bar, + }, + peer, + )?; + + // if the peer has made a higher ballot number + if ballot > self.bal_max_seen { + self.bal_max_seen = ballot; + + // clear my leader status if I was one + if self.is_leader() { + self.control_hub + .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; + pf_info!(self.id; "no longer a leader..."); + } + + // set this peer to be the believed leader + self.leader = Some(peer); + } } // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); @@ -1477,6 +1559,7 @@ impl RSPaxosReplica { pf_warn!(self.id; "server got resume req"); // reset leader heartbeat timer + self.hb_hear_timer.cancel()?; self.kickoff_hb_hear_timer()?; *paused = false; @@ -1792,7 +1875,7 @@ impl RSPaxosReplica { } // collect and dump all Puts in executed instances - if self.is_leader { + if self.is_leader() { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } @@ -1825,7 +1908,7 @@ impl RSPaxosReplica { self.start_slot = self.exec_bar; // discarding everything older than start_slot in WAL log - if self.is_leader { + if self.is_leader() { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } @@ -2099,6 +2182,10 @@ impl GenericReplica for RSPaxosReplica { )); snapshot_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + let hb_reply_cnts = (0..population) + .filter_map(|p| if p == id { None } else { Some((p, (1, 0, 0))) }) + .collect(); + Ok(RSPaxosReplica { id, population, @@ -2112,9 +2199,11 @@ impl GenericReplica for RSPaxosReplica { storage_hub, snapshot_hub, transport_hub, + leader: None, hb_hear_timer: Timer::new(), hb_send_interval, - is_leader: false, + hb_reply_cnts, + peer_alive: Bitmap::new(population, true), insts: vec![], start_slot: 0, snapshot_interval, @@ -2203,7 +2292,7 @@ impl GenericReplica for RSPaxosReplica { }, // leader sending heartbeat - _ = self.hb_send_interval.tick(), if !paused && self.is_leader => { + _ = self.hb_send_interval.tick(), if !paused && self.is_leader() => { if let Err(e) = self.bcast_heartbeats() { pf_error!(self.id; "error broadcasting heartbeats: {}", e); } From 313dba1eab0e25b97fe9f39164d3dd09bcb61cdf Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Fri, 6 Oct 2023 00:24:29 -0500 Subject: [PATCH 80/89] finish peer health tracking --- src/protocols/crossword.rs | 18 +++++++++++++----- src/protocols/multipaxos.rs | 9 +++++++++ src/protocols/rs_paxos.rs | 9 +++++++++ 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 37e82ba8..554965cd 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -495,15 +495,14 @@ impl CrosswordReplica { let mut targets_excl = HashMap::new(); for p in (me + 1)..(population + me) { let peer = 
p % population; - if peer == src_peer { - // skip leader who initially replicated this instance to me - continue; - } - if !first_try { // first try probably did not succeed, so do it conservatively targets_excl.insert(peer, avail_shards_map.to_vec()); } else { + // skip leader who initially replicated this instance to me + if peer == src_peer { + continue; + } // first try: only ask for a minimum number of data shards let mut useful_shards = Vec::new(); for idx in Self::shards_for_replica( @@ -1945,6 +1944,9 @@ impl CrosswordReplica { ) -> Result<(), SummersetError> { match entry { LogEntry::PrepareBal { slot, ballot } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } // locate instance in memory, filling in null instances if needed while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); @@ -1968,6 +1970,9 @@ impl CrosswordReplica { ballot, reqs_cw, } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } // locate instance in memory, filling in null instances if needed while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); @@ -1994,6 +1999,9 @@ impl CrosswordReplica { } LogEntry::CommitSlot { slot } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } assert!(slot < self.start_slot + self.insts.len()); // update instance status self.insts[slot - self.start_slot].status = Status::Committed; diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 7f1fb73b..aa42d2d1 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -1385,6 +1385,9 @@ impl MultiPaxosReplica { ) -> Result<(), SummersetError> { match entry { LogEntry::PrepareBal { slot, ballot } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } // locate instance in memory, filling in null instances if needed while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()); @@ -1404,6 +1407,9 @@ impl MultiPaxosReplica { } LogEntry::AcceptData { slot, ballot, reqs } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } // locate instance in memory, filling in null instances if needed while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()); @@ -1430,6 +1436,9 @@ impl MultiPaxosReplica { } LogEntry::CommitSlot { slot } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } assert!(slot < self.start_slot + self.insts.len()); // update instance status self.insts[slot - self.start_slot].status = Status::Committed; diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index d6e79486..79c7a09a 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -1623,6 +1623,9 @@ impl RSPaxosReplica { ) -> Result<(), SummersetError> { match entry { LogEntry::PrepareBal { slot, ballot } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } // locate instance in memory, filling in null instances if needed while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); @@ -1646,6 +1649,9 @@ impl RSPaxosReplica { ballot, reqs_cw, } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } // locate instance in memory, filling in null instances if needed while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); @@ -1672,6 
+1678,9 @@ impl RSPaxosReplica { } LogEntry::CommitSlot { slot } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } assert!(slot < self.start_slot + self.insts.len()); // update instance status self.insts[slot - self.start_slot].status = Status::Committed; From 1e8a8a08615ad5e6423ac4426a43ac0896ec93e2 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Fri, 6 Oct 2023 12:59:08 -0500 Subject: [PATCH 81/89] add fallback mechanism to Crossword --- src/protocols/crossword.rs | 225 ++++++++++++++++++++++++++++++------- src/protocols/rs_paxos.rs | 52 ++++----- 2 files changed, 208 insertions(+), 69 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 554965cd..7ac067aa 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -73,7 +73,7 @@ pub struct ReplicaConfigCrossword { pub recon_chunk_size: usize, /// Number of shards to assign to each replica. - // TODO: proper config options. + // TODO: think about how to allow unbalanced assignments. pub shards_per_replica: u8, // Performance simulation params (all zeros means no perf simulation): @@ -265,7 +265,11 @@ pub struct CrosswordReplica { population: u8, /// Majority quorum size. - quorum_cnt: u8, + majority: u8, + + /// Current #shards per replica configuration. + // TODO: probably needs something better for unbalanced assignments. + shards_per_replica: u8, /// Configuration parameters struct. config: ReplicaConfigCrossword, @@ -363,14 +367,14 @@ impl CrosswordReplica { bal: 0, status: Status::Null, reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, + self.majority, + self.population - self.majority, )?, voted: ( 0, RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, + self.majority, + self.population - self.majority, )?, ), leader_bk: None, @@ -446,7 +450,7 @@ impl CrosswordReplica { (slot, cmd_idx) } - /// TODO: maybe remove this. + // TODO: think about how to allow unbalanced assignments. #[inline] fn shards_for_replica( slot: usize, @@ -460,14 +464,14 @@ impl CrosswordReplica { .collect() } - /// TODO: should let leader incorporate assignment metadata in Accept - /// messages. With more complex assignment policies, a follower probably - /// does not know the assignment. + // TODO: should let leader incorporate assignment metadata in Accept + // messages. With more complex assignment policies, a follower probably + // does not know the assignment. fn gossip_targets_excl( slot: usize, me: ReplicaId, population: u8, - quorum_cnt: u8, + majority: u8, shards_per_replica: u8, mut avail_shards_map: Bitmap, replica_bk: &mut Option, @@ -483,7 +487,7 @@ impl CrosswordReplica { src_peer = *source; first_try = true; // first try: exclude all parity shards - for idx in quorum_cnt..population { + for idx in majority..population { avail_shards_map.set(idx, true).unwrap(); } *gossip_tried = true; @@ -529,16 +533,24 @@ impl CrosswordReplica { targets_excl } - /// TODO: make better impl of this. + // TODO: think about how to allow unbalanced assignments. 
fn coverage_under_faults( population: u8, acks: &HashMap, fault_tolerance: u8, + // if given, assume balanced assignment + shards_per_replica: Option, ) -> u8 { if acks.len() <= fault_tolerance as usize { return 0; } + // if assuming balanced assignment + if let Some(shards_per_replica) = shards_per_replica { + assert!(shards_per_replica > 0); + return acks.len() as u8 - fault_tolerance + shards_per_replica - 1; + } + // enumerate all subsets of acks excluding fault number of replicas let cnt = (acks.len() - fault_tolerance as usize) as u32; let servers: Vec = acks.keys().cloned().collect(); @@ -571,6 +583,107 @@ impl CrosswordReplica { min_coverage } + + /// Change to a new #shards_per_replica vs. quorum_size configuration. If + /// `redo_accepts` is true, redo all the instances that are currently in + /// the Accepting phase. This typically should happen when we are falling + /// back to a smaller quorum_size because of detected follower failures; + /// for performance-oriented config changes, this is not necessary. + // TODO: think about how to allow unbalanced assignments. + fn change_assignment_config( + &mut self, + shards_per_replica: u8, + redo_accepts: bool, + ) -> Result<(), SummersetError> { + assert!(shards_per_replica > 0); + if shards_per_replica > self.majority { + return Ok(()); // invalid, ignore + } + + let quorum_size = self.majority + self.config.fault_tolerance + 1 + - shards_per_replica; + self.shards_per_replica = shards_per_replica; + pf_info!(self.id; "switching assignment config: ({} - {}) {}", + self.shards_per_replica, quorum_size, + if redo_accepts { "redo" } else { "" }); + + if redo_accepts { + for (slot, inst) in self + .insts + .iter_mut() + .enumerate() + .map(|(s, i)| (self.start_slot + s, i)) + { + if inst.status == Status::Accepting { + assert!(inst.leader_bk.is_some()); + inst.bal = self.bal_prepared; + inst.leader_bk.as_mut().unwrap().accept_acks.clear(); + pf_debug!(self.id; "enter Accept phase for slot {} bal {}", + slot, inst.bal); + + // record update to largest accepted ballot and corresponding data + let subset_copy = inst.reqs_cw.subset_copy( + Bitmap::from( + self.population, + Self::shards_for_replica( + slot, + self.id, + self.population, + self.shards_per_replica, + ), + ), + false, + )?; + inst.voted = (inst.bal, subset_copy.clone()); + self.storage_hub.submit_action( + Self::make_log_action_id(slot, Status::Accepting), + LogAction::Append { + entry: LogEntry::AcceptData { + slot, + ballot: inst.bal, + // persist only some shards on myself + reqs_cw: subset_copy, + }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted AcceptData log action for slot {} bal {}", + slot, inst.bal); + + // send Accept messages to all peers, each getting its subset of + // shards of data + for peer in 0..self.population { + if peer == self.id { + continue; + } + self.transport_hub.send_msg( + PeerMsg::Accept { + slot, + ballot: inst.bal, + reqs_cw: inst.reqs_cw.subset_copy( + Bitmap::from( + self.population, + Self::shards_for_replica( + slot, + peer, + self.population, + self.shards_per_replica, + ), + ), + false, + )?, + }, + peer, + )?; + } + pf_trace!(self.id; "broadcast Accept messages for slot {} bal {}", + slot, inst.bal); + } + } + } + + Ok(()) + } } // CrosswordReplica client requests entrance @@ -613,8 +726,8 @@ impl CrosswordReplica { // compute the complete Reed-Solomon codeword for the batch data let mut reqs_cw = RSCodeword::from_data( req_batch, - self.quorum_cnt, - self.population - self.quorum_cnt, + self.majority, + 
self.population - self.majority, )?; reqs_cw.compute_parity(Some(&self.rs_coder))?; @@ -690,7 +803,7 @@ impl CrosswordReplica { slot, self.id, self.population, - self.config.shards_per_replica, + self.shards_per_replica, ), ), false, @@ -728,7 +841,7 @@ impl CrosswordReplica { slot, peer, self.population, - self.config.shards_per_replica, + self.shards_per_replica, ), ), false, @@ -846,12 +959,12 @@ impl CrosswordReplica { let now_slot = self.commit_bar; self.commit_bar += 1; - if inst.reqs_cw.avail_shards() < self.quorum_cnt { + if inst.reqs_cw.avail_shards() < self.majority { // can't execute if I don't have the complete request batch pf_debug!(self.id; "postponing execution for slot {} (shards {}/{})", - slot, inst.reqs_cw.avail_shards(), self.quorum_cnt); + slot, inst.reqs_cw.avail_shards(), self.majority); break; - } else if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + } else if inst.reqs_cw.avail_data_shards() < self.majority { // have enough shards but need reconstruction inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; } @@ -1023,10 +1136,10 @@ impl CrosswordReplica { // reconstruct the original data, enter Accept phase for this // instance using the request batch value constructed using shards // with the highest ballot number in quorum - if leader_bk.prepare_acks.count() >= self.quorum_cnt - && inst.reqs_cw.avail_shards() >= self.quorum_cnt + if leader_bk.prepare_acks.count() >= self.majority + && inst.reqs_cw.avail_shards() >= self.majority { - if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + if inst.reqs_cw.avail_data_shards() < self.majority { // have enough shards but need reconstruction inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; } @@ -1052,7 +1165,7 @@ impl CrosswordReplica { slot, self.id, self.population, - self.config.shards_per_replica, + self.shards_per_replica, ), ), false, @@ -1088,7 +1201,7 @@ impl CrosswordReplica { slot, peer, self.population, - self.config.shards_per_replica, + self.shards_per_replica, ), ), false, @@ -1202,19 +1315,20 @@ impl CrosswordReplica { slot, peer, self.population, - self.config.shards_per_replica, + self.shards_per_replica, ), ), ); // if quorum size reached AND enough number of shards are // remembered, mark this instance as committed - if leader_bk.accept_acks.len() as u8 >= self.quorum_cnt + if leader_bk.accept_acks.len() as u8 >= self.majority && Self::coverage_under_faults( self.population, &leader_bk.accept_acks, self.config.fault_tolerance, - ) >= self.quorum_cnt + Some(self.shards_per_replica), + ) >= self.majority { inst.status = Status::Committed; pf_debug!(self.id; "committed instance at slot {} bal {}", @@ -1357,12 +1471,12 @@ impl CrosswordReplica { while now_slot < self.start_slot + self.insts.len() { let inst = &mut self.insts[now_slot - self.start_slot]; if inst.status < Status::Committed - || inst.reqs_cw.avail_shards() < self.quorum_cnt + || inst.reqs_cw.avail_shards() < self.majority { break; } - if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + if inst.reqs_cw.avail_data_shards() < self.majority { // have enough shards but need reconstruction inst.reqs_cw .reconstruct_data(Some(&self.rs_coder))?; @@ -1510,6 +1624,18 @@ impl CrosswordReplica { if self.peer_alive.get(peer)? 
{ self.peer_alive.set(peer, false)?; pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + // check if we need to fall back to a config with smaller + // fast-path quorum size + let curr_quorum_size = + self.majority + self.config.fault_tolerance + 1 + - self.shards_per_replica; + if self.peer_alive.count() < curr_quorum_size { + self.change_assignment_config( + self.shards_per_replica + curr_quorum_size + - self.peer_alive.count(), + true, + )?; + } } } @@ -1579,7 +1705,7 @@ impl CrosswordReplica { // complicated and slow to do the "data shards only" optimization // during fail-over, so just do this conservatively here if inst.status == Status::Committed - && inst.reqs_cw.avail_shards() < self.quorum_cnt + && inst.reqs_cw.avail_shards() < self.majority { recon_slots.insert(slot, inst.reqs_cw.avail_shards_vec()); @@ -1649,6 +1775,18 @@ impl CrosswordReplica { // I also heard this heartbeat from myself self.heard_heartbeat(self.id, self.bal_prep_sent, self.exec_bar)?; + // check if we need to fall back to a config with smaller fast-path + // quorum size + let curr_quorum_size = self.majority + self.config.fault_tolerance + 1 + - self.shards_per_replica; + if self.peer_alive.count() < curr_quorum_size { + self.change_assignment_config( + self.shards_per_replica + curr_quorum_size + - self.peer_alive.count(), + true, + )?; + } + // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent); Ok(()) } @@ -1765,14 +1903,14 @@ impl CrosswordReplica { let avail_shards_map = self.insts[slot - self.start_slot] .reqs_cw .avail_shards_map(); - if avail_shards_map.count() < self.quorum_cnt { + if avail_shards_map.count() < self.majority { // decide which peers to ask for which shards from let targets_excl = Self::gossip_targets_excl( slot, self.id, self.population, - self.quorum_cnt, - self.config.shards_per_replica, + self.majority, + self.shards_per_replica, avail_shards_map, &mut self.insts[slot - self.start_slot].replica_bk, ); @@ -2017,11 +2155,11 @@ impl CrosswordReplica { // update commit_bar self.commit_bar += 1; // check number of available shards - if inst.reqs_cw.avail_shards() < self.quorum_cnt { + if inst.reqs_cw.avail_shards() < self.majority { // can't execute if I don't have the complete request batch break; } else if inst.reqs_cw.avail_data_shards() - < self.quorum_cnt + < self.majority { // have enough shards but need reconstruction inst.reqs_cw @@ -2110,7 +2248,7 @@ impl CrosswordReplica { let mut pairs = HashMap::new(); for slot in self.start_slot..self.exec_bar { let inst = &mut self.insts[slot - self.start_slot]; - assert!(inst.reqs_cw.avail_data_shards() >= self.quorum_cnt); + assert!(inst.reqs_cw.avail_data_shards() >= self.majority); for (_, req) in inst.reqs_cw.get_data()?.clone() { if let ApiRequest::Req { cmd: Command::Put { key, value }, @@ -2467,20 +2605,20 @@ impl GenericReplica for CrosswordReplica { // create a Reed-Solomon coder with num_data_shards == quorum size and // num_parity shards == population - quorum - let quorum_cnt = (population / 2) + 1; - if config.fault_tolerance > (population - quorum_cnt) { + let majority = (population / 2) + 1; + if config.fault_tolerance > (population - majority) { return logged_err!(id; "invalid config.fault_tolerance '{}'", config.fault_tolerance); } if config.shards_per_replica == 0 - || config.shards_per_replica > quorum_cnt + || config.shards_per_replica > majority { return logged_err!(id; "invalid config.shards_per_replica '{}'", config.shards_per_replica); } let rs_coder = ReedSolomon::new( - 
quorum_cnt as usize, - (population - quorum_cnt) as usize, + majority as usize, + (population - majority) as usize, )?; // proactively connect to some peers, then wait for all population @@ -2527,7 +2665,8 @@ impl GenericReplica for CrosswordReplica { Ok(CrosswordReplica { id, population, - quorum_cnt, + majority, + shards_per_replica: config.shards_per_replica, config, _api_addr: api_addr, _p2p_addr: p2p_addr, diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 79c7a09a..750ec48f 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -245,7 +245,7 @@ pub struct RSPaxosReplica { population: u8, /// Majority quorum size. - quorum_cnt: u8, + majority: u8, /// Configuration parameters struct. config: ReplicaConfigRSPaxos, @@ -340,14 +340,14 @@ impl RSPaxosReplica { bal: 0, status: Status::Null, reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, + self.majority, + self.population - self.majority, )?, voted: ( 0, RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, + self.majority, + self.population - self.majority, )?, ), leader_bk: None, @@ -464,8 +464,8 @@ impl RSPaxosReplica { // compute the complete Reed-Solomon codeword for the batch data let mut reqs_cw = RSCodeword::from_data( req_batch, - self.quorum_cnt, - self.population - self.quorum_cnt, + self.majority, + self.population - self.majority, )?; reqs_cw.compute_parity(Some(&self.rs_coder))?; @@ -680,12 +680,12 @@ impl RSPaxosReplica { let now_slot = self.commit_bar; self.commit_bar += 1; - if inst.reqs_cw.avail_shards() < self.quorum_cnt { + if inst.reqs_cw.avail_shards() < self.majority { // can't execute if I don't have the complete request batch pf_debug!(self.id; "postponing execution for slot {} (shards {}/{})", - slot, inst.reqs_cw.avail_shards(), self.quorum_cnt); + slot, inst.reqs_cw.avail_shards(), self.majority); break; - } else if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + } else if inst.reqs_cw.avail_data_shards() < self.majority { // have enough shards but need reconstruction inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; } @@ -854,10 +854,10 @@ impl RSPaxosReplica { // reconstruct the original data, enter Accept phase for this // instance using the request batch value constructed using shards // with the highest ballot number in quorum - if leader_bk.prepare_acks.count() >= self.quorum_cnt - && inst.reqs_cw.avail_shards() >= self.quorum_cnt + if leader_bk.prepare_acks.count() >= self.majority + && inst.reqs_cw.avail_shards() >= self.majority { - if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + if inst.reqs_cw.avail_data_shards() < self.majority { // have enough shards but need reconstruction inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; } @@ -1010,9 +1010,9 @@ impl RSPaxosReplica { // if quorum size reached AND enough number of shards are // remembered, mark this instance as committed; in RS-Paxos, this - // means accept_acks.count() >= self.quorum_cnt + fault_tolerance + // means accept_acks.count() >= self.majority + fault_tolerance if leader_bk.accept_acks.count() - >= self.quorum_cnt + self.config.fault_tolerance + >= self.majority + self.config.fault_tolerance { inst.status = Status::Committed; pf_debug!(self.id; "committed instance at slot {} bal {}", @@ -1150,12 +1150,12 @@ impl RSPaxosReplica { while now_slot < self.start_slot + self.insts.len() { let inst = &mut self.insts[now_slot - self.start_slot]; if inst.status < Status::Committed - || inst.reqs_cw.avail_shards() < 
self.quorum_cnt + || inst.reqs_cw.avail_shards() < self.majority { break; } - if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + if inst.reqs_cw.avail_data_shards() < self.majority { // have enough shards but need reconstruction inst.reqs_cw .reconstruct_data(Some(&self.rs_coder))?; @@ -1370,7 +1370,7 @@ impl RSPaxosReplica { // do reconstruction reads for all committed instances that do not // hold enough available shards for reconstruction if inst.status == Status::Committed - && inst.reqs_cw.avail_shards() < self.quorum_cnt + && inst.reqs_cw.avail_shards() < self.majority { recon_slots.push(slot); } @@ -1696,11 +1696,11 @@ impl RSPaxosReplica { // update commit_bar self.commit_bar += 1; // check number of available shards - if inst.reqs_cw.avail_shards() < self.quorum_cnt { + if inst.reqs_cw.avail_shards() < self.majority { // can't execute if I don't have the complete request batch break; } else if inst.reqs_cw.avail_data_shards() - < self.quorum_cnt + < self.majority { // have enough shards but need reconstruction inst.reqs_cw @@ -1789,7 +1789,7 @@ impl RSPaxosReplica { let mut pairs = HashMap::new(); for slot in self.start_slot..self.exec_bar { let inst = &mut self.insts[slot - self.start_slot]; - assert!(inst.reqs_cw.avail_data_shards() >= self.quorum_cnt); + assert!(inst.reqs_cw.avail_data_shards() >= self.majority); for (_, req) in inst.reqs_cw.get_data()?.clone() { if let ApiRequest::Req { cmd: Command::Put { key, value }, @@ -2144,14 +2144,14 @@ impl GenericReplica for RSPaxosReplica { // create a Reed-Solomon coder with num_data_shards == quorum size and // num_parity shards == population - quorum - let quorum_cnt = (population / 2) + 1; - if config.fault_tolerance > (population - quorum_cnt) { + let majority = (population / 2) + 1; + if config.fault_tolerance > (population - majority) { return logged_err!(id; "invalid config.fault_tolerance '{}'", config.fault_tolerance); } let rs_coder = ReedSolomon::new( - quorum_cnt as usize, - (population - quorum_cnt) as usize, + majority as usize, + (population - majority) as usize, )?; // proactively connect to some peers, then wait for all population @@ -2198,7 +2198,7 @@ impl GenericReplica for RSPaxosReplica { Ok(RSPaxosReplica { id, population, - quorum_cnt, + majority, config, _api_addr: api_addr, _p2p_addr: p2p_addr, From 1b04307924fc41d79d76241351ac721c91381e97 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Fri, 6 Oct 2023 13:07:13 -0500 Subject: [PATCH 82/89] minor updates to README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fb43e7d9..c5411262 100644 --- a/README.md +++ b/README.md @@ -160,8 +160,8 @@ Complete cluster management and benchmarking scripts are available in another re - [ ] implementation of CRaft - [x] implementation of Crossword prototype - [x] fault recovery reads - - [ ] follower gossiping - - [ ] fall-back mechanism + - [x] follower gossiping + - [x] fall-back mechanism - [ ] workload adaptiveness - [ ] unbalanced assignment - [x] client-side utilities From 68066d9fd008aae7ec5c5136142f9801b8511632 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Fri, 6 Oct 2023 14:32:35 -0500 Subject: [PATCH 83/89] start working on Raft impl --- src/lib.rs | 1 + src/protocols/mod.rs | 18 +++ src/protocols/raft.rs | 279 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 298 insertions(+) create mode 100644 src/protocols/raft.rs diff --git a/src/lib.rs b/src/lib.rs index 2de53e51..7d7a4f2a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -35,5 +35,6 
@@ pub use crate::protocols::SmrProtocol; pub use crate::protocols::{ReplicaConfigRepNothing, ClientConfigRepNothing}; pub use crate::protocols::{ReplicaConfigSimplePush, ClientConfigSimplePush}; pub use crate::protocols::{ReplicaConfigMultiPaxos, ClientConfigMultiPaxos}; +pub use crate::protocols::{ReplicaConfigRaft, ClientConfigRaft}; pub use crate::protocols::{ReplicaConfigRSPaxos, ClientConfigRSPaxos}; pub use crate::protocols::{ReplicaConfigCrossword, ClientConfigCrossword}; diff --git a/src/protocols/mod.rs b/src/protocols/mod.rs index 3aae79bf..25cb4ede 100644 --- a/src/protocols/mod.rs +++ b/src/protocols/mod.rs @@ -22,6 +22,10 @@ mod multipaxos; use multipaxos::{MultiPaxosReplica, MultiPaxosClient}; pub use multipaxos::{ReplicaConfigMultiPaxos, ClientConfigMultiPaxos}; +mod raft; +use raft::{RaftReplica, RaftClient}; +pub use raft::{ReplicaConfigRaft, ClientConfigRaft}; + mod rs_paxos; use rs_paxos::{RSPaxosReplica, RSPaxosClient}; pub use rs_paxos::{ReplicaConfigRSPaxos, ClientConfigRSPaxos}; @@ -36,6 +40,7 @@ pub enum SmrProtocol { RepNothing, SimplePush, MultiPaxos, + Raft, RSPaxos, Crossword, } @@ -56,6 +61,7 @@ impl SmrProtocol { "RepNothing" => Some(Self::RepNothing), "SimplePush" => Some(Self::SimplePush), "MultiPaxos" => Some(Self::MultiPaxos), + "Raft" => Some(Self::Raft), "RSPaxos" => Some(Self::RSPaxos), "Crossword" => Some(Self::Crossword), _ => None, @@ -106,6 +112,14 @@ impl SmrProtocol { .await ) } + Self::Raft => { + box_if_ok!( + RaftReplica::new_and_setup( + api_addr, p2p_addr, manager, config_str + ) + .await + ) + } Self::RSPaxos => { box_if_ok!( RSPaxosReplica::new_and_setup( @@ -147,6 +161,9 @@ impl SmrProtocol { MultiPaxosClient::new_and_setup(manager, config_str).await ) } + Self::Raft => { + box_if_ok!(RaftClient::new_and_setup(manager, config_str).await) + } Self::RSPaxos => { box_if_ok!( RSPaxosClient::new_and_setup(manager, config_str).await @@ -185,6 +202,7 @@ mod protocols_name_tests { valid_name_test!(RepNothing); valid_name_test!(SimplePush); valid_name_test!(MultiPaxos); + valid_name_test!(Raft); valid_name_test!(RSPaxos); valid_name_test!(Crossword); } diff --git a/src/protocols/raft.rs b/src/protocols/raft.rs new file mode 100644 index 00000000..f4a60f47 --- /dev/null +++ b/src/protocols/raft.rs @@ -0,0 +1,279 @@ +//! Replication protocol: Raft. +//! +//! References: +//! - +//! - + +use crate::utils::{SummersetError, Bitmap, Timer}; + +/// Configuration parameters struct. +#[derive(Debug, Deserialize)] +pub struct ReplicaConfigRaft { + /// Client request batching interval in microsecs. + pub batch_interval_us: u64, + + /// Client request batching maximum batch size. + pub max_batch_size: usize, + + /// Path to backing log file. + pub backer_path: String, + + /// Whether to call `fsync()`/`fdatasync()` on logger. + pub logger_sync: bool, + + // Performance simulation params (all zeros means no perf simulation): + pub perf_storage_a: u64, + pub perf_storage_b: u64, + pub perf_network_a: u64, + pub perf_network_b: u64, +} + +#[allow(clippy::derivable_impls)] +impl Default for ReplicaConfigRaft { + fn default() -> Self { + ReplicaConfigRaft { + batch_interval_us: 1000, + max_batch_size: 5000, + backer_path: "/tmp/summerset.raft.wal".into(), + logger_sync: false, + perf_storage_a: 0, + perf_storage_b: 0, + perf_network_a: 0, + perf_network_b: 0, + } + } +} + +/// Raft server replica module. +pub struct RaftReplica { + /// Replica ID in cluster. + id: ReplicaId, + + /// Total number of replicas in cluster. 
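Both the erasure-coded setup above and the Raft skeleton here size quorums the same way: a simple majority of the cluster, which also fixes the Reed-Solomon geometry (data shards = majority, parity shards = the rest) and caps `fault_tolerance` at the parity count. A minimal standalone sketch of that arithmetic, separate from the crate itself:

```rust
// Standalone sketch (not the crate's code) relating cluster size to the
// majority quorum and the Reed-Solomon dimensions used above.
fn main() {
    for population in [3u8, 5, 7] {
        // simple majority: Raft's `quorum_cnt`, and the number of RS data
        // shards in RS-Paxos / Crossword
        let majority = (population / 2) + 1;
        // parity shards occupy the remaining replicas
        let parity = population - majority;
        // `config.fault_tolerance` is rejected if it exceeds this bound
        let max_fault_tolerance = population - majority;
        println!(
            "n={population}: majority={majority}, \
             RS({majority} data + {parity} parity), \
             fault_tolerance <= {max_fault_tolerance}"
        );
    }
}
```

For a 5-replica cluster this gives a 3-of-5 majority, an RS(3, 2) codeword, and at most 2 tolerable replica failures.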
+ population: u8, + + /// Majority quorum size. + quorum_cnt: u8, + + /// Configuration parameters struct. + config: ReplicaConfigRaft, +} + +#[async_trait] +impl GenericReplica for RaftReplica { + async fn new_and_setup( + api_addr: SocketAddr, + p2p_addr: SocketAddr, + manager: SocketAddr, + config_str: Option<&str>, + ) -> Result { + Ok(RaftReplica { + id, + population, + quorum_cnt: (population / 2) + 1, + config, + _api_addr: api_addr, + _p2p_addr: p2p_addr, + }) + } + + async fn run( + &mut self, + mut rx_term: watch::Receiver, + ) -> Result { + } + + fn id(&self) -> ReplicaId { + self.id + } +} + +/// Configuration parameters struct. +#[derive(Debug, Deserialize)] +pub struct ClientConfigRaft { + /// Which server to pick initially. + pub init_server_id: ReplicaId, +} + +#[allow(clippy::derivable_impls)] +impl Default for ClientConfigRaft { + fn default() -> Self { + ClientConfigRaft { init_server_id: 0 } + } +} + +/// Raft client-side module. +pub struct RaftClient { + /// Client ID. + id: ClientId, + + /// Configuration parameters struct. + _config: ClientConfigRaft, + + /// List of active servers information. + servers: HashMap, + + /// Current server ID to talk to. + server_id: ReplicaId, + + /// Control API stub to the cluster manager. + ctrl_stub: ClientCtrlStub, + + /// API stubs for communicating with servers. + api_stubs: HashMap, +} + +#[async_trait] +impl GenericEndpoint for RaftClient { + async fn new_and_setup( + manager: SocketAddr, + config_str: Option<&str>, + ) -> Result { + // connect to the cluster manager and get assigned a client ID + pf_info!("c"; "connecting to manager '{}'...", manager); + let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; + let id = ctrl_stub.id; + + // parse protocol-specific configs + let config = parsed_config!(config_str => ClientConfigRaft; + init_server_id)?; + let init_server_id = config.init_server_id; + + Ok(RaftClient { + id, + _config: config, + servers: HashMap::new(), + server_id: init_server_id, + ctrl_stub, + api_stubs: HashMap::new(), + }) + } + + async fn connect(&mut self) -> Result<(), SummersetError> { + // disallow reconnection without leaving + if !self.api_stubs.is_empty() { + return logged_err!(self.id; "reconnecting without leaving"); + } + + // ask the manager about the list of active servers + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + while !sent { + sent = self.ctrl_stub.send_req(None)?; + } + + let reply = self.ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::QueryInfo { + population, + servers, + } => { + // shift to a new server_id if current one not active + assert!(!servers.is_empty()); + while !servers.contains_key(&self.server_id) { + self.server_id = (self.server_id + 1) % population; + } + // establish connection to all servers + self.servers = servers + .into_iter() + .map(|(id, info)| (id, info.0)) + .collect(); + for (&id, &server) in &self.servers { + pf_info!(self.id; "connecting to server {} '{}'...", id, server); + let api_stub = + ClientApiStub::new_by_connect(self.id, server).await?; + self.api_stubs.insert(id, api_stub); + } + Ok(()) + } + _ => logged_err!(self.id; "unexpected reply type received"), + } + } + + async fn leave(&mut self, permanent: bool) -> Result<(), SummersetError> { + // send leave notification to all servers + for (id, mut api_stub) in self.api_stubs.drain() { + let mut sent = api_stub.send_req(Some(&ApiRequest::Leave))?; + while !sent { + sent = api_stub.send_req(None)?; + } + + while api_stub.recv_reply().await? 
!= ApiReply::Leave {} + pf_info!(self.id; "left server connection {}", id); + api_stub.forget(); + } + + // if permanently leaving, send leave notification to the manager + if permanent { + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; + while !sent { + sent = self.ctrl_stub.send_req(None)?; + } + + while self.ctrl_stub.recv_reply().await? != CtrlReply::Leave {} + pf_info!(self.id; "left manager connection"); + } + + Ok(()) + } + + fn send_req( + &mut self, + req: Option<&ApiRequest>, + ) -> Result { + if self.api_stubs.contains_key(&self.server_id) { + self.api_stubs + .get_mut(&self.server_id) + .unwrap() + .send_req(req) + } else { + Err(SummersetError(format!( + "server_id {} not in api_stubs", + self.server_id + ))) + } + } + + async fn recv_reply(&mut self) -> Result { + if self.api_stubs.contains_key(&self.server_id) { + let reply = self + .api_stubs + .get_mut(&self.server_id) + .unwrap() + .recv_reply() + .await?; + + if let ApiReply::Reply { + ref result, + ref redirect, + .. + } = reply + { + // if the current server redirects me to a different server + if result.is_none() && redirect.is_some() { + let redirect_id = redirect.unwrap(); + assert!(self.servers.contains_key(&redirect_id)); + self.server_id = redirect_id; + pf_debug!(self.id; "redirected to replica {} '{}'", + redirect_id, self.servers[&redirect_id]); + } + } + + Ok(reply) + } else { + Err(SummersetError(format!( + "server_id {} not in api_stubs", + self.server_id + ))) + } + } + + fn id(&self) -> ClientId { + self.id + } + + fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + &mut self.ctrl_stub + } +} From 90b13fd74e67f538eaca873af9473ccab4047ee5 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sat, 7 Oct 2023 17:16:29 -0500 Subject: [PATCH 84/89] rename LogEntry to WalEntry for Paxos-style impls --- README.md | 10 +- scripts/local_cluster.py | 2 + src/protocols/crossword.rs | 272 ++++++++++++++++++----------------- src/protocols/multipaxos.rs | 126 +++++++++------- src/protocols/raft.rs | 198 ++++++++++++++++++++++++- src/protocols/rep_nothing.rs | 54 +++---- src/protocols/rs_paxos.rs | 126 +++++++++------- src/protocols/simple_push.rs | 62 ++++---- src/server/transport.rs | 9 +- 9 files changed, 554 insertions(+), 305 deletions(-) diff --git a/README.md b/README.md index c5411262..d968b5d5 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ git push origin [![Proc tests status](https://github.com/josehu07/summerset/actions/workflows/tests_proc.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Atests_proc) [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) -Summerset is a distributed key-value store supporting a wide range of state machine replication (SMR) protocols for research purposes. More protocols are actively being added. +Summerset is a distributed, replicated, protocol-generic key-value store supporting a wide range of state machine replication (SMR) protocols for research purposes. More protocols are actively being added.

@@ -69,6 +69,7 @@ Formal TLA+ specification of some protocols are provided in `tla+/`. - **Async Rust**: Summerset is written in Rust and demonstrates canonical usage of async programming structures backed by the [`tokio`](https://tokio.rs/) framework; - **Event-based**: Summerset adopts a channel-oriented, event-based system architecture; each replication protocol is basically just a set of event handlers plus a `tokio::select!` loop; - **Modularized**: Common components of a distributed KV store, e.g. network transport and durable logger, are cleanly separated from each other and connected through channels. +- **Protocol-generic**: With the above two points combined, Summerset is able to support a set of different replication protocols in one codebase, each being just a single file, with common functionalities abstracted out. These design choices make protocol implementation in Summerset surprisingly straight-forward and **understandable**, without any sacrifice on performance. Comments / issues / PRs are always welcome! @@ -155,15 +156,22 @@ Complete cluster management and benchmarking scripts are available in another re - [ ] specialize read-only commands? - [ ] separate commit vs. exec responses? - [ ] membership discovery & view changes + - [x] TLA+ spec - [x] implementation of RS-Paxos + - [ ] TLA+ spec - [ ] implementation of Raft + - [ ] snapshotting & garbage collection + - [ ] membership discovery & view changes + - [ ] TLA+ spec - [ ] implementation of CRaft + - [ ] TLA+ spec - [x] implementation of Crossword prototype - [x] fault recovery reads - [x] follower gossiping - [x] fall-back mechanism - [ ] workload adaptiveness - [ ] unbalanced assignment + - [ ] TLA+ spec - [x] client-side utilities - [x] REPL-style client - [x] random benchmarking client diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index a088ed81..9c47d048 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -44,12 +44,14 @@ def kill_all_matching(name, force=False): "RepNothing": lambda r: f"backer_path='/tmp/summerset.rep_nothing.{r}.wal'", "SimplePush": lambda r: f"backer_path='/tmp/summerset.simple_push.{r}.wal'", "MultiPaxos": lambda r: f"backer_path='/tmp/summerset.multipaxos.{r}.wal'", + "Raft": lambda r: f"backer_path='/tmp/summerset.raft.{r}.wal'", "RSPaxos": lambda r: f"backer_path='/tmp/summerset.rs_paxos.{r}.wal'", "Crossword": lambda r: f"backer_path='/tmp/summerset.crossword.{r}.wal'", } PROTOCOL_SNAPSHOT_PATH = { "MultiPaxos": lambda r: f"snapshot_path='/tmp/summerset.multipaxos.{r}.snap'", + "Raft": lambda r: f"snapshot_path='/tmp/summerset.raft.{r}.snap'", "RSPaxos": lambda r: f"snapshot_path='/tmp/summerset.rs_paxos.{r}.snap'", "Crossword": lambda r: f"snapshot_path='/tmp/summerset.crossword.{r}.snap'", } diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 7ac067aa..69028ce8 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -1,8 +1,8 @@ //! Replication protocol: Crossword. //! -//! MultiPaxos with flexible Reed-Solomon erasure coding that supports tunable -//! shard groups, asymmetric shard assignment, and follower gossiping for actual -//! usability. +//! MultiPaxos with flexible Reed-Solomon erasure code sharding that supports +//! dynamically tunable shard assignment with the correct liveness constraints, +//! plus follower gossiping for actual usability. use std::collections::HashMap; use std::path::Path; @@ -34,8 +34,8 @@ use reed_solomon_erasure::galois_8::ReedSolomon; /// Configuration parameters struct. 
#[derive(Debug, Deserialize)] pub struct ReplicaConfigCrossword { - /// Client request batching interval in microsecs. - pub batch_interval_us: u64, + /// Client request batching interval in millisecs. + pub batch_interval_ms: u64, /// Client request batching maximum batch size. pub max_batch_size: usize, @@ -87,7 +87,7 @@ pub struct ReplicaConfigCrossword { impl Default for ReplicaConfigCrossword { fn default() -> Self { ReplicaConfigCrossword { - batch_interval_us: 1000, + batch_interval_ms: 10, max_batch_size: 5000, backer_path: "/tmp/summerset.rs_paxos.wal".into(), logger_sync: false, @@ -176,14 +176,20 @@ struct Instance { external: bool, /// Offset of first durable WAL log entry related to this instance. - log_offset: usize, + wal_offset: usize, } -/// Stable storage log entry type. +/// Stable storage WAL log entry type. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] -enum LogEntry { +enum WalEntry { /// Records an update to the largest prepare ballot seen. - PrepareBal { slot: usize, ballot: Ballot }, + PrepareBal { + /// Slot index in Prepare message is the triggering slot of this + /// Prepare. Once prepared, it means that all slots in the range + /// [slot, +infinity) are prepared under this ballot number. + slot: usize, + ballot: Ballot, + }, /// Records a newly accepted request batch data shards at slot index. AcceptData { @@ -197,6 +203,10 @@ enum LogEntry { } /// Snapshot file entry type. +/// +/// NOTE: the current implementation simply appends a squashed log at the +/// end of the snapshot file for simplicity. In production, the snapshot +/// file should be a bounded-sized backend, e.g., an LSM-tree. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum SnapEntry { /// Necessary slot indices to remember. @@ -290,7 +300,7 @@ pub struct CrosswordReplica { state_machine: StateMachine, /// StorageHub module. - storage_hub: StorageHub, + storage_hub: StorageHub, /// StorageHub module for the snapshot file. snapshot_hub: StorageHub, @@ -342,8 +352,8 @@ pub struct CrosswordReplica { /// It is always true that exec_bar <= commit_bar <= start_slot + insts.len() exec_bar: usize, - /// Current durable log file offset. - log_offset: usize, + /// Current durable WAL log file offset. + wal_offset: usize, /// Current durable snapshot file offset. snap_offset: usize, @@ -380,7 +390,7 @@ impl CrosswordReplica { leader_bk: None, replica_bk: None, external: false, - log_offset: 0, + wal_offset: 0, }) } @@ -464,75 +474,6 @@ impl CrosswordReplica { .collect() } - // TODO: should let leader incorporate assignment metadata in Accept - // messages. With more complex assignment policies, a follower probably - // does not know the assignment. 
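The fall-back path added earlier in this patch maintains the liveness constraint `quorum_size + shards_per_replica >= majority + fault_tolerance + 1`: when fewer peers look alive than the quorum required by the current assignment, `shards_per_replica` is bumped by exactly the deficit. A standalone sketch of that arithmetic (the helper name here is hypothetical, not the crate's API):

```rust
// Standalone sketch (not the crate's code) of the fall-back arithmetic:
// an Accept quorum of size q with s shards per replica is only usable if
//     q + s >= majority + fault_tolerance + 1.
fn fallback_shards_per_replica(
    majority: u8,
    fault_tolerance: u8,
    shards_per_replica: u8,
    alive_cnt: u8,
) -> u8 {
    // quorum size that the current assignment config requires
    let curr_quorum_size = majority + fault_tolerance + 1 - shards_per_replica;
    if alive_cnt < curr_quorum_size {
        // fall back: give every replica enough extra shards so that the
        // currently-alive set becomes a valid quorum again
        shards_per_replica + curr_quorum_size - alive_cnt
    } else {
        shards_per_replica // current config is still satisfiable
    }
}

fn main() {
    // population 5 => majority 3; fault_tolerance 1; 1 shard per replica
    assert_eq!(fallback_shards_per_replica(3, 1, 1, 5), 1); // all alive
    assert_eq!(fallback_shards_per_replica(3, 1, 1, 4), 1); // 4 >= 4, still ok
    assert_eq!(fallback_shards_per_replica(3, 1, 1, 3), 2); // bump to 2 shards
    println!("fall-back arithmetic checks passed");
}
```

Bumping `shards_per_replica` gives up some of the bandwidth savings of sharding in exchange for being able to commit with the smaller set of reachable peers.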
- fn gossip_targets_excl( - slot: usize, - me: ReplicaId, - population: u8, - majority: u8, - shards_per_replica: u8, - mut avail_shards_map: Bitmap, - replica_bk: &mut Option, - ) -> HashMap> { - let mut src_peer = me; - let mut first_try = false; - if let Some(ReplicaBookkeeping { - source, - gossip_tried, - }) = replica_bk - { - if !*gossip_tried { - src_peer = *source; - first_try = true; - // first try: exclude all parity shards - for idx in majority..population { - avail_shards_map.set(idx, true).unwrap(); - } - *gossip_tried = true; - } - } - - // greedily considers my peers, starting from the one with my ID + 1, - // until all data shards covered - let mut targets_excl = HashMap::new(); - for p in (me + 1)..(population + me) { - let peer = p % population; - if !first_try { - // first try probably did not succeed, so do it conservatively - targets_excl.insert(peer, avail_shards_map.to_vec()); - } else { - // skip leader who initially replicated this instance to me - if peer == src_peer { - continue; - } - // first try: only ask for a minimum number of data shards - let mut useful_shards = Vec::new(); - for idx in Self::shards_for_replica( - slot, - peer, - population, - shards_per_replica, - ) { - if !avail_shards_map.get(idx).unwrap() { - useful_shards.push(idx); - } - } - // if this peer has data shards which I don't have right now - // and I have not asked others for in this round - if !useful_shards.is_empty() { - targets_excl.insert(peer, avail_shards_map.to_vec()); - for idx in useful_shards { - avail_shards_map.set(idx, true).unwrap(); - } - } - } - } - - targets_excl - } - // TODO: think about how to allow unbalanced assignments. fn coverage_under_faults( population: u8, @@ -638,7 +579,7 @@ impl CrosswordReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { + entry: WalEntry::AcceptData { slot, ballot: inst.bal, // persist only some shards on myself @@ -767,7 +708,7 @@ impl CrosswordReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { + entry: WalEntry::PrepareBal { slot, ballot: self.bal_prep_sent, }, @@ -812,7 +753,7 @@ impl CrosswordReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { + entry: WalEntry::AcceptData { slot, ballot: inst.bal, // persist only some shards on myself @@ -998,7 +939,7 @@ impl CrosswordReplica { fn handle_log_result( &mut self, action_id: LogActionId, - log_result: LogResult, + log_result: LogResult, ) -> Result<(), SummersetError> { let (slot, entry_type) = Self::split_log_action_id(action_id); if slot < self.start_slot { @@ -1007,15 +948,15 @@ impl CrosswordReplica { assert!(slot < self.start_slot + self.insts.len()); if let LogResult::Append { now_size } = log_result { - assert!(now_size >= self.log_offset); - // update first log_offset of slot + assert!(now_size >= self.wal_offset); + // update first wal_offset of slot let inst = &mut self.insts[slot - self.start_slot]; - if inst.log_offset == 0 || inst.log_offset > self.log_offset { - inst.log_offset = self.log_offset; + if inst.wal_offset == 0 || inst.wal_offset > self.wal_offset { + inst.wal_offset = self.wal_offset; } - assert!(inst.log_offset <= self.log_offset); - // then update self.log_offset - self.log_offset = now_size; + assert!(inst.wal_offset <= self.wal_offset); + // then update self.wal_offset + 
self.wal_offset = now_size; } else { return logged_err!(self.id; "unexpected log result type: {:?}", log_result); } @@ -1069,7 +1010,7 @@ impl CrosswordReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { slot, ballot }, + entry: WalEntry::PrepareBal { slot, ballot }, sync: self.config.logger_sync, }, )?; @@ -1174,7 +1115,7 @@ impl CrosswordReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { + entry: WalEntry::AcceptData { slot, ballot, reqs_cw: subset_copy, @@ -1257,7 +1198,7 @@ impl CrosswordReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { + entry: WalEntry::AcceptData { slot, ballot, reqs_cw: inst.reqs_cw.clone(), @@ -1338,7 +1279,7 @@ impl CrosswordReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Committed), LogAction::Append { - entry: LogEntry::CommitSlot { slot }, + entry: WalEntry::CommitSlot { slot }, sync: self.config.logger_sync, }, )?; @@ -1387,7 +1328,7 @@ impl CrosswordReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Committed), LogAction::Append { - entry: LogEntry::CommitSlot { slot }, + entry: WalEntry::CommitSlot { slot }, sync: self.config.logger_sync, }, )?; @@ -1678,7 +1619,7 @@ impl CrosswordReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { + entry: WalEntry::PrepareBal { slot, ballot: self.bal_prep_sent, }, @@ -1875,6 +1816,75 @@ impl CrosswordReplica { Ok(()) } + // TODO: should let leader incorporate assignment metadata in Accept + // messages. With more complex assignment policies, a follower probably + // does not know the assignment. 
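The commit condition above requires a majority of Accept acks and, additionally, `coverage_under_faults(...) >= majority`. My reading of that check: after any `fault_tolerance` of the acknowledging replicas fail, the surviving acks must still cover at least `majority` distinct shards, so the codeword stays reconstructable. A brute-force standalone sketch under an assumed contiguous round-robin assignment (the real assignment is also rotated per slot and may be unbalanced):

```rust
// Standalone sketch (not the crate's code) of what a check like
// `coverage_under_faults(...) >= majority` guards: after any
// `fault_tolerance` of the acknowledging replicas are lost, the surviving
// acks must still cover at least `majority` distinct shards. The contiguous
// assignment below is an assumption for illustration only.
use std::collections::HashSet;

/// Shards assumed to be held by replica `r` (contiguous, wrapping around).
fn shards_of(r: u8, population: u8, shards_per_replica: u8) -> Vec<u8> {
    (0..shards_per_replica).map(|j| (r + j) % population).collect()
}

/// Worst-case distinct-shard coverage after losing any `faults` ack'ers.
/// (Brute force over subsets; fine for the tiny populations used here.)
fn coverage_under_faults(
    population: u8,
    acks: &[u8],
    faults: u8,
    shards_per_replica: u8,
) -> u8 {
    if acks.len() <= faults as usize {
        return 0;
    }
    let survivors = acks.len() - faults as usize;
    let mut worst = population;
    for mask in 0u32..(1 << acks.len()) {
        if mask.count_ones() as usize != survivors {
            continue;
        }
        let mut covered = HashSet::new();
        for (i, &r) in acks.iter().enumerate() {
            if mask & (1 << i) != 0 {
                covered.extend(shards_of(r, population, shards_per_replica));
            }
        }
        worst = worst.min(covered.len() as u8);
    }
    worst
}

fn main() {
    // population 5, majority 3: three acks holding 1 shard each cannot
    // survive one fault (coverage drops to 2), but with 2 shards each they can.
    assert_eq!(coverage_under_faults(5, &[0, 1, 2], 1, 1), 2);
    assert!(coverage_under_faults(5, &[0, 1, 2], 1, 2) >= 3);
    println!("coverage-under-faults checks passed");
}
```

The `Some(self.shards_per_replica)` argument in the patched call presumably supplies this balanced-assignment width.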
+ fn gossip_targets_excl( + slot: usize, + me: ReplicaId, + population: u8, + majority: u8, + shards_per_replica: u8, + mut avail_shards_map: Bitmap, + replica_bk: &mut Option, + ) -> HashMap> { + let mut src_peer = me; + let mut first_try = false; + if let Some(ReplicaBookkeeping { + source, + gossip_tried, + }) = replica_bk + { + if !*gossip_tried { + src_peer = *source; + first_try = true; + // first try: exclude all parity shards + for idx in majority..population { + avail_shards_map.set(idx, true).unwrap(); + } + *gossip_tried = true; + } + } + + // greedily considers my peers, starting from the one with my ID + 1, + // until all data shards covered + let mut targets_excl = HashMap::new(); + for p in (me + 1)..(population + me) { + let peer = p % population; + if !first_try { + // first try probably did not succeed, so do it conservatively + targets_excl.insert(peer, avail_shards_map.to_vec()); + } else { + // skip leader who initially replicated this instance to me + if peer == src_peer { + continue; + } + // first try: only ask for a minimum number of data shards + let mut useful_shards = Vec::new(); + for idx in Self::shards_for_replica( + slot, + peer, + population, + shards_per_replica, + ) { + if !avail_shards_map.get(idx).unwrap() { + useful_shards.push(idx); + } + } + // if this peer has data shards which I don't have right now + // and I have not asked others for in this round + if !useful_shards.is_empty() { + targets_excl.insert(peer, avail_shards_map.to_vec()); + for idx in useful_shards { + avail_shards_map.set(idx, true).unwrap(); + } + } + } + } + + targets_excl + } + /// Triggers gossiping for my missing shards in committed but not-yet- /// executed instances: fetch missing shards from peers, preferring /// follower peers that hold data shards. @@ -2078,10 +2088,10 @@ impl CrosswordReplica { /// Apply a durable storage log entry for recovery. async fn recover_apply_entry( &mut self, - entry: LogEntry, + entry: WalEntry, ) -> Result<(), SummersetError> { match entry { - LogEntry::PrepareBal { slot, ballot } => { + WalEntry::PrepareBal { slot, ballot } => { if slot < self.start_slot { return Ok(()); // ignore if slot index outdated } @@ -2103,7 +2113,7 @@ impl CrosswordReplica { self.bal_prepared = 0; } - LogEntry::AcceptData { + WalEntry::AcceptData { slot, ballot, reqs_cw, @@ -2136,7 +2146,7 @@ impl CrosswordReplica { assert!(self.bal_prepared <= self.bal_prep_sent); } - LogEntry::CommitSlot { slot } => { + WalEntry::CommitSlot { slot } => { if slot < self.start_slot { return Ok(()); // ignore if slot index outdated } @@ -2185,15 +2195,15 @@ impl CrosswordReplica { Ok(()) } - /// Recover state from durable storage log. - async fn recover_from_log(&mut self) -> Result<(), SummersetError> { - assert_eq!(self.log_offset, 0); + /// Recover state from durable storage WAL log. + async fn recover_from_wal(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.wal_offset, 0); loop { // using 0 as a special log action ID self.storage_hub.submit_action( 0, LogAction::Read { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -2205,7 +2215,7 @@ impl CrosswordReplica { } => { self.recover_apply_entry(entry).await?; // update log offset - self.log_offset = end_offset; + self.wal_offset = end_offset; } LogResult::Read { entry: None, .. 
} => { // end of log reached @@ -2221,7 +2231,7 @@ impl CrosswordReplica { self.storage_hub.submit_action( 0, LogAction::Truncate { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -2229,7 +2239,7 @@ impl CrosswordReplica { offset_ok: true, .. } = log_result { - if self.log_offset > 0 { + if self.wal_offset > 0 { pf_info!(self.id; "recovered from wal log: commit {} exec {}", self.commit_bar, self.exec_bar); } @@ -2242,7 +2252,7 @@ impl CrosswordReplica { // CrosswordReplica snapshotting & GC logic impl CrosswordReplica { - /// Dump a new key-value pair to snapshot file. + /// Dump new key-value pairs to snapshot file. async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { // collect all key-value pairs put up to exec_bar let mut pairs = HashMap::new(); @@ -2283,9 +2293,9 @@ impl CrosswordReplica { /// Discard everything older than start_slot in durable WAL log. async fn snapshot_discard_log(&mut self) -> Result<(), SummersetError> { let cut_offset = if !self.insts.is_empty() { - self.insts[0].log_offset + self.insts[0].wal_offset } else { - self.log_offset + self.wal_offset }; // discard the log before cut_offset @@ -2304,8 +2314,8 @@ impl CrosswordReplica { now_size, } = log_result { - assert_eq!(self.log_offset - cut_offset, now_size); - self.log_offset = now_size; + assert_eq!(self.wal_offset - cut_offset, now_size); + self.wal_offset = now_size; } else { return logged_err!( self.id; @@ -2317,11 +2327,11 @@ impl CrosswordReplica { } } - // update inst.log_offset for all remaining in-mem instances + // update inst.wal_offset for all remaining in-mem instances for inst in &mut self.insts { - if inst.log_offset > 0 { - assert!(inst.log_offset >= cut_offset); - inst.log_offset -= cut_offset; + if inst.wal_offset > 0 { + assert!(inst.wal_offset >= cut_offset); + inst.wal_offset -= cut_offset; } } @@ -2334,6 +2344,12 @@ impl CrosswordReplica { /// NOTE: the current implementation does not guard against crashes in the /// middle of taking a snapshot. Production quality implementations should /// make the snapshotting action "atomic". + /// + /// NOTE: the current implementation does not take care of InstallSnapshot + /// messages (which is needed when some lagging follower has some slot + /// which all other peers have snapshotted); we assume here that failed + /// Accept messages will be retried indefinitely until success before its + /// associated data gets discarded from leader's memory. 
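Beyond the renames, the `snapshot_discard_log` hunk above garbage-collects the WAL after a snapshot: the log is cut at the first remaining instance's `wal_offset` (or at the current end if none remain), and every surviving offset is rebased by the discarded amount. A standalone, simplified sketch of that rebasing:

```rust
// Standalone sketch (not the crate's code) of the WAL garbage-collection
// bookkeeping: drop the prefix before the first live instance and rebase
// all remaining offsets by the amount discarded.
struct Inst {
    wal_offset: usize, // offset of this instance's first WAL record
}

fn discard_wal_prefix(insts: &mut [Inst], wal_offset: &mut usize) -> usize {
    // cut at the first remaining instance, or at the end if none remain
    let cut_offset = insts.first().map_or(*wal_offset, |i| i.wal_offset);
    // (the storage hub would physically discard bytes [0, cut_offset) here)
    *wal_offset -= cut_offset;
    for inst in insts.iter_mut() {
        if inst.wal_offset > 0 {
            assert!(inst.wal_offset >= cut_offset);
            inst.wal_offset -= cut_offset;
        }
    }
    cut_offset
}

fn main() {
    let mut insts = vec![Inst { wal_offset: 400 }, Inst { wal_offset: 520 }];
    let mut wal_offset = 700;
    let cut = discard_wal_prefix(&mut insts, &mut wal_offset);
    assert_eq!((cut, wal_offset), (400, 300));
    assert_eq!(insts[1].wal_offset, 120);
    println!("discarded a {cut}-byte WAL prefix");
}
```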
async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { pf_debug!(self.id; "taking new snapshot: start {} exec {}", self.start_slot, self.exec_bar); @@ -2512,7 +2528,7 @@ impl GenericReplica for CrosswordReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigCrossword; - batch_interval_us, max_batch_size, + batch_interval_ms, max_batch_size, backer_path, logger_sync, hb_hear_timeout_min, hb_hear_timeout_max, hb_send_interval_ms, @@ -2522,11 +2538,11 @@ impl GenericReplica for CrosswordReplica { shards_per_replica, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; - if config.batch_interval_us == 0 { + if config.batch_interval_ms == 0 { return logged_err!( id; - "invalid config.batch_interval_us '{}'", - config.batch_interval_us + "invalid config.batch_interval_ms '{}'", + config.batch_interval_ms ); } if config.hb_hear_timeout_min < 100 { @@ -2640,7 +2656,7 @@ impl GenericReplica for CrosswordReplica { let external_api = ExternalApi::new_and_setup( id, api_addr, - Duration::from_micros(config.batch_interval_us), + Duration::from_millis(config.batch_interval_ms), config.max_batch_size, ) .await?; @@ -2690,7 +2706,7 @@ impl GenericReplica for CrosswordReplica { bal_max_seen: 0, commit_bar: 0, exec_bar: 0, - log_offset: 0, + wal_offset: 0, snap_offset: 0, rs_coder, }) @@ -2703,8 +2719,8 @@ impl GenericReplica for CrosswordReplica { // recover state from durable snapshot file self.recover_from_snapshot().await?; - // recover the tail-piece memory log & state from durable storage log - self.recover_from_log().await?; + // recover the tail-piece memory log & state from durable WAL log + self.recover_from_wal().await?; // kick off leader activity hearing timer self.kickoff_hb_hear_timer()?; diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index aa42d2d1..fef9bf60 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -35,8 +35,8 @@ use tokio::sync::watch; /// Configuration parameters struct. #[derive(Debug, Deserialize)] pub struct ReplicaConfigMultiPaxos { - /// Client request batching interval in microsecs. - pub batch_interval_us: u64, + /// Client request batching interval in millisecs. + pub batch_interval_ms: u64, /// Client request batching maximum batch size. pub max_batch_size: usize, @@ -73,7 +73,7 @@ pub struct ReplicaConfigMultiPaxos { impl Default for ReplicaConfigMultiPaxos { fn default() -> Self { ReplicaConfigMultiPaxos { - batch_interval_us: 1000, + batch_interval_ms: 10, max_batch_size: 5000, backer_path: "/tmp/summerset.multipaxos.wal".into(), logger_sync: false, @@ -153,12 +153,12 @@ struct Instance { external: bool, /// Offset of first durable WAL log entry related to this instance. - log_offset: usize, + wal_offset: usize, } -/// Stable storage log entry type. +/// Stable storage WAL log entry type. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] -enum LogEntry { +enum WalEntry { /// Records an update to the largest prepare ballot seen. PrepareBal { slot: usize, ballot: Ballot }, @@ -174,6 +174,10 @@ enum LogEntry { } /// Snapshot file entry type. +/// +/// NOTE: the current implementation simply appends a squashed log at the +/// end of the snapshot file for simplicity. In production, the snapshot +/// file should be a bounded-sized backend, e.g., an LSM-tree. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum SnapEntry { /// Necessary slot indices to remember. 
@@ -193,7 +197,13 @@ enum SnapEntry { #[derive(Debug, Clone, Serialize, Deserialize, GetSize)] enum PeerMsg { /// Prepare message from leader to replicas. - Prepare { slot: usize, ballot: Ballot }, + Prepare { + /// Slot index in Prepare message is the triggering slot of this + /// Prepare. Once prepared, it means that all slots in the range + /// [slot, +infinity) are prepared under this ballot number. + slot: usize, + ballot: Ballot, + }, /// Prepare reply from replica to leader. PrepareReply { @@ -251,7 +261,7 @@ pub struct MultiPaxosReplica { state_machine: StateMachine, /// StorageHub module. - storage_hub: StorageHub, + storage_hub: StorageHub, /// StorageHub module for the snapshot file. snapshot_hub: StorageHub, @@ -300,8 +310,8 @@ pub struct MultiPaxosReplica { /// It is always true that exec_bar <= commit_bar <= start_slot + insts.len() exec_bar: usize, - /// Current durable log file offset. - log_offset: usize, + /// Current durable WAL log file offset. + wal_offset: usize, /// Current durable snapshot file offset. snap_offset: usize, @@ -326,7 +336,7 @@ impl MultiPaxosReplica { leader_bk: None, replica_bk: None, external: false, - log_offset: 0, + wal_offset: 0, } } @@ -470,7 +480,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { + entry: WalEntry::PrepareBal { slot, ballot: self.bal_prep_sent, }, @@ -503,7 +513,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { + entry: WalEntry::AcceptData { slot, ballot: inst.bal, reqs: req_batch.clone(), @@ -660,7 +670,7 @@ impl MultiPaxosReplica { fn handle_log_result( &mut self, action_id: LogActionId, - log_result: LogResult, + log_result: LogResult, ) -> Result<(), SummersetError> { let (slot, entry_type) = Self::split_log_action_id(action_id); if slot < self.start_slot { @@ -669,15 +679,15 @@ impl MultiPaxosReplica { assert!(slot < self.start_slot + self.insts.len()); if let LogResult::Append { now_size } = log_result { - assert!(now_size >= self.log_offset); - // update first log_offset of slot + assert!(now_size >= self.wal_offset); + // update first wal_offset of slot let inst = &mut self.insts[slot - self.start_slot]; - if inst.log_offset == 0 || inst.log_offset > self.log_offset { - inst.log_offset = self.log_offset; + if inst.wal_offset == 0 || inst.wal_offset > self.wal_offset { + inst.wal_offset = self.wal_offset; } - assert!(inst.log_offset <= self.log_offset); - // then update self.log_offset - self.log_offset = now_size; + assert!(inst.wal_offset <= self.wal_offset); + // then update self.wal_offset + self.wal_offset = now_size; } else { return logged_err!(self.id; "unexpected log result type: {:?}", log_result); } @@ -728,7 +738,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { slot, ballot }, + entry: WalEntry::PrepareBal { slot, ballot }, sync: self.config.logger_sync, }, )?; @@ -799,7 +809,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { + entry: WalEntry::AcceptData { slot, ballot, reqs: inst.reqs.clone(), @@ -863,7 +873,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: 
LogEntry::AcceptData { slot, ballot, reqs }, + entry: WalEntry::AcceptData { slot, ballot, reqs }, sync: self.config.logger_sync, }, )?; @@ -921,7 +931,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Committed), LogAction::Append { - entry: LogEntry::CommitSlot { slot }, + entry: WalEntry::CommitSlot { slot }, sync: self.config.logger_sync, }, )?; @@ -970,7 +980,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Committed), LogAction::Append { - entry: LogEntry::CommitSlot { slot }, + entry: WalEntry::CommitSlot { slot }, sync: self.config.logger_sync, }, )?; @@ -1123,7 +1133,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { + entry: WalEntry::PrepareBal { slot, ballot: self.bal_prep_sent, }, @@ -1381,10 +1391,10 @@ impl MultiPaxosReplica { /// Apply a durable storage log entry for recovery. async fn recover_apply_entry( &mut self, - entry: LogEntry, + entry: WalEntry, ) -> Result<(), SummersetError> { match entry { - LogEntry::PrepareBal { slot, ballot } => { + WalEntry::PrepareBal { slot, ballot } => { if slot < self.start_slot { return Ok(()); // ignore if slot index outdated } @@ -1406,7 +1416,7 @@ impl MultiPaxosReplica { self.bal_prepared = 0; } - LogEntry::AcceptData { slot, ballot, reqs } => { + WalEntry::AcceptData { slot, ballot, reqs } => { if slot < self.start_slot { return Ok(()); // ignore if slot index outdated } @@ -1435,7 +1445,7 @@ impl MultiPaxosReplica { assert!(self.bal_prepared <= self.bal_prep_sent); } - LogEntry::CommitSlot { slot } => { + WalEntry::CommitSlot { slot } => { if slot < self.start_slot { return Ok(()); // ignore if slot index outdated } @@ -1472,15 +1482,15 @@ impl MultiPaxosReplica { Ok(()) } - /// Recover state from durable storage log. - async fn recover_from_log(&mut self) -> Result<(), SummersetError> { - assert_eq!(self.log_offset, 0); + /// Recover state from durable storage WAL log. + async fn recover_from_wal(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.wal_offset, 0); loop { // using 0 as a special log action ID self.storage_hub.submit_action( 0, LogAction::Read { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -1492,7 +1502,7 @@ impl MultiPaxosReplica { } => { self.recover_apply_entry(entry).await?; // update log offset - self.log_offset = end_offset; + self.wal_offset = end_offset; } LogResult::Read { entry: None, .. } => { // end of log reached @@ -1508,7 +1518,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( 0, LogAction::Truncate { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -1516,7 +1526,7 @@ impl MultiPaxosReplica { offset_ok: true, .. } = log_result { - if self.log_offset > 0 { + if self.wal_offset > 0 { pf_info!(self.id; "recovered from wal log: commit {} exec {}", self.commit_bar, self.exec_bar); } @@ -1529,7 +1539,7 @@ impl MultiPaxosReplica { // MultiPaxosReplica snapshotting & GC logic impl MultiPaxosReplica { - /// Dump a new key-value pair to snapshot file. + /// Dump new key-value pairs to snapshot file. 
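Recovery in these hunks replays the renamed `WalEntry` records in order: `PrepareBal` raises the highest ballot seen for a slot, `AcceptData` records a tentatively accepted value under its ballot, and `CommitSlot` marks the slot committed so it can later be executed. A heavily simplified standalone sketch of that replay direction (the types below are illustrative stand-ins, not the crate's):

```rust
// Standalone sketch (not the crate's code) replaying simplified WAL entries
// into per-slot state, mirroring the PrepareBal / AcceptData / CommitSlot
// handling of `recover_apply_entry` above.
type Ballot = u64;

enum WalEntry {
    PrepareBal { slot: usize, ballot: Ballot },
    AcceptData { slot: usize, ballot: Ballot, value: String },
    CommitSlot { slot: usize },
}

#[derive(Default)]
struct Slot {
    bal: Ballot,           // highest ballot seen for this slot
    value: Option<String>, // tentatively accepted value, if any
    committed: bool,       // whether a CommitSlot record was seen
}

fn replay(log: &[WalEntry]) -> Vec<Slot> {
    let mut slots: Vec<Slot> = Vec::new();
    let ensure = |slots: &mut Vec<Slot>, s: usize| {
        while slots.len() <= s {
            slots.push(Slot::default());
        }
    };
    for entry in log {
        match entry {
            WalEntry::PrepareBal { slot, ballot } => {
                ensure(&mut slots, *slot);
                let s = &mut slots[*slot];
                s.bal = s.bal.max(*ballot);
            }
            WalEntry::AcceptData { slot, ballot, value } => {
                ensure(&mut slots, *slot);
                let s = &mut slots[*slot];
                s.bal = s.bal.max(*ballot);
                s.value = Some(value.clone());
            }
            WalEntry::CommitSlot { slot } => {
                ensure(&mut slots, *slot);
                slots[*slot].committed = true;
            }
        }
    }
    slots
}

fn main() {
    let log = vec![
        WalEntry::PrepareBal { slot: 0, ballot: 1 },
        WalEntry::AcceptData { slot: 0, ballot: 1, value: "Put x=1".into() },
        WalEntry::CommitSlot { slot: 0 },
    ];
    let slots = replay(&log);
    assert!(slots[0].committed && slots[0].value.is_some());
    println!("replayed {} slot(s) from the WAL", slots.len());
}
```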
async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { // collect all key-value pairs put up to exec_bar let mut pairs = HashMap::new(); @@ -1569,9 +1579,9 @@ impl MultiPaxosReplica { /// Discard everything older than start_slot in durable WAL log. async fn snapshot_discard_log(&mut self) -> Result<(), SummersetError> { let cut_offset = if !self.insts.is_empty() { - self.insts[0].log_offset + self.insts[0].wal_offset } else { - self.log_offset + self.wal_offset }; // discard the log before cut_offset @@ -1590,8 +1600,8 @@ impl MultiPaxosReplica { now_size, } = log_result { - assert_eq!(self.log_offset - cut_offset, now_size); - self.log_offset = now_size; + assert_eq!(self.wal_offset - cut_offset, now_size); + self.wal_offset = now_size; } else { return logged_err!( self.id; @@ -1603,11 +1613,11 @@ impl MultiPaxosReplica { } } - // update inst.log_offset for all remaining in-mem instances + // update inst.wal_offset for all remaining in-mem instances for inst in &mut self.insts { - if inst.log_offset > 0 { - assert!(inst.log_offset >= cut_offset); - inst.log_offset -= cut_offset; + if inst.wal_offset > 0 { + assert!(inst.wal_offset >= cut_offset); + inst.wal_offset -= cut_offset; } } @@ -1620,6 +1630,12 @@ impl MultiPaxosReplica { /// NOTE: the current implementation does not guard against crashes in the /// middle of taking a snapshot. Production quality implementations should /// make the snapshotting action "atomic". + /// + /// NOTE: the current implementation does not take care of InstallSnapshot + /// messages (which is needed when some lagging follower has some slot + /// which all other peers have snapshotted); we assume here that failed + /// Accept messages will be retried indefinitely until success before its + /// associated data gets discarded from leader's memory. 
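`snapshot_dump_kv_pairs` above squashes the executed prefix of the log into the snapshot: it walks slots up to `exec_bar` and keeps only the last `Put` seen for each key, so the snapshot stores final values rather than full history. A standalone sketch of that squashing:

```rust
// Standalone sketch (not the crate's code) of the key-value squashing: walk
// executed slots in order and keep the last Put per key, so the snapshot
// holds only the final value of each key.
use std::collections::HashMap;

fn squash_puts(executed: &[(String, String)]) -> HashMap<String, String> {
    let mut pairs = HashMap::new();
    for (key, value) in executed {
        pairs.insert(key.clone(), value.clone()); // later slots overwrite earlier ones
    }
    pairs
}

fn main() {
    let executed = vec![
        ("x".into(), "1".into()),
        ("y".into(), "2".into()),
        ("x".into(), "3".into()),
    ];
    let pairs = squash_puts(&executed);
    assert_eq!(pairs["x"], "3");
    assert_eq!(pairs.len(), 2);
    println!("{} keys squashed into the snapshot", pairs.len());
}
```

Replaying the snapshot at recovery then amounts to re-inserting these pairs into the state machine before the remaining WAL is applied.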
async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { pf_debug!(self.id; "taking new snapshot: start {} exec {}", self.start_slot, self.exec_bar); @@ -1798,18 +1814,18 @@ impl GenericReplica for MultiPaxosReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigMultiPaxos; - batch_interval_us, max_batch_size, + batch_interval_ms, max_batch_size, backer_path, logger_sync, hb_hear_timeout_min, hb_hear_timeout_max, hb_send_interval_ms, snapshot_path, snapshot_interval_s, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; - if config.batch_interval_us == 0 { + if config.batch_interval_ms == 0 { return logged_err!( id; - "invalid config.batch_interval_us '{}'", - config.batch_interval_us + "invalid config.batch_interval_ms '{}'", + config.batch_interval_ms ); } if config.hb_hear_timeout_min < 100 { @@ -1898,7 +1914,7 @@ impl GenericReplica for MultiPaxosReplica { let external_api = ExternalApi::new_and_setup( id, api_addr, - Duration::from_micros(config.batch_interval_us), + Duration::from_millis(config.batch_interval_ms), config.max_batch_size, ) .await?; @@ -1946,7 +1962,7 @@ impl GenericReplica for MultiPaxosReplica { bal_max_seen: 0, commit_bar: 0, exec_bar: 0, - log_offset: 0, + wal_offset: 0, snap_offset: 0, }) } @@ -1958,8 +1974,8 @@ impl GenericReplica for MultiPaxosReplica { // recover state from durable snapshot file self.recover_from_snapshot().await?; - // recover the tail-piece memory log & state from durable storage log - self.recover_from_log().await?; + // recover the tail-piece memory log & state from durable WAL log + self.recover_from_wal().await?; // kick off leader activity hearing timer self.kickoff_hb_hear_timer()?; diff --git a/src/protocols/raft.rs b/src/protocols/raft.rs index f4a60f47..4d6b2408 100644 --- a/src/protocols/raft.rs +++ b/src/protocols/raft.rs @@ -1,16 +1,40 @@ //! Replication protocol: Raft. //! -//! References: +//! ATC '14 version of Raft. References: //! - //! - +//! - + +use std::collections::HashMap; +use std::path::Path; +use std::net::SocketAddr; use crate::utils::{SummersetError, Bitmap, Timer}; +use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; +use crate::server::{ + ReplicaId, ControlHub, StateMachine, Command, CommandResult, CommandId, + ExternalApi, ApiRequest, ApiReply, StorageHub, LogAction, LogResult, + LogActionId, TransportHub, GenericReplica, +}; +use crate::client::{ClientId, ClientApiStub, ClientCtrlStub, GenericEndpoint}; +use crate::protocols::SmrProtocol; + +use rand::prelude::*; + +use async_trait::async_trait; + +use get_size::GetSize; + +use serde::{Serialize, Deserialize}; + +use tokio::time::{self, Duration, Interval, MissedTickBehavior}; +use tokio::sync::watch; /// Configuration parameters struct. #[derive(Debug, Deserialize)] pub struct ReplicaConfigRaft { - /// Client request batching interval in microsecs. - pub batch_interval_us: u64, + /// Client request batching interval in millisecs. + pub batch_interval_ms: u64, /// Client request batching maximum batch size. pub max_batch_size: usize, @@ -21,6 +45,21 @@ pub struct ReplicaConfigRaft { /// Whether to call `fsync()`/`fdatasync()` on logger. pub logger_sync: bool, + /// Min timeout of not hearing any heartbeat from leader in millisecs. + pub hb_hear_timeout_min: u64, + /// Max timeout of not hearing any heartbeat from leader in millisecs. + pub hb_hear_timeout_max: u64, + + /// Interval of leader sending AppendEntries heartbeats to followers. 
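The `hb_hear_timeout_min`/`hb_hear_timeout_max` knobs above presumably bound a randomized leader-silence timeout, as in standard Raft, so that followers do not all start campaigning in lockstep after losing the leader. A standalone sketch assuming the `rand` 0.8 API that this patch already imports:

```rust
// Standalone sketch (not the crate's code): pick a fresh random timeout in
// [hb_hear_timeout_min, hb_hear_timeout_max] each time the "have I heard
// from the leader?" timer is re-armed.
use rand::Rng;
use std::time::Duration;

fn next_hb_hear_timeout(min_ms: u64, max_ms: u64) -> Duration {
    let ms = rand::thread_rng().gen_range(min_ms..=max_ms);
    Duration::from_millis(ms)
}

fn main() {
    let timeout = next_hb_hear_timeout(600, 900); // the defaults in this patch
    let ms = timeout.as_millis() as u64;
    assert!(ms >= 600 && ms <= 900);
    println!("next election timeout: {timeout:?}");
}
```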
+ pub hb_send_interval_ms: u64, + + /// Path to snapshot file. + pub snapshot_path: String, + + /// Snapshot self-triggering interval in secs. 0 means never trigger + /// snapshotting autonomously. + pub snapshot_interval_s: u64, + // Performance simulation params (all zeros means no perf simulation): pub perf_storage_a: u64, pub perf_storage_b: u64, @@ -32,10 +71,15 @@ pub struct ReplicaConfigRaft { impl Default for ReplicaConfigRaft { fn default() -> Self { ReplicaConfigRaft { - batch_interval_us: 1000, + batch_interval_ms: 10, max_batch_size: 5000, backer_path: "/tmp/summerset.raft.wal".into(), logger_sync: false, + hb_hear_timeout_min: 600, + hb_hear_timeout_max: 900, + hb_send_interval_ms: 50, + snapshot_path: "/tmp/summerset.multipaxos.snap".into(), + snapshot_interval_s: 0, perf_storage_a: 0, perf_storage_b: 0, perf_network_a: 0, @@ -44,6 +88,85 @@ impl Default for ReplicaConfigRaft { } } +/// Term number type, defined for better code readability. +type Term = u64; + +/// Request batch type (i.e., the "command" in an entry). +/// +/// NOTE: the originally presented Raft algorithm does not explicitly mention +/// batching, but instead hides it with the heartbeats: every AppendEntries RPC +/// from the leader basically batches all commands it has received since the +/// last sent heartbeat. Here, to make this implementation more comparable to +/// MultiPaxos, we trigger batching also explicitly. +type ReqBatch = Vec<(ClientId, ApiRequest)>; + +/// In-mem + persistent entry of log, containing a term and a commands batch. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] +struct LogEntry { + /// Term number. + term: Term, + + /// Batch of client requests. + reqs: ReqBatch, +} + +/// Stable storage log entry type. +/// +/// NOTE: Raft makes the persistent log exactly mirror the in-memory log, so +/// the backer file is not a WAL log in runtime operation; it might get +/// overwritten, etc. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] +enum DurEntry { + /// Durable metadata. + Metadata { + curr_term: Term, + voted_for: ReplicaId, + }, + + /// Log entry mirroring in-mem log. + LogEntry { entry: LogEntry }, +} + +/// Snapshot file entry type. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] +enum SnapEntry { + /// Necessary slot indices to remember. + SlotInfo { + /// First entry at the start of file: number of log entries covered + /// by this snapshot file == the start slot index of remaining log. + start_slot: usize, + }, + + /// Set of key-value pairs to apply to the state. + KVPairSet { pairs: HashMap }, +} + +/// Peer-peer message type. +#[derive(Debug, Clone, Serialize, Deserialize, GetSize)] +enum PeerMsg { + /// AppendEntries from leader to followers. + AppendEntries { + term: Term, + prev_slot: usize, + prev_term: Term, + entries: Vec, + leader_commit: usize, + }, + + /// AppendEntries reply from follower to leader. + AppendEntriesReply { term: Term, success: bool }, + + /// RequestVote from leader to followers. + RequestVote { + term: Term, + last_slot: usize, + last_term: Term, + }, + + /// RequestVote reply from follower to leader. + RequestVoteReply { term: Term, granted: bool }, +} + /// Raft server replica module. pub struct RaftReplica { /// Replica ID in cluster. @@ -57,6 +180,73 @@ pub struct RaftReplica { /// Configuration parameters struct. config: ReplicaConfigRaft, + + /// Address string for client requests API. 
+ _api_addr: SocketAddr, + + /// Address string for internal peer-peer communication. + _p2p_addr: SocketAddr, + + /// ControlHub module. + control_hub: ControlHub, + + /// ExternalApi module. + external_api: ExternalApi, + + /// StateMachine module. + state_machine: StateMachine, + + /// StorageHub module. + storage_hub: StorageHub, + + /// StorageHub module for the snapshot file. + snapshot_hub: StorageHub, + + /// TransportHub module. + transport_hub: TransportHub, + + /// Who do I think is the effective leader of the cluster right now? + leader: Option, + + /// Timer for hearing heartbeat from leader. + hb_hear_timer: Timer, + + /// Interval for sending heartbeat to followers. + hb_send_interval: Interval, + + /// Heartbeat reply counters for approximate detection of follower health. + /// Tuple of (#hb_replied, #hb_replied seen at last send, repetition). + hb_reply_cnts: HashMap, + + /// Approximate health status tracking of peer replicas. + peer_alive: Bitmap, + + /// Latest term seen. + curr_term: Term, + + /// Candidate ID that received vote in current term. + voted_for: ReplicaId, + + /// In-memory log of entries. + log: Vec, + + /// Map from in-mem log entry slot index -> offset in durable backer file. + log_offset: Vec, + + /// Slot index of highest log entry known to be committed. + commit_bar: usize, + + /// Slot index of highest log entry applied to state machine. + exec_bar: usize, + + /// For each server, index of the next log entry to send. + next_slot: HashMap, + + /// For each server, index of the highest log entry known to be replicated. + match_slot: HashMap, + + /// Current durable snapshot file offset. + snap_offset: usize, } #[async_trait] diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index af46cc69..a14af95b 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -28,8 +28,8 @@ use tokio::sync::watch; /// Configuration parameters struct. #[derive(Debug, Deserialize)] pub struct ReplicaConfigRepNothing { - /// Client request batching interval in microsecs. - pub batch_interval_us: u64, + /// Client request batching interval in millisecs. + pub batch_interval_ms: u64, /// Client request batching maximum batch size. pub max_batch_size: usize, @@ -49,7 +49,7 @@ pub struct ReplicaConfigRepNothing { impl Default for ReplicaConfigRepNothing { fn default() -> Self { ReplicaConfigRepNothing { - batch_interval_us: 1000, + batch_interval_ms: 10, max_batch_size: 5000, backer_path: "/tmp/summerset.rep_nothing.wal".into(), logger_sync: false, @@ -59,9 +59,9 @@ impl Default for ReplicaConfigRepNothing { } } -/// Log entry type. +/// WAL log entry type. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] -struct LogEntry { +struct WalEntry { reqs: Vec<(ClientId, ApiRequest)>, } @@ -97,13 +97,13 @@ pub struct RepNothingReplica { state_machine: StateMachine, /// StorageHub module. - storage_hub: StorageHub, + storage_hub: StorageHub, /// In-memory log of instances. insts: Vec, - /// Current durable log file offset. - log_offset: usize, + /// Current durable WAL log file offset. 
+ wal_offset: usize, } // RepNothingReplica common helpers @@ -144,11 +144,11 @@ impl RepNothingReplica { self.insts.push(inst); // submit log action to make this instance durable - let log_entry = LogEntry { reqs: req_batch }; + let wal_entry = WalEntry { reqs: req_batch }; self.storage_hub.submit_action( inst_idx as LogActionId, LogAction::Append { - entry: log_entry, + entry: wal_entry, sync: self.config.logger_sync, }, )?; @@ -163,7 +163,7 @@ impl RepNothingReplica { fn handle_log_result( &mut self, action_id: LogActionId, - log_result: LogResult, + log_result: LogResult, ) -> Result<(), SummersetError> { let inst_idx = action_id as usize; if inst_idx >= self.insts.len() { @@ -172,8 +172,8 @@ impl RepNothingReplica { match log_result { LogResult::Append { now_size } => { - assert!(now_size >= self.log_offset); - self.log_offset = now_size; + assert!(now_size >= self.wal_offset); + self.wal_offset = now_size; } _ => { return logged_err!(self.id; "unexpected log result type for {}: {:?}", inst_idx, log_result); @@ -340,15 +340,15 @@ impl RepNothingReplica { // RepNothingReplica recovery from WAL log impl RepNothingReplica { - /// Recover state from durable storage log. - async fn recover_from_log(&mut self) -> Result<(), SummersetError> { - assert_eq!(self.log_offset, 0); + /// Recover state from durable storage WAL log. + async fn recover_from_wal(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.wal_offset, 0); loop { // using 0 as a special log action ID self.storage_hub.submit_action( 0, LogAction::Read { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -374,7 +374,7 @@ impl RepNothingReplica { execed: vec![true; num_reqs], }); // update log offset - self.log_offset = end_offset; + self.wal_offset = end_offset; } LogResult::Read { entry: None, .. 
} => { // end of log reached @@ -390,7 +390,7 @@ impl RepNothingReplica { self.storage_hub.submit_action( 0, LogAction::Truncate { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -419,14 +419,14 @@ impl GenericReplica for RepNothingReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigRepNothing; - batch_interval_us, max_batch_size, + batch_interval_ms, max_batch_size, backer_path, logger_sync, perf_storage_a, perf_storage_b)?; - if config.batch_interval_us == 0 { + if config.batch_interval_ms == 0 { return logged_err!( id; - "invalid config.batch_interval_us '{}'", - config.batch_interval_us + "invalid config.batch_interval_ms '{}'", + config.batch_interval_ms ); } @@ -460,7 +460,7 @@ impl GenericReplica for RepNothingReplica { let external_api = ExternalApi::new_and_setup( id, api_addr, - Duration::from_micros(config.batch_interval_us), + Duration::from_millis(config.batch_interval_ms), config.max_batch_size, ) .await?; @@ -475,7 +475,7 @@ impl GenericReplica for RepNothingReplica { state_machine, storage_hub, insts: vec![], - log_offset: 0, + wal_offset: 0, }) } @@ -483,8 +483,8 @@ impl GenericReplica for RepNothingReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { - // recover state from durable storage log - self.recover_from_log().await?; + // recover state from durable storage WAL log + self.recover_from_wal().await?; // main event loop let mut paused = false; diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 750ec48f..3280baf1 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -33,8 +33,8 @@ use reed_solomon_erasure::galois_8::ReedSolomon; /// Configuration parameters struct. #[derive(Debug, Deserialize)] pub struct ReplicaConfigRSPaxos { - /// Client request batching interval in microsecs. - pub batch_interval_us: u64, + /// Client request batching interval in millisecs. + pub batch_interval_ms: u64, /// Client request batching maximum batch size. pub max_batch_size: usize, @@ -77,7 +77,7 @@ pub struct ReplicaConfigRSPaxos { impl Default for ReplicaConfigRSPaxos { fn default() -> Self { ReplicaConfigRSPaxos { - batch_interval_us: 1000, + batch_interval_ms: 10, max_batch_size: 5000, backer_path: "/tmp/summerset.rs_paxos.wal".into(), logger_sync: false, @@ -159,12 +159,12 @@ struct Instance { external: bool, /// Offset of first durable WAL log entry related to this instance. - log_offset: usize, + wal_offset: usize, } -/// Stable storage log entry type. +/// Stable storage WAL log entry type. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] -enum LogEntry { +enum WalEntry { /// Records an update to the largest prepare ballot seen. PrepareBal { slot: usize, ballot: Ballot }, @@ -180,6 +180,10 @@ enum LogEntry { } /// Snapshot file entry type. +/// +/// NOTE: the current implementation simply appends a squashed log at the +/// end of the snapshot file for simplicity. In production, the snapshot +/// file should be a bounded-sized backend, e.g., an LSM-tree. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum SnapEntry { /// Necessary slot indices to remember. @@ -199,7 +203,13 @@ enum SnapEntry { #[derive(Debug, Clone, Serialize, Deserialize, GetSize)] enum PeerMsg { /// Prepare message from leader to replicas. - Prepare { slot: usize, ballot: Ballot }, + Prepare { + /// Slot index in Prepare message is the triggering slot of this + /// Prepare. 
Once prepared, it means that all slots in the range + /// [slot, +infinity) are prepared under this ballot number. + slot: usize, + ballot: Ballot, + }, /// Prepare reply from replica to leader. PrepareReply { @@ -266,7 +276,7 @@ pub struct RSPaxosReplica { state_machine: StateMachine, /// StorageHub module. - storage_hub: StorageHub, + storage_hub: StorageHub, /// StorageHub module for the snapshot file. snapshot_hub: StorageHub, @@ -315,8 +325,8 @@ pub struct RSPaxosReplica { /// It is always true that exec_bar <= commit_bar <= start_slot + insts.len() exec_bar: usize, - /// Current durable log file offset. - log_offset: usize, + /// Current durable WAL log file offset. + wal_offset: usize, /// Current durable snapshot file offset. snap_offset: usize, @@ -353,7 +363,7 @@ impl RSPaxosReplica { leader_bk: None, replica_bk: None, external: false, - log_offset: 0, + wal_offset: 0, }) } @@ -505,7 +515,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { + entry: WalEntry::PrepareBal { slot, ballot: self.bal_prep_sent, }, @@ -542,7 +552,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { + entry: WalEntry::AcceptData { slot, ballot: inst.bal, // persist only one shard on myself @@ -719,7 +729,7 @@ impl RSPaxosReplica { fn handle_log_result( &mut self, action_id: LogActionId, - log_result: LogResult, + log_result: LogResult, ) -> Result<(), SummersetError> { let (slot, entry_type) = Self::split_log_action_id(action_id); if slot < self.start_slot { @@ -728,15 +738,15 @@ impl RSPaxosReplica { assert!(slot < self.start_slot + self.insts.len()); if let LogResult::Append { now_size } = log_result { - assert!(now_size >= self.log_offset); - // update first log_offset of slot + assert!(now_size >= self.wal_offset); + // update first wal_offset of slot let inst = &mut self.insts[slot - self.start_slot]; - if inst.log_offset == 0 || inst.log_offset > self.log_offset { - inst.log_offset = self.log_offset; + if inst.wal_offset == 0 || inst.wal_offset > self.wal_offset { + inst.wal_offset = self.wal_offset; } - assert!(inst.log_offset <= self.log_offset); - // then update self.log_offset - self.log_offset = now_size; + assert!(inst.wal_offset <= self.wal_offset); + // then update self.wal_offset + self.wal_offset = now_size; } else { return logged_err!(self.id; "unexpected log result type: {:?}", log_result); } @@ -787,7 +797,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { slot, ballot }, + entry: WalEntry::PrepareBal { slot, ballot }, sync: self.config.logger_sync, }, )?; @@ -884,7 +894,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { + entry: WalEntry::AcceptData { slot, ballot, reqs_cw: subset_copy, @@ -956,7 +966,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { + entry: WalEntry::AcceptData { slot, ballot, reqs_cw: inst.reqs_cw.clone(), @@ -1022,7 +1032,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Committed), LogAction::Append { - entry: LogEntry::CommitSlot { slot }, + entry: WalEntry::CommitSlot { 
slot }, sync: self.config.logger_sync, }, )?; @@ -1071,7 +1081,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Committed), LogAction::Append { - entry: LogEntry::CommitSlot { slot }, + entry: WalEntry::CommitSlot { slot }, sync: self.config.logger_sync, }, )?; @@ -1345,7 +1355,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { + entry: WalEntry::PrepareBal { slot, ballot: self.bal_prep_sent, }, @@ -1619,10 +1629,10 @@ impl RSPaxosReplica { /// Apply a durable storage log entry for recovery. async fn recover_apply_entry( &mut self, - entry: LogEntry, + entry: WalEntry, ) -> Result<(), SummersetError> { match entry { - LogEntry::PrepareBal { slot, ballot } => { + WalEntry::PrepareBal { slot, ballot } => { if slot < self.start_slot { return Ok(()); // ignore if slot index outdated } @@ -1644,7 +1654,7 @@ impl RSPaxosReplica { self.bal_prepared = 0; } - LogEntry::AcceptData { + WalEntry::AcceptData { slot, ballot, reqs_cw, @@ -1677,7 +1687,7 @@ impl RSPaxosReplica { assert!(self.bal_prepared <= self.bal_prep_sent); } - LogEntry::CommitSlot { slot } => { + WalEntry::CommitSlot { slot } => { if slot < self.start_slot { return Ok(()); // ignore if slot index outdated } @@ -1726,15 +1736,15 @@ impl RSPaxosReplica { Ok(()) } - /// Recover state from durable storage log. - async fn recover_from_log(&mut self) -> Result<(), SummersetError> { - assert_eq!(self.log_offset, 0); + /// Recover state from durable storage WAL log. + async fn recover_from_wal(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.wal_offset, 0); loop { // using 0 as a special log action ID self.storage_hub.submit_action( 0, LogAction::Read { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -1746,7 +1756,7 @@ impl RSPaxosReplica { } => { self.recover_apply_entry(entry).await?; // update log offset - self.log_offset = end_offset; + self.wal_offset = end_offset; } LogResult::Read { entry: None, .. } => { // end of log reached @@ -1762,7 +1772,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( 0, LogAction::Truncate { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -1770,7 +1780,7 @@ impl RSPaxosReplica { offset_ok: true, .. } = log_result { - if self.log_offset > 0 { + if self.wal_offset > 0 { pf_info!(self.id; "recovered from wal log: commit {} exec {}", self.commit_bar, self.exec_bar); } @@ -1783,7 +1793,7 @@ impl RSPaxosReplica { // RSPaxosReplica snapshotting & GC logic impl RSPaxosReplica { - /// Dump a new key-value pair to snapshot file. + /// Dump new key-value pairs to snapshot file. async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { // collect all key-value pairs put up to exec_bar let mut pairs = HashMap::new(); @@ -1824,9 +1834,9 @@ impl RSPaxosReplica { /// Discard everything older than start_slot in durable WAL log. 
async fn snapshot_discard_log(&mut self) -> Result<(), SummersetError> { let cut_offset = if !self.insts.is_empty() { - self.insts[0].log_offset + self.insts[0].wal_offset } else { - self.log_offset + self.wal_offset }; // discard the log before cut_offset @@ -1845,8 +1855,8 @@ impl RSPaxosReplica { now_size, } = log_result { - assert_eq!(self.log_offset - cut_offset, now_size); - self.log_offset = now_size; + assert_eq!(self.wal_offset - cut_offset, now_size); + self.wal_offset = now_size; } else { return logged_err!( self.id; @@ -1858,11 +1868,11 @@ impl RSPaxosReplica { } } - // update inst.log_offset for all remaining in-mem instances + // update inst.wal_offset for all remaining in-mem instances for inst in &mut self.insts { - if inst.log_offset > 0 { - assert!(inst.log_offset >= cut_offset); - inst.log_offset -= cut_offset; + if inst.wal_offset > 0 { + assert!(inst.wal_offset >= cut_offset); + inst.wal_offset -= cut_offset; } } @@ -1875,6 +1885,12 @@ impl RSPaxosReplica { /// NOTE: the current implementation does not guard against crashes in the /// middle of taking a snapshot. Production quality implementations should /// make the snapshotting action "atomic". + /// + /// NOTE: the current implementation does not take care of InstallSnapshot + /// messages (which is needed when some lagging follower has some slot + /// which all other peers have snapshotted); we assume here that failed + /// Accept messages will be retried indefinitely until success before its + /// associated data gets discarded from leader's memory. async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { pf_debug!(self.id; "taking new snapshot: start {} exec {}", self.start_slot, self.exec_bar); @@ -2053,7 +2069,7 @@ impl GenericReplica for RSPaxosReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigRSPaxos; - batch_interval_us, max_batch_size, + batch_interval_ms, max_batch_size, backer_path, logger_sync, hb_hear_timeout_min, hb_hear_timeout_max, hb_send_interval_ms, @@ -2061,11 +2077,11 @@ impl GenericReplica for RSPaxosReplica { fault_tolerance, recon_chunk_size, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; - if config.batch_interval_us == 0 { + if config.batch_interval_ms == 0 { return logged_err!( id; - "invalid config.batch_interval_us '{}'", - config.batch_interval_us + "invalid config.batch_interval_ms '{}'", + config.batch_interval_ms ); } if config.hb_hear_timeout_min < 100 { @@ -2173,7 +2189,7 @@ impl GenericReplica for RSPaxosReplica { let external_api = ExternalApi::new_and_setup( id, api_addr, - Duration::from_micros(config.batch_interval_us), + Duration::from_millis(config.batch_interval_ms), config.max_batch_size, ) .await?; @@ -2221,7 +2237,7 @@ impl GenericReplica for RSPaxosReplica { bal_max_seen: 0, commit_bar: 0, exec_bar: 0, - log_offset: 0, + wal_offset: 0, snap_offset: 0, rs_coder, }) @@ -2234,8 +2250,8 @@ impl GenericReplica for RSPaxosReplica { // recover state from durable snapshot file self.recover_from_snapshot().await?; - // recover the tail-piece memory log & state from durable storage log - self.recover_from_log().await?; + // recover the tail-piece memory log & state from durable WAL log + self.recover_from_wal().await?; // kick off leader activity hearing timer self.kickoff_hb_hear_timer()?; diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index a0345d7e..93baeb0c 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -29,8 +29,8 @@ use 
tokio::sync::watch; /// Configuration parameters struct. #[derive(Debug, Deserialize)] pub struct ReplicaConfigSimplePush { - /// Client request batching interval in microsecs. - pub batch_interval_us: u64, + /// Client request batching interval in millisecs. + pub batch_interval_ms: u64, /// Client request batching maximum batch size. pub max_batch_size: usize, @@ -52,7 +52,7 @@ pub struct ReplicaConfigSimplePush { impl Default for ReplicaConfigSimplePush { fn default() -> Self { ReplicaConfigSimplePush { - batch_interval_us: 1000, + batch_interval_ms: 10, max_batch_size: 5000, backer_path: "/tmp/summerset.simple_push.wal".into(), rep_degree: 2, @@ -64,9 +64,9 @@ impl Default for ReplicaConfigSimplePush { } } -/// Log entry type. +/// WAL log entry type. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] -enum LogEntry { +enum WalEntry { FromClient { reqs: Vec<(ClientId, ApiRequest)>, }, @@ -126,7 +126,7 @@ pub struct SimplePushReplica { state_machine: StateMachine, /// StorageHub module. - storage_hub: StorageHub, + storage_hub: StorageHub, /// TransportHub module. transport_hub: TransportHub, @@ -134,8 +134,8 @@ pub struct SimplePushReplica { /// In-memory log of instances. insts: Vec, - /// Current durable log file offset. - log_offset: usize, + /// Current durable WAL log file offset. + wal_offset: usize, } // SimplePushReplica common helpers @@ -192,13 +192,13 @@ impl SimplePushReplica { self.insts.push(inst); // submit log action to make this instance durable - let log_entry = LogEntry::FromClient { + let wal_entry = WalEntry::FromClient { reqs: req_batch.clone(), }; self.storage_hub.submit_action( inst_idx as LogActionId, LogAction::Append { - entry: log_entry, + entry: wal_entry, sync: true, }, )?; @@ -222,7 +222,7 @@ impl SimplePushReplica { fn handle_log_result( &mut self, action_id: LogActionId, - log_result: LogResult, + log_result: LogResult, ) -> Result<(), SummersetError> { let inst_idx = action_id as usize; if inst_idx >= self.insts.len() { @@ -231,8 +231,8 @@ impl SimplePushReplica { match log_result { LogResult::Append { now_size } => { - assert!(now_size >= self.log_offset); - self.log_offset = now_size; + assert!(now_size >= self.wal_offset); + self.wal_offset = now_size; } _ => { return logged_err!(self.id; "unexpected log result type for {}: {:?}", inst_idx, log_result); @@ -296,7 +296,7 @@ impl SimplePushReplica { self.insts.push(inst); // submit log action to make this instance durable - let log_entry = LogEntry::PeerPushed { + let wal_entry = WalEntry::PeerPushed { peer, src_inst_idx, reqs: req_batch.clone(), @@ -304,7 +304,7 @@ impl SimplePushReplica { self.storage_hub.submit_action( inst_idx as LogActionId, LogAction::Append { - entry: log_entry, + entry: wal_entry, sync: true, }, )?; @@ -508,15 +508,15 @@ impl SimplePushReplica { // SimplePushReplica recovery from WAL log impl SimplePushReplica { - /// Recover state from durable storage log. - async fn recover_from_log(&mut self) -> Result<(), SummersetError> { - assert_eq!(self.log_offset, 0); + /// Recover state from durable storage WAL log. 
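This recovery loop has the same shape in every protocol this patch touches: read entries starting at offset 0 until a None read, apply each one, then truncate the file to the last good offset to drop any partial tail record. A minimal standalone sketch of that shape, with a hypothetical closure-based reader standing in for the StorageHub read/truncate actions:

// Generic shape of these WAL recovery loops (hypothetical reader interface;
// the real code submits LogAction::Read / LogAction::Truncate to StorageHub):
// keep reading from the current offset, apply every complete entry, and hand
// back the final offset so the caller can truncate away any partial tail.
fn recover<E>(
    mut read_at: impl FnMut(usize) -> Option<(E, usize)>, // entry + end offset
    mut apply: impl FnMut(E),
) -> usize {
    let mut offset = 0;
    while let Some((entry, end_offset)) = read_at(offset) {
        apply(entry);
        offset = end_offset;
    }
    offset
}

fn main() {
    let file: Vec<u64> = vec![10, 20, 30]; // toy log, one entry per 8 bytes
    let end = recover(
        |off| file.get(off / 8).map(|&v| (v, off + 8)),
        |v| println!("applied entry {v}"),
    );
    assert_eq!(end, 24); // truncate the durable file to this offset
}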
+ async fn recover_from_wal(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.wal_offset, 0); loop { // using 0 as a special log action ID self.storage_hub.submit_action( 0, LogAction::Read { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -527,8 +527,8 @@ impl SimplePushReplica { end_offset, } => { let (from_peer, reqs) = match entry { - LogEntry::FromClient { reqs } => (None, reqs), - LogEntry::PeerPushed { + WalEntry::FromClient { reqs } => (None, reqs), + WalEntry::PeerPushed { peer, src_inst_idx, reqs, @@ -552,7 +552,7 @@ impl SimplePushReplica { from_peer, }); // update log offset - self.log_offset = end_offset; + self.wal_offset = end_offset; } LogResult::Read { entry: None, .. } => { // end of log reached @@ -568,7 +568,7 @@ impl SimplePushReplica { self.storage_hub.submit_action( 0, LogAction::Truncate { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -598,15 +598,15 @@ impl GenericReplica for SimplePushReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigSimplePush; - batch_interval_us, max_batch_size, + batch_interval_ms, max_batch_size, backer_path, rep_degree, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; - if config.batch_interval_us == 0 { + if config.batch_interval_ms == 0 { return logged_err!( id; - "invalid config.batch_interval_us '{}'", - config.batch_interval_us + "invalid config.batch_interval_ms '{}'", + config.batch_interval_ms ); } @@ -666,7 +666,7 @@ impl GenericReplica for SimplePushReplica { let external_api = ExternalApi::new_and_setup( id, api_addr, - Duration::from_micros(config.batch_interval_us), + Duration::from_millis(config.batch_interval_ms), config.max_batch_size, ) .await?; @@ -683,7 +683,7 @@ impl GenericReplica for SimplePushReplica { storage_hub, transport_hub, insts: vec![], - log_offset: 0, + wal_offset: 0, }) } @@ -691,8 +691,8 @@ impl GenericReplica for SimplePushReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { - // recover state from durable storage log - self.recover_from_log().await?; + // recover state from durable storage WAL log + self.recover_from_wal().await?; // main event loop let mut paused = false; diff --git a/src/server/transport.rs b/src/server/transport.rs index e3b464f4..a91c44f4 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -1,9 +1,10 @@ //! Summerset server internal TCP transport module implementation. //! -//! In concept, all messages are sent through unstable communication channels, -//! and are retried if the sender did not receive an ACK in a timely manner. -//! Here, we use TCP as the communication protocol to get the same effect of -//! "every message a sender wants to send will eventually be delivered". +//! NOTE: In concept, all messages are sent through unstable communication +//! channels, and are retried if the sender did not receive an ACK in a timely +//! manner. Here, we use TCP as the communication protocol to get the same +//! effect of "every message a sender wants to send will be retried until +//! eventually delivered". 
use std::fmt; use std::net::SocketAddr; From 7d69552d0f0ec37bbd609aa0dfa052771789bde6 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 8 Oct 2023 10:30:48 -0500 Subject: [PATCH 85/89] basically finished Raft impl --- README.md | 1 + src/protocols/crossword.rs | 6 +- src/protocols/multipaxos.rs | 6 +- src/protocols/raft.rs | 1680 ++++++++++++++++++++++++++++++++++- src/protocols/rs_paxos.rs | 6 +- 5 files changed, 1679 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index d968b5d5..89481b4a 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,7 @@ Summerset is a distributed, replicated, protocol-generic key-value store support | `SimplePush` | Pushing to peers w/o any consistency guarantees | | `MultiPaxos` | Classic [MultiPaxos](https://www.microsoft.com/en-us/research/uploads/prod/2016/12/paxos-simple-Copy.pdf) protocol | | `RS-Paxos` | MultiPaxos w/ Reed-Solomon erasure code sharding | +| `Raft` | Explicit notion of log and strong leadership | Formal TLA+ specification of some protocols are provided in `tla+/`. diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 69028ce8..71942eb5 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -2338,7 +2338,7 @@ impl CrosswordReplica { Ok(()) } - /// Take a snapshot up to current exec_idx, then discard the in-mem log up + /// Take a snapshot up to current exec_bar, then discard the in-mem log up /// to that index as well as outdate entries in the durable WAL log file. /// /// NOTE: the current implementation does not guard against crashes in the @@ -2383,7 +2383,7 @@ impl CrosswordReplica { offset_ok: true, .. } => {} _ => { - return logged_err!(self.id; "unexpected log result type or failed truncate"); + return logged_err!(self.id; "unexpected log result type or failed write"); } } @@ -2502,7 +2502,7 @@ impl CrosswordReplica { self.snap_offset = now_size; Ok(()) } else { - logged_err!(self.id; "unexpected log result type or failed truncate") + logged_err!(self.id; "unexpected log result type or failed write") } } diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index fef9bf60..a19e1d48 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -1624,7 +1624,7 @@ impl MultiPaxosReplica { Ok(()) } - /// Take a snapshot up to current exec_idx, then discard the in-mem log up + /// Take a snapshot up to current exec_bar, then discard the in-mem log up /// to that index as well as outdate entries in the durable WAL log file. /// /// NOTE: the current implementation does not guard against crashes in the @@ -1669,7 +1669,7 @@ impl MultiPaxosReplica { offset_ok: true, .. } => {} _ => { - return logged_err!(self.id; "unexpected log result type or failed truncate"); + return logged_err!(self.id; "unexpected log result type or failed write"); } } @@ -1788,7 +1788,7 @@ impl MultiPaxosReplica { self.snap_offset = now_size; Ok(()) } else { - logged_err!(self.id; "unexpected log result type or failed truncate") + logged_err!(self.id; "unexpected log result type or failed write") } } diff --git a/src/protocols/raft.rs b/src/protocols/raft.rs index 4d6b2408..4693b5cf 100644 --- a/src/protocols/raft.rs +++ b/src/protocols/raft.rs @@ -5,7 +5,7 @@ //! - //! 
- -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::path::Path; use std::net::SocketAddr; @@ -78,7 +78,7 @@ impl Default for ReplicaConfigRaft { hb_hear_timeout_min: 600, hb_hear_timeout_max: 900, hb_send_interval_ms: 50, - snapshot_path: "/tmp/summerset.multipaxos.snap".into(), + snapshot_path: "/tmp/summerset.raft.snap".into(), snapshot_interval_s: 0, perf_storage_a: 0, perf_storage_b: 0, @@ -108,6 +108,14 @@ struct LogEntry { /// Batch of client requests. reqs: ReqBatch, + + /// True if from external client, else false. + external: bool, + + /// Offset in durable log file of this entry. This field is not maintained + /// in durable storage itself, where it is typically 0. It is maintained + /// only in the in-memory log. + log_offset: usize, } /// Stable storage log entry type. @@ -120,7 +128,7 @@ enum DurEntry { /// Durable metadata. Metadata { curr_term: Term, - voted_for: ReplicaId, + voted_for: Option, }, /// Log entry mirroring in-mem log. @@ -154,7 +162,12 @@ enum PeerMsg { }, /// AppendEntries reply from follower to leader. - AppendEntriesReply { term: Term, success: bool }, + AppendEntriesReply { + term: Term, + /// For correct tracking of which AppendEntries this reply is for. + end_slot: usize, + success: bool, + }, /// RequestVote from leader to followers. RequestVote { @@ -167,6 +180,16 @@ enum PeerMsg { RequestVoteReply { term: Term, granted: bool }, } +/// Replica role type. +#[derive( + Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Serialize, Deserialize, +)] +enum Role { + Follower, + Candidate, + Leader, +} + /// Raft server replica module. pub struct RaftReplica { /// Replica ID in cluster. @@ -205,6 +228,9 @@ pub struct RaftReplica { /// TransportHub module. transport_hub: TransportHub, + /// Which role am I in right now? + role: Role, + /// Who do I think is the effective leader of the cluster right now? leader: Option, @@ -224,20 +250,26 @@ pub struct RaftReplica { /// Latest term seen. curr_term: Term, - /// Candidate ID that received vote in current term. - voted_for: ReplicaId, + /// Candidate ID that I voted for in current term. + voted_for: Option, + + /// Replica IDs that voted for me in current election. + votes_granted: HashSet, - /// In-memory log of entries. + /// In-memory log of entries. Slot 0 is a dummy entry to make indexing happy. log: Vec, - /// Map from in-mem log entry slot index -> offset in durable backer file. - log_offset: Vec, + /// Start slot index of in-mem log after latest snapshot. + start_slot: usize, + + /// Timer for taking a new autonomous snapshot. + snapshot_interval: Interval, /// Slot index of highest log entry known to be committed. - commit_bar: usize, + last_commit: usize, /// Slot index of highest log entry applied to state machine. - exec_bar: usize, + last_exec: usize, /// For each server, index of the next log entry to send. next_slot: HashMap, @@ -245,10 +277,1359 @@ pub struct RaftReplica { /// For each server, index of the highest log entry known to be replicated. match_slot: HashMap, + /// Current durable log file end offset. + log_offset: usize, + /// Current durable snapshot file offset. snap_offset: usize, } +// RaftReplica common helpers +impl RaftReplica { + /// Compose LogActionId from (slot, end_slot) pair & entry type. + /// Uses the `Role` enum type to represent differnet entry types. 
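The composed ID packs the entry type into the low 2 bits, the end slot into the next 31 bits, and the start slot into the bits above; a standalone round-trip sketch, assuming `LogActionId` is a `u64` alias as elsewhere in the codebase (`pack`/`unpack` are illustrative names):

// Round trip of the packing scheme below
// (layout: | slot (high bits) | end_slot (31 bits) | entry type (2 bits) |).
fn pack(slot: u64, slot_e: u64, type_num: u64) -> u64 {
    (slot << 33) | (slot_e << 2) | type_num
}

fn unpack(id: u64) -> (u64, u64, u64) {
    (id >> 33, (id & ((1 << 33) - 1)) >> 2, id & 0b11)
}

fn main() {
    let id = pack(5, 7, 2); // e.g. a leader append covering slots 5..=7
    assert_eq!(unpack(id), (5, 7, 2));
}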
+ #[inline] + fn make_log_action_id( + slot: usize, + slot_e: usize, + entry_type: Role, + ) -> LogActionId { + let type_num = match entry_type { + Role::Follower => 1, + Role::Leader => 2, + _ => panic!("unknown log entry type {:?}", entry_type), + }; + ((slot << 33) | (slot_e << 2) | type_num) as LogActionId + } + + /// Decompose LogActionId into (slot, end_slot) pair & entry type. + #[inline] + fn split_log_action_id(log_action_id: LogActionId) -> (usize, usize, Role) { + let slot = (log_action_id >> 33) as usize; + let slot_e = ((log_action_id & ((1 << 33) - 1)) >> 2) as usize; + let type_num = log_action_id & ((1 << 2) - 1); + let entry_type = match type_num { + 1 => Role::Follower, + 2 => Role::Leader, + _ => panic!("unknown log entry type num {}", type_num), + }; + (slot, slot_e, entry_type) + } + + /// Compose CommandId from slot index & command index within. + #[inline] + fn make_command_id(slot: usize, cmd_idx: usize) -> CommandId { + assert!(slot <= (u32::MAX as usize)); + assert!(cmd_idx <= (u32::MAX as usize)); + ((slot << 32) | cmd_idx) as CommandId + } + + /// Decompose CommandId into slot index & command index within. + #[inline] + fn split_command_id(command_id: CommandId) -> (usize, usize) { + let slot = (command_id >> 32) as usize; + let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; + (slot, cmd_idx) + } + + /// Check if the given term is larger than mine. If so, convert my role + /// back to follower. Returns true if my role was not follower but now + /// converted to follower, and false otherwise. + #[inline] + fn check_term( + &mut self, + peer: ReplicaId, + term: Term, + ) -> Result { + if term > self.curr_term { + self.curr_term = term; + self.heard_heartbeat(peer, term)?; // refresh election timer + if self.role != Role::Follower { + self.role = Role::Follower; + Ok(true) + } else { + Ok(false) + } + } else { + Ok(false) + } + } +} + +// RaftReplica client requests entrance +impl RaftReplica { + /// Handler of client request batch chan recv. + fn handle_req_batch( + &mut self, + req_batch: ReqBatch, + ) -> Result<(), SummersetError> { + let batch_size = req_batch.len(); + assert!(batch_size > 0); + pf_debug!(self.id; "got request batch of size {}", batch_size); + + // if I'm not a leader, ignore client requests + if self.role != Role::Leader { + for (client, req) in req_batch { + if let ApiRequest::Req { id: req_id, .. } = req { + // tell the client to try on known leader or just the + // next ID replica + let target = if let Some(peer) = self.leader { + peer + } else { + (self.id + 1) % self.population + }; + self.external_api.send_reply( + ApiReply::Reply { + id: req_id, + result: None, + redirect: Some(target), + }, + client, + )?; + pf_trace!(self.id; "redirected client {} to replica {}", + client, target); + } + } + return Ok(()); + } + + // append an entry to in-memory log + let entry = LogEntry { + term: self.curr_term, + reqs: req_batch, + external: true, + log_offset: self.log_offset, + }; + let slot = self.start_slot + self.log.len(); + self.log.push(entry.clone()); + + // submit logger action to make this log entry durable + self.storage_hub.submit_action( + Self::make_log_action_id(slot, slot, Role::Leader), + LogAction::Append { + entry: DurEntry::LogEntry { entry }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted leader append log action for slot {}", slot); + + Ok(()) + } +} + +// RaftReplica durable logging +impl RaftReplica { + /// Handler of leader append logging result chan recv. 
+ fn handle_logged_leader_append( + &mut self, + slot: usize, + slot_e: usize, + ) -> Result<(), SummersetError> { + if slot < self.start_slot || self.role != Role::Leader { + return Ok(()); // ignore if outdated + } + pf_trace!(self.id; "finished leader append logging for slot {} <= {}", + slot, slot_e); + assert_eq!(slot, slot_e); + + // broadcast AppendEntries messages to followers + for peer in 0..self.population { + if peer == self.id { + continue; + } + + let prev_slot = self.next_slot[&peer] - 1; + if prev_slot < self.start_slot { + pf_error!(self.id; "snapshotted slot {} queried", prev_slot); + } + let prev_term = self.log[prev_slot - self.start_slot].term; + let entries = self + .log + .iter() + .skip(self.next_slot[&peer] - self.start_slot) + .cloned() + .collect(); + + if slot >= self.next_slot[&peer] { + self.transport_hub.send_msg( + PeerMsg::AppendEntries { + term: self.curr_term, + prev_slot, + prev_term, + entries, + leader_commit: self.last_commit, + }, + peer, + )?; + pf_trace!(self.id; "sent AppendEntries -> {} with slots {} - {}", + peer, self.next_slot[&peer], + self.start_slot + self.log.len() - 1); + } + } + + // I also heard my own heartbeat + self.heard_heartbeat(self.id, self.curr_term)?; + + Ok(()) + } + + /// Handler of follower append logging result chan recv. + fn handle_logged_follower_append( + &mut self, + slot: usize, + slot_e: usize, + ) -> Result<(), SummersetError> { + if slot < self.start_slot || self.role != Role::Follower { + return Ok(()); // ignore if outdated + } + pf_trace!(self.id; "finished follower append logging for slot {} <= {}", + slot, slot_e); + assert!(slot <= slot_e); + + // submit newly committed entry for state machine execution + if slot > self.last_exec && slot <= self.last_commit { + let entry = &self.log[slot - self.start_slot]; + for (cmd_idx, (_, req)) in entry.reqs.iter().enumerate() { + if let ApiRequest::Req { cmd, .. } = req { + self.state_machine.submit_cmd( + Self::make_command_id(slot, cmd_idx), + cmd.clone(), + )?; + } else { + continue; // ignore other types of requests + } + } + } + + // if all consecutive entries are made durable, reply AppendEntries + // success back to leader + if slot == slot_e { + if let Some(leader) = self.leader { + self.transport_hub.send_msg( + PeerMsg::AppendEntriesReply { + term: self.curr_term, + end_slot: slot_e, + success: true, + }, + leader, + )?; + pf_trace!(self.id; "sent AppendEntriesReply -> {} up to slot {}", + leader, slot_e); + } + } + + Ok(()) + } + + /// Synthesized handler of durable logging result chan recv. 
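In the leader-append handler above, what gets shipped to each follower is fully determined by that follower's `next_slot`: the entry just before it supplies the consistency-check point, and everything from `next_slot` onward rides along. A toy sketch of that selection, with log entries reduced to just their terms and the snapshot `start_slot` offset ignored:

// Toy version of the per-follower AppendEntries argument selection above:
// prev_slot sits right before the follower's next_slot, and every entry from
// next_slot to the end of the log is shipped.
fn append_args(log_terms: &[u64], next_slot: usize) -> (usize, u64, Vec<u64>) {
    let prev_slot = next_slot - 1;
    (prev_slot, log_terms[prev_slot], log_terms[next_slot..].to_vec())
}

fn main() {
    let terms = [0, 1, 1, 2, 2]; // slots 0..=4; this follower's next_slot is 3
    assert_eq!(append_args(&terms, 3), (2, 1, vec![2, 2]));
}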
+ fn handle_log_result( + &mut self, + action_id: LogActionId, + log_result: LogResult, + ) -> Result<(), SummersetError> { + let (slot, slot_e, entry_type) = Self::split_log_action_id(action_id); + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } + assert!(slot_e < self.start_slot + self.log.len()); + + if let LogResult::Append { now_size } = log_result { + assert_eq!( + self.log[slot - self.start_slot].log_offset, + self.log_offset + ); + assert!(now_size > self.log_offset); + self.log_offset = now_size; + } else { + return logged_err!(self.id; "unexpected log result type: {:?}", log_result); + } + + match entry_type { + Role::Follower => self.handle_logged_follower_append(slot, slot_e), + Role::Leader => self.handle_logged_leader_append(slot, slot_e), + _ => { + logged_err!(self.id; "unexpected log entry type: {:?}", entry_type) + } + } + } +} + +// RaftReplica peer-peer messages handling +impl RaftReplica { + /// Handler of AppendEntries message from leader. + async fn handle_msg_append_entries( + &mut self, + leader: ReplicaId, + term: Term, + prev_slot: usize, + prev_term: Term, + mut entries: Vec, + leader_commit: usize, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received AcceptEntries <- {} for slot > {} term {}", + leader, prev_slot, term); + if self.check_term(leader, term)? || self.role != Role::Follower { + return Ok(()); + } + + // reply false if term smaller than mine, or if my log does not + // contain an entry at prev_slot matching prev_term + if term < self.curr_term + || prev_slot < self.start_slot + || prev_slot >= self.start_slot + self.log.len() + || self.log[prev_slot - self.start_slot].term != prev_term + { + self.transport_hub.send_msg( + PeerMsg::AppendEntriesReply { + term: self.curr_term, + end_slot: prev_slot, + success: false, + }, + leader, + )?; + + if term >= self.curr_term { + // also refresh heartbeat timer here since the "decrementing" + // procedure for a lagging follower might take long + self.heard_heartbeat(leader, term)?; + } + return Ok(()); + } + + // update my knowledge of who's the current leader, and reset election + // timeout timer + self.leader = Some(leader); + self.heard_heartbeat(leader, term)?; + + // check if any existing entry conflicts with a new one in `entries`. 
+ // If so, truncate everything at and after that entry + let mut first_new = prev_slot + 1; + for (slot, new_entry) in entries + .iter() + .enumerate() + .map(|(s, e)| (s + prev_slot + 1, e)) + { + if slot >= self.start_slot + self.log.len() { + first_new = slot; + break; + } else if self.log[slot - self.start_slot].term != new_entry.term { + let cut_offset = self.log[slot - self.start_slot].log_offset; + // do this truncation in-place for simplicity + self.storage_hub.submit_action( + 0, + LogAction::Truncate { offset: cut_offset }, + )?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id != 0 { + // normal log action previously in queue; process it + self.handle_log_result(action_id, log_result)?; + } else { + if let LogResult::Truncate { + offset_ok: true, + now_size, + } = log_result + { + assert_eq!(now_size, cut_offset); + self.log_offset = cut_offset; + } else { + return logged_err!( + self.id; + "unexpected log result type or failed truncate" + ); + } + break; + } + } + // truncate in-mem log as well + self.log.truncate(slot - self.start_slot); + first_new = slot; + break; + } + } + + // append new entries into my log, and submit logger actions to make + // new entries durable + let (num_entries, mut num_appended) = (0, 0); + for (slot, mut entry) in entries + .drain((first_new - prev_slot - 1)..entries.len()) + .enumerate() + .map(|(s, e)| (s + first_new, e)) + { + entry.external = false; // not from client + self.log.push(entry.clone()); + self.storage_hub.submit_action( + Self::make_log_action_id( + slot, + prev_slot + num_entries, + Role::Follower, + ), + LogAction::Append { + entry: DurEntry::LogEntry { entry }, + sync: self.config.logger_sync, + }, + )?; + num_appended += 1; + } + + // even if no entries appended, also send back AppendEntriesReply + // as a follower-to-leader reverse heardbeat for peer health + // tracking purposes + if num_appended == 0 { + self.transport_hub.send_msg( + PeerMsg::AppendEntriesReply { + term: self.curr_term, + end_slot: first_new - 1, + success: true, + }, + leader, + )?; + } + + // if leader_commit is larger than my last_commit, update last_commit + if leader_commit > self.last_commit { + self.last_commit = if leader_commit < prev_slot + entries.len() { + leader_commit + } else { + prev_slot + entries.len() + }; + } + + Ok(()) + } + + /// Handler of AppendEntries reply from follower. + fn handle_msg_append_entries_reply( + &mut self, + peer: ReplicaId, + term: Term, + end_slot: usize, + success: bool, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received AcceptEntriesReply <- {} for term {} {}", + peer, term, if success { "ok" } else { "fail" }); + if self.check_term(peer, term)? 
|| self.role != Role::Leader { + return Ok(()); + } + + if success { + // success: update next_slot and match_slot for follower + *self.next_slot.get_mut(&peer).unwrap() = end_slot + 1; + *self.match_slot.get_mut(&peer).unwrap() = end_slot; + + // since we updated some match_slot here, check if any additional + // entries are now considered committed + for slot in + (self.last_commit + 1)..(self.start_slot + self.log.len()) + { + let entry = &self.log[slot - self.start_slot]; + if entry.term != self.curr_term { + continue; // cannot decide commit using non-latest term + } + + let match_cnt = 1 + self + .match_slot + .values() + .filter(|&&s| s >= slot) + .count() as u8; + if match_cnt >= self.quorum_cnt { + // quorum size reached, set last_commit to here + self.last_commit = slot; + } + } + + // submit newly committed commands, if any, for execution + for slot in (self.last_exec + 1)..=self.last_commit { + let entry = &self.log[slot - self.start_slot]; + for (cmd_idx, (_, req)) in entry.reqs.iter().enumerate() { + if let ApiRequest::Req { cmd, .. } = req { + self.state_machine.submit_cmd( + Self::make_command_id(slot, cmd_idx), + cmd.clone(), + )?; + } else { + continue; // ignore other types of requests + } + } + } + } else { + // failed: decrement next_slot for follower and retry + *self.next_slot.get_mut(&peer).unwrap() -= 1; + + let prev_slot = self.next_slot[&peer] - 1; + if prev_slot < self.start_slot { + pf_error!(self.id; "snapshotted slot {} queried", prev_slot); + } + let prev_term = self.log[prev_slot - self.start_slot].term; + let entries = self + .log + .iter() + .skip(self.next_slot[&peer] - self.start_slot) + .cloned() + .collect(); + + self.transport_hub.send_msg( + PeerMsg::AppendEntries { + term: self.curr_term, + prev_slot, + prev_term, + entries, + leader_commit: self.last_commit, + }, + peer, + )?; + pf_trace!(self.id; "sent AppendEntries -> {} with slots {} - {}", + peer, self.next_slot[&peer], + self.start_slot + self.log.len() - 1); + } + + Ok(()) + } + + /// Handler of RequestVote message from candidate. + fn handle_msg_request_vote( + &mut self, + candidate: ReplicaId, + term: Term, + last_slot: usize, + last_term: Term, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received RequestVote <- {} with term {} last {} term {}", + candidate, term, last_slot, last_term); + self.check_term(candidate, term)?; + + // if the given term is smaller than mine, reply false + if term < self.curr_term { + self.transport_hub.send_msg( + PeerMsg::RequestVoteReply { + term: self.curr_term, + granted: false, + }, + candidate, + )?; + pf_trace!(self.id; "sent RequestVote -> {} term {} false", + candidate, self.curr_term); + return Ok(()); + } + + // if I did not vote for anyone else in my current term and that the + // candidate's log is as up-to-date as mine, grant vote + #[allow(clippy::collapsible_if)] + if self.voted_for.is_none() || (self.voted_for.unwrap() == candidate) { + if last_term >= self.log.last().unwrap().term + || (last_term == self.curr_term + && last_slot + 1 >= self.start_slot + self.log.len()) + { + self.transport_hub.send_msg( + PeerMsg::RequestVoteReply { + term: self.curr_term, + granted: true, + }, + candidate, + )?; + pf_trace!(self.id; "sent RequestVote -> {} term {} granted", + candidate, self.curr_term); + } + } + + Ok(()) + } + + /// Handler of RequestVote reply from peer. 
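The commit-index advance in the AppendEntriesReply handler above can be read in isolation: a slot becomes committed once the leader itself plus the followers whose `match_slot` has reached it form a majority. A minimal sketch of that rule (illustrative names; the current-term check is omitted for brevity):

use std::collections::HashMap;

// Advance last_commit to the highest slot replicated on a majority: the
// leader counts itself, plus every follower whose match_slot >= slot.
fn advance_commit(
    match_slot: &HashMap<u8, usize>, // follower ID -> highest replicated slot
    quorum_cnt: usize,               // majority size, e.g. 2 of 3 replicas
    last_commit: usize,
    log_len: usize,                  // slots 0..log_len exist in the in-mem log
) -> usize {
    let mut commit = last_commit;
    for slot in (last_commit + 1)..log_len {
        let cnt = 1 + match_slot.values().filter(|&&s| s >= slot).count();
        if cnt >= quorum_cnt {
            commit = slot;
        }
    }
    commit
}

fn main() {
    // 3 replicas (majority 2): follower 1 has replicated up to slot 4,
    // follower 2 only up to slot 2, so slots 3 and 4 become committed.
    let match_slot = HashMap::from([(1u8, 4usize), (2, 2)]);
    assert_eq!(advance_commit(&match_slot, 2, 2, 6), 4);
}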
+ fn handle_msg_request_vote_reply( + &mut self, + peer: ReplicaId, + term: Term, + granted: bool, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received RequestVoteReply <- {} with term {} {}", + peer, term, if granted { "granted" } else { "false" }); + if self.check_term(peer, term)? || self.role != Role::Candidate { + return Ok(()); + } + + // bookkeep this vote + self.votes_granted.insert(peer); + + // if a majority of servers have voted for me, become the leader + if self.votes_granted.len() as u8 >= self.quorum_cnt { + self.become_the_leader()?; + } + + Ok(()) + } + + /// Synthesized handler of receiving message from peer. + async fn handle_msg_recv( + &mut self, + peer: ReplicaId, + msg: PeerMsg, + ) -> Result<(), SummersetError> { + match msg { + PeerMsg::AppendEntries { + term, + prev_slot, + prev_term, + entries, + leader_commit, + } => { + self.handle_msg_append_entries( + peer, + term, + prev_slot, + prev_term, + entries, + leader_commit, + ) + .await + } + PeerMsg::AppendEntriesReply { + term, + end_slot, + success, + } => self + .handle_msg_append_entries_reply(peer, term, end_slot, success), + PeerMsg::RequestVote { + term, + last_slot, + last_term, + } => self.handle_msg_request_vote(peer, term, last_slot, last_term), + PeerMsg::RequestVoteReply { term, granted } => { + self.handle_msg_request_vote_reply(peer, term, granted) + } + } + } +} + +// RaftReplica state machine execution +impl RaftReplica { + /// Handler of state machine exec result chan recv. + fn handle_cmd_result( + &mut self, + cmd_id: CommandId, + cmd_result: CommandResult, + ) -> Result<(), SummersetError> { + let (slot, cmd_idx) = Self::split_command_id(cmd_id); + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } + assert!(slot < self.start_slot + self.log.len()); + pf_trace!(self.id; "executed cmd in entry at slot {} idx {}", + slot, cmd_idx); + + let entry = &mut self.log[slot - self.start_slot]; + assert!(cmd_idx < entry.reqs.len()); + let (client, ref req) = entry.reqs[cmd_idx]; + + // reply command result back to client + if let ApiRequest::Req { id: req_id, .. } = req { + if entry.external && self.external_api.has_client(client) { + self.external_api.send_reply( + ApiReply::Reply { + id: *req_id, + result: Some(cmd_result), + redirect: None, + }, + client, + )?; + pf_trace!(self.id; "replied -> client {} for slot {} idx {}", + client, slot, cmd_idx); + } + } else { + return logged_err!(self.id; "unexpected API request type"); + } + + // if all commands in this entry have been executed, update last_exec + if cmd_idx == entry.reqs.len() - 1 { + pf_debug!(self.id; "executed all cmds in entry at slot {}", slot); + self.last_exec = slot; + } + + Ok(()) + } +} + +// RaftReplica leader election timeout logic +impl RaftReplica { + /// Becomes a candidate and starts the election procedure. + async fn become_a_candidate(&mut self) -> Result<(), SummersetError> { + if self.role != Role::Follower { + return Ok(()); + } else if let Some(peer) = self.leader { + // mark old leader as dead + if self.peer_alive.get(peer)? 
{ + self.peer_alive.set(peer, false)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + + self.role = Role::Candidate; + + // increment current term and vote for myself + self.curr_term += 1; + self.voted_for = Some(self.id); + self.votes_granted = HashSet::from([self.id]); + pf_info!(self.id; "starting election with term {}...", self.curr_term); + + // also make the two critical fields durable, synchronously + self.storage_hub.submit_action( + 0, + LogAction::Write { + entry: DurEntry::Metadata { + curr_term: self.curr_term, + voted_for: self.voted_for, + }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + loop { + let (action_id, log_result) = self.storage_hub.get_result().await?; + if action_id != 0 { + // normal log action previously in queue; process it + self.handle_log_result(action_id, log_result)?; + } else { + if let LogResult::Write { + offset_ok: true, .. + } = log_result + { + } else { + return logged_err!(self.id; "unexpected log result type or failed write"); + } + break; + } + } + + // reset election timeout timer + self.heard_heartbeat(self.id, self.curr_term)?; + + // send RequestVote messages to all other peers + let last_slot = self.start_slot + self.log.len() - 1; + assert!(last_slot >= self.start_slot); + let last_term = self.log[last_slot - self.start_slot].term; + self.transport_hub.bcast_msg( + PeerMsg::RequestVote { + term: self.curr_term, + last_slot, + last_term, + }, + None, + )?; + pf_trace!(self.id; "broadcast RequestVote with term {} last {} term {}", + self.curr_term, last_slot, last_term); + + Ok(()) + } + + /// Becomes the leader after enough votes granted for me. + fn become_the_leader(&mut self) -> Result<(), SummersetError> { + pf_info!(self.id; "elected as leader with term {}", self.curr_term); + + // clear peers' heartbeat reply counters, and broadcast a heartbeat now + for cnts in self.hb_reply_cnts.values_mut() { + *cnts = (1, 0, 0); + } + self.bcast_heartbeats()?; + + // re-initialize next_slot and match_slot information + for slot in self.next_slot.values_mut() { + *slot = self.start_slot + self.log.len(); + } + for slot in self.match_slot.values_mut() { + *slot = 0; + } + + Ok(()) + } + + /// Broadcasts empty AppendEntries messages as heartbeats to all peers. + fn bcast_heartbeats(&mut self) -> Result<(), SummersetError> { + let prev_slot = self.start_slot + self.log.len() - 1; + assert!(prev_slot >= self.start_slot); + let prev_term = self.log[prev_slot - self.start_slot].term; + self.transport_hub.bcast_msg( + PeerMsg::AppendEntries { + term: self.curr_term, + prev_slot, + prev_term, + entries: vec![], + leader_commit: self.last_commit, + }, + None, + )?; + + // update max heartbeat reply counters and their repetitions seen + for (&peer, cnts) in self.hb_reply_cnts.iter_mut() { + if cnts.0 > cnts.1 { + // more hb replies have been received from this peer; it is + // probably alive + cnts.1 = cnts.0; + cnts.2 = 0; + } else { + // did not receive hb reply from this peer at least for the + // last sent hb from me; increment repetition count + cnts.2 += 1; + let repeat_threshold = (self.config.hb_hear_timeout_min + / self.config.hb_send_interval_ms) + as u8; + if cnts.2 > repeat_threshold { + // did not receive hb reply from this peer for too many + // past hbs sent from me; this peer is probably dead + if self.peer_alive.get(peer)? 
{ + self.peer_alive.set(peer, false)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + } + } + + // I also heard this heartbeat from myself + self.heard_heartbeat(self.id, self.curr_term)?; + + // pf_trace!(self.id; "broadcast heartbeats term {}", self.curr_term); + Ok(()) + } + + /// Chooses a random hb_hear_timeout from the min-max range and kicks off + /// the hb_hear_timer. + fn kickoff_hb_hear_timer(&mut self) -> Result<(), SummersetError> { + let timeout_ms = thread_rng().gen_range( + self.config.hb_hear_timeout_min..=self.config.hb_hear_timeout_max, + ); + + // pf_trace!(self.id; "kickoff hb_hear_timer @ {} ms", timeout_ms); + self.hb_hear_timer + .kickoff(Duration::from_millis(timeout_ms))?; + Ok(()) + } + + /// Heard a heartbeat from some other replica. Resets election timer. + fn heard_heartbeat( + &mut self, + peer: ReplicaId, + _term: Term, + ) -> Result<(), SummersetError> { + if peer != self.id { + self.hb_reply_cnts.get_mut(&peer).unwrap().0 += 1; + if !self.peer_alive.get(peer)? { + self.peer_alive.set(peer, true)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + + // reset hearing timer + self.kickoff_hb_hear_timer()?; + + // pf_trace!(self.id; "heard heartbeat <- {} term {}", peer, term); + Ok(()) + } +} + +// RaftReplica control messages handling +impl RaftReplica { + /// Handler of ResetState control message. + async fn handle_ctrl_reset_state( + &mut self, + durable: bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got restart req"); + + // send leave notification to peers and wait for their replies + self.transport_hub.leave().await?; + + // send leave notification to manager and wait for its reply + self.control_hub.send_ctrl(CtrlMsg::Leave)?; + while self.control_hub.recv_ctrl().await? != CtrlMsg::LeaveReply {} + + // if `durable` is false, truncate backer file + if !durable { + // use 0 as a special log action ID here + self.storage_hub + .submit_action(0, LogAction::Truncate { offset: 0 })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id == 0 { + if log_result + != (LogResult::Truncate { + offset_ok: true, + now_size: 0, + }) + { + return logged_err!(self.id; "failed to truncate log to 0"); + } else { + return Ok(()); + } + } + } + } + + Ok(()) + } + + /// Handler of Pause control message. + fn handle_ctrl_pause( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got pause req"); + *paused = true; + self.control_hub.send_ctrl(CtrlMsg::PauseReply)?; + Ok(()) + } + + /// Handler of Resume control message. + fn handle_ctrl_resume( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got resume req"); + + // reset leader heartbeat timer + self.hb_hear_timer.cancel()?; + self.kickoff_hb_hear_timer()?; + + *paused = false; + self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; + Ok(()) + } + + /// Handler of TakeSnapshot control message. + async fn handle_ctrl_take_snapshot( + &mut self, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server told to take snapshot"); + self.take_new_snapshot().await?; + + self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { + new_start: self.start_slot, + })?; + Ok(()) + } + + /// Synthesized handler of manager control messages. If ok, returns + /// `Some(true)` if decides to terminate and reboot, `Some(false)` if + /// decides to shutdown completely, and `None` if not terminating. 
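The follower-health heuristic in `bcast_heartbeats` above boils down to: reset the repetition counter whenever a fresh reply has arrived since the last heartbeat, otherwise count this heartbeat as unanswered, and suspect the peer once the unanswered run exceeds how many heartbeats fit into the minimum election timeout (600 / 50 = 12 with the defaults). A self-contained sketch with illustrative names:

// One heartbeat tick of the health check for a single peer: returns the
// updated (replied, seen-at-last-send, repetitions) tuple and whether the
// peer should now be suspected dead.
fn update_health(
    (replied, seen_at_last, repeats): (u64, u64, u8),
    hb_hear_timeout_min: u64,
    hb_send_interval_ms: u64,
) -> ((u64, u64, u8), bool) {
    if replied > seen_at_last {
        ((replied, replied, 0), false) // fresh reply seen; not suspected
    } else {
        let threshold = (hb_hear_timeout_min / hb_send_interval_ms) as u8;
        ((replied, seen_at_last, repeats + 1), repeats + 1 > threshold)
    }
}

fn main() {
    // no new replies for 13 heartbeats in a row -> peer becomes suspected
    let (mut cnts, mut suspected) = ((3u64, 3u64, 0u8), false);
    for _ in 0..13 {
        (cnts, suspected) = update_health(cnts, 600, 50);
    }
    assert!(suspected);
}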
+ async fn handle_ctrl_msg( + &mut self, + msg: CtrlMsg, + paused: &mut bool, + ) -> Result, SummersetError> { + match msg { + CtrlMsg::ResetState { durable } => { + self.handle_ctrl_reset_state(durable).await?; + Ok(Some(true)) + } + + CtrlMsg::Pause => { + self.handle_ctrl_pause(paused)?; + Ok(None) + } + + CtrlMsg::Resume => { + self.handle_ctrl_resume(paused)?; + Ok(None) + } + + CtrlMsg::TakeSnapshot => { + self.handle_ctrl_take_snapshot().await?; + Ok(None) + } + + _ => Ok(None), // ignore all other types + } + } +} + +// RaftReplica recovery from durable log +impl RaftReplica { + /// Recover state from durable storage log. + async fn recover_from_log(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.log_offset, 0); + + // first, try to read the first several bytes, which should record + // necessary durable metadata + self.storage_hub + .submit_action(0, LogAction::Read { offset: 0 })?; + let (_, log_result) = self.storage_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: + Some(DurEntry::Metadata { + curr_term, + voted_for, + }), + end_offset, + } => { + self.log_offset = end_offset; + + // recover necessary metadata info + self.curr_term = curr_term; + self.voted_for = voted_for; + + // read out and push all log entries into memory log + loop { + // using 0 as a special log action ID + self.storage_hub.submit_action( + 0, + LogAction::Read { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(DurEntry::LogEntry { mut entry }), + end_offset, + } => { + entry.log_offset = self.log_offset; + self.log.push(entry); + // update log offset + self.log_offset = end_offset; + } + LogResult::Read { entry: None, .. } => { + // end of log reached + break; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + } + } + + LogResult::Read { entry: None, .. } => { + // log file is empty, write initial metadata + self.storage_hub.submit_action( + 0, + LogAction::Write { + entry: DurEntry::Metadata { + curr_term: 0, + voted_for: None, + }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + if let LogResult::Write { + offset_ok: true, + now_size, + } = log_result + { + self.log_offset = now_size; + } else { + return logged_err!(self.id; "unexpected log result type or failed write"); + } + // ... and write the 0-th dummy entry + self.storage_hub.submit_action( + 0, + LogAction::Write { + entry: DurEntry::LogEntry { + entry: LogEntry { + term: 0, + reqs: vec![], + external: false, + log_offset: self.log_offset, + }, + }, + offset: self.log_offset, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + if let LogResult::Write { + offset_ok: true, + now_size, + } = log_result + { + self.log[0].log_offset = self.log_offset; + self.log_offset = now_size; + } else { + return logged_err!(self.id; "unexpected log result type or failed write"); + } + } + + _ => return logged_err!(self.id; "unexpected log result type"), + } + + // do an extra Truncate to remove paritial entry at the end if any + self.storage_hub.submit_action( + 0, + LogAction::Truncate { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + if let LogResult::Truncate { + offset_ok: true, .. 
+ } = log_result + { + if self.log_offset > 0 { + pf_info!(self.id; "recovered from wal log: term {} voted {:?} |log| {}", + self.curr_term, self.voted_for, self.log.len()); + } + Ok(()) + } else { + logged_err!(self.id; "unexpected log result type or failed truncate") + } + } +} + +// RaftReplica snapshotting & GC logic +impl RaftReplica { + /// Dump new key-value pairs to snapshot file. + async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { + // collect all key-value pairs put up to exec_bar + let mut pairs = HashMap::new(); + for slot in self.start_slot..self.last_exec { + let entry = &self.log[slot - self.start_slot]; + for (_, req) in entry.reqs.clone() { + if let ApiRequest::Req { + cmd: Command::Put { key, value }, + .. + } = req + { + pairs.insert(key, value); + } + } + } + + // write the collection to snapshot file + self.snapshot_hub.submit_action( + 0, // using 0 as dummy log action ID + LogAction::Append { + entry: SnapEntry::KVPairSet { pairs }, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + if let LogResult::Append { now_size } = log_result { + self.snap_offset = now_size; + Ok(()) + } else { + logged_err!( + self.id; + "unexpected log result type" + ) + } + } + + /// Discard everything lower than start_slot in durable log. + async fn snapshot_discard_log(&mut self) -> Result<(), SummersetError> { + assert!(!self.log.is_empty()); + let cut_offset = self.log[0].log_offset; + + // discard the log before cut_offset + if cut_offset > 0 { + self.storage_hub + .submit_action(0, LogAction::Discard { offset: cut_offset })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id != 0 { + // normal log action previously in queue; process it + self.handle_log_result(action_id, log_result)?; + } else { + if let LogResult::Discard { + offset_ok: true, + now_size, + } = log_result + { + assert_eq!(self.log_offset - cut_offset, now_size); + self.log_offset = now_size; + } else { + return logged_err!( + self.id; + "unexpected log result type or failed discard" + ); + } + break; + } + } + } + + // update entry.log_offset for all remaining in-mem entries + for entry in &mut self.log { + if entry.log_offset > 0 { + assert!(entry.log_offset >= cut_offset); + entry.log_offset -= cut_offset; + } + } + + Ok(()) + } + + /// Take a snapshot up to current last_exec, then discard the in-mem log up + /// to that index as well as their data in the durable log file. + /// + /// NOTE: the current implementation does not guard against crashes in the + /// middle of taking a snapshot. Production quality implementations should + /// make the snapshotting action "atomic". + /// + /// NOTE: the current implementation does not take care of InstallSnapshot + /// messages (which is needed when some lagging follower has some slot + /// which all other peers have snapshotted); we assume here that failed + /// Accept messages will be retried indefinitely until success before its + /// associated data gets discarded from leader's memory. 
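Taken together, the snapshot helpers leave the file as a `SlotInfo` header at offset 0 followed by one appended `KVPairSet` per snapshot pass, which is what snapshot recovery replays in order. A rough sketch with hypothetical mirror types, assuming `String` keys and values as in `Command::Put`:

use std::collections::HashMap;

// Hypothetical mirror of the snapshot file layout produced above; recovery
// takes the start_slot from the header and folds every key-value set into
// the state machine in file order.
enum SnapSketch {
    SlotInfo { start_slot: usize },
    KVPairSet { pairs: HashMap<String, String> },
}

fn replay(entries: Vec<SnapSketch>) -> (usize, HashMap<String, String>) {
    let mut start_slot = 0;
    let mut state = HashMap::new();
    for entry in entries {
        match entry {
            SnapSketch::SlotInfo { start_slot: s } => start_slot = s,
            SnapSketch::KVPairSet { pairs } => state.extend(pairs),
        }
    }
    (start_slot, state)
}

fn main() {
    let (start, state) = replay(vec![
        SnapSketch::SlotInfo { start_slot: 7 },
        SnapSketch::KVPairSet {
            pairs: HashMap::from([("k1".into(), "v2".into())]),
        },
    ]);
    assert_eq!((start, state.len()), (7, 1));
}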
+ async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { + pf_debug!(self.id; "taking new snapshot: start {} exec {}", + self.start_slot, self.last_exec); + assert!(self.last_exec + 1 >= self.start_slot); + if self.last_exec < self.start_slot + 1 { + // always keep at least one entry in log to make indexing happy + return Ok(()); + } + + // collect and dump all Puts in executed entries + if self.role == Role::Leader { + // NOTE: broadcast heartbeats here to appease followers + self.bcast_heartbeats()?; + } + self.snapshot_dump_kv_pairs().await?; + + // write new slot info entry to the head of snapshot + self.snapshot_hub.submit_action( + 0, + LogAction::Write { + entry: SnapEntry::SlotInfo { + start_slot: self.last_exec, + }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + match log_result { + LogResult::Write { + offset_ok: true, .. + } => {} + _ => { + return logged_err!(self.id; "unexpected log result type or failed write"); + } + } + + // update start_slot and discard all in-mem log entries up to + // last_exec - 1 + self.log.drain(0..(self.last_exec - self.start_slot)); + self.start_slot = self.last_exec; + + // discarding everything lower than start_slot in durable log + if self.role == Role::Leader { + // NOTE: broadcast heartbeats here to appease followers + self.bcast_heartbeats()?; + } + self.snapshot_discard_log().await?; + + // reset the leader heartbeat hear timer + self.kickoff_hb_hear_timer()?; + + pf_info!(self.id; "took snapshot up to: start {}", self.start_slot); + Ok(()) + } + + /// Recover initial state from durable storage snapshot file. + async fn recover_from_snapshot(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.snap_offset, 0); + + // first, try to read the first several bytes, which should record the + // start_slot index + self.snapshot_hub + .submit_action(0, LogAction::Read { offset: 0 })?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(SnapEntry::SlotInfo { start_slot }), + end_offset, + } => { + self.snap_offset = end_offset; + + // recover start_slot info + self.start_slot = start_slot; + + // repeatedly apply key-value pairs + loop { + self.snapshot_hub.submit_action( + 0, + LogAction::Read { + offset: self.snap_offset, + }, + )?; + let (_, log_result) = + self.snapshot_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(SnapEntry::KVPairSet { pairs }), + end_offset, + } => { + // execute Put commands on state machine + for (key, value) in pairs { + self.state_machine.submit_cmd( + 0, + Command::Put { key, value }, + )?; + let _ = self.state_machine.get_result().await?; + } + // update snapshot file offset + self.snap_offset = end_offset; + } + LogResult::Read { entry: None, .. } => { + // end of log reached + break; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + } + + // tell manager about my start_slot index + self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { + new_start: self.start_slot, + })?; + + if self.start_slot > 0 { + pf_info!(self.id; "recovered from snapshot: start {}", + self.start_slot); + } + Ok(()) + } + + LogResult::Read { entry: None, .. } => { + // snapshot file is empty. 
Write a 1 as start_slot and return + self.snapshot_hub.submit_action( + 0, + LogAction::Write { + entry: SnapEntry::SlotInfo { start_slot: 1 }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + if let LogResult::Write { + offset_ok: true, + now_size, + } = log_result + { + self.snap_offset = now_size; + Ok(()) + } else { + logged_err!(self.id; "unexpected log result type or failed write") + } + } + + _ => { + logged_err!(self.id; "unexpected log result type") + } + } + } +} + #[async_trait] impl GenericReplica for RaftReplica { async fn new_and_setup( @@ -257,6 +1638,135 @@ impl GenericReplica for RaftReplica { manager: SocketAddr, config_str: Option<&str>, ) -> Result { + // connect to the cluster manager and get assigned a server ID + let mut control_hub = ControlHub::new_and_setup(manager).await?; + let id = control_hub.me; + let population = control_hub.population; + + // parse protocol-specific configs + let config = parsed_config!(config_str => ReplicaConfigRaft; + batch_interval_ms, max_batch_size, + backer_path, logger_sync, + hb_hear_timeout_min, hb_hear_timeout_max, + hb_send_interval_ms, + snapshot_path, snapshot_interval_s, + perf_storage_a, perf_storage_b, + perf_network_a, perf_network_b)?; + if config.batch_interval_ms == 0 { + return logged_err!( + id; + "invalid config.batch_interval_ms '{}'", + config.batch_interval_ms + ); + } + if config.hb_hear_timeout_min < 100 { + return logged_err!( + id; + "invalid config.hb_hear_timeout_min '{}'", + config.hb_hear_timeout_min + ); + } + if config.hb_hear_timeout_max < config.hb_hear_timeout_min + 100 { + return logged_err!( + id; + "invalid config.hb_hear_timeout_max '{}'", + config.hb_hear_timeout_max + ); + } + if config.hb_send_interval_ms == 0 { + return logged_err!( + id; + "invalid config.hb_send_interval_ms '{}'", + config.hb_send_interval_ms + ); + } + + // setup state machine module + let state_machine = StateMachine::new_and_setup(id).await?; + + // setup storage hub module + let storage_hub = StorageHub::new_and_setup( + id, + Path::new(&config.backer_path), + if config.perf_storage_a == 0 && config.perf_storage_b == 0 { + None + } else { + Some((config.perf_storage_a, config.perf_storage_b)) + }, + ) + .await?; + + // setup transport hub module + let mut transport_hub = TransportHub::new_and_setup( + id, + population, + p2p_addr, + if config.perf_network_a == 0 && config.perf_network_b == 0 { + None + } else { + Some((config.perf_network_a, config.perf_network_b)) + }, + ) + .await?; + + // ask for the list of peers to proactively connect to. Do this after + // transport hub has been set up, so that I will be able to accept + // later peer connections + control_hub.send_ctrl(CtrlMsg::NewServerJoin { + id, + protocol: SmrProtocol::Raft, + api_addr, + p2p_addr, + })?; + let to_peers = if let CtrlMsg::ConnectToPeers { to_peers, .. } = + control_hub.recv_ctrl().await? 
+        {
+            to_peers
+        } else {
+            return logged_err!(id; "unexpected ctrl msg type received");
+        };
+
+        // proactively connect to some peers, then wait until the entire
+        // population has connected with me
+        for (peer, addr) in to_peers {
+            transport_hub.connect_to_peer(peer, addr).await?;
+        }
+        transport_hub.wait_for_group(population).await?;
+
+        // setup snapshot hub module
+        let snapshot_hub = StorageHub::new_and_setup(
+            id,
+            Path::new(&config.snapshot_path),
+            None,
+        )
+        .await?;
+
+        // setup external API module, ready to take in client requests
+        let external_api = ExternalApi::new_and_setup(
+            id,
+            api_addr,
+            Duration::from_millis(config.batch_interval_ms),
+            config.max_batch_size,
+        )
+        .await?;
+
+        let mut hb_send_interval =
+            time::interval(Duration::from_millis(config.hb_send_interval_ms));
+        hb_send_interval.set_missed_tick_behavior(MissedTickBehavior::Skip);
+
+        let mut snapshot_interval = time::interval(Duration::from_secs(
+            if config.snapshot_interval_s > 0 {
+                config.snapshot_interval_s
+            } else {
+                60 // dummy non-zero value to make `time::interval` happy
+            },
+        ));
+        snapshot_interval.set_missed_tick_behavior(MissedTickBehavior::Skip);
+
+        let hb_reply_cnts = (0..population)
+            .filter_map(|p| if p == id { None } else { Some((p, (1, 0, 0))) })
+            .collect();
+
         Ok(RaftReplica {
             id,
             population,
@@ -264,6 +1774,39 @@ impl GenericReplica for RaftReplica {
             config,
             _api_addr: api_addr,
             _p2p_addr: p2p_addr,
+            control_hub,
+            external_api,
+            state_machine,
+            storage_hub,
+            snapshot_hub,
+            transport_hub,
+            role: Role::Follower,
+            leader: None,
+            hb_hear_timer: Timer::new(),
+            hb_send_interval,
+            hb_reply_cnts,
+            peer_alive: Bitmap::new(population, true),
+            curr_term: 0,
+            voted_for: None,
+            votes_granted: HashSet::new(),
+            log: vec![LogEntry {
+                term: 0,
+                reqs: vec![],
+                external: false,
+                log_offset: 0,
+            }],
+            start_slot: 0,
+            snapshot_interval,
+            last_commit: 0,
+            last_exec: 0,
+            next_slot: (0..population)
+                .filter_map(|s| if s == id { None } else { Some((s, 1)) })
+                .collect(),
+            match_slot: (0..population)
+                .filter_map(|s| if s == id { None } else { Some((s, 0)) })
+                .collect(),
+            log_offset: 0,
+            snap_offset: 0,
         })
     }
 
@@ -271,6 +1814,121 @@ impl GenericReplica for RaftReplica {
         &mut self,
         mut rx_term: watch::Receiver<bool>,
     ) -> Result<bool, SummersetError> {
+        // recover state from durable snapshot file
+        self.recover_from_snapshot().await?;
+
+        // recover the tail-piece memory log & state from remaining durable log
+        self.recover_from_log().await?;
+
+        // kick off leader activity hearing timer
+        self.kickoff_hb_hear_timer()?;
+
+        // main event loop
+        let mut paused = false;
+        loop {
+            tokio::select!
{ + // client request batch + req_batch = self.external_api.get_req_batch(), if !paused => { + if let Err(e) = req_batch { + pf_error!(self.id; "error getting req batch: {}", e); + continue; + } + let req_batch = req_batch.unwrap(); + if let Err(e) = self.handle_req_batch(req_batch) { + pf_error!(self.id; "error handling req batch: {}", e); + } + }, + + // durable logging result + log_result = self.storage_hub.get_result(), if !paused => { + if let Err(e) = log_result { + pf_error!(self.id; "error getting log result: {}", e); + continue; + } + let (action_id, log_result) = log_result.unwrap(); + if let Err(e) = self.handle_log_result(action_id, log_result) { + pf_error!(self.id; "error handling log result {}: {}", + action_id, e); + } + }, + + // message from peer + msg = self.transport_hub.recv_msg(), if !paused => { + if let Err(e) = msg { + pf_error!(self.id; "error receiving peer msg: {}", e); + continue; + } + let (peer, msg) = msg.unwrap(); + if let Err(e) = self.handle_msg_recv(peer, msg).await { + pf_error!(self.id; "error handling msg recv <- {}: {}", peer, e); + } + }, + + // state machine execution result + cmd_result = self.state_machine.get_result(), if !paused => { + if let Err(e) = cmd_result { + pf_error!(self.id; "error getting cmd result: {}", e); + continue; + } + let (cmd_id, cmd_result) = cmd_result.unwrap(); + if let Err(e) = self.handle_cmd_result(cmd_id, cmd_result) { + pf_error!(self.id; "error handling cmd result {}: {}", cmd_id, e); + } + }, + + // leader inactivity timeout + _ = self.hb_hear_timer.timeout(), if !paused => { + if let Err(e) = self.become_a_candidate().await { + pf_error!(self.id; "error becoming a candidate: {}", e); + } + }, + + // leader sending heartbeat + _ = self.hb_send_interval.tick(), if !paused + && self.role == Role::Leader => { + if let Err(e) = self.bcast_heartbeats() { + pf_error!(self.id; "error broadcasting heartbeats: {}", e); + } + }, + + // autonomous snapshot taking timeout + _ = self.snapshot_interval.tick(), if !paused + && self.config.snapshot_interval_s > 0 => { + if let Err(e) = self.take_new_snapshot().await { + pf_error!(self.id; "error taking a new snapshot: {}", e); + } else { + self.control_hub.send_ctrl( + CtrlMsg::SnapshotUpTo { new_start: self.start_slot } + )?; + } + }, + + // manager control message + ctrl_msg = self.control_hub.recv_ctrl() => { + if let Err(e) = ctrl_msg { + pf_error!(self.id; "error getting ctrl msg: {}", e); + continue; + } + let ctrl_msg = ctrl_msg.unwrap(); + match self.handle_ctrl_msg(ctrl_msg, &mut paused).await { + Ok(terminate) => { + if let Some(restart) = terminate { + return Ok(restart); + } + }, + Err(e) => { + pf_error!(self.id; "error handling ctrl msg: {}", e); + } + } + }, + + // receiving termination signal + _ = rx_term.changed() => { + pf_warn!(self.id; "server caught termination signal"); + return Ok(false); + } + } + } } fn id(&self) -> ReplicaId { diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 3280baf1..d10bf38f 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -1879,7 +1879,7 @@ impl RSPaxosReplica { Ok(()) } - /// Take a snapshot up to current exec_idx, then discard the in-mem log up + /// Take a snapshot up to current exec_bar, then discard the in-mem log up /// to that index as well as outdate entries in the durable WAL log file. /// /// NOTE: the current implementation does not guard against crashes in the @@ -1924,7 +1924,7 @@ impl RSPaxosReplica { offset_ok: true, .. 
} => {} _ => { - return logged_err!(self.id; "unexpected log result type or failed truncate"); + return logged_err!(self.id; "unexpected log result type or failed write"); } } @@ -2043,7 +2043,7 @@ impl RSPaxosReplica { self.snap_offset = now_size; Ok(()) } else { - logged_err!(self.id; "unexpected log result type or failed truncate") + logged_err!(self.id; "unexpected log result type or failed write") } } From 63591b66127ac8cae988b80d1a15d8c5266f905f Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 8 Oct 2023 16:41:38 -0500 Subject: [PATCH 86/89] finish raft impl & debugging --- README.md | 5 +- src/manager/reigner.rs | 2 +- src/protocols/crossword.rs | 14 +- src/protocols/multipaxos.rs | 14 +- src/protocols/raft.rs | 238 ++++++++++++++------ src/protocols/rs_paxos.rs | 14 +- src/server/storage.rs | 79 +++++-- summerset_client/src/drivers/closed_loop.rs | 148 ++++++------ summerset_client/src/drivers/open_loop.rs | 60 ++--- 9 files changed, 379 insertions(+), 195 deletions(-) diff --git a/README.md b/README.md index 89481b4a..79a88675 100644 --- a/README.md +++ b/README.md @@ -160,8 +160,9 @@ Complete cluster management and benchmarking scripts are available in another re - [x] TLA+ spec - [x] implementation of RS-Paxos - [ ] TLA+ spec -- [ ] implementation of Raft - - [ ] snapshotting & garbage collection +- [x] implementation of Raft + - [x] state persistence & restart check + - [x] snapshotting & garbage collection - [ ] membership discovery & view changes - [ ] TLA+ spec - [ ] implementation of CRaft diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index 41ae38ec..3be28cde 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -21,7 +21,7 @@ use tokio::task::JoinHandle; /// Control message from/to servers. Control traffic could be bidirectional: /// some initiated by the manager and some by servers. -// TODO: later add leader change, membership change, etc. +// TODO: later add membership/view change, link drop, etc. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] pub enum CtrlMsg { /// Server -> Manager: new server up, requesting a list of peers' addresses diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 71942eb5..41d44a9b 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -1735,6 +1735,8 @@ impl CrosswordReplica { /// Chooses a random hb_hear_timeout from the min-max range and kicks off /// the hb_hear_timer. 
fn kickoff_hb_hear_timer(&mut self) -> Result<(), SummersetError> { + self.hb_hear_timer.cancel()?; + let timeout_ms = thread_rng().gen_range( self.config.hb_hear_timeout_min..=self.config.hb_hear_timeout_max, ); @@ -2028,7 +2030,6 @@ impl CrosswordReplica { pf_warn!(self.id; "server got resume req"); // reset leader heartbeat timer - self.hb_hear_timer.cancel()?; self.kickoff_hb_hear_timer()?; *paused = false; @@ -2300,8 +2301,13 @@ impl CrosswordReplica { // discard the log before cut_offset if cut_offset > 0 { - self.storage_hub - .submit_action(0, LogAction::Discard { offset: cut_offset })?; + self.storage_hub.submit_action( + 0, + LogAction::Discard { + offset: cut_offset, + keep: 0, + }, + )?; loop { let (action_id, log_result) = self.storage_hub.get_result().await?; @@ -2797,7 +2803,7 @@ impl GenericReplica for CrosswordReplica { // autonomous snapshot taking timeout _ = self.snapshot_interval.tick(), if !paused - && self.config.snapshot_interval_s > 0 => { + && self.config.snapshot_interval_s > 0 => { if let Err(e) = self.take_new_snapshot().await { pf_error!(self.id; "error taking a new snapshot: {}", e); } else { diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index a19e1d48..76955978 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -1204,6 +1204,8 @@ impl MultiPaxosReplica { /// Chooses a random hb_hear_timeout from the min-max range and kicks off /// the hb_hear_timer. fn kickoff_hb_hear_timer(&mut self) -> Result<(), SummersetError> { + self.hb_hear_timer.cancel()?; + let timeout_ms = thread_rng().gen_range( self.config.hb_hear_timeout_min..=self.config.hb_hear_timeout_max, ); @@ -1331,7 +1333,6 @@ impl MultiPaxosReplica { pf_warn!(self.id; "server got resume req"); // reset leader heartbeat timer - self.hb_hear_timer.cancel()?; self.kickoff_hb_hear_timer()?; *paused = false; @@ -1586,8 +1587,13 @@ impl MultiPaxosReplica { // discard the log before cut_offset if cut_offset > 0 { - self.storage_hub - .submit_action(0, LogAction::Discard { offset: cut_offset })?; + self.storage_hub.submit_action( + 0, + LogAction::Discard { + offset: cut_offset, + keep: 0, + }, + )?; loop { let (action_id, log_result) = self.storage_hub.get_result().await?; @@ -2049,7 +2055,7 @@ impl GenericReplica for MultiPaxosReplica { // autonomous snapshot taking timeout _ = self.snapshot_interval.tick(), if !paused - && self.config.snapshot_interval_s > 0 => { + && self.config.snapshot_interval_s > 0 => { if let Err(e) = self.take_new_snapshot().await { pf_error!(self.id; "error taking a new snapshot: {}", e); } else { diff --git a/src/protocols/raft.rs b/src/protocols/raft.rs index 4693b5cf..5dc54a83 100644 --- a/src/protocols/raft.rs +++ b/src/protocols/raft.rs @@ -5,6 +5,7 @@ //! - //! - +use std::cmp; use std::collections::{HashMap, HashSet}; use std::path::Path; use std::net::SocketAddr; @@ -159,6 +160,8 @@ enum PeerMsg { prev_term: Term, entries: Vec, leader_commit: usize, + /// For conservative snapshotting purpose. + last_snap: usize, }, /// AppendEntries reply from follower to leader. @@ -277,9 +280,18 @@ pub struct RaftReplica { /// For each server, index of the highest log entry known to be replicated. match_slot: HashMap, + /// Slot index up to which it is safe to take snapshot. + /// NOTE: we are taking a conservative approach here that a snapshot + /// covering an entry can be taken only when all servers have durably + /// committed that entry. + last_snap: usize, + /// Current durable log file end offset. 
    log_offset: usize,
 
+    /// Current durable log offset at the end of the metadata entry.
+    log_meta_end: usize,
+
     /// Current durable snapshot file offset.
     snap_offset: usize,
 }
@@ -346,6 +358,7 @@ impl RaftReplica {
             self.heard_heartbeat(peer, term)?; // refresh election timer
             if self.role != Role::Follower {
                 self.role = Role::Follower;
+                pf_trace!(self.id; "converted back to follower");
                 Ok(true)
             } else {
                 Ok(false)
@@ -398,7 +411,7 @@ impl RaftReplica {
             term: self.curr_term,
             reqs: req_batch,
             external: true,
-            log_offset: self.log_offset,
+            log_offset: 0,
         };
         let slot = self.start_slot + self.log.len();
         self.log.push(entry.clone());
@@ -434,13 +447,13 @@ impl RaftReplica {
 
         // broadcast AppendEntries messages to followers
         for peer in 0..self.population {
-            if peer == self.id {
+            if peer == self.id || self.next_slot[&peer] < 1 {
                 continue;
             }
             let prev_slot = self.next_slot[&peer] - 1;
             if prev_slot < self.start_slot {
-                pf_error!(self.id; "snapshotted slot {} queried", prev_slot);
+                return logged_err!(self.id; "snapshotted slot {} queried", prev_slot);
             }
             let prev_term = self.log[prev_slot - self.start_slot].term;
             let entries = self
@@ -458,6 +471,7 @@ impl RaftReplica {
                     prev_term,
                     entries,
                     leader_commit: self.last_commit,
+                    last_snap: self.last_snap,
                 },
                 peer,
             )?;
@@ -486,21 +500,6 @@ impl RaftReplica {
                slot, slot_e);
         assert!(slot <= slot_e);
 
-        // submit newly committed entry for state machine execution
-        if slot > self.last_exec && slot <= self.last_commit {
-            let entry = &self.log[slot - self.start_slot];
-            for (cmd_idx, (_, req)) in entry.reqs.iter().enumerate() {
-                if let ApiRequest::Req { cmd, .. } = req {
-                    self.state_machine.submit_cmd(
-                        Self::make_command_id(slot, cmd_idx),
-                        cmd.clone(),
-                    )?;
-                } else {
-                    continue; // ignore other types of requests
-                }
-            }
-        }
-
         // if all consecutive entries are made durable, reply AppendEntries
         // success back to leader
         if slot == slot_e {
@@ -534,10 +533,11 @@ impl RaftReplica {
         assert!(slot_e < self.start_slot + self.log.len());
 
         if let LogResult::Append { now_size } = log_result {
-            assert_eq!(
-                self.log[slot - self.start_slot].log_offset,
-                self.log_offset
-            );
+            let entry = &mut self.log[slot - self.start_slot];
+            if entry.log_offset != self.log_offset {
+                // entry has incorrect log_offset bookkept; update it
+                entry.log_offset = self.log_offset;
+            }
             assert!(now_size > self.log_offset);
             self.log_offset = now_size;
         } else {
@@ -557,6 +557,7 @@ impl RaftReplica {
 // RaftReplica peer-peer messages handling
 impl RaftReplica {
     /// Handler of AppendEntries message from leader.
+    #[allow(clippy::too_many_arguments)]
     async fn handle_msg_append_entries(
         &mut self,
         leader: ReplicaId,
@@ -565,9 +566,12 @@ impl RaftReplica {
         prev_term: Term,
         mut entries: Vec<LogEntry>,
         leader_commit: usize,
+        last_snap: usize,
     ) -> Result<(), SummersetError> {
-        pf_trace!(self.id; "received AcceptEntries <- {} for slot > {} term {}",
-                           leader, prev_slot, term);
+        if !entries.is_empty() {
+            pf_trace!(self.id; "received AcceptEntries <- {} for slots {} - {} term {}",
+                               leader, prev_slot + 1, prev_slot + entries.len(), term);
+        }
         if self.check_term(leader, term)?
|| self.role != Role::Follower { return Ok(()); } @@ -587,6 +591,8 @@ impl RaftReplica { }, leader, )?; + pf_trace!(self.id; "sent AcceptEntriesReply -> {} term {} end_slot {} fail", + leader, self.curr_term, prev_slot); if term >= self.curr_term { // also refresh heartbeat timer here since the "decrementing" @@ -651,13 +657,14 @@ impl RaftReplica { // append new entries into my log, and submit logger actions to make // new entries durable - let (num_entries, mut num_appended) = (0, 0); + let (num_entries, mut num_appended) = (entries.len(), 0); for (slot, mut entry) in entries .drain((first_new - prev_slot - 1)..entries.len()) .enumerate() .map(|(s, e)| (s + first_new, e)) { - entry.external = false; // not from client + entry.log_offset = 0; + self.log.push(entry.clone()); self.storage_hub.submit_action( Self::make_log_action_id( @@ -670,6 +677,7 @@ impl RaftReplica { sync: self.config.logger_sync, }, )?; + num_appended += 1; } @@ -689,11 +697,29 @@ impl RaftReplica { // if leader_commit is larger than my last_commit, update last_commit if leader_commit > self.last_commit { - self.last_commit = if leader_commit < prev_slot + entries.len() { - leader_commit - } else { - prev_slot + entries.len() - }; + let new_commit = cmp::min(leader_commit, prev_slot + entries.len()); + + // submit newly committed entries for state machine execution + for slot in (self.last_commit + 1)..=new_commit { + let entry = &self.log[slot - self.start_slot]; + for (cmd_idx, (_, req)) in entry.reqs.iter().enumerate() { + if let ApiRequest::Req { cmd, .. } = req { + self.state_machine.submit_cmd( + Self::make_command_id(slot, cmd_idx), + cmd.clone(), + )?; + } else { + continue; // ignore other types of requests + } + } + } + + self.last_commit = new_commit; + } + + // if last_snap is larger than mine, update last_snap + if last_snap > self.last_snap { + self.last_snap = last_snap; } Ok(()) @@ -707,11 +733,14 @@ impl RaftReplica { end_slot: usize, success: bool, ) -> Result<(), SummersetError> { - pf_trace!(self.id; "received AcceptEntriesReply <- {} for term {} {}", - peer, term, if success { "ok" } else { "fail" }); + if !success || self.match_slot[&peer] != end_slot { + pf_trace!(self.id; "received AcceptEntriesReply <- {} for term {} {}", + peer, term, if success { "ok" } else { "fail" }); + } if self.check_term(peer, term)? || self.role != Role::Leader { return Ok(()); } + self.heard_heartbeat(peer, term)?; if success { // success: update next_slot and match_slot for follower @@ -720,6 +749,7 @@ impl RaftReplica { // since we updated some match_slot here, check if any additional // entries are now considered committed + let mut new_commit = self.last_commit; for slot in (self.last_commit + 1)..(self.start_slot + self.log.len()) { @@ -734,13 +764,13 @@ impl RaftReplica { .filter(|&&s| s >= slot) .count() as u8; if match_cnt >= self.quorum_cnt { - // quorum size reached, set last_commit to here - self.last_commit = slot; + // quorum size reached, set new_commit to here + new_commit = slot; } } // submit newly committed commands, if any, for execution - for slot in (self.last_exec + 1)..=self.last_commit { + for slot in (self.last_commit + 1)..=new_commit { let entry = &self.log[slot - self.start_slot]; for (cmd_idx, (_, req)) in entry.reqs.iter().enumerate() { if let ApiRequest::Req { cmd, .. 
} = req { @@ -753,13 +783,34 @@ impl RaftReplica { } } } + + self.last_commit = new_commit; + + // also check if any additional entries are safe to snapshot + for slot in (self.last_snap + 1)..=end_slot { + let match_cnt = 1 + self + .match_slot + .values() + .filter(|&&s| s >= slot) + .count() as u8; + if match_cnt == self.population { + // all servers have durably stored this entry + self.last_snap = slot; + } + } } else { // failed: decrement next_slot for follower and retry + // NOTE: the optimization of fast-backward bypassing (instead of + // always decrementing by 1) not implemented + if self.next_slot[&peer] == 1 { + return Ok(()); // cannot move backward any more + } *self.next_slot.get_mut(&peer).unwrap() -= 1; let prev_slot = self.next_slot[&peer] - 1; if prev_slot < self.start_slot { - pf_error!(self.id; "snapshotted slot {} queried", prev_slot); + *self.next_slot.get_mut(&peer).unwrap() += 1; + return logged_err!(self.id; "snapshotted slot {} queried", prev_slot); } let prev_term = self.log[prev_slot - self.start_slot].term; let entries = self @@ -776,6 +827,7 @@ impl RaftReplica { prev_term, entries, leader_commit: self.last_commit, + last_snap: self.last_snap, }, peer, )?; @@ -808,7 +860,7 @@ impl RaftReplica { }, candidate, )?; - pf_trace!(self.id; "sent RequestVote -> {} term {} false", + pf_trace!(self.id; "sent RequestVoteReply -> {} term {} false", candidate, self.curr_term); return Ok(()); } @@ -828,8 +880,12 @@ impl RaftReplica { }, candidate, )?; - pf_trace!(self.id; "sent RequestVote -> {} term {} granted", + pf_trace!(self.id; "sent RequestVoteReply -> {} term {} granted", candidate, self.curr_term); + + // hear a heartbeat here to prevent me from starting an + // election soon + self.heard_heartbeat(candidate, term)?; } } @@ -873,6 +929,7 @@ impl RaftReplica { prev_term, entries, leader_commit, + last_snap, } => { self.handle_msg_append_entries( peer, @@ -881,6 +938,7 @@ impl RaftReplica { prev_term, entries, leader_commit, + last_snap, ) .await } @@ -1024,7 +1082,8 @@ impl RaftReplica { /// Becomes the leader after enough votes granted for me. fn become_the_leader(&mut self) -> Result<(), SummersetError> { - pf_info!(self.id; "elected as leader with term {}", self.curr_term); + pf_info!(self.id; "elected to be leader with term {}", self.curr_term); + self.role = Role::Leader; // clear peers' heartbeat reply counters, and broadcast a heartbeat now for cnts in self.hb_reply_cnts.values_mut() { @@ -1055,6 +1114,7 @@ impl RaftReplica { prev_term, entries: vec![], leader_commit: self.last_commit, + last_snap: self.last_snap, }, None, )?; @@ -1094,6 +1154,8 @@ impl RaftReplica { /// Chooses a random hb_hear_timeout from the min-max range and kicks off /// the hb_hear_timer. 
fn kickoff_hb_hear_timer(&mut self) -> Result<(), SummersetError> { + self.hb_hear_timer.cancel()?; + let timeout_ms = thread_rng().gen_range( self.config.hb_hear_timeout_min..=self.config.hb_hear_timeout_max, ); @@ -1187,7 +1249,6 @@ impl RaftReplica { pf_warn!(self.id; "server got resume req"); // reset leader heartbeat timer - self.hb_hear_timer.cancel()?; self.kickoff_hb_hear_timer()?; *paused = false; @@ -1264,6 +1325,7 @@ impl RaftReplica { end_offset, } => { self.log_offset = end_offset; + self.log_meta_end = end_offset; // recover necessary metadata info self.curr_term = curr_term; @@ -1286,9 +1348,9 @@ impl RaftReplica { end_offset, } => { entry.log_offset = self.log_offset; + entry.external = false; // no re-replying to clients self.log.push(entry); - // update log offset - self.log_offset = end_offset; + self.log_offset = end_offset; // update log offset } LogResult::Read { entry: None, .. } => { // end of log reached @@ -1321,10 +1383,18 @@ impl RaftReplica { } = log_result { self.log_offset = now_size; + self.log_meta_end = now_size; } else { return logged_err!(self.id; "unexpected log result type or failed write"); } - // ... and write the 0-th dummy entry + // ... and push a 0-th dummy entry into in-mem log + self.log.push(LogEntry { + term: 0, + reqs: vec![], + external: false, + log_offset: 0, + }); + // ... and write the 0-th dummy entry durably self.storage_hub.submit_action( 0, LogAction::Write { @@ -1382,10 +1452,13 @@ impl RaftReplica { // RaftReplica snapshotting & GC logic impl RaftReplica { /// Dump new key-value pairs to snapshot file. - async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { + async fn snapshot_dump_kv_pairs( + &mut self, + new_start_slot: usize, + ) -> Result<(), SummersetError> { // collect all key-value pairs put up to exec_bar let mut pairs = HashMap::new(); - for slot in self.start_slot..self.last_exec { + for slot in self.start_slot..new_start_slot { let entry = &self.log[slot - self.start_slot]; for (_, req) in entry.reqs.clone() { if let ApiRequest::Req { @@ -1420,13 +1493,26 @@ impl RaftReplica { /// Discard everything lower than start_slot in durable log. 
async fn snapshot_discard_log(&mut self) -> Result<(), SummersetError> { + // drain things currently in storage_hub's recv chan if head of log's + // durable file offset has not been set yet assert!(!self.log.is_empty()); + while self.log[0].log_offset == 0 { + let (action_id, log_result) = self.storage_hub.get_result().await?; + self.handle_log_result(action_id, log_result)?; + } let cut_offset = self.log[0].log_offset; - // discard the log before cut_offset + // discard the log after meta_end and before cut_offset if cut_offset > 0 { - self.storage_hub - .submit_action(0, LogAction::Discard { offset: cut_offset })?; + assert!(self.log_meta_end > 0); + assert!(self.log_meta_end <= cut_offset); + self.storage_hub.submit_action( + 0, + LogAction::Discard { + offset: cut_offset, + keep: self.log_meta_end, + }, + )?; loop { let (action_id, log_result) = self.storage_hub.get_result().await?; @@ -1439,7 +1525,10 @@ impl RaftReplica { now_size, } = log_result { - assert_eq!(self.log_offset - cut_offset, now_size); + assert_eq!( + self.log_offset - cut_offset + self.log_meta_end, + now_size + ); self.log_offset = now_size; } else { return logged_err!( @@ -1456,7 +1545,7 @@ impl RaftReplica { for entry in &mut self.log { if entry.log_offset > 0 { assert!(entry.log_offset >= cut_offset); - entry.log_offset -= cut_offset; + entry.log_offset -= cut_offset - self.log_meta_end; } } @@ -1472,15 +1561,18 @@ impl RaftReplica { /// /// NOTE: the current implementation does not take care of InstallSnapshot /// messages (which is needed when some lagging follower has some slot - /// which all other peers have snapshotted); we assume here that failed - /// Accept messages will be retried indefinitely until success before its - /// associated data gets discarded from leader's memory. + /// which all other peers have snapshotted); we take the conservative + /// approach that a snapshot is only taken when data has been durably + /// committed on all servers. 
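
// [Editor's aside, not part of the patch.] The conservative rule described in
// the NOTE above amounts to: the leader may only snapshot up to the highest
// slot that every replica has already durably matched, i.e. the minimum over
// all match_slot values (the leader trivially holds its own entries). A
// standalone sketch with simplified types, for illustration only; the names
// here are hypothetical and do not appear in the patch:
use std::collections::HashMap;

type ReplicaId = u8;

fn conservative_snapshot_bar(
    my_last_slot: usize,
    match_slot: &HashMap<ReplicaId, usize>,
) -> usize {
    match_slot
        .values()
        .copied()
        .chain(std::iter::once(my_last_slot)) // include the leader itself
        .min()
        .unwrap_or(0)
}

// e.g. with match_slot = {1: 7, 2: 5} and my_last_slot = 9, the bar is 5:
// slots up to 5 are stored on all three replicas and are safe to snapshot,
// which matches the "match_cnt == population" counting rule used by the patch.
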
async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { - pf_debug!(self.id; "taking new snapshot: start {} exec {}", - self.start_slot, self.last_exec); + pf_debug!(self.id; "taking new snapshot: start {} exec {} snap {}", + self.start_slot, self.last_exec, self.last_snap); assert!(self.last_exec + 1 >= self.start_slot); - if self.last_exec < self.start_slot + 1 { - // always keep at least one entry in log to make indexing happy + + // always keep at least one entry in log to make indexing happy + let new_start_slot = cmp::min(self.last_snap, self.last_exec); + assert!(new_start_slot < self.start_slot + self.log.len()); + if new_start_slot < self.start_slot + 1 { return Ok(()); } @@ -1489,14 +1581,14 @@ impl RaftReplica { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } - self.snapshot_dump_kv_pairs().await?; + self.snapshot_dump_kv_pairs(new_start_slot).await?; // write new slot info entry to the head of snapshot self.snapshot_hub.submit_action( 0, LogAction::Write { entry: SnapEntry::SlotInfo { - start_slot: self.last_exec, + start_slot: new_start_slot, }, offset: 0, sync: self.config.logger_sync, @@ -1513,9 +1605,9 @@ impl RaftReplica { } // update start_slot and discard all in-mem log entries up to - // last_exec - 1 - self.log.drain(0..(self.last_exec - self.start_slot)); - self.start_slot = self.last_exec; + // new_start_slot + self.log.drain(0..(new_start_slot - self.start_slot)); + self.start_slot = new_start_slot; // discarding everything lower than start_slot in durable log if self.role == Role::Leader { @@ -1550,6 +1642,11 @@ impl RaftReplica { // recover start_slot info self.start_slot = start_slot; + if start_slot > 0 { + self.last_commit = start_slot - 1; + self.last_exec = start_slot - 1; + self.last_snap = start_slot - 1; + } // repeatedly apply key-value pairs loop { @@ -1601,11 +1698,11 @@ impl RaftReplica { } LogResult::Read { entry: None, .. } => { - // snapshot file is empty. Write a 1 as start_slot and return + // snapshot file is empty. Write a 0 as start_slot and return self.snapshot_hub.submit_action( 0, LogAction::Write { - entry: SnapEntry::SlotInfo { start_slot: 1 }, + entry: SnapEntry::SlotInfo { start_slot: 0 }, offset: 0, sync: self.config.logger_sync, }, @@ -1789,12 +1886,7 @@ impl GenericReplica for RaftReplica { curr_term: 0, voted_for: None, votes_granted: HashSet::new(), - log: vec![LogEntry { - term: 0, - reqs: vec![], - external: false, - log_offset: 0, - }], + log: vec![], start_slot: 0, snapshot_interval, last_commit: 0, @@ -1805,7 +1897,9 @@ impl GenericReplica for RaftReplica { match_slot: (0..population) .filter_map(|s| if s == id { None } else { Some((s, 0)) }) .collect(), + last_snap: 0, log_offset: 0, + log_meta_end: 0, snap_offset: 0, }) } @@ -1893,7 +1987,7 @@ impl GenericReplica for RaftReplica { // autonomous snapshot taking timeout _ = self.snapshot_interval.tick(), if !paused - && self.config.snapshot_interval_s > 0 => { + && self.config.snapshot_interval_s > 0 => { if let Err(e) = self.take_new_snapshot().await { pf_error!(self.id; "error taking a new snapshot: {}", e); } else { diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index d10bf38f..0f7022f4 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -1442,6 +1442,8 @@ impl RSPaxosReplica { /// Chooses a random hb_hear_timeout from the min-max range and kicks off /// the hb_hear_timer. 
fn kickoff_hb_hear_timer(&mut self) -> Result<(), SummersetError> { + self.hb_hear_timer.cancel()?; + let timeout_ms = thread_rng().gen_range( self.config.hb_hear_timeout_min..=self.config.hb_hear_timeout_max, ); @@ -1569,7 +1571,6 @@ impl RSPaxosReplica { pf_warn!(self.id; "server got resume req"); // reset leader heartbeat timer - self.hb_hear_timer.cancel()?; self.kickoff_hb_hear_timer()?; *paused = false; @@ -1841,8 +1842,13 @@ impl RSPaxosReplica { // discard the log before cut_offset if cut_offset > 0 { - self.storage_hub - .submit_action(0, LogAction::Discard { offset: cut_offset })?; + self.storage_hub.submit_action( + 0, + LogAction::Discard { + offset: cut_offset, + keep: 0, + }, + )?; loop { let (action_id, log_result) = self.storage_hub.get_result().await?; @@ -2325,7 +2331,7 @@ impl GenericReplica for RSPaxosReplica { // autonomous snapshot taking timeout _ = self.snapshot_interval.tick(), if !paused - && self.config.snapshot_interval_s > 0 => { + && self.config.snapshot_interval_s > 0 => { if let Err(e) = self.take_new_snapshot().await { pf_error!(self.id; "error taking a new snapshot: {}", e); } else { diff --git a/src/server/storage.rs b/src/server/storage.rs index a11d6ba6..06bc0430 100644 --- a/src/server/storage.rs +++ b/src/server/storage.rs @@ -44,8 +44,9 @@ pub enum LogAction { /// Truncate the log at given offset, keeping the head part. Truncate { offset: usize }, - /// Discard the log before given offset, keeping the tail part. - Discard { offset: usize }, + /// Discard the log before given offset, keeping the tail part (and + /// optionally a head part). + Discard { offset: usize, keep: usize }, } /// Action result returned by the logger. @@ -337,12 +338,14 @@ where } } - /// Discard the file before given index, keeping the tail part. + /// Discard the file before given index, keeping the tail part (and + /// optionally a head part). 
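
// [Editor's aside, not part of the patch.] With the new `keep` parameter, a
// successful Discard { offset, keep } drops the byte range [keep, offset) and
// moves the tail [offset, file_size) down to start at `keep`, so the resulting
// file size is keep + (file_size - offset). A tiny standalone helper, for
// illustration only, mirroring the arithmetic of discard_log() below:
fn size_after_discard(file_size: usize, offset: usize, keep: usize) -> usize {
    assert!(keep < offset && offset <= file_size);
    keep + (file_size - offset)
}

// e.g. a 1000-byte log with offset = 800 and keep = 64 (a metadata head, as
// Raft passes with keep = log_meta_end) shrinks to 64 + 200 = 264 bytes;
// keep = 0 recovers the previous Discard behavior.
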
async fn discard_log( me: ReplicaId, backer: &mut File, file_size: usize, offset: usize, + keep: usize, ) -> Result<(bool, usize), SummersetError> { if offset > file_size { pf_warn!( @@ -352,25 +355,32 @@ where file_size ); Ok((false, file_size)) + } else if keep >= offset { + pf_warn!( + me; + "discard keeping {} while offset is {}", + keep, offset + ); + Ok((false, file_size)) } else { let tail_size = file_size - offset; if tail_size > 0 { // due to the limited interfaces provided by `tokio::fs`, we - // read out the tail part and write it back to offset 0 to + // read out the tail part and write it back to offset keep to // achieve the effect of discarding let mut tail_buf: Vec = vec![0; tail_size]; backer.seek(SeekFrom::Start(offset as u64)).await?; backer.read_exact(&mut tail_buf[..]).await?; - backer.seek(SeekFrom::Start(0)).await?; + backer.seek(SeekFrom::Start(keep as u64)).await?; backer.write_all(&tail_buf[..]).await?; } - backer.set_len(tail_size as u64).await?; + backer.set_len((keep + tail_size) as u64).await?; backer.seek(SeekFrom::End(0)).await?; // recover cursor to EOF backer.sync_all().await?; - Ok((true, tail_size)) + Ok((true, keep + tail_size)) } } @@ -422,16 +432,16 @@ where } }) } - LogAction::Discard { offset } => { - Self::discard_log(me, backer, *file_size, offset).await.map( - |(offset_ok, now_size)| { + LogAction::Discard { offset, keep } => { + Self::discard_log(me, backer, *file_size, offset, keep) + .await + .map(|(offset_ok, now_size)| { *file_size = now_size; LogResult::Discard { offset_ok, now_size, } - }, - ) + }) } } } @@ -658,24 +668,55 @@ mod storage_tests { let mut backer_file = prepare_test_file("/tmp/test-backer-4.log").await?; let entry = TestEntry("test-entry-dummy-string".into()); - let mid_offset = + let mid1_offset = StorageHub::append_entry(0, &mut backer_file, 0, &entry, false) .await?; + let mid2_offset = StorageHub::append_entry( + 0, + &mut backer_file, + mid1_offset, + &entry, + false, + ) + .await?; let end_offset = StorageHub::append_entry( 0, &mut backer_file, - mid_offset, + mid2_offset, &entry, true, ) .await?; - let tail_size = end_offset - mid_offset; + let tail_size = end_offset - mid2_offset; assert_eq!( StorageHub::::discard_log( 0, &mut backer_file, end_offset, - mid_offset + mid2_offset, + mid1_offset, + ) + .await?, + (true, 2 * tail_size) + ); + assert_eq!( + StorageHub::::discard_log( + 0, + &mut backer_file, + 2 * tail_size, + mid1_offset, + end_offset, + ) + .await?, + (false, 2 * tail_size) + ); + assert_eq!( + StorageHub::::discard_log( + 0, + &mut backer_file, + 2 * tail_size, + mid1_offset, + 0, ) .await?, (true, tail_size) @@ -685,7 +726,8 @@ mod storage_tests { 0, &mut backer_file, tail_size, - end_offset + end_offset, + 0 ) .await?, (false, tail_size) @@ -695,7 +737,8 @@ mod storage_tests { 0, &mut backer_file, tail_size, - tail_size + tail_size, + 0 ) .await?, (true, 0) diff --git a/summerset_client/src/drivers/closed_loop.rs b/summerset_client/src/drivers/closed_loop.rs index a0a96e87..06e218df 100644 --- a/summerset_client/src/drivers/closed_loop.rs +++ b/summerset_client/src/drivers/closed_loop.rs @@ -99,46 +99,55 @@ impl DriverClosedLoop { })?; let issue_ts = Instant::now(); - let reply = self.recv_reply_with_timeout().await?; - match reply { - Some(ApiReply::Reply { - id: reply_id, - result: cmd_result, - redirect, - }) => { - if reply_id != req_id { - logged_err!(self.id; "request ID mismatch: expected {}, replied {}", - req_id, reply_id) - } else { - match cmd_result { - None => { - if let Some(server) = 
redirect { - Ok(DriverReply::Redirect { server }) - } else { - Ok(DriverReply::Failure) + loop { + let reply = self.recv_reply_with_timeout().await?; + match reply { + Some(ApiReply::Reply { + id: reply_id, + result: cmd_result, + redirect, + }) => { + if reply_id != req_id { + // logged_err!(self.id; "request ID mismatch: expected {}, replied {}", + // req_id, reply_id) + continue; + } else { + match cmd_result { + None => { + if let Some(server) = redirect { + return Ok(DriverReply::Redirect { + server, + }); + } else { + return Ok(DriverReply::Failure); + } } - } - Some(CommandResult::Get { value }) => { - let latency = - Instant::now().duration_since(issue_ts); - Ok(DriverReply::Success { - req_id, - cmd_result: CommandResult::Get { value }, - latency, - }) - } + Some(CommandResult::Get { value }) => { + let latency = + Instant::now().duration_since(issue_ts); + return Ok(DriverReply::Success { + req_id, + cmd_result: CommandResult::Get { value }, + latency, + }); + } - _ => { - logged_err!(self.id; "command type mismatch: expected Get") + _ => { + return logged_err!(self.id; "command type mismatch: expected Get"); + } } } } - } - None => Ok(DriverReply::Timeout), + None => { + return Ok(DriverReply::Timeout); + } - _ => logged_err!(self.id; "unexpected reply type received"), + _ => { + return logged_err!(self.id; "unexpected reply type received"); + } + } } } @@ -160,46 +169,57 @@ impl DriverClosedLoop { })?; let issue_ts = Instant::now(); - let reply = self.recv_reply_with_timeout().await?; - match reply { - Some(ApiReply::Reply { - id: reply_id, - result: cmd_result, - redirect, - }) => { - if reply_id != req_id { - logged_err!(self.id; "request ID mismatch: expected {}, replied {}", - req_id, reply_id) - } else { - match cmd_result { - None => { - if let Some(server) = redirect { - Ok(DriverReply::Redirect { server }) - } else { - Ok(DriverReply::Failure) + loop { + let reply = self.recv_reply_with_timeout().await?; + match reply { + Some(ApiReply::Reply { + id: reply_id, + result: cmd_result, + redirect, + }) => { + if reply_id != req_id { + // logged_err!(self.id; "request ID mismatch: expected {}, replied {}", + // req_id, reply_id) + continue; + } else { + match cmd_result { + None => { + if let Some(server) = redirect { + return Ok(DriverReply::Redirect { + server, + }); + } else { + return Ok(DriverReply::Failure); + } } - } - Some(CommandResult::Put { old_value }) => { - let latency = - Instant::now().duration_since(issue_ts); - Ok(DriverReply::Success { - req_id, - cmd_result: CommandResult::Put { old_value }, - latency, - }) - } + Some(CommandResult::Put { old_value }) => { + let latency = + Instant::now().duration_since(issue_ts); + return Ok(DriverReply::Success { + req_id, + cmd_result: CommandResult::Put { + old_value, + }, + latency, + }); + } - _ => { - logged_err!(self.id; "command type mismatch: expected Put") + _ => { + return logged_err!(self.id; "command type mismatch: expected Put"); + } } } } - } - None => Ok(DriverReply::Timeout), + None => { + return Ok(DriverReply::Timeout); + } - _ => logged_err!(self.id; "unexpected reply type received"), + _ => { + return logged_err!(self.id; "unexpected reply type received"); + } + } } } diff --git a/summerset_client/src/drivers/open_loop.rs b/summerset_client/src/drivers/open_loop.rs index 8e49c107..37a902d5 100644 --- a/summerset_client/src/drivers/open_loop.rs +++ b/summerset_client/src/drivers/open_loop.rs @@ -168,37 +168,45 @@ impl DriverOpenLoop { /// Waits for the next reply. 
pub async fn wait_reply(&mut self) -> Result { - let reply = self.recv_reply_with_timeout().await?; - match reply { - Some(ApiReply::Reply { - id: reply_id, - result: cmd_result, - redirect, - }) => { - if !self.pending_reqs.contains_key(&reply_id) { - logged_err!(self.id; "request ID {} not in pending set", - reply_id) - } else { - let issue_ts = self.pending_reqs.remove(&reply_id).unwrap(); - let latency = Instant::now().duration_since(issue_ts); - - if let Some(res) = cmd_result { - Ok(DriverReply::Success { - req_id: reply_id, - cmd_result: res, - latency, - }) - } else if let Some(server) = redirect { - Ok(DriverReply::Redirect { server }) + loop { + let reply = self.recv_reply_with_timeout().await?; + match reply { + Some(ApiReply::Reply { + id: reply_id, + result: cmd_result, + redirect, + }) => { + if !self.pending_reqs.contains_key(&reply_id) { + // logged_err!(self.id; "request ID {} not in pending set", + // reply_id) + continue; } else { - Ok(DriverReply::Failure) + let issue_ts = + self.pending_reqs.remove(&reply_id).unwrap(); + let latency = Instant::now().duration_since(issue_ts); + + if let Some(res) = cmd_result { + return Ok(DriverReply::Success { + req_id: reply_id, + cmd_result: res, + latency, + }); + } else if let Some(server) = redirect { + return Ok(DriverReply::Redirect { server }); + } else { + return Ok(DriverReply::Failure); + } } } - } - None => Ok(DriverReply::Timeout), + None => { + return Ok(DriverReply::Timeout); + } - _ => logged_err!(self.id; "unexpected reply type received"), + _ => { + return logged_err!(self.id; "unexpected reply type received"); + } + } } } From 5046591f509833e47c6305eb779bc00a24e3c250 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 8 Oct 2023 17:51:29 -0500 Subject: [PATCH 87/89] make Paxos variants snapshotting conservative --- src/protocols/crossword.rs | 91 +++++++++++++++++++++++++++++++------ src/protocols/multipaxos.rs | 91 +++++++++++++++++++++++++++++++------ src/protocols/rs_paxos.rs | 91 +++++++++++++++++++++++++++++++------ 3 files changed, 231 insertions(+), 42 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 41d44a9b..dbf44a3e 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -4,6 +4,7 @@ //! dynamically tunable shard assignment with the correct liveness constraints, //! plus follower gossiping for actual usability. +use std::cmp; use std::collections::HashMap; use std::path::Path; use std::net::SocketAddr; @@ -263,7 +264,13 @@ enum PeerMsg { }, /// Leader activity heartbeat. - Heartbeat { ballot: Ballot, exec_bar: usize }, + Heartbeat { + ballot: Ballot, + /// For leader step-up as well as conservative snapshotting purpose. + exec_bar: usize, + /// For conservative snapshotting purpose. + snap_bar: usize, + }, } /// Crossword server replica module. @@ -352,6 +359,16 @@ pub struct CrosswordReplica { /// It is always true that exec_bar <= commit_bar <= start_slot + insts.len() exec_bar: usize, + /// Map from peer ID -> its latest exec_bar I know; this is for conservative + /// snapshotting purpose. + peer_exec_bar: HashMap, + + /// Slot index before which it is safe to take snapshot. + /// NOTE: we are taking a conservative approach here that a snapshot + /// covering an entry can be taken only when all servers have durably + /// committed (and executed) that entry. + snap_bar: usize, + /// Current durable WAL log file offset. 
wal_offset: usize, @@ -1484,9 +1501,11 @@ impl CrosswordReplica { PeerMsg::ReconstructReply { slots_data } => { self.handle_msg_reconstruct_reply(peer, slots_data) } - PeerMsg::Heartbeat { ballot, exec_bar } => { - self.heard_heartbeat(peer, ballot, exec_bar) - } + PeerMsg::Heartbeat { + ballot, + exec_bar, + snap_bar, + } => self.heard_heartbeat(peer, ballot, exec_bar, snap_bar), } } } @@ -1591,6 +1610,11 @@ impl CrosswordReplica { } self.bcast_heartbeats()?; + // re-initialize peer_exec_bar information + for slot in self.peer_exec_bar.values_mut() { + *slot = 0; + } + // make a greater ballot number and invalidate all in-progress instances self.bal_prepared = 0; self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen); @@ -1684,6 +1708,7 @@ impl CrosswordReplica { PeerMsg::Heartbeat { ballot: self.bal_prep_sent, exec_bar: self.exec_bar, + snap_bar: self.snap_bar, }, None, )?; @@ -1714,7 +1739,12 @@ impl CrosswordReplica { } // I also heard this heartbeat from myself - self.heard_heartbeat(self.id, self.bal_prep_sent, self.exec_bar)?; + self.heard_heartbeat( + self.id, + self.bal_prep_sent, + self.exec_bar, + self.snap_bar, + )?; // check if we need to fall back to a config with smaller fast-path // quorum size @@ -1755,6 +1785,7 @@ impl CrosswordReplica { peer: ReplicaId, ballot: Ballot, exec_bar: usize, + snap_bar: usize, ) -> Result<(), SummersetError> { if peer != self.id { self.hb_reply_cnts.get_mut(&peer).unwrap().0 += 1; @@ -1778,10 +1809,27 @@ impl CrosswordReplica { PeerMsg::Heartbeat { ballot, exec_bar: self.exec_bar, + snap_bar: self.snap_bar, }, peer, )?; + // update peer_exec_bar if larger then known; if all servers' + // exec_bar (including myself) have passed a slot, that slot + // is definitely safe to be snapshotted + if exec_bar > self.peer_exec_bar[&peer] { + *self.peer_exec_bar.get_mut(&peer).unwrap() = exec_bar; + let passed_cnt = 1 + self + .peer_exec_bar + .values() + .filter(|&&e| e >= exec_bar) + .count() as u8; + if passed_cnt == self.population { + // all servers have executed up to exec_bar + self.snap_bar = exec_bar; + } + } + // if the peer has made a higher ballot number if ballot > self.bal_max_seen { self.bal_max_seen = ballot; @@ -1798,6 +1846,11 @@ impl CrosswordReplica { } } + // if snap_bar is larger than mine, update snap_bar + if snap_bar > self.snap_bar { + self.snap_bar = snap_bar; + } + // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); Ok(()) } @@ -2254,10 +2307,13 @@ impl CrosswordReplica { // CrosswordReplica snapshotting & GC logic impl CrosswordReplica { /// Dump new key-value pairs to snapshot file. - async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { + async fn snapshot_dump_kv_pairs( + &mut self, + new_start_slot: usize, + ) -> Result<(), SummersetError> { // collect all key-value pairs put up to exec_bar let mut pairs = HashMap::new(); - for slot in self.start_slot..self.exec_bar { + for slot in self.start_slot..new_start_slot { let inst = &mut self.insts[slot - self.start_slot]; assert!(inst.reqs_cw.avail_data_shards() >= self.majority); for (_, req) in inst.reqs_cw.get_data()?.clone() { @@ -2357,10 +2413,12 @@ impl CrosswordReplica { /// Accept messages will be retried indefinitely until success before its /// associated data gets discarded from leader's memory. 
async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { - pf_debug!(self.id; "taking new snapshot: start {} exec {}", - self.start_slot, self.exec_bar); + pf_debug!(self.id; "taking new snapshot: start {} exec {} snap {}", + self.start_slot, self.exec_bar, self.snap_bar); assert!(self.exec_bar >= self.start_slot); - if self.exec_bar == self.start_slot { + + let new_start_slot = cmp::min(self.snap_bar, self.exec_bar); + if new_start_slot == self.start_slot { return Ok(()); } @@ -2369,14 +2427,14 @@ impl CrosswordReplica { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } - self.snapshot_dump_kv_pairs().await?; + self.snapshot_dump_kv_pairs(new_start_slot).await?; // write new slot info entry to the head of snapshot self.snapshot_hub.submit_action( 0, LogAction::Write { entry: SnapEntry::SlotInfo { - start_slot: self.exec_bar, + start_slot: new_start_slot, commit_bar: self.commit_bar, }, offset: 0, @@ -2394,8 +2452,8 @@ impl CrosswordReplica { } // update start_slot and discard all in-memory log instances up to exec_bar - self.insts.drain(0..(self.exec_bar - self.start_slot)); - self.start_slot = self.exec_bar; + self.insts.drain(0..(new_start_slot - self.start_slot)); + self.start_slot = new_start_slot; // discarding everything older than start_slot in WAL log if self.is_leader() { @@ -2436,6 +2494,7 @@ impl CrosswordReplica { self.start_slot = start_slot; self.commit_bar = commit_bar; self.exec_bar = start_slot; + self.snap_bar = start_slot; // repeatedly apply key-value pairs loop { @@ -2712,6 +2771,10 @@ impl GenericReplica for CrosswordReplica { bal_max_seen: 0, commit_bar: 0, exec_bar: 0, + peer_exec_bar: (0..population) + .filter_map(|s| if s == id { None } else { Some((s, 0)) }) + .collect(), + snap_bar: 0, wal_offset: 0, snap_offset: 0, rs_coder, diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 76955978..417217ab 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -7,6 +7,7 @@ //! - //! - +use std::cmp; use std::collections::HashMap; use std::path::Path; use std::net::SocketAddr; @@ -228,7 +229,13 @@ enum PeerMsg { Commit { slot: usize }, /// Leader activity heartbeat. - Heartbeat { ballot: Ballot, exec_bar: usize }, + Heartbeat { + ballot: Ballot, + /// For leader step-up as well as conservative snapshotting purpose. + exec_bar: usize, + /// For conservative snapshotting purpose. + snap_bar: usize, + }, } /// MultiPaxos server replica module. @@ -310,6 +317,16 @@ pub struct MultiPaxosReplica { /// It is always true that exec_bar <= commit_bar <= start_slot + insts.len() exec_bar: usize, + /// Map from peer ID -> its latest exec_bar I know; this is for conservative + /// snapshotting purpose. + peer_exec_bar: HashMap, + + /// Slot index before which it is safe to take snapshot. + /// NOTE: we are taking a conservative approach here that a snapshot + /// covering an entry can be taken only when all servers have durably + /// committed (and executed) that entry. + snap_bar: usize, + /// Current durable WAL log file offset. 
wal_offset: usize, @@ -1012,9 +1029,11 @@ impl MultiPaxosReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), - PeerMsg::Heartbeat { ballot, exec_bar } => { - self.heard_heartbeat(peer, ballot, exec_bar) - } + PeerMsg::Heartbeat { + ballot, + exec_bar, + snap_bar, + } => self.heard_heartbeat(peer, ballot, exec_bar, snap_bar), } } } @@ -1106,6 +1125,11 @@ impl MultiPaxosReplica { } self.bcast_heartbeats()?; + // re-initialize peer_exec_bar information + for slot in self.peer_exec_bar.values_mut() { + *slot = 0; + } + // make a greater ballot number and invalidate all in-progress instances self.bal_prepared = 0; self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen); @@ -1165,6 +1189,7 @@ impl MultiPaxosReplica { PeerMsg::Heartbeat { ballot: self.bal_prep_sent, exec_bar: self.exec_bar, + snap_bar: self.snap_bar, }, None, )?; @@ -1195,7 +1220,12 @@ impl MultiPaxosReplica { } // I also heard this heartbeat from myself - self.heard_heartbeat(self.id, self.bal_prep_sent, self.exec_bar)?; + self.heard_heartbeat( + self.id, + self.bal_prep_sent, + self.exec_bar, + self.snap_bar, + )?; // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent); Ok(()) @@ -1224,6 +1254,7 @@ impl MultiPaxosReplica { peer: ReplicaId, ballot: Ballot, exec_bar: usize, + snap_bar: usize, ) -> Result<(), SummersetError> { if peer != self.id { self.hb_reply_cnts.get_mut(&peer).unwrap().0 += 1; @@ -1247,10 +1278,27 @@ impl MultiPaxosReplica { PeerMsg::Heartbeat { ballot, exec_bar: self.exec_bar, + snap_bar: self.snap_bar, }, peer, )?; + // update peer_exec_bar if larger then known; if all servers' + // exec_bar (including myself) have passed a slot, that slot + // is definitely safe to be snapshotted + if exec_bar > self.peer_exec_bar[&peer] { + *self.peer_exec_bar.get_mut(&peer).unwrap() = exec_bar; + let passed_cnt = 1 + self + .peer_exec_bar + .values() + .filter(|&&e| e >= exec_bar) + .count() as u8; + if passed_cnt == self.population { + // all servers have executed up to exec_bar + self.snap_bar = exec_bar; + } + } + // if the peer has made a higher ballot number if ballot > self.bal_max_seen { self.bal_max_seen = ballot; @@ -1267,6 +1315,11 @@ impl MultiPaxosReplica { } } + // if snap_bar is larger than mine, update snap_bar + if snap_bar > self.snap_bar { + self.snap_bar = snap_bar; + } + // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); Ok(()) } @@ -1541,10 +1594,13 @@ impl MultiPaxosReplica { // MultiPaxosReplica snapshotting & GC logic impl MultiPaxosReplica { /// Dump new key-value pairs to snapshot file. - async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { + async fn snapshot_dump_kv_pairs( + &mut self, + new_start_slot: usize, + ) -> Result<(), SummersetError> { // collect all key-value pairs put up to exec_bar let mut pairs = HashMap::new(); - for slot in self.start_slot..self.exec_bar { + for slot in self.start_slot..new_start_slot { let inst = &self.insts[slot - self.start_slot]; for (_, req) in inst.reqs.clone() { if let ApiRequest::Req { @@ -1643,10 +1699,12 @@ impl MultiPaxosReplica { /// Accept messages will be retried indefinitely until success before its /// associated data gets discarded from leader's memory. 
async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { - pf_debug!(self.id; "taking new snapshot: start {} exec {}", - self.start_slot, self.exec_bar); + pf_debug!(self.id; "taking new snapshot: start {} exec {} snap {}", + self.start_slot, self.exec_bar, self.snap_bar); assert!(self.exec_bar >= self.start_slot); - if self.exec_bar == self.start_slot { + + let new_start_slot = cmp::min(self.snap_bar, self.exec_bar); + if new_start_slot == self.start_slot { return Ok(()); } @@ -1655,14 +1713,14 @@ impl MultiPaxosReplica { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } - self.snapshot_dump_kv_pairs().await?; + self.snapshot_dump_kv_pairs(new_start_slot).await?; // write new slot info entry to the head of snapshot self.snapshot_hub.submit_action( 0, LogAction::Write { entry: SnapEntry::SlotInfo { - start_slot: self.exec_bar, + start_slot: new_start_slot, commit_bar: self.commit_bar, }, offset: 0, @@ -1680,8 +1738,8 @@ impl MultiPaxosReplica { } // update start_slot and discard all in-memory log instances up to exec_bar - self.insts.drain(0..(self.exec_bar - self.start_slot)); - self.start_slot = self.exec_bar; + self.insts.drain(0..(new_start_slot - self.start_slot)); + self.start_slot = new_start_slot; // discarding everything older than start_slot in WAL log if self.is_leader() { @@ -1722,6 +1780,7 @@ impl MultiPaxosReplica { self.start_slot = start_slot; self.commit_bar = commit_bar; self.exec_bar = start_slot; + self.snap_bar = start_slot; // repeatedly apply key-value pairs loop { @@ -1968,6 +2027,10 @@ impl GenericReplica for MultiPaxosReplica { bal_max_seen: 0, commit_bar: 0, exec_bar: 0, + peer_exec_bar: (0..population) + .filter_map(|s| if s == id { None } else { Some((s, 0)) }) + .collect(), + snap_bar: 0, wal_offset: 0, snap_offset: 0, }) diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 0f7022f4..96aa8127 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -3,6 +3,7 @@ //! MultiPaxos with Reed-Solomon erasure coding. References: //! - +use std::cmp; use std::collections::HashMap; use std::path::Path; use std::net::SocketAddr; @@ -243,7 +244,13 @@ enum PeerMsg { }, /// Leader activity heartbeat. - Heartbeat { ballot: Ballot, exec_bar: usize }, + Heartbeat { + ballot: Ballot, + /// For leader step-up as well as conservative snapshotting purpose. + exec_bar: usize, + /// For conservative snapshotting purpose. + snap_bar: usize, + }, } /// RSPaxos server replica module. @@ -325,6 +332,16 @@ pub struct RSPaxosReplica { /// It is always true that exec_bar <= commit_bar <= start_slot + insts.len() exec_bar: usize, + /// Map from peer ID -> its latest exec_bar I know; this is for conservative + /// snapshotting purpose. + peer_exec_bar: HashMap, + + /// Slot index before which it is safe to take snapshot. + /// NOTE: we are taking a conservative approach here that a snapshot + /// covering an entry can be taken only when all servers have durably + /// committed (and executed) that entry. + snap_bar: usize, + /// Current durable WAL log file offset. 
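    // Note that peer_exec_bar above starts at 0 for every peer and is reset
    // to 0 again whenever this replica steps up as leader, so snap_bar can
    // only lag behind, never run ahead of, what is truly safe to snapshot;
    // the cost of this conservatism is a delayed snapshot, never an unsafe
    // one.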
wal_offset: usize, @@ -1232,9 +1249,11 @@ impl RSPaxosReplica { PeerMsg::ReconstructReply { slots_data } => { self.handle_msg_reconstruct_reply(peer, slots_data) } - PeerMsg::Heartbeat { ballot, exec_bar } => { - self.heard_heartbeat(peer, ballot, exec_bar) - } + PeerMsg::Heartbeat { + ballot, + exec_bar, + snap_bar, + } => self.heard_heartbeat(peer, ballot, exec_bar, snap_bar), } } } @@ -1327,6 +1346,11 @@ impl RSPaxosReplica { } self.bcast_heartbeats()?; + // re-initialize peer_exec_bar information + for slot in self.peer_exec_bar.values_mut() { + *slot = 0; + } + // make a greater ballot number and invalidate all in-progress instances self.bal_prepared = 0; self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen); @@ -1403,6 +1427,7 @@ impl RSPaxosReplica { PeerMsg::Heartbeat { ballot: self.bal_prep_sent, exec_bar: self.exec_bar, + snap_bar: self.snap_bar, }, None, )?; @@ -1433,7 +1458,12 @@ impl RSPaxosReplica { } // I also heard this heartbeat from myself - self.heard_heartbeat(self.id, self.bal_prep_sent, self.exec_bar)?; + self.heard_heartbeat( + self.id, + self.bal_prep_sent, + self.exec_bar, + self.snap_bar, + )?; // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent); Ok(()) @@ -1462,6 +1492,7 @@ impl RSPaxosReplica { peer: ReplicaId, ballot: Ballot, exec_bar: usize, + snap_bar: usize, ) -> Result<(), SummersetError> { if peer != self.id { self.hb_reply_cnts.get_mut(&peer).unwrap().0 += 1; @@ -1485,10 +1516,27 @@ impl RSPaxosReplica { PeerMsg::Heartbeat { ballot, exec_bar: self.exec_bar, + snap_bar: self.snap_bar, }, peer, )?; + // update peer_exec_bar if larger then known; if all servers' + // exec_bar (including myself) have passed a slot, that slot + // is definitely safe to be snapshotted + if exec_bar > self.peer_exec_bar[&peer] { + *self.peer_exec_bar.get_mut(&peer).unwrap() = exec_bar; + let passed_cnt = 1 + self + .peer_exec_bar + .values() + .filter(|&&e| e >= exec_bar) + .count() as u8; + if passed_cnt == self.population { + // all servers have executed up to exec_bar + self.snap_bar = exec_bar; + } + } + // if the peer has made a higher ballot number if ballot > self.bal_max_seen { self.bal_max_seen = ballot; @@ -1505,6 +1553,11 @@ impl RSPaxosReplica { } } + // if snap_bar is larger than mine, update snap_bar + if snap_bar > self.snap_bar { + self.snap_bar = snap_bar; + } + // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); Ok(()) } @@ -1795,10 +1848,13 @@ impl RSPaxosReplica { // RSPaxosReplica snapshotting & GC logic impl RSPaxosReplica { /// Dump new key-value pairs to snapshot file. - async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { + async fn snapshot_dump_kv_pairs( + &mut self, + new_start_slot: usize, + ) -> Result<(), SummersetError> { // collect all key-value pairs put up to exec_bar let mut pairs = HashMap::new(); - for slot in self.start_slot..self.exec_bar { + for slot in self.start_slot..new_start_slot { let inst = &mut self.insts[slot - self.start_slot]; assert!(inst.reqs_cw.avail_data_shards() >= self.majority); for (_, req) in inst.reqs_cw.get_data()?.clone() { @@ -1898,10 +1954,12 @@ impl RSPaxosReplica { /// Accept messages will be retried indefinitely until success before its /// associated data gets discarded from leader's memory. 
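    // One extra precondition in the erasure-coded variant, asserted in
    // snapshot_dump_kv_pairs() above: a slot can be dumped only if at least
    // `majority` data shards of its request batch are locally available, so
    // that reqs_cw.get_data() can reconstruct the full set of key-value
    // writes.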
async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { - pf_debug!(self.id; "taking new snapshot: start {} exec {}", - self.start_slot, self.exec_bar); + pf_debug!(self.id; "taking new snapshot: start {} exec {} snap {}", + self.start_slot, self.exec_bar, self.snap_bar); assert!(self.exec_bar >= self.start_slot); - if self.exec_bar == self.start_slot { + + let new_start_slot = cmp::min(self.snap_bar, self.exec_bar); + if new_start_slot == self.start_slot { return Ok(()); } @@ -1910,14 +1968,14 @@ impl RSPaxosReplica { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } - self.snapshot_dump_kv_pairs().await?; + self.snapshot_dump_kv_pairs(new_start_slot).await?; // write new slot info entry to the head of snapshot self.snapshot_hub.submit_action( 0, LogAction::Write { entry: SnapEntry::SlotInfo { - start_slot: self.exec_bar, + start_slot: new_start_slot, commit_bar: self.commit_bar, }, offset: 0, @@ -1935,8 +1993,8 @@ impl RSPaxosReplica { } // update start_slot and discard all in-memory log instances up to exec_bar - self.insts.drain(0..(self.exec_bar - self.start_slot)); - self.start_slot = self.exec_bar; + self.insts.drain(0..(new_start_slot - self.start_slot)); + self.start_slot = new_start_slot; // discarding everything older than start_slot in WAL log if self.is_leader() { @@ -1977,6 +2035,7 @@ impl RSPaxosReplica { self.start_slot = start_slot; self.commit_bar = commit_bar; self.exec_bar = start_slot; + self.snap_bar = start_slot; // repeatedly apply key-value pairs loop { @@ -2243,6 +2302,10 @@ impl GenericReplica for RSPaxosReplica { bal_max_seen: 0, commit_bar: 0, exec_bar: 0, + peer_exec_bar: (0..population) + .filter_map(|s| if s == id { None } else { Some((s, 0)) }) + .collect(), + snap_bar: 0, wal_offset: 0, snap_offset: 0, rs_coder, From 913dfa621a0ed1b4afa0b1e0642f4eb50c51df52 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 8 Oct 2023 18:34:53 -0500 Subject: [PATCH 88/89] add missing hole filling mechanism to Paxos variants --- src/protocols/crossword.rs | 70 +++++++++++++++++++++++++++++++++++++ src/protocols/multipaxos.rs | 59 +++++++++++++++++++++++++++++++ src/protocols/rs_paxos.rs | 62 ++++++++++++++++++++++++++++++++ 3 files changed, 191 insertions(+) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index dbf44a3e..f71a6b5e 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -251,6 +251,10 @@ enum PeerMsg { /// Commit notification from leader to replicas. Commit { slot: usize }, + /// Request by a lagging replica to leader asking to re-send Accepts for + /// missing holes + FillHoles { slots: Vec }, + /// Reconstruction read from new leader to replicas. Reconstruct { /// Map from slot -> shards to exclude. @@ -949,6 +953,21 @@ impl CrosswordReplica { } } + // if there are hole(s) between current commit_bar and newly committed + // slot, ask the leader to re-send Accept messages for those slots + if slot > self.commit_bar && !self.is_leader() { + if let Some(leader) = self.leader { + let holes: Vec = (self.commit_bar..slot).collect(); + self.transport_hub.send_msg( + PeerMsg::FillHoles { + slots: holes.clone(), + }, + leader, + )?; + pf_trace!(self.id; "sent FillHoles -> {} slots {:?}", leader, holes); + } + } + Ok(()) } @@ -1355,6 +1374,54 @@ impl CrosswordReplica { Ok(()) } + /// Handler of FillHoles message from a lagging peer. 
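    // For example (slot numbers are illustrative): if a follower's
    // commit_bar is 5 when a Commit for slot 9 arrives, slots 5..9 are
    // holes, so it sends FillHoles { slots: vec![5, 6, 7, 8] } to the
    // leader; the handler below is the leader-side half of that exchange.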
+ fn handle_msg_fill_holes( + &mut self, + peer: ReplicaId, + slots: Vec, + ) -> Result<(), SummersetError> { + if !self.is_leader() { + return Ok(()); + } + pf_trace!(self.id; "received FillHoles <- {} for slots {:?}", peer, slots); + + for slot in slots { + if slot < self.start_slot { + continue; + } else if slot >= self.start_slot + self.insts.len() { + break; + } + let inst = &self.insts[slot - self.start_slot]; + + if inst.status >= Status::Committed { + // re-send Accept message for this slot + self.transport_hub.send_msg( + PeerMsg::Accept { + slot, + ballot: self.bal_prepared, + reqs_cw: inst.reqs_cw.subset_copy( + Bitmap::from( + self.population, + Self::shards_for_replica( + slot, + peer, + self.population, + self.shards_per_replica, + ), + ), + false, + )?, + }, + peer, + )?; + pf_trace!(self.id; "sent Accept -> {} for slot {} bal {}", + peer, slot, self.bal_prepared); + } + } + + Ok(()) + } + /// Handler of Reconstruct message from leader. fn handle_msg_reconstruct( &mut self, @@ -1495,6 +1562,9 @@ impl CrosswordReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), + PeerMsg::FillHoles { slots } => { + self.handle_msg_fill_holes(peer, slots) + } PeerMsg::Reconstruct { slots_excl } => { self.handle_msg_reconstruct(peer, slots_excl) } diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 417217ab..435cbf3c 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -228,6 +228,10 @@ enum PeerMsg { /// Commit notification from leader to replicas. Commit { slot: usize }, + /// Request by a lagging replica to leader asking to re-send Accepts for + /// missing holes + FillHoles { slots: Vec }, + /// Leader activity heartbeat. Heartbeat { ballot: Ballot, @@ -680,6 +684,21 @@ impl MultiPaxosReplica { } } + // if there are hole(s) between current commit_bar and newly committed + // slot, ask the leader to re-send Accept messages for those slots + if slot > self.commit_bar && !self.is_leader() { + if let Some(leader) = self.leader { + let holes: Vec = (self.commit_bar..slot).collect(); + self.transport_hub.send_msg( + PeerMsg::FillHoles { + slots: holes.clone(), + }, + leader, + )?; + pf_trace!(self.id; "sent FillHoles -> {} slots {:?}", leader, holes); + } + } + Ok(()) } @@ -1007,6 +1026,43 @@ impl MultiPaxosReplica { Ok(()) } + /// Handler of FillHoles message from a lagging peer. + fn handle_msg_fill_holes( + &mut self, + peer: ReplicaId, + slots: Vec, + ) -> Result<(), SummersetError> { + if !self.is_leader() { + return Ok(()); + } + pf_trace!(self.id; "received FillHoles <- {} for slots {:?}", peer, slots); + + for slot in slots { + if slot < self.start_slot { + continue; + } else if slot >= self.start_slot + self.insts.len() { + break; + } + let inst = &self.insts[slot - self.start_slot]; + + if inst.status >= Status::Committed { + // re-send Accept message for this slot + self.transport_hub.send_msg( + PeerMsg::Accept { + slot, + ballot: self.bal_prepared, + reqs: inst.reqs.clone(), + }, + peer, + )?; + pf_trace!(self.id; "sent Accept -> {} for slot {} bal {}", + peer, slot, self.bal_prepared); + } + } + + Ok(()) + } + /// Synthesized handler of receiving message from peer. 
fn handle_msg_recv( &mut self, @@ -1029,6 +1085,9 @@ impl MultiPaxosReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), + PeerMsg::FillHoles { slots } => { + self.handle_msg_fill_holes(peer, slots) + } PeerMsg::Heartbeat { ballot, exec_bar, diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 96aa8127..31fbcb69 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -234,6 +234,10 @@ enum PeerMsg { /// Commit notification from leader to replicas. Commit { slot: usize }, + /// Request by a lagging replica to leader asking to re-send Accepts for + /// missing holes + FillHoles { slots: Vec }, + /// Reconstruction read from new leader to replicas. Reconstruct { slots: Vec }, @@ -739,6 +743,21 @@ impl RSPaxosReplica { } } + // if there are hole(s) between current commit_bar and newly committed + // slot, ask the leader to re-send Accept messages for those slots + if slot > self.commit_bar && !self.is_leader() { + if let Some(leader) = self.leader { + let holes: Vec = (self.commit_bar..slot).collect(); + self.transport_hub.send_msg( + PeerMsg::FillHoles { + slots: holes.clone(), + }, + leader, + )?; + pf_trace!(self.id; "sent FillHoles -> {} slots {:?}", leader, holes); + } + } + Ok(()) } @@ -1108,6 +1127,46 @@ impl RSPaxosReplica { Ok(()) } + /// Handler of FillHoles message from a lagging peer. + fn handle_msg_fill_holes( + &mut self, + peer: ReplicaId, + slots: Vec, + ) -> Result<(), SummersetError> { + if !self.is_leader() { + return Ok(()); + } + pf_trace!(self.id; "received FillHoles <- {} for slots {:?}", peer, slots); + + for slot in slots { + if slot < self.start_slot { + continue; + } else if slot >= self.start_slot + self.insts.len() { + break; + } + let inst = &self.insts[slot - self.start_slot]; + + if inst.status >= Status::Committed { + // re-send Accept message for this slot + self.transport_hub.send_msg( + PeerMsg::Accept { + slot, + ballot: self.bal_prepared, + reqs_cw: inst.reqs_cw.subset_copy( + Bitmap::from(self.population, vec![peer]), + false, + )?, + }, + peer, + )?; + pf_trace!(self.id; "sent Accept -> {} for slot {} bal {}", + peer, slot, self.bal_prepared); + } + } + + Ok(()) + } + /// Handler of Reconstruct message from leader. 
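    // Worth noting how the three variants differ in what they re-send to
    // fill a hole: MultiPaxos re-sends the full request batch, RS-Paxos
    // copies out just the shard at the requesting peer's index, and
    // Crossword copies out the shard subset that shards_for_replica()
    // assigns to that peer.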
fn handle_msg_reconstruct( &mut self, @@ -1243,6 +1302,9 @@ impl RSPaxosReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), + PeerMsg::FillHoles { slots } => { + self.handle_msg_fill_holes(peer, slots) + } PeerMsg::Reconstruct { slots } => { self.handle_msg_reconstruct(peer, slots) } From 8fd45a3ca4d90c314c5e683ef7d7e4db5e0c38c1 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 8 Oct 2023 19:01:01 -0500 Subject: [PATCH 89/89] add Raft to workflow proc tests --- .github/workflows/tests_proc.yml | 6 ++++-- .github/workflows/tests_unit.yml | 2 +- scripts/workflow_test.py | 14 ++++++++++++++ src/protocols/raft.rs | 3 ++- 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/.github/workflows/tests_proc.yml b/.github/workflows/tests_proc.yml index e8fbde3f..dbd7195e 100644 --- a/.github/workflows/tests_proc.yml +++ b/.github/workflows/tests_proc.yml @@ -16,5 +16,7 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Run proc tests - run: python3 scripts/workflow_test.py + - name: Run proc tests (MultiPaxos) + run: python3 scripts/workflow_test.py -p MultiPaxos + - name: Run proc tests (Raft) + run: python3 scripts/workflow_test.py -p Raft diff --git a/.github/workflows/tests_unit.yml b/.github/workflows/tests_unit.yml index 0a1fd8d6..57aa8fb3 100644 --- a/.github/workflows/tests_unit.yml +++ b/.github/workflows/tests_unit.yml @@ -16,5 +16,5 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Run unit tests + - name: Run all unit tests run: cargo test --workspace --verbose diff --git a/scripts/workflow_test.py b/scripts/workflow_test.py index 33484aca..eb176a7f 100644 --- a/scripts/workflow_test.py +++ b/scripts/workflow_test.py @@ -1,5 +1,6 @@ import sys import os +import argparse import subprocess @@ -76,6 +77,12 @@ def run_tester_client(protocol, test_name): if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-p", "--protocol", type=str, required=True, help="protocol name" + ) + args = parser.parse_args() + do_cargo_build() kill_all_matching("local_client.py", force=True) @@ -85,6 +92,13 @@ def run_tester_client(protocol, test_name): kill_all_matching("summerset_manager", force=True) PROTOCOL = "MultiPaxos" + if args.protocol == "MultiPaxos": + pass + elif args.protocol == "Raft": + PROTOCOL = "Raft" + else: + raise ValueError(f"unrecognized protocol {args.protocol} to run workflow test") + NUM_REPLICAS = 3 TEST_NAME = "primitive_ops" TIMEOUT = 300 diff --git a/src/protocols/raft.rs b/src/protocols/raft.rs index 5dc54a83..4ffc04f5 100644 --- a/src/protocols/raft.rs +++ b/src/protocols/raft.rs @@ -1427,6 +1427,7 @@ impl RaftReplica { } // do an extra Truncate to remove paritial entry at the end if any + assert!(self.log_offset >= self.log_meta_end); self.storage_hub.submit_action( 0, LogAction::Truncate { @@ -1438,7 +1439,7 @@ impl RaftReplica { offset_ok: true, .. } = log_result { - if self.log_offset > 0 { + if self.log_offset > self.log_meta_end { pf_info!(self.id; "recovered from wal log: term {} voted {:?} |log| {}", self.curr_term, self.voted_for, self.log.len()); }
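        // The two changes above tighten the recovery check: the WAL
        // presumably begins with a metadata entry (current term and
        // voted_for) that ends at log_meta_end, so an offset equal to
        // log_meta_end means an otherwise empty log; the "recovered from
        // wal log" message is now printed only when actual entries were
        // replayed, and the new assert records that the offset can never
        // fall short of the metadata region. The matching CI change runs
        // the workflow proc test once per protocol, e.g.
        // `python3 scripts/workflow_test.py -p Raft`.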