From 5b5271a33aa01391dfdef48c0453b990c866e088 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 24 Aug 2023 17:34:41 +0800 Subject: [PATCH 01/21] minor changes to benchmarking scripts --- scripts/local_bench.tmp.py | 2 +- scripts/local_client.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 714eac67..6f1ceb78 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -110,7 +110,7 @@ def bench_round(protocol, num_replicas, value_size, put_ratio, length_s): if __name__ == "__main__": do_cargo_build() - for num_replicas in (3, 7): + for num_replicas in (3, 5, 7): for value_size in (1024, 65536, 4194304): for protocol in ("MultiPaxos", "RSPaxos"): bench_round(protocol, num_replicas, value_size, 100, 60) diff --git a/scripts/local_client.py b/scripts/local_client.py index c0f46adf..04347398 100644 --- a/scripts/local_client.py +++ b/scripts/local_client.py @@ -71,7 +71,7 @@ def compose_client_cmd(protocol, manager, config, utility, params, release): # if in benchmarking mode, lower the client's CPU scheduling priority if utility == "bench": - cmd = ["nice", "-n", "15"] + cmd + cmd = ["nice", "-n", "19"] + cmd return cmd From 91d8778aae6902509ea1b3b434c07831308d44da Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 24 Aug 2023 20:06:10 +0800 Subject: [PATCH 02/21] minor updates to README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 3e2fe00a..3ccadae9 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,7 @@ Summerset is a distributed key-value store supporting a wide range of state mach | `RepNothing` | Simplest protocol w/o any replication | | `SimplePush` | Pushing to peers w/o any consistency guarantees | | `MultiPaxos` | Classic [MultiPaxos](https://www.microsoft.com/en-us/research/uploads/prod/2016/12/paxos-simple-Copy.pdf) protocol | +| `RS-Paxos` | MultiPaxos w/ Reed-Solomon erasure code sharding | Formal TLA+ specification of some protocols are provided in `tla+/`. 
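For context on the `RS-Paxos` row added above: RS-Paxos erasure-codes each Paxos value with Reed-Solomon, using the majority quorum size as the number of data shards and the remainder as parity shards (the same way the server code in the next patch constructs its coder). Below is a minimal, self-contained sketch of that encode/reconstruct cycle with the reed-solomon-erasure crate; the 3 + 2 split and the shard contents are illustrative, chosen to match a 5-replica majority quorum.

    use reed_solomon_erasure::galois_8::ReedSolomon;

    fn main() -> Result<(), reed_solomon_erasure::Error> {
        // 3 data shards + 2 parity shards: any 3 of the 5 recover the value.
        let rs = ReedSolomon::new(3, 2)?;
        let mut shards: Vec<Vec<u8>> = vec![
            b"hel".to_vec(), // data
            b"lo!".to_vec(), // data
            b"xyz".to_vec(), // data
            vec![0u8; 3],    // parity, filled in by encode()
            vec![0u8; 3],    // parity, filled in by encode()
        ];
        rs.encode(&mut shards)?;

        // Lose any two shards (here one data, one parity)...
        let mut maybe: Vec<Option<Vec<u8>>> =
            shards.into_iter().map(Some).collect();
        maybe[1] = None;
        maybe[4] = None;

        // ...and reconstruct them from the remaining three.
        rs.reconstruct(&mut maybe)?;
        assert_eq!(maybe[1].as_deref(), Some(&b"lo!"[..]));
        Ok(())
    }
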
From 7979ddac1663a9bccc03a598f5d67bd4fb6580cd Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Mon, 28 Aug 2023 16:54:27 +0800 Subject: [PATCH 03/21] add very basic crossword impl --- scripts/local_client.py | 1 + scripts/local_cluster.py | 1 + src/lib.rs | 2 + src/protocols/crossword.rs | 1410 ++++++++++++++++++++++++++++++++++++ src/protocols/mod.rs | 18 + src/protocols/rs_paxos.rs | 2 +- 6 files changed, 1433 insertions(+), 1 deletion(-) create mode 100644 src/protocols/crossword.rs diff --git a/scripts/local_client.py b/scripts/local_client.py index 04347398..2f9c2c6d 100644 --- a/scripts/local_client.py +++ b/scripts/local_client.py @@ -26,6 +26,7 @@ def run_process(cmd): "SimplePush": "", "MultiPaxos": "", "RSPaxos": "", + "Crossword": "", } diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index a1d33351..b7dcdb25 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -40,6 +40,7 @@ def kill_all_matching(name): "SimplePush": lambda r, n: f"backer_path='/tmp/summerset.simple_push.{r}.wal'+rep_degree={n-1}", "MultiPaxos": lambda r, n: f"backer_path='/tmp/summerset.multipaxos.{r}.wal'", "RSPaxos": lambda r, n: f"backer_path='/tmp/summerset.rs_paxos.{r}.wal'+fault_tolerance={n-(n//2+1)}", + "Crossword": lambda r, n: f"backer_path='/tmp/summerset.rs_paxos.{r}.wal'+fault_tolerance=0+shards_per_replica=3", } diff --git a/src/lib.rs b/src/lib.rs index f5cf4126..40bcbf31 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -35,3 +35,5 @@ pub use crate::protocols::SmrProtocol; pub use crate::protocols::{ReplicaConfigRepNothing, ClientConfigRepNothing}; pub use crate::protocols::{ReplicaConfigSimplePush, ClientConfigSimplePush}; pub use crate::protocols::{ReplicaConfigMultiPaxos, ClientConfigMultiPaxos}; +pub use crate::protocols::{ReplicaConfigRSPaxos, ClientConfigRSPaxos}; +pub use crate::protocols::{ReplicaConfigCrossword, ClientConfigCrossword}; diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs new file mode 100644 index 00000000..dfe647ea --- /dev/null +++ b/src/protocols/crossword.rs @@ -0,0 +1,1410 @@ +//! Replication protocol: Crossword. +//! +//! MultiPaxos with flexible Reed-Solomon erasure coding that supports tunable +//! shard groups and asymmetric shard assignment. + +use std::collections::{HashMap, HashSet}; +use std::path::Path; +use std::net::SocketAddr; + +use crate::utils::{SummersetError, ReplicaMap, RSCodeword}; +use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; +use crate::server::{ + ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, + ApiRequest, ApiReply, StorageHub, LogAction, LogResult, LogActionId, + TransportHub, GenericReplica, +}; +use crate::client::{ClientId, ClientApiStub, ClientCtrlStub, GenericEndpoint}; +use crate::protocols::SmrProtocol; + +use async_trait::async_trait; + +use serde::{Serialize, Deserialize}; + +use tokio::time::Duration; + +use reed_solomon_erasure::galois_8::ReedSolomon; + +/// Configuration parameters struct. +#[derive(Debug, Deserialize)] +pub struct ReplicaConfigCrossword { + /// Client request batching interval in microsecs. + pub batch_interval_us: u64, + + /// Client request batching maximum batch size. + pub max_batch_size: usize, + + /// Path to backing file. + pub backer_path: String, + + /// Whether to call `fsync()`/`fdatasync()` on logger. + pub logger_sync: bool, + + /// Fault-tolerance level. + pub fault_tolerance: u8, + + /// Number of shards to assign to each replica. + // TODO: proper config options. 
+    pub shards_per_replica: u8,
+}
+
+#[allow(clippy::derivable_impls)]
+impl Default for ReplicaConfigCrossword {
+    fn default() -> Self {
+        ReplicaConfigCrossword {
+            batch_interval_us: 1000,
+            max_batch_size: 5000,
+            backer_path: "/tmp/summerset.rs_paxos.wal".into(),
+            logger_sync: false,
+            fault_tolerance: 0,
+            shards_per_replica: 1,
+        }
+    }
+}
+
+/// Ballot number type. Use 0 as a null ballot number.
+type Ballot = u64;
+
+/// Instance status enum.
+#[derive(
+    Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Serialize, Deserialize,
+)]
+enum Status {
+    Null = 0,
+    Preparing = 1,
+    Accepting = 2,
+    Committed = 3,
+    Executed = 4,
+}
+
+/// Request batch type (i.e., the "value" in Paxos).
+type ReqBatch = Vec<(ClientId, ApiRequest)>;
+
+/// Leader-side bookkeeping info for each instance initiated.
+#[derive(Debug, Clone)]
+struct LeaderBookkeeping {
+    /// Replicas from which I have received Prepare confirmations.
+    prepare_acks: ReplicaMap,
+
+    /// Max ballot among received Prepare replies.
+    prepare_max_bal: Ballot,
+
+    /// Replicas from which I have received Accept confirmations.
+    accept_acks: ReplicaMap,
+}
+
+/// Follower-side bookkeeping info for each instance received.
+#[derive(Debug, Clone)]
+struct ReplicaBookkeeping {
+    /// Source leader replica ID for replying to Prepares and Accepts.
+    source: ReplicaId,
+}
+
+/// In-memory instance containing a complete batch of commands.
+#[derive(Debug, Clone)]
+struct Instance {
+    /// Ballot number.
+    bal: Ballot,
+
+    /// Instance status.
+    status: Status,
+
+    /// Shards of a batch of client requests.
+    reqs_cw: RSCodeword<ReqBatch>,
+
+    /// Leader-side bookkeeping info.
+    leader_bk: Option<LeaderBookkeeping>,
+
+    /// Follower-side bookkeeping info.
+    replica_bk: Option<ReplicaBookkeeping>,
+}
+
+/// Stable storage log entry type.
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
+enum LogEntry {
+    /// Records an update to the largest prepare ballot seen.
+    PrepareBal { slot: usize, ballot: Ballot },
+
+    /// Records the data shards of a newly accepted request batch at a slot
+    /// index.
+    AcceptData {
+        slot: usize,
+        ballot: Ballot,
+        reqs_cw: RSCodeword<ReqBatch>,
+    },
+
+    /// Records an event of committing the instance at index.
+    CommitSlot { slot: usize },
+}
+
+/// Peer-peer message type.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+enum PeerMsg {
+    /// Prepare message from leader to replicas.
+    Prepare { slot: usize, ballot: Ballot },
+
+    /// Prepare reply from replica to leader.
+    PrepareReply {
+        slot: usize,
+        ballot: Ballot,
+        /// The accepted ballot number for that instance and the corresponding
+        /// request batch value shards known by the replica.
+        voted: Option<(Ballot, RSCodeword<ReqBatch>)>,
+    },
+
+    /// Accept message from leader to replicas.
+    Accept {
+        slot: usize,
+        ballot: Ballot,
+        reqs_cw: RSCodeword<ReqBatch>,
+    },
+
+    /// Accept reply from replica to leader.
+    AcceptReply { slot: usize, ballot: Ballot },
+
+    /// Commit notification from leader to replicas.
+    Commit { slot: usize },
+}
+
+/// Crossword server replica module.
+pub struct CrosswordReplica {
+    /// Replica ID in cluster.
+    id: ReplicaId,
+
+    /// Total number of replicas in cluster.
+    population: u8,
+
+    /// Majority quorum size.
+    quorum_cnt: u8,
+
+    /// Configuration parameters struct.
+    config: ReplicaConfigCrossword,
+
+    /// Address string for client requests API.
+    _api_addr: SocketAddr,
+
+    /// Address string for internal peer-peer communication.
+    _p2p_addr: SocketAddr,
+
+    /// ControlHub module.
+    control_hub: ControlHub,
+
+    /// ExternalApi module.
+    external_api: ExternalApi,
+
+    /// StateMachine module.
+    state_machine: StateMachine,
+
+    /// StorageHub module.
+    storage_hub: StorageHub<LogEntry>,
+
+    /// TransportHub module.
+    transport_hub: TransportHub<PeerMsg>,
+
+    /// Do I think I am the leader?
+    is_leader: bool,
+
+    /// In-memory log of instances.
+    insts: Vec<Instance>,
+
+    /// Largest ballot number that a leader has sent Prepare messages in.
+    bal_prep_sent: Ballot,
+
+    /// Largest ballot number that a leader knows has been safely prepared.
+    bal_prepared: Ballot,
+
+    /// Largest ballot number seen as acceptor.
+    bal_max_seen: Ballot,
+
+    /// Index of the first non-committed instance.
+    commit_bar: usize,
+
+    /// Index of the first non-executed instance.
+    /// It is always true that exec_bar <= commit_bar <= insts.len()
+    exec_bar: usize,
+
+    /// Current durable log file offset.
+    log_offset: usize,
+
+    /// Fixed Reed-Solomon coder.
+    rs_coder: ReedSolomon,
+}
+
+impl CrosswordReplica {
+    /// Compose a unique ballot number from base.
+    fn make_unique_ballot(&self, base: u64) -> Ballot {
+        ((base << 8) | ((self.id + 1) as u64)) as Ballot
+    }
+
+    /// Compose a unique ballot number greater than the given one.
+    fn make_greater_ballot(&self, bal: Ballot) -> Ballot {
+        self.make_unique_ballot((bal >> 8) + 1)
+    }
+
+    /// Compose LogActionId from slot index & entry type.
+    /// Uses the `Status` enum type to represent different entry types.
+    fn make_log_action_id(slot: usize, entry_type: Status) -> LogActionId {
+        let type_num = match entry_type {
+            Status::Preparing => 1,
+            Status::Accepting => 2,
+            Status::Committed => 3,
+            _ => panic!("unknown log entry type {:?}", entry_type),
+        };
+        ((slot << 2) | type_num) as LogActionId
+    }
+
+    /// Decompose LogActionId into slot index & entry type.
+    fn split_log_action_id(log_action_id: LogActionId) -> (usize, Status) {
+        let slot = (log_action_id >> 2) as usize;
+        let type_num = log_action_id & ((1 << 2) - 1);
+        let entry_type = match type_num {
+            1 => Status::Preparing,
+            2 => Status::Accepting,
+            3 => Status::Committed,
+            _ => panic!("unknown log entry type num {}", type_num),
+        };
+        (slot, entry_type)
+    }
+
+    /// Compose CommandId from slot index & command index within.
+    fn make_command_id(slot: usize, cmd_idx: usize) -> CommandId {
+        assert!(slot <= (u32::MAX as usize));
+        assert!(cmd_idx <= (u32::MAX as usize));
+        ((slot << 32) | cmd_idx) as CommandId
+    }
+
+    /// Decompose CommandId into slot index & command index within.
+    fn split_command_id(command_id: CommandId) -> (usize, usize) {
+        let slot = (command_id >> 32) as usize;
+        let cmd_idx = (command_id & ((1 << 32) - 1)) as usize;
+        (slot, cmd_idx)
+    }
+
+    /// TODO: maybe remove this.
+    fn shards_for_replica(
+        id: ReplicaId,
+        population: u8,
+        num_shards: u8,
+    ) -> HashSet<usize> {
+        (id..(id + num_shards))
+            .map(|i| (i % population) as usize)
+            .collect()
+    }
+
+    /// Handler of client request batch chan recv.
+    fn handle_req_batch(
+        &mut self,
+        req_batch: ReqBatch,
+    ) -> Result<(), SummersetError> {
+        let batch_size = req_batch.len();
+        assert!(batch_size > 0);
+        pf_debug!(self.id; "got request batch of size {}", batch_size);
+
+        // if I'm not a leader, ignore client requests
+        if !self.is_leader {
+            for (client, req) in req_batch {
+                if let ApiRequest::Req { id: req_id, ..
} = req { + // tell the client to try on the next replica + let next_replica = (self.id + 1) % self.population; + self.external_api.send_reply( + ApiReply::Reply { + id: req_id, + result: None, + redirect: Some(next_replica), + }, + client, + )?; + pf_trace!(self.id; "redirected client {} to replica {}", + client, next_replica); + } + } + return Ok(()); + } + + // compute the complete Reed-Solomon codeword for the batch data + let mut reqs_cw = RSCodeword::from_data( + req_batch, + self.quorum_cnt as usize, + (self.population - self.quorum_cnt) as usize, + )?; + reqs_cw.compute_parity(Some(&self.rs_coder))?; + + // create a new instance in the first null slot (or append a new one + // at the end if no holes exist) + // TODO: maybe use a null_idx variable to better keep track of this + let mut slot = self.insts.len(); + for s in self.commit_bar..self.insts.len() { + if self.insts[s].status == Status::Null { + slot = s; + break; + } + } + if slot < self.insts.len() { + let old_inst = &mut self.insts[slot]; + assert_eq!(old_inst.status, Status::Null); + old_inst.reqs_cw = reqs_cw; + old_inst.leader_bk = Some(LeaderBookkeeping { + prepare_acks: ReplicaMap::new(self.population, false), + prepare_max_bal: 0, + accept_acks: ReplicaMap::new(self.population, false), + }); + } else { + let new_inst = Instance { + bal: 0, + status: Status::Null, + reqs_cw, + leader_bk: Some(LeaderBookkeeping { + prepare_acks: ReplicaMap::new(self.population, false), + prepare_max_bal: 0, + accept_acks: ReplicaMap::new(self.population, false), + }), + replica_bk: None, + }; + self.insts.push(new_inst); + } + + // decide whether we can enter fast path for this instance + // TODO: remember to reset bal_prepared to 0, update bal_max_seen, + // and re-handle all Preparing & Accepting instances in autonomous + // Prepare initiation + if self.bal_prepared == 0 { + // slow case: Prepare phase not done yet. 
Initiate a Prepare round + // if none is on the fly, or just wait for some Prepare reply to + // trigger my Accept phase + if self.bal_prep_sent == 0 { + self.bal_prep_sent = + self.make_greater_ballot(self.bal_max_seen); + self.bal_max_seen = self.bal_prep_sent; + } + + let inst = &mut self.insts[slot]; + inst.bal = self.bal_prep_sent; + inst.status = Status::Preparing; + pf_debug!(self.id; "enter Prepare phase for slot {} bal {}", + slot, inst.bal); + + // record update to largest prepare ballot + self.storage_hub.submit_action( + Self::make_log_action_id(slot, Status::Preparing), + LogAction::Append { + entry: LogEntry::PrepareBal { + slot, + ballot: self.bal_prep_sent, + }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted PrepareBal log action for slot {} bal {}", + slot, inst.bal); + + // send Prepare messages to all peers + self.transport_hub.bcast_msg( + PeerMsg::Prepare { + slot, + ballot: self.bal_prep_sent, + }, + None, + )?; + pf_trace!(self.id; "broadcast Prepare messages for slot {} bal {}", + slot, inst.bal); + } else { + // normal case: Prepare phase covered, only do the Accept phase + let inst = &mut self.insts[slot]; + inst.bal = self.bal_prepared; + inst.status = Status::Accepting; + pf_debug!(self.id; "enter Accept phase for slot {} bal {}", + slot, inst.bal); + + // record update to largest accepted ballot and corresponding data + self.storage_hub.submit_action( + Self::make_log_action_id(slot, Status::Accepting), + LogAction::Append { + entry: LogEntry::AcceptData { + slot, + ballot: inst.bal, + // persist only some shards on myself + reqs_cw: inst.reqs_cw.subset_copy( + Self::shards_for_replica( + self.id, + self.population, + self.config.shards_per_replica, + ), + false, + )?, + }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted AcceptData log action for slot {} bal {}", + slot, inst.bal); + + // send Accept messages to all peers, each getting its subset of + // shards of data + for peer in 0..self.population { + if peer == self.id { + continue; + } + self.transport_hub.send_msg( + PeerMsg::Accept { + slot, + ballot: inst.bal, + reqs_cw: inst.reqs_cw.subset_copy( + Self::shards_for_replica( + peer, + self.population, + self.config.shards_per_replica, + ), + false, + )?, + }, + peer, + )?; + } + pf_trace!(self.id; "broadcast Accept messages for slot {} bal {}", + slot, inst.bal); + } + + Ok(()) + } + + /// Handler of PrepareBal logging result chan recv. 
+ fn handle_logged_prepare_bal( + &mut self, + slot: usize, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "finished PrepareBal logging for slot {} bal {}", + slot, self.insts[slot].bal); + let inst = &self.insts[slot]; + let voted = if inst.status >= Status::Accepting { + Some((inst.bal, inst.reqs_cw.clone())) + } else { + None + }; + + if self.is_leader { + // on leader, finishing the logging of a PrepareBal entry + // is equivalent to receiving a Prepare reply from myself + // (as an acceptor role) + self.handle_msg_prepare_reply(self.id, slot, inst.bal, voted)?; + } else { + // on follower replica, finishing the logging of a + // PrepareBal entry leads to sending back a Prepare reply + assert!(inst.replica_bk.is_some()); + let source = inst.replica_bk.as_ref().unwrap().source; + self.transport_hub.send_msg( + PeerMsg::PrepareReply { + slot, + ballot: inst.bal, + voted, + }, + source, + )?; + pf_trace!(self.id; "sent PrepareReply -> {} for slot {} bal {}", + source, slot, inst.bal); + } + + Ok(()) + } + + /// Handler of AcceptData logging result chan recv. + fn handle_logged_accept_data( + &mut self, + slot: usize, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "finished AcceptData logging for slot {} bal {}", + slot, self.insts[slot].bal); + let inst = &self.insts[slot]; + + if self.is_leader { + // on leader, finishing the logging of an AcceptData entry + // is equivalent to receiving an Accept reply from myself + // (as an acceptor role) + self.handle_msg_accept_reply(self.id, slot, inst.bal)?; + } else { + // on follower replica, finishing the logging of an + // AcceptData entry leads to sending back an Accept reply + assert!(inst.replica_bk.is_some()); + let source = inst.replica_bk.as_ref().unwrap().source; + self.transport_hub.send_msg( + PeerMsg::AcceptReply { + slot, + ballot: inst.bal, + }, + source, + )?; + pf_trace!(self.id; "sent AcceptReply -> {} for slot {} bal {}", + source, slot, inst.bal); + } + + Ok(()) + } + + /// Handler of CommitSlot logging result chan recv. + fn handle_logged_commit_slot( + &mut self, + slot: usize, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "finished CommitSlot logging for slot {} bal {}", + slot, self.insts[slot].bal); + assert!(self.insts[slot].status >= Status::Committed); + + // update index of the first non-committed instance + if slot == self.commit_bar { + while self.commit_bar < self.insts.len() { + let inst = &mut self.insts[self.commit_bar]; + if inst.status < Status::Committed { + break; + } + + if inst.reqs_cw.avail_shards() < self.quorum_cnt as usize { + // can't execute if I don't have the complete request batch + pf_debug!(self.id; "postponing execution for slot {} (shards {}/{})", + slot, inst.reqs_cw.avail_shards(), self.quorum_cnt); + break; + } else if inst.reqs_cw.avail_data_shards() + < self.quorum_cnt as usize + { + // have enough shards but need reconstruction + inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; + } + let reqs = inst.reqs_cw.get_data()?; + + // submit commands in committed instance to the state machine + // for execution + if reqs.is_empty() { + inst.status = Status::Executed; + } else if inst.status == Status::Committed { + for (cmd_idx, (_, req)) in reqs.iter().enumerate() { + if let ApiRequest::Req { cmd, .. 
} = req {
+                            self.state_machine.submit_cmd(
+                                Self::make_command_id(self.commit_bar, cmd_idx),
+                                cmd.clone(),
+                            )?;
+                        } else {
+                            continue; // ignore other types of requests
+                        }
+                    }
+                    pf_trace!(self.id; "submitted {} exec commands for slot {}",
+                                       reqs.len(), self.commit_bar);
+                }
+
+                self.commit_bar += 1;
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Synthesized handler of durable logging result chan recv.
+    fn handle_log_result(
+        &mut self,
+        action_id: LogActionId,
+        log_result: LogResult<LogEntry>,
+    ) -> Result<(), SummersetError> {
+        let (slot, entry_type) = Self::split_log_action_id(action_id);
+        assert!(slot < self.insts.len());
+
+        if let LogResult::Append { now_size } = log_result {
+            assert!(now_size >= self.log_offset);
+            self.log_offset = now_size;
+        } else {
+            return logged_err!(self.id; "unexpected log result type: {:?}", log_result);
+        }
+
+        match entry_type {
+            Status::Preparing => self.handle_logged_prepare_bal(slot),
+            Status::Accepting => self.handle_logged_accept_data(slot),
+            Status::Committed => self.handle_logged_commit_slot(slot),
+            _ => {
+                logged_err!(self.id; "unexpected log entry type: {:?}", entry_type)
+            }
+        }
+    }
+
+    /// Handler of Prepare message from leader.
+    fn handle_msg_prepare(
+        &mut self,
+        peer: ReplicaId,
+        slot: usize,
+        ballot: Ballot,
+    ) -> Result<(), SummersetError> {
+        pf_trace!(self.id; "received Prepare <- {} for slot {} bal {}",
+                           peer, slot, ballot);
+
+        // if ballot is not smaller than what I have seen:
+        if ballot >= self.bal_max_seen {
+            // locate instance in memory, filling in null instances if needed
+            while self.insts.len() <= slot {
+                self.insts.push(Instance {
+                    bal: 0,
+                    status: Status::Null,
+                    reqs_cw: RSCodeword::<ReqBatch>::from_null(
+                        self.quorum_cnt as usize,
+                        (self.population - self.quorum_cnt) as usize,
+                    )?,
+                    leader_bk: None,
+                    replica_bk: None,
+                });
+            }
+            let inst = &mut self.insts[slot];
+            assert!(inst.bal <= ballot);
+
+            inst.bal = ballot;
+            inst.status = Status::Preparing;
+            inst.replica_bk = Some(ReplicaBookkeeping { source: peer });
+
+            // update largest ballot seen
+            self.bal_max_seen = ballot;
+
+            // record update to largest prepare ballot
+            self.storage_hub.submit_action(
+                Self::make_log_action_id(slot, Status::Preparing),
+                LogAction::Append {
+                    entry: LogEntry::PrepareBal { slot, ballot },
+                    sync: self.config.logger_sync,
+                },
+            )?;
+            pf_trace!(self.id; "submitted PrepareBal log action for slot {} bal {}",
+                               slot, ballot);
+        }
+
+        Ok(())
+    }
+
+    /// Handler of Prepare reply from replica.
+    fn handle_msg_prepare_reply(
+        &mut self,
+        peer: ReplicaId,
+        slot: usize,
+        ballot: Ballot,
+        voted: Option<(Ballot, RSCodeword<ReqBatch>)>,
+    ) -> Result<(), SummersetError> {
+        pf_trace!(self.id; "received PrepareReply <- {} for slot {} bal {} shards {:?}",
+                           peer, slot, ballot,
+                           voted.as_ref().map(|(_, cw)| cw.avail_shards_set()));
+
+        // if ballot is what I'm currently waiting on for Prepare replies:
+        if ballot == self.bal_prep_sent {
+            assert!(slot < self.insts.len());
+            let inst = &mut self.insts[slot];
+
+            // ignore spurious duplications and outdated replies
+            if (inst.status != Status::Preparing) || (ballot < inst.bal) {
+                return Ok(());
+            }
+            assert_eq!(inst.bal, ballot);
+            assert!(self.bal_max_seen >= ballot);
+            assert!(inst.leader_bk.is_some());
+            let leader_bk = inst.leader_bk.as_mut().unwrap();
+            if leader_bk.prepare_acks.get(peer)?
{ + return Ok(()); + } + + // bookkeep this Prepare reply + leader_bk.prepare_acks.set(peer, true)?; + if let Some((bal, val)) = voted { + #[allow(clippy::comparison_chain)] + if bal > leader_bk.prepare_max_bal { + // is of ballot > current maximum, so discard the current + // codeword and take the replied codeword + leader_bk.prepare_max_bal = bal; + inst.reqs_cw = val; + } else if bal == leader_bk.prepare_max_bal { + // is of ballot == the one currently taken, so merge the + // replied codeword into the current one + inst.reqs_cw.absorb_other(val)?; + } + } + + // if quorum size reached AND enough shards are known to + // reconstruct the original data, enter Accept phase for this + // instance using the request batch value constructed using shards + // with the highest ballot number in quorum + if leader_bk.prepare_acks.count() >= self.quorum_cnt + && inst.reqs_cw.avail_data_shards() >= self.quorum_cnt as usize + { + inst.status = Status::Accepting; + pf_debug!(self.id; "enter Accept phase for slot {} bal {}", + slot, inst.bal); + + // update bal_prepared + assert!(self.bal_prepared <= ballot); + self.bal_prepared = ballot; + + // if parity shards not computed yet, compute them now + if inst.reqs_cw.avail_shards() < self.population as usize { + inst.reqs_cw.compute_parity(Some(&self.rs_coder))?; + } + + // record update to largest accepted ballot and corresponding data + self.storage_hub.submit_action( + Self::make_log_action_id(slot, Status::Accepting), + LogAction::Append { + entry: LogEntry::AcceptData { + slot, + ballot, + reqs_cw: inst.reqs_cw.subset_copy( + Self::shards_for_replica( + self.id, + self.population, + self.config.shards_per_replica, + ), + false, + )?, + }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted AcceptData log action for slot {} bal {}", + slot, ballot); + + // send Accept messages to all peers + for peer in 0..self.population { + if peer == self.id { + continue; + } + self.transport_hub.send_msg( + PeerMsg::Accept { + slot, + ballot, + reqs_cw: inst.reqs_cw.subset_copy( + Self::shards_for_replica( + peer, + self.population, + self.config.shards_per_replica, + ), + false, + )?, + }, + peer, + )?; + } + pf_trace!(self.id; "broadcast Accept messages for slot {} bal {}", + slot, ballot); + } + } + + Ok(()) + } + + /// Handler of Accept message from leader. 
+    fn handle_msg_accept(
+        &mut self,
+        peer: ReplicaId,
+        slot: usize,
+        ballot: Ballot,
+        reqs_cw: RSCodeword<ReqBatch>,
+    ) -> Result<(), SummersetError> {
+        pf_trace!(self.id; "received Accept <- {} for slot {} bal {} shards {:?}",
+                           peer, slot, ballot, reqs_cw.avail_shards_set());
+
+        // if ballot is not smaller than what I have made promises for:
+        if ballot >= self.bal_max_seen {
+            // locate instance in memory, filling in null instances if needed
+            while self.insts.len() <= slot {
+                self.insts.push(Instance {
+                    bal: 0,
+                    status: Status::Null,
+                    reqs_cw: RSCodeword::<ReqBatch>::from_null(
+                        self.quorum_cnt as usize,
+                        (self.population - self.quorum_cnt) as usize,
+                    )?,
+                    leader_bk: None,
+                    replica_bk: None,
+                });
+            }
+            let inst = &mut self.insts[slot];
+            assert!(inst.bal <= ballot);
+
+            inst.bal = ballot;
+            inst.status = Status::Accepting;
+            inst.reqs_cw = reqs_cw;
+            inst.replica_bk = Some(ReplicaBookkeeping { source: peer });
+
+            // update largest ballot seen
+            self.bal_max_seen = ballot;
+
+            // record update to largest accepted ballot and corresponding data
+            self.storage_hub.submit_action(
+                Self::make_log_action_id(slot, Status::Accepting),
+                LogAction::Append {
+                    entry: LogEntry::AcceptData {
+                        slot,
+                        ballot,
+                        reqs_cw: inst.reqs_cw.clone(),
+                    },
+                    sync: self.config.logger_sync,
+                },
+            )?;
+            pf_trace!(self.id; "submitted AcceptData log action for slot {} bal {}",
+                               slot, ballot);
+        }
+
+        Ok(())
+    }
+
+    /// Handler of Accept reply from replica.
+    fn handle_msg_accept_reply(
+        &mut self,
+        peer: ReplicaId,
+        slot: usize,
+        ballot: Ballot,
+    ) -> Result<(), SummersetError> {
+        pf_trace!(self.id; "received AcceptReply <- {} for slot {} bal {}",
+                           peer, slot, ballot);
+
+        // if ballot is what I'm currently waiting on for Accept replies:
+        if ballot == self.bal_prepared {
+            assert!(slot < self.insts.len());
+            let inst = &mut self.insts[slot];
+
+            // ignore spurious duplications and outdated replies
+            if (inst.status != Status::Accepting) || (ballot < inst.bal) {
+                return Ok(());
+            }
+            assert_eq!(inst.bal, ballot);
+            assert!(self.bal_max_seen >= ballot);
+            assert!(inst.leader_bk.is_some());
+            let leader_bk = inst.leader_bk.as_mut().unwrap();
+            if leader_bk.accept_acks.get(peer)? {
+                return Ok(());
+            }
+
+            // bookkeep this Accept reply
+            leader_bk.accept_acks.set(peer, true)?;
+
+            // if quorum size reached AND enough shards are remembered, mark
+            // this instance as committed; as in RS-Paxos, this means
+            // accept_acks.count() >= self.quorum_cnt + fault_tolerance
+            if leader_bk.accept_acks.count()
+                >= self.quorum_cnt + self.config.fault_tolerance
+            {
+                inst.status = Status::Committed;
+                pf_debug!(self.id; "committed instance at slot {} bal {}",
+                                   slot, inst.bal);
+
+                // record commit event
+                self.storage_hub.submit_action(
+                    Self::make_log_action_id(slot, Status::Committed),
+                    LogAction::Append {
+                        entry: LogEntry::CommitSlot { slot },
+                        sync: self.config.logger_sync,
+                    },
+                )?;
+                pf_trace!(self.id; "submitted CommitSlot log action for slot {} bal {}",
+                                   slot, inst.bal);
+
+                // send Commit messages to all peers
+                self.transport_hub
+                    .bcast_msg(PeerMsg::Commit { slot }, None)?;
+                pf_trace!(self.id; "broadcast Commit messages for slot {} bal {}",
+                                   slot, ballot);
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Handler of Commit message from leader.
+    /// TODO: take care of missing/lost Commit messages
+    fn handle_msg_commit(
+        &mut self,
+        peer: ReplicaId,
+        slot: usize,
+    ) -> Result<(), SummersetError> {
+        pf_trace!(self.id; "received Commit <- {} for slot {}", peer, slot);
+
+        // locate instance in memory, filling in null instances if needed
+        while self.insts.len() <= slot {
+            self.insts.push(Instance {
+                bal: 0,
+                status: Status::Null,
+                reqs_cw: RSCodeword::<ReqBatch>::from_null(
+                    self.quorum_cnt as usize,
+                    (self.population - self.quorum_cnt) as usize,
+                )?,
+                leader_bk: None,
+                replica_bk: None,
+            });
+        }
+        let inst = &mut self.insts[slot];
+
+        // ignore spurious duplications
+        if inst.status != Status::Accepting {
+            return Ok(());
+        }
+
+        // mark this instance as committed
+        inst.status = Status::Committed;
+        pf_debug!(self.id; "committed instance at slot {} bal {}",
+                           slot, inst.bal);
+
+        // record commit event
+        self.storage_hub.submit_action(
+            Self::make_log_action_id(slot, Status::Committed),
+            LogAction::Append {
+                entry: LogEntry::CommitSlot { slot },
+                sync: self.config.logger_sync,
+            },
+        )?;
+        pf_trace!(self.id; "submitted CommitSlot log action for slot {} bal {}",
+                           slot, inst.bal);
+
+        Ok(())
+    }
+
+    /// Synthesized handler of receiving message from peer.
+    fn handle_msg_recv(
+        &mut self,
+        peer: ReplicaId,
+        msg: PeerMsg,
+    ) -> Result<(), SummersetError> {
+        match msg {
+            PeerMsg::Prepare { slot, ballot } => {
+                self.handle_msg_prepare(peer, slot, ballot)
+            }
+            PeerMsg::PrepareReply {
+                slot,
+                ballot,
+                voted,
+            } => self.handle_msg_prepare_reply(peer, slot, ballot, voted),
+            PeerMsg::Accept {
+                slot,
+                ballot,
+                reqs_cw,
+            } => self.handle_msg_accept(peer, slot, ballot, reqs_cw),
+            PeerMsg::AcceptReply { slot, ballot } => {
+                self.handle_msg_accept_reply(peer, slot, ballot)
+            }
+            PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot),
+        }
+    }
+
+    /// Handler of state machine exec result chan recv.
+    fn handle_cmd_result(
+        &mut self,
+        cmd_id: CommandId,
+        cmd_result: CommandResult,
+    ) -> Result<(), SummersetError> {
+        let (slot, cmd_idx) = Self::split_command_id(cmd_id);
+        assert!(slot < self.insts.len());
+        pf_trace!(self.id; "executed cmd in instance at slot {} idx {}",
+                           slot, cmd_idx);
+
+        let inst = &mut self.insts[slot];
+        let reqs = inst.reqs_cw.get_data()?;
+        assert!(cmd_idx < reqs.len());
+        let (client, ref req) = reqs[cmd_idx];
+
+        // reply command result back to client
+        if let ApiRequest::Req { id: req_id, .. } = req {
+            if self.external_api.has_client(client) {
+                self.external_api.send_reply(
+                    ApiReply::Reply {
+                        id: *req_id,
+                        result: Some(cmd_result),
+                        redirect: None,
+                    },
+                    client,
+                )?;
+                pf_trace!(self.id; "replied -> client {} for slot {} idx {}",
+                                   client, slot, cmd_idx);
+            }
+        } else {
+            return logged_err!(self.id; "unexpected API request type");
+        }
+
+        // if all commands in this instance have been executed, set status to
+        // Executed and update `exec_bar`
+        if cmd_idx == reqs.len() - 1 {
+            inst.status = Status::Executed;
+            pf_debug!(self.id; "executed all cmds in instance at slot {}",
+                               slot);
+
+            // update index of the first non-executed instance
+            if slot == self.exec_bar {
+                while self.exec_bar < self.insts.len() {
+                    let inst = &mut self.insts[self.exec_bar];
+                    if inst.status < Status::Executed {
+                        break;
+                    }
+                    self.exec_bar += 1;
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Synthesized handler of manager control messages.
+    fn handle_ctrl_msg(&mut self, _msg: CtrlMsg) -> Result<(), SummersetError> {
+        // TODO: fill this when more control message types added
+        Ok(())
+    }
+}
+
+#[async_trait]
+impl GenericReplica for CrosswordReplica {
+    async fn new_and_setup(
+        api_addr: SocketAddr,
+        p2p_addr: SocketAddr,
+        manager: SocketAddr,
+        config_str: Option<&str>,
+    ) -> Result<Self, SummersetError> {
+        let config = parsed_config!(config_str => ReplicaConfigCrossword;
+                                    batch_interval_us, max_batch_size,
+                                    backer_path, logger_sync, fault_tolerance,
+                                    shards_per_replica)?;
+        // connect to the cluster manager and get assigned a server ID
+        let mut control_hub = ControlHub::new_and_setup(manager).await?;
+        let id = control_hub.me;
+
+        if config.batch_interval_us == 0 {
+            return logged_err!(
+                id;
+                "invalid config.batch_interval_us '{}'",
+                config.batch_interval_us
+            );
+        }
+
+        // ask for population number and the list of peers to proactively
+        // connect to
+        control_hub.send_ctrl(CtrlMsg::NewServerJoin {
+            id,
+            protocol: SmrProtocol::Crossword,
+            api_addr,
+            p2p_addr,
+        })?;
+        let (population, to_peers) = if let CtrlMsg::ConnectToPeers {
+            population,
+            to_peers,
+        } = control_hub.recv_ctrl().await?
+        {
+            (population, to_peers)
+        } else {
+            return logged_err!(id; "unexpected ctrl msg type received");
+        };
+
+        // create a Reed-Solomon coder with num_data_shards == quorum size and
+        // num_parity_shards == population - quorum size
+        let quorum_cnt = (population / 2) + 1;
+        if config.fault_tolerance > (population - quorum_cnt) {
+            return logged_err!(id; "invalid config.fault_tolerance '{}'",
+                                   config.fault_tolerance);
+        }
+        if config.shards_per_replica == 0
+            || config.shards_per_replica > quorum_cnt
+        {
+            return logged_err!(id; "invalid config.shards_per_replica '{}'",
+                                   config.shards_per_replica);
+        }
+        let rs_coder = ReedSolomon::new(
+            quorum_cnt as usize,
+            (population - quorum_cnt) as usize,
+        )?;
+
+        let state_machine = StateMachine::new_and_setup(id).await?;
+
+        let storage_hub =
+            StorageHub::new_and_setup(id, Path::new(&config.backer_path))
+                .await?;
+
+        let mut transport_hub =
+            TransportHub::new_and_setup(id, population, p2p_addr).await?;
+
+        // proactively connect to some peers, then wait until everyone in the
+        // population has connected with me
+        for (peer, addr) in to_peers {
+            transport_hub.connect_to_peer(peer, addr).await?;
+        }
+        transport_hub.wait_for_group(population).await?;
+
+        let external_api = ExternalApi::new_and_setup(
+            id,
+            api_addr,
+            Duration::from_micros(config.batch_interval_us),
+            config.max_batch_size,
+        )
+        .await?;
+
+        Ok(CrosswordReplica {
+            id,
+            population,
+            quorum_cnt,
+            config,
+            _api_addr: api_addr,
+            _p2p_addr: p2p_addr,
+            control_hub,
+            external_api,
+            state_machine,
+            storage_hub,
+            transport_hub,
+            is_leader: false,
+            insts: vec![],
+            bal_prep_sent: 0,
+            bal_prepared: 0,
+            bal_max_seen: 0,
+            commit_bar: 0,
+            exec_bar: 0,
+            log_offset: 0,
+            rs_coder,
+        })
+    }
+
+    async fn run(&mut self) {
+        // TODO: proper leader election
+        if self.id == 0 {
+            self.is_leader = true;
+        }
+
+        loop {
+            tokio::select! {
+                // client request batch
+                req_batch = self.external_api.get_req_batch() => {
+                    if let Err(e) = req_batch {
+                        pf_error!(self.id; "error getting req batch: {}", e);
+                        continue;
+                    }
+                    let req_batch = req_batch.unwrap();
+                    if let Err(e) = self.handle_req_batch(req_batch) {
+                        pf_error!(self.id; "error handling req batch: {}", e);
+                    }
+                },
+
+                // durable logging result
+                log_result = self.storage_hub.get_result() => {
+                    if let Err(e) = log_result {
+                        pf_error!(self.id; "error getting log result: {}", e);
+                        continue;
+                    }
+                    let (action_id, log_result) = log_result.unwrap();
+                    if let Err(e) = self.handle_log_result(action_id, log_result) {
+                        pf_error!(self.id; "error handling log result {}: {}",
+                                           action_id, e);
+                    }
+                },
+
+                // message from peer
+                msg = self.transport_hub.recv_msg() => {
+                    if let Err(e) = msg {
+                        pf_error!(self.id; "error receiving peer msg: {}", e);
+                        continue;
+                    }
+                    let (peer, msg) = msg.unwrap();
+                    if let Err(e) = self.handle_msg_recv(peer, msg) {
+                        pf_error!(self.id; "error handling msg recv <- {}: {}", peer, e);
+                    }
+                }
+
+                // state machine execution result
+                cmd_result = self.state_machine.get_result() => {
+                    if let Err(e) = cmd_result {
+                        pf_error!(self.id; "error getting cmd result: {}", e);
+                        continue;
+                    }
+                    let (cmd_id, cmd_result) = cmd_result.unwrap();
+                    if let Err(e) = self.handle_cmd_result(cmd_id, cmd_result) {
+                        pf_error!(self.id; "error handling cmd result {}: {}", cmd_id, e);
+                    }
+                },
+
+                // manager control message
+                ctrl_msg = self.control_hub.recv_ctrl() => {
+                    if let Err(e) = ctrl_msg {
+                        pf_error!(self.id; "error getting ctrl msg: {}", e);
+                        continue;
+                    }
+                    let ctrl_msg = ctrl_msg.unwrap();
+                    if let Err(e) = self.handle_ctrl_msg(ctrl_msg) {
+                        pf_error!(self.id; "error handling ctrl msg: {}", e);
+                    }
+                }
+            }
+        }
+    }
+}
+
+/// Configuration parameters struct.
+#[derive(Debug, Deserialize)]
+pub struct ClientConfigCrossword {
+    /// Which server to pick initially.
+    pub init_server_id: ReplicaId,
+}
+
+#[allow(clippy::derivable_impls)]
+impl Default for ClientConfigCrossword {
+    fn default() -> Self {
+        ClientConfigCrossword { init_server_id: 0 }
+    }
+}
+
+/// Crossword client-side module.
+pub struct CrosswordClient {
+    /// Client ID.
+    id: ClientId,
+
+    /// Address of the cluster manager oracle.
+    manager: SocketAddr,
+
+    /// Configuration parameters struct.
+    _config: ClientConfigCrossword,
+
+    /// Cached list of active servers information.
+    servers: HashMap<ReplicaId, SocketAddr>,
+
+    /// Current server ID to connect to.
+    server_id: ReplicaId,
+
+    /// Control API stub to the cluster manager.
+    ctrl_stub: Option<ClientCtrlStub>,
+
+    /// API stub for communicating with the current server.
+    api_stub: Option<ClientApiStub>,
+}
+
+#[async_trait]
+impl GenericEndpoint for CrosswordClient {
+    fn new(
+        manager: SocketAddr,
+        config_str: Option<&str>,
+    ) -> Result<Self, SummersetError> {
+        let config = parsed_config!(config_str => ClientConfigCrossword;
+                                    init_server_id)?;
+        let init_server_id = config.init_server_id;
+
+        Ok(CrosswordClient {
+            id: 255, // nil at this time
+            manager,
+            _config: config,
+            servers: HashMap::new(),
+            server_id: init_server_id,
+            ctrl_stub: None,
+            api_stub: None,
+        })
+    }
+
+    async fn connect(&mut self) -> Result<ClientId, SummersetError> {
+        // disallow reconnection without leaving
+        if self.api_stub.is_some() {
+            return logged_err!(self.id; "reconnecting without leaving");
+        }
+
+        // if ctrl_stub not established yet, connect to the manager
+        if self.ctrl_stub.is_none() {
+            let ctrl_stub =
+                ClientCtrlStub::new_by_connect(self.manager).await?;
+            self.id = ctrl_stub.id;
+            self.ctrl_stub = Some(ctrl_stub);
+        }
+        let ctrl_stub = self.ctrl_stub.as_mut().unwrap();
+
+        // ask the manager about the list of active servers
+        let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?;
+        while !sent {
+            sent = ctrl_stub.send_req(None)?;
+        }
+
+        let reply = ctrl_stub.recv_reply().await?;
+        match reply {
+            CtrlReply::QueryInfo { servers } => {
+                // connect to the one with server ID in config
+                let api_stub = ClientApiStub::new_by_connect(
+                    self.id,
+                    servers[&self.server_id],
+                )
+                .await?;
+                self.api_stub = Some(api_stub);
+                self.servers = servers;
+                Ok(self.id)
+            }
+            _ => logged_err!(self.id; "unexpected reply type received"),
+        }
+    }
+
+    async fn leave(&mut self, permanent: bool) -> Result<(), SummersetError> {
+        // send leave notification to current connected server
+        if let Some(mut api_stub) = self.api_stub.take() {
+            let mut sent = api_stub.send_req(Some(&ApiRequest::Leave))?;
+            while !sent {
+                sent = api_stub.send_req(None)?;
+            }
+
+            let reply = api_stub.recv_reply().await?;
+            match reply {
+                ApiReply::Leave => {
+                    pf_info!(self.id; "left current server connection");
+                    api_stub.forget();
+                }
+                _ => {
+                    return logged_err!(self.id; "unexpected reply type received");
+                }
+            }
+        }
+
+        // if permanently leaving, send leave notification to the manager
+        if permanent {
+            // disallow multiple permanent leaving
+            if self.ctrl_stub.is_none() {
+                return logged_err!(self.id; "repeated permanent leaving");
+            }
+
+            if let Some(mut ctrl_stub) = self.ctrl_stub.take() {
+                let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::Leave))?;
+                while !sent {
+                    sent = ctrl_stub.send_req(None)?;
+                }
+
+                let reply = ctrl_stub.recv_reply().await?;
+                match reply {
+                    CtrlReply::Leave => {
+                        pf_info!(self.id; "left current manager connection");
+                        ctrl_stub.forget();
+                    }
+                    _ => {
+                        return logged_err!(self.id; "unexpected reply type received");
+                    }
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    fn send_req(
+        &mut self,
+        req: Option<&ApiRequest>,
+    ) -> Result<bool, SummersetError> {
+        match self.api_stub {
+            Some(ref mut api_stub) => api_stub.send_req(req),
+            None => logged_err!(self.id; "client is not set up"),
+        }
+    }
+
+    async fn recv_reply(&mut self) -> Result<ApiReply, SummersetError> {
+        match self.api_stub {
+            Some(ref mut api_stub) => {
+                let reply = api_stub.recv_reply().await?;
+
+                if let ApiReply::Reply {
+                    ref result,
+                    ref redirect,
+                    ..
+ } = reply + { + // if the current server redirects me to a different server + if result.is_none() && redirect.is_some() { + let redirect_id = redirect.unwrap(); + assert!(self.servers.contains_key(&redirect_id)); + self.leave(false).await?; + self.server_id = redirect_id; + self.connect().await?; + pf_debug!(self.id; "redirected to replica {} '{}'", + redirect_id, self.servers[&redirect_id]); + } + } + + Ok(reply) + } + None => logged_err!(self.id; "client is not set up"), + } + } +} diff --git a/src/protocols/mod.rs b/src/protocols/mod.rs index 36d0ea1a..98ecf371 100644 --- a/src/protocols/mod.rs +++ b/src/protocols/mod.rs @@ -26,6 +26,10 @@ mod rs_paxos; use rs_paxos::{RSPaxosReplica, RSPaxosClient}; pub use rs_paxos::{ReplicaConfigRSPaxos, ClientConfigRSPaxos}; +mod crossword; +use crossword::{CrosswordReplica, CrosswordClient}; +pub use crossword::{ReplicaConfigCrossword, ClientConfigCrossword}; + /// Enum of supported replication protocol types. #[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] pub enum SmrProtocol { @@ -33,6 +37,7 @@ pub enum SmrProtocol { SimplePush, MultiPaxos, RSPaxos, + Crossword, } /// Helper macro for saving boilder-plate `Box` mapping in @@ -52,6 +57,7 @@ impl SmrProtocol { "SimplePush" => Some(Self::SimplePush), "MultiPaxos" => Some(Self::MultiPaxos), "RSPaxos" => Some(Self::RSPaxos), + "Crossword" => Some(Self::Crossword), _ => None, } } @@ -108,6 +114,14 @@ impl SmrProtocol { .await ) } + Self::Crossword => { + box_if_ok!( + CrosswordReplica::new_and_setup( + api_addr, p2p_addr, manager, config_str + ) + .await + ) + } } } @@ -130,6 +144,9 @@ impl SmrProtocol { Self::RSPaxos => { box_if_ok!(RSPaxosClient::new(manager, config_str)) } + Self::Crossword => { + box_if_ok!(CrosswordClient::new(manager, config_str)) + } } } } @@ -159,6 +176,7 @@ mod protocols_name_tests { valid_name_test!(SimplePush); valid_name_test!(MultiPaxos); valid_name_test!(RSPaxos); + valid_name_test!(Crossword); } #[test] diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 1dade1c0..e47993de 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -417,7 +417,7 @@ impl RSPaxosReplica { pf_trace!(self.id; "submitted AcceptData log action for slot {} bal {}", slot, inst.bal); - // send Accept messages to all peers, each getting on shard of data + // send Accept messages to all peers, each getting one shard of data for peer in 0..self.population { if peer == self.id { continue; From 2eb416c7b0452cb091f6ff43b1a2cfe6da6fa06c Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Tue, 29 Aug 2023 14:24:47 +0800 Subject: [PATCH 04/21] updates to benchmark scripts --- scripts/local_bench.tmp.py | 76 +++++++++++++++++++++++++++++------ scripts/local_client.py | 22 ++++------- scripts/local_cluster.py | 77 ++++++++++++++++++++++++++---------- scripts/set_tcp_buf_sizes.sh | 10 +++++ 4 files changed, 138 insertions(+), 47 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 6f1ceb78..c20ad6d5 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -1,5 +1,5 @@ import subprocess -import time +import itertools import statistics @@ -24,7 +24,7 @@ def kill_all_matching(name): proc.wait() -def launch_cluster(protocol, num_replicas): +def launch_cluster(protocol, num_replicas, config): cmd = [ "python3", "./scripts/local_cluster.py", @@ -34,9 +34,25 @@ def launch_cluster(protocol, num_replicas): str(num_replicas), "-r", ] + if config is not None and len(config) > 0: + cmd += ["--config", config] 
    return run_process(cmd)


+def wait_cluster_setup(proc, num_replicas):
+    accepting_clients = [False for _ in range(num_replicas)]
+
+    for line in iter(proc.stderr.readline, b""):
+        l = line.decode()
+        if "manager" not in l and "accepting clients" in l:
+            replica = int(l[l.find("(") + 1 : l.find(")")])
+            assert not accepting_clients[replica]
+            accepting_clients[replica] = True
+
+        if accepting_clients.count(True) == num_replicas:
+            break
+
+
 def run_bench_client(protocol, value_size, put_ratio, length_s):
     cmd = [
         "python3",
         "./scripts/local_client.py",
@@ -84,7 +100,15 @@ def parse_output(output):
     print(f" std tpt {std_tpt:9.2f} lat {std_lat:9.2f}")


-def bench_round(protocol, num_replicas, value_size, put_ratio, length_s):
+def bench_round(
+    protocol,
+    num_replicas,
+    value_size,
+    put_ratio,
+    length_s,
+    fault_tolerance=None,
+    shards_per_replica=None,
+):
     print(
         f"{protocol:<10s} n={num_replicas:1d} v={value_size:<9d} w%={put_ratio:<3d} {length_s:3d}s"
     )

     kill_all_matching("summerset_server")
     kill_all_matching("summerset_manager")

-    proc_cluster = launch_cluster(protocol, num_replicas)
-    time.sleep(15)
+    configs = []
+    if fault_tolerance is not None:
+        configs.append(f"fault_tolerance={fault_tolerance}")
+    if shards_per_replica is not None:
+        configs.append(f"shards_per_replica={shards_per_replica}")
+    proc_cluster = launch_cluster(protocol, num_replicas, "+".join(configs))
+    wait_cluster_setup(proc_cluster, num_replicas)

     proc_client = run_bench_client(protocol, value_size, put_ratio, length_s)
     out, err = proc_client.communicate()
@@ -110,10 +139,33 @@
 if __name__ == "__main__":
     do_cargo_build()

-    for num_replicas in (3, 5, 7):
-        for value_size in (1024, 65536, 4194304):
-            for protocol in ("MultiPaxos", "RSPaxos"):
-                bench_round(protocol, num_replicas, value_size, 100, 60)
-
-    bench_round("MultiPaxos", 7, 4194304, 10, 60)
-    bench_round("RSPaxos", 7, 4194304, 10, 60)
+    def all_protocol_configs(num_replicas):
+        quorum_cnt = num_replicas // 2 + 1
+        max_fault_tolerance = num_replicas - quorum_cnt
+
+        config_choices = [("MultiPaxos", None, None)]
+        for shards_per_replica in range(quorum_cnt, 0, -1):
+            config_choices.append(
+                ("Crossword", max_fault_tolerance, shards_per_replica)
+            )
+        config_choices.append(("Crossword", 0, 1))
+
+        return config_choices
+
+    # for num_replicas in (3, 5, 7):
+    #     for value_size in (1024, 65536, 4194304):
+    #         for protocol, fault_tolerance, shards_per_replica in all_protocol_configs(
+    #             num_replicas
+    #         ):
+    #             bench_round(
+    #                 protocol,
+    #                 num_replicas,
+    #                 value_size,
+    #                 100,
+    #                 60,
+    #                 fault_tolerance=fault_tolerance,
+    #                 shards_per_replica=shards_per_replica,
+    #             )
+
+    bench_round("MultiPaxos", 5, 65536, 0, 60)
+    # bench_round("Crossword", 5, 65536, 0, 60, fault_tolerance=0, shards_per_replica=1)
diff --git a/scripts/local_client.py b/scripts/local_client.py
index 2f9c2c6d..f8ac5981 100644
--- a/scripts/local_client.py
+++ b/scripts/local_client.py
@@ -21,15 +21,6 @@ def run_process(cmd):
 MANAGER_CLI_PORT = 52601


-PROTOCOL_CONFIGS = {
-    "RepNothing": "",
-    "SimplePush": "",
-    "MultiPaxos": "",
-    "RSPaxos": "",
-    "Crossword": "",
-}
-
-
 UTILITY_PARAM_NAMES = {
     "repl": [],
     "bench": ["freq_target", "value_size", "put_ratio", "length_s"],
@@ -63,7 +54,7 @@ def compose_client_cmd(protocol, manager, config, utility, params, release):
         "-m",
         manager,
     ]
-    if len(config) > 0:
+    if config is not None and len(config) > 0:
        cmd +=
["--config", config] cmd += ["-u", utility] @@ -77,11 +68,11 @@ def compose_client_cmd(protocol, manager, config, utility, params, release): return cmd -def run_client(protocol, utility, params, release): +def run_client(protocol, utility, params, release, config): cmd = compose_client_cmd( protocol, f"127.0.0.1:{MANAGER_CLI_PORT}", - PROTOCOL_CONFIGS[protocol], + config, utility, params, release, @@ -97,6 +88,9 @@ def run_client(protocol, utility, params, release): "-p", "--protocol", type=str, required=True, help="protocol name" ) parser.add_argument("-r", "--release", action="store_true", help="run release mode") + parser.add_argument( + "-c", "--config", type=str, help="protocol-specific TOML config string" + ) subparsers = parser.add_subparsers( required=True, @@ -129,9 +123,6 @@ def run_client(protocol, utility, params, release): args = parser.parse_args() - if args.protocol not in PROTOCOL_CONFIGS: - raise ValueError(f"unknown protocol name '{args.protocol}'") - # build everything do_cargo_build(args.release) @@ -141,6 +132,7 @@ def run_client(protocol, utility, params, release): args.utility, glue_params_str(args, UTILITY_PARAM_NAMES[args.utility]), args.release, + args.config, ) rc = client_proc.wait() diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index b7dcdb25..ffbdff15 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -1,7 +1,6 @@ import sys import argparse import subprocess -import time from pathlib import Path @@ -14,9 +13,13 @@ def do_cargo_build(release): proc.wait() -def run_process(cmd): +def run_process(cmd, capture_stderr=False): print("Run:", " ".join(cmd)) - proc = subprocess.Popen(cmd) + proc = None + if capture_stderr: + proc = subprocess.Popen(cmd, stderr=subprocess.PIPE) + else: + proc = subprocess.Popen(cmd) return proc @@ -35,15 +38,28 @@ def kill_all_matching(name): SERVER_P2P_PORT = lambda r: 52800 + r -PROTOCOL_CONFIGS = { - "RepNothing": lambda r, n: f"backer_path='/tmp/summerset.rep_nothing.{r}.wal'", - "SimplePush": lambda r, n: f"backer_path='/tmp/summerset.simple_push.{r}.wal'+rep_degree={n-1}", - "MultiPaxos": lambda r, n: f"backer_path='/tmp/summerset.multipaxos.{r}.wal'", - "RSPaxos": lambda r, n: f"backer_path='/tmp/summerset.rs_paxos.{r}.wal'+fault_tolerance={n-(n//2+1)}", - "Crossword": lambda r, n: f"backer_path='/tmp/summerset.rs_paxos.{r}.wal'+fault_tolerance=0+shards_per_replica=3", +PROTOCOL_BACKER_PATH = { + "RepNothing": lambda r: f"backer_path='/tmp/summerset.rep_nothing.{r}.wal'", + "SimplePush": lambda r: f"backer_path='/tmp/summerset.simple_push.{r}.wal'", + "MultiPaxos": lambda r: f"backer_path='/tmp/summerset.multipaxos.{r}.wal'", + "RSPaxos": lambda r: f"backer_path='/tmp/summerset.rs_paxos.{r}.wal'", + "Crossword": lambda r: f"backer_path='/tmp/summerset.crossword.{r}.wal'", } +def config_with_backer_path(protocol, config, replica): + result_config = PROTOCOL_BACKER_PATH[protocol](replica) + + if config is not None and len(config) > 0: + if "backer_path" in config: + result_config = config # use user-supplied path + else: + result_config += "+" + result_config += config + + return result_config + + def compose_manager_cmd(protocol, srv_port, cli_port, num_replicas, release): cmd = [f"./target/{'release' if release else 'debug'}/summerset_manager"] cmd += [ @@ -67,7 +83,26 @@ def launch_manager(protocol, num_replicas, release): num_replicas, release, ) - return run_process(cmd) + return run_process(cmd, capture_stderr=True) + + +def wait_manager_setup(proc): + accepting_servers, 
accepting_clients = False, False + + for line in iter(proc.stderr.readline, b""): + sys.stderr.buffer.write(line) + sys.stderr.flush() + + l = line.decode() + if "(m) accepting servers" in l: + assert not accepting_servers + accepting_servers = True + if "(m) accepting clients" in l: + assert not accepting_clients + accepting_clients = True + + if accepting_servers and accepting_clients: + break def compose_server_cmd(protocol, api_port, p2p_port, manager, config, release): @@ -82,12 +117,12 @@ def compose_server_cmd(protocol, api_port, p2p_port, manager, config, release): "-m", manager, ] - if len(config) > 0: + if config is not None and len(config) > 0: cmd += ["--config", config] return cmd -def launch_servers(protocol, num_replicas, release): +def launch_servers(protocol, num_replicas, release, config): server_procs = [] for replica in range(num_replicas): cmd = compose_server_cmd( @@ -95,7 +130,7 @@ def launch_servers(protocol, num_replicas, release): SERVER_API_PORT(replica), SERVER_P2P_PORT(replica), f"127.0.0.1:{MANAGER_SRV_PORT}", - PROTOCOL_CONFIGS[protocol](replica, num_replicas), + config_with_backer_path(protocol, config, replica), release, ) proc = run_process(cmd) @@ -115,13 +150,11 @@ def launch_servers(protocol, num_replicas, release): parser.add_argument( "-r", "--release", action="store_true", help="if set, run release mode" ) + parser.add_argument( + "-c", "--config", type=str, help="protocol-specific TOML config string" + ) args = parser.parse_args() - if args.protocol not in PROTOCOL_CONFIGS: - raise ValueError(f"unknown protocol name '{args.protocol}'") - if args.num_replicas <= 0 or args.num_replicas > 9: - raise ValueError(f"invalid number of replicas {args.num_replicas}") - # kill all existing server and manager processes kill_all_matching("summerset_server") kill_all_matching("summerset_manager") @@ -135,10 +168,14 @@ def launch_servers(protocol, num_replicas, release): # launch cluster manager oracle first manager_proc = launch_manager(args.protocol, args.num_replicas, args.release) - time.sleep(5) + wait_manager_setup(manager_proc) # then launch server replicas - launch_servers(args.protocol, args.num_replicas, args.release) + launch_servers(args.protocol, args.num_replicas, args.release, args.config) + + for line in iter(manager_proc.stderr.readline, b""): + sys.stderr.buffer.write(line) + sys.stderr.flush() rc = manager_proc.wait() sys.exit(rc) diff --git a/scripts/set_tcp_buf_sizes.sh b/scripts/set_tcp_buf_sizes.sh index 2d3e3f21..55d8d0a4 100755 --- a/scripts/set_tcp_buf_sizes.sh +++ b/scripts/set_tcp_buf_sizes.sh @@ -1,12 +1,22 @@ #! 
/usr/bin/bash +echo "Per-socket TCP send/receive buffer:" +echo "min default max" echo "4096 131072 33554432" | sudo tee /proc/sys/net/ipv4/tcp_rmem echo "4096 131072 33554432" | sudo tee /proc/sys/net/ipv4/tcp_wmem +echo +echo "System-wide total buffer size:" +echo "min default max" echo "1538757 16413408 24620112" | sudo tee /proc/sys/net/ipv4/tcp_mem +echo +echo "Max value of setsockopt:" echo "33554432" | sudo tee /proc/sys/net/core/rmem_max echo "33554432" | sudo tee /proc/sys/net/core/wmem_max +echo +echo "Default value of network socket:" echo "131072" | sudo tee /proc/sys/net/core/rmem_default echo "131072" | sudo tee /proc/sys/net/core/wmem_default +echo From f9960edec58fe55dddbd0e2e2346636c84411c90 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Tue, 29 Aug 2023 16:44:12 +0800 Subject: [PATCH 05/21] make Bitmap a general u8-indexed map --- src/lib.rs | 2 +- src/protocols/crossword.rs | 53 +++++++------- src/protocols/multipaxos.rs | 14 ++-- src/protocols/rs_paxos.rs | 54 +++++++------- src/protocols/simple_push.rs | 8 +-- src/server/transport.rs | 12 ++-- src/utils/bitmap.rs | 86 +++++++++++++++------- src/utils/mod.rs | 2 +- src/utils/rscoding.rs | 134 ++++++++++++++++++++--------------- 9 files changed, 210 insertions(+), 155 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 40bcbf31..24a24bb6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,7 +13,7 @@ mod protocols; // Things (other than exported macros) exposed to users of this crate: #[doc(inline)] -pub use crate::utils::{SummersetError, ReplicaMap, Timer}; +pub use crate::utils::{SummersetError, Bitmap, Timer}; #[doc(inline)] pub use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply, ClusterManager}; diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index dfe647ea..7f6bc743 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -3,11 +3,11 @@ //! MultiPaxos with flexible Reed-Solomon erasure coding that supports tunable //! shard groups and asymmetric shard assignment. -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::path::Path; use std::net::SocketAddr; -use crate::utils::{SummersetError, ReplicaMap, RSCodeword}; +use crate::utils::{SummersetError, Bitmap, RSCodeword}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, @@ -84,13 +84,13 @@ type ReqBatch = Vec<(ClientId, ApiRequest)>; #[derive(Debug, Clone)] struct LeaderBookkeeping { /// Replicas from which I have received Prepare confirmations. - prepare_acks: ReplicaMap, + prepare_acks: Bitmap, /// Max ballot among received Prepare replies. prepare_max_bal: Ballot, /// Replicas from which I have received Accept confirmations. - accept_acks: ReplicaMap, + accept_acks: Bitmap, } /// Follower-side bookkeeping info for each instance received. @@ -284,10 +284,9 @@ impl CrosswordReplica { id: ReplicaId, population: u8, num_shards: u8, - ) -> HashSet { - (id..(id + num_shards)) - .map(|i| (i % population) as usize) - .collect() + ) -> Bitmap { + let ones = (id..(id + num_shards)).map(|i| (i % population)).collect(); + Bitmap::from(population, ones) } /// Handler of client request batch chan recv. 
@@ -323,8 +322,8 @@ impl CrosswordReplica { // compute the complete Reed-Solomon codeword for the batch data let mut reqs_cw = RSCodeword::from_data( req_batch, - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?; reqs_cw.compute_parity(Some(&self.rs_coder))?; @@ -343,9 +342,9 @@ impl CrosswordReplica { assert_eq!(old_inst.status, Status::Null); old_inst.reqs_cw = reqs_cw; old_inst.leader_bk = Some(LeaderBookkeeping { - prepare_acks: ReplicaMap::new(self.population, false), + prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: ReplicaMap::new(self.population, false), + accept_acks: Bitmap::new(self.population, false), }); } else { let new_inst = Instance { @@ -353,9 +352,9 @@ impl CrosswordReplica { status: Status::Null, reqs_cw, leader_bk: Some(LeaderBookkeeping { - prepare_acks: ReplicaMap::new(self.population, false), + prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: ReplicaMap::new(self.population, false), + accept_acks: Bitmap::new(self.population, false), }), replica_bk: None, }; @@ -555,14 +554,12 @@ impl CrosswordReplica { break; } - if inst.reqs_cw.avail_shards() < self.quorum_cnt as usize { + if inst.reqs_cw.avail_shards() < self.quorum_cnt { // can't execute if I don't have the complete request batch pf_debug!(self.id; "postponing execution for slot {} (shards {}/{})", slot, inst.reqs_cw.avail_shards(), self.quorum_cnt); break; - } else if inst.reqs_cw.avail_data_shards() - < self.quorum_cnt as usize - { + } else if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { // have enough shards but need reconstruction inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; } @@ -638,8 +635,8 @@ impl CrosswordReplica { bal: 0, status: Status::Null, reqs_cw: RSCodeword::::from_null( - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?, leader_bk: None, replica_bk: None, @@ -680,7 +677,7 @@ impl CrosswordReplica { ) -> Result<(), SummersetError> { pf_trace!(self.id; "received PrepareReply <- {} for slot {} bal {} shards {:?}", peer, slot, ballot, - voted.as_ref().map(|(_, cw)| cw.avail_shards_set())); + voted.as_ref().map(|(_, cw)| cw.avail_shards_map())); // if ballot is what I'm currently waiting on for Prepare replies: if ballot == self.bal_prep_sent { @@ -720,7 +717,7 @@ impl CrosswordReplica { // instance using the request batch value constructed using shards // with the highest ballot number in quorum if leader_bk.prepare_acks.count() >= self.quorum_cnt - && inst.reqs_cw.avail_data_shards() >= self.quorum_cnt as usize + && inst.reqs_cw.avail_data_shards() >= self.quorum_cnt { inst.status = Status::Accepting; pf_debug!(self.id; "enter Accept phase for slot {} bal {}", @@ -731,7 +728,7 @@ impl CrosswordReplica { self.bal_prepared = ballot; // if parity shards not computed yet, compute them now - if inst.reqs_cw.avail_shards() < self.population as usize { + if inst.reqs_cw.avail_shards() < self.population { inst.reqs_cw.compute_parity(Some(&self.rs_coder))?; } @@ -795,7 +792,7 @@ impl CrosswordReplica { reqs_cw: RSCodeword, ) -> Result<(), SummersetError> { pf_trace!(self.id; "received Accept <- {} for slot {} bal {} shards {:?}", - peer, slot, ballot, reqs_cw.avail_shards_set()); + peer, slot, ballot, reqs_cw.avail_shards_map()); // if ballot is not smaller than what I have made promises for: if ballot >= self.bal_max_seen { @@ -805,8 +802,8 @@ impl 
CrosswordReplica { bal: 0, status: Status::Null, reqs_cw: RSCodeword::::from_null( - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?, leader_bk: None, replica_bk: None, @@ -919,8 +916,8 @@ impl CrosswordReplica { bal: 0, status: Status::Null, reqs_cw: RSCodeword::::from_null( - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?, leader_bk: None, replica_bk: None, diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index c268372c..d44056f1 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -11,7 +11,7 @@ use std::collections::HashMap; use std::path::Path; use std::net::SocketAddr; -use crate::utils::{SummersetError, ReplicaMap}; +use crate::utils::{SummersetError, Bitmap}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, @@ -77,13 +77,13 @@ type ReqBatch = Vec<(ClientId, ApiRequest)>; #[derive(Debug, Clone)] struct LeaderBookkeeping { /// Replicas from which I have received Prepare confirmations. - prepare_acks: ReplicaMap, + prepare_acks: Bitmap, /// Max ballot among received Prepare replies. prepare_max_bal: Ballot, /// Replicas from which I have received Accept confirmations. - accept_acks: ReplicaMap, + accept_acks: Bitmap, } /// Follower-side bookkeeping info for each instance received. @@ -308,9 +308,9 @@ impl MultiPaxosReplica { if old_inst.status == Status::Null { old_inst.reqs = req_batch.clone(); old_inst.leader_bk = Some(LeaderBookkeeping { - prepare_acks: ReplicaMap::new(self.population, false), + prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: ReplicaMap::new(self.population, false), + accept_acks: Bitmap::new(self.population, false), }); slot = s; break; @@ -322,9 +322,9 @@ impl MultiPaxosReplica { status: Status::Null, reqs: req_batch.clone(), leader_bk: Some(LeaderBookkeeping { - prepare_acks: ReplicaMap::new(self.population, false), + prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: ReplicaMap::new(self.population, false), + accept_acks: Bitmap::new(self.population, false), }), replica_bk: None, }; diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index e47993de..b2da668d 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -3,11 +3,11 @@ //! MultiPaxos with Reed-Solomon erasure coding. References: //! - -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::path::Path; use std::net::SocketAddr; -use crate::utils::{SummersetError, ReplicaMap, RSCodeword}; +use crate::utils::{SummersetError, Bitmap, RSCodeword}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, @@ -79,13 +79,13 @@ type ReqBatch = Vec<(ClientId, ApiRequest)>; #[derive(Debug, Clone)] struct LeaderBookkeeping { /// Replicas from which I have received Prepare confirmations. - prepare_acks: ReplicaMap, + prepare_acks: Bitmap, /// Max ballot among received Prepare replies. prepare_max_bal: Ballot, /// Replicas from which I have received Accept confirmations. - accept_acks: ReplicaMap, + accept_acks: Bitmap, } /// Follower-side bookkeeping info for each instance received. 
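The `quorum_cnt` / `population - quorum_cnt` arguments that the following hunks stop casting to usize are the Reed-Solomon split itself: with 5 replicas and a majority quorum of 3, each request batch becomes 3 data shards plus 2 parity shards, and any 3 of the 5 shards reconstruct the batch. A minimal self-contained sketch against the same reed-solomon-erasure crate (the shard contents here are illustrative, not taken from the patches):

use reed_solomon_erasure::galois_8::ReedSolomon;

fn main() -> Result<(), reed_solomon_erasure::Error> {
    // population = 5, majority quorum_cnt = 3:
    // 3 data shards + (5 - 3) = 2 parity shards
    let rs = ReedSolomon::new(3, 2)?;

    // five equal-length shards; the last two start zeroed and get filled in
    let mut shards: Vec<Vec<u8>> = vec![
        b"hel".to_vec(),
        b"lo_".to_vec(),
        b"rs!".to_vec(),
        vec![0u8; 3],
        vec![0u8; 3],
    ];
    rs.encode(&mut shards)?; // computes the two parity shards

    // any 3 of the 5 shards suffice to rebuild the original data
    let mut partial: Vec<Option<Vec<u8>>> =
        shards.into_iter().map(Some).collect();
    partial[1] = None; // lose a data shard
    partial[4] = None; // lose a parity shard
    rs.reconstruct(&mut partial)?;
    assert_eq!(partial[1].as_deref(), Some(&b"lo_"[..]));
    Ok(())
}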
@@ -307,8 +307,8 @@ impl RSPaxosReplica { // compute the complete Reed-Solomon codeword for the batch data let mut reqs_cw = RSCodeword::from_data( req_batch, - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?; reqs_cw.compute_parity(Some(&self.rs_coder))?; @@ -327,9 +327,9 @@ impl RSPaxosReplica { assert_eq!(old_inst.status, Status::Null); old_inst.reqs_cw = reqs_cw; old_inst.leader_bk = Some(LeaderBookkeeping { - prepare_acks: ReplicaMap::new(self.population, false), + prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: ReplicaMap::new(self.population, false), + accept_acks: Bitmap::new(self.population, false), }); } else { let new_inst = Instance { @@ -337,9 +337,9 @@ impl RSPaxosReplica { status: Status::Null, reqs_cw, leader_bk: Some(LeaderBookkeeping { - prepare_acks: ReplicaMap::new(self.population, false), + prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: ReplicaMap::new(self.population, false), + accept_acks: Bitmap::new(self.population, false), }), replica_bk: None, }; @@ -407,7 +407,7 @@ impl RSPaxosReplica { ballot: inst.bal, // persist only one shard on myself reqs_cw: inst.reqs_cw.subset_copy( - HashSet::from([self.id as usize]), + Bitmap::from(self.population, vec![self.id]), false, )?, }, @@ -427,7 +427,7 @@ impl RSPaxosReplica { slot, ballot: inst.bal, reqs_cw: inst.reqs_cw.subset_copy( - HashSet::from([peer as usize]), + Bitmap::from(self.population, vec![peer]), false, )?, }, @@ -530,14 +530,12 @@ impl RSPaxosReplica { break; } - if inst.reqs_cw.avail_shards() < self.quorum_cnt as usize { + if inst.reqs_cw.avail_shards() < self.quorum_cnt { // can't execute if I don't have the complete request batch pf_debug!(self.id; "postponing execution for slot {} (shards {}/{})", slot, inst.reqs_cw.avail_shards(), self.quorum_cnt); break; - } else if inst.reqs_cw.avail_data_shards() - < self.quorum_cnt as usize - { + } else if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { // have enough shards but need reconstruction inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; } @@ -613,8 +611,8 @@ impl RSPaxosReplica { bal: 0, status: Status::Null, reqs_cw: RSCodeword::::from_null( - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?, leader_bk: None, replica_bk: None, @@ -655,7 +653,7 @@ impl RSPaxosReplica { ) -> Result<(), SummersetError> { pf_trace!(self.id; "received PrepareReply <- {} for slot {} bal {} shards {:?}", peer, slot, ballot, - voted.as_ref().map(|(_, cw)| cw.avail_shards_set())); + voted.as_ref().map(|(_, cw)| cw.avail_shards_map())); // if ballot is what I'm currently waiting on for Prepare replies: if ballot == self.bal_prep_sent { @@ -695,7 +693,7 @@ impl RSPaxosReplica { // instance using the request batch value constructed using shards // with the highest ballot number in quorum if leader_bk.prepare_acks.count() >= self.quorum_cnt - && inst.reqs_cw.avail_data_shards() >= self.quorum_cnt as usize + && inst.reqs_cw.avail_data_shards() >= self.quorum_cnt { inst.status = Status::Accepting; pf_debug!(self.id; "enter Accept phase for slot {} bal {}", @@ -706,7 +704,7 @@ impl RSPaxosReplica { self.bal_prepared = ballot; // if parity shards not computed yet, compute them now - if inst.reqs_cw.avail_shards() < self.population as usize { + if inst.reqs_cw.avail_shards() < self.population { 
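                        // (parity shards are computed lazily: a leader that
                        // recovered enough data shards from Prepare replies
                        // only pays the encoding cost once it must hand out
                        // per-replica shards in the Accept round below)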
inst.reqs_cw.compute_parity(Some(&self.rs_coder))?; } @@ -718,7 +716,7 @@ impl RSPaxosReplica { slot, ballot, reqs_cw: inst.reqs_cw.subset_copy( - HashSet::from([self.id as usize]), + Bitmap::from(self.population, vec![self.id]), false, )?, }, @@ -738,7 +736,7 @@ impl RSPaxosReplica { slot, ballot, reqs_cw: inst.reqs_cw.subset_copy( - HashSet::from([peer as usize]), + Bitmap::from(self.population, vec![peer]), false, )?, }, @@ -762,7 +760,7 @@ impl RSPaxosReplica { reqs_cw: RSCodeword, ) -> Result<(), SummersetError> { pf_trace!(self.id; "received Accept <- {} for slot {} bal {} shards {:?}", - peer, slot, ballot, reqs_cw.avail_shards_set()); + peer, slot, ballot, reqs_cw.avail_shards_map()); // if ballot is not smaller than what I have made promises for: if ballot >= self.bal_max_seen { @@ -772,8 +770,8 @@ impl RSPaxosReplica { bal: 0, status: Status::Null, reqs_cw: RSCodeword::::from_null( - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?, leader_bk: None, replica_bk: None, @@ -886,8 +884,8 @@ impl RSPaxosReplica { bal: 0, status: Status::Null, reqs_cw: RSCodeword::::from_null( - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?, leader_bk: None, replica_bk: None, diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index d212f98b..eb082de3 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -7,7 +7,7 @@ use std::path::Path; use std::net::SocketAddr; -use crate::utils::{SummersetError, ReplicaMap}; +use crate::utils::{SummersetError, Bitmap}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, @@ -80,7 +80,7 @@ enum PushMsg { struct Instance { reqs: Vec<(ClientId, ApiRequest)>, durable: bool, - pending_peers: ReplicaMap, + pending_peers: Bitmap, execed: Vec, from_peer: Option<(ReplicaId, usize)>, // peer ID, peer inst_idx } @@ -148,7 +148,7 @@ impl SimplePushReplica { assert!(batch_size > 0); // target peers to push to - let mut target = ReplicaMap::new(self.population, false); + let mut target = Bitmap::new(self.population, false); let mut peer_cnt = 0; for peer in 0..self.population { if peer_cnt == self.config.rep_degree { @@ -262,7 +262,7 @@ impl SimplePushReplica { let inst = Instance { reqs: req_batch.clone(), durable: false, - pending_peers: ReplicaMap::new(self.population, false), + pending_peers: Bitmap::new(self.population, false), execed: vec![false; req_batch.len()], from_peer: Some((peer, src_inst_idx)), }; diff --git a/src/server/transport.rs b/src/server/transport.rs index fb4bae0e..10deff8b 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -3,7 +3,7 @@ use std::fmt; use std::net::SocketAddr; -use crate::utils::{SummersetError, ReplicaMap, safe_tcp_read, safe_tcp_write}; +use crate::utils::{SummersetError, Bitmap, safe_tcp_read, safe_tcp_write}; use crate::server::ReplicaId; use bytes::BytesMut; @@ -144,10 +144,10 @@ where } } - /// Gets a ReplicaMap where currently connected peers are set true. - pub fn current_peers(&self) -> Result { + /// Gets a bitmap where currently connected peers are set true. 
+ pub fn current_peers(&self) -> Result { let tx_sends_guard = self.tx_sends.guard(); - let mut peers = ReplicaMap::new(self.population, false); + let mut peers = Bitmap::new(self.population, false); for &id in tx_sends_guard.keys() { if let Err(e) = peers.set(id, true) { return logged_err!(self.me; "error setting peer {}: {}", @@ -187,7 +187,7 @@ where pub fn bcast_msg( &mut self, msg: Msg, - target: Option, + target: Option, ) -> Result<(), SummersetError> { let tx_sends_guard = self.tx_sends.guard(); for &peer in tx_sends_guard.keys() { @@ -624,7 +624,7 @@ mod transport_tests { assert!(id == 1 || id == 2); assert_eq!(msg, TestMsg("world".into())); // send another message to 1 only - let mut map = ReplicaMap::new(3, false); + let mut map = Bitmap::new(3, false); map.set(1, true)?; hub.bcast_msg(TestMsg("nice".into()), Some(map))?; // recv another message from 1 diff --git a/src/utils/bitmap.rs b/src/utils/bitmap.rs index a7f27d98..dfbb8467 100644 --- a/src/utils/bitmap.rs +++ b/src/utils/bitmap.rs @@ -1,15 +1,16 @@ //! Bitmap data structure helper. +use std::fmt; + use crate::utils::SummersetError; -use crate::server::ReplicaId; use fixedbitset::FixedBitSet; -/// Compact bitmap for replica ID -> bool mapping. -#[derive(Debug, Clone)] -pub struct ReplicaMap(FixedBitSet); +/// Compact bitmap for u8 ID -> bool mapping. +#[derive(Clone, PartialEq, Eq)] +pub struct Bitmap(FixedBitSet); -impl ReplicaMap { +impl Bitmap { /// Creates a new bitmap of given size. If `ones` is true, all slots are /// marked true initially; otherwise, all slots are initially false. pub fn new(size: u8, ones: bool) -> Self { @@ -17,18 +18,31 @@ impl ReplicaMap { panic!("invalid bitmap size {}", size); } let mut bitset = FixedBitSet::with_capacity(size as usize); + if ones { bitset.set_range(.., true); } - ReplicaMap(bitset) + + Bitmap(bitset) + } + + /// Creates a new bitmap of given size from vec literal. Indices in the + /// vec are bits to be set as true. + pub fn from(size: u8, ones: Vec) -> Self { + let mut bitmap = Self::new(size, false); + + for idx in ones { + if let Err(e) = bitmap.set(idx, true) { + panic!("{}", e); + } + } + + bitmap } /// Sets bit at index to given flag. - pub fn set( - &mut self, - idx: ReplicaId, - flag: bool, - ) -> Result<(), SummersetError> { + #[inline] + pub fn set(&mut self, idx: u8, flag: bool) -> Result<(), SummersetError> { if idx as usize >= self.0.len() { return Err(SummersetError(format!("index {} out of bound", idx))); } @@ -37,7 +51,8 @@ impl ReplicaMap { } /// Gets the bit flag at index. - pub fn get(&self, idx: ReplicaId) -> Result { + #[inline] + pub fn get(&self, idx: u8) -> Result { if idx as usize >= self.0.len() { return Err(SummersetError(format!("index {} out of bound", idx))); } @@ -45,33 +60,36 @@ impl ReplicaMap { } /// Returns the size of the bitmap. + #[inline] pub fn size(&self) -> u8 { self.0.len() as u8 } /// Returns the number of trues in the bitmap. + #[inline] pub fn count(&self) -> u8 { self.0.count_ones(..) as u8 } /// Allows `for (id, bit) in map.iter()`. - pub fn iter(&self) -> ReplicaMapIter { - ReplicaMapIter { map: self, idx: 0 } + #[inline] + pub fn iter(&self) -> BitmapIter { + BitmapIter { map: self, idx: 0 } } } -/// Iterator over `ReplicaMap`, yielding `(id, bit)` pairs. +/// Iterator over `Bitmap`, yielding `(id, bit)` pairs. 
#[derive(Debug, Clone)] -pub struct ReplicaMapIter<'m> { - map: &'m ReplicaMap, +pub struct BitmapIter<'m> { + map: &'m Bitmap, idx: usize, } -impl Iterator for ReplicaMapIter<'_> { - type Item = (ReplicaId, bool); +impl Iterator for BitmapIter<'_> { + type Item = (u8, bool); fn next(&mut self) -> Option { - let id: ReplicaId = self.idx as ReplicaId; + let id: u8 = self.idx as u8; if id < self.map.size() { self.idx += 1; Some((id, self.map.get(id).unwrap())) @@ -81,6 +99,26 @@ impl Iterator for ReplicaMapIter<'_> { } } +// Implement `Debug` trait manually for better trace printing. +impl fmt::Debug for Bitmap { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{{{}; [", self.size())?; + let mut first_idx = true; + for i in self + .iter() + .filter_map(|(i, flag)| if flag { Some(i) } else { None }) + { + if !first_idx { + write!(f, ", {}", i)?; + } else { + write!(f, "{}", i)?; + first_idx = false; + } + } + write!(f, "]}}") + } +} + #[cfg(test)] mod bitmap_tests { use super::*; @@ -88,12 +126,12 @@ mod bitmap_tests { #[test] #[should_panic] fn bitmap_new_panic() { - ReplicaMap::new(0, true); + Bitmap::new(0, true); } #[test] fn bitmap_set_get() { - let mut map = ReplicaMap::new(7, false); + let mut map = Bitmap::new(7, false); assert!(map.set(0, true).is_ok()); assert!(map.set(1, false).is_ok()); assert!(map.set(2, true).is_ok()); @@ -107,7 +145,7 @@ mod bitmap_tests { #[test] fn bitmap_count() { - let mut map = ReplicaMap::new(7, false); + let mut map = Bitmap::new(7, false); assert_eq!(map.count(), 0); assert!(map.set(0, true).is_ok()); assert!(map.set(2, true).is_ok()); @@ -118,7 +156,7 @@ mod bitmap_tests { #[test] fn bitmap_iter() { let ref_map = vec![true, true, false, true, true]; - let mut map = ReplicaMap::new(5, true); + let mut map = Bitmap::new(5, true); assert!(map.set(2, false).is_ok()); for (id, flag) in map.iter() { assert_eq!(ref_map[id as usize], flag); diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 6feb3e1e..7510b772 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -13,7 +13,7 @@ mod safetcp; mod rscoding; pub use error::SummersetError; -pub use bitmap::ReplicaMap; +pub use bitmap::Bitmap; pub use timer::Timer; pub use safetcp::{safe_tcp_read, safe_tcp_write}; pub use rscoding::RSCodeword; diff --git a/src/utils/rscoding.rs b/src/utils/rscoding.rs index 659077ac..49c008a3 100644 --- a/src/utils/rscoding.rs +++ b/src/utils/rscoding.rs @@ -2,10 +2,9 @@ use std::fmt; use std::io; -use std::collections::HashSet; use std::marker::PhantomData; -use crate::utils::SummersetError; +use crate::utils::{SummersetError, Bitmap}; use bytes::{BytesMut, BufMut}; @@ -20,10 +19,10 @@ use reed_solomon_erasure::galois_8::ReedSolomon; #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] pub struct RSCodeword { /// Number of data shards. - num_data_shards: usize, + num_data_shards: u8, /// Number of parity shards. - num_parity_shards: usize, + num_parity_shards: u8, /// Exact length of original data in bytes. 
data_len: usize, @@ -53,13 +52,13 @@ where data_copy: Option, data_bytes: Option, data_len: usize, - num_data_shards: usize, - num_parity_shards: usize, + num_data_shards: u8, + num_parity_shards: u8, ) -> Result { if num_data_shards == 0 { return Err(SummersetError("num_data_shards is zero".into())); } - if data_len != 0 && data_len < num_data_shards { + if data_len != 0 && data_len < num_data_shards as usize { return Err(SummersetError(format!( "data length too small: {}", data_len @@ -67,10 +66,10 @@ where } let num_total_shards = num_data_shards + num_parity_shards; - let shard_len = if data_len % num_data_shards == 0 { - data_len / num_data_shards + let shard_len = if data_len % num_data_shards as usize == 0 { + data_len / num_data_shards as usize } else { - (data_len / num_data_shards) + 1 + (data_len / num_data_shards as usize) + 1 }; let shards = if let Some(mut data_bytes) = data_bytes { @@ -78,11 +77,11 @@ where assert_eq!(data_bytes.len(), data_len); // pad length to multiple of num_data_shards and compute shard size - let padded_len = shard_len * num_data_shards; + let padded_len = shard_len * num_data_shards as usize; data_bytes.resize(padded_len, 0); // split the bytes representation into contiguously stored shards - let mut shards = Vec::with_capacity(num_data_shards); + let mut shards = Vec::with_capacity(num_data_shards as usize); for _ in 0..(num_data_shards - 1) { let shard = data_bytes.split_to(shard_len); assert_eq!(shard.len(), shard_len); @@ -90,15 +89,15 @@ where } assert_eq!(data_bytes.len(), shard_len); shards.push(Some(data_bytes)); // the last shard - assert_eq!(shards.len(), num_data_shards); + assert_eq!(shards.len(), num_data_shards as usize); for _ in num_data_shards..num_total_shards { shards.push(None); } - assert_eq!(shards.len(), num_total_shards); + assert_eq!(shards.len(), num_total_shards as usize); shards } else { // if newing from empty - vec![None; num_total_shards] + vec![None; num_total_shards as usize] }; Ok(RSCodeword { @@ -115,8 +114,8 @@ where /// Creates a new RSCodeword from original data. pub fn from_data( data: T, - num_data_shards: usize, - num_parity_shards: usize, + num_data_shards: u8, + num_parity_shards: u8, ) -> Result { // serialize original data into bytes let mut data_writer = BytesMut::new().writer(); @@ -133,8 +132,8 @@ where /// Creates a new RSCodeword from empty bytes. pub fn from_null( - num_data_shards: usize, - num_parity_shards: usize, + num_data_shards: u8, + num_parity_shards: u8, ) -> Result { Self::new(None, None, 0, num_data_shards, num_parity_shards) } @@ -143,15 +142,25 @@ where /// shards, and a complete copy of the original data if required. pub fn subset_copy( &self, - subset: HashSet, + subset: Bitmap, copy_data: bool, ) -> Result { if self.data_len == 0 { return Err(SummersetError("codeword is null".into())); } - let mut shards = vec![None; self.num_shards()]; - for i in subset { + let mut shards = vec![None; self.num_shards() as usize]; + for i in + subset.iter().filter_map( + |(i, flag)| { + if flag { + Some(i as usize) + } else { + None + } + }, + ) + { if i >= shards.len() { return Err(SummersetError(format!( "shard index {} out-of-bound", @@ -231,60 +240,71 @@ where } /// Gets number of data shards. - pub fn num_data_shards(&self) -> usize { + #[inline] + pub fn num_data_shards(&self) -> u8 { self.num_data_shards } /// Gets number of parity shards. 
#[allow(dead_code)] - pub fn num_parity_shards(&self) -> usize { + #[inline] + pub fn num_parity_shards(&self) -> u8 { self.num_parity_shards } /// Gets total number of shards. - pub fn num_shards(&self) -> usize { - self.shards.len() + #[inline] + pub fn num_shards(&self) -> u8 { + self.shards.len() as u8 } /// Gets number of currently available data shards. - pub fn avail_data_shards(&self) -> usize { + #[inline] + pub fn avail_data_shards(&self) -> u8 { self.shards .iter() - .take(self.num_data_shards) + .take(self.num_data_shards as usize) .filter(|s| s.is_some()) - .count() + .count() as u8 } /// Gets number of currently available parity shards. #[allow(dead_code)] - pub fn avail_parity_shards(&self) -> usize { + #[inline] + pub fn avail_parity_shards(&self) -> u8 { self.shards .iter() - .skip(self.num_data_shards) + .skip(self.num_data_shards as usize) .filter(|s| s.is_some()) - .count() + .count() as u8 } /// Gets total number of currently available shards. - pub fn avail_shards(&self) -> usize { - self.shards.iter().filter(|s| s.is_some()).count() + #[inline] + pub fn avail_shards(&self) -> u8 { + self.shards.iter().filter(|s| s.is_some()).count() as u8 } - /// Gets the set of available shard indexes. - pub fn avail_shards_set(&self) -> HashSet { - self.shards + /// Gets a bitmap of available shard indexes set true. + #[inline] + pub fn avail_shards_map(&self) -> Bitmap { + let ones: Vec = self + .shards .iter() .enumerate() - .filter_map(|(i, s)| if s.is_some() { Some(i) } else { None }) - .collect() + .filter_map(|(i, s)| if s.is_some() { Some(i as u8) } else { None }) + .collect(); + Bitmap::from(self.num_shards(), ones) } /// Gets length of original data in bytes. + #[inline] pub fn data_len(&self) -> usize { self.data_len } /// Gets length of a shard in bytes. + #[inline] pub fn shard_len(&self) -> usize { self.shard_len } @@ -295,13 +315,13 @@ where &self, rs: &ReedSolomon, ) -> Result<(), SummersetError> { - if rs.data_shard_count() != self.num_data_shards { + if rs.data_shard_count() != self.num_data_shards as usize { Err(SummersetError(format!( "num_data_shards mismatch: expected {}, rs {}", self.num_data_shards, rs.data_shard_count() ))) - } else if rs.parity_shard_count() != self.num_parity_shards { + } else if rs.parity_shard_count() != self.num_parity_shards as usize { Err(SummersetError(format!( "num_parity_shards mismatch: expected {}, rs {}", self.num_parity_shards, @@ -339,7 +359,8 @@ where } // allocate space for parity shards if haven't - for shard in self.shards.iter_mut().skip(self.num_data_shards) { + for shard in self.shards.iter_mut().skip(self.num_data_shards as usize) + { if shard.is_none() { *shard = Some(BytesMut::zeroed(self.shard_len)); } @@ -473,23 +494,23 @@ struct ShardsReader<'a> { shards: &'a Vec>, /// Number of data shards in vec. - num_data_shards: usize, + num_data_shards: u8, /// Length in bytes of a shard. shard_len: usize, /// Composite cursor: (shard_idx, byte_idx). - cursor: (usize, usize), + cursor: (u8, usize), } impl<'a> ShardsReader<'a> { /// Creates a new temporary reader. 
fn new( shards: &'a Vec>, - num_data_shards: usize, + num_data_shards: u8, shard_len: usize, ) -> Result { - for shard in shards.iter().take(num_data_shards) { + for shard in shards.iter().take(num_data_shards as usize) { if shard.is_none() { return Err(SummersetError("some data shard is None".into())); } @@ -510,8 +531,9 @@ impl<'a> io::Read for ShardsReader<'a> { let mut total_nread = 0; while self.cursor.0 < self.num_data_shards { - let mut slice = &(self.shards[self.cursor.0].as_ref().unwrap()) - [self.cursor.1..]; + let mut slice = &(self.shards[self.cursor.0 as usize] + .as_ref() + .unwrap())[self.cursor.1..]; let (_, buf_tail) = buf.split_at_mut(total_nread); let shard_nread = slice.read(buf_tail).unwrap(); @@ -569,7 +591,7 @@ mod rscoding_tests { assert_eq!(cw.avail_data_shards(), 3); assert_eq!(cw.avail_parity_shards(), 0); assert_eq!(cw.avail_shards(), 3); - assert_eq!(cw.avail_shards_set(), HashSet::from([0, 1, 2])); + assert_eq!(cw.avail_shards_map(), Bitmap::from(3, vec![0, 1, 2])); assert_eq!(cw.data_len(), data_len); assert_eq!(cw.shard_len(), shard_len); // valid with num_parity_shards > 0 @@ -580,7 +602,7 @@ mod rscoding_tests { assert_eq!(cw.avail_data_shards(), 3); assert_eq!(cw.avail_parity_shards(), 0); assert_eq!(cw.avail_shards(), 3); - assert_eq!(cw.avail_shards_set(), HashSet::from([0, 1, 2])); + assert_eq!(cw.avail_shards_map(), Bitmap::from(5, vec![0, 1, 2])); assert_eq!(cw.data_len(), data_len); assert_eq!(cw.shard_len(), shard_len); Ok(()) @@ -598,7 +620,7 @@ mod rscoding_tests { assert_eq!(cw.avail_data_shards(), 0); assert_eq!(cw.avail_parity_shards(), 0); assert_eq!(cw.avail_shards(), 0); - assert_eq!(cw.avail_shards_set(), HashSet::new()); + assert_eq!(cw.avail_shards_map(), Bitmap::new(5, false)); assert_eq!(cw.data_len(), 0); assert_eq!(cw.shard_len(), 0); Ok(()) @@ -609,21 +631,21 @@ mod rscoding_tests { let data = TestData("interesting_value".into()); let cwa = RSCodeword::from_data(data.clone(), 3, 2)?; // invalid subset - assert!(cwa.subset_copy(HashSet::from([0, 5]), false).is_err()); + assert!(cwa.subset_copy(Bitmap::from(6, vec![0, 5]), false).is_err()); // valid subsets - let cw01 = cwa.subset_copy(HashSet::from([0, 1]), false)?; + let cw01 = cwa.subset_copy(Bitmap::from(5, vec![0, 1]), false)?; assert_eq!(cw01.avail_data_shards(), 2); - let cw02 = cwa.subset_copy(HashSet::from([0, 2]), true)?; + let cw02 = cwa.subset_copy(Bitmap::from(5, vec![0, 2]), true)?; assert_eq!(cw02.avail_data_shards(), 2); assert!(cw02.data_copy.is_some()); // valid absorbing let mut cwb = RSCodeword::::from_null(3, 2)?; cwb.absorb_other(cw02)?; assert_eq!(cwb.avail_shards(), 2); - assert_eq!(cwb.avail_shards_set(), HashSet::from([0, 2])); + assert_eq!(cwb.avail_shards_map(), Bitmap::from(5, vec![0, 2])); cwb.absorb_other(cw01)?; assert_eq!(cwb.avail_shards(), 3); - assert_eq!(cwb.avail_shards_set(), HashSet::from([0, 1, 2])); + assert_eq!(cwb.avail_shards_map(), Bitmap::from(5, vec![0, 1, 2])); assert_eq!(*cwb.get_data()?, data); // invalid absorbing assert!(cwb From 7312afbb471e865e31a40e3b306a1bbfc2a224ac Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Tue, 29 Aug 2023 20:51:29 +0800 Subject: [PATCH 06/21] fix crossword ack pattern bug --- src/protocols/crossword.rs | 118 +++++++++++++++++++++++++++++-------- 1 file changed, 93 insertions(+), 25 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 7f6bc743..da59f976 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -89,8 +89,9 @@ struct LeaderBookkeeping { 
/// Max ballot among received Prepare replies. prepare_max_bal: Ballot, - /// Replicas from which I have received Accept confirmations. - accept_acks: Bitmap, + /// Replicas and their assigned shards which the received Accept + /// confirmations cover. + accept_acks: HashMap, } /// Follower-side bookkeeping info for each instance received. @@ -284,9 +285,51 @@ impl CrosswordReplica { id: ReplicaId, population: u8, num_shards: u8, - ) -> Bitmap { - let ones = (id..(id + num_shards)).map(|i| (i % population)).collect(); - Bitmap::from(population, ones) + ) -> Vec { + (id..(id + num_shards)).map(|i| (i % population)).collect() + } + + /// TODO: make better impl of this. + fn coverage_under_faults( + population: u8, + acks: &HashMap, + fault_tolerance: u8, + ) -> u8 { + if acks.len() <= fault_tolerance as usize { + return 0; + } + + // enumerate all subsets of acks excluding fault number of replicas + let cnt = (acks.len() - fault_tolerance as usize) as u32; + let servers: Vec = acks.keys().cloned().collect(); + let mut min_coverage = population; + + for n in (0..2usize.pow(servers.len() as u32)) + .filter(|n| n.count_ones() == cnt) + { + let mut coverage = Bitmap::new(population, false); + for (_, server) in servers + .iter() + .enumerate() + .filter(|&(i, _)| (n >> i) % 2 == 1) + { + for shard in acks[server].iter().filter_map(|(s, flag)| { + if flag { + Some(s) + } else { + None + } + }) { + coverage.set(shard, true).expect("impossible shard index"); + } + } + + if coverage.count() < min_coverage { + min_coverage = coverage.count(); + } + } + + min_coverage } /// Handler of client request batch chan recv. @@ -344,7 +387,7 @@ impl CrosswordReplica { old_inst.leader_bk = Some(LeaderBookkeeping { prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: Bitmap::new(self.population, false), + accept_acks: HashMap::new(), }); } else { let new_inst = Instance { @@ -354,7 +397,7 @@ impl CrosswordReplica { leader_bk: Some(LeaderBookkeeping { prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: Bitmap::new(self.population, false), + accept_acks: HashMap::new(), }), replica_bk: None, }; @@ -422,10 +465,13 @@ impl CrosswordReplica { ballot: inst.bal, // persist only some shards on myself reqs_cw: inst.reqs_cw.subset_copy( - Self::shards_for_replica( - self.id, + Bitmap::from( self.population, - self.config.shards_per_replica, + Self::shards_for_replica( + self.id, + self.population, + self.config.shards_per_replica, + ), ), false, )?, @@ -447,10 +493,13 @@ impl CrosswordReplica { slot, ballot: inst.bal, reqs_cw: inst.reqs_cw.subset_copy( - Self::shards_for_replica( - peer, + Bitmap::from( self.population, - self.config.shards_per_replica, + Self::shards_for_replica( + peer, + self.population, + self.config.shards_per_replica, + ), ), false, )?, @@ -740,10 +789,13 @@ impl CrosswordReplica { slot, ballot, reqs_cw: inst.reqs_cw.subset_copy( - Self::shards_for_replica( - self.id, + Bitmap::from( self.population, - self.config.shards_per_replica, + Self::shards_for_replica( + self.id, + self.population, + self.config.shards_per_replica, + ), ), false, )?, @@ -764,10 +816,13 @@ impl CrosswordReplica { slot, ballot, reqs_cw: inst.reqs_cw.subset_copy( - Self::shards_for_replica( - peer, + Bitmap::from( self.population, - self.config.shards_per_replica, + Self::shards_for_replica( + peer, + self.population, + self.config.shards_per_replica, + ), ), false, )?, @@ -862,18 +917,31 @@ impl CrosswordReplica { assert!(self.bal_max_seen >= ballot); 
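        // (the commit rule below replaces RS-Paxos's plain ack counting:
        // each ack is recorded together with the shard subset it covers,
        // and the instance commits only once dropping any `fault_tolerance`
        // of the acked replicas still leaves >= quorum_cnt distinct shards,
        // as computed by coverage_under_faults() above; e.g. with
        // population = 5, fault_tolerance = 1, shards_per_replica = 3,
        // acks from replicas {0, 1, 2} cover shards {0,1,2}, {1,2,3},
        // {2,3,4}, so at least 4 distinct shards survive discarding any
        // single ack, clearing the quorum_cnt = 3 bar)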
assert!(inst.leader_bk.is_some()); let leader_bk = inst.leader_bk.as_mut().unwrap(); - if leader_bk.accept_acks.get(peer)? { + if leader_bk.accept_acks.contains_key(&peer) { return Ok(()); } // bookkeep this Accept reply - leader_bk.accept_acks.set(peer, true)?; + leader_bk.accept_acks.insert( + peer, + Bitmap::from( + self.population, + Self::shards_for_replica( + peer, + self.population, + self.config.shards_per_replica, + ), + ), + ); // if quorum size reached AND enough number of shards are - // remembered, mark this instance as committed; in RS-Paxos, this - // means accept_acks.count() >= self.quorum_cnt + fault_tolerance - if leader_bk.accept_acks.count() - >= self.quorum_cnt + self.config.fault_tolerance + // remembered, mark this instance as committed + if leader_bk.accept_acks.len() as u8 >= self.quorum_cnt + && Self::coverage_under_faults( + self.population, + &leader_bk.accept_acks, + self.config.fault_tolerance, + ) >= self.quorum_cnt { inst.status = Status::Committed; pf_debug!(self.id; "committed instance at slot {} bal {}", From 6f2e0c7e5dce3cf31e1df1aaf94b65a4416f532e Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 13:25:28 +0800 Subject: [PATCH 07/21] minor updates to bench script --- scripts/local_bench.tmp.py | 39 ++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index c20ad6d5..8aaa6374 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -144,7 +144,7 @@ def all_protocol_configs(num_replicas): max_fault_tolerance = num_replicas - quorum_cnt config_choices = [("MultiPaxos", None, None)] - for shards_per_replica in range(quorum_cnt, 0): + for shards_per_replica in range(quorum_cnt, 0, -1): config_choices.append( ("Crossword", max_fault_tolerance, shards_per_replica) ) @@ -152,20 +152,27 @@ def all_protocol_configs(num_replicas): return config_choices - # for num_replicas in (3, 5, 7): - # for value_size in (1024, 65536, 4194304): - # for protocol, fault_tolerance, shards_per_replica in all_protocol_configs( - # num_replicas - # ): - # bench_round( - # protocol, - # num_replicas, - # value_size, - # 100, - # 60, - # fault_tolerance=fault_tolerance, - # shards_per_replica=shards_per_replica, - # ) + for num_replicas in (3, 5, 7): + for value_size in (1024, 65536, 4194304): + for protocol, fault_tolerance, shards_per_replica in all_protocol_configs( + num_replicas + ): + # print( + # num_replicas, + # value_size, + # protocol, + # fault_tolerance, + # shards_per_replica, + # ) + bench_round( + protocol, + num_replicas, + value_size, + 100, + 60, + fault_tolerance=fault_tolerance, + shards_per_replica=shards_per_replica, + ) bench_round("MultiPaxos", 5, 65536, 0, 60) - # bench_round("Crossword", 5, 65536, 0, 60, fault_tolerance=0, shards_per_replica=1) + bench_round("Crossword", 5, 65536, 0, 60, fault_tolerance=0, shards_per_replica=1) From 7d11298a57d319a81f863ad4d4df002ec2795bd4 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 14:03:55 +0800 Subject: [PATCH 08/21] minor updates to bench script --- scripts/local_bench.tmp.py | 2 +- scripts/local_cluster.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 8aaa6374..ba3e2346 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -19,7 +19,7 @@ def run_process(cmd): def kill_all_matching(name): # print("Kill all:", name) assert name.count(" ") == 0 - cmd = 
["pkill", "-9", "-f", name] + cmd = ["sudo", "pkill", "-9", "-f", name] proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) proc.wait() diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index ffbdff15..c4fa4f68 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -26,7 +26,7 @@ def run_process(cmd, capture_stderr=False): def kill_all_matching(name): print("Kill all:", name) assert name.count(" ") == 0 - cmd = ["pkill", "-9", "-f", name] + cmd = ["sudo", "pkill", "-9", "-f", name] proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) proc.wait() From 8353c08c5310293e55c4dad0d5bbf2e1b081124e Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 14:11:55 +0800 Subject: [PATCH 09/21] minor updates to bench script --- scripts/local_bench.tmp.py | 6 ++---- scripts/local_cluster.py | 5 ++--- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index ba3e2346..45085437 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -1,5 +1,5 @@ +import os import subprocess -import itertools import statistics @@ -19,9 +19,7 @@ def run_process(cmd): def kill_all_matching(name): # print("Kill all:", name) assert name.count(" ") == 0 - cmd = ["sudo", "pkill", "-9", "-f", name] - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - proc.wait() + os.system(f"sudo pkill -9 -f {name}") def launch_cluster(protocol, num_replicas, config): diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index c4fa4f68..f0cc1099 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -1,4 +1,5 @@ import sys +import os import argparse import subprocess from pathlib import Path @@ -26,9 +27,7 @@ def run_process(cmd, capture_stderr=False): def kill_all_matching(name): print("Kill all:", name) assert name.count(" ") == 0 - cmd = ["sudo", "pkill", "-9", "-f", name] - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - proc.wait() + os.system(f"sudo pkill -9 -f {name}") MANAGER_SRV_PORT = 52600 From b2621cbe65f7fc0395c261a99ce39306d3ea3bad Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 14:28:58 +0800 Subject: [PATCH 10/21] fixing scripts address already in use --- scripts/local_bench.tmp.py | 2 +- scripts/local_cluster.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 45085437..3776079f 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -19,7 +19,7 @@ def run_process(cmd): def kill_all_matching(name): # print("Kill all:", name) assert name.count(" ") == 0 - os.system(f"sudo pkill -9 -f {name}") + os.system(f"killall -9 {name} > /dev/null 2>&1") def launch_cluster(protocol, num_replicas, config): diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index f0cc1099..87fd94e3 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -27,7 +27,7 @@ def run_process(cmd, capture_stderr=False): def kill_all_matching(name): print("Kill all:", name) assert name.count(" ") == 0 - os.system(f"sudo pkill -9 -f {name}") + os.system(f"killall -9 {name} > /dev/null 2>&1") MANAGER_SRV_PORT = 52600 From 0a79799d8b2b80f0c160b084602cd03aaf581230 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 14:54:13 +0800 Subject: [PATCH 11/21] fixing scripts address already in use --- scripts/local_bench.tmp.py | 5 ++++- src/manager/reactor.rs 
| 6 ++++-- src/manager/reigner.rs | 6 ++++-- src/server/external.rs | 6 ++++-- src/server/transport.rs | 6 ++++-- src/utils/mod.rs | 2 +- src/utils/safetcp.rs | 28 +++++++++++++++++++++++++--- 7 files changed, 46 insertions(+), 13 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 3776079f..ad51e517 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -108,7 +108,10 @@ def bench_round( shards_per_replica=None, ): print( - f"{protocol:<10s} n={num_replicas:1d} v={value_size:<9d} w%={put_ratio:<3d} {length_s:3d}s" + f"{protocol:<10s} n={num_replicas:1d} v={value_size:<9d} " + + f"f={fault_tolerance if fault_tolerance is not None else 'x':1d} " + + f"s={shards_per_replica if shards_per_replica is not None else 'x':1d} " + + f"w%={put_ratio:<3d} {length_s:3d}s" ) kill_all_matching("summerset_client") kill_all_matching("summerset_server") diff --git a/src/manager/reactor.rs b/src/manager/reactor.rs index 52b14f63..e3a1b198 100644 --- a/src/manager/reactor.rs +++ b/src/manager/reactor.rs @@ -3,7 +3,9 @@ use std::collections::HashMap; use std::net::SocketAddr; -use crate::utils::{SummersetError, safe_tcp_read, safe_tcp_write}; +use crate::utils::{ + SummersetError, safe_tcp_read, safe_tcp_write, tcp_bind_with_retry, +}; use crate::server::ReplicaId; use crate::client::ClientId; @@ -74,7 +76,7 @@ impl ClientReactor { let (client_responder_handles_write, client_responder_handles_read) = flashmap::new::>(); - let client_listener = TcpListener::bind(cli_addr).await?; + let client_listener = tcp_bind_with_retry(cli_addr, 10).await?; let client_acceptor_handle = tokio::spawn(Self::client_acceptor_thread( tx_req, diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index 21658f92..cff8f18f 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -3,7 +3,9 @@ use std::collections::HashMap; use std::net::SocketAddr; -use crate::utils::{SummersetError, safe_tcp_read, safe_tcp_write}; +use crate::utils::{ + SummersetError, safe_tcp_read, safe_tcp_write, tcp_bind_with_retry, +}; use crate::server::ReplicaId; use crate::protocols::SmrProtocol; @@ -71,7 +73,7 @@ impl ServerReigner { let (server_controller_handles_write, server_controller_handles_read) = flashmap::new::>(); - let server_listener = TcpListener::bind(srv_addr).await?; + let server_listener = tcp_bind_with_retry(srv_addr, 10).await?; let server_acceptor_handle = tokio::spawn(Self::server_acceptor_thread( tx_recv, diff --git a/src/server/external.rs b/src/server/external.rs index 9c50b546..3083a662 100644 --- a/src/server/external.rs +++ b/src/server/external.rs @@ -3,7 +3,9 @@ use std::net::SocketAddr; use std::sync::Arc; -use crate::utils::{SummersetError, safe_tcp_read, safe_tcp_write}; +use crate::utils::{ + SummersetError, safe_tcp_read, safe_tcp_write, tcp_bind_with_retry, +}; use crate::server::{ReplicaId, Command, CommandResult}; use crate::client::ClientId; @@ -115,7 +117,7 @@ impl ExternalApi { let (client_servant_handles_write, client_servant_handles_read) = flashmap::new::>(); - let client_listener = TcpListener::bind(api_addr).await?; + let client_listener = tcp_bind_with_retry(api_addr, 10).await?; let client_acceptor_handle = tokio::spawn(Self::client_acceptor_thread( me, diff --git a/src/server/transport.rs b/src/server/transport.rs index 10deff8b..ca121d70 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -3,7 +3,9 @@ use std::fmt; use std::net::SocketAddr; -use crate::utils::{SummersetError, Bitmap, safe_tcp_read, safe_tcp_write}; 
+use crate::utils::{ + SummersetError, Bitmap, safe_tcp_read, safe_tcp_write, tcp_bind_with_retry, +}; use crate::server::ReplicaId; use bytes::BytesMut; @@ -84,7 +86,7 @@ where let (tx_connect, rx_connect) = mpsc::unbounded_channel(); let (tx_connack, rx_connack) = mpsc::unbounded_channel(); - let peer_listener = TcpListener::bind(p2p_addr).await?; + let peer_listener = tcp_bind_with_retry(p2p_addr, 10).await?; let peer_acceptor_handle = tokio::spawn(Self::peer_acceptor_thread( me, tx_recv.clone(), diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 7510b772..23a43006 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -15,5 +15,5 @@ mod rscoding; pub use error::SummersetError; pub use bitmap::Bitmap; pub use timer::Timer; -pub use safetcp::{safe_tcp_read, safe_tcp_write}; +pub use safetcp::{safe_tcp_read, safe_tcp_write, tcp_bind_with_retry}; pub use rscoding::RSCodeword; diff --git a/src/utils/safetcp.rs b/src/utils/safetcp.rs index 2e3cad88..6a0df26a 100644 --- a/src/utils/safetcp.rs +++ b/src/utils/safetcp.rs @@ -1,7 +1,9 @@ //! Safe TCP read/write helpers that provides cancellation safety on the read -//! side and deadlock avoidance on the write side. +//! side and deadlock avoidance on the write side. Safe `TcpListener` binding +//! wrapper that provides a retrying logic. use std::io::ErrorKind; +use std::net::SocketAddr; use crate::utils::SummersetError; @@ -13,7 +15,8 @@ use rmp_serde::encode::to_vec as encode_to_vec; use rmp_serde::decode::from_read as decode_from_read; use tokio::io::AsyncReadExt; -use tokio::net::TcpStream; +use tokio::net::{TcpStream, TcpListener}; +use tokio::time::{self, Duration}; /// Receives an object of type `T` from TCP readable connection `conn_read`, /// using `read_buf` as buffer storage for partial reads. Returns: @@ -140,4 +143,23 @@ where Ok(true) } -// No unit tests for these two helpers... +/// Wrapper over tokio `TcpListener::bind()` that provides a retrying logic. +pub async fn tcp_bind_with_retry( + addr: SocketAddr, + mut retries: u8, +) -> Result { + loop { + match TcpListener::bind(addr).await { + Ok(listener) => return Ok(listener), + Err(e) => { + if retries == 0 { + return Err(e.into()); + } + retries -= 1; + time::sleep(Duration::from_secs(1)).await; + } + } + } +} + +// No unit tests for these helpers... 
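The `tcp_bind_with_retry` helper above is what the reactor, reigner, external API, and transport modules now call in place of a bare `TcpListener::bind()`, so a freshly restarted process can ride out a lingering "address already in use" error from sockets still in TIME_WAIT. A hypothetical caller, just to show the intended shape (the function name, address, and retry count here are illustrative; with 10 retries spaced 1s apart, binding gets roughly ten seconds of grace before the error propagates):

use std::net::SocketAddr;
use tokio::net::TcpListener;
use crate::utils::{SummersetError, tcp_bind_with_retry};

async fn bind_p2p(addr: SocketAddr) -> Result<TcpListener, SummersetError> {
    // give a lingering TIME_WAIT socket ~10 seconds to clear before failing
    tcp_bind_with_retry(addr, 10).await
}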
From aec9b00d5f03c9dea0ae6dce87a9ee18539db9e6 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 15:01:37 +0800 Subject: [PATCH 12/21] fixing scripts address already in use --- scripts/local_bench.tmp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index ad51e517..96a03bf4 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -109,8 +109,8 @@ def bench_round( ): print( f"{protocol:<10s} n={num_replicas:1d} v={value_size:<9d} " - + f"f={fault_tolerance if fault_tolerance is not None else 'x':1d} " - + f"s={shards_per_replica if shards_per_replica is not None else 'x':1d} " + + f"f={fault_tolerance if fault_tolerance is not None else 'x':1} " + + f"s={shards_per_replica if shards_per_replica is not None else 'x':1} " + f"w%={put_ratio:<3d} {length_s:3d}s" ) kill_all_matching("summerset_client") From 9359dbe73e579f711c8f49eb9498c5abfec663f0 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 15:02:09 +0800 Subject: [PATCH 13/21] fixing scripts address already in use --- scripts/local_bench.tmp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 96a03bf4..1537f7c2 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -42,6 +42,7 @@ def wait_cluster_setup(proc, num_replicas): for line in iter(proc.stderr.readline, b""): l = line.decode() + print(l, end="") if "manager" not in l and "accepting clients" in l: replica = int(l[l.find("(") + 1 : l.find(")")]) assert not accepting_clients[replica] From 667f705cdf4fd7e6e60388cbd90fd83773bf5e89 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 17:29:30 +0800 Subject: [PATCH 14/21] add proper termination signals handling --- Cargo.lock | 22 ++++++++++++ Cargo.toml | 1 + scripts/local_bench.tmp.py | 12 ++++--- scripts/local_cluster.py | 29 ++++++++++++--- src/manager/clusman.rs | 23 ++++++++++-- src/protocols/crossword.rs | 17 ++++++++- src/protocols/multipaxos.rs | 17 ++++++++- src/protocols/rep_nothing.rs | 17 ++++++++- src/protocols/rs_paxos.rs | 17 ++++++++- src/protocols/simple_push.rs | 17 ++++++++- src/server/replica.rs | 7 ++-- src/server/transport.rs | 2 +- src/utils/error.rs | 1 + summerset_client/src/main.rs | 3 +- summerset_manager/src/main.rs | 5 +-- summerset_server/src/main.rs | 66 ++++++++++++++++++++++------------- 16 files changed, 209 insertions(+), 47 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 38874908..e9fc04f8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -230,6 +230,16 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +[[package]] +name = "ctrlc" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a011bbe2c35ce9c1f143b7af6f94f29a167beb4cd1d29e6740ce836f723120e" +dependencies = [ + "nix", + "windows-sys", +] + [[package]] name = "dirs" version = "4.0.0" @@ -607,6 +617,17 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "nix" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b" +dependencies = [ + "bitflags 1.3.2", + "cfg-if", + "libc", +] + [[package]] name = "nom" version = "5.1.3" @@ -1095,6 +1116,7 @@ version = "0.1.0" dependencies = [ "async-trait", "bytes", + "ctrlc", "fixedbitset", "flashmap", "futures", diff 
--git a/Cargo.toml b/Cargo.toml index 18f2bfe9..707f1150 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,3 +22,4 @@ serde = { version = "1.0", features = ["derive"] } toml = { version = "0.7", features = ["parse"] } log = "0.4" reed-solomon-erasure = { version = "6.0", features = ["simd-accel"] } +ctrlc = { version = "3.4", features = ["termination"] } diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 1537f7c2..03d9e9ba 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -16,10 +16,12 @@ def run_process(cmd): return proc -def kill_all_matching(name): +def kill_all_matching(name, force=False): # print("Kill all:", name) assert name.count(" ") == 0 - os.system(f"killall -9 {name} > /dev/null 2>&1") + cmd = "killall -9" if force else "killall" + cmd += f" {name} > /dev/null 2>&1" + os.system(cmd) def launch_cluster(protocol, num_replicas, config): @@ -114,9 +116,9 @@ def bench_round( + f"s={shards_per_replica if shards_per_replica is not None else 'x':1} " + f"w%={put_ratio:<3d} {length_s:3d}s" ) - kill_all_matching("summerset_client") - kill_all_matching("summerset_server") - kill_all_matching("summerset_manager") + kill_all_matching("summerset_client", force=True) + kill_all_matching("summerset_server", force=True) + kill_all_matching("summerset_manager", force=True) configs = [] if fault_tolerance is not None: diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index 87fd94e3..c4e0877c 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -1,5 +1,6 @@ import sys import os +import signal import argparse import subprocess from pathlib import Path @@ -24,10 +25,12 @@ def run_process(cmd, capture_stderr=False): return proc -def kill_all_matching(name): +def kill_all_matching(name, force=False): print("Kill all:", name) assert name.count(" ") == 0 - os.system(f"killall -9 {name} > /dev/null 2>&1") + cmd = "killall -9" if force else "killall" + cmd += f" {name} > /dev/null 2>&1" + os.system(cmd) MANAGER_SRV_PORT = 52600 @@ -155,8 +158,8 @@ def launch_servers(protocol, num_replicas, release, config): args = parser.parse_args() # kill all existing server and manager processes - kill_all_matching("summerset_server") - kill_all_matching("summerset_manager") + kill_all_matching("summerset_server", force=True) + kill_all_matching("summerset_manager", force=True) # remove all existing wal files for path in Path("/tmp").glob("summerset.*.wal"): @@ -170,11 +173,27 @@ def launch_servers(protocol, num_replicas, release, config): wait_manager_setup(manager_proc) # then launch server replicas - launch_servers(args.protocol, args.num_replicas, args.release, args.config) + server_procs = launch_servers( + args.protocol, args.num_replicas, args.release, args.config + ) + + # register termination signals handler + def kill_spawned_procs(*args): + for proc in server_procs: + proc.terminate() + for proc in server_procs: + proc.wait() + manager_proc.terminate() + + signal.signal(signal.SIGINT, kill_spawned_procs) + signal.signal(signal.SIGTERM, kill_spawned_procs) + signal.signal(signal.SIGHUP, kill_spawned_procs) + # since we piped manager proc's output, re-print it out for line in iter(manager_proc.stderr.readline, b""): sys.stderr.buffer.write(line) sys.stderr.flush() + # reaches here after manager proc has terminated rc = manager_proc.wait() sys.exit(rc) diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index 15c7372e..e0c6f842 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -11,6 +11,8 @@ use 
crate::server::ReplicaId; use crate::client::ClientId; use crate::protocols::SmrProtocol; +use tokio::sync::mpsc; + /// Information about an active server. // TODO: maybe add things like leader info, etc. #[derive(Debug, Clone)] @@ -73,8 +75,17 @@ impl ClusterManager { }) } - /// Main event loop logic of the cluster manager. - pub async fn run(&mut self) { + /// Main event loop logic of the cluster manager. Breaks out of the loop + /// only upon catching termination signals to the process. + pub async fn run(&mut self) -> Result<(), SummersetError> { + // set up termination signals handler + let (tx_term, mut rx_term) = mpsc::unbounded_channel(); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("m"; "error sending to term channel: {}", e); + } + })?; + loop { tokio::select! { // receiving server control message @@ -102,8 +113,16 @@ impl ClusterManager { client, e); } }, + + // receiving termination signal + _ = rx_term.recv() => { + pf_warn!("m"; "manager caught termination signal"); + break; + } } } + + Ok(()) } } diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index da59f976..db359123 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -22,6 +22,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; +use tokio::sync::mpsc; use reed_solomon_erasure::galois_8::ReedSolomon; @@ -1215,7 +1216,15 @@ impl GenericReplica for CrosswordReplica { }) } - async fn run(&mut self) { + async fn run(&mut self) -> Result { + // set up termination signals handler + let (tx_term, mut rx_term) = mpsc::unbounded_channel(); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("s"; "error sending to term channel: {}", e); + } + })?; + // TODO: proper leader election if self.id == 0 { self.is_leader = true; @@ -1282,6 +1291,12 @@ impl GenericReplica for CrosswordReplica { if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { pf_error!(self.id; "error handling ctrl msg: {}", e); } + }, + + // receiving termination signal + _ = rx_term.recv() => { + pf_warn!(self.id; "server caught termination signal"); + return Ok(false); } } } diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index d44056f1..5ed71b3d 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -26,6 +26,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; +use tokio::sync::mpsc; /// Configuration parameters struct. 
#[derive(Debug, Deserialize)] @@ -1010,7 +1011,15 @@ impl GenericReplica for MultiPaxosReplica { }) } - async fn run(&mut self) { + async fn run(&mut self) -> Result { + // set up termination signals handler + let (tx_term, mut rx_term) = mpsc::unbounded_channel(); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("s"; "error sending to term channel: {}", e); + } + })?; + // TODO: proper leader election if self.id == 0 { self.is_leader = true; @@ -1077,6 +1086,12 @@ impl GenericReplica for MultiPaxosReplica { if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { pf_error!(self.id; "error handling ctrl msg: {}", e); } + }, + + // receiving termination signal + _ = rx_term.recv() => { + pf_warn!(self.id; "server caught termination signal"); + return Ok(false); } } } diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index b071253b..f4ea852c 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -21,6 +21,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; +use tokio::sync::mpsc; /// Configuration parameters struct. #[derive(Debug, Deserialize)] @@ -295,7 +296,15 @@ impl GenericReplica for RepNothingReplica { }) } - async fn run(&mut self) { + async fn run(&mut self) -> Result { + // set up termination signals handler + let (tx_term, mut rx_term) = mpsc::unbounded_channel(); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("s"; "error sending to term channel: {}", e); + } + })?; + loop { tokio::select! { // client request batch @@ -344,6 +353,12 @@ impl GenericReplica for RepNothingReplica { if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { pf_error!(self.id; "error handling ctrl msg: {}", e); } + }, + + // receiving termination signal + _ = rx_term.recv() => { + pf_warn!(self.id; "server caught termination signal"); + return Ok(false); } } } diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index b2da668d..ce3ccc85 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -22,6 +22,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; +use tokio::sync::mpsc; use reed_solomon_erasure::galois_8::ReedSolomon; @@ -1108,7 +1109,15 @@ impl GenericReplica for RSPaxosReplica { }) } - async fn run(&mut self) { + async fn run(&mut self) -> Result { + // set up termination signals handler + let (tx_term, mut rx_term) = mpsc::unbounded_channel(); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("s"; "error sending to term channel: {}", e); + } + })?; + // TODO: proper leader election if self.id == 0 { self.is_leader = true; @@ -1175,6 +1184,12 @@ impl GenericReplica for RSPaxosReplica { if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { pf_error!(self.id; "error handling ctrl msg: {}", e); } + }, + + // receiving termination signal + _ = rx_term.recv() => { + pf_warn!(self.id; "server caught termination signal"); + return Ok(false); } } } diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index eb082de3..841d8c20 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -22,6 +22,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; +use tokio::sync::mpsc; /// Configuration parameters struct. 
#[derive(Debug, Deserialize)] @@ -470,7 +471,15 @@ impl GenericReplica for SimplePushReplica { }) } - async fn run(&mut self) { + async fn run(&mut self) -> Result<bool, SummersetError> { + // set up termination signals handler + let (tx_term, mut rx_term) = mpsc::unbounded_channel(); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("s"; "error sending to term channel: {}", e); + } + })?; + loop { tokio::select! { // client request batch @@ -541,6 +550,12 @@ impl GenericReplica for SimplePushReplica { if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { pf_error!(self.id; "error handling ctrl msg: {}", e); } + }, + + // receiving termination signal + _ = rx_term.recv() => { + pf_warn!(self.id; "server caught termination signal"); + return Ok(false); } } } diff --git a/src/server/replica.rs b/src/server/replica.rs index 174e60a2..cae9d042 100644 --- a/src/server/replica.rs +++ b/src/server/replica.rs @@ -24,6 +24,9 @@ pub trait GenericReplica { where Self: Sized; - /// Main event loop logic of running this replica. - async fn run(&mut self); + /// Main event loop logic of running this replica. Returns `Ok(true)` if + /// terminated normally and wants to restart (e.g., receiving a reset + /// control message) or `Ok(false)` if terminated normally and does not + /// want to restart (e.g., receiving a termination signal). + async fn run(&mut self) -> Result<bool, SummersetError>; } diff --git a/src/server/transport.rs b/src/server/transport.rs index ca121d70..504e32a2 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -370,7 +370,7 @@ where to_connect = rx_connect.recv() => { if to_connect.is_none() { pf_error!(me; "connect channel closed"); - continue; + break; // channel gets closed and no messages remain } let (peer, addr) = to_connect.unwrap(); if let Err(e) = Self::connect_new_peer( diff --git a/src/utils/error.rs b/src/utils/error.rs index cdef3b56..90e576c5 100644 --- a/src/utils/error.rs +++ b/src/utils/error.rs @@ -45,6 +45,7 @@ impl_from_error!( tokio::sync::mpsc::error::SendError<(ReplicaId, net::SocketAddr)> ); impl_from_error!(reed_solomon_erasure::Error); +impl_from_error!(ctrlc::Error); #[cfg(test)] mod error_tests { diff --git a/summerset_client/src/main.rs b/summerset_client/src/main.rs index bba5eae3..26346720 100644 --- a/summerset_client/src/main.rs +++ b/summerset_client/src/main.rs @@ -10,7 +10,7 @@ use env_logger::Env; use tokio::runtime::Builder; use tokio::time::Duration; -use summerset::{SmrProtocol, SummersetError, pf_error}; +use summerset::{SmrProtocol, SummersetError, pf_warn, pf_error}; mod drivers; mod clients; @@ -160,6 +160,7 @@ fn main() -> ExitCode { pf_error!("c"; "client_main exitted: {}", e); ExitCode::FAILURE } else { + pf_warn!("c"; "client_main exited successfully"); ExitCode::SUCCESS } } diff --git a/summerset_manager/src/main.rs b/summerset_manager/src/main.rs index 87fc1d61..6b886372 100644 --- a/summerset_manager/src/main.rs +++ b/summerset_manager/src/main.rs @@ -9,7 +9,7 @@ use env_logger::Env; use tokio::runtime::Builder; -use summerset::{SmrProtocol, SummersetError, pf_error}; +use summerset::{SmrProtocol, SummersetError, pf_warn, pf_error}; /// Command line arguments definition.
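The `run()` contract introduced in `replica.rs` above turns the binaries into restart loops: `Ok(true)` asks the caller to tear everything down and set up afresh, `Ok(false)` asks it to exit. A hedged sketch of such a caller (the real one is `server_main` below; `make_replica` is a hypothetical factory standing in for the async per-restart setup):

```rust
use summerset::{GenericReplica, SummersetError}; // assumed re-exports

async fn drive_replica<F>(mut make_replica: F) -> Result<(), SummersetError>
where
    F: FnMut() -> Box<dyn GenericReplica>,
{
    loop {
        let mut replica = make_replica();
        if !replica.run().await? {
            // Ok(false): terminated for good (e.g., termination signal)
            return Ok(());
        }
        // Ok(true): drop this instance and set a fresh one up
        // (e.g., after a reset control message)
    }
}
```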
#[derive(Parser, Debug)] @@ -113,7 +113,7 @@ fn manager_main() -> Result<(), SummersetError> { .new_cluster_manager_setup(srv_addr, cli_addr, args.population) .await?; - manager.run().await; + manager.run().await?; Ok::<(), SummersetError>(()) // give type hint for this async closure }) @@ -130,6 +130,7 @@ fn main() -> ExitCode { pf_error!("m"; "manager_main exitted: {}", e); ExitCode::FAILURE } else { + pf_warn!("m"; "manager_main exited successfully"); ExitCode::SUCCESS } } diff --git a/summerset_server/src/main.rs b/summerset_server/src/main.rs index df3a736b..abbbc20d 100644 --- a/summerset_server/src/main.rs +++ b/summerset_server/src/main.rs @@ -2,6 +2,8 @@ use std::net::SocketAddr; use std::process::ExitCode; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; use clap::Parser; @@ -9,7 +11,7 @@ use env_logger::Env; use tokio::runtime::Builder; -use summerset::{SmrProtocol, SummersetError, pf_error}; +use summerset::{SmrProtocol, SummersetError, pf_warn, pf_error}; /// Command line arguments definition. #[derive(Parser, Debug)] @@ -107,29 +109,44 @@ fn server_main() -> Result<(), SummersetError> { Some(&args.config[..]) }; - // create tokio multi-threaded runtime - let runtime = Builder::new_multi_thread() - .enable_all() - .worker_threads(args.threads) - .thread_name("tokio-worker-replica") - .build()?; - - // enter tokio runtime, setup the server replica, and start the main event - // loop logic - runtime.block_on(async move { - let mut replica = protocol - .new_server_replica_setup( - api_addr, - p2p_addr, - args.manager, - config_str, - ) - .await?; - - replica.run().await; - - Ok::<(), SummersetError>(()) // give type hint for this async closure - }) + let shutdown = Arc::new(AtomicBool::new(false)); + while !shutdown.load(Ordering::SeqCst) { + let sd = shutdown.clone(); + + // create tokio multi-threaded runtime + let runtime = Builder::new_multi_thread() + .enable_all() + .worker_threads(args.threads) + .thread_name("tokio-worker-replica") + .build()?; + + // enter tokio runtime, setup the server replica, and start the main + // event loop logic + runtime.block_on(async move { + let mut replica = protocol + .new_server_replica_setup( + api_addr, + p2p_addr, + args.manager, + config_str, + ) + .await?; + + if replica.run().await?
{ + // event loop terminated but wants to restart (e.g., when + // receiving a reset control message); just drop this runtime + // and move to the next iteration of the loop + } else { + // event loop terminated and does not want to restart (e.g., + // when receiving a termination signal) + sd.store(true, Ordering::SeqCst); + } + + Ok::<(), SummersetError>(()) // give type hint for this async closure + })?; + } + + Ok(()) } fn main() -> ExitCode { @@ -143,6 +160,7 @@ pf_error!("s"; "server_main exitted: {}", e); ExitCode::FAILURE } else { + pf_warn!("s"; "server_main exited successfully"); ExitCode::SUCCESS } } From f7d71d45aafc69bee22a11ad6d4b5ae10c5cfe51 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 17:33:21 +0800 Subject: [PATCH 15/21] fixing scripts address already in use --- scripts/local_bench.tmp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 03d9e9ba..a76b77ea 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -44,7 +44,7 @@ def wait_cluster_setup(proc, num_replicas): for line in iter(proc.stderr.readline, b""): l = line.decode() - print(l, end="") + # print(l, end="") if "manager" not in l and "accepting clients" in l: replica = int(l[l.find("(") + 1 : l.find(")")]) assert not accepting_clients[replica] From edbd4f5386a30d5eb0fa63c51632e840be71cd72 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 18:46:26 +0800 Subject: [PATCH 16/21] fix wrong NewServerJoin message send timing --- src/manager/clusman.rs | 3 ++- src/manager/reigner.rs | 15 ++++++++++-- src/protocols/crossword.rs | 45 ++++++++++++++++++++---------------- src/protocols/multipaxos.rs | 43 +++++++++++++++++++--------------- src/protocols/rep_nothing.rs | 26 ++++++++++++--------- src/protocols/rs_paxos.rs | 43 +++++++++++++++++++--------------- src/protocols/simple_push.rs | 43 +++++++++++++++++++--------------- src/server/control.rs | 9 ++++++-- 8 files changed, 134 insertions(+), 93 deletions(-) diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index e0c6f842..8890faa9 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -61,7 +61,8 @@ impl ClusterManager { return logged_err!("m"; "invalid population {}", population); } - let server_reigner = ServerReigner::new_and_setup(srv_addr).await?; + let server_reigner = + ServerReigner::new_and_setup(srv_addr, population).await?; let client_reactor = ClientReactor::new_and_setup(cli_addr).await?; Ok(ClusterManager { diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index cff8f18f..ef7d7579 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -64,6 +64,7 @@ impl ServerReigner { /// messages. pub async fn new_and_setup( srv_addr: SocketAddr, + population: u8, ) -> Result<Self, SummersetError> { let (tx_recv, rx_recv) = mpsc::unbounded_channel(); @@ -76,6 +77,7 @@ impl ServerReigner { let server_listener = tcp_bind_with_retry(srv_addr, 10).await?; let server_acceptor_handle = tokio::spawn(Self::server_acceptor_thread( + population, tx_recv, server_listener, tx_sends_write, @@ -128,10 +130,12 @@ impl ServerReigner { // ServerReigner server_acceptor thread implementation impl ServerReigner { /// Accepts a new server connection.
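The join handshake that `accept_new_server` below extends is two raw bytes on the fresh TCP stream: the assigned server ID first, then the cluster population; `ControlHub` reads them back in the same order later in this patch. A sketch of both ends, with illustrative function names, assuming tokio's `AsyncReadExt`/`AsyncWriteExt`:

```rust
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::net::TcpStream;

// manager side: runs right after accepting the connection
async fn send_assignment(
    stream: &mut TcpStream,
    id: u8,
    population: u8,
) -> std::io::Result<()> {
    stream.write_u8(id).await?; // first the assigned server ID
    stream.write_u8(population).await?; // then the population
    Ok(())
}

// server side: runs right after connecting to the manager
async fn recv_assignment(stream: &mut TcpStream) -> std::io::Result<(u8, u8)> {
    let id = stream.read_u8().await?;
    let population = stream.read_u8().await?;
    Ok((id, population))
}
```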
+ #[allow(clippy::too_many_arguments)] async fn accept_new_server( mut stream: TcpStream, addr: SocketAddr, id: ReplicaId, + population: u8, tx_recv: mpsc::UnboundedSender<(ReplicaId, CtrlMsg)>, tx_sends: &mut flashmap::WriteHandle< ReplicaId, @@ -143,11 +147,16 @@ impl ServerReigner { >, tx_exit: mpsc::UnboundedSender, ) -> Result<(), SummersetError> { - // send ID assignment + // first send server ID assignment if let Err(e) = stream.write_u8(id).await { return logged_err!("m"; "error assigning new server ID: {}", e); } + // then send population + if let Err(e) = stream.write_u8(population).await { + return logged_err!("m"; "error sending population: {}", e); + } + let mut tx_sends_guard = tx_sends.guard(); if let Some(sender) = tx_sends_guard.get(&id) { if sender.is_closed() { @@ -205,6 +214,7 @@ impl ServerReigner { /// Server acceptor thread function. async fn server_acceptor_thread( + population: u8, tx_recv: mpsc::UnboundedSender<(ReplicaId, CtrlMsg)>, server_listener: TcpListener, mut tx_sends: flashmap::WriteHandle< @@ -241,6 +251,7 @@ impl ServerReigner { stream, addr, next_server_id, + population, tx_recv.clone(), &mut tx_sends, &mut server_controller_handles, @@ -471,7 +482,7 @@ mod reigner_tests { }); // manager let mut reigner = - ServerReigner::new_and_setup("127.0.0.1:53600".parse()?).await?; + ServerReigner::new_and_setup("127.0.0.1:53600".parse()?, 2).await?; setup_bar.wait().await; // recv message from server 0 let (id, msg) = reigner.recv_ctrl().await?; diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index db359123..dc74afe8 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -1116,14 +1116,16 @@ impl GenericReplica for CrosswordReplica { manager: SocketAddr, config_str: Option<&str>, ) -> Result { - let config = parsed_config!(config_str => ReplicaConfigCrossword; - batch_interval_us, max_batch_size, - backer_path, logger_sync, fault_tolerance, - shards_per_replica)?; // connect to the cluster manager and get assigned a server ID let mut control_hub = ControlHub::new_and_setup(manager).await?; let id = control_hub.me; + let population = control_hub.population; + // parse protocol-specific configs + let config = parsed_config!(config_str => ReplicaConfigCrossword; + batch_interval_us, max_batch_size, + backer_path, logger_sync, fault_tolerance, + shards_per_replica)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -1132,20 +1134,31 @@ impl GenericReplica for CrosswordReplica { ); } - // ask for population number and the list of peers to proactively - // connect to + // setup state machine module + let state_machine = StateMachine::new_and_setup(id).await?; + + // setup storage hub module + let storage_hub = + StorageHub::new_and_setup(id, Path::new(&config.backer_path)) + .await?; + + // setup transport hub module + let mut transport_hub = + TransportHub::new_and_setup(id, population, p2p_addr).await?; + + // ask for the list of peers to proactively connect to. Do this after + // transport hub has been set up, so that I will be able to accept + // later peer connections control_hub.send_ctrl(CtrlMsg::NewServerJoin { id, protocol: SmrProtocol::Crossword, api_addr, p2p_addr, })?; - let (population, to_peers) = if let CtrlMsg::ConnectToPeers { - population, - to_peers, - } = control_hub.recv_ctrl().await? + let to_peers = if let CtrlMsg::ConnectToPeers { to_peers, .. } = + control_hub.recv_ctrl().await? 
{ - (population, to_peers) + to_peers } else { return logged_err!(id; "unexpected ctrl msg type received"); }; @@ -1168,15 +1181,6 @@ impl GenericReplica for CrosswordReplica { (population - quorum_cnt) as usize, )?; - let state_machine = StateMachine::new_and_setup(id).await?; - - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; - - let mut transport_hub = - TransportHub::new_and_setup(id, population, p2p_addr).await?; - // proactively connect to some peers, then wait for all population // have been connected with me for (peer, addr) in to_peers { @@ -1184,6 +1188,7 @@ impl GenericReplica for CrosswordReplica { } transport_hub.wait_for_group(population).await?; + // setup external API module, ready to take in client requests let external_api = ExternalApi::new_and_setup( id, api_addr, diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 5ed71b3d..4e50c9a3 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -931,13 +931,15 @@ impl GenericReplica for MultiPaxosReplica { manager: SocketAddr, config_str: Option<&str>, ) -> Result { - let config = parsed_config!(config_str => ReplicaConfigMultiPaxos; - batch_interval_us, max_batch_size, - backer_path, logger_sync)?; // connect to the cluster manager and get assigned a server ID let mut control_hub = ControlHub::new_and_setup(manager).await?; let id = control_hub.me; + let population = control_hub.population; + // parse protocol-specific configs + let config = parsed_config!(config_str => ReplicaConfigMultiPaxos; + batch_interval_us, max_batch_size, + backer_path, logger_sync)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -946,33 +948,35 @@ impl GenericReplica for MultiPaxosReplica { ); } - // ask for population number and the list of peers to proactively - // connect to + // setup state machine module + let state_machine = StateMachine::new_and_setup(id).await?; + + // setup storage hub module + let storage_hub = + StorageHub::new_and_setup(id, Path::new(&config.backer_path)) + .await?; + + // setup transport hub module + let mut transport_hub = + TransportHub::new_and_setup(id, population, p2p_addr).await?; + + // ask for the list of peers to proactively connect to. Do this after + // transport hub has been set up, so that I will be able to accept + // later peer connections control_hub.send_ctrl(CtrlMsg::NewServerJoin { id, protocol: SmrProtocol::MultiPaxos, api_addr, p2p_addr, })?; - let (population, to_peers) = if let CtrlMsg::ConnectToPeers { - population, - to_peers, - } = control_hub.recv_ctrl().await? + let to_peers = if let CtrlMsg::ConnectToPeers { to_peers, .. } = + control_hub.recv_ctrl().await? 
{ - (population, to_peers) + to_peers } else { return logged_err!(id; "unexpected ctrl msg type received"); }; - let state_machine = StateMachine::new_and_setup(id).await?; - - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; - - let mut transport_hub = - TransportHub::new_and_setup(id, population, p2p_addr).await?; - // proactively connect to some peers, then wait for all population // have been connected with me for (peer, addr) in to_peers { @@ -980,6 +984,7 @@ impl GenericReplica for MultiPaxosReplica { } transport_hub.wait_for_group(population).await?; + // setup external API module, ready to take in client requests let external_api = ExternalApi::new_and_setup( id, api_addr, diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index f4ea852c..bbfb79c6 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -242,13 +242,14 @@ impl GenericReplica for RepNothingReplica { manager: SocketAddr, config_str: Option<&str>, ) -> Result { - let config = parsed_config!(config_str => ReplicaConfigRepNothing; - batch_interval_us, max_batch_size, - backer_path, logger_sync)?; // connect to the cluster manager and get assigned a server ID let mut control_hub = ControlHub::new_and_setup(manager).await?; let id = control_hub.me; + // parse protocol-specific configs + let config = parsed_config!(config_str => ReplicaConfigRepNothing; + batch_interval_us, max_batch_size, + backer_path, logger_sync)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -257,6 +258,16 @@ impl GenericReplica for RepNothingReplica { ); } + // setup state machine module + let state_machine = StateMachine::new_and_setup(id).await?; + + // setup storage hub module + let storage_hub = + StorageHub::new_and_setup(id, Path::new(&config.backer_path)) + .await?; + + // TransportHub is not needed in RepNothing + // tell the manager tha I have joined control_hub.send_ctrl(CtrlMsg::NewServerJoin { id, @@ -266,14 +277,7 @@ impl GenericReplica for RepNothingReplica { })?; control_hub.recv_ctrl().await?; - let state_machine = StateMachine::new_and_setup(id).await?; - - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; - - // TransportHub is not needed in RepNothing - + // setup external API module, ready to take in client requests let external_api = ExternalApi::new_and_setup( id, api_addr, diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index ce3ccc85..1c72b41a 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -1016,13 +1016,15 @@ impl GenericReplica for RSPaxosReplica { manager: SocketAddr, config_str: Option<&str>, ) -> Result { - let config = parsed_config!(config_str => ReplicaConfigRSPaxos; - batch_interval_us, max_batch_size, - backer_path, logger_sync, fault_tolerance)?; // connect to the cluster manager and get assigned a server ID let mut control_hub = ControlHub::new_and_setup(manager).await?; let id = control_hub.me; + let population = control_hub.population; + // parse protocol-specific configs + let config = parsed_config!(config_str => ReplicaConfigRSPaxos; + batch_interval_us, max_batch_size, + backer_path, logger_sync, fault_tolerance)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -1031,20 +1033,31 @@ impl GenericReplica for RSPaxosReplica { ); } - // ask for population number and the list of peers to proactively - // connect to + // setup state machine module + let state_machine = StateMachine::new_and_setup(id).await?; + 
+ // setup storage hub module + let storage_hub = + StorageHub::new_and_setup(id, Path::new(&config.backer_path)) + .await?; + + // setup transport hub module + let mut transport_hub = + TransportHub::new_and_setup(id, population, p2p_addr).await?; + + // ask for the list of peers to proactively connect to. Do this after + // transport hub has been set up, so that I will be able to accept + // later peer connections control_hub.send_ctrl(CtrlMsg::NewServerJoin { id, protocol: SmrProtocol::RSPaxos, api_addr, p2p_addr, })?; - let (population, to_peers) = if let CtrlMsg::ConnectToPeers { - population, - to_peers, - } = control_hub.recv_ctrl().await? + let to_peers = if let CtrlMsg::ConnectToPeers { to_peers, .. } = + control_hub.recv_ctrl().await? { - (population, to_peers) + to_peers } else { return logged_err!(id; "unexpected ctrl msg type received"); }; @@ -1061,15 +1074,6 @@ impl GenericReplica for RSPaxosReplica { (population - quorum_cnt) as usize, )?; - let state_machine = StateMachine::new_and_setup(id).await?; - - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; - - let mut transport_hub = - TransportHub::new_and_setup(id, population, p2p_addr).await?; - // proactively connect to some peers, then wait for all population // have been connected with me for (peer, addr) in to_peers { @@ -1077,6 +1081,7 @@ impl GenericReplica for RSPaxosReplica { } transport_hub.wait_for_group(population).await?; + // setup external API module, ready to take in client requests let external_api = ExternalApi::new_and_setup( id, api_addr, diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index 841d8c20..b0156ad3 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -398,13 +398,15 @@ impl GenericReplica for SimplePushReplica { manager: SocketAddr, config_str: Option<&str>, ) -> Result { - let config = parsed_config!(config_str => ReplicaConfigSimplePush; - batch_interval_us, max_batch_size, - backer_path, rep_degree)?; // connect to the cluster manager and get assigned a server ID let mut control_hub = ControlHub::new_and_setup(manager).await?; let id = control_hub.me; + let population = control_hub.population; + // parse protocol-specific configs + let config = parsed_config!(config_str => ReplicaConfigSimplePush; + batch_interval_us, max_batch_size, + backer_path, rep_degree)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -413,33 +415,35 @@ impl GenericReplica for SimplePushReplica { ); } - // ask for population number and the list of peers to proactively - // connect to + // setup state machine module + let state_machine = StateMachine::new_and_setup(id).await?; + + // setup storage hub module + let storage_hub = + StorageHub::new_and_setup(id, Path::new(&config.backer_path)) + .await?; + + // setup transport hub module + let mut transport_hub = + TransportHub::new_and_setup(id, population, p2p_addr).await?; + + // ask for the list of peers to proactively connect to. Do this after + // transport hub has been set up, so that I will be able to accept + // later peer connections control_hub.send_ctrl(CtrlMsg::NewServerJoin { id, protocol: SmrProtocol::SimplePush, api_addr, p2p_addr, })?; - let (population, to_peers) = if let CtrlMsg::ConnectToPeers { - population, - to_peers, - } = control_hub.recv_ctrl().await? + let to_peers = if let CtrlMsg::ConnectToPeers { to_peers, .. } = + control_hub.recv_ctrl().await? 
{ - (population, to_peers) + to_peers } else { return logged_err!(id; "unexpected ctrl msg type received"); }; - let state_machine = StateMachine::new_and_setup(id).await?; - - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; - - let mut transport_hub = - TransportHub::new_and_setup(id, population, p2p_addr).await?; - // proactively connect to some peers, then wait for all population // have been connected with me for (peer, addr) in to_peers { @@ -447,6 +451,7 @@ impl GenericReplica for SimplePushReplica { } transport_hub.wait_for_group(population).await?; + // setup external API module, ready to take in client requests let external_api = ExternalApi::new_and_setup( id, api_addr, diff --git a/src/server/control.rs b/src/server/control.rs index 68b812fd..05627db3 100644 --- a/src/server/control.rs +++ b/src/server/control.rs @@ -19,6 +19,9 @@ pub struct ControlHub { /// My replica ID. pub me: ReplicaId, + /// Number of replicas in cluster. + pub population: u8, + /// Receiver side of the recv channel. rx_recv: mpsc::UnboundedReceiver, @@ -42,8 +45,9 @@ impl ControlHub { // connect to the cluster manager and receive my assigned server ID pf_info!("s"; "connecting to manager '{}'...", manager); let mut stream = TcpStream::connect(manager).await?; - let id = stream.read_u8().await?; // receive my server ID - pf_debug!(id; "assigned server ID: {}", id); + let id = stream.read_u8().await?; // first receive assigned server ID + let population = stream.read_u8().await?; // then receive population + pf_debug!(id; "assigned server ID: {} of {}", id, population); let (tx_recv, rx_recv) = mpsc::unbounded_channel(); let (tx_send, rx_send) = mpsc::unbounded_channel(); @@ -54,6 +58,7 @@ impl ControlHub { Ok(ControlHub { me: id, + population, rx_recv, tx_send, _control_messenger_handle: control_messenger_handle, From f1295e8dfb3dc1154cbdbb9c0ad27f0f19551cab Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 18:59:34 +0800 Subject: [PATCH 17/21] minor updates to README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3ccadae9..22eb30f3 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ git checkout -b git branch --set-upstream-to=private/main git pull private git push origin -# then, on GitHub, make a PR from branch to main +# then, on GitHub, make a squashing PR from branch to main ``` # Summerset From 056f385bb73a3dabad4e0cc643fe8ed3ca562d52 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 20:18:56 +0800 Subject: [PATCH 18/21] staging progress on reset control message --- src/manager/reactor.rs | 13 ++++++++++++- src/manager/reigner.rs | 6 +++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/manager/reactor.rs b/src/manager/reactor.rs index e3a1b198..3aba3ea2 100644 --- a/src/manager/reactor.rs +++ b/src/manager/reactor.rs @@ -1,6 +1,6 @@ //! Cluster manager client-facing reactor module implementation. -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::net::SocketAddr; use crate::utils::{ @@ -26,6 +26,14 @@ pub enum CtrlRequest { /// Query the set of active servers and their info. QueryInfo, + /// Reset the specified server(s) to initial state. + ResetServer { + /// ID of server to reset. If `None`, resets all active servers. + server: Option, + /// If false, cleans durable storage state as well. + durable: bool, + }, + /// Client leave notification. 
Leave, } @@ -38,6 +46,9 @@ pub enum CtrlReply { servers: HashMap, }, + /// Reply to server reset request. + ResetServer { servers: HashSet }, + /// Reply to client leave notification. Leave, } diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index ef7d7579..459918b2 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -21,7 +21,7 @@ use tokio::task::JoinHandle; /// Control message from/to servers. Control traffic could be bidirectional: /// some initiated by the manager and some by servers. -// TODO: add reset, pause, resume, server leave, leader change, etc. +// TODO: add pause, resume, server leave, leader change, etc. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] pub enum CtrlMsg { /// Server -> Manager: new server up, requesting a list of peers' addresses @@ -38,6 +38,10 @@ pub enum CtrlMsg { population: u8, to_peers: HashMap, }, + + /// Manager -> Server: reset to initial state. If durable is false, cleans + /// durable storage state as well. + ResetState { durable: bool }, } /// The server-facing controller API module. From bc2e22f175dc66cd3a6fbd8da212f3887474cb31 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 21:17:32 +0800 Subject: [PATCH 19/21] add tcp_connect wrapper; better server ID assign logic --- src/client/apistub.rs | 7 ++++--- src/manager/clusman.rs | 40 +++++++++++++++++++++++++++++++++-- src/manager/reigner.rs | 46 +++++++++++++++++++++++++++-------------- src/server/control.rs | 6 ++++-- src/server/transport.rs | 3 ++- src/utils/error.rs | 4 +++- src/utils/mod.rs | 4 +++- src/utils/safetcp.rs | 23 ++++++++++++++++++--- 8 files changed, 104 insertions(+), 29 deletions(-) diff --git a/src/client/apistub.rs b/src/client/apistub.rs index 8106f7f7..ea0bb14f 100644 --- a/src/client/apistub.rs +++ b/src/client/apistub.rs @@ -2,13 +2,14 @@ use std::net::SocketAddr; -use crate::utils::{SummersetError, safe_tcp_read, safe_tcp_write}; +use crate::utils::{ + SummersetError, safe_tcp_read, safe_tcp_write, tcp_connect_with_retry, +}; use crate::server::{ApiRequest, ApiReply}; use crate::client::ClientId; use bytes::BytesMut; -use tokio::net::TcpStream; use tokio::net::tcp::{OwnedReadHalf, OwnedWriteHalf}; use tokio::io::AsyncWriteExt; @@ -40,7 +41,7 @@ impl ClientApiStub { addr: SocketAddr, ) -> Result { pf_info!(id; "connecting to server '{}'...", addr); - let mut stream = TcpStream::connect(addr).await?; + let mut stream = tcp_connect_with_retry(addr, 10).await?; stream.write_u64(id).await?; // send my client ID let (read_half, write_half) = stream.into_split(); diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index 8890faa9..9a153aa3 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -1,6 +1,6 @@ //! Summerset cluster manager oracle implementation. -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::net::SocketAddr; use crate::utils::SummersetError; @@ -41,11 +41,20 @@ pub struct ClusterManager { /// ServerReigner module. server_reigner: ServerReigner, + /// Receiver side of the server ID assignment channel. + rx_id_assign: mpsc::UnboundedReceiver<()>, + + /// Sender side of the server ID assignment result channel. + tx_id_result: mpsc::UnboundedSender<(ReplicaId, u8)>, + /// ClientReactor module. client_reactor: ClientReactor, /// Information of current active servers. server_info: HashMap, + + /// Currently assigned server IDs. 
+ assigned_ids: HashSet, } impl ClusterManager { @@ -61,8 +70,12 @@ impl ClusterManager { return logged_err!("m"; "invalid population {}", population); } + let (tx_id_assign, rx_id_assign) = mpsc::unbounded_channel(); + let (tx_id_result, rx_id_result) = mpsc::unbounded_channel(); let server_reigner = - ServerReigner::new_and_setup(srv_addr, population).await?; + ServerReigner::new_and_setup(srv_addr, tx_id_assign, rx_id_result) + .await?; + let client_reactor = ClientReactor::new_and_setup(cli_addr).await?; Ok(ClusterManager { @@ -71,11 +84,27 @@ impl ClusterManager { _cli_addr: cli_addr, population, server_reigner, + rx_id_assign, + tx_id_result, client_reactor, server_info: HashMap::new(), + assigned_ids: HashSet::new(), }) } + /// Assign the first vacant server ID to a new server. + fn assign_server_id(&mut self) -> Result<(), SummersetError> { + for id in 0..self.population { + if !self.assigned_ids.contains(&id) { + self.tx_id_result.send((id, self.population))?; + self.assigned_ids.insert(id); + return Ok(()); + } + } + + logged_err!("m"; "no server ID < population left available") + } + /// Main event loop logic of the cluster manager. Breaks out of the loop /// only upon catching termination signals to the process. pub async fn run(&mut self) -> Result<(), SummersetError> { @@ -89,6 +118,13 @@ impl ClusterManager { loop { tokio::select! { + // receiving server ID assignment request + _ = self.rx_id_assign.recv() => { + if let Err(e) = self.assign_server_id() { + pf_error!("m"; "error assigning new server ID: {}", e); + } + }, + // receiving server control message ctrl_msg = self.server_reigner.recv_ctrl() => { if let Err(e) = ctrl_msg { diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index 459918b2..05436ac0 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -64,11 +64,12 @@ pub struct ServerReigner { // ServerReigner public API implementation impl ServerReigner { /// Creates a new server-facing controller module. Spawns the server - /// acceptor thread. Creates a recv channel for buffering incoming control - /// messages. + /// acceptor thread. Creates a pair of ID assignment channels. Creates + /// a recv channel for buffering incoming control messages. 
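The two channels threaded through below let the acceptor task rendezvous with the manager's main loop: the acceptor sends `()` on the assignment channel, and the main loop answers with `(id, population)` on the result channel after picking the first vacant ID. A toy sketch of that round trip (illustrative only, assuming tokio unbounded channels):

```rust
use tokio::sync::mpsc;

async fn id_assignment_demo() {
    let (tx_id_assign, mut rx_id_assign) = mpsc::unbounded_channel::<()>();
    let (tx_id_result, mut rx_id_result) =
        mpsc::unbounded_channel::<(u8, u8)>();

    // acceptor side: request an ID, then await the (id, population) answer
    let acceptor = tokio::spawn(async move {
        tx_id_assign.send(()).unwrap();
        rx_id_result.recv().await.unwrap()
    });

    // main-loop side: serve the request with the first vacant ID
    rx_id_assign.recv().await.unwrap();
    tx_id_result.send((0, 3)).unwrap();

    assert_eq!(acceptor.await.unwrap(), (0, 3));
}
```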
pub async fn new_and_setup( srv_addr: SocketAddr, - population: u8, + tx_id_assign: mpsc::UnboundedSender<()>, + rx_id_result: mpsc::UnboundedReceiver<(ReplicaId, u8)>, ) -> Result { let (tx_recv, rx_recv) = mpsc::unbounded_channel(); @@ -81,7 +82,8 @@ impl ServerReigner { let server_listener = tcp_bind_with_retry(srv_addr, 10).await?; let server_acceptor_handle = tokio::spawn(Self::server_acceptor_thread( - population, + tx_id_assign, + rx_id_result, tx_recv, server_listener, tx_sends_write, @@ -138,8 +140,8 @@ impl ServerReigner { async fn accept_new_server( mut stream: TcpStream, addr: SocketAddr, - id: ReplicaId, - population: u8, + tx_id_assign: &mpsc::UnboundedSender<()>, + rx_id_result: &mut mpsc::UnboundedReceiver<(ReplicaId, u8)>, tx_recv: mpsc::UnboundedSender<(ReplicaId, CtrlMsg)>, tx_sends: &mut flashmap::WriteHandle< ReplicaId, @@ -151,6 +153,12 @@ impl ServerReigner { >, tx_exit: mpsc::UnboundedSender, ) -> Result<(), SummersetError> { + // communicate with the manager's main thread to get assigned server ID + tx_id_assign.send(())?; + let (id, population) = rx_id_result.recv().await.ok_or( + SummersetError("failed to get server ID assignment".into()), + )?; + // first send server ID assignment if let Err(e) = stream.write_u8(id).await { return logged_err!("m"; "error assigning new server ID: {}", e); @@ -218,7 +226,8 @@ impl ServerReigner { /// Server acceptor thread function. async fn server_acceptor_thread( - population: u8, + tx_id_assign: mpsc::UnboundedSender<()>, + mut rx_id_result: mpsc::UnboundedReceiver<(ReplicaId, u8)>, tx_recv: mpsc::UnboundedSender<(ReplicaId, CtrlMsg)>, server_listener: TcpListener, mut tx_sends: flashmap::WriteHandle< @@ -235,9 +244,6 @@ impl ServerReigner { let local_addr = server_listener.local_addr().unwrap(); pf_info!("m"; "accepting servers on '{}'", local_addr); - // maintain a monotonically increasing server ID for new servers - let mut next_server_id: ReplicaId = 0; - // create an exit mpsc channel for getting notified about termination // of server controller threads let (tx_exit, mut rx_exit) = mpsc::unbounded_channel(); @@ -254,16 +260,14 @@ impl ServerReigner { if let Err(e) = Self::accept_new_server( stream, addr, - next_server_id, - population, + &tx_id_assign, + &mut rx_id_result, tx_recv.clone(), &mut tx_sends, &mut server_controller_handles, tx_exit.clone(), ).await { pf_error!("m"; "error accepting new server: {}", e); - } else { - next_server_id += 1; } }, @@ -485,10 +489,18 @@ mod reigner_tests { Ok::<(), SummersetError>(()) }); // manager - let mut reigner = - ServerReigner::new_and_setup("127.0.0.1:53600".parse()?, 2).await?; + let (tx_id_assign, mut rx_id_assign) = mpsc::unbounded_channel(); + let (tx_id_result, rx_id_result) = mpsc::unbounded_channel(); + let mut reigner = ServerReigner::new_and_setup( + "127.0.0.1:53600".parse()?, + tx_id_assign, + rx_id_result, + ) + .await?; setup_bar.wait().await; // recv message from server 0 + rx_id_assign.recv().await; + tx_id_result.send((0, 2))?; let (id, msg) = reigner.recv_ctrl().await?; assert_eq!(id, 0); assert_eq!( @@ -509,6 +521,8 @@ mod reigner_tests { id, )?; // recv message from server 1 + rx_id_assign.recv().await; + tx_id_result.send((1, 2))?; let (id, msg) = reigner.recv_ctrl().await?; assert_eq!(id, 1); assert_eq!( diff --git a/src/server/control.rs b/src/server/control.rs index 05627db3..ef5ff794 100644 --- a/src/server/control.rs +++ b/src/server/control.rs @@ -2,7 +2,9 @@ use std::net::SocketAddr; -use crate::utils::{SummersetError, safe_tcp_read, 
safe_tcp_write}; +use crate::utils::{ + SummersetError, safe_tcp_read, safe_tcp_write, tcp_connect_with_retry, +}; use crate::manager::CtrlMsg; use crate::server::ReplicaId; @@ -44,7 +46,7 @@ impl ControlHub { ) -> Result { // connect to the cluster manager and receive my assigned server ID pf_info!("s"; "connecting to manager '{}'...", manager); - let mut stream = TcpStream::connect(manager).await?; + let mut stream = tcp_connect_with_retry(manager, 10).await?; let id = stream.read_u8().await?; // first receive assigned server ID let population = stream.read_u8().await?; // then receive population pf_debug!(id; "assigned server ID: {} of {}", id, population); diff --git a/src/server/transport.rs b/src/server/transport.rs index 504e32a2..8f5f69cf 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -5,6 +5,7 @@ use std::net::SocketAddr; use crate::utils::{ SummersetError, Bitmap, safe_tcp_read, safe_tcp_write, tcp_bind_with_retry, + tcp_connect_with_retry, }; use crate::server::ReplicaId; @@ -253,7 +254,7 @@ where tx_exit: mpsc::UnboundedSender, ) -> Result<(), SummersetError> { pf_debug!(me; "connecting to peer {} '{}'...", id, addr); - let mut stream = TcpStream::connect(addr).await?; + let mut stream = tcp_connect_with_retry(addr, 10).await?; stream.write_u8(me).await?; // send my ID let mut peer_messenger_handles_guard = peer_messenger_handles.guard(); diff --git a/src/utils/error.rs b/src/utils/error.rs index 90e576c5..0e73dccb 100644 --- a/src/utils/error.rs +++ b/src/utils/error.rs @@ -37,13 +37,15 @@ impl_from_error!(toml::ser::Error); impl_from_error!(toml::de::Error); impl_from_error!(tokio::sync::SetError); impl_from_error!(tokio::sync::SetError); -impl_from_error!(tokio::sync::mpsc::error::TryRecvError); impl_from_error!( tokio::sync::watch::error::SendError> ); +impl_from_error!(tokio::sync::mpsc::error::TryRecvError); +impl_from_error!(tokio::sync::mpsc::error::SendError<()>); impl_from_error!( tokio::sync::mpsc::error::SendError<(ReplicaId, net::SocketAddr)> ); +impl_from_error!(tokio::sync::mpsc::error::SendError<(ReplicaId, u8)>); impl_from_error!(reed_solomon_erasure::Error); impl_from_error!(ctrlc::Error); diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 23a43006..31533217 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -15,5 +15,7 @@ mod rscoding; pub use error::SummersetError; pub use bitmap::Bitmap; pub use timer::Timer; -pub use safetcp::{safe_tcp_read, safe_tcp_write, tcp_bind_with_retry}; +pub use safetcp::{ + safe_tcp_read, safe_tcp_write, tcp_bind_with_retry, tcp_connect_with_retry, +}; pub use rscoding::RSCodeword; diff --git a/src/utils/safetcp.rs b/src/utils/safetcp.rs index 6a0df26a..2c337317 100644 --- a/src/utils/safetcp.rs +++ b/src/utils/safetcp.rs @@ -1,6 +1,4 @@ -//! Safe TCP read/write helpers that provides cancellation safety on the read -//! side and deadlock avoidance on the write side. Safe `TcpListener` binding -//! wrapper that provides a retrying logic. +//! Safe TCP bind/connect/read/write helper functions. use std::io::ErrorKind; use std::net::SocketAddr; @@ -162,4 +160,23 @@ pub async fn tcp_bind_with_retry( } } +/// Wrapper over tokio `TcpStream::connect()` that provides a retrying logic. 
+pub async fn tcp_connect_with_retry( + addr: SocketAddr, + mut retries: u8, +) -> Result<TcpStream, SummersetError> { + loop { + match TcpStream::connect(addr).await { + Ok(stream) => return Ok(stream), + Err(e) => { + if retries == 0 { + return Err(e.into()); + } + retries -= 1; + time::sleep(Duration::from_secs(1)).await; + } + } + } +} + // No unit tests for these helpers... From 36d0c5bc7b57625fad4f621ef2445be54752f257 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 14:02:13 +0800 Subject: [PATCH 20/21] huge updates adding server leave and reset support --- Cargo.lock | 2 + src/client/endpoint.rs | 18 +- src/lib.rs | 2 +- src/manager/clusman.rs | 84 ++++++-- src/manager/reactor.rs | 15 +- src/manager/reigner.rs | 140 ++++++++++++- src/protocols/crossword.rs | 164 ++++++++++----- src/protocols/mod.rs | 22 +- src/protocols/multipaxos.rs | 164 ++++++++++----- src/protocols/rep_nothing.rs | 161 ++++++++++----- src/protocols/rs_paxos.rs | 164 ++++++++++----- src/protocols/simple_push.rs | 164 ++++++++++----- src/server/external.rs | 15 +- src/server/replica.rs | 10 +- src/server/transport.rs | 217 ++++++++++++++++---- summerset_client/Cargo.toml | 2 +- summerset_client/src/clients/tester.rs | 28 ++- summerset_client/src/drivers/closed_loop.rs | 17 +- summerset_client/src/drivers/open_loop.rs | 16 +- summerset_client/src/main.rs | 5 +- summerset_manager/Cargo.toml | 3 +- summerset_manager/src/main.rs | 56 +++-- summerset_server/Cargo.toml | 3 +- summerset_server/src/main.rs | 30 ++- 24 files changed, 1118 insertions(+), 384 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e9fc04f8..883efd71 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1151,6 +1151,7 @@ name = "summerset_manager" version = "0.1.0" dependencies = [ "clap", + "ctrlc", "env_logger", "log", "rand", @@ -1163,6 +1164,7 @@ name = "summerset_server" version = "0.1.0" dependencies = [ "clap", + "ctrlc", "env_logger", "log", "rand", diff --git a/src/client/endpoint.rs b/src/client/endpoint.rs index bf3cd10f..37387e2b 100644 --- a/src/client/endpoint.rs +++ b/src/client/endpoint.rs @@ -5,6 +5,7 @@ use std::net::SocketAddr; use crate::utils::SummersetError; use crate::server::{ApiRequest, ApiReply}; +use crate::client::ClientCtrlStub; use async_trait::async_trait; @@ -14,8 +15,9 @@ pub type ClientId = u64; /// Client trait to be implement by all protocol-specific client structs. #[async_trait] pub trait GenericEndpoint { - /// Creates a new client stub. - fn new( + /// Creates a new client stub and sets up required functionality modules + /// according to protocol-specific logic. + async fn new_and_setup( manager: SocketAddr, // remote address of manager oracle config_str: Option<&str>, ) -> Result<Self, SummersetError> where Self: Sized; /// Establishes connection to the service (or re-joins the service) - /// according to protocol-specific logic. Returns the assigned client ID - /// on success. - async fn connect(&mut self) -> Result<ClientId, SummersetError>; + /// according to protocol-specific logic. + async fn connect(&mut self) -> Result<(), SummersetError>; /// Leaves the service: forgets about the current TCP connections and send /// leave notifications according to protocol-specific logic. If `permanent` @@ -40,4 +41,11 @@ pub trait GenericEndpoint { /// Receives a reply from the service according to protocol-specific logic. async fn recv_reply(&mut self) -> Result<ApiReply, SummersetError>; + + /// Gets my client ID.
+ fn id(&self) -> ClientId; + + /// Gets a mutable reference to the control stub for sending control + /// requests and receiving control replies for testing purposes. + fn ctrl_stub(&mut self) -> &mut ClientCtrlStub; } diff --git a/src/lib.rs b/src/lib.rs index 24a24bb6..2de53e51 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -25,7 +25,7 @@ pub use crate::server::{ }; #[doc(inline)] -pub use crate::client::{ClientId, GenericEndpoint}; +pub use crate::client::{ClientId, GenericEndpoint, ClientCtrlStub}; #[doc(inline)] pub use crate::protocols::SmrProtocol; diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index 9a153aa3..de18afb5 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -11,7 +11,7 @@ use crate::server::ReplicaId; use crate::client::ClientId; use crate::protocols::SmrProtocol; -use tokio::sync::mpsc; +use tokio::sync::{mpsc, watch}; /// Information about an active server. // TODO: maybe add things like leader info, etc. @@ -107,15 +107,10 @@ impl ClusterManager { /// Main event loop logic of the cluster manager. Breaks out of the loop /// only upon catching termination signals to the process. - pub async fn run(&mut self) -> Result<(), SummersetError> { - // set up termination signals handler - let (tx_term, mut rx_term) = mpsc::unbounded_channel(); - ctrlc::set_handler(move || { - if let Err(e) = tx_term.send(true) { - pf_error!("m"; "error sending to term channel: {}", e); - } - })?; - + pub async fn run( + &mut self, + mut rx_term: watch::Receiver, + ) -> Result<(), SummersetError> { loop { tokio::select! { // receiving server ID assignment request @@ -132,7 +127,7 @@ impl ClusterManager { continue; } let (server, msg) = ctrl_msg.unwrap(); - if let Err(e) = self.handle_ctrl_msg(server, msg) { + if let Err(e) = self.handle_ctrl_msg(server, msg).await { pf_error!("m"; "error handling ctrl msg <- {}: {}", server, e); } @@ -145,14 +140,14 @@ impl ClusterManager { continue; } let (client, req) = ctrl_req.unwrap(); - if let Err(e) = self.handle_ctrl_req(client, req) { + if let Err(e) = self.handle_ctrl_req(client, req).await { pf_error!("m"; "error handling ctrl req <- {}: {}", client, e); } }, // receiving termination signal - _ = rx_term.recv() => { + _ = rx_term.changed() => { pf_warn!("m"; "manager caught termination signal"); break; } @@ -203,7 +198,7 @@ impl ClusterManager { } /// Synthesized handler of server-initiated control messages. - fn handle_ctrl_msg( + async fn handle_ctrl_msg( &mut self, server: ReplicaId, msg: CtrlMsg, @@ -249,8 +244,62 @@ impl ClusterManager { .send_reply(CtrlReply::QueryInfo { servers }, client) } + /// Handler of client ResetServer request. 
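The manager-side handler follows; seen from a client, the round trip pairs `CtrlRequest::ResetServer` with `CtrlReply::ResetServer`. A hedged sketch of issuing a reset through the control stub, using the same send/recv idiom as the tests (assuming these types are re-exported at the crate root):

```rust
use summerset::{
    ClientCtrlStub, CtrlReply, CtrlRequest, ReplicaId, SummersetError,
};

async fn reset_one_server(
    ctrl_stub: &mut ClientCtrlStub,
    server: ReplicaId,
) -> Result<(), SummersetError> {
    let req = CtrlRequest::ResetServer {
        server: Some(server), // None would reset all active servers
        durable: false,       // false: wipe durable storage state too
    };
    let mut sent = ctrl_stub.send_req(Some(&req))?;
    while !sent {
        sent = ctrl_stub.send_req(None)?; // finish a would-block send
    }

    match ctrl_stub.recv_reply().await? {
        CtrlReply::ResetServer { servers } => {
            debug_assert!(servers.contains(&server));
            Ok(())
        }
        _ => Err(SummersetError("unexpected reply type".into())),
    }
}
```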
+ async fn handle_client_reset_server( + &mut self, + client: ClientId, + server: Option, + durable: bool, + ) -> Result<(), SummersetError> { + let num_replicas = self.server_info.len(); + let mut servers: Vec = if server.is_none() { + // all active servers + self.server_info.keys().copied().collect() + } else { + vec![server.unwrap()] + }; + + // reset specified server(s) + let mut reset_done = HashSet::new(); + while let Some(s) = servers.pop() { + // send reset server control message to server + self.server_reigner + .send_ctrl(CtrlMsg::ResetState { durable }, s)?; + + // remove information about this server + assert!(self.assigned_ids.contains(&s)); + assert!(self.server_info.contains_key(&s)); + self.assigned_ids.remove(&s); + self.server_info.remove(&s); + + // wait for the new server ID assignment request from it + self.rx_id_assign.recv().await; + if let Err(e) = self.assign_server_id() { + return logged_err!("m"; "error assigning new server ID: {}", e); + } + + reset_done.insert(s); + } + + // now the reset servers should be sending NewServerJoin messages to + // me. Process them until all servers joined + while self.server_info.len() < num_replicas { + let (s, msg) = self.server_reigner.recv_ctrl().await?; + if let Err(e) = self.handle_ctrl_msg(s, msg).await { + pf_error!("m"; "error handling ctrl msg <- {}: {}", s, e); + } + } + + self.client_reactor.send_reply( + CtrlReply::ResetServer { + servers: reset_done, + }, + client, + ) + } + /// Synthesized handler of client-initiated control requests. - fn handle_ctrl_req( + async fn handle_ctrl_req( &mut self, client: ClientId, req: CtrlRequest, @@ -261,6 +310,11 @@ impl ClusterManager { self.handle_client_query_info(client)?; } + CtrlRequest::ResetServer { server, durable } => { + self.handle_client_reset_server(client, server, durable) + .await?; + } + _ => {} // ignore all other types } diff --git a/src/manager/reactor.rs b/src/manager/reactor.rs index 3aba3ea2..41a0582d 100644 --- a/src/manager/reactor.rs +++ b/src/manager/reactor.rs @@ -441,9 +441,11 @@ mod reactor_tests { ClientReactor::new_and_setup("127.0.0.1:53601".parse()?) .await?; barrier2.wait().await; + // recv request from client let (client, req) = reactor.recv_req().await?; assert!(reactor.has_client(client)); assert_eq!(req, CtrlRequest::QueryInfo); + // send reply to client reactor.send_reply( CtrlReply::QueryInfo { servers: HashMap::::from([ @@ -459,7 +461,9 @@ mod reactor_tests { barrier.wait().await; let mut ctrl_stub = ClientCtrlStub::new_by_connect("127.0.0.1:53601".parse()?).await?; + // send request to manager ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + // recv reply from manager assert_eq!( ctrl_stub.recv_reply().await?, CtrlReply::QueryInfo { @@ -482,7 +486,9 @@ mod reactor_tests { let mut ctrl_stub = ClientCtrlStub::new_by_connect("127.0.0.1:54601".parse()?) .await?; + // send request to manager ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + // recv reply from manager assert_eq!( ctrl_stub.recv_reply().await?, CtrlReply::QueryInfo { @@ -492,14 +498,17 @@ mod reactor_tests { ]), } ); + // leave and come back as new client ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; assert_eq!(ctrl_stub.recv_reply().await?, CtrlReply::Leave); ctrl_stub.forget(); - time::sleep(Duration::from_millis(10)).await; + time::sleep(Duration::from_millis(100)).await; let mut ctrl_stub = ClientCtrlStub::new_by_connect("127.0.0.1:54601".parse()?) 
.await?; + // send request to manager ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + // recv reply from manager assert_eq!( ctrl_stub.recv_reply().await?, CtrlReply::QueryInfo { @@ -515,9 +524,11 @@ mod reactor_tests { let mut reactor = ClientReactor::new_and_setup("127.0.0.1:54601".parse()?).await?; barrier.wait().await; + // recv request from client let (client, req) = reactor.recv_req().await?; assert!(reactor.has_client(client)); assert_eq!(req, CtrlRequest::QueryInfo); + // send reply to client reactor.send_reply( CtrlReply::QueryInfo { servers: HashMap::::from([ @@ -527,10 +538,12 @@ mod reactor_tests { }, client, )?; + // recv request from new client let (client2, req2) = reactor.recv_req().await?; assert!(reactor.has_client(client2)); assert!(!reactor.has_client(client)); assert_eq!(req2, CtrlRequest::QueryInfo); + // send reply to new client reactor.send_reply( CtrlReply::QueryInfo { servers: HashMap::::from([ diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index 05436ac0..02e4e4c3 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -21,7 +21,7 @@ use tokio::task::JoinHandle; /// Control message from/to servers. Control traffic could be bidirectional: /// some initiated by the manager and some by servers. -// TODO: add pause, resume, server leave, leader change, etc. +// TODO: add pause, resume, leader change, membership change, etc. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] pub enum CtrlMsg { /// Server -> Manager: new server up, requesting a list of peers' addresses @@ -42,6 +42,12 @@ pub enum CtrlMsg { /// Manager -> Server: reset to initial state. If durable is false, cleans /// durable storage state as well. ResetState { durable: bool }, + + /// Server -> Manager: leave notification. + Leave, + + /// Manager -> Server: dummy leave reply. + LeaveReply, } /// The server-facing controller API module. @@ -98,6 +104,13 @@ impl ServerReigner { }) } + /// Returns whether a server ID is connected to me. + #[allow(dead_code)] + pub fn has_server(&self, server: ReplicaId) -> bool { + let tx_sends_guard = self.tx_sends.guard(); + tx_sends_guard.contains_key(&server) + } + /// Waits for the next control event message from some server. 
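Before resetting, a server now performs a graceful leave: notify the manager, then drain control messages until the dummy reply arrives. The protocol modules below all do exactly this; as an isolated sketch (this would live inside the summerset crate, so crate-internal imports are assumed):

```rust
use crate::manager::CtrlMsg;
use crate::server::ControlHub;
use crate::utils::SummersetError;

async fn leave_manager(
    control_hub: &mut ControlHub,
) -> Result<(), SummersetError> {
    control_hub.send_ctrl(CtrlMsg::Leave)?;
    // drain in-flight control messages until the dummy reply arrives
    while control_hub.recv_ctrl().await? != CtrlMsg::LeaveReply {}
    Ok(())
}
```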
pub async fn recv_ctrl( &mut self, @@ -359,6 +372,22 @@ impl ServerReigner { // receives control message from server msg = Self::read_ctrl(&mut read_buf, &mut conn_read) => { match msg { + Ok(CtrlMsg::Leave) => { + // server leaving, send dummy reply and break + let msg = CtrlMsg::LeaveReply; + if let Err(e) = Self::write_ctrl( + &mut write_buf, + &mut write_buf_cursor, + &conn_write, + Some(&msg) + ) { + pf_error!("m"; "error replying -> {}: {}", id, e); + } else { // skips `WouldBlock` failure check here + pf_info!("m"; "server {} has left", id); + } + break; + }, + Ok(CtrlMsg::NewServerJoin { id, protocol, @@ -380,7 +409,7 @@ impl ServerReigner { if let Err(e) = tx_recv.send((id, msg)) { pf_error!("m"; "error sending to tx_recv for {}: {}", id, e); } - } + }, Ok(msg) => { // pf_trace!("m"; "recv <- {} ctrl {:?}", id, msg); @@ -432,6 +461,7 @@ mod reigner_tests { use std::sync::Arc; use crate::server::ControlHub; use tokio::sync::Barrier; + use tokio::time::{self, Duration}; #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn api_send_recv() -> Result<(), SummersetError> { @@ -544,4 +574,110 @@ mod reigner_tests { )?; Ok(()) } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn api_server_leave() -> Result<(), SummersetError> { + let barrier = Arc::new(Barrier::new(2)); + let barrier2 = barrier.clone(); + tokio::spawn(async move { + // replica 0 + barrier2.wait().await; + let mut hub = + ControlHub::new_and_setup("127.0.0.1:54600".parse()?).await?; + assert_eq!(hub.me, 0); + // send a message to manager + hub.send_ctrl(CtrlMsg::NewServerJoin { + id: hub.me, + protocol: SmrProtocol::SimplePush, + api_addr: "127.0.0.1:54700".parse()?, + p2p_addr: "127.0.0.1:54800".parse()?, + })?; + // recv a message from manager + assert_eq!( + hub.recv_ctrl().await?, + CtrlMsg::ConnectToPeers { + population: 1, + to_peers: HashMap::new(), + } + ); + // leave and re-join as 0 + hub.send_ctrl(CtrlMsg::Leave)?; + assert_eq!(hub.recv_ctrl().await?, CtrlMsg::LeaveReply); + time::sleep(Duration::from_millis(100)).await; + let mut hub = + ControlHub::new_and_setup("127.0.0.1:54600".parse()?).await?; + assert_eq!(hub.me, 0); + // send a message to manager + hub.send_ctrl(CtrlMsg::NewServerJoin { + id: hub.me, + protocol: SmrProtocol::SimplePush, + api_addr: "127.0.0.1:54700".parse()?, + p2p_addr: "127.0.0.1:54800".parse()?, + })?; + // recv a message from manager + assert_eq!( + hub.recv_ctrl().await?, + CtrlMsg::ConnectToPeers { + population: 1, + to_peers: HashMap::new(), + } + ); + Ok::<(), SummersetError>(()) + }); + // manager + let (tx_id_assign, mut rx_id_assign) = mpsc::unbounded_channel(); + let (tx_id_result, rx_id_result) = mpsc::unbounded_channel(); + let mut reigner = ServerReigner::new_and_setup( + "127.0.0.1:54600".parse()?, + tx_id_assign, + rx_id_result, + ) + .await?; + barrier.wait().await; + // recv message from server 0 + rx_id_assign.recv().await; + tx_id_result.send((0, 1))?; + let (id, msg) = reigner.recv_ctrl().await?; + assert_eq!(id, 0); + assert_eq!( + msg, + CtrlMsg::NewServerJoin { + id: 0, + protocol: SmrProtocol::SimplePush, + api_addr: "127.0.0.1:54700".parse()?, + p2p_addr: "127.0.0.1:54800".parse()? 
+ } + ); + // send reply to server 0 + reigner.send_ctrl( + CtrlMsg::ConnectToPeers { + population: 1, + to_peers: HashMap::new(), + }, + id, + )?; + rx_id_assign.recv().await; + tx_id_result.send((0, 1))?; + // recv message from server 0 + let (id, msg) = reigner.recv_ctrl().await?; + assert_eq!(id, 0); + assert_eq!( + msg, + CtrlMsg::NewServerJoin { + id: 0, + protocol: SmrProtocol::SimplePush, + api_addr: "127.0.0.1:54700".parse()?, + p2p_addr: "127.0.0.1:54800".parse()? + } + ); + // send reply to server 0 + reigner.send_ctrl( + CtrlMsg::ConnectToPeers { + population: 1, + to_peers: HashMap::new(), + }, + id, + )?; + Ok(()) + } } diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index dc74afe8..493213d4 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -22,7 +22,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; -use tokio::sync::mpsc; +use tokio::sync::watch; use reed_solomon_erasure::galois_8::ReedSolomon; @@ -1101,11 +1101,61 @@ impl CrosswordReplica { Ok(()) } - /// Synthesized handler of manager control messages. - fn handle_ctrl_msg(&mut self, _msg: CtrlMsg) -> Result<(), SummersetError> { - // TODO: fill this when more control message types added + /// Handler of ResetState control message. + async fn handle_ctrl_reset_state( + &mut self, + durable: bool, + ) -> Result<(), SummersetError> { + // send leave notification to peers and wait for their replies + self.transport_hub.leave().await?; + + // send leave notification to manager and wait for its reply + self.control_hub.send_ctrl(CtrlMsg::Leave)?; + while self.control_hub.recv_ctrl().await? != CtrlMsg::LeaveReply {} + + // if `durable` is false, truncate backer file + if !durable { + // use 0 as a special log action ID here + self.storage_hub + .submit_action(0, LogAction::Truncate { offset: 0 })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id == 0 { + if log_result + != (LogResult::Truncate { + offset_ok: true, + now_size: 0, + }) + { + return logged_err!(self.id; "failed to truncate log to 0"); + } else { + return Ok(()); + } + } + } + } + Ok(()) } + + /// Synthesized handler of manager control messages. If ok, returns + /// `Some(true)` if decides to terminate and reboot, `Some(false)` if + /// decides to shutdown completely, and `None` if not terminating. 
+ async fn handle_ctrl_msg( + &mut self, + msg: CtrlMsg, + ) -> Result<Option<bool>, SummersetError> { + // TODO: fill this when more control message types added + match msg { + CtrlMsg::ResetState { durable } => { + self.handle_ctrl_reset_state(durable).await?; + Ok(Some(true)) + } + + _ => Ok(None), // ignore all other types + } + } } #[async_trait] @@ -1221,15 +1271,10 @@ impl GenericReplica for CrosswordReplica { }) } - async fn run(&mut self) -> Result<bool, SummersetError> { - // set up termination signals handler - let (tx_term, mut rx_term) = mpsc::unbounded_channel(); - ctrlc::set_handler(move || { - if let Err(e) = tx_term.send(true) { - pf_error!("s"; "error sending to term channel: {}", e); - } - })?; - + async fn run( + &mut self, + mut rx_term: watch::Receiver<bool>, + ) -> Result<bool, SummersetError> { // TODO: proper leader election if self.id == 0 { self.is_leader = true; @@ -1293,19 +1338,34 @@ impl GenericReplica for CrosswordReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { - pf_error!(self.id; "error handling ctrl msg: {}", e); + match self.handle_ctrl_msg(ctrl_msg).await { + Ok(terminate) => { + if let Some(restart) = terminate { + pf_warn!( + self.id; + "server got {} req", + if restart { "restart" } else { "shutdown" }); + return Ok(restart); + } + }, + Err(e) => { + pf_error!(self.id; "error handling ctrl msg: {}", e); + } } }, // receiving termination signal - _ = rx_term.recv() => { + _ = rx_term.changed() => { pf_warn!(self.id; "server caught termination signal"); return Ok(false); } } } } + + fn id(&self) -> ReplicaId { + self.id + } } /// Configuration parameters struct. @@ -1327,9 +1387,6 @@ pub struct CrosswordClient { /// Client ID. id: ClientId, - /// Address of the cluster manager oracle. - manager: SocketAddr, /// Configuration parameters struct. _config: ClientConfigCrossword, @@ -1340,7 +1397,7 @@ pub struct CrosswordClient { server_id: ReplicaId, /// Control API stub to the cluster manager. - ctrl_stub: Option<ClientCtrlStub>, + ctrl_stub: ClientCtrlStub, /// API stubs for communicating with servers.
api_stub: Option, @@ -1348,47 +1405,43 @@ pub struct CrosswordClient { #[async_trait] impl GenericEndpoint for CrosswordClient { - fn new( + async fn new_and_setup( manager: SocketAddr, config_str: Option<&str>, ) -> Result { + // connect to the cluster manager and get assigned a client ID + let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; + let id = ctrl_stub.id; + + // parse protocol-specific configs let config = parsed_config!(config_str => ClientConfigCrossword; init_server_id)?; let init_server_id = config.init_server_id; Ok(CrosswordClient { - id: 255, // nil at this time - manager, + id, _config: config, servers: HashMap::new(), server_id: init_server_id, - ctrl_stub: None, + ctrl_stub, api_stub: None, }) } - async fn connect(&mut self) -> Result { + async fn connect(&mut self) -> Result<(), SummersetError> { // disallow reconnection without leaving if self.api_stub.is_some() { return logged_err!(self.id; "reconnecting without leaving"); } - // if ctrl_stubs not established yet, connect to the manager - if self.ctrl_stub.is_none() { - let ctrl_stub = - ClientCtrlStub::new_by_connect(self.manager).await?; - self.id = ctrl_stub.id; - self.ctrl_stub = Some(ctrl_stub); - } - let ctrl_stub = self.ctrl_stub.as_mut().unwrap(); - // ask the manager about the list of active servers - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; while !sent { - sent = ctrl_stub.send_req(None)?; + sent = self.ctrl_stub.send_req(None)?; } - let reply = ctrl_stub.recv_reply().await?; + let reply = self.ctrl_stub.recv_reply().await?; match reply { CtrlReply::QueryInfo { servers } => { // connect to the one with server ID in config @@ -1399,7 +1452,7 @@ impl GenericEndpoint for CrosswordClient { .await?; self.api_stub = Some(api_stub); self.servers = servers; - Ok(self.id) + Ok(()) } _ => logged_err!(self.id; "unexpected reply type received"), } @@ -1427,26 +1480,19 @@ impl GenericEndpoint for CrosswordClient { // if permanently leaving, send leave notification to the manager if permanent { - // disallow multiple permanent leaving - if self.ctrl_stub.is_none() { - return logged_err!(self.id; "repeated permanent leaving"); + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; + while !sent { + sent = self.ctrl_stub.send_req(None)?; } - if let Some(mut ctrl_stub) = self.ctrl_stub.take() { - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; - while !sent { - sent = ctrl_stub.send_req(None)?; + let reply = self.ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::Leave => { + pf_info!(self.id; "left current manager connection"); } - - let reply = ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - ctrl_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } + _ => { + return logged_err!(self.id; "unexpected reply type received"); } } } @@ -1492,4 +1538,12 @@ impl GenericEndpoint for CrosswordClient { None => logged_err!(self.id; "client is not set up"), } } + + fn id(&self) -> ClientId { + self.id + } + + fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + &mut self.ctrl_stub + } } diff --git a/src/protocols/mod.rs b/src/protocols/mod.rs index 98ecf371..3aae79bf 100644 --- a/src/protocols/mod.rs +++ b/src/protocols/mod.rs @@ -126,26 +126,36 @@ impl SmrProtocol { } /// Create a client endpoint instance of this protocol on heap. 
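+/// (Now async: the endpoint connects to the cluster manager and gets
+/// its client ID assigned during construction via `new_and_setup`.)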
- pub fn new_client_endpoint( + pub async fn new_client_endpoint( &self, manager: SocketAddr, config_str: Option<&str>, ) -> Result, SummersetError> { match self { Self::RepNothing => { - box_if_ok!(RepNothingClient::new(manager, config_str)) + box_if_ok!( + RepNothingClient::new_and_setup(manager, config_str).await + ) } Self::SimplePush => { - box_if_ok!(SimplePushClient::new(manager, config_str)) + box_if_ok!( + SimplePushClient::new_and_setup(manager, config_str).await + ) } Self::MultiPaxos => { - box_if_ok!(MultiPaxosClient::new(manager, config_str)) + box_if_ok!( + MultiPaxosClient::new_and_setup(manager, config_str).await + ) } Self::RSPaxos => { - box_if_ok!(RSPaxosClient::new(manager, config_str)) + box_if_ok!( + RSPaxosClient::new_and_setup(manager, config_str).await + ) } Self::Crossword => { - box_if_ok!(CrosswordClient::new(manager, config_str)) + box_if_ok!( + CrosswordClient::new_and_setup(manager, config_str).await + ) } } } diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 4e50c9a3..2431ff86 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -26,7 +26,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; -use tokio::sync::mpsc; +use tokio::sync::watch; /// Configuration parameters struct. #[derive(Debug, Deserialize)] @@ -916,11 +916,61 @@ impl MultiPaxosReplica { Ok(()) } - /// Synthesized handler of manager control messages. - fn handle_ctrl_msg(&mut self, _msg: CtrlMsg) -> Result<(), SummersetError> { - // TODO: fill this when more control message types added + /// Handler of ResetState control message. + async fn handle_ctrl_reset_state( + &mut self, + durable: bool, + ) -> Result<(), SummersetError> { + // send leave notification to peers and wait for their replies + self.transport_hub.leave().await?; + + // send leave notification to manager and wait for its reply + self.control_hub.send_ctrl(CtrlMsg::Leave)?; + while self.control_hub.recv_ctrl().await? != CtrlMsg::LeaveReply {} + + // if `durable` is false, truncate backer file + if !durable { + // use 0 as a special log action ID here + self.storage_hub + .submit_action(0, LogAction::Truncate { offset: 0 })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id == 0 { + if log_result + != (LogResult::Truncate { + offset_ok: true, + now_size: 0, + }) + { + return logged_err!(self.id; "failed to truncate log to 0"); + } else { + return Ok(()); + } + } + } + } + Ok(()) } + + /// Synthesized handler of manager control messages. If ok, returns + /// `Some(true)` if decides to terminate and reboot, `Some(false)` if + /// decides to shutdown completely, and `None` if not terminating. 
+ async fn handle_ctrl_msg( + &mut self, + msg: CtrlMsg, + ) -> Result, SummersetError> { + // TODO: fill this when more control message types added + match msg { + CtrlMsg::ResetState { durable } => { + self.handle_ctrl_reset_state(durable).await?; + Ok(Some(true)) + } + + _ => Ok(None), // ignore all other types + } + } } #[async_trait] @@ -1016,15 +1066,10 @@ impl GenericReplica for MultiPaxosReplica { }) } - async fn run(&mut self) -> Result { - // set up termination signals handler - let (tx_term, mut rx_term) = mpsc::unbounded_channel(); - ctrlc::set_handler(move || { - if let Err(e) = tx_term.send(true) { - pf_error!("s"; "error sending to term channel: {}", e); - } - })?; - + async fn run( + &mut self, + mut rx_term: watch::Receiver, + ) -> Result { // TODO: proper leader election if self.id == 0 { self.is_leader = true; @@ -1088,19 +1133,34 @@ impl GenericReplica for MultiPaxosReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { - pf_error!(self.id; "error handling ctrl msg: {}", e); + match self.handle_ctrl_msg(ctrl_msg).await { + Ok(terminate) => { + if let Some(restart) = terminate { + pf_warn!( + self.id; + "server got {} req", + if restart { "restart" } else { "shutdown" }); + return Ok(restart); + } + }, + Err(e) => { + pf_error!(self.id; "error handling ctrl msg: {}", e); + } } }, // receiving termination signal - _ = rx_term.recv() => { + _ = rx_term.changed() => { pf_warn!(self.id; "server caught termination signal"); return Ok(false); } } } } + + fn id(&self) -> ReplicaId { + self.id + } } /// Configuration parameters struct. @@ -1122,9 +1182,6 @@ pub struct MultiPaxosClient { /// Client ID. id: ClientId, - /// Address of the cluster manager oracle. - manager: SocketAddr, - /// Configuration parameters struct. _config: ClientConfigMultiPaxos, @@ -1135,7 +1192,7 @@ pub struct MultiPaxosClient { server_id: ReplicaId, /// Control API stub to the cluster manager. - ctrl_stub: Option, + ctrl_stub: ClientCtrlStub, /// API stubs for communicating with servers. 
api_stub: Option, @@ -1143,47 +1200,43 @@ pub struct MultiPaxosClient { #[async_trait] impl GenericEndpoint for MultiPaxosClient { - fn new( + async fn new_and_setup( manager: SocketAddr, config_str: Option<&str>, ) -> Result { + // connect to the cluster manager and get assigned a client ID + let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; + let id = ctrl_stub.id; + + // parse protocol-specific configs let config = parsed_config!(config_str => ClientConfigMultiPaxos; init_server_id)?; let init_server_id = config.init_server_id; Ok(MultiPaxosClient { - id: 255, // nil at this time - manager, + id, _config: config, servers: HashMap::new(), server_id: init_server_id, - ctrl_stub: None, + ctrl_stub, api_stub: None, }) } - async fn connect(&mut self) -> Result { + async fn connect(&mut self) -> Result<(), SummersetError> { // disallow reconnection without leaving if self.api_stub.is_some() { return logged_err!(self.id; "reconnecting without leaving"); } - // if ctrl_stubs not established yet, connect to the manager - if self.ctrl_stub.is_none() { - let ctrl_stub = - ClientCtrlStub::new_by_connect(self.manager).await?; - self.id = ctrl_stub.id; - self.ctrl_stub = Some(ctrl_stub); - } - let ctrl_stub = self.ctrl_stub.as_mut().unwrap(); - // ask the manager about the list of active servers - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; while !sent { - sent = ctrl_stub.send_req(None)?; + sent = self.ctrl_stub.send_req(None)?; } - let reply = ctrl_stub.recv_reply().await?; + let reply = self.ctrl_stub.recv_reply().await?; match reply { CtrlReply::QueryInfo { servers } => { // connect to the one with server ID in config @@ -1194,7 +1247,7 @@ impl GenericEndpoint for MultiPaxosClient { .await?; self.api_stub = Some(api_stub); self.servers = servers; - Ok(self.id) + Ok(()) } _ => logged_err!(self.id; "unexpected reply type received"), } @@ -1222,26 +1275,19 @@ impl GenericEndpoint for MultiPaxosClient { // if permanently leaving, send leave notification to the manager if permanent { - // disallow multiple permanent leaving - if self.ctrl_stub.is_none() { - return logged_err!(self.id; "repeated permanent leaving"); + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; + while !sent { + sent = self.ctrl_stub.send_req(None)?; } - if let Some(mut ctrl_stub) = self.ctrl_stub.take() { - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; - while !sent { - sent = ctrl_stub.send_req(None)?; + let reply = self.ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::Leave => { + pf_info!(self.id; "left current manager connection"); } - - let reply = ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - ctrl_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } + _ => { + return logged_err!(self.id; "unexpected reply type received"); } } } @@ -1287,4 +1333,12 @@ impl GenericEndpoint for MultiPaxosClient { None => logged_err!(self.id; "client is not set up"), } } + + fn id(&self) -> ClientId { + self.id + } + + fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + &mut self.ctrl_stub + } } diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index bbfb79c6..ffbc57e1 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -21,7 +21,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use 
tokio::time::Duration; -use tokio::sync::mpsc; +use tokio::sync::watch; /// Configuration parameters struct. #[derive(Debug, Deserialize)] @@ -227,11 +227,58 @@ impl RepNothingReplica { Ok(()) } - /// Synthesized handler of manager control messages. - fn handle_ctrl_msg(&mut self, _msg: CtrlMsg) -> Result<(), SummersetError> { - // TODO: fill this when more control message types added + /// Handler of ResetState control message. + async fn handle_ctrl_reset_state( + &mut self, + durable: bool, + ) -> Result<(), SummersetError> { + // send leave notification to manager and wait for its reply + self.control_hub.send_ctrl(CtrlMsg::Leave)?; + while self.control_hub.recv_ctrl().await? != CtrlMsg::LeaveReply {} + + // if `durable` is false, truncate backer file + if !durable { + // use 0 as a special log action ID here + self.storage_hub + .submit_action(0, LogAction::Truncate { offset: 0 })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id == 0 { + if log_result + != (LogResult::Truncate { + offset_ok: true, + now_size: 0, + }) + { + return logged_err!(self.id; "failed to truncate log to 0"); + } else { + return Ok(()); + } + } + } + } + Ok(()) } + + /// Synthesized handler of manager control messages. If ok, returns + /// `Some(true)` if decides to terminate and reboot, `Some(false)` if + /// decides to shutdown completely, and `None` if not terminating. + async fn handle_ctrl_msg( + &mut self, + msg: CtrlMsg, + ) -> Result, SummersetError> { + // TODO: fill this when more control message types added + match msg { + CtrlMsg::ResetState { durable } => { + self.handle_ctrl_reset_state(durable).await?; + Ok(Some(true)) + } + + _ => Ok(None), // ignore all other types + } + } } #[async_trait] @@ -300,15 +347,10 @@ impl GenericReplica for RepNothingReplica { }) } - async fn run(&mut self) -> Result { - // set up termination signals handler - let (tx_term, mut rx_term) = mpsc::unbounded_channel(); - ctrlc::set_handler(move || { - if let Err(e) = tx_term.send(true) { - pf_error!("s"; "error sending to term channel: {}", e); - } - })?; - + async fn run( + &mut self, + mut rx_term: watch::Receiver, + ) -> Result { loop { tokio::select! { // client request batch @@ -354,19 +396,34 @@ impl GenericReplica for RepNothingReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { - pf_error!(self.id; "error handling ctrl msg: {}", e); + match self.handle_ctrl_msg(ctrl_msg).await { + Ok(terminate) => { + if let Some(restart) = terminate { + pf_warn!( + self.id; + "server got {} req", + if restart { "restart" } else { "shutdown" }); + return Ok(restart); + } + }, + Err(e) => { + pf_error!(self.id; "error handling ctrl msg: {}", e); + } } }, // receiving termination signal - _ = rx_term.recv() => { + _ = rx_term.changed() => { pf_warn!(self.id; "server caught termination signal"); return Ok(false); } } } } + + fn id(&self) -> ReplicaId { + self.id + } } /// Configuration parameters struct. @@ -388,14 +445,11 @@ pub struct RepNothingClient { /// Client ID. id: ClientId, - /// Address of the cluster manager oracle. - manager: SocketAddr, - /// Configuration parameters struct. config: ClientConfigRepNothing, /// Control API stub to the cluster manager. - ctrl_stub: Option, + ctrl_stub: ClientCtrlStub, /// API stubs for communicating with servers. 
api_stub: Option, @@ -403,44 +457,40 @@ pub struct RepNothingClient { #[async_trait] impl GenericEndpoint for RepNothingClient { - fn new( + async fn new_and_setup( manager: SocketAddr, config_str: Option<&str>, ) -> Result { + // connect to the cluster manager and get assigned a client ID + let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; + let id = ctrl_stub.id; + + // parse protocol-specific configs let config = parsed_config!(config_str => ClientConfigRepNothing; server_id)?; Ok(RepNothingClient { - id: 255, // nil at this time - manager, + id, config, - ctrl_stub: None, + ctrl_stub, api_stub: None, }) } - async fn connect(&mut self) -> Result { + async fn connect(&mut self) -> Result<(), SummersetError> { // disallow reconnection without leaving if self.api_stub.is_some() { return logged_err!(self.id; "reconnecting without leaving"); } - // if ctrl_stubs not established yet, connect to the manager - if self.ctrl_stub.is_none() { - let ctrl_stub = - ClientCtrlStub::new_by_connect(self.manager).await?; - self.id = ctrl_stub.id; - self.ctrl_stub = Some(ctrl_stub); - } - let ctrl_stub = self.ctrl_stub.as_mut().unwrap(); - // ask the manager about the list of active servers - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; while !sent { - sent = ctrl_stub.send_req(None)?; + sent = self.ctrl_stub.send_req(None)?; } - let reply = ctrl_stub.recv_reply().await?; + let reply = self.ctrl_stub.recv_reply().await?; match reply { CtrlReply::QueryInfo { servers } => { // connect to the one with server ID in config @@ -450,7 +500,7 @@ impl GenericEndpoint for RepNothingClient { ) .await?; self.api_stub = Some(api_stub); - Ok(self.id) + Ok(()) } _ => logged_err!(self.id; "unexpected reply type received"), } @@ -478,26 +528,19 @@ impl GenericEndpoint for RepNothingClient { // if permanently leaving, send leave notification to the manager if permanent { - // disallow multiple permanent leaving - if self.ctrl_stub.is_none() { - return logged_err!(self.id; "repeated permanent leaving"); + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; + while !sent { + sent = self.ctrl_stub.send_req(None)?; } - if let Some(mut ctrl_stub) = self.ctrl_stub.take() { - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; - while !sent { - sent = ctrl_stub.send_req(None)?; + let reply = self.ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::Leave => { + pf_info!(self.id; "left current manager connection"); } - - let reply = ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - ctrl_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } + _ => { + return logged_err!(self.id; "unexpected reply type received"); } } } @@ -521,4 +564,12 @@ impl GenericEndpoint for RepNothingClient { None => logged_err!(self.id; "client is not set up"), } } + + fn id(&self) -> ClientId { + self.id + } + + fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + &mut self.ctrl_stub + } } diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 1c72b41a..ada30d45 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -22,7 +22,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; -use tokio::sync::mpsc; +use tokio::sync::watch; use reed_solomon_erasure::galois_8::ReedSolomon; @@ -1001,11 +1001,61 @@ impl 
RSPaxosReplica { Ok(()) } - /// Synthesized handler of manager control messages. - fn handle_ctrl_msg(&mut self, _msg: CtrlMsg) -> Result<(), SummersetError> { - // TODO: fill this when more control message types added + /// Handler of ResetState control message. + async fn handle_ctrl_reset_state( + &mut self, + durable: bool, + ) -> Result<(), SummersetError> { + // send leave notification to peers and wait for their replies + self.transport_hub.leave().await?; + + // send leave notification to manager and wait for its reply + self.control_hub.send_ctrl(CtrlMsg::Leave)?; + while self.control_hub.recv_ctrl().await? != CtrlMsg::LeaveReply {} + + // if `durable` is false, truncate backer file + if !durable { + // use 0 as a special log action ID here + self.storage_hub + .submit_action(0, LogAction::Truncate { offset: 0 })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id == 0 { + if log_result + != (LogResult::Truncate { + offset_ok: true, + now_size: 0, + }) + { + return logged_err!(self.id; "failed to truncate log to 0"); + } else { + return Ok(()); + } + } + } + } + Ok(()) } + + /// Synthesized handler of manager control messages. If ok, returns + /// `Some(true)` if decides to terminate and reboot, `Some(false)` if + /// decides to shutdown completely, and `None` if not terminating. + async fn handle_ctrl_msg( + &mut self, + msg: CtrlMsg, + ) -> Result, SummersetError> { + // TODO: fill this when more control message types added + match msg { + CtrlMsg::ResetState { durable } => { + self.handle_ctrl_reset_state(durable).await?; + Ok(Some(true)) + } + + _ => Ok(None), // ignore all other types + } + } } #[async_trait] @@ -1114,15 +1164,10 @@ impl GenericReplica for RSPaxosReplica { }) } - async fn run(&mut self) -> Result { - // set up termination signals handler - let (tx_term, mut rx_term) = mpsc::unbounded_channel(); - ctrlc::set_handler(move || { - if let Err(e) = tx_term.send(true) { - pf_error!("s"; "error sending to term channel: {}", e); - } - })?; - + async fn run( + &mut self, + mut rx_term: watch::Receiver, + ) -> Result { // TODO: proper leader election if self.id == 0 { self.is_leader = true; @@ -1186,19 +1231,34 @@ impl GenericReplica for RSPaxosReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { - pf_error!(self.id; "error handling ctrl msg: {}", e); + match self.handle_ctrl_msg(ctrl_msg).await { + Ok(terminate) => { + if let Some(restart) = terminate { + pf_warn!( + self.id; + "server got {} req", + if restart { "restart" } else { "shutdown" }); + return Ok(restart); + } + }, + Err(e) => { + pf_error!(self.id; "error handling ctrl msg: {}", e); + } } }, // receiving termination signal - _ = rx_term.recv() => { + _ = rx_term.changed() => { pf_warn!(self.id; "server caught termination signal"); return Ok(false); } } } } + + fn id(&self) -> ReplicaId { + self.id + } } /// Configuration parameters struct. @@ -1220,9 +1280,6 @@ pub struct RSPaxosClient { /// Client ID. id: ClientId, - /// Address of the cluster manager oracle. - manager: SocketAddr, - /// Configuration parameters struct. _config: ClientConfigRSPaxos, @@ -1233,7 +1290,7 @@ pub struct RSPaxosClient { server_id: ReplicaId, /// Control API stub to the cluster manager. - ctrl_stub: Option, + ctrl_stub: ClientCtrlStub, /// API stubs for communicating with servers. 
api_stub: Option, @@ -1241,47 +1298,43 @@ pub struct RSPaxosClient { #[async_trait] impl GenericEndpoint for RSPaxosClient { - fn new( + async fn new_and_setup( manager: SocketAddr, config_str: Option<&str>, ) -> Result { + // connect to the cluster manager and get assigned a client ID + let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; + let id = ctrl_stub.id; + + // parse protocol-specific configs let config = parsed_config!(config_str => ClientConfigRSPaxos; init_server_id)?; let init_server_id = config.init_server_id; Ok(RSPaxosClient { - id: 255, // nil at this time - manager, + id, _config: config, servers: HashMap::new(), server_id: init_server_id, - ctrl_stub: None, + ctrl_stub, api_stub: None, }) } - async fn connect(&mut self) -> Result { + async fn connect(&mut self) -> Result<(), SummersetError> { // disallow reconnection without leaving if self.api_stub.is_some() { return logged_err!(self.id; "reconnecting without leaving"); } - // if ctrl_stubs not established yet, connect to the manager - if self.ctrl_stub.is_none() { - let ctrl_stub = - ClientCtrlStub::new_by_connect(self.manager).await?; - self.id = ctrl_stub.id; - self.ctrl_stub = Some(ctrl_stub); - } - let ctrl_stub = self.ctrl_stub.as_mut().unwrap(); - // ask the manager about the list of active servers - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; while !sent { - sent = ctrl_stub.send_req(None)?; + sent = self.ctrl_stub.send_req(None)?; } - let reply = ctrl_stub.recv_reply().await?; + let reply = self.ctrl_stub.recv_reply().await?; match reply { CtrlReply::QueryInfo { servers } => { // connect to the one with server ID in config @@ -1292,7 +1345,7 @@ impl GenericEndpoint for RSPaxosClient { .await?; self.api_stub = Some(api_stub); self.servers = servers; - Ok(self.id) + Ok(()) } _ => logged_err!(self.id; "unexpected reply type received"), } @@ -1320,26 +1373,19 @@ impl GenericEndpoint for RSPaxosClient { // if permanently leaving, send leave notification to the manager if permanent { - // disallow multiple permanent leaving - if self.ctrl_stub.is_none() { - return logged_err!(self.id; "repeated permanent leaving"); + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; + while !sent { + sent = self.ctrl_stub.send_req(None)?; } - if let Some(mut ctrl_stub) = self.ctrl_stub.take() { - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; - while !sent { - sent = ctrl_stub.send_req(None)?; + let reply = self.ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::Leave => { + pf_info!(self.id; "left current manager connection"); } - - let reply = ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - ctrl_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } + _ => { + return logged_err!(self.id; "unexpected reply type received"); } } } @@ -1385,4 +1431,12 @@ impl GenericEndpoint for RSPaxosClient { None => logged_err!(self.id; "client is not set up"), } } + + fn id(&self) -> ClientId { + self.id + } + + fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + &mut self.ctrl_stub + } } diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index b0156ad3..7d9aa763 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -22,7 +22,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; -use 
tokio::sync::mpsc; +use tokio::sync::watch; /// Configuration parameters struct. #[derive(Debug, Deserialize)] @@ -383,11 +383,61 @@ impl SimplePushReplica { Ok(()) } - /// Synthesized handler of manager control messages. - fn handle_ctrl_msg(&mut self, _msg: CtrlMsg) -> Result<(), SummersetError> { - // TODO: fill this when more control message types added + /// Handler of ResetState control message. + async fn handle_ctrl_reset_state( + &mut self, + durable: bool, + ) -> Result<(), SummersetError> { + // send leave notification to peers and wait for their replies + self.transport_hub.leave().await?; + + // send leave notification to manager and wait for its reply + self.control_hub.send_ctrl(CtrlMsg::Leave)?; + while self.control_hub.recv_ctrl().await? != CtrlMsg::LeaveReply {} + + // if `durable` is false, truncate backer file + if !durable { + // use 0 as a special log action ID here + self.storage_hub + .submit_action(0, LogAction::Truncate { offset: 0 })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id == 0 { + if log_result + != (LogResult::Truncate { + offset_ok: true, + now_size: 0, + }) + { + return logged_err!(self.id; "failed to truncate log to 0"); + } else { + return Ok(()); + } + } + } + } + Ok(()) } + + /// Synthesized handler of manager control messages. If ok, returns + /// `Some(true)` if decides to terminate and reboot, `Some(false)` if + /// decides to shutdown completely, and `None` if not terminating. + async fn handle_ctrl_msg( + &mut self, + msg: CtrlMsg, + ) -> Result, SummersetError> { + // TODO: fill this when more control message types added + match msg { + CtrlMsg::ResetState { durable } => { + self.handle_ctrl_reset_state(durable).await?; + Ok(Some(true)) + } + + _ => Ok(None), // ignore all other types + } + } } #[async_trait] @@ -476,15 +526,10 @@ impl GenericReplica for SimplePushReplica { }) } - async fn run(&mut self) -> Result { - // set up termination signals handler - let (tx_term, mut rx_term) = mpsc::unbounded_channel(); - ctrlc::set_handler(move || { - if let Err(e) = tx_term.send(true) { - pf_error!("s"; "error sending to term channel: {}", e); - } - })?; - + async fn run( + &mut self, + mut rx_term: watch::Receiver, + ) -> Result { loop { tokio::select! { // client request batch @@ -552,19 +597,34 @@ impl GenericReplica for SimplePushReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { - pf_error!(self.id; "error handling ctrl msg: {}", e); + match self.handle_ctrl_msg(ctrl_msg).await { + Ok(terminate) => { + if let Some(restart) = terminate { + pf_warn!( + self.id; + "server got {} req", + if restart { "restart" } else { "shutdown" }); + return Ok(restart); + } + }, + Err(e) => { + pf_error!(self.id; "error handling ctrl msg: {}", e); + } } }, // receiving termination signal - _ = rx_term.recv() => { + _ = rx_term.changed() => { pf_warn!(self.id; "server caught termination signal"); return Ok(false); } } } } + + fn id(&self) -> ReplicaId { + self.id + } } /// Configuration parameters struct. @@ -586,14 +646,11 @@ pub struct SimplePushClient { /// Client ID. id: ClientId, - /// Address of the cluster manager oracle. - manager: SocketAddr, - /// Configuration parameters struct. config: ClientConfigSimplePush, /// Control API stub to the cluster manager. - ctrl_stub: Option, + ctrl_stub: ClientCtrlStub, /// API stubs for communicating with servers. 
api_stub: Option, @@ -601,44 +658,40 @@ pub struct SimplePushClient { #[async_trait] impl GenericEndpoint for SimplePushClient { - fn new( + async fn new_and_setup( manager: SocketAddr, config_str: Option<&str>, ) -> Result { + // connect to the cluster manager and get assigned a client ID + let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; + let id = ctrl_stub.id; + + // parse protocol-specific configs let config = parsed_config!(config_str => ClientConfigSimplePush; server_id)?; Ok(SimplePushClient { - id: 255, // nil at this time - manager, + id, config, - ctrl_stub: None, + ctrl_stub, api_stub: None, }) } - async fn connect(&mut self) -> Result { + async fn connect(&mut self) -> Result<(), SummersetError> { // disallow reconnection without leaving if self.api_stub.is_some() { return logged_err!(self.id; "reconnecting without leaving"); } - // if ctrl_stubs not established yet, connect to the manager - if self.ctrl_stub.is_none() { - let ctrl_stub = - ClientCtrlStub::new_by_connect(self.manager).await?; - self.id = ctrl_stub.id; - self.ctrl_stub = Some(ctrl_stub); - } - let ctrl_stub = self.ctrl_stub.as_mut().unwrap(); - // ask the manager about the list of active servers - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; while !sent { - sent = ctrl_stub.send_req(None)?; + sent = self.ctrl_stub.send_req(None)?; } - let reply = ctrl_stub.recv_reply().await?; + let reply = self.ctrl_stub.recv_reply().await?; match reply { CtrlReply::QueryInfo { servers } => { // connect to the one with server ID in config @@ -648,7 +701,7 @@ impl GenericEndpoint for SimplePushClient { ) .await?; self.api_stub = Some(api_stub); - Ok(self.id) + Ok(()) } _ => logged_err!(self.id; "unexpected reply type received"), } @@ -676,26 +729,19 @@ impl GenericEndpoint for SimplePushClient { // if permanently leaving, send leave notification to the manager if permanent { - // disallow multiple permanent leaving - if self.ctrl_stub.is_none() { - return logged_err!(self.id; "repeated permanent leaving"); + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; + while !sent { + sent = self.ctrl_stub.send_req(None)?; } - if let Some(mut ctrl_stub) = self.ctrl_stub.take() { - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; - while !sent { - sent = ctrl_stub.send_req(None)?; + let reply = self.ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::Leave => { + pf_info!(self.id; "left current manager connection"); } - - let reply = ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - ctrl_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } + _ => { + return logged_err!(self.id; "unexpected reply type received"); } } } @@ -719,4 +765,12 @@ impl GenericEndpoint for SimplePushClient { None => logged_err!(self.id; "client is not set up"), } } + + fn id(&self) -> ClientId { + self.id + } + + fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + &mut self.ctrl_stub + } } diff --git a/src/server/external.rs b/src/server/external.rs index 3083a662..c52a946c 100644 --- a/src/server/external.rs +++ b/src/server/external.rs @@ -519,6 +519,7 @@ mod external_tests { ) .await?; barrier2.wait().await; + // recv requests from client let mut reqs: Vec<(ClientId, ApiRequest)> = vec![]; while reqs.len() < 3 { let mut req_batch = api.get_req_batch().await?; @@ -551,6 +552,7 @@ mod 
external_tests { cmd: Command::Get { key: "Jose".into() }, } ); + // send replies to client api.send_reply( ApiReply::Reply { id: 0, @@ -584,6 +586,7 @@ mod external_tests { let mut api_stub = ClientApiStub::new_by_connect(2857, "127.0.0.1:53700".parse()?) .await?; + // send requests to server api_stub.send_req(Some(&ApiRequest::Req { id: 0, cmd: Command::Put { @@ -599,6 +602,7 @@ mod external_tests { id: 1, cmd: Command::Get { key: "Jose".into() }, }))?; + // recv replies from server assert_eq!( api_stub.recv_reply().await?, ApiReply::Reply { @@ -642,6 +646,7 @@ mod external_tests { ) .await?; barrier2.wait().await; + // recv request from client let mut reqs: Vec<(ClientId, ApiRequest)> = vec![]; while reqs.is_empty() { let mut req_batch = api.get_req_batch().await?; @@ -660,6 +665,7 @@ mod external_tests { }, } ); + // send reply to client api.send_reply( ApiReply::Reply { id: 0, @@ -668,6 +674,7 @@ mod external_tests { }, client, )?; + // recv request from new client reqs.clear(); while reqs.is_empty() { let mut req_batch = api.get_req_batch().await?; @@ -687,6 +694,7 @@ mod external_tests { }, } ); + // send reply to new client api.send_reply( ApiReply::Reply { id: 0, @@ -704,6 +712,7 @@ mod external_tests { let mut api_stub = ClientApiStub::new_by_connect(2857, "127.0.0.1:54700".parse()?) .await?; + // send request to server api_stub.send_req(Some(&ApiRequest::Req { id: 0, cmd: Command::Put { @@ -711,6 +720,7 @@ mod external_tests { value: "123".into(), }, }))?; + // recv reply from server assert_eq!( api_stub.recv_reply().await?, ApiReply::Reply { @@ -719,13 +729,15 @@ mod external_tests { redirect: None, } ); + // leave and come back as new client api_stub.send_req(Some(&ApiRequest::Leave))?; assert_eq!(api_stub.recv_reply().await?, ApiReply::Leave); api_stub.forget(); - time::sleep(Duration::from_millis(1)).await; + time::sleep(Duration::from_millis(100)).await; let mut api_stub = ClientApiStub::new_by_connect(2858, "127.0.0.1:54700".parse()?) .await?; + // send request to server api_stub.send_req(Some(&ApiRequest::Req { id: 0, cmd: Command::Put { @@ -733,6 +745,7 @@ mod external_tests { value: "456".into(), }, }))?; + // recv reply from server assert_eq!( api_stub.recv_reply().await?, ApiReply::Reply { diff --git a/src/server/replica.rs b/src/server/replica.rs index cae9d042..6c305ad3 100644 --- a/src/server/replica.rs +++ b/src/server/replica.rs @@ -7,6 +7,8 @@ use crate::utils::SummersetError; use async_trait::async_trait; +use tokio::sync::watch; + /// Server replica ID type. pub type ReplicaId = u8; @@ -28,5 +30,11 @@ pub trait GenericReplica { /// terminated normally and wants to restart (e.g., receiving a reset /// control message) or `Ok(false)` if terminated normally and does not /// want to restart (e.g., receiving a termination signal). - async fn run(&mut self) -> Result; + async fn run( + &mut self, + rx_term: watch::Receiver, // termination signals channel + ) -> Result; + + /// Gets my replica ID. 
+ fn id(&self) -> ReplicaId; } diff --git a/src/server/transport.rs b/src/server/transport.rs index 8f5f69cf..e7ca2998 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -11,7 +11,7 @@ use crate::server::ReplicaId; use bytes::BytesMut; -use serde::{Serialize, de::DeserializeOwned}; +use serde::{Serialize, Deserialize, de::DeserializeOwned}; use tokio::net::{TcpListener, TcpStream}; use tokio::net::tcp::{OwnedReadHalf, OwnedWriteHalf}; @@ -20,6 +20,19 @@ use tokio::sync::mpsc; use tokio::task::JoinHandle; use tokio::time::{self, Duration}; +/// Peer-peer message wrapper type that includes leave notification variants. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +enum PeerMessage { + /// Normal protocol-specific request. + Msg { msg: Msg }, + + /// Server leave notification. + Leave, + + /// Reply to leave notification. + LeaveReply, +} + /// Server internal TCP transport module. pub struct TransportHub { /// My replica ID. @@ -29,11 +42,14 @@ pub struct TransportHub { population: u8, /// Receiver side of the recv channel. - rx_recv: mpsc::UnboundedReceiver<(ReplicaId, Msg)>, + rx_recv: mpsc::UnboundedReceiver<(ReplicaId, PeerMessage)>, /// Map from peer ID -> sender side of the send channel, shared with the /// peer acceptor thread. - tx_sends: flashmap::ReadHandle>, + tx_sends: flashmap::ReadHandle< + ReplicaId, + mpsc::UnboundedSender>, + >, /// Join handle of the peer acceptor thread. _peer_acceptor_handle: JoinHandle<()>, @@ -76,8 +92,10 @@ where let (tx_recv, rx_recv) = mpsc::unbounded_channel(); - let (tx_sends_write, tx_sends_read) = - flashmap::new::>(); + let (tx_sends_write, tx_sends_read) = flashmap::new::< + ReplicaId, + mpsc::UnboundedSender>, + >(); let (peer_messenger_handles_write, peer_messenger_handles_read) = flashmap::new::>(); @@ -141,7 +159,7 @@ where logged_err!(self.me; "invalid group size {}", group) } else { while self.current_peers()?.count() + 1 < group { - time::sleep(Duration::from_millis(10)).await; + time::sleep(Duration::from_millis(100)).await; } Ok(()) } @@ -170,7 +188,7 @@ where match tx_sends_guard.get(&peer) { Some(tx_send) => { tx_send - .send(msg) + .send(PeerMessage::Msg { msg }) .map_err(|e| SummersetError(e.to_string()))?; } None => { @@ -207,7 +225,7 @@ where tx_sends_guard .get(&peer) .unwrap() - .send(msg.clone()) + .send(PeerMessage::Msg { msg: msg.clone() }) .map_err(|e| SummersetError(e.to_string()))?; } @@ -220,10 +238,47 @@ where &mut self, ) -> Result<(ReplicaId, Msg), SummersetError> { match self.rx_recv.recv().await { - Some((id, msg)) => Ok((id, msg)), + Some((id, peer_msg)) => match peer_msg { + PeerMessage::Msg { msg } => Ok((id, msg)), + _ => logged_err!(self.me; "unexpected peer message type"), + }, None => logged_err!(self.me; "recv channel has been closed"), } } + + /// Broadcasts leave notifications to all peers and waits for replies. 
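+ /// Concretely, a `PeerMessage::Leave` is pushed onto every connected
+ /// peer's send channel; replies are then collected off the recv channel
+ /// into a bitmap until every notified peer has acked with `LeaveReply`,
+ /// dropping any other message types received in the meantime.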
+ pub async fn leave(&mut self) -> Result<(), SummersetError> { + let tx_sends_guard = self.tx_sends.guard(); + let mut num_peers = 0; + for &peer in tx_sends_guard.keys() { + if peer == self.me { + continue; + } + + // not skipped + tx_sends_guard + .get(&peer) + .unwrap() + .send(PeerMessage::Leave) + .map_err(|e| SummersetError(e.to_string()))?; + num_peers += 1; + } + + let mut replies = Bitmap::new(self.population, false); + while replies.count() < num_peers { + match self.rx_recv.recv().await { + Some((id, peer_msg)) => match peer_msg { + PeerMessage::LeaveReply => replies.set(id, true)?, + _ => continue, // ignore all other types of messages + }, + None => { + return logged_err!(self.me; "recv channel has been closed"); + } + } + } + + Ok(()) + } } // TransportHub peer_acceptor thread implementation @@ -242,10 +297,10 @@ where me: ReplicaId, id: ReplicaId, addr: SocketAddr, - tx_recv: mpsc::UnboundedSender<(ReplicaId, Msg)>, + tx_recv: mpsc::UnboundedSender<(ReplicaId, PeerMessage)>, tx_sends: &mut flashmap::WriteHandle< ReplicaId, - mpsc::UnboundedSender, + mpsc::UnboundedSender>, >, peer_messenger_handles: &mut flashmap::WriteHandle< ReplicaId, @@ -280,10 +335,10 @@ where me: ReplicaId, mut stream: TcpStream, addr: SocketAddr, - tx_recv: mpsc::UnboundedSender<(ReplicaId, Msg)>, + tx_recv: mpsc::UnboundedSender<(ReplicaId, PeerMessage)>, tx_sends: &mut flashmap::WriteHandle< ReplicaId, - mpsc::UnboundedSender, + mpsc::UnboundedSender>, >, peer_messenger_handles: &mut flashmap::WriteHandle< ReplicaId, @@ -321,7 +376,7 @@ where id: ReplicaId, tx_sends: &mut flashmap::WriteHandle< ReplicaId, - mpsc::UnboundedSender, + mpsc::UnboundedSender>, >, peer_messenger_handles: &mut flashmap::WriteHandle< ReplicaId, @@ -343,11 +398,11 @@ where /// Peer acceptor thread function. 
async fn peer_acceptor_thread( me: ReplicaId, - tx_recv: mpsc::UnboundedSender<(ReplicaId, Msg)>, + tx_recv: mpsc::UnboundedSender<(ReplicaId, PeerMessage)>, peer_listener: TcpListener, mut tx_sends: flashmap::WriteHandle< ReplicaId, - mpsc::UnboundedSender, + mpsc::UnboundedSender>, >, mut peer_messenger_handles: flashmap::WriteHandle< ReplicaId, @@ -444,7 +499,7 @@ where write_buf: &mut BytesMut, write_buf_cursor: &mut usize, conn_write: &OwnedWriteHalf, - msg: Option<&Msg>, + msg: Option<&PeerMessage>, ) -> Result { safe_tcp_write(write_buf, write_buf_cursor, conn_write, msg) } @@ -455,7 +510,7 @@ where // message itself read_buf: &mut BytesMut, conn_read: &mut OwnedReadHalf, - ) -> Result { + ) -> Result, SummersetError> { safe_tcp_read(read_buf, conn_read).await } @@ -465,8 +520,8 @@ where id: ReplicaId, // corresonding peer's ID addr: SocketAddr, // corresponding peer's address conn: TcpStream, - mut rx_send: mpsc::UnboundedReceiver, - tx_recv: mpsc::UnboundedSender<(ReplicaId, Msg)>, + mut rx_send: mpsc::UnboundedReceiver>, + tx_recv: mpsc::UnboundedSender<(ReplicaId, PeerMessage)>, tx_exit: mpsc::UnboundedSender, ) { pf_debug!(me; "peer_messenger thread for {} ({}) spawned", id, addr); @@ -482,12 +537,32 @@ where // gets a message to send out msg = rx_send.recv(), if !retrying => { match msg { - Some(msg) => { + Some(PeerMessage::Leave) => { + // I decide to leave, notify peers + let peer_msg = PeerMessage::Leave; + if let Err(e) = Self::write_msg( + &mut write_buf, + &mut write_buf_cursor, + &conn_write, + Some(&peer_msg), + ) { + pf_error!(me; "error sending -> {}: {}", id, e); + } else { // skips `WouldBlock` failure check here + pf_debug!(me; "sent leave notification -> {}", id); + } + }, + + Some(PeerMessage::LeaveReply) => { + pf_error!(me; "proactively sending LeaveReply msg"); + }, + + Some(PeerMessage::Msg { msg }) => { + let peer_msg = PeerMessage::Msg { msg }; match Self::write_msg( &mut write_buf, &mut write_buf_cursor, &conn_write, - Some(&msg), + Some(&peer_msg), ) { Ok(true) => { // pf_trace!(me; "sent -> {} msg {:?}", id, msg); @@ -501,6 +576,7 @@ where } } }, + None => break, // channel gets closed and no messages remain } }, @@ -508,9 +584,35 @@ where // receives new message from peer msg = Self::read_msg(&mut read_buf, &mut conn_read) => { match msg { - Ok(msg) => { + Ok(PeerMessage::Leave) => { + // peer leaving, send dummy reply and break + let peer_msg = PeerMessage::LeaveReply; + if let Err(e) = Self::write_msg( + &mut write_buf, + &mut write_buf_cursor, + &conn_write, + Some(&peer_msg), + ) { + pf_error!(me; "error sending -> {}: {}", id, e); + } else { // skips `WouldBlock` failure check here + pf_debug!(me; "peer {} has left", id); + } + break; + }, + + Ok(PeerMessage::LeaveReply) => { + // my leave notification is acked by peer, break + let peer_msg = PeerMessage::LeaveReply; + if let Err(e) = tx_recv.send((id, peer_msg)) { + pf_error!(me; "error sending to tx_recv for {}: {}", id, e); + } + break; + } + + Ok(PeerMessage::Msg { msg }) => { // pf_trace!(me; "recv <- {} msg {:?}", id, msg); - if let Err(e) = tx_recv.send((id, msg)) { + let peer_msg = PeerMessage::Msg { msg }; + if let Err(e) = tx_recv.send((id, peer_msg)) { pf_error!(me; "error sending to tx_recv for {}: {}", id, e); } }, @@ -570,53 +672,53 @@ mod transport_tests { tokio::spawn(async move { // replica 1 let mut hub: TransportHub = - TransportHub::new_and_setup(1, 3, "127.0.0.1:54801".parse()?) + TransportHub::new_and_setup(1, 3, "127.0.0.1:53801".parse()?) 
.await?; barrier1.wait().await; - hub.connect_to_peer(2, "127.0.0.1:54802".parse()?).await?; + hub.connect_to_peer(2, "127.0.0.1:53802".parse()?).await?; // recv a message from 0 let (id, msg) = hub.recv_msg().await?; - assert!(id == 0); + assert_eq!(id, 0); assert_eq!(msg, TestMsg("hello".into())); // send a message to 0 hub.send_msg(TestMsg("world".into()), 0)?; // recv another message from 0 let (id, msg) = hub.recv_msg().await?; - assert!(id == 0); + assert_eq!(id, 0); assert_eq!(msg, TestMsg("nice".into())); // send another message to 0 hub.send_msg(TestMsg("job!".into()), 0)?; // wait for termination message let (id, msg) = hub.recv_msg().await?; - assert!(id == 0); + assert_eq!(id, 0); assert_eq!(msg, TestMsg("terminate".into())); Ok::<(), SummersetError>(()) }); tokio::spawn(async move { // replica 2 let mut hub: TransportHub = - TransportHub::new_and_setup(2, 3, "127.0.0.1:54802".parse()?) + TransportHub::new_and_setup(2, 3, "127.0.0.1:53802".parse()?) .await?; barrier2.wait().await; // recv a message from 0 let (id, msg) = hub.recv_msg().await?; - assert!(id == 0); + assert_eq!(id, 0); assert_eq!(msg, TestMsg("hello".into())); // send a message to 0 hub.send_msg(TestMsg("world".into()), 0)?; // wait for termination message let (id, msg) = hub.recv_msg().await?; - assert!(id == 0); + assert_eq!(id, 0); assert_eq!(msg, TestMsg("terminate".into())); Ok::<(), SummersetError>(()) }); // replica 0 let mut hub: TransportHub = - TransportHub::new_and_setup(0, 3, "127.0.0.1:54800".parse()?) + TransportHub::new_and_setup(0, 3, "127.0.0.1:53800".parse()?) .await?; barrier.wait().await; - hub.connect_to_peer(1, "127.0.0.1:54801".parse()?).await?; - hub.connect_to_peer(2, "127.0.0.1:54802".parse()?).await?; + hub.connect_to_peer(1, "127.0.0.1:53801".parse()?).await?; + hub.connect_to_peer(2, "127.0.0.1:53802".parse()?).await?; // send a message to 1 and 2 hub.bcast_msg(TestMsg("hello".into()), None)?; // recv a message from both 1 and 2 @@ -638,4 +740,49 @@ mod transport_tests { hub.bcast_msg(TestMsg("terminate".into()), None)?; Ok(()) } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn api_server_leave() -> Result<(), SummersetError> { + let barrier = Arc::new(Barrier::new(2)); + let barrier2 = barrier.clone(); + tokio::spawn(async move { + // replica 1/2 + let mut hub: TransportHub = + TransportHub::new_and_setup(1, 3, "127.0.0.1:54801".parse()?) + .await?; + barrier2.wait().await; + // recv a message from 0 + let (id, msg) = hub.recv_msg().await?; + assert_eq!(id, 0); + assert!(hub.current_peers()?.get(id)?); + assert_eq!(msg, TestMsg("goodbye".into())); + // leave and come back as 2 + hub.leave().await?; + time::sleep(Duration::from_millis(100)).await; + let mut hub: TransportHub = + TransportHub::new_and_setup(2, 3, "127.0.0.1:54802".parse()?) + .await?; + hub.connect_to_peer(0, "127.0.0.1:54800".parse()?).await?; + // send a message to 0 + hub.send_msg(TestMsg("hello".into()), 0)?; + Ok::<(), SummersetError>(()) + }); + // replica 0 + let mut hub: TransportHub = + TransportHub::new_and_setup(0, 3, "127.0.0.1:54800".parse()?) 
+ .await?; + barrier.wait().await; + hub.connect_to_peer(1, "127.0.0.1:54801".parse()?).await?; + assert!(hub.current_peers()?.get(1)?); + assert!(!hub.current_peers()?.get(2)?); + // send a message to 1 + hub.send_msg(TestMsg("goodbye".into()), 1)?; + // recv a message from 2 + let (id, msg) = hub.recv_msg().await?; + assert_eq!(id, 2); + assert_eq!(msg, TestMsg("hello".into())); + assert!(!hub.current_peers()?.get(1)?); + assert!(hub.current_peers()?.get(2)?); + Ok(()) + } } diff --git a/summerset_client/Cargo.toml b/summerset_client/Cargo.toml index 9633986c..bf270e74 100644 --- a/summerset_client/Cargo.toml +++ b/summerset_client/Cargo.toml @@ -7,7 +7,7 @@ authors = ["Guanzhou Jose Hu "] [dependencies] summerset = { path = "../" } -tokio = { version = "1.29", features = ["macros", "rt-multi-thread"] } +tokio = { version = "1.29", features = ["full"] } rand = "0.8" lazy_static = "1.4" clap = { version = "4.0", features = ["derive"] } diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index 98743091..9ad2c3d6 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -18,8 +18,8 @@ use serde::Deserialize; use tokio::time::Duration; use summerset::{ - GenericEndpoint, CommandResult, RequestId, SummersetError, pf_error, - logged_err, parsed_config, + GenericEndpoint, CommandResult, RequestId, CtrlRequest, CtrlReply, + SummersetError, pf_error, logged_err, parsed_config, }; lazy_static! { @@ -215,13 +215,35 @@ impl ClientTester { } } + /// Resets all servers in the cluster to initial empty state. + async fn reset_cluster(&mut self) -> Result<(), SummersetError> { + let ctrl_stub = self.driver.ctrl_stub(); + + // send ResetServer request to manager + let req = CtrlRequest::ResetServer { + server: None, + durable: false, + }; + let mut sent = ctrl_stub.send_req(Some(&req))?; + while !sent { + sent = ctrl_stub.send_req(None)?; + } + + // wait for reply from manager + let reply = ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::ResetServer { .. } => Ok(()), + _ => logged_err!("c"; "unexpected control reply type"), + } + } + /// Runs the individual correctness test. async fn do_test_by_name( &mut self, name: &str, ) -> Result<(), SummersetError> { // reset everything to initial state at the start of each test - // TODO: reset service state here + self.reset_cluster().await?; self.driver.connect().await?; self.cached_replies.clear(); diff --git a/summerset_client/src/drivers/closed_loop.rs b/summerset_client/src/drivers/closed_loop.rs index c3b1761c..cb361cf5 100644 --- a/summerset_client/src/drivers/closed_loop.rs +++ b/summerset_client/src/drivers/closed_loop.rs @@ -4,13 +4,14 @@ use tokio::time::{Duration, Instant}; use summerset::{ GenericEndpoint, ClientId, Command, CommandResult, ApiRequest, ApiReply, - RequestId, Timer, SummersetError, pf_debug, pf_error, logged_err, + RequestId, ClientCtrlStub, Timer, SummersetError, pf_debug, pf_error, + logged_err, }; /// Closed-loop driver struct. pub struct DriverClosedLoop { /// Client ID. - id: ClientId, + pub id: ClientId, /// Protocol-specific client endpoint. endpoint: Box, @@ -29,7 +30,7 @@ impl DriverClosedLoop { /// Creates a new closed-loop client. pub fn new(endpoint: Box, timeout: Duration) -> Self { DriverClosedLoop { - id: 255, // nil at this time + id: endpoint.id(), endpoint, next_req: 0, timer: Timer::new(), @@ -39,9 +40,7 @@ impl DriverClosedLoop { /// Establishes connection with the service. 
pub async fn connect(&mut self) -> Result<(), SummersetError> { - let id = self.endpoint.connect().await?; - self.id = id; - Ok(()) + self.endpoint.connect().await } /// Sends leave notification and forgets about the current TCP connections. @@ -185,4 +184,10 @@ impl DriverClosedLoop { _ => logged_err!(self.id; "unexpected reply type received"), } } + + /// Gets a mutable reference to the endpoint's control stub. + #[allow(dead_code)] + pub fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + self.endpoint.ctrl_stub() + } } diff --git a/summerset_client/src/drivers/open_loop.rs b/summerset_client/src/drivers/open_loop.rs index d07c4351..433f68a2 100644 --- a/summerset_client/src/drivers/open_loop.rs +++ b/summerset_client/src/drivers/open_loop.rs @@ -11,13 +11,14 @@ use tokio::time::{Duration, Instant}; use summerset::{ GenericEndpoint, ClientId, Command, CommandResult, ApiRequest, ApiReply, - RequestId, Timer, SummersetError, pf_debug, pf_error, logged_err, + RequestId, ClientCtrlStub, Timer, SummersetError, pf_debug, pf_error, + logged_err, }; /// Open-loop driver struct. pub struct DriverOpenLoop { /// Client ID. - id: ClientId, + pub id: ClientId, /// Protocol-specific client endpoint. endpoint: Box, @@ -43,7 +44,7 @@ impl DriverOpenLoop { /// Creates a new open-loop client. pub fn new(endpoint: Box, timeout: Duration) -> Self { DriverOpenLoop { - id: 255, // nil at this time + id: endpoint.id(), endpoint, next_req: 0, pending_reqs: HashMap::new(), @@ -55,9 +56,7 @@ impl DriverOpenLoop { /// Establishes connection with the service. pub async fn connect(&mut self) -> Result<(), SummersetError> { - let id = self.endpoint.connect().await?; - self.id = id; - Ok(()) + self.endpoint.connect().await } /// Waits for all pending replies to be received, then sends leave @@ -211,4 +210,9 @@ impl DriverOpenLoop { _ => logged_err!(self.id; "unexpected reply type received"), } } + + /// Gets a mutable reference to the endpoint's control stub. 
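+ /// Callers drive the stub with the non-blocking send idiom used
+ /// throughout this patch: `send_req(Some(&req))` enqueues a request
+ /// and reports whether it was fully written, and `send_req(None)` is
+ /// retried until it returns true before awaiting the reply.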
+ pub fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + self.endpoint.ctrl_stub() + } } diff --git a/summerset_client/src/main.rs b/summerset_client/src/main.rs index 26346720..81fbcdba 100644 --- a/summerset_client/src/main.rs +++ b/summerset_client/src/main.rs @@ -113,8 +113,9 @@ fn client_main() -> Result<(), SummersetError> { // enter tokio runtime, connect to the service, and do work runtime.block_on(async move { - let endpoint = - protocol.new_client_endpoint(args.manager, config_str)?; + let endpoint = protocol + .new_client_endpoint(args.manager, config_str) + .await?; match mode { ClientMode::Repl => { diff --git a/summerset_manager/Cargo.toml b/summerset_manager/Cargo.toml index f0464aa8..f2920305 100644 --- a/summerset_manager/Cargo.toml +++ b/summerset_manager/Cargo.toml @@ -7,8 +7,9 @@ authors = ["Guanzhou Jose Hu "] [dependencies] summerset = { path = "../" } -tokio = { version = "1.29", features = ["macros", "rt-multi-thread"] } +tokio = { version = "1.29", features = ["full"] } rand = "0.8" clap = { version = "4.0", features = ["derive"] } log = "0.4" env_logger = "0.10" +ctrlc = { version = "3.4", features = ["termination"] } diff --git a/summerset_manager/src/main.rs b/summerset_manager/src/main.rs index 6b886372..9a08319e 100644 --- a/summerset_manager/src/main.rs +++ b/summerset_manager/src/main.rs @@ -5,11 +5,14 @@ use std::process::ExitCode; use clap::Parser; +use log::{self, LevelFilter}; + use env_logger::Env; use tokio::runtime::Builder; +use tokio::sync::watch; -use summerset::{SmrProtocol, SummersetError, pf_warn, pf_error}; +use summerset::{SmrProtocol, SummersetError, pf_error}; /// Command line arguments definition. #[derive(Parser, Debug)] @@ -99,24 +102,43 @@ fn manager_main() -> Result<(), SummersetError> { )) })?; - // create tokio multi-threaded runtime - let runtime = Builder::new_multi_thread() - .enable_all() - .worker_threads(args.threads) - .thread_name("tokio-worker-manager") - .build()?; + // set up termination signals handler + let (tx_term, rx_term) = watch::channel(false); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("m"; "error sending to term channel: {}", e); + } + })?; + + let log_level = log::max_level(); - // enter tokio runtime, setup the cluster manager, and start the main - // event loop logic - runtime.block_on(async move { - let mut manager = protocol - .new_cluster_manager_setup(srv_addr, cli_addr, args.population) - .await?; + { + // create tokio multi-threaded runtime + let runtime = Builder::new_multi_thread() + .enable_all() + .worker_threads(args.threads) + .thread_name("tokio-worker-manager") + .build()?; - manager.run().await?; + // enter tokio runtime, setup the cluster manager, and start the main + // event loop logic + runtime.block_on(async move { + let mut manager = protocol + .new_cluster_manager_setup(srv_addr, cli_addr, args.population) + .await?; + + manager.run(rx_term).await?; + + // suppress logging before dropping the runtime to avoid spurious + // error messages + log::set_max_level(LevelFilter::Off); + + Ok::<(), SummersetError>(()) // give type hint for this async closure + })?; + } - Ok::<(), SummersetError>(()) // give type hint for this async closure - }) + log::set_max_level(log_level); + Ok(()) } fn main() -> ExitCode { @@ -130,7 +152,7 @@ fn main() -> ExitCode { pf_error!("m"; "manager_main exitted: {}", e); ExitCode::FAILURE } else { - pf_warn!("m"; "manager_main exitted successfully"); + // pf_warn!("m"; "manager_main exitted successfully"); ExitCode::SUCCESS } } 
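The two diffs above (and the server-side one below) converge on the same shutdown scheme: a `tokio::sync::watch` channel created in `main`, written once by the `ctrlc` handler, and selected on inside the event loop, so that the async runtime itself can be dropped and rebuilt on restart. Here is a minimal, self-contained sketch of that scheme — the `run` function and its sleep placeholder are illustrative assumptions, not code from this patch:

```rust
use tokio::runtime::Builder;
use tokio::sync::watch;
use tokio::time::{self, Duration};

/// Hypothetical event loop standing in for `GenericReplica::run()`:
/// returns Ok(true) to ask for a restart, Ok(false) to shut down.
async fn run(
    mut rx_term: watch::Receiver<bool>,
) -> Result<bool, Box<dyn std::error::Error>> {
    loop {
        tokio::select! {
            // placeholder for real protocol event branches
            _ = time::sleep(Duration::from_millis(100)) => {},

            // termination signal flipped by the ctrlc handler
            _ = rx_term.changed() => {
                println!("caught termination signal");
                return Ok(false);
            }
        }
    }
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // one watch channel for the whole process lifetime
    let (tx_term, rx_term) = watch::channel(false);
    ctrlc::set_handler(move || {
        // send() only fails once all receivers are dropped
        tx_term.send(true).expect("term channel closed");
    })?;

    let mut restart = true;
    while restart {
        // build a fresh runtime per replica "life", so a reset can tear
        // everything down and start over cleanly
        let runtime = Builder::new_multi_thread().enable_all().build()?;
        restart = runtime.block_on(run(rx_term.clone()))?;
    }
    Ok(())
}
```

The log-level juggling in the real diffs complements this: tasks still in flight while a runtime is being dropped would otherwise emit spurious error messages, so logging is muted just before the drop and restored on the next iteration.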
diff --git a/summerset_server/Cargo.toml b/summerset_server/Cargo.toml index 0a8ad28b..3058e797 100644 --- a/summerset_server/Cargo.toml +++ b/summerset_server/Cargo.toml @@ -7,8 +7,9 @@ authors = ["Guanzhou Jose Hu "] [dependencies] summerset = { path = "../" } -tokio = { version = "1.29", features = ["macros", "rt-multi-thread"] } +tokio = { version = "1.29", features = ["full"] } rand = "0.8" clap = { version = "4.0", features = ["derive"] } log = "0.4" env_logger = "0.10" +ctrlc = { version = "3.4", features = ["termination"] } diff --git a/summerset_server/src/main.rs b/summerset_server/src/main.rs index abbbc20d..800ae9e1 100644 --- a/summerset_server/src/main.rs +++ b/summerset_server/src/main.rs @@ -7,11 +7,14 @@ use std::sync::atomic::{AtomicBool, Ordering}; use clap::Parser; +use log::{self, LevelFilter}; + use env_logger::Env; use tokio::runtime::Builder; +use tokio::sync::watch; -use summerset::{SmrProtocol, SummersetError, pf_warn, pf_error}; +use summerset::{SmrProtocol, SummersetError, pf_error}; /// Command line arguments definition. #[derive(Parser, Debug)] @@ -109,9 +112,21 @@ fn server_main() -> Result<(), SummersetError> { Some(&args.config[..]) }; + // set up termination signals handler + let (tx_term, rx_term) = watch::channel(false); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("s"; "error sending to term channel: {}", e); + } + })?; + + let log_level = log::max_level(); let shutdown = Arc::new(AtomicBool::new(false)); + while !shutdown.load(Ordering::SeqCst) { - let sd = shutdown.clone(); + log::set_max_level(log_level); + let shutdown_clone = shutdown.clone(); + let rx_term_clone = rx_term.clone(); // create tokio multi-threaded runtime let runtime = Builder::new_multi_thread() @@ -132,20 +147,25 @@ fn server_main() -> Result<(), SummersetError> { ) .await?; - if replica.run().await? { + if replica.run(rx_term_clone).await? 
{
                // event loop terminated but wants to restart (e.g., when
                // receiving a reset control message); just drop this runtime
                // and move to the next iteration of loop
            } else {
                // event loop terminated and does not want to restart (e.g.,
                // when receiving a termination signal)
-                sd.store(true, Ordering::SeqCst);
+                shutdown_clone.store(true, Ordering::SeqCst);
            }

+            // suppress logging before dropping the runtime to avoid spurious
+            // error messages
+            log::set_max_level(LevelFilter::Off);
+
             Ok::<(), SummersetError>(()) // give type hint for this async closure
         })?;
     }

+    log::set_max_level(log_level);
     Ok(())
 }

@@ -160,7 +180,7 @@ fn main() -> ExitCode {
         pf_error!("s"; "server_main exited: {}", e);
         ExitCode::FAILURE
     } else {
-        pf_warn!("s"; "server_main exited successfully");
+        // pf_warn!("s"; "server_main exited successfully");
         ExitCode::SUCCESS
     }
 }

From 280f53b4c2dc7236fefcd4288168163fa07c2564 Mon Sep 17 00:00:00 2001
From: Guanzhou Hu 
Date: Thu, 31 Aug 2023 14:32:50 +0800
Subject: [PATCH 21/21] necessary hiding

---
 README.md                                |   38 +-
 scripts/local_bench.tmp.py               |  182 ---
 scripts/local_cluster.py                 |    1 -
 src/lib.rs                               |    1 -
 src/protocols/crossword.rs               | 1549 ----------------------
 src/protocols/mod.rs                     |   20 -
 tla+/crossword/.gitignore                |    1 -
 tla+/crossword/ConsensusMulti.cfg        |   17 -
 tla+/crossword/ConsensusMulti.tla        |  108 --
 tla+/crossword/Crossword.tla             |  355 -----
 tla+/crossword/Crossword_MC.cfg          |   22 -
 tla+/crossword/Crossword_MC.tla          |   89 --
 tla+/multipaxos_practical/MultiPaxos.tla |    4 +-
 13 files changed, 5 insertions(+), 2382 deletions(-)
 delete mode 100644 scripts/local_bench.tmp.py
 delete mode 100644 src/protocols/crossword.rs
 delete mode 100644 tla+/crossword/.gitignore
 delete mode 100644 tla+/crossword/ConsensusMulti.cfg
 delete mode 100644 tla+/crossword/ConsensusMulti.tla
 delete mode 100644 tla+/crossword/Crossword.tla
 delete mode 100644 tla+/crossword/Crossword_MC.cfg
 delete mode 100644 tla+/crossword/Crossword_MC.tla

diff --git a/README.md b/README.md
index 22eb30f3..007a9e43 100644
--- a/README.md
+++ b/README.md
@@ -1,39 +1,8 @@
-This is a private mirror of [Summerset](https://github.com/josehu07/summerset). Below is a memo of development commands...
-
-To create a branch to track public repo `main`, pull new things from it, and merge into the private `main`:
-
-```bash
-# in the private repo:
-git remote add public git@github.com:josehu07/summerset.git
-git config --add --local checkout.defaultRemote origin
-git checkout -b public-main
-git branch --set-upstream-to=public/main public-main
-git checkout main
-# skip the above for later times
-git pull public
-git merge public-main
-git push
-```
-
-To create a pull request on the public repo to make batched contributions from private repo `main`:
-
-```bash
-# in the public repo:
-git remote add private git@github.com:josehu07/summerset-private.git
-git config --add --local checkout.defaultRemote origin
-# skip the above for later times
-git checkout -b <branch>
-git branch --set-upstream-to=private/main <branch>
-git pull private
-git push origin <branch>
-# then, on GitHub, make a squashing PR from branch to main
-```
-
 # Summerset

-[![Format check](https://github.com/josehu07/summerset-private/actions/workflows/format.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Aformat)
-[![Build status](https://github.com/josehu07/summerset-private/actions/workflows/build.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Abuild)
-[![Tests status](https://github.com/josehu07/summerset-private/actions/workflows/tests.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Atests)
+[![Format check](https://github.com/josehu07/summerset/actions/workflows/format.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Aformat)
+[![Build status](https://github.com/josehu07/summerset/actions/workflows/build.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Abuild)
+[![Tests status](https://github.com/josehu07/summerset/actions/workflows/tests.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Atests)
 [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)

 Summerset is a distributed key-value store supporting a wide range of state machine replication (SMR) protocols for research purposes. More protocols are actively being added.
@@ -149,7 +118,6 @@ Complete cluster management and benchmarking scripts are available in another re

 - [ ] separate commit vs. exec responses?
- [ ] membership discovery & view changes - [ ] implementation of Raft -- [ ] implementation of Crossword prototype - [x] client-side utilities - [x] REPL-style client - [x] random benchmarking client diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py deleted file mode 100644 index a76b77ea..00000000 --- a/scripts/local_bench.tmp.py +++ /dev/null @@ -1,182 +0,0 @@ -import os -import subprocess -import statistics - - -def do_cargo_build(): - print("Building everything...") - cmd = ["cargo", "build", "--workspace", "-r"] - proc = subprocess.Popen(cmd) - proc.wait() - - -def run_process(cmd): - # print("Run:", " ".join(cmd)) - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return proc - - -def kill_all_matching(name, force=False): - # print("Kill all:", name) - assert name.count(" ") == 0 - cmd = "killall -9" if force else "killall" - cmd += f" {name} > /dev/null 2>&1" - os.system(cmd) - - -def launch_cluster(protocol, num_replicas, config): - cmd = [ - "python3", - "./scripts/local_cluster.py", - "-p", - protocol, - "-n", - str(num_replicas), - "-r", - ] - if config is not None and len(config) > 0: - cmd += ["--config", config] - return run_process(cmd) - - -def wait_cluster_setup(proc, num_replicas): - accepting_clients = [False for _ in range(num_replicas)] - - for line in iter(proc.stderr.readline, b""): - l = line.decode() - # print(l, end="") - if "manager" not in l and "accepting clients" in l: - replica = int(l[l.find("(") + 1 : l.find(")")]) - assert not accepting_clients[replica] - accepting_clients[replica] = True - - if accepting_clients.count(True) == num_replicas: - break - - -def run_bench_client(protocol, value_size, put_ratio, length_s): - cmd = [ - "python3", - "./scripts/local_client.py", - "-p", - protocol, - "-r", - "bench", - "-v", - str(value_size), - "-w", - str(put_ratio), - "-l", - str(length_s), - ] - return run_process(cmd) - - -def parse_output(output): - lines = [l.strip() for l in output.split("\n") if l.count("|") == 3] - assert len(lines) >= 4 - assert lines[0].startswith("Elapsed") - lines = lines[1:] - - warmup, tail = len(lines) // 3, len(lines) // 10 - lines = lines[warmup:-tail] - - tpts, lats = [], [] - for line in lines: - segs = line.split() - tpt = float(segs[2]) # reqs/s - lat = float(segs[4]) / 1000.0 # ms - tpts.append(tpt) - lats.append(lat) - - median_tpt = tpts[len(tpts) // 2] - median_lat = lats[len(lats) // 2] - print(f" med tpt {median_tpt:9.2f} reqs/s lat {median_lat:9.2f} ms") - - avg_tpt = sum(tpts) / len(tpts) - std_tpt = statistics.stdev(tpts) - avg_lat = sum(lats) / len(lats) - std_lat = statistics.stdev(lats) - print(f" avg tpt {avg_tpt:9.2f} reqs/s lat {avg_lat:9.2f} ms") - print(f" std tpt {std_tpt:9.2f} lat {std_lat:9.2f}") - - -def bench_round( - protocol, - num_replicas, - value_size, - put_ratio, - length_s, - fault_tolerance=None, - shards_per_replica=None, -): - print( - f"{protocol:<10s} n={num_replicas:1d} v={value_size:<9d} " - + f"f={fault_tolerance if fault_tolerance is not None else 'x':1} " - + f"s={shards_per_replica if shards_per_replica is not None else 'x':1} " - + f"w%={put_ratio:<3d} {length_s:3d}s" - ) - kill_all_matching("summerset_client", force=True) - kill_all_matching("summerset_server", force=True) - kill_all_matching("summerset_manager", force=True) - - configs = [] - if fault_tolerance is not None: - configs.append(f"fault_tolerance={fault_tolerance}") - if shards_per_replica is not None: - configs.append(f"shards_per_replica={shards_per_replica}") - 
proc_cluster = launch_cluster(protocol, num_replicas, "+".join(configs)) - wait_cluster_setup(proc_cluster, num_replicas) - - proc_client = run_bench_client(protocol, value_size, put_ratio, length_s) - out, err = proc_client.communicate() - - proc_cluster.terminate() - proc_cluster.wait() - - if proc_client.returncode != 0: - print(err.decode()) - else: - parse_output(out.decode()) - - -if __name__ == "__main__": - do_cargo_build() - - def all_protocol_configs(num_replicas): - quorum_cnt = num_replicas // 2 + 1 - max_fault_tolerance = num_replicas - quorum_cnt - - config_choices = [("MultiPaxos", None, None)] - for shards_per_replica in range(quorum_cnt, 0, -1): - config_choices.append( - ("Crossword", max_fault_tolerance, shards_per_replica) - ) - config_choices.append(("Crossword", 0, 1)) - - return config_choices - - for num_replicas in (3, 5, 7): - for value_size in (1024, 65536, 4194304): - for protocol, fault_tolerance, shards_per_replica in all_protocol_configs( - num_replicas - ): - # print( - # num_replicas, - # value_size, - # protocol, - # fault_tolerance, - # shards_per_replica, - # ) - bench_round( - protocol, - num_replicas, - value_size, - 100, - 60, - fault_tolerance=fault_tolerance, - shards_per_replica=shards_per_replica, - ) - - bench_round("MultiPaxos", 5, 65536, 0, 60) - bench_round("Crossword", 5, 65536, 0, 60, fault_tolerance=0, shards_per_replica=1) diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index c4e0877c..aca35807 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -45,7 +45,6 @@ def kill_all_matching(name, force=False): "SimplePush": lambda r: f"backer_path='/tmp/summerset.simple_push.{r}.wal'", "MultiPaxos": lambda r: f"backer_path='/tmp/summerset.multipaxos.{r}.wal'", "RSPaxos": lambda r: f"backer_path='/tmp/summerset.rs_paxos.{r}.wal'", - "Crossword": lambda r: f"backer_path='/tmp/summerset.crossword.{r}.wal'", } diff --git a/src/lib.rs b/src/lib.rs index 2de53e51..9e044072 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -36,4 +36,3 @@ pub use crate::protocols::{ReplicaConfigRepNothing, ClientConfigRepNothing}; pub use crate::protocols::{ReplicaConfigSimplePush, ClientConfigSimplePush}; pub use crate::protocols::{ReplicaConfigMultiPaxos, ClientConfigMultiPaxos}; pub use crate::protocols::{ReplicaConfigRSPaxos, ClientConfigRSPaxos}; -pub use crate::protocols::{ReplicaConfigCrossword, ClientConfigCrossword}; diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs deleted file mode 100644 index 493213d4..00000000 --- a/src/protocols/crossword.rs +++ /dev/null @@ -1,1549 +0,0 @@ -//! Replication protocol: Crossword. -//! -//! MultiPaxos with flexible Reed-Solomon erasure coding that supports tunable -//! shard groups and asymmetric shard assignment. - -use std::collections::HashMap; -use std::path::Path; -use std::net::SocketAddr; - -use crate::utils::{SummersetError, Bitmap, RSCodeword}; -use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; -use crate::server::{ - ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, - ApiRequest, ApiReply, StorageHub, LogAction, LogResult, LogActionId, - TransportHub, GenericReplica, -}; -use crate::client::{ClientId, ClientApiStub, ClientCtrlStub, GenericEndpoint}; -use crate::protocols::SmrProtocol; - -use async_trait::async_trait; - -use serde::{Serialize, Deserialize}; - -use tokio::time::Duration; -use tokio::sync::watch; - -use reed_solomon_erasure::galois_8::ReedSolomon; - -/// Configuration parameters struct. 
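The deleted `ReplicaConfigCrossword` below was populated from `--config` strings of the `key=value+key=value` form composed by the scripts above (e.g. `backer_path='...'+fault_tolerance=0+shards_per_replica=3`). A hypothetical minimal parser, only to make the convention concrete; Summerset's real `parsed_config!` macro may behave differently:

```rust
use std::collections::HashMap;

// Split "key=value+key=value" into a map, stripping single quotes around
// values. Hypothetical helper for illustration, not Summerset code.
fn parse_config(s: &str) -> HashMap<String, String> {
    s.split('+')
        .filter_map(|kv| kv.split_once('='))
        .map(|(k, v)| (k.to_string(), v.trim_matches('\'').to_string()))
        .collect()
}

fn main() {
    let cfg = parse_config(
        "backer_path='/tmp/summerset.rs_paxos.0.wal'+fault_tolerance=0+shards_per_replica=3",
    );
    assert_eq!(cfg["shards_per_replica"], "3");
    assert_eq!(cfg["backer_path"], "/tmp/summerset.rs_paxos.0.wal");
}
```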
-#[derive(Debug, Deserialize)] -pub struct ReplicaConfigCrossword { - /// Client request batching interval in microsecs. - pub batch_interval_us: u64, - - /// Client request batching maximum batch size. - pub max_batch_size: usize, - - /// Path to backing file. - pub backer_path: String, - - /// Whether to call `fsync()`/`fdatasync()` on logger. - pub logger_sync: bool, - - /// Fault-tolerance level. - pub fault_tolerance: u8, - - /// Number of shards to assign to each replica. - // TODO: proper config options. - pub shards_per_replica: u8, -} - -#[allow(clippy::derivable_impls)] -impl Default for ReplicaConfigCrossword { - fn default() -> Self { - ReplicaConfigCrossword { - batch_interval_us: 1000, - max_batch_size: 5000, - backer_path: "/tmp/summerset.rs_paxos.wal".into(), - logger_sync: false, - fault_tolerance: 0, - shards_per_replica: 1, - } - } -} - -/// Ballot number type. Use 0 as a null ballot number. -type Ballot = u64; - -/// Instance status enum. -#[derive( - Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Serialize, Deserialize, -)] -enum Status { - Null = 0, - Preparing = 1, - Accepting = 2, - Committed = 3, - Executed = 4, -} - -/// Request batch type (i.e., the "value" in Paxos). -type ReqBatch = Vec<(ClientId, ApiRequest)>; - -/// Leader-side bookkeeping info for each instance initiated. -#[derive(Debug, Clone)] -struct LeaderBookkeeping { - /// Replicas from which I have received Prepare confirmations. - prepare_acks: Bitmap, - - /// Max ballot among received Prepare replies. - prepare_max_bal: Ballot, - - /// Replicas and their assigned shards which the received Accept - /// confirmations cover. - accept_acks: HashMap, -} - -/// Follower-side bookkeeping info for each instance received. -#[derive(Debug, Clone)] -struct ReplicaBookkeeping { - /// Source leader replica ID for replyiing to Prepares and Accepts. - source: ReplicaId, -} - -/// In-memory instance containing a complete commands batch. -#[derive(Debug, Clone)] -struct Instance { - /// Ballot number. - bal: Ballot, - - /// Instance status. - status: Status, - - /// Shards of a batch of client requests. - reqs_cw: RSCodeword, - - /// Leader-side bookkeeping info. - leader_bk: Option, - - /// Follower-side bookkeeping info. - replica_bk: Option, -} - -/// Stable storage log entry type. -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] -enum LogEntry { - /// Records an update to the largest prepare ballot seen. - PrepareBal { slot: usize, ballot: Ballot }, - - /// Records a newly accepted request batch data shards at slot index. - AcceptData { - slot: usize, - ballot: Ballot, - reqs_cw: RSCodeword, - }, - - /// Records an event of committing the instance at index. - CommitSlot { slot: usize }, -} - -/// Peer-peer message type. -#[derive(Debug, Clone, Serialize, Deserialize)] -enum PeerMsg { - /// Prepare message from leader to replicas. - Prepare { slot: usize, ballot: Ballot }, - - /// Prepare reply from replica to leader. - PrepareReply { - slot: usize, - ballot: Ballot, - /// The accepted ballot number for that instance and the corresponding - /// request batch value shards known by replica. - voted: Option<(Ballot, RSCodeword)>, - }, - - /// Accept message from leader to replicas. - Accept { - slot: usize, - ballot: Ballot, - reqs_cw: RSCodeword, - }, - - /// Accept reply from replica to leader. - AcceptReply { slot: usize, ballot: Ballot }, - - /// Commit notification from leader to replicas. - Commit { slot: usize }, -} - -/// Crossword server replica module. 
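A detail worth noting in the code being removed: `Status` derives `PartialOrd`/`Ord`, and the protocol logic relies on the declaration order (`Null < Preparing < Accepting < Committed < Executed`) for checks such as `inst.status >= Status::Accepting`. A tiny standalone sketch of that idiom:

```rust
// Derived ordering on a C-like enum follows declaration order, which lets
// instance status checks read as range comparisons.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
enum Status {
    Null = 0,
    Preparing = 1,
    Accepting = 2,
    Committed = 3,
    Executed = 4,
}

fn main() {
    let status = Status::Committed;
    // "has this instance been accepted (or gone further)?"
    assert!(status >= Status::Accepting);
    // "not yet executed?"
    assert!(status < Status::Executed);
}
```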
-pub struct CrosswordReplica { - /// Replica ID in cluster. - id: ReplicaId, - - /// Total number of replicas in cluster. - population: u8, - - /// Majority quorum size. - quorum_cnt: u8, - - /// Configuration parameters struct. - config: ReplicaConfigCrossword, - - /// Address string for client requests API. - _api_addr: SocketAddr, - - /// Address string for internal peer-peer communication. - _p2p_addr: SocketAddr, - - /// ControlHub module. - control_hub: ControlHub, - - /// ExternalApi module. - external_api: ExternalApi, - - /// StateMachine module. - state_machine: StateMachine, - - /// StorageHub module. - storage_hub: StorageHub, - - /// TransportHub module. - transport_hub: TransportHub, - - /// Do I think I am the leader? - is_leader: bool, - - /// In-memory log of instances. - insts: Vec, - - /// Largest ballot number that a leader has sent Prepare messages in. - bal_prep_sent: Ballot, - - /// Largest ballot number that a leader knows has been safely prepared. - bal_prepared: Ballot, - - /// Largest ballot number seen as acceptor. - bal_max_seen: Ballot, - - /// Index of the first non-committed instance. - commit_bar: usize, - - /// Index of the first non-executed instance. - /// It is always true that exec_bar <= commit_bar <= insts.len() - exec_bar: usize, - - /// Current durable log file offset. - log_offset: usize, - - /// Fixed Reed-Solomon coder. - rs_coder: ReedSolomon, -} - -impl CrosswordReplica { - /// Compose a unique ballot number from base. - fn make_unique_ballot(&self, base: u64) -> Ballot { - ((base << 8) | ((self.id + 1) as u64)) as Ballot - } - - /// Compose a unique ballot number greater than the given one. - fn make_greater_ballot(&self, bal: Ballot) -> Ballot { - self.make_unique_ballot((bal >> 8) + 1) - } - - /// Compose LogActionId from slot index & entry type. - /// Uses the `Status` enum type to represent differnet entry types. - fn make_log_action_id(slot: usize, entry_type: Status) -> LogActionId { - let type_num = match entry_type { - Status::Preparing => 1, - Status::Accepting => 2, - Status::Committed => 3, - _ => panic!("unknown log entry type {:?}", entry_type), - }; - ((slot << 2) | type_num) as LogActionId - } - - /// Decompose LogActionId into slot index & entry type. - fn split_log_action_id(log_action_id: LogActionId) -> (usize, Status) { - let slot = (log_action_id >> 2) as usize; - let type_num = log_action_id & ((1 << 2) - 1); - let entry_type = match type_num { - 1 => Status::Preparing, - 2 => Status::Accepting, - 3 => Status::Committed, - _ => panic!("unknown log entry type num {}", type_num), - }; - (slot, entry_type) - } - - /// Compose CommandId from slot index & command index within. - fn make_command_id(slot: usize, cmd_idx: usize) -> CommandId { - assert!(slot <= (u32::MAX as usize)); - assert!(cmd_idx <= (u32::MAX as usize)); - ((slot << 32) | cmd_idx) as CommandId - } - - /// Decompose CommandId into slot index & command index within. - fn split_command_id(command_id: CommandId) -> (usize, usize) { - let slot = (command_id >> 32) as usize; - let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; - (slot, cmd_idx) - } - - /// TODO: maybe remove this. - fn shards_for_replica( - id: ReplicaId, - population: u8, - num_shards: u8, - ) -> Vec { - (id..(id + num_shards)).map(|i| (i % population)).collect() - } - - /// TODO: make better impl of this. 
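The round-robin `shards_for_replica` helper above is easiest to see with concrete numbers. Below is a standalone copy with a worked example for `population = 5` and `shards_per_replica = 3`: with a 3-data/2-parity codeword, any 3 distinct shards reconstruct the value, so a single replica's shards already suffice and any two replicas together cover at least 4.

```rust
// Replica `id` is assigned `num_shards` consecutive shard indexes starting
// at its own id, wrapping around modulo the population.
fn shards_for_replica(id: u8, population: u8, num_shards: u8) -> Vec<u8> {
    (id..(id + num_shards)).map(|i| i % population).collect()
}

fn main() {
    for id in 0..5u8 {
        println!("replica {} holds shards {:?}", id, shards_for_replica(id, 5, 3));
    }
    // replica 0 -> [0, 1, 2]
    // replica 3 -> [3, 4, 0]   (wraps around)
    assert_eq!(shards_for_replica(3, 5, 3), vec![3, 4, 0]);
}
```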
- fn coverage_under_faults( - population: u8, - acks: &HashMap, - fault_tolerance: u8, - ) -> u8 { - if acks.len() <= fault_tolerance as usize { - return 0; - } - - // enumerate all subsets of acks excluding fault number of replicas - let cnt = (acks.len() - fault_tolerance as usize) as u32; - let servers: Vec = acks.keys().cloned().collect(); - let mut min_coverage = population; - - for n in (0..2usize.pow(servers.len() as u32)) - .filter(|n| n.count_ones() == cnt) - { - let mut coverage = Bitmap::new(population, false); - for (_, server) in servers - .iter() - .enumerate() - .filter(|&(i, _)| (n >> i) % 2 == 1) - { - for shard in acks[server].iter().filter_map(|(s, flag)| { - if flag { - Some(s) - } else { - None - } - }) { - coverage.set(shard, true).expect("impossible shard index"); - } - } - - if coverage.count() < min_coverage { - min_coverage = coverage.count(); - } - } - - min_coverage - } - - /// Handler of client request batch chan recv. - fn handle_req_batch( - &mut self, - req_batch: ReqBatch, - ) -> Result<(), SummersetError> { - let batch_size = req_batch.len(); - assert!(batch_size > 0); - pf_debug!(self.id; "got request batch of size {}", batch_size); - - // if I'm not a leader, ignore client requests - if !self.is_leader { - for (client, req) in req_batch { - if let ApiRequest::Req { id: req_id, .. } = req { - // tell the client to try on the next replica - let next_replica = (self.id + 1) % self.population; - self.external_api.send_reply( - ApiReply::Reply { - id: req_id, - result: None, - redirect: Some(next_replica), - }, - client, - )?; - pf_trace!(self.id; "redirected client {} to replica {}", - client, next_replica); - } - } - return Ok(()); - } - - // compute the complete Reed-Solomon codeword for the batch data - let mut reqs_cw = RSCodeword::from_data( - req_batch, - self.quorum_cnt, - self.population - self.quorum_cnt, - )?; - reqs_cw.compute_parity(Some(&self.rs_coder))?; - - // create a new instance in the first null slot (or append a new one - // at the end if no holes exist) - // TODO: maybe use a null_idx variable to better keep track of this - let mut slot = self.insts.len(); - for s in self.commit_bar..self.insts.len() { - if self.insts[s].status == Status::Null { - slot = s; - break; - } - } - if slot < self.insts.len() { - let old_inst = &mut self.insts[slot]; - assert_eq!(old_inst.status, Status::Null); - old_inst.reqs_cw = reqs_cw; - old_inst.leader_bk = Some(LeaderBookkeeping { - prepare_acks: Bitmap::new(self.population, false), - prepare_max_bal: 0, - accept_acks: HashMap::new(), - }); - } else { - let new_inst = Instance { - bal: 0, - status: Status::Null, - reqs_cw, - leader_bk: Some(LeaderBookkeeping { - prepare_acks: Bitmap::new(self.population, false), - prepare_max_bal: 0, - accept_acks: HashMap::new(), - }), - replica_bk: None, - }; - self.insts.push(new_inst); - } - - // decide whether we can enter fast path for this instance - // TODO: remember to reset bal_prepared to 0, update bal_max_seen, - // and re-handle all Preparing & Accepting instances in autonomous - // Prepare initiation - if self.bal_prepared == 0 { - // slow case: Prepare phase not done yet. 
Initiate a Prepare round - // if none is on the fly, or just wait for some Prepare reply to - // trigger my Accept phase - if self.bal_prep_sent == 0 { - self.bal_prep_sent = - self.make_greater_ballot(self.bal_max_seen); - self.bal_max_seen = self.bal_prep_sent; - } - - let inst = &mut self.insts[slot]; - inst.bal = self.bal_prep_sent; - inst.status = Status::Preparing; - pf_debug!(self.id; "enter Prepare phase for slot {} bal {}", - slot, inst.bal); - - // record update to largest prepare ballot - self.storage_hub.submit_action( - Self::make_log_action_id(slot, Status::Preparing), - LogAction::Append { - entry: LogEntry::PrepareBal { - slot, - ballot: self.bal_prep_sent, - }, - sync: self.config.logger_sync, - }, - )?; - pf_trace!(self.id; "submitted PrepareBal log action for slot {} bal {}", - slot, inst.bal); - - // send Prepare messages to all peers - self.transport_hub.bcast_msg( - PeerMsg::Prepare { - slot, - ballot: self.bal_prep_sent, - }, - None, - )?; - pf_trace!(self.id; "broadcast Prepare messages for slot {} bal {}", - slot, inst.bal); - } else { - // normal case: Prepare phase covered, only do the Accept phase - let inst = &mut self.insts[slot]; - inst.bal = self.bal_prepared; - inst.status = Status::Accepting; - pf_debug!(self.id; "enter Accept phase for slot {} bal {}", - slot, inst.bal); - - // record update to largest accepted ballot and corresponding data - self.storage_hub.submit_action( - Self::make_log_action_id(slot, Status::Accepting), - LogAction::Append { - entry: LogEntry::AcceptData { - slot, - ballot: inst.bal, - // persist only some shards on myself - reqs_cw: inst.reqs_cw.subset_copy( - Bitmap::from( - self.population, - Self::shards_for_replica( - self.id, - self.population, - self.config.shards_per_replica, - ), - ), - false, - )?, - }, - sync: self.config.logger_sync, - }, - )?; - pf_trace!(self.id; "submitted AcceptData log action for slot {} bal {}", - slot, inst.bal); - - // send Accept messages to all peers, each getting its subset of - // shards of data - for peer in 0..self.population { - if peer == self.id { - continue; - } - self.transport_hub.send_msg( - PeerMsg::Accept { - slot, - ballot: inst.bal, - reqs_cw: inst.reqs_cw.subset_copy( - Bitmap::from( - self.population, - Self::shards_for_replica( - peer, - self.population, - self.config.shards_per_replica, - ), - ), - false, - )?, - }, - peer, - )?; - } - pf_trace!(self.id; "broadcast Accept messages for slot {} bal {}", - slot, inst.bal); - } - - Ok(()) - } - - /// Handler of PrepareBal logging result chan recv. 
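The `RSCodeword` calls in `handle_req_batch` above (`compute_parity`, `subset_copy`) wrap the `reed_solomon_erasure` crate imported at the top of this file. Here is the underlying crate used directly, with the same geometry a 5-replica cluster gets (3 data + 2 parity shards); illustration only, since `RSCodeword` layers shard-subset bookkeeping on top:

```rust
use reed_solomon_erasure::galois_8::ReedSolomon;

fn main() -> Result<(), reed_solomon_erasure::Error> {
    // quorum_cnt = 3 data shards, population - quorum_cnt = 2 parity shards
    let rs = ReedSolomon::new(3, 2)?;

    // three equal-length data shards, two zeroed slots for parity
    let mut shards: Vec<Vec<u8>> = vec![
        b"hello world!".to_vec(),
        b"goodbye wrld".to_vec(),
        b"summerset kv".to_vec(),
        vec![0u8; 12],
        vec![0u8; 12],
    ];
    rs.encode(&mut shards)?; // fills in the parity shards

    // lose any two shards (one data, one parity here) ...
    let mut maybe: Vec<Option<Vec<u8>>> = shards.into_iter().map(Some).collect();
    maybe[1] = None;
    maybe[4] = None;

    // ... and reconstruct them from the surviving three
    rs.reconstruct(&mut maybe)?;
    assert_eq!(maybe[1].as_deref(), Some(&b"goodbye wrld"[..]));
    Ok(())
}
```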
- fn handle_logged_prepare_bal( - &mut self, - slot: usize, - ) -> Result<(), SummersetError> { - pf_trace!(self.id; "finished PrepareBal logging for slot {} bal {}", - slot, self.insts[slot].bal); - let inst = &self.insts[slot]; - let voted = if inst.status >= Status::Accepting { - Some((inst.bal, inst.reqs_cw.clone())) - } else { - None - }; - - if self.is_leader { - // on leader, finishing the logging of a PrepareBal entry - // is equivalent to receiving a Prepare reply from myself - // (as an acceptor role) - self.handle_msg_prepare_reply(self.id, slot, inst.bal, voted)?; - } else { - // on follower replica, finishing the logging of a - // PrepareBal entry leads to sending back a Prepare reply - assert!(inst.replica_bk.is_some()); - let source = inst.replica_bk.as_ref().unwrap().source; - self.transport_hub.send_msg( - PeerMsg::PrepareReply { - slot, - ballot: inst.bal, - voted, - }, - source, - )?; - pf_trace!(self.id; "sent PrepareReply -> {} for slot {} bal {}", - source, slot, inst.bal); - } - - Ok(()) - } - - /// Handler of AcceptData logging result chan recv. - fn handle_logged_accept_data( - &mut self, - slot: usize, - ) -> Result<(), SummersetError> { - pf_trace!(self.id; "finished AcceptData logging for slot {} bal {}", - slot, self.insts[slot].bal); - let inst = &self.insts[slot]; - - if self.is_leader { - // on leader, finishing the logging of an AcceptData entry - // is equivalent to receiving an Accept reply from myself - // (as an acceptor role) - self.handle_msg_accept_reply(self.id, slot, inst.bal)?; - } else { - // on follower replica, finishing the logging of an - // AcceptData entry leads to sending back an Accept reply - assert!(inst.replica_bk.is_some()); - let source = inst.replica_bk.as_ref().unwrap().source; - self.transport_hub.send_msg( - PeerMsg::AcceptReply { - slot, - ballot: inst.bal, - }, - source, - )?; - pf_trace!(self.id; "sent AcceptReply -> {} for slot {} bal {}", - source, slot, inst.bal); - } - - Ok(()) - } - - /// Handler of CommitSlot logging result chan recv. - fn handle_logged_commit_slot( - &mut self, - slot: usize, - ) -> Result<(), SummersetError> { - pf_trace!(self.id; "finished CommitSlot logging for slot {} bal {}", - slot, self.insts[slot].bal); - assert!(self.insts[slot].status >= Status::Committed); - - // update index of the first non-committed instance - if slot == self.commit_bar { - while self.commit_bar < self.insts.len() { - let inst = &mut self.insts[self.commit_bar]; - if inst.status < Status::Committed { - break; - } - - if inst.reqs_cw.avail_shards() < self.quorum_cnt { - // can't execute if I don't have the complete request batch - pf_debug!(self.id; "postponing execution for slot {} (shards {}/{})", - slot, inst.reqs_cw.avail_shards(), self.quorum_cnt); - break; - } else if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { - // have enough shards but need reconstruction - inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; - } - let reqs = inst.reqs_cw.get_data()?; - - // submit commands in committed instance to the state machine - // for execution - if reqs.is_empty() { - inst.status = Status::Executed; - } else if inst.status == Status::Committed { - for (cmd_idx, (_, req)) in reqs.iter().enumerate() { - if let ApiRequest::Req { cmd, .. 
} = req { - self.state_machine.submit_cmd( - Self::make_command_id(self.commit_bar, cmd_idx), - cmd.clone(), - )?; - } else { - continue; // ignore other types of requests - } - } - pf_trace!(self.id; "submitted {} exec commands for slot {}", - reqs.len(), self.commit_bar); - } - - self.commit_bar += 1; - } - } - - Ok(()) - } - - /// Synthesized handler of durable logging result chan recv. - fn handle_log_result( - &mut self, - action_id: LogActionId, - log_result: LogResult, - ) -> Result<(), SummersetError> { - let (slot, entry_type) = Self::split_log_action_id(action_id); - assert!(slot < self.insts.len()); - - if let LogResult::Append { now_size } = log_result { - assert!(now_size >= self.log_offset); - self.log_offset = now_size; - } else { - return logged_err!(self.id; "unexpected log result type: {:?}", log_result); - } - - match entry_type { - Status::Preparing => self.handle_logged_prepare_bal(slot), - Status::Accepting => self.handle_logged_accept_data(slot), - Status::Committed => self.handle_logged_commit_slot(slot), - _ => { - logged_err!(self.id; "unexpected log entry type: {:?}", entry_type) - } - } - } - - /// Handler of Prepare message from leader. - fn handle_msg_prepare( - &mut self, - peer: ReplicaId, - slot: usize, - ballot: Ballot, - ) -> Result<(), SummersetError> { - pf_trace!(self.id; "received Prepare <- {} for slot {} bal {}", - peer, slot, ballot); - - // if ballot is not smaller than what I have seen: - if ballot >= self.bal_max_seen { - // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, - )?, - leader_bk: None, - replica_bk: None, - }); - } - let inst = &mut self.insts[slot]; - assert!(inst.bal <= ballot); - - inst.bal = ballot; - inst.status = Status::Preparing; - inst.replica_bk = Some(ReplicaBookkeeping { source: peer }); - - // update largest ballot seen - self.bal_max_seen = ballot; - - // record update to largest prepare ballot - self.storage_hub.submit_action( - Self::make_log_action_id(slot, Status::Preparing), - LogAction::Append { - entry: LogEntry::PrepareBal { slot, ballot }, - sync: self.config.logger_sync, - }, - )?; - pf_trace!(self.id; "submitted PrepareBal log action for slot {} bal {}", - slot, ballot); - } - - Ok(()) - } - - /// Handler of Prepare reply from replica. - fn handle_msg_prepare_reply( - &mut self, - peer: ReplicaId, - slot: usize, - ballot: Ballot, - voted: Option<(Ballot, RSCodeword)>, - ) -> Result<(), SummersetError> { - pf_trace!(self.id; "received PrepareReply <- {} for slot {} bal {} shards {:?}", - peer, slot, ballot, - voted.as_ref().map(|(_, cw)| cw.avail_shards_map())); - - // if ballot is what I'm currently waiting on for Prepare replies: - if ballot == self.bal_prep_sent { - assert!(slot < self.insts.len()); - let inst = &mut self.insts[slot]; - - // ignore spurious duplications and outdated replies - if (inst.status != Status::Preparing) || (ballot < inst.bal) { - return Ok(()); - } - assert_eq!(inst.bal, ballot); - assert!(self.bal_max_seen >= ballot); - assert!(inst.leader_bk.is_some()); - let leader_bk = inst.leader_bk.as_mut().unwrap(); - if leader_bk.prepare_acks.get(peer)? 
{ - return Ok(()); - } - - // bookkeep this Prepare reply - leader_bk.prepare_acks.set(peer, true)?; - if let Some((bal, val)) = voted { - #[allow(clippy::comparison_chain)] - if bal > leader_bk.prepare_max_bal { - // is of ballot > current maximum, so discard the current - // codeword and take the replied codeword - leader_bk.prepare_max_bal = bal; - inst.reqs_cw = val; - } else if bal == leader_bk.prepare_max_bal { - // is of ballot == the one currently taken, so merge the - // replied codeword into the current one - inst.reqs_cw.absorb_other(val)?; - } - } - - // if quorum size reached AND enough shards are known to - // reconstruct the original data, enter Accept phase for this - // instance using the request batch value constructed using shards - // with the highest ballot number in quorum - if leader_bk.prepare_acks.count() >= self.quorum_cnt - && inst.reqs_cw.avail_data_shards() >= self.quorum_cnt - { - inst.status = Status::Accepting; - pf_debug!(self.id; "enter Accept phase for slot {} bal {}", - slot, inst.bal); - - // update bal_prepared - assert!(self.bal_prepared <= ballot); - self.bal_prepared = ballot; - - // if parity shards not computed yet, compute them now - if inst.reqs_cw.avail_shards() < self.population { - inst.reqs_cw.compute_parity(Some(&self.rs_coder))?; - } - - // record update to largest accepted ballot and corresponding data - self.storage_hub.submit_action( - Self::make_log_action_id(slot, Status::Accepting), - LogAction::Append { - entry: LogEntry::AcceptData { - slot, - ballot, - reqs_cw: inst.reqs_cw.subset_copy( - Bitmap::from( - self.population, - Self::shards_for_replica( - self.id, - self.population, - self.config.shards_per_replica, - ), - ), - false, - )?, - }, - sync: self.config.logger_sync, - }, - )?; - pf_trace!(self.id; "submitted AcceptData log action for slot {} bal {}", - slot, ballot); - - // send Accept messages to all peers - for peer in 0..self.population { - if peer == self.id { - continue; - } - self.transport_hub.send_msg( - PeerMsg::Accept { - slot, - ballot, - reqs_cw: inst.reqs_cw.subset_copy( - Bitmap::from( - self.population, - Self::shards_for_replica( - peer, - self.population, - self.config.shards_per_replica, - ), - ), - false, - )?, - }, - peer, - )?; - } - pf_trace!(self.id; "broadcast Accept messages for slot {} bal {}", - slot, ballot); - } - } - - Ok(()) - } - - /// Handler of Accept message from leader. 
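The PrepareReply handling above folds replies by ballot: a reply voted under a strictly higher ballot replaces the accumulated codeword, an equal-ballot reply is merged shard-wise (`absorb_other`), and lower-ballot replies are ignored. A toy restatement of that rule, with a plain map of shard index to bytes standing in for `RSCodeword`:

```rust
use std::collections::HashMap;

type Ballot = u64;
type Shards = HashMap<u8, Vec<u8>>; // shard index -> shard bytes (toy model)

fn fold_reply(max_bal: &mut Ballot, acc: &mut Shards, bal: Ballot, val: Shards) {
    if bal > *max_bal {
        // strictly newer ballot: discard what we had, take this value
        *max_bal = bal;
        *acc = val;
    } else if bal == *max_bal {
        // same ballot implies same value by Paxos invariants: merge shards
        acc.extend(val);
    } // lower ballot: ignore
}

fn main() {
    let (mut max_bal, mut acc) = (0, Shards::new());
    fold_reply(&mut max_bal, &mut acc, 3, Shards::from([(0, vec![0xaa])]));
    fold_reply(&mut max_bal, &mut acc, 2, Shards::from([(4, vec![0xdd])])); // stale
    fold_reply(&mut max_bal, &mut acc, 3, Shards::from([(1, vec![0xbb])]));
    assert_eq!((max_bal, acc.len()), (3, 2)); // shards 0 and 1 accumulated
}
```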
- fn handle_msg_accept( - &mut self, - peer: ReplicaId, - slot: usize, - ballot: Ballot, - reqs_cw: RSCodeword, - ) -> Result<(), SummersetError> { - pf_trace!(self.id; "received Accept <- {} for slot {} bal {} shards {:?}", - peer, slot, ballot, reqs_cw.avail_shards_map()); - - // if ballot is not smaller than what I have made promises for: - if ballot >= self.bal_max_seen { - // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, - )?, - leader_bk: None, - replica_bk: None, - }); - } - let inst = &mut self.insts[slot]; - assert!(inst.bal <= ballot); - - inst.bal = ballot; - inst.status = Status::Accepting; - inst.reqs_cw = reqs_cw; - inst.replica_bk = Some(ReplicaBookkeeping { source: peer }); - - // update largest ballot seen - self.bal_max_seen = ballot; - - // record update to largest prepare ballot - self.storage_hub.submit_action( - Self::make_log_action_id(slot, Status::Accepting), - LogAction::Append { - entry: LogEntry::AcceptData { - slot, - ballot, - reqs_cw: inst.reqs_cw.clone(), - }, - sync: self.config.logger_sync, - }, - )?; - pf_trace!(self.id; "submitted AcceptData log action for slot {} bal {}", - slot, ballot); - } - - Ok(()) - } - - /// Handler of Accept reply from replica. - fn handle_msg_accept_reply( - &mut self, - peer: ReplicaId, - slot: usize, - ballot: Ballot, - ) -> Result<(), SummersetError> { - pf_trace!(self.id; "received AcceptReply <- {} for slot {} bal {}", - peer, slot, ballot); - - // if ballot is what I'm currently waiting on for Accept replies: - if ballot == self.bal_prepared { - assert!(slot < self.insts.len()); - let inst = &mut self.insts[slot]; - - // ignore spurious duplications and outdated replies - if (inst.status != Status::Accepting) || (ballot < inst.bal) { - return Ok(()); - } - assert_eq!(inst.bal, ballot); - assert!(self.bal_max_seen >= ballot); - assert!(inst.leader_bk.is_some()); - let leader_bk = inst.leader_bk.as_mut().unwrap(); - if leader_bk.accept_acks.contains_key(&peer) { - return Ok(()); - } - - // bookkeep this Accept reply - leader_bk.accept_acks.insert( - peer, - Bitmap::from( - self.population, - Self::shards_for_replica( - peer, - self.population, - self.config.shards_per_replica, - ), - ), - ); - - // if quorum size reached AND enough number of shards are - // remembered, mark this instance as committed - if leader_bk.accept_acks.len() as u8 >= self.quorum_cnt - && Self::coverage_under_faults( - self.population, - &leader_bk.accept_acks, - self.config.fault_tolerance, - ) >= self.quorum_cnt - { - inst.status = Status::Committed; - pf_debug!(self.id; "committed instance at slot {} bal {}", - slot, inst.bal); - - // record commit event - self.storage_hub.submit_action( - Self::make_log_action_id(slot, Status::Committed), - LogAction::Append { - entry: LogEntry::CommitSlot { slot }, - sync: self.config.logger_sync, - }, - )?; - pf_trace!(self.id; "submitted CommitSlot log action for slot {} bal {}", - slot, inst.bal); - - // send Commit messages to all peers - self.transport_hub - .bcast_msg(PeerMsg::Commit { slot }, None)?; - pf_trace!(self.id; "broadcast Commit messages for slot {} bal {}", - slot, ballot); - } - } - - Ok(()) - } - - /// Handler of Commit message from leader. 
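The commit condition in `handle_msg_accept_reply` above is the heart of Crossword: a majority of AcceptReplies alone is not enough; the shards those acks cover must still span a reconstructable set even if any `fault_tolerance` of the ackers later fail. A standalone re-derivation of the `coverage_under_faults` brute-force check (hypothetical helper, same subset enumeration as the deleted code), run on the 5-replica example:

```rust
use std::collections::{HashMap, HashSet};

// Worst-case number of distinct shards still covered after excluding any
// `f` of the acking replicas.
fn min_coverage_under_faults(acks: &HashMap<u8, Vec<u8>>, f: usize) -> usize {
    if acks.len() <= f {
        return 0;
    }
    let keep = acks.len() - f;
    let ids: Vec<u8> = acks.keys().copied().collect();
    (0usize..1 << ids.len())
        .filter(|mask| mask.count_ones() as usize == keep)
        .map(|mask| {
            let mut covered = HashSet::new();
            for (i, id) in ids.iter().enumerate() {
                if (mask >> i) & 1 == 1 {
                    covered.extend(acks[id].iter().copied());
                }
            }
            covered.len()
        })
        .min()
        .unwrap_or(0)
}

fn main() {
    // acks from replicas 0, 1, 2; replica i holds shards {i, i+1, i+2} mod 5
    let acks = HashMap::from([
        (0u8, vec![0u8, 1, 2]),
        (1u8, vec![1, 2, 3]),
        (2u8, vec![2, 3, 4]),
    ]);
    // losing any 1 acker leaves two replicas covering >= 4 distinct shards,
    // which is >= the 3 data shards needed, so the slot may commit
    assert_eq!(min_coverage_under_faults(&acks, 1), 4);
}
```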
- /// TODO: take care of missing/lost Commit messages - fn handle_msg_commit( - &mut self, - peer: ReplicaId, - slot: usize, - ) -> Result<(), SummersetError> { - pf_trace!(self.id; "received Commit <- {} for slot {}", peer, slot); - - // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, - )?, - leader_bk: None, - replica_bk: None, - }); - } - let inst = &mut self.insts[slot]; - - // ignore spurious duplications - if inst.status != Status::Accepting { - return Ok(()); - } - - // mark this instance as committed - inst.status = Status::Committed; - pf_debug!(self.id; "committed instance at slot {} bal {}", - slot, inst.bal); - - // record commit event - self.storage_hub.submit_action( - Self::make_log_action_id(slot, Status::Committed), - LogAction::Append { - entry: LogEntry::CommitSlot { slot }, - sync: self.config.logger_sync, - }, - )?; - pf_trace!(self.id; "submitted CommitSlot log action for slot {} bal {}", - slot, inst.bal); - - Ok(()) - } - - /// Synthesized handler of receiving message from peer. - fn handle_msg_recv( - &mut self, - peer: ReplicaId, - msg: PeerMsg, - ) -> Result<(), SummersetError> { - match msg { - PeerMsg::Prepare { slot, ballot } => { - self.handle_msg_prepare(peer, slot, ballot) - } - PeerMsg::PrepareReply { - slot, - ballot, - voted, - } => self.handle_msg_prepare_reply(peer, slot, ballot, voted), - PeerMsg::Accept { - slot, - ballot, - reqs_cw, - } => self.handle_msg_accept(peer, slot, ballot, reqs_cw), - PeerMsg::AcceptReply { slot, ballot } => { - self.handle_msg_accept_reply(peer, slot, ballot) - } - PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), - } - } - - /// Handler of state machine exec result chan recv. - fn handle_cmd_result( - &mut self, - cmd_id: CommandId, - cmd_result: CommandResult, - ) -> Result<(), SummersetError> { - let (slot, cmd_idx) = Self::split_command_id(cmd_id); - assert!(slot < self.insts.len()); - pf_trace!(self.id; "executed cmd in instance at slot {} idx {}", - slot, cmd_idx); - - let inst = &mut self.insts[slot]; - let reqs = inst.reqs_cw.get_data()?; - assert!(cmd_idx < reqs.len()); - let (client, ref req) = reqs[cmd_idx]; - - // reply command result back to client - if let ApiRequest::Req { id: req_id, .. } = req { - if self.external_api.has_client(client) { - self.external_api.send_reply( - ApiReply::Reply { - id: *req_id, - result: Some(cmd_result), - redirect: None, - }, - client, - )?; - pf_trace!(self.id; "replied -> client {} for slot {} idx {}", - client, slot, cmd_idx); - } - } else { - return logged_err!(self.id; "unexpected API request type"); - } - - // if all commands in this instance have been executed, set status to - // Executed and update `exec_bar` - if cmd_idx == reqs.len() - 1 { - inst.status = Status::Executed; - pf_debug!(self.id; "executed all cmds in instance at slot {}", - slot); - - // update index of the first non-executed instance - if slot == self.exec_bar { - while self.exec_bar < self.insts.len() { - let inst = &mut self.insts[self.exec_bar]; - if inst.status < Status::Executed { - break; - } - self.exec_bar += 1; - } - } - } - - Ok(()) - } - - /// Handler of ResetState control message. 
- async fn handle_ctrl_reset_state( - &mut self, - durable: bool, - ) -> Result<(), SummersetError> { - // send leave notification to peers and wait for their replies - self.transport_hub.leave().await?; - - // send leave notification to manager and wait for its reply - self.control_hub.send_ctrl(CtrlMsg::Leave)?; - while self.control_hub.recv_ctrl().await? != CtrlMsg::LeaveReply {} - - // if `durable` is false, truncate backer file - if !durable { - // use 0 as a special log action ID here - self.storage_hub - .submit_action(0, LogAction::Truncate { offset: 0 })?; - loop { - let (action_id, log_result) = - self.storage_hub.get_result().await?; - if action_id == 0 { - if log_result - != (LogResult::Truncate { - offset_ok: true, - now_size: 0, - }) - { - return logged_err!(self.id; "failed to truncate log to 0"); - } else { - return Ok(()); - } - } - } - } - - Ok(()) - } - - /// Synthesized handler of manager control messages. If ok, returns - /// `Some(true)` if decides to terminate and reboot, `Some(false)` if - /// decides to shutdown completely, and `None` if not terminating. - async fn handle_ctrl_msg( - &mut self, - msg: CtrlMsg, - ) -> Result, SummersetError> { - // TODO: fill this when more control message types added - match msg { - CtrlMsg::ResetState { durable } => { - self.handle_ctrl_reset_state(durable).await?; - Ok(Some(true)) - } - - _ => Ok(None), // ignore all other types - } - } -} - -#[async_trait] -impl GenericReplica for CrosswordReplica { - async fn new_and_setup( - api_addr: SocketAddr, - p2p_addr: SocketAddr, - manager: SocketAddr, - config_str: Option<&str>, - ) -> Result { - // connect to the cluster manager and get assigned a server ID - let mut control_hub = ControlHub::new_and_setup(manager).await?; - let id = control_hub.me; - let population = control_hub.population; - - // parse protocol-specific configs - let config = parsed_config!(config_str => ReplicaConfigCrossword; - batch_interval_us, max_batch_size, - backer_path, logger_sync, fault_tolerance, - shards_per_replica)?; - if config.batch_interval_us == 0 { - return logged_err!( - id; - "invalid config.batch_interval_us '{}'", - config.batch_interval_us - ); - } - - // setup state machine module - let state_machine = StateMachine::new_and_setup(id).await?; - - // setup storage hub module - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; - - // setup transport hub module - let mut transport_hub = - TransportHub::new_and_setup(id, population, p2p_addr).await?; - - // ask for the list of peers to proactively connect to. Do this after - // transport hub has been set up, so that I will be able to accept - // later peer connections - control_hub.send_ctrl(CtrlMsg::NewServerJoin { - id, - protocol: SmrProtocol::Crossword, - api_addr, - p2p_addr, - })?; - let to_peers = if let CtrlMsg::ConnectToPeers { to_peers, .. } = - control_hub.recv_ctrl().await? 
- { - to_peers - } else { - return logged_err!(id; "unexpected ctrl msg type received"); - }; - - // create a Reed-Solomon coder with num_data_shards == quorum size and - // num_parity shards == population - quorum - let quorum_cnt = (population / 2) + 1; - if config.fault_tolerance > (population - quorum_cnt) { - return logged_err!(id; "invalid config.fault_tolerance '{}'", - config.fault_tolerance); - } - if config.shards_per_replica == 0 - || config.shards_per_replica > quorum_cnt - { - return logged_err!(id; "invalid config.shards_per_replica '{}'", - config.shards_per_replica); - } - let rs_coder = ReedSolomon::new( - quorum_cnt as usize, - (population - quorum_cnt) as usize, - )?; - - // proactively connect to some peers, then wait for all population - // have been connected with me - for (peer, addr) in to_peers { - transport_hub.connect_to_peer(peer, addr).await?; - } - transport_hub.wait_for_group(population).await?; - - // setup external API module, ready to take in client requests - let external_api = ExternalApi::new_and_setup( - id, - api_addr, - Duration::from_micros(config.batch_interval_us), - config.max_batch_size, - ) - .await?; - - Ok(CrosswordReplica { - id, - population, - quorum_cnt, - config, - _api_addr: api_addr, - _p2p_addr: p2p_addr, - control_hub, - external_api, - state_machine, - storage_hub, - transport_hub, - is_leader: false, - insts: vec![], - bal_prep_sent: 0, - bal_prepared: 0, - bal_max_seen: 0, - commit_bar: 0, - exec_bar: 0, - log_offset: 0, - rs_coder, - }) - } - - async fn run( - &mut self, - mut rx_term: watch::Receiver, - ) -> Result { - // TODO: proper leader election - if self.id == 0 { - self.is_leader = true; - } - - loop { - tokio::select! { - // client request batch - req_batch = self.external_api.get_req_batch() => { - if let Err(e) = req_batch { - pf_error!(self.id; "error getting req batch: {}", e); - continue; - } - let req_batch = req_batch.unwrap(); - if let Err(e) = self.handle_req_batch(req_batch) { - pf_error!(self.id; "error handling req batch: {}", e); - } - }, - - // durable logging result - log_result = self.storage_hub.get_result() => { - if let Err(e) = log_result { - pf_error!(self.id; "error getting log result: {}", e); - continue; - } - let (action_id, log_result) = log_result.unwrap(); - if let Err(e) = self.handle_log_result(action_id, log_result) { - pf_error!(self.id; "error handling log result {}: {}", - action_id, e); - } - }, - - // message from peer - msg = self.transport_hub.recv_msg() => { - if let Err(e) = msg { - pf_error!(self.id; "error receiving peer msg: {}", e); - continue; - } - let (peer, msg) = msg.unwrap(); - if let Err(e) = self.handle_msg_recv(peer, msg) { - pf_error!(self.id; "error handling msg recv <- {}: {}", peer, e); - } - } - - // state machine execution result - cmd_result = self.state_machine.get_result() => { - if let Err(e) = cmd_result { - pf_error!(self.id; "error getting cmd result: {}", e); - continue; - } - let (cmd_id, cmd_result) = cmd_result.unwrap(); - if let Err(e) = self.handle_cmd_result(cmd_id, cmd_result) { - pf_error!(self.id; "error handling cmd result {}: {}", cmd_id, e); - } - }, - - // manager control message - ctrl_msg = self.control_hub.recv_ctrl() => { - if let Err(e) = ctrl_msg { - pf_error!(self.id; "error getting ctrl msg: {}", e); - continue; - } - let ctrl_msg = ctrl_msg.unwrap(); - match self.handle_ctrl_msg(ctrl_msg).await { - Ok(terminate) => { - if let Some(restart) = terminate { - pf_warn!( - self.id; - "server got {} req", - if restart { "restart" } else { 
"shutdown" }); - return Ok(restart); - } - }, - Err(e) => { - pf_error!(self.id; "error handling ctrl msg: {}", e); - } - } - }, - - // receiving termination signal - _ = rx_term.changed() => { - pf_warn!(self.id; "server caught termination signal"); - return Ok(false); - } - } - } - } - - fn id(&self) -> ReplicaId { - self.id - } -} - -/// Configuration parameters struct. -#[derive(Debug, Deserialize)] -pub struct ClientConfigCrossword { - /// Which server to pick initially. - pub init_server_id: ReplicaId, -} - -#[allow(clippy::derivable_impls)] -impl Default for ClientConfigCrossword { - fn default() -> Self { - ClientConfigCrossword { init_server_id: 0 } - } -} - -/// Crossword client-side module. -pub struct CrosswordClient { - /// Client ID. - id: ClientId, - - /// Configuration parameters struct. - _config: ClientConfigCrossword, - - /// Cached list of active servers information. - servers: HashMap, - - /// Current server ID to connect to. - server_id: ReplicaId, - - /// Control API stub to the cluster manager. - ctrl_stub: ClientCtrlStub, - - /// API stubs for communicating with servers. - api_stub: Option, -} - -#[async_trait] -impl GenericEndpoint for CrosswordClient { - async fn new_and_setup( - manager: SocketAddr, - config_str: Option<&str>, - ) -> Result { - // connect to the cluster manager and get assigned a client ID - let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; - let id = ctrl_stub.id; - - // parse protocol-specific configs - let config = parsed_config!(config_str => ClientConfigCrossword; - init_server_id)?; - let init_server_id = config.init_server_id; - - Ok(CrosswordClient { - id, - _config: config, - servers: HashMap::new(), - server_id: init_server_id, - ctrl_stub, - api_stub: None, - }) - } - - async fn connect(&mut self) -> Result<(), SummersetError> { - // disallow reconnection without leaving - if self.api_stub.is_some() { - return logged_err!(self.id; "reconnecting without leaving"); - } - - // ask the manager about the list of active servers - let mut sent = - self.ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; - while !sent { - sent = self.ctrl_stub.send_req(None)?; - } - - let reply = self.ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::QueryInfo { servers } => { - // connect to the one with server ID in config - let api_stub = ClientApiStub::new_by_connect( - self.id, - servers[&self.server_id], - ) - .await?; - self.api_stub = Some(api_stub); - self.servers = servers; - Ok(()) - } - _ => logged_err!(self.id; "unexpected reply type received"), - } - } - - async fn leave(&mut self, permanent: bool) -> Result<(), SummersetError> { - // send leave notification to current connected server - if let Some(mut api_stub) = self.api_stub.take() { - let mut sent = api_stub.send_req(Some(&ApiRequest::Leave))?; - while !sent { - sent = api_stub.send_req(None)?; - } - - let reply = api_stub.recv_reply().await?; - match reply { - ApiReply::Leave => { - pf_info!(self.id; "left current server connection"); - api_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } - } - } - - // if permanently leaving, send leave notification to the manager - if permanent { - let mut sent = - self.ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; - while !sent { - sent = self.ctrl_stub.send_req(None)?; - } - - let reply = self.ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - } - _ => { - return logged_err!(self.id; "unexpected reply 
type received"); - } - } - } - - Ok(()) - } - - fn send_req( - &mut self, - req: Option<&ApiRequest>, - ) -> Result { - match self.api_stub { - Some(ref mut api_stub) => api_stub.send_req(req), - None => logged_err!(self.id; "client is not set up"), - } - } - - async fn recv_reply(&mut self) -> Result { - match self.api_stub { - Some(ref mut api_stub) => { - let reply = api_stub.recv_reply().await?; - - if let ApiReply::Reply { - ref result, - ref redirect, - .. - } = reply - { - // if the current server redirects me to a different server - if result.is_none() && redirect.is_some() { - let redirect_id = redirect.unwrap(); - assert!(self.servers.contains_key(&redirect_id)); - self.leave(false).await?; - self.server_id = redirect_id; - self.connect().await?; - pf_debug!(self.id; "redirected to replica {} '{}'", - redirect_id, self.servers[&redirect_id]); - } - } - - Ok(reply) - } - None => logged_err!(self.id; "client is not set up"), - } - } - - fn id(&self) -> ClientId { - self.id - } - - fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { - &mut self.ctrl_stub - } -} diff --git a/src/protocols/mod.rs b/src/protocols/mod.rs index 3aae79bf..b7aaaf4f 100644 --- a/src/protocols/mod.rs +++ b/src/protocols/mod.rs @@ -26,10 +26,6 @@ mod rs_paxos; use rs_paxos::{RSPaxosReplica, RSPaxosClient}; pub use rs_paxos::{ReplicaConfigRSPaxos, ClientConfigRSPaxos}; -mod crossword; -use crossword::{CrosswordReplica, CrosswordClient}; -pub use crossword::{ReplicaConfigCrossword, ClientConfigCrossword}; - /// Enum of supported replication protocol types. #[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] pub enum SmrProtocol { @@ -37,7 +33,6 @@ pub enum SmrProtocol { SimplePush, MultiPaxos, RSPaxos, - Crossword, } /// Helper macro for saving boilder-plate `Box` mapping in @@ -57,7 +52,6 @@ impl SmrProtocol { "SimplePush" => Some(Self::SimplePush), "MultiPaxos" => Some(Self::MultiPaxos), "RSPaxos" => Some(Self::RSPaxos), - "Crossword" => Some(Self::Crossword), _ => None, } } @@ -114,14 +108,6 @@ impl SmrProtocol { .await ) } - Self::Crossword => { - box_if_ok!( - CrosswordReplica::new_and_setup( - api_addr, p2p_addr, manager, config_str - ) - .await - ) - } } } @@ -152,11 +138,6 @@ impl SmrProtocol { RSPaxosClient::new_and_setup(manager, config_str).await ) } - Self::Crossword => { - box_if_ok!( - CrosswordClient::new_and_setup(manager, config_str).await - ) - } } } } @@ -186,7 +167,6 @@ mod protocols_name_tests { valid_name_test!(SimplePush); valid_name_test!(MultiPaxos); valid_name_test!(RSPaxos); - valid_name_test!(Crossword); } #[test] diff --git a/tla+/crossword/.gitignore b/tla+/crossword/.gitignore deleted file mode 100644 index 5ff20e95..00000000 --- a/tla+/crossword/.gitignore +++ /dev/null @@ -1 +0,0 @@ -Crossword.cfg diff --git a/tla+/crossword/ConsensusMulti.cfg b/tla+/crossword/ConsensusMulti.cfg deleted file mode 100644 index 5a0c285e..00000000 --- a/tla+/crossword/ConsensusMulti.cfg +++ /dev/null @@ -1,17 +0,0 @@ -\* Add statements after this line. 
-SPECIFICATION LiveSpec - -CONSTANTS - Values = {v1, v2} - Slots = {s1, s2} - -INVARIANTS - TypeOK - -PROPERTIES - Nontriviality - Stability - Consistency - Liveness - -CHECK_DEADLOCK FALSE diff --git a/tla+/crossword/ConsensusMulti.tla b/tla+/crossword/ConsensusMulti.tla deleted file mode 100644 index 3cfe0b52..00000000 --- a/tla+/crossword/ConsensusMulti.tla +++ /dev/null @@ -1,108 +0,0 @@ -(***************************************************************************) -(* The consensus problem specification, extended with an array of *) -(* istances where each instance is a basic consensus problem. *) -(* *) -(* Adapted from: *) -(* https://lamport.azurewebsites.net/tla/Consensus.tla *) -(* and ../paxos/Consensus.tla *) -(***************************************************************************) - ----- MODULE ConsensusMulti ----- -EXTENDS Naturals, FiniteSets - -CONSTANT Values, Slots - -ASSUME /\ Values # {} - /\ Slots # {} - -(*************************) -(* Consensus model spec. *) -(*************************) -(*--algorithm ConsensusMulti -variable proposed = {}, - chosen = [s \in Slots |-> {}]; - -\* Propose new value: -macro Propose() begin - with v \in Values do - await v \notin proposed; - proposed := proposed \cup {v}; - end with; -end macro; - -\* Choose a proposed value for a given empty slot: -macro Choose(s) begin - await chosen[s] = {}; - with v \in proposed do - chosen[s] := {v}; - end with; -end macro; - -begin - lbl: while TRUE do - either - Propose(); - or - with s \in Slots do - Choose(s); - end with; - end either; - end while; -end algorithm; *) - -\* BEGIN TRANSLATION (chksum(pcal) = "dbf82723" /\ chksum(tla) = "39f644db") -VARIABLES proposed, chosen - -vars == << proposed, chosen >> - -Init == (* Global variables *) - /\ proposed = {} - /\ chosen = [s \in Slots |-> {}] - -Next == \/ /\ \E v \in Values: - /\ v \notin proposed - /\ proposed' = (proposed \cup {v}) - /\ UNCHANGED chosen - \/ /\ \E s \in Slots: - /\ chosen[s] = {} - /\ \E v \in proposed: - chosen' = [chosen EXCEPT ![s] = {v}] - /\ UNCHANGED proposed - -Spec == Init /\ [][Next]_vars - -\* END TRANSLATION - ----------- - -(**********************) -(* Safety properties. *) -(**********************) -TypeOK == /\ chosen \in [Slots -> SUBSET Values] - /\ \A s \in Slots: IsFiniteSet(chosen[s]) - -Nontriviality == - [](\A s \in Slots: \A v \in chosen[s]: v \in proposed) - -Stability == - \A s \in Slots: - \A v \in Values: - (v \in chosen[s]) => [](v \in chosen[s]) - -Consistency == - [](\A s \in Slots: Cardinality(chosen[s]) =< 1) - -THEOREM Spec => ([]TypeOK) /\ Nontriviality /\ Stability /\ Consistency - ----------- - -(************************) -(* Liveness properties. *) -(************************) -LiveSpec == Spec /\ WF_vars(Next) - -Liveness == <>(\A s \in Slots: chosen[s] # {}) - -THEOREM LiveSpec => Liveness - -==== diff --git a/tla+/crossword/Crossword.tla b/tla+/crossword/Crossword.tla deleted file mode 100644 index bb237fd9..00000000 --- a/tla+/crossword/Crossword.tla +++ /dev/null @@ -1,355 +0,0 @@ -(*********************************************************************************) -(* Crossword protocol combining MultiPaxos and erasure code sharding, built upon *) -(* the practical version of MultiPaxos spec. *) -(* *) -(* Leader shards recovery is not explicitly modeled in this spec, but should be *) -(* quite straightforward to add. 
*) -(*********************************************************************************) - ----- MODULE Crossword ---- -EXTENDS FiniteSets, Integers, TLC - -CONSTANT Replicas, Values, Slots, Ballots, Shards, NumDataShards, MaxFaults - -MajorityNum == (Cardinality(Replicas) \div 2) + 1 - -ReplicasAssumption == /\ IsFiniteSet(Replicas) - /\ Cardinality(Replicas) >= 3 - -ValuesAssumption == /\ IsFiniteSet(Values) - /\ Cardinality(Values) >= 2 - /\ 0 \notin Values - -SlotsAssumption == /\ IsFiniteSet(Slots) - /\ Slots # {} - -BallotsAssumption == /\ IsFiniteSet(Ballots) - /\ Ballots # {} - /\ Ballots \subseteq Nat - -ShardsAssumption == /\ IsFiniteSet(Shards) - /\ Shards # {} - -NumDataShardsAssumption == /\ NumDataShards > 0 - /\ NumDataShards =< Cardinality(Shards) - -MaxFaultsAssumption == /\ MaxFaults >= 0 - /\ MaxFaults =< (Cardinality(Replicas) - MajorityNum) - -ASSUME /\ ReplicasAssumption - /\ ValuesAssumption - /\ SlotsAssumption - /\ BallotsAssumption - /\ ShardsAssumption - /\ NumDataShardsAssumption - /\ MaxFaultsAssumption - -(*--algorithm Crossword -variable msgs = {}, - lBallot = [r \in Replicas |-> -1], - lStatus = [r \in Replicas |-> - [s \in Slots |-> ""]], - rBallot = [r \in Replicas |-> -1], - rVoted = [r \in Replicas |-> - [s \in Slots |-> - [bal |-> -1, val |-> 0, shards |-> {}]]], - proposed = [s \in Slots |-> {}], - learned = [s \in Slots |-> {}]; - -define - \* Is g a subset of u that is large enough under given MaxFaults? - BigEnoughUnderFaults(g, u) == - Cardinality(g) >= (Cardinality(u) - MaxFaults) - - \* Set of subsets of u that we consider under given MaxFaults. - SubsetsUnderFaults(u) == - {g \in SUBSET u: BigEnoughUnderFaults(g, u)} - - \* Is cs a coverage set (i.e., a set of sets of shards) from which we can - \* reconstruct the original data? - IsGoodCoverageSet(cs) == - Cardinality(UNION cs) >= NumDataShards - - \* Set of all valid shard assignments. - ValidAssignments == - {assign \in [Replicas -> SUBSET Shards]: - \A group \in SubsetsUnderFaults(Replicas): - IsGoodCoverageSet({assign[r]: r \in group})} - - \* Is v a safely prepared value given the prepare reply pattern and ballot? - ValuePreparedIn(v, prPat, pBal) == - \/ /\ Cardinality(prPat) >= MajorityNum - /\ \A pr \in prPat: pr.vBal = -1 - \/ /\ Cardinality(prPat) >= MajorityNum - /\ \E c \in 0..(pBal-1): - /\ \A pr \in prPat: pr.vBal =< c - /\ \E pr \in prPat: pr.vBal = c /\ pr.vVal = v - /\ IsGoodCoverageSet({pr.vShards: pr \in {prr \in prPat: prr.vVal = v}}) - \/ /\ BigEnoughUnderFaults(prPat, Replicas) - /\ ~\E vv \in Values: - IsGoodCoverageSet({pr.vShards: pr \in {prr \in prPat: prr.vVal = vv}}) - - \* Does the given accept reply pattern decide a value to be chosen? - PatternDecidesChosen(arPat) == - /\ Cardinality(arPat) >= MajorityNum - /\ \A group \in SubsetsUnderFaults(arPat): - IsGoodCoverageSet({ar.aShards: ar \in group}) -end define; - -\* Send message helpers. -macro Send(m) begin - msgs := msgs \cup {m}; -end macro; - -macro SendAll(ms) begin - msgs := msgs \cup ms; -end macro; - -\* Leader sends Prepare message to replicas. -\* This is the first message a leader makes after being elected. Think of this -\* as a Prepare message that covers infinitely many slots up to infinity. 
-\* Leader sends Prepare message to replicas.
-\* This is the first message a leader sends after being elected. Think of this
-\* as a Prepare message that covers all slots up to infinity.
-macro Prepare(r) begin
-    with b \in Ballots do
-        await /\ b > lBallot[r]
-              /\ ~\E m \in msgs: (m.type = "Prepare") /\ (m.bal = b);
-              \* this clause models that ballot numbers from different
-              \* proposers are unique
-        Send([type |-> "Prepare",
-              from |-> r,
-              bal |-> b]);
-        lBallot[r] := b;
-        lStatus[r] := [s \in Slots |->
-                           IF lStatus[r][s] = "Learned" THEN "Learned"
-                                                        ELSE "Preparing"];
-    end with;
-end macro;
-
-\* Replica replies to a Prepare message.
-\* Replicas reply with their known value shards for recovery reconstruction.
-macro PrepareReply(r) begin
-    with m \in msgs do
-        await (m.type = "Prepare") /\ (m.bal > rBallot[r]);
-        Send([type |-> "PrepareReply",
-              from |-> r,
-              bal |-> m.bal,
-              voted |-> rVoted[r]]);
-        rBallot[r] := m.bal;
-    end with;
-end macro;
-
-\* Leader sends Accept message to replicas for a slot.
-\* Value shards are distributed to replicas according to some valid assignment.
-macro Accept(r, s) begin
-    await lStatus[r][s] = "Preparing";
-    with v \in Values do
-        await \E MS \in SUBSET {m \in msgs: /\ m.type = "PrepareReply"
-                                            /\ m.bal = lBallot[r]}:
-                LET prPat == {[replica |-> m.from,
-                               vBal |-> m.voted[s].bal,
-                               vVal |-> m.voted[s].val,
-                               vShards |-> m.voted[s].shards]: m \in MS}
-                IN ValuePreparedIn(v, prPat, lBallot[r]);
-        with assign \in ValidAssignments do
-            SendAll({[type |-> "Accept",
-                      from |-> r,
-                      to |-> rt,
-                      slot |-> s,
-                      bal |-> lBallot[r],
-                      val |-> v,
-                      shards |-> assign[rt]]: rt \in Replicas});
-        end with;
-        lStatus[r][s] := "Accepting";
-        proposed[s] := proposed[s] \cup {v};
-    end with;
-end macro;
-
-\* Replica replies to an Accept message.
-\* Such a reply does not need to contain the actual value data; the shard
-\* metadata alone is enough for the leader to gather acceptance patterns.
-macro AcceptReply(r) begin
-    with m \in msgs do
-        await (m.type = "Accept") /\ (m.to = r) /\ (m.bal >= rBallot[r]);
-        Send([type |-> "AcceptReply",
-              from |-> r,
-              slot |-> m.slot,
-              bal |-> m.bal,
-              val |-> m.val,
-              shards |-> m.shards]);
-        rBallot[r] := m.bal;
-        rVoted[r][m.slot] := [bal |-> m.bal, val |-> m.val, shards |-> m.shards];
-    end with;
-end macro;
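
Another hypothetical aside (not part of the deleted spec): PatternDecidesChosen, the decision rule the Learn macro below relies on, rendered in Python under the same small-model constants. A value is chosen once a majority has acknowledged it and every subgroup of repliers that could survive MaxFaults failures still covers enough distinct shards to reconstruct it.

    # Hypothetical companion check, not part of this patch.
    from itertools import combinations

    NUM_REPLICAS = 3
    NUM_DATA_SHARDS = 2
    MAX_FAULTS = 1
    MAJORITY_NUM = NUM_REPLICAS // 2 + 1

    def pattern_decides_chosen(ar_pat):
        """ar_pat maps a replying replica -> set of shards it accepted."""
        if len(ar_pat) < MAJORITY_NUM:
            return False
        # Every subgroup that could remain after MaxFaults failures must still
        # cover at least NUM_DATA_SHARDS distinct shards (IsGoodCoverageSet).
        min_size = len(ar_pat) - MAX_FAULTS
        return all(
            len(set().union(*(ar_pat[r] for r in g))) >= NUM_DATA_SHARDS
            for size in range(min_size, len(ar_pat) + 1)
            for g in combinations(ar_pat, size)
        )

    # Two replies carrying two shards each decide the value...
    print(pattern_decides_chosen({"r1": {"c1", "c2"}, "r2": {"c2", "c3"}}))  # True
    # ...but two single-shard replies do not: losing either replica would
    # leave a single shard, below NUM_DATA_SHARDS.
    print(pattern_decides_chosen({"r1": {"c1"}, "r2": {"c2"}}))  # False
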
-\* Leader learns a chosen value at a slot.
-macro Learn(r, s) begin
-    await lStatus[r][s] = "Accepting";
-    with v \in Values do
-        await \E MS \in SUBSET {m \in msgs: /\ m.type = "AcceptReply"
-                                            /\ m.slot = s
-                                            /\ m.bal = lBallot[r]
-                                            /\ m.val = v}:
-                LET arPat == {[replica |-> m.from,
-                               aShards |-> m.shards]: m \in MS}
-                IN PatternDecidesChosen(arPat);
-        lStatus[r][s] := "Learned";
-        learned[s] := learned[s] \cup {v};
-    end with;
-end macro;
-
-process Replica \in Replicas
-begin
-    r: while TRUE do
-        either
-            \* p: Prepare(self);
-            Prepare(self);
-        or
-            \* pr: PrepareReply(self);
-            PrepareReply(self);
-        or
-            \* a: with s \in Slots do
-            with s \in Slots do
-                Accept(self, s);
-            end with;
-        or
-            \* ar: AcceptReply(self);
-            AcceptReply(self);
-        or
-            \* l: with s \in Slots do
-            with s \in Slots do
-                Learn(self, s);
-            end with;
-        end either;
-    end while;
-end process;
-end algorithm; *)
-
-\* BEGIN TRANSLATION (chksum(pcal) = "12868a7a" /\ chksum(tla) = "eede354c")
-VARIABLES msgs, lBallot, lStatus, rBallot, rVoted, proposed, learned
-
-(* define statement *)
-BigEnoughUnderFaults(g, u) ==
-    Cardinality(g) >= (Cardinality(u) - MaxFaults)
-
-
-SubsetsUnderFaults(u) ==
-    {g \in SUBSET u: BigEnoughUnderFaults(g, u)}
-
-
-
-IsGoodCoverageSet(cs) ==
-    Cardinality(UNION cs) >= NumDataShards
-
-
-ValidAssignments ==
-    {assign \in [Replicas -> SUBSET Shards]:
-        \A group \in SubsetsUnderFaults(Replicas):
-            IsGoodCoverageSet({assign[r]: r \in group})}
-
-
-ValuePreparedIn(v, prPat, pBal) ==
-    \/ /\ Cardinality(prPat) >= MajorityNum
-       /\ \A pr \in prPat: pr.vBal = -1
-    \/ /\ Cardinality(prPat) >= MajorityNum
-       /\ \E c \in 0..(pBal-1):
-            /\ \A pr \in prPat: pr.vBal =< c
-            /\ \E pr \in prPat: pr.vBal = c /\ pr.vVal = v
-            /\ IsGoodCoverageSet({pr.vShards: pr \in {prr \in prPat: prr.vVal = v}})
-    \/ /\ BigEnoughUnderFaults(prPat, Replicas)
-       /\ ~\E vv \in Values:
-            IsGoodCoverageSet({pr.vShards: pr \in {prr \in prPat: prr.vVal = vv}})
-
-
-PatternDecidesChosen(arPat) ==
-    /\ Cardinality(arPat) >= MajorityNum
-    /\ \A group \in SubsetsUnderFaults(arPat):
-        IsGoodCoverageSet({ar.aShards: ar \in group})
-
-
-vars == << msgs, lBallot, lStatus, rBallot, rVoted, proposed, learned >>
-
-ProcSet == (Replicas)
-
-Init == (* Global variables *)
-        /\ msgs = {}
-        /\ lBallot = [r \in Replicas |-> -1]
-        /\ lStatus = [r \in Replicas |->
-                          [s \in Slots |-> ""]]
-        /\ rBallot = [r \in Replicas |-> -1]
-        /\ rVoted = [r \in Replicas |->
-                         [s \in Slots |->
-                             [bal |-> -1, val |-> 0, shards |-> {}]]]
-        /\ proposed = [s \in Slots |-> {}]
-        /\ learned = [s \in Slots |-> {}]
-
-Replica(self) == \/ /\ \E b \in Ballots:
-                         /\ /\ b > lBallot[self]
-                            /\ ~\E m \in msgs: (m.type = "Prepare") /\ (m.bal = b)
-                         /\ msgs' = (msgs \cup {([type |-> "Prepare",
-                                                  from |-> self,
-                                                  bal |-> b])})
-                         /\ lBallot' = [lBallot EXCEPT ![self] = b]
-                         /\ lStatus' = [lStatus EXCEPT ![self] = [s \in Slots |->
-                                            IF lStatus[self][s] = "Learned" THEN "Learned"
-                                                                            ELSE "Preparing"]]
-                         /\ UNCHANGED <<rBallot, rVoted, proposed, learned>>
-                 \/ /\ \E m \in msgs:
-                         /\ (m.type = "Prepare") /\ (m.bal > rBallot[self])
-                         /\ msgs' = (msgs \cup {([type |-> "PrepareReply",
-                                                  from |-> self,
-                                                  bal |-> m.bal,
-                                                  voted |-> rVoted[self]])})
-                         /\ rBallot' = [rBallot EXCEPT ![self] = m.bal]
-                         /\ UNCHANGED <<lBallot, lStatus, rVoted, proposed, learned>>
-                 \/ /\ \E s \in Slots:
-                         /\ lStatus[self][s] = "Preparing"
-                         /\ \E v \in Values:
-                              /\ \E MS \in SUBSET {m \in msgs: /\ m.type = "PrepareReply"
-                                                               /\ m.bal = lBallot[self]}:
-                                     LET prPat == {[replica |-> m.from,
-                                                    vBal |-> m.voted[s].bal,
-                                                    vVal |-> m.voted[s].val,
-                                                    vShards |-> m.voted[s].shards]: m \in MS}
-                                     IN ValuePreparedIn(v, prPat, lBallot[self])
-                              /\ \E assign \in ValidAssignments:
-                                   msgs' = (msgs \cup ({[type |-> "Accept",
-                                                         from |-> self,
-                                                         to |-> rt,
-                                                         slot |-> s,
-                                                         bal |-> lBallot[self],
-                                                         val |-> v,
-                                                         shards |-> assign[rt]]: rt \in Replicas}))
-                              /\ lStatus' = [lStatus EXCEPT ![self][s] = "Accepting"]
-                              /\ proposed' = [proposed EXCEPT ![s] = proposed[s] \cup {v}]
-                         /\ UNCHANGED <<lBallot, rBallot, rVoted, learned>>
-                 \/ /\ \E m \in msgs:
-                         /\ (m.type = "Accept") /\ (m.to = self) /\ (m.bal >= rBallot[self])
-                         /\ msgs' = (msgs \cup {([type |-> "AcceptReply",
-                                                  from |-> self,
-                                                  slot |-> m.slot,
-                                                  bal |-> m.bal,
-                                                  val |-> m.val,
-                                                  shards |-> m.shards])})
-                         /\ rBallot' = [rBallot EXCEPT ![self] = m.bal]
-                         /\ rVoted' = [rVoted EXCEPT ![self][m.slot] = [bal |-> m.bal, val |-> m.val, shards |-> m.shards]]
-                         /\ UNCHANGED <<lBallot, lStatus, proposed, learned>>
-                 \/ /\ \E s \in Slots:
-                         /\ lStatus[self][s] = "Accepting"
-                         /\ \E v \in Values:
-                              /\ \E MS \in SUBSET {m \in msgs: /\ m.type = "AcceptReply"
-                                                               /\ m.slot = s
-                                                               /\ m.bal = lBallot[self]
-                                                               /\ m.val = v}:
-                                     LET arPat == {[replica |-> m.from,
-                                                    aShards |-> m.shards]: m \in MS}
-                                     IN PatternDecidesChosen(arPat)
-                              /\ lStatus' = [lStatus EXCEPT ![self][s] = "Learned"]
-                              /\ learned' = [learned EXCEPT ![s] = learned[s] \cup {v}]
-                         /\ UNCHANGED <<msgs, lBallot, rBallot, rVoted, proposed>>
-
-Next == (\E self \in Replicas: Replica(self))
-
-Spec == Init /\ [][Next]_vars
-
-\* END TRANSLATION
-
-====
diff --git a/tla+/crossword/Crossword_MC.cfg b/tla+/crossword/Crossword_MC.cfg
deleted file mode 100644
index d0b96091..00000000
--- a/tla+/crossword/Crossword_MC.cfg
+++ /dev/null
@@ -1,22 +0,0 @@
-SPECIFICATION Spec
-
-CONSTANTS
-    Replicas = {r1, r2, r3}
-    Values = {v1, v2}
-    Slots = {s1}
-    Ballots <- ConstBallots
-    Shards = {c1, c2, c3}
-    NumDataShards <- ConstNumDataShards
-    MaxFaults <- ConstMaxFaults
-
-SYMMETRY SymmetricPerms
-
-INVARIANTS
-    TypeOK
-    NontrivialityInv
-    ConsistencyInv
-
-\* PROPERTIES
-\*     ConsensusSpec    \* check this only on very small inputs
-
-CHECK_DEADLOCK FALSE
diff --git a/tla+/crossword/Crossword_MC.tla b/tla+/crossword/Crossword_MC.tla
deleted file mode 100644
index adef2ee8..00000000
--- a/tla+/crossword/Crossword_MC.tla
+++ /dev/null
@@ -1,89 +0,0 @@
----- MODULE Crossword_MC ----
-EXTENDS Crossword
-
-SymmetricPerms == Permutations(Replicas)
-                      \cup Permutations(Values)
-                      \cup Permutations(Slots)
-                      \cup Permutations(Shards)
-
-ConstBallots == 0..1
-ConstNumDataShards == 2
-ConstMaxFaults == 1
-
-----------
-
-(*************************)
-(* Type check invariant. *)
-(*************************)
-StatusSet == {"", "Preparing", "Accepting", "Learned"}
-
-SlotVotes == [Slots -> [bal: Ballots \cup {-1},
-                        val: Values \cup {0},
-                        shards: SUBSET Shards]]
-
-Messages ==      [type: {"Prepare"}, from: Replicas,
-                  bal: Ballots]
-            \cup [type: {"PrepareReply"}, from: Replicas,
-                  bal: Ballots,
-                  voted: SlotVotes]
-            \cup [type: {"Accept"}, from: Replicas,
-                  to: Replicas,
-                  slot: Slots,
-                  bal: Ballots,
-                  val: Values,
-                  shards: SUBSET Shards]
-            \cup [type: {"AcceptReply"}, from: Replicas,
-                  slot: Slots,
-                  bal: Ballots,
-                  val: Values,
-                  shards: SUBSET Shards]
-
-TypeOK == /\ msgs \in SUBSET Messages
-          /\ lBallot \in [Replicas -> Ballots \cup {-1}]
-          /\ lStatus \in [Replicas -> [Slots -> StatusSet]]
-          /\ rBallot \in [Replicas -> Ballots \cup {-1}]
-          /\ rVoted \in [Replicas -> SlotVotes]
-          /\ proposed \in [Slots -> SUBSET Values]
-          /\ learned \in [Slots -> SUBSET Values]
-
-THEOREM Spec => []TypeOK
-
-----------
-
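Usage note (hypothetical, not part of the deleted files): a model like this is normally checked with the standard TLC runner shipped in tla2tools.jar, along the lines of

    java -cp tla2tools.jar tlc2.TLC -workers auto -config Crossword_MC.cfg Crossword_MC.tla

with the jar path as a placeholder; TLC then checks the INVARIANTS listed in the .cfg, pruning states via the SYMMETRY permutations declared above.
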
-(*****************************************************************************)
-(* Check that it implements the ConsensusMulti spec. This transitively means *)
-(* that it satisfies the following three properties:                         *)
-(*     - Nontriviality                                                       *)
-(*     - Stability                                                           *)
-(*     - Consistency                                                         *)
-(*                                                                           *)
-(* Only check this property on very small model constants, otherwise it     *)
-(* would take a prohibitively long time due to state-space explosion.        *)
-(*****************************************************************************)
-proposedSet == UNION {proposed[s]: s \in Slots}
-
-ConsensusModule == INSTANCE ConsensusMulti WITH proposed <- proposedSet,
-                                                chosen <- learned
-ConsensusSpec == ConsensusModule!Spec
-
-THEOREM Spec => ConsensusSpec
-
-----------
-
-(********************************************************************************)
-(* The non-triviality and consistency properties stated in invariant form.      *)
-(* The stability property cannot be stated as an invariant.                     *)
-(*                                                                              *)
-(* Checking invariants takes significantly less time than checking more complex *)
-(* temporal properties. Hence, first check these as invariants on larger model  *)
-(* constants, then check the ConsensusSpec property on small ones.              *)
-(********************************************************************************)
-NontrivialityInv ==
-    \A s \in Slots: \A v \in learned[s]: v \in proposed[s]
-
-ConsistencyInv ==
-    \A s \in Slots: Cardinality(learned[s]) =< 1
-
-THEOREM Spec => [](NontrivialityInv /\ ConsistencyInv)
-
-====
\ No newline at end of file
diff --git a/tla+/multipaxos_practical/MultiPaxos.tla b/tla+/multipaxos_practical/MultiPaxos.tla
index c95beaa0..94d1190c 100644
--- a/tla+/multipaxos_practical/MultiPaxos.tla
+++ b/tla+/multipaxos_practical/MultiPaxos.tla
@@ -32,7 +32,7 @@ ASSUME /\ ReplicasAssumption
        /\ SlotsAssumption
        /\ BallotsAssumption
 
-(*--algorithm Crossword
+(*--algorithm MultiPaxos
 variable msgs = {},
          lBallot = [r \in Replicas |-> -1],
          lStatus = [r \in Replicas |->
@@ -170,7 +170,7 @@ begin
 end process;
 end algorithm; *)
 
-\* BEGIN TRANSLATION (chksum(pcal) = "18ddaafd" /\ chksum(tla) = "c40b8299")
+\* BEGIN TRANSLATION (chksum(pcal) = "88c9342c" /\ chksum(tla) = "c40b8299")
 VARIABLES msgs, lBallot, lStatus, rBallot, rVoted, proposed, learned
 
 (* define statement *)